In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from collections import defaultdict
import re

import os
# Print the names of the entries found in the ../input directory.
print(os.listdir("../input"))


import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.externals import joblib

#nn
from keras.models import Sequential
from keras.layers import Dense, Dropout

#svm
from  sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier


# Any results you write to the current directory are saved as output.

**Opening files containing the dataset**

In [None]:
# Locations of the train / test JSON files.
TRAIN_PATH = "../input/train.json"
TEST_PATH = "../input/test.json"

# Training frame (used below for its 'ingredients' and 'cuisine' columns) and
# test frame (used below for its 'id' and 'ingredients' columns).
df = pd.read_json(TRAIN_PATH)
test_df = pd.read_json(TEST_PATH)

**Getting rid of all recipes that consist of just one ingredient**

In [None]:
# Number of ingredients listed for each recipe.
df['num_ingredients'] = df['ingredients'].apply(len)

# Keep only recipes with more than one ingredient.
# .copy() makes the filtered frame independent of the original, so the column
# assignments done on `df` in later cells do not raise SettingWithCopyWarning.
df = df[df['num_ingredients'] > 1].copy()

**Preprocessing:**
* lowercasing all words
* dropping words that contain digits, punctuation or special characters (e.g. `’`, `®`, `™`)
* lemmatization
* dropping words of 2 characters or fewer

In [None]:
lemmatizer = WordNetLemmatizer()

# A word containing any of these characters is discarded outright:
# ASCII digits, the typographic apostrophe, ® and ™, commas, periods, parentheses.
_REJECTED_CHARS = re.compile('[0-9’®,.()™]')


def preprocess(ingredients):
    """Turn a recipe's ingredient list into one cleaned, space-separated string.

    Steps, in order:
      1. join all ingredient strings with spaces and lowercase the result;
      2. replace hyphens with spaces, so hyphenated terms become separate words;
      3. drop every word that contains a character matched by _REJECTED_CHARS;
      4. lemmatize each remaining word with the module-level ``lemmatizer``;
      5. keep only lemmas longer than 2 characters.

    Parameters
    ----------
    ingredients : iterable of str
        The ingredient strings of a single recipe.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ('' if none survive).
    """
    ingredients_text = ' '.join(ingredients).lower().replace('-', ' ')

    words = []
    for word in ingredients_text.split():
        if _REJECTED_CHARS.search(word):
            continue

        word = lemmatizer.lemmatize(word)
        # The length check is applied to the lemma, not to the raw word.
        if len(word) > 2:
            words.append(word)
    return ' '.join(words)



In [None]:
# Add the cleaned ingredient text as column 'x' to both the train and test frames.
for frame in (df, test_df):
    frame['x'] = frame['ingredients'].apply(preprocess)

**Creating a list of the unique words in the dataset and then reducing it to a list of unique nouns.**

In [None]:
# Unique words of the cleaned training recipes, in order of first appearance.
# `seen` mirrors the list as a set so the duplicate check is O(1) per word
# instead of a linear scan of the whole list.
ingredients = []
seen = set()
for rec in df["x"].values:
    for el in rec.split():
        if len(el) > 2 and el not in seen:
            seen.add(el)
            ingredients.append(el)


def is_noun(pos):
    """Return True when the POS tag `pos` starts with 'NN'."""
    return pos[:2] == 'NN'


# Words whose first token is tagged as a noun by NLTK.
n = []
for ingredient in ingredients:
    tokenized = nltk.word_tokenize(ingredient)
    tagged = nltk.pos_tag(tokenized)
    # Only the first (token, tag) pair is considered, even if the tokenizer
    # split the word into several tokens.
    word, pos = tagged[0]
    if is_noun(pos) and len(word) > 2:
        n.append(word)

**Reducing the ingredient text of every recipe to only the words that are present in the unique-nouns list.**

In [None]:
# Set copy of the noun list: membership tests are O(1) instead of a linear
# scan of `n` for every word of every recipe. `n` itself is left untouched.
noun_vocabulary = set(n)


def keep_nouns(text):
    """Return `text` restricted to the words found in the noun vocabulary.

    Word order and repeated words are preserved; the result is a single
    space-separated string ('' when no word qualifies).
    """
    return ' '.join(word for word in text.split() if word in noun_vocabulary)


#train
df["nouns"] = df["x"].apply(keep_nouns)

#test
test_df["nouns"] = test_df["x"].apply(keep_nouns)

**Encoding the labels in the dataset to a numerical representation for the model.**

In [None]:
le = LabelEncoder()

# Fit the encoder on the cuisine labels and encode them in the same call.
y = le.fit_transform(df['cuisine'].values)

# Lookup table: cuisine name -> integer code assigned by the encoder.
cuisines = dict(zip(le.classes_, le.transform(le.classes_)))

**Transforming the lists of ingredients into vectors that can be fed to the model**

At first I tried implementing this myself, but using **TfidfVectorizer** reduced the training time a lot.

In [None]:
# TF-IDF features (with sublinear term-frequency scaling) followed by a cast of
# the resulting matrix to float16.
# NOTE(review): the float16 cast is presumably there to reduce memory use — confirm.
# The pipeline was previously bound to `cvectorizer` while the calls below use
# `vectorizer`, which raised a NameError on a fresh kernel.
vectorizer = make_pipeline(
    TfidfVectorizer(sublinear_tf=True),
    FunctionTransformer(lambda x: x.astype('float16'), validate=False)
)

# Learn the vocabulary / IDF weights on the training recipes only ...
X = vectorizer.fit_transform(df['nouns'].values)

# ... and reuse the fitted pipeline to transform the test recipes.
X_test = vectorizer.transform(test_df['nouns'].values)


In [None]:
# Workaround for environment issues: set the JOBLIB_TEMP_FOLDER environment variable to /tmp.
# NOTE(review): presumably needed so that joblib (used through n_jobs=4 when the model
# is built below) has a usable temp folder — confirm.
%env JOBLIB_TEMP_FOLDER=/tmp

**The model used is a OneVsRestClassifier with a Support Vector Machine classifier as its estimator.**

In [None]:

# Hyper-parameters of the base SVM classifier.
svc_params = dict(
    C=50,
    kernel='rbf',
    gamma=1.4,
    coef0=1,
    shrinking=True,
    tol=0.001,
    max_iter=-1,
)
estimator = SVC(**svc_params)

# One-vs-rest wrapper around the SVC, run with n_jobs=4.
model = OneVsRestClassifier(estimator, n_jobs=4)

**Training the model**

In [None]:
# Fit the one-vs-rest SVM on the vectorized training recipes X and the encoded cuisine labels y.
model.fit(X, y)

**Predicting labels for the submission**

In [None]:
# Integer class codes predicted for the test recipes.
predicted_codes = model.predict(X_test)

# Map the codes back to the original cuisine names.
predicted_cuisines = le.inverse_transform(predicted_codes)

# Named Series so it becomes the 'cuisine' column of the submission.
prediction = pd.Series(predicted_cuisines, name='cuisine')

**Saving the submission CSV**

In [None]:
# Place the recipe ids and the predicted cuisines side by side (aligned on index).
submission_columns = [test_df.id, prediction]
submission = pd.concat(submission_columns, axis=1)

# Write the submission file without the index column.
submission.to_csv("SVCSubmission_2.csv", index=False)