In [1]:
import json
import os

import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier

### Import data

In [2]:
data_folder = '../data'
train_file = os.path.join(data_folder, 'train.json')
test_file = os.path.join(data_folder, 'test.json')

In [3]:
train = json.load(open(train_file)) 
test = json.load(open(test_file)) 

In [4]:
len(train), len(test)

(39774, 9944)

In [5]:
train[:1]

[{'id': 10259,
  'cuisine': 'greek',
  'ingredients': ['romaine lettuce',
   'black olives',
   'grape tomatoes',
   'garlic',
   'pepper',
   'purple onion',
   'seasoning',
   'garbanzo beans',
   'feta cheese crumbles']}]

### Prepare data

In [6]:
train_text = [" ".join(doc['ingredients']) for doc in train]
y = [doc['cuisine'] for doc in train]
test_text = [" ".join(doc['ingredients']) for doc in test]

In [8]:
len(train_text), len(y)

(39774, 39774)

In [9]:
train_text[:10]

['romaine lettuce black olives grape tomatoes garlic pepper purple onion seasoning garbanzo beans feta cheese crumbles',
 'plain flour ground pepper salt tomatoes ground black pepper thyme eggs green tomatoes yellow corn meal milk vegetable oil',
 'eggs pepper salt mayonaise cooking oil green chilies grilled chicken breasts garlic powder yellow onion soy sauce butter chicken livers',
 'water vegetable oil wheat salt',
 'black pepper shallots cornflour cayenne pepper onions garlic paste milk butter salt lemon juice water chili powder passata oil ground cumin boneless chicken skinless thigh garam masala double cream natural yogurt bay leaf',
 'plain flour sugar butter eggs fresh ginger root salt ground cinnamon milk vanilla extract ground ginger powdered sugar baking powder',
 'olive oil salt medium shrimp pepper garlic chopped cilantro jalapeno chilies flat leaf parsley skirt steak white vinegar sea salt bay leaf chorizo sausage',
 'sugar pistachio nuts white almond bark flour vanilla e

In [11]:
from collections import Counter
Counter(y)

Counter({'greek': 1175,
         'southern_us': 4320,
         'filipino': 755,
         'indian': 3003,
         'jamaican': 526,
         'spanish': 989,
         'italian': 7838,
         'mexican': 6438,
         'chinese': 2673,
         'british': 804,
         'thai': 1539,
         'vietnamese': 825,
         'cajun_creole': 1546,
         'brazilian': 467,
         'french': 2646,
         'japanese': 1423,
         'irish': 667,
         'korean': 830,
         'moroccan': 821,
         'russian': 489})

### Feature engineering

In [12]:
#pre-processing
import re 
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\n", "", string)    
    string = re.sub(r"\r", "", string) 
    string = re.sub(r"[0-9]", "digit", string)
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

In [13]:
X = []
for i in range(len(train_text)):
    X.append(clean_str(train_text[i]))
# y = np.array(df["cuisine"])


In [14]:
#train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

In [15]:
model = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))
])

In [16]:
from sklearn.model_selection import GridSearchCV
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}

In [17]:
# gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
# gs_clf_svm = gs_clf_svm.fit(X, y)
# print(gs_clf_svm.best_score_)
# print(gs_clf_svm.best_params_)
# {'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 2)}

In [18]:
#preparing the final pipeline using the selected parameters
model = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

In [19]:
#fit model with training data
model.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
       ..._class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None))])

In [20]:
#evaluation on test data
pred = model.predict(X_test)

In [21]:
model.classes_

array(['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino',
       'french', 'greek', 'indian', 'irish', 'italian', 'jamaican',
       'japanese', 'korean', 'mexican', 'moroccan', 'russian',
       'southern_us', 'spanish', 'thai', 'vietnamese'], dtype='<U12')

In [22]:
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix(pred, y_test)

array([[  68,    0,    0,    0,    7,    1,    1,    2,    0,    2,    2,
           2,    0,    0,    0,    0,    3,    2,    4,    1],
       [   0,  114,    2,    1,    2,   17,    2,    3,   16,    8,    4,
           3,    1,    4,    0,    9,   23,    1,    2,    2],
       [   5,    4,  340,    3,    1,    8,    1,    2,    0,   10,    2,
           1,    0,    6,    1,    3,   62,    5,    0,    0],
       [   1,    1,    3,  667,   22,    3,    1,    5,    2,    4,    4,
          51,   40,    9,    1,    1,    1,    1,   25,   29],
       [   5,    3,    1,    5,  146,    4,    0,    3,    0,    1,    1,
           6,    1,    4,    0,    1,    6,    3,    4,    7],
       [   4,   35,   18,    3,    3,  493,   10,    5,   23,  126,    2,
           7,    1,   29,    6,   25,   55,   29,    0,    0],
       [   0,    5,    0,    1,    0,   11,  249,    5,    3,   31,    1,
           0,    0,    4,   11,    4,    3,    3,    1,    0],
       [   3,    4,    1,    5,    2,    

In [23]:
accuracy_score(y_test, pred)

0.780356993212101

In [32]:
#save the model
from sklearn.externals import joblib
joblib.dump(model, 'wider_model.joblib', compress=1)

['wider_model.joblib']

In [33]:
from sklearn.externals import joblib
model = joblib.load('wider_model.joblib')

In [34]:
test_recipe = "1 2 1/2  to 3 pound boneless pork shoulder or butt, trimmed and cut in half 1 small butternut squash (about 1 1/2 pounds)—peeled, seeded, and cut into 1 inch pieces 1 14.5 ounce can diced tomatoes 1 jalapeño pepper, seeded and chopped 2 cloves garlic, chopped 1 tablespoon chili powder kosher salt 4 6 inch corn tortillas, cut into 1/2 inch wide strips 1 tablespoon canola oil sliced radishes, cilantro sprigs, and lime wedges, for serving"

In [35]:
model.predict([test_recipe])[0]

'mexican'

In [36]:
steak_hache = "1 tbsp vegetable oil 4 shallots  , very finely chopped 600g freshly ground beef   (ask the butcher for something with roughly 15% fat - we used chuck) 8 thyme sprigs, leaves picked and chopped 2 tsp Dijon mustard 2 tbsp plain flour 200ml crème fraîche 1 egg yolk 6 tarragon   sprigs, leaves picked and finely chopped dressed green salad, to serve"

In [37]:
model.predict([steak_hache])[0]

'french'

In [38]:
toad_in_the_hole = "140g plain flour 3 eggs  300ml milk  2 tsp Dijon mustard 2 tbsp vegetable oil 8 Cumberland sausages 8 sage   leaves 4 rosemary   sprigs"

In [39]:
model.predict([toad_in_the_hole])[0]

'british'