In [1]:
import json
import os

import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
# from sklearn.svm import SVC
# from sklearn.multiclass import OneVsRestClassifier

### Import data

In [2]:
data_folder = '../data'
train_file = os.path.join(data_folder, 'train.json')
test_file = os.path.join(data_folder, 'test.json')

In [3]:
train = json.load(open(train_file)) 
test = json.load(open(test_file)) 

In [4]:
len(train), len(test)

(39774, 9944)

In [5]:
train[:1]

[{'id': 10259,
  'cuisine': 'greek',
  'ingredients': ['romaine lettuce',
   'black olives',
   'grape tomatoes',
   'garlic',
   'pepper',
   'purple onion',
   'seasoning',
   'garbanzo beans',
   'feta cheese crumbles']}]

### Prepare data

In [6]:
train_text = [" ".join(doc['ingredients']).lower() for doc in train]
target = [doc['cuisine'] for doc in train]
test_text = [" ".join(doc['ingredients']).lower() for doc in test]

In [7]:
train_text[:10]

['romaine lettuce black olives grape tomatoes garlic pepper purple onion seasoning garbanzo beans feta cheese crumbles',
 'plain flour ground pepper salt tomatoes ground black pepper thyme eggs green tomatoes yellow corn meal milk vegetable oil',
 'eggs pepper salt mayonaise cooking oil green chilies grilled chicken breasts garlic powder yellow onion soy sauce butter chicken livers',
 'water vegetable oil wheat salt',
 'black pepper shallots cornflour cayenne pepper onions garlic paste milk butter salt lemon juice water chili powder passata oil ground cumin boneless chicken skinless thigh garam masala double cream natural yogurt bay leaf',
 'plain flour sugar butter eggs fresh ginger root salt ground cinnamon milk vanilla extract ground ginger powdered sugar baking powder',
 'olive oil salt medium shrimp pepper garlic chopped cilantro jalapeno chilies flat leaf parsley skirt steak white vinegar sea salt bay leaf chorizo sausage',
 'sugar pistachio nuts white almond bark flour vanilla e

In [15]:
from collections import Counter
Counter(target)

Counter({'greek': 1175,
         'southern_us': 4320,
         'filipino': 755,
         'indian': 3003,
         'jamaican': 526,
         'spanish': 989,
         'italian': 7838,
         'mexican': 6438,
         'chinese': 2673,
         'british': 804,
         'thai': 1539,
         'vietnamese': 825,
         'cajun_creole': 1546,
         'brazilian': 467,
         'french': 2646,
         'japanese': 1423,
         'irish': 667,
         'korean': 830,
         'moroccan': 821,
         'russian': 489})

### Feature engineering

In [16]:
tfidf = TfidfVectorizer(binary=True, stop_words='english', min_df=2)

In [10]:
X = tfidf.fit_transform(train_text)
X = X.astype('float16')

In [11]:
X

<39774x3010 sparse matrix of type '<class 'numpy.float16'>'
	with 761951 stored elements in Compressed Sparse Row format>

In [13]:
lb = LabelEncoder()
y = lb.fit_transform(target)

In [14]:
y

array([ 6, 16,  4, ...,  8,  3, 13])

### Build the model

In [19]:
classifier = SVC(C=100, # penalty parameter
    kernel='rbf', # kernel type, rbf working fine here
    degree=3, # default value
    gamma=1, # kernel coefficient
    coef0=1, # change to 1 from default value of 0.0
    shrinking=True, # using shrinking heuristics
    tol=0.001, # stopping criterion tolerance 
    probability=False, # no need to enable probability estimates
    cache_size=200, # 200 MB cache size
    class_weight=None, # all classes are treated equally 
    verbose=False, # print the logs 
    max_iter=1000, # no limit, let it run
#     max_iter=-1, # no limit, let it run
    decision_function_shape=None, # will use one vs rest explicitly 
    random_state=None)
model = OneVsRestClassifier(classifier, n_jobs=4)

In [20]:
## Model Tuning 
# parameters = {"estimator__gamma":[0.01, 0.5, 0.1, 2, 5]}
# grid_search = GridSearchCV(model, param_grid=parameters)
# grid_search.fit(X, y)
# print grid_search.best_score_
# print grid_search.best_params_
####

In [21]:
model.fit(X, y)

OneVsRestClassifier(estimator=SVC(C=100, cache_size=200, class_weight=None, coef0=1,
  decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
  max_iter=1000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          n_jobs=4)

In [22]:
y_predict = model.predict(X)

In [25]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_predict)

array([[ 467,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,  804,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0, 1546,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0, 2666,    0,    2,    0,    0,    0,    2,    0,
           0,    0,    0,    0,    0,    3,    0,    0,    0],
       [   0,    0,    0,    0,  754,    0,    0,    0,    0,    1,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    1,    0,    1,    0, 2445,    0,    0,    0,  116,    0,
           0,    0,    6,    0,    0,   77,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0, 1173,    0,    0,    2,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    