In [1]:
import pandas as pd
import numpy as np

In [2]:
def make_features(df):
    df["num_ingredients"] = df["ingredients"].apply(len)
    df["ingredient_length"] = df["ingredients"].apply(
        lambda x: np.mean([len(item) for item in x])
    )
    df["ingredients_str"] = df["ingredients"].astype(str)
    return df

In [3]:
train = make_features(pd.read_json("train.json"))
new = make_features(pd.read_json("test.json"))

In [4]:
train.head()

Unnamed: 0,id,cuisine,ingredients,num_ingredients,ingredient_length,ingredients_str
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...",9,12.0,"['romaine lettuce', 'black olives', 'grape tom..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...",11,10.090909,"['plain flour', 'ground pepper', 'salt', 'toma..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...",12,10.333333,"['eggs', 'pepper', 'salt', 'mayonaise', 'cooki..."
3,22213,indian,"[water, vegetable oil, wheat, salt]",4,6.75,"['water', 'vegetable oil', 'wheat', 'salt']"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe...",20,10.1,"['black pepper', 'shallots', 'cornflour', 'cay..."


In [5]:
train.shape

(39774, 6)

In [6]:
new.head()

Unnamed: 0,id,ingredients,num_ingredients,ingredient_length,ingredients_str
0,18009,"[baking powder, eggs, all-purpose flour, raisi...",6,9.333333,"['baking powder', 'eggs', 'all-purpose flour',..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta...",11,10.272727,"['sugar', 'egg yolks', 'corn starch', 'cream o..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil...",6,9.666667,"['sausage links', 'fennel bulb', 'fronds', 'ol..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,...",21,12.0,"['meat cuts', 'file powder', 'smoked sausage',..."
4,35687,"[ground black pepper, salt, sausage casings, l...",8,13.0,"['ground black pepper', 'salt', 'sausage casin..."


In [7]:
new.shape

(9944, 5)

In [8]:
X = train["ingredients_str"]
y = train["cuisine"]

In [9]:
X.head()

0    ['romaine lettuce', 'black olives', 'grape tom...
1    ['plain flour', 'ground pepper', 'salt', 'toma...
2    ['eggs', 'pepper', 'salt', 'mayonaise', 'cooki...
3          ['water', 'vegetable oil', 'wheat', 'salt']
4    ['black pepper', 'shallots', 'cornflour', 'cay...
Name: ingredients_str, dtype: object

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(token_pattern = r"'([a-z ]+)'")

In [11]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [12]:
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(vect, nb)

In [13]:
pipe.steps

[('countvectorizer', CountVectorizer(token_pattern="'([a-z ]+)'")),
 ('multinomialnb', MultinomialNB())]

In [14]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X, y, cv = 5,
                scoring = "accuracy", n_jobs = -1).mean()

0.7323126392849393

In [15]:
#named step__parameter
param_grid = {}
param_grid["countvectorizer__token_pattern"] = [
    r"\b\w\w+\b", r"'([a-z ]+)'"]
param_grid["multinomialnb__alpha"] = [0.5, 1]
param_grid

{'countvectorizer__token_pattern': ['\\b\\w\\w+\\b', "'([a-z ]+)'"],
 'multinomialnb__alpha': [0.5, 1]}

In [16]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, param_grid,
                    cv = 5, scoring = "accuracy",
                    n_jobs = -1)

In [17]:
%time grid.fit(X, y)

CPU times: user 1.72 s, sys: 190 ms, total: 1.91 s
Wall time: 13.1 s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('countvectorizer',
                                        CountVectorizer(token_pattern="'([a-z "
                                                                      "]+)'")),
                                       ('multinomialnb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'countvectorizer__token_pattern': ['\\b\\w\\w+\\b',
                                                            "'([a-z ]+)'"],
                         'multinomialnb__alpha': [0.5, 1]},
             scoring='accuracy')

In [18]:
grid.best_score_

0.7476492724428822

In [19]:
from sklearn.model_selection import RandomizedSearchCV
import scipy as sp

In [20]:
param_grid = {}
param_grid["countvectorizer__token_pattern"] = [
    r"\b\w\w+\b", r"'([a-z ]+)'"]
param_grid["countvectorizer__min_df"] = [1, 2, 3]
param_grid["multinomialnb__alpha"] = sp.stats.uniform(scale = 1)
param_grid

{'countvectorizer__token_pattern': ['\\b\\w\\w+\\b', "'([a-z ]+)'"],
 'countvectorizer__min_df': [1, 2, 3],
 'multinomialnb__alpha': <scipy.stats._distn_infrastructure.rv_frozen at 0x7f84630904c0>}

In [21]:
np.random.seed(1)

In [22]:
rand = RandomizedSearchCV(pipe, param_grid, cv = 5,
                          scoring = "accuracy", n_jobs = -1,
                          n_iter = 10, random_state = 1)

In [23]:
%time rand.fit(X, y)

CPU times: user 3.01 s, sys: 432 ms, total: 3.45 s
Wall time: 29.7 s


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('countvectorizer',
                                              CountVectorizer(token_pattern="'([a-z "
                                                                            "]+)'")),
                                             ('multinomialnb',
                                              MultinomialNB())]),
                   n_jobs=-1,
                   param_distributions={'countvectorizer__min_df': [1, 2, 3],
                                        'countvectorizer__token_pattern': ['\\b\\w\\w+\\b',
                                                                           "'([a-z "
                                                                           "]+)'"],
                                        'multinomialnb__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f84630904c0>},
                   random_state=1, scoring='accuracy')

In [24]:
rand.best_score_

0.7510685656857541

In [25]:
X = new["ingredients_str"]

In [26]:
rand.best_estimator_

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(min_df=2, token_pattern="'([a-z ]+)'")),
                ('multinomialnb', MultinomialNB(alpha=0.14038693859523377))])

In [27]:
new_pred_class_rand = rand.predict(X)
new_pred_class_rand

array(['british', 'southern_us', 'italian', ..., 'italian', 'southern_us',
       'mexican'], dtype='<U12')

In [33]:
X = train

In [34]:
X_dtm = vect.fit_transform(X["ingredients_str"])
X_dtm.shape

(39774, 6250)

In [35]:
X_manual = train.loc[:, ["num_ingredients", "ingredient_length"]]
X_manual.shape

(39774, 2)

In [36]:
#Creating a sparse matrix out of X_manual dataframe
X_manual_sparse = sp.sparse.csr_matrix(X_manual)

In [38]:
#Add manual sparse matrix to X_dtm which is already
#a sparse matrix
X_dtm_manual = sp.sparse.hstack([X_dtm, X_manual_sparse])
X_dtm_manual.shape

(39774, 6252)

### Converting a function to a transformer

In [39]:
def get_manual(df):
    return df.loc[:, ["num_ingredients", "ingredient_length"]]

In [40]:
get_manual(train).head()

Unnamed: 0,num_ingredients,ingredient_length
0,9,12.0
1,11,10.090909
2,12,10.333333
3,4,6.75
4,20,10.1


In [41]:
from sklearn.preprocessing import FunctionTransformer

In [42]:
get_manual_ft = FunctionTransformer(get_manual, validate = False)
type(get_manual_ft)

sklearn.preprocessing._function_transformer.FunctionTransformer

In [43]:
#How to use the converted function into transform
get_manual_ft.transform(train).head()

Unnamed: 0,num_ingredients,ingredient_length
0,9,12.0
1,11,10.090909
2,12,10.333333
3,4,6.75
4,20,10.1


In [44]:
def get_text(df):
    return df["ingredients_str"]

In [45]:
get_text_ft = FunctionTransformer(get_text, validate = False)
get_text_ft.transform(train).head()

0    ['romaine lettuce', 'black olives', 'grape tom...
1    ['plain flour', 'ground pepper', 'salt', 'toma...
2    ['eggs', 'pepper', 'salt', 'mayonaise', 'cooki...
3          ['water', 'vegetable oil', 'wheat', 'salt']
4    ['black pepper', 'shallots', 'cornflour', 'cay...
Name: ingredients_str, dtype: object

### Combining tranformed function
#### FeatureUnion works in parallel, not sequential. Like ColumnTransfomer

In [46]:
from  sklearn.pipeline import make_union

In [50]:
X_dtm = vect.fit_transform(X["ingredients_str"])
X_dtm.shape

(39774, 6250)

In [51]:
union = make_union(make_pipeline(get_text_ft, vect), get_manual_ft)
X_dtm_manual = union.fit_transform(train)
X_dtm_manual.shape

(39774, 6252)

In [52]:
type(X_dtm_manual)

scipy.sparse.csr.csr_matrix

In [53]:
cross_val_score(nb, X_dtm_manual, y,
                cv = 5, scoring = "accuracy",
                n_jobs = -1).mean()

0.7103887990767782

In [55]:
pipe = make_pipeline(union, nb)
cross_val_score(pipe, train, y,
                cv = 5, scoring = "accuracy",
                n_jobs = -1).mean()

0.7134309868165586

In [57]:
pipe.named_steps

{'featureunion': FeatureUnion(transformer_list=[('pipeline',
                                 Pipeline(steps=[('functiontransformer',
                                                  FunctionTransformer(func=<function get_text at 0x7f8465f6f790>)),
                                                 ('countvectorizer',
                                                  CountVectorizer(token_pattern="'([a-z "
                                                                                "]+)'"))])),
                                ('functiontransformer',
                                 FunctionTransformer(func=<function get_manual at 0x7f84656a0ee0>))]),
 'multinomialnb': MultinomialNB()}

In [58]:
param_grid = {}
param_grid["featureunion__pipeline__countvectorizer__token_pattern"] = [
    r"\b\w\w+\b", r"'([a-z ]+)'"]
param_grid["multinomialnb__alpha"] = [0.5, 1]
param_grid

{'featureunion__pipeline__countvectorizer__token_pattern': ['\\b\\w\\w+\\b',
  "'([a-z ]+)'"],
 'multinomialnb__alpha': [0.5, 1]}

In [60]:
grid = GridSearchCV(pipe, param_grid, cv = 5,
                    scoring = "accuracy", n_jobs = -1)
%time grid.fit(train, y)

CPU times: user 6.33 s, sys: 662 ms, total: 6.99 s
Wall time: 20.8 s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('featureunion',
                                        FeatureUnion(transformer_list=[('pipeline',
                                                                        Pipeline(steps=[('functiontransformer',
                                                                                         FunctionTransformer(func=<function get_text at 0x7f8465f6f790>)),
                                                                                        ('countvectorizer',
                                                                                         CountVectorizer(token_pattern="'([a-z "
                                                                                                                       "]+)'"))])),
                                                                       ('functiontransformer',
                                                                        FunctionTransformer(func=<function get

In [61]:
grid.best_score_

0.7426710530869912

In [62]:
grid.best_params_

{'featureunion__pipeline__countvectorizer__token_pattern': "'([a-z ]+)'",
 'multinomialnb__alpha': 0.5}

### Ensembling models

In [63]:
feature_cols = ["num_ingredients", "ingredient_length"]
X = train[feature_cols]
y = train["cuisine"]

In [64]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 800)

In [65]:
knn.fit(X, y)

KNeighborsClassifier(n_neighbors=800)

In [66]:
X_new = new[feature_cols]

In [68]:
new_pred_prob_knn = knn.predict_proba(X_new)
new_pred_prob_knn.shape

(9944, 20)

In [69]:
new_pred_prob_knn[0, :]

array([0.02625, 0.0275 , 0.01375, 0.04375, 0.03375, 0.08   , 0.0175 ,
       0.075  , 0.0275 , 0.135  , 0.01   , 0.075  , 0.01875, 0.165  ,
       0.00875, 0.0125 , 0.1525 , 0.025  , 0.0275 , 0.025  ])

In [72]:
new_pred_prob_knn[0, :].sum()

1.0

In [73]:
rand.best_estimator_

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(min_df=2, token_pattern="'([a-z ]+)'")),
                ('multinomialnb', MultinomialNB(alpha=0.14038693859523377))])

In [74]:
X_new = new["ingredients_str"]

In [76]:
new_pred_prob_rand = rand.predict_proba(X_new)
new_pred_prob_rand.shape

(9944, 20)

In [77]:
new_pred_prob_rand[0, :]

array([8.25629219e-04, 5.39506733e-01, 3.95675398e-05, 5.49166780e-05,
       3.78889216e-03, 1.06232990e-03, 3.78349379e-04, 2.35518525e-04,
       2.16854482e-01, 2.69652504e-03, 2.90265174e-04, 4.70943215e-04,
       3.62257946e-06, 6.33030455e-04, 7.44933967e-06, 3.89643766e-02,
       1.93898832e-01, 2.87636779e-04, 7.97189705e-07, 1.02361868e-07])

In [79]:
#Ensembling(averaging) manually
(new_pred_prob_rand[0, :] +
 new_pred_prob_knn[0, :]) / 2

array([0.01353781, 0.28350337, 0.00689478, 0.02190246, 0.01876945,
       0.04053116, 0.00893917, 0.03761776, 0.12217724, 0.06884826,
       0.00514513, 0.03773547, 0.00937681, 0.08281652, 0.00437872,
       0.02573219, 0.17319942, 0.01264382, 0.0137504 , 0.01250005])

In [82]:
new_pred_prob = pd.DataFrame(
    (new_pred_prob_rand + new_pred_prob_knn) / 2 ,
    columns = knn.classes_)
new_pred_prob.head()

Unnamed: 0,brazilian,british,cajun_creole,chinese,filipino,french,greek,indian,irish,italian,jamaican,japanese,korean,mexican,moroccan,russian,southern_us,spanish,thai,vietnamese
0,0.013538,0.283503,0.006895,0.021902,0.018769,0.040531,0.008939,0.037618,0.122177,0.068848,0.005145,0.037735,0.009377,0.082817,0.004379,0.025732,0.173199,0.012644,0.01375,0.0125
1,0.008751,0.010552,0.016875,0.045,0.018127,0.023782,0.015625,0.04625,0.010627,0.070625,0.005626,0.027501,0.021875,0.066875,0.008125,0.00875,0.548785,0.0075,0.025625,0.013125
2,0.012753,0.009377,0.006893,0.02,0.015001,0.039914,0.009569,0.029375,0.013211,0.387224,0.005625,0.03875,0.0075,0.080627,0.025596,0.008164,0.079375,0.182919,0.015625,0.012501
3,0.003125,0.004375,0.53375,0.03875,0.001875,0.023125,0.00625,0.075625,0.00125,0.051875,0.011875,0.008125,0.003125,0.1075,0.029375,0.001875,0.025,0.0075,0.038125,0.0275
4,0.001878,0.01006,0.020104,0.02125,0.003125,0.044661,0.0175,0.01375,0.012535,0.640837,0.003751,0.0075,0.00375,0.083128,0.004376,0.003133,0.072882,0.01828,0.014375,0.003125


In [84]:
#For each row, find the max
new_pred_class = new_pred_prob.apply(np.argmax, axis = 1)
new_pred_class.head()

0     1
1    16
2     9
3     2
4     9
dtype: int64