In [20]:
import pandas as pd
import numpy as np

In [21]:
def make_features(df):
    """Return a copy of ``df`` with ingredient-derived feature columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``ingredients`` column whose cells are sized
        collections of strings (one string per ingredient).

    Returns
    -------
    pandas.DataFrame
        A new frame holding every original column plus, in this order:

        * ``num_ingredients``   - ``len()`` of each ingredient collection.
        * ``ingredient_length`` - mean character length of the items in each
          collection; ``NaN`` when the collection is empty.
        * ``ingredients_str``   - ``str()`` of each ingredient collection.

    Notes
    -----
    The input frame is not modified, so re-running the calling cell always
    starts from the same raw data. Use the returned frame.
    """
    ingredients = df["ingredients"]

    def _mean_item_length(items):
        # Guard the empty case explicitly: np.mean([]) also yields NaN but
        # emits a RuntimeWarning while doing so.
        if len(items) == 0:
            return np.nan
        return np.mean([len(item) for item in items])

    # assign() builds a new frame, leaving the caller's frame untouched.
    return df.assign(
        num_ingredients=ingredients.apply(len),
        ingredient_length=ingredients.apply(_mean_item_length),
        ingredients_str=ingredients.astype(str),
    )

In [22]:
# Read each JSON split and derive the ingredient features in one chained step.
train = pd.read_json("train.json").pipe(make_features)
new = pd.read_json("test.json").pipe(make_features)

In [23]:
# Preview the first rows of the training frame, including the engineered columns.
train.head()

Unnamed: 0,id,cuisine,ingredients,num_ingredients,ingredient_length,ingredients_str
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...",9,12.0,"['romaine lettuce', 'black olives', 'grape tom..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...",11,10.090909,"['plain flour', 'ground pepper', 'salt', 'toma..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...",12,10.333333,"['eggs', 'pepper', 'salt', 'mayonaise', 'cooki..."
3,22213,indian,"[water, vegetable oil, wheat, salt]",4,6.75,"['water', 'vegetable oil', 'wheat', 'salt']"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe...",20,10.1,"['black pepper', 'shallots', 'cornflour', 'cay..."


In [24]:
# Check the training frame's (rows, columns) dimensions.
train.shape

(39774, 6)

In [25]:
# Preview the first rows of the frame loaded from test.json (it has no cuisine column).
new.head()

Unnamed: 0,id,ingredients,num_ingredients,ingredient_length,ingredients_str
0,18009,"[baking powder, eggs, all-purpose flour, raisi...",6,9.333333,"['baking powder', 'eggs', 'all-purpose flour',..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta...",11,10.272727,"['sugar', 'egg yolks', 'corn starch', 'cream o..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil...",6,9.666667,"['sausage links', 'fennel bulb', 'fronds', 'ol..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,...",21,12.0,"['meat cuts', 'file powder', 'smoked sausage',..."
4,35687,"[ground black pepper, salt, sausage casings, l...",8,13.0,"['ground black pepper', 'salt', 'sausage casin..."


In [26]:
# Check the (rows, columns) dimensions of the frame loaded from test.json.
new.shape

(9944, 5)

In [27]:
# Model input is the stringified ingredient list; the target is the cuisine label.
X, y = train["ingredients_str"], train["cuisine"]

In [28]:
# Confirm that each sample is a single string holding the quoted ingredient names.
X.head()

0    ['romaine lettuce', 'black olives', 'grape tom...
1    ['plain flour', 'ground pepper', 'salt', 'toma...
2    ['eggs', 'pepper', 'salt', 'mayonaise', 'cooki...
3          ['water', 'vegetable oil', 'wheat', 'salt']
4    ['black pepper', 'shallots', 'cornflour', 'cay...
Name: ingredients_str, dtype: object

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

# Tokenize on the single-quoted items of the stringified list: the capture
# group keeps the text between the quotes, so a multi-word ingredient such as
# 'romaine lettuce' is counted as one token.
# NOTE(review): the character class only allows lowercase letters and spaces,
# so quoted items containing anything else (e.g. the hyphen in
# 'all-purpose flour') are not matched by this pattern.
vect = CountVectorizer(
    token_pattern=r"'([a-z ]+)'",
)

In [30]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes classifier, created with its default hyperparameters.
nb = MultinomialNB()

In [31]:
from sklearn.pipeline import make_pipeline
# Chain the vectorizer and the classifier into one estimator so that both are
# fit together inside every cross-validation fold.
pipe = make_pipeline(vect, nb)

In [32]:
# Show the (name, estimator) pairs; the step names are the prefixes used in
# the grid-search parameter keys.
pipe.steps

[('countvectorizer', CountVectorizer(token_pattern="'([a-z ]+)'")),
 ('multinomialnb', MultinomialNB())]

In [33]:
from sklearn.model_selection import cross_val_score

# Baseline: accuracy of the untuned pipeline on each of 5 folds, then averaged.
fold_scores = cross_val_score(
    pipe,
    X,
    y,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)
fold_scores.mean()

0.7323126392849393

In [34]:
# Grid-search keys are spelled "<pipeline step name>__<parameter name>".
param_grid = {
    # Two word-level/ingredient-level tokenizations to compare.
    "countvectorizer__token_pattern": [r"\b\w\w+\b", r"'([a-z ]+)'"],
    # Smoothing strengths to try for the naive Bayes step.
    "multinomialnb__alpha": [0.5, 1],
}
param_grid

{'countvectorizer__token_pattern': ['\\b\\w\\w+\\b', "'([a-z ]+)'"],
 'multinomialnb__alpha': [0.5, 1]}

In [35]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over param_grid, scoring each combination by 5-fold
# accuracy and using all available cores.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)

In [36]:
# Fit every parameter combination in the grid; %time reports the CPU and wall time of this statement.
%time grid.fit(X, y)

CPU times: user 2.01 s, sys: 245 ms, total: 2.26 s
Wall time: 16.2 s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('countvectorizer',
                                        CountVectorizer(token_pattern="'([a-z "
                                                                      "]+)'")),
                                       ('multinomialnb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'countvectorizer__token_pattern': ['\\b\\w\\w+\\b',
                                                            "'([a-z ]+)'"],
                         'multinomialnb__alpha': [0.5, 1]},
             scoring='accuracy')

In [None]:
# Predict a cuisine for every recipe loaded from test.json. predict() needs
# the samples to score, and they must have the same form the search was fit
# on: the stringified ingredient lists.
y_pred_class = grid.predict(new["ingredients_str"])