In [71]:
import pandas as pd
import numpy as np

#Plotting Libraries
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression,  LogisticRegressionCV
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, recall_score, precision_score
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_json('../data/cleaned_v1.json')

In [3]:
df.dtypes

id                   int64
cuisine             object
ingredients         object
ingredient_count     int64
dtype: object

In [4]:
X = df['ingredients']
y = df['cuisine']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [12]:
pipe = Pipeline([('cvec', CountVectorizer()), ('log', LogisticRegression(max_iter = 2111))])

In [13]:
pipe_params_cvec = {'cvec__max_features': [500, 1000],
    'cvec__min_df': [2],
    'cvec__max_df': [.90, .95],
    'cvec__ngram_range': [(1,2), (2,3)]
}

In [14]:
gs = GridSearchCV(pipe, 
                  pipe_params_cvec,          #parameters values. 
                  cv = 3,
                  n_jobs = -2,
                  verbose = 1 )      

In [15]:
gs.fit(X_train, y_train);


Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  24 out of  24 | elapsed:  2.0min finished


In [16]:
gs.score(X_train, y_train)

0.8102581293999329

In [17]:
gs.score(X_test, y_test)

0.7207361222847949

In [18]:
gs.best_params_

{'cvec__max_df': 0.9,
 'cvec__max_features': 1000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 2)}

In [19]:
log_preds = gs.predict(X_test)

In [89]:
coef_names = gs.best_estimator_.named_steps['cvec'].get_feature_names()
coef_vals = gs.best_estimator_.named_steps['log'].coef_[0]

coef_df = pd.DataFrame ({
    'coefs' : coef_names,
    'vals' : coef_vals
}).set_index('coefs')

coef_df.reindex(coef_df['vals'].abs().sort_values(ascending=False).index)[:20]

Unnamed: 0_level_0,vals
coefs,Unnamed: 1_level_1
blackbeans,3.016799
lime,2.886635
sweetenedcondensedmilk,2.56456
coconutmilk,2.365526
collardgreens,2.263042
unsweetenedcoconutmilk,2.122995
bananas,1.978985
superfinesugar,1.914742
unsweetenedcocoapowder,1.69658
parmesancheese,1.625179


In [28]:
print ("Accuracy : ", accuracy_score(y_test, log_preds)) 

Accuracy :  0.7207361222847949


**Lasso Model**

In [85]:
lasso_pipe = Pipeline([('tfid', TfidfVectorizer(
    max_df=.325,
    max_features=6500,
    min_df=5,
    ngram_range=(1, 2))),
    ('log', LogisticRegression(penalty='none',
                                 solver='lbfgs',
                                 max_iter = 2111,
                                 verbose=0
                                ))])

In [81]:
lasso_pipe.fit(X_train, y_train);

In [82]:
lasso_pipe.score(X_train, y_train)

0.9992960107274556

In [83]:
lasso_pipe.score(X_test, y_test)

0.6908688656476267

**Complementary EDA**

In [95]:
df['cuisine']

0              greek
1        southern_us
2           filipino
3             indian
4             indian
            ...     
39769          irish
39770        italian
39771          irish
39772        chinese
39773        mexican
Name: cuisine, Length: 39774, dtype: object

In [100]:
print("The total number of distinct cuisines : ", len(df.cuisine.unique()))
print("These cusisines are : ", pd.DataFrame(df.cuisine.unique()))

The total number of distinct cuisines :  20
These cusisines are :                 0
0          greek
1    southern_us
2       filipino
3         indian
4       jamaican
5        spanish
6        italian
7        mexican
8        chinese
9        british
10          thai
11    vietnamese
12  cajun_creole
13     brazilian
14        french
15      japanese
16         irish
17        korean
18      moroccan
19       russian


In [110]:
#How many ingredients per cuisine?

n_ingredients_cuisine = df.groupby('cuisine')['id'].count().sort_values(ascending =False).reset_index().rename(columns={'cuisine': 'Cuisine' , 'id':'Total Ingredients'})

In [111]:
n_ingredients_cuisine

Unnamed: 0,Cuisine,Total Ingredients
0,italian,7838
1,mexican,6438
2,southern_us,4320
3,indian,3003
4,chinese,2673
5,french,2646
6,cajun_creole,1546
7,thai,1539
8,japanese,1423
9,greek,1175
