### Multinomial Naive Bayes

This model showed some promise, tuned the hyperparameters until I seemed to level out at just over 70% accuracy on the testing score, though somewhat overfit.

Training score: 0.8547900427457883

Testing score: 0.7266499057196731

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np

In [14]:
df['cuisine'].value_counts(normalize=True)

italian         0.197063
mexican         0.161865
southern_us     0.108614
indian          0.075502
chinese         0.067205
french          0.066526
cajun_creole    0.038870
thai            0.038694
japanese        0.035777
greek           0.029542
spanish         0.024865
korean          0.020868
vietnamese      0.020742
moroccan        0.020642
british         0.020214
filipino        0.018982
irish           0.016770
jamaican        0.013225
russian         0.012294
brazilian       0.011741
Name: cuisine, dtype: float64

In [4]:
df = pd.read_json('../data/cleaned_v1.json')

In [4]:
df.shape

(39774, 4)

In [5]:
X = df['ingredients']
y = df['cuisine']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42, test_size=0.8)

In [33]:
pipe1 = Pipeline([
    ('cvec', CountVectorizer()),
    ('mnb', MultinomialNB())
])

In [49]:
pipe1_params = {
    'cvec__max_features': np.arange(3000, 6000, 30),
    'cvec__max_df': [0.9, 0.95, 0.98, 1.0],
    'cvec__min_df': [1,2,3],
    'mnb__alpha': np.linspace(0.0, 0.3, 30),
    'mnb__fit_prior': [True, False]
}

In [50]:
random_search1 = RandomizedSearchCV(estimator=pipe1, 
                            param_distributions=pipe1_params, 
                            cv=5,
                            scoring='accuracy',
                            n_jobs=-1, 
                            verbose=1)

In [51]:
random_search1.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   12.5s finished


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                             ('mnb', MultinomialNB())]),
                   n_jobs=-1,
                   param_distributions={'cvec__max_df': [0.9, 0.95, 0.98, 1.0],
                                        'cvec__max_features': array([3000, 3030, 3060, 3090, 3120, 3150, 3180, 3210, 3240, 3270, 3300,
       3330, 3360, 3390, 3420, 3450, 3480, 3510, 3540, 3570, 3600, 3630,
       3660, 3690, 3720, 3750, 3780, 3810, 3840, 3870, 390...
                                        'mnb__alpha': array([0.        , 0.01034483, 0.02068966, 0.03103448, 0.04137931,
       0.05172414, 0.06206897, 0.07241379, 0.08275862, 0.09310345,
       0.10344828, 0.1137931 , 0.12413793, 0.13448276, 0.14482759,
       0.15517241, 0.16551724, 0.17586207, 0.1862069 , 0.19655172,
       0.20689655, 0.21724138, 0.22758621, 0.23793103, 0.24827586,
       0.25862069, 0.26896552, 0.27931034, 0.28965517, 0.3 

In [52]:
random_search1.best_params_

{'mnb__fit_prior': True,
 'mnb__alpha': 0.16551724137931034,
 'cvec__min_df': 1,
 'cvec__max_features': 3900,
 'cvec__max_df': 1.0}

In [53]:
random_search1.best_score_

0.7188828670706687

In [54]:
random_search1.score(X_train, y_train), random_search1.score(X_test, y_test)

(0.8547900427457883, 0.7266499057196731)

In [16]:
pipe2 = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('mnb', MultinomialNB())
])

In [55]:
pipe2_params = {
    'tvec__max_features': np.arange(3000, 6000, 30),
    'tvec__max_df': [0.9, 0.95, 0.98, 1.0],
    'tvec__min_df': [1, 2, 3],
    'mnb__alpha': np.linspace(0.0, 0.3, 30),
    'mnb__fit_prior': [True, False]
}

In [56]:
random_search2 = RandomizedSearchCV(estimator=pipe2, 
                            param_distributions=pipe2_params, 
                            cv=5,
                            scoring='accuracy',
                            n_jobs=-1, 
                            verbose=1)

In [57]:
random_search2.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.0s finished


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                             ('mnb', MultinomialNB())]),
                   n_jobs=-1,
                   param_distributions={'mnb__alpha': array([0.        , 0.01034483, 0.02068966, 0.03103448, 0.04137931,
       0.05172414, 0.06206897, 0.07241379, 0.08275862, 0.09310345,
       0.10344828, 0.1137931 , 0.12413793, 0.13448276, 0.14482759,
       0.15517241, 0.16551724, 0.17586207, 0.18620...
       3990, 4020, 4050, 4080, 4110, 4140, 4170, 4200, 4230, 4260, 4290,
       4320, 4350, 4380, 4410, 4440, 4470, 4500, 4530, 4560, 4590, 4620,
       4650, 4680, 4710, 4740, 4770, 4800, 4830, 4860, 4890, 4920, 4950,
       4980, 5010, 5040, 5070, 5100, 5130, 5160, 5190, 5220, 5250, 5280,
       5310, 5340, 5370, 5400, 5430, 5460, 5490, 5520, 5550, 5580, 5610,
       5640, 5670, 5700, 5730, 5760, 5790, 5820, 5850, 5880, 5910, 5940,
       5970]),
                                    

In [58]:
random_search2.best_score_

0.7143567788938565

In [59]:
random_search2.best_params_ 

{'tvec__min_df': 1,
 'tvec__max_features': 4650,
 'tvec__max_df': 0.95,
 'mnb__fit_prior': False,
 'mnb__alpha': 0.19655172413793104}

In [60]:
random_search2.score(X_train, y_train), random_search2.score(X_test, y_test)

(0.858058838320342, 0.7231301068510371)