### Bootstrap bagging

A couple random searches with different hyperparameters were very unpromising, very overfit, so I moved on to other models.

Training score: 0.9639624539054643

Testing score: 0.5854823647726456

In [17]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score

In [8]:
df = pd.read_json('../data/cleaned_v1.json')

In [9]:
X = df['ingredients']
y = df['cuisine']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y, test_size=0.7)

In [11]:
pipe1 = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('bag', BaggingClassifier())
])

In [12]:
pipe1_params = {
    'tvec__max_features': [3500, 4000],
    'tvec__max_df': [0.9, 0.95],
    'tvec__min_df': [0.01],
    'bag__base_estimator': [DecisionTreeClassifier(), ExtraTreeClassifier()],
    'bag__n_estimators': [10, 30, 50],
    'bag__bootstrap': [True, False]
}

In [13]:
rs1 = RandomizedSearchCV(estimator=pipe1,
                        param_distributions=pipe1_params,
                        cv=5,
                        scoring="accuracy",
                        n_jobs=-1,
                        verbose=1)

In [14]:
rs1.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  5.5min finished


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                             ('bag', BaggingClassifier())]),
                   n_jobs=-1,
                   param_distributions={'bag__base_estimator': [DecisionTreeClassifier(),
                                                                ExtraTreeClassifier()],
                                        'bag__bootstrap': [True, False],
                                        'bag__n_estimators': [10, 30, 50],
                                        'tvec__max_df': [0.9, 0.95],
                                        'tvec__max_features': [3500, 4000],
                                        'tvec__min_df': [0.01]},
                   scoring='accuracy', verbose=1)

In [15]:
rs1.best_score_

0.5840604194766918

In [16]:
rs1.best_params_

{'tvec__min_df': 0.01,
 'tvec__max_features': 3500,
 'tvec__max_df': 0.95,
 'bag__n_estimators': 30,
 'bag__bootstrap': True,
 'bag__base_estimator': ExtraTreeClassifier()}

In [18]:
rs1.score(X_train, y_train), rs1.score(X_test, y_test)

(0.9639624539054643, 0.5854823647726456)