In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, confusion_matrix, plot_confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('../data/ing1.csv')

In [4]:
df.head()

Unnamed: 0,id,cuisine,ingredients,ingredient_count
0,10259,greek,romainelettuce blackolives grapetomatoes garli...,9
1,25693,southern_us,plainflour groundpepper salt tomatoes groundbl...,11
2,20130,filipino,eggs pepper salt mayonaise cookingoil greenchi...,12
3,22213,indian,water vegetableoil wheat salt,4
4,13162,indian,blackpepper shallots cornflour cayennepepper o...,20


In [5]:
X = df.ingredients
y = df.cuisine

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [None]:
XGBClassifier()

In [6]:
pipe1 = Pipeline([
    ('cvec', CountVectorizer()),
    ('xgb', XGBClassifier())
])

In [14]:
params = {
    'cvec__min_df': np.linspace(0, .5, 20),
    'cvec__max_df': np.linspace(.9, 1, 10),
    'cvec__max_features': range(300, 5000, 15),
    'xgb__learning_rate': np.linspace(.01, .3, 10),
    'xgb__min_child_weight': [.5, 1, 2, 3],
    'xgb__max_depth': range(3, 12),
    'xgb__gamma': np.linspace(0, 1, 10),
    'xgb__objective': ['multi:softmax'],
    'xgb__eval_metric': ['auc'],
}

In [15]:
rs1 = RandomizedSearchCV(pipe1, param_distributions=params, n_iter=100, n_jobs=-1, verbose=1)

In [16]:
rs1.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 12.8min finished


RandomizedSearchCV(estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                             ('xgb',
                                              XGBClassifier(base_score=None,
                                                            booster=None,
                                                            colsample_bylevel=None,
                                                            colsample_bynode=None,
                                                            colsample_bytree=None,
                                                            gamma=None,
                                                            gpu_id=None,
                                                            importance_type='gain',
                                                            interaction_constraints=None,
                                                            learning_rate=None,
                                                            max_delta_ste

In [17]:
y_hat = rs1.predict(X_test)

In [19]:
rs1.score(X_train, y_train), rs1.score(X_test, y_test)

(0.8766677841099564, 0.7498994368463395)

In [20]:
pipe2 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('xgb', XGBClassifier())
])

In [None]:
TfidfVectorizer()

In [22]:
params2 = {
    'tfidf__min_df': np.linspace(0, .5, 20),
    'tfidf__max_df': np.linspace(.9, 1, 10),
    'tfidf__max_features': range(300, 5000, 15),
    'xgb__learning_rate': np.linspace(.01, .3, 10),
    'xgb__min_child_weight': [.5, 1, 2, 3],
    'xgb__max_depth': range(3, 12),
    'xgb__gamma': np.linspace(0, 1, 10),
    'xgb__objective': ['multi:softmax'],
    'xgb__eval_metric': ['auc'],
}

In [23]:
rs2 = RandomizedSearchCV(pipe2, param_distributions=params2, n_iter=100, n_jobs=-1, verbose=1)

In [24]:
rs2.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   45.4s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed: 14.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 18.8min finished


RandomizedSearchCV(estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                             ('xgb',
                                              XGBClassifier(base_score=None,
                                                            booster=None,
                                                            colsample_bylevel=None,
                                                            colsample_bynode=None,
                                                            colsample_bytree=None,
                                                            gamma=None,
                                                            gpu_id=None,
                                                            importance_type='gain',
                                                            interaction_constraints=None,
                                                            learning_rate=None,
                                                            max_delta_st

In [25]:
 rs2.score(X_train, y_train), rs2.score(X_test, y_test)

(0.9, 0.7364239742558326)

In [39]:
df.cuisine.value_counts(normalize=True)

italian         0.197063
mexican         0.161865
southern_us     0.108614
indian          0.075502
chinese         0.067205
french          0.066526
cajun_creole    0.038870
thai            0.038694
japanese        0.035777
greek           0.029542
spanish         0.024865
korean          0.020868
vietnamese      0.020742
moroccan        0.020642
british         0.020214
filipino        0.018982
irish           0.016770
jamaican        0.013225
russian         0.012294
brazilian       0.011741
Name: cuisine, dtype: float64

In [40]:
rs1.best_params_

{'xgb__objective': 'multi:softmax',
 'xgb__min_child_weight': 1,
 'xgb__max_depth': 11,
 'xgb__learning_rate': 0.1711111111111111,
 'xgb__gamma': 0.7777777777777777,
 'xgb__eval_metric': 'auc',
 'cvec__min_df': 0.0,
 'cvec__max_features': 2595,
 'cvec__max_df': 0.9}