In [1]:
import json
from datetime import datetime
from pathlib import Path
from pprint import pprint

import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import make_pipeline
from tqdm.notebook import tqdm

In [2]:
from heart import *
from hotel import *
from airline import *

# NOTE(review): the star imports are kept because it is not visible from here
# which module provides which name (later cells may also rely on names they
# re-export) -- confirm before switching to explicit imports.

# One row per dataset: (label, loader callable, transformer-factory callable).
_dataset_specs = (
    ('heart', load_heart, transformers_heart),
    ('airline', load_airline, transformers_airline),
    ('hotel', load_hotel, transformers_hotel),
)

# Each entry is (label, loaded data, transformers); for every dataset the
# loader is called first, then the transformer factory.
data_and_transformers = [
    (label, load(), make_transformers())
    for label, load, make_transformers in _dataset_specs
]

In [3]:
from models import models_and_param_grids

In [4]:
# Benchmark every (dataset, model) pair:
#   1. build a pipeline from the dataset's transformers plus the model,
#   2. tune it with a 5-fold stratified grid search (refit on the best F1),
#   3. score the refit pipeline on the held-out test split,
#   4. persist the scores as one JSON file per run and collect them in `results`.

# Create the output folder up front: open() does not create missing parent
# directories, so a fresh checkout would otherwise fail after the first fit.
OUTPUT_DIR = Path('./output')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

results = []

for data_name, data, transformers in tqdm(data_and_transformers, desc='datasets'):
    # `data` is a 4-tuple; it does not depend on the model, so unpack it once
    # per dataset instead of once per model.
    X_train, X_test, y_train, y_test = data

    for model_name, (model, param_grid) in tqdm(models_and_param_grids.items(), desc=data_name):

        pipeline = make_pipeline(*transformers, model)
        grid_search = GridSearchCV(
            estimator=pipeline,
            cv=StratifiedKFold(n_splits=5),
            param_grid=param_grid,
            n_jobs=3,
            verbose=1,
            scoring=['accuracy', 'f1', 'precision', 'recall'],
            refit='f1',  # with several scorers, refit names the one that picks the final model
            error_score='raise'  # surface failing fits immediately instead of scoring them as NaN
        )
        display(grid_search)

        grid_search.fit(X=X_train, y=y_train)
        y_pred = grid_search.predict(X_test)

        res = {
            'dataset': data_name,
            'model': model_name,
            'f1': f1_score(y_test, y_pred),
            'accuracy': accuracy_score(y_test, y_pred)
        }
        display(pd.DataFrame([res]))

        # str(datetime.now()) contains a space and ':' characters, which are not
        # valid in Windows file names; use a filesystem-safe, sortable timestamp.
        timestamp = datetime.now().strftime('%Y-%m-%dT%H-%M-%S-%f')
        with open(OUTPUT_DIR / f'{timestamp}_{model_name}_{data_name}.json', 'wt') as f:
            json.dump(res, f)
        results.append(res)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

Fitting 5 folds for each of 1 candidates, totalling 5 fits


Unnamed: 0,dataset,model,f1,accuracy
0,heart,nb,0.384615,0.84


Fitting 5 folds for each of 1 candidates, totalling 5 fits


Unnamed: 0,dataset,model,f1,accuracy
0,heart,xgb,0.275862,0.895


Fitting 5 folds for each of 1 candidates, totalling 5 fits


Unnamed: 0,dataset,model,f1,accuracy
0,heart,rf,0.384615,0.92


Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,dataset,model,f1,accuracy
0,heart,logreg,0.357143,0.91


Fitting 5 folds for each of 1 candidates, totalling 5 fits


Unnamed: 0,dataset,model,f1,accuracy
0,heart,knn,0.275862,0.895


Fitting 5 folds for each of 1 candidates, totalling 5 fits


Unnamed: 0,dataset,model,f1,accuracy
0,heart,catboost,0.296296,0.905


  0%|          | 0/6 [00:00<?, ?it/s]

Fitting 5 folds for each of 1 candidates, totalling 5 fits




Unnamed: 0,dataset,model,f1,accuracy
0,airline,nb,0.789474,0.84


Fitting 5 folds for each of 1 candidates, totalling 5 fits




Unnamed: 0,dataset,model,f1,accuracy
0,airline,xgb,0.905405,0.93


Fitting 5 folds for each of 1 candidates, totalling 5 fits




Unnamed: 0,dataset,model,f1,accuracy
0,airline,rf,0.882759,0.915


Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,dataset,model,f1,accuracy
0,airline,logreg,0.853333,0.89


Fitting 5 folds for each of 1 candidates, totalling 5 fits




Unnamed: 0,dataset,model,f1,accuracy
0,airline,knn,0.868421,0.9


Fitting 5 folds for each of 1 candidates, totalling 5 fits




Unnamed: 0,dataset,model,f1,accuracy
0,airline,catboost,0.910345,0.935


  0%|          | 0/6 [00:00<?, ?it/s]

Fitting 5 folds for each of 1 candidates, totalling 5 fits


Unnamed: 0,dataset,model,f1,accuracy
0,hotel,nb,0.511628,0.685


Fitting 5 folds for each of 1 candidates, totalling 5 fits


Unnamed: 0,dataset,model,f1,accuracy
0,hotel,xgb,0.632353,0.75


Fitting 5 folds for each of 1 candidates, totalling 5 fits


Unnamed: 0,dataset,model,f1,accuracy
0,hotel,rf,0.615385,0.75


Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,dataset,model,f1,accuracy
0,hotel,logreg,0.517857,0.73


Fitting 5 folds for each of 1 candidates, totalling 5 fits


Unnamed: 0,dataset,model,f1,accuracy
0,hotel,knn,0.555556,0.72


Fitting 5 folds for each of 1 candidates, totalling 5 fits


Unnamed: 0,dataset,model,f1,accuracy
0,hotel,catboost,0.619048,0.76


In [5]:
# Turn the flat list of result records into a wide table: one row per model,
# one column per (metric, dataset) pair.
metrics_by_model = pd.DataFrame(results).pivot(
    index='model', columns='dataset', values=['f1', 'accuracy']
)

# Make the dataset the outer column level and order the columns so that each
# dataset's metrics sit next to each other.
df_results = metrics_by_model.swaplevel(axis=1).sort_index(axis=1)

# Render with the maximum of each column highlighted, at 3 decimal places.
df_results.style.highlight_max().format(precision=3)

dataset,airline,airline,heart,heart,hotel,hotel
Unnamed: 0_level_1,accuracy,f1,accuracy,f1,accuracy,f1
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
catboost,0.935,0.91,0.905,0.296,0.76,0.619
knn,0.9,0.868,0.895,0.276,0.72,0.556
logreg,0.89,0.853,0.91,0.357,0.73,0.518
nb,0.84,0.789,0.84,0.385,0.685,0.512
rf,0.915,0.883,0.92,0.385,0.75,0.615
xgb,0.93,0.905,0.895,0.276,0.75,0.632


In [6]:
# Export the results table to LaTeX: the per-column maximum is wrapped in
# \textbf{...} and values are printed with 3 decimals.
latex_styler = df_results.style
latex_styler = latex_styler.highlight_max(props='textbf:--rwrap;')
latex_styler = latex_styler.format(precision=3)
latex_styler.to_latex('table.tex')