In [1]:
import json
from datetime import datetime
from pprint import pprint
import numpy as np
import pandas as pd

import os
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import make_pipeline
from tqdm.notebook import tqdm

In [2]:
from itertools import product


# Load every preprocessed split of every dataset into a nested mapping:
#   datasets[<dataset name>][<'bin_' or ''><'X' or 'y'>_<'train' or 'test'>] -> DataFrame
# Each dataset is stored as eight CSV files (features/targets x train/test x
# binarized/raw) under ./data_preprocessed/.
datasets = {}
for name in ('airline', 'hotel', 'heart'):
    # Swap the tuple above for ('heart',) to iterate quickly on a single dataset.
    datasets[name] = {
        f'{typ}{part}_{split}': pd.read_csv(f'./data_preprocessed/{name}_{typ}{part}_{split}.csv')
        for part, split, typ in product(('X', 'y'), ('train', 'test'), ('bin_', ''))
    }

# datasets['airline']['categorical'] = [
#     'Gender',
#     'Customer Type',
#     # 'Type of Travel',
#     # 'Class',
# ]

# datasets['hotel']['categorical'] = [
#     'deposit_type',
#     'customer_type',
#     # 'arrival_date_year',
#     'market_segment',
#     # 'arrival_date_month',
#     # 'arrival_date_day_of_month',
#     'meal',
#     'hotel'
# ]

# datasets['heart']['categorical'] = [
#     'Diabetes',
# ]

NameError: name 'load_heart' is not defined

In [None]:
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lazy_fca_sklearn_wrapper import MyBinarizedBinaryClassifier, MyPatternBinaryClassifier

# Registry of the models under comparison.
# Each entry maps a short model label to a pair:
#   (unfitted estimator, hyper-parameter grid searched for that estimator).
# Insertion order is the order in which the models are evaluated.
models_and_param_grids = dict(
    naive_bayes=(
        BernoulliNB(binarize=False),
        dict(alpha=[1.0]),
    ),
    xgboost=(
        XGBClassifier(random_state=0, objective='binary:logistic'),
        dict(n_estimators=[100, 200]),
    ),
    random_forest=(
        RandomForestClassifier(random_state=0),
        dict(n_estimators=[100, 200]),
    ),
    logreg=(
        LogisticRegression(random_state=0),
        dict(C=[0.1, 1], class_weight=[None, 'balanced']),
    ),
    knn=(
        KNeighborsClassifier(),
        dict(n_neighbors=[5, 10], weights=['distance']),
    ),
    catboost=(
        CatBoostClassifier(verbose=0, random_state=0),
        dict(n_estimators=[100, 200]),
    ),
    # Lazy FCA classifier working on binarized features.
    lazy_fca=(
        MyBinarizedBinaryClassifier(),
        dict(
            alpha=[0, 0.1, 0.5, 0.9, 1.0],
            method=['standard', 'standard-support', 'ratio-support'],
        ),
    ),
    # Lazy FCA classifier based on pattern structures.
    # NOTE(review): alpha is searched on a smaller grid here than for 'lazy_fca'
    # ([0, 0.1, 0.5, 0.9, 1.0]) -- presumably to limit runtime; confirm.
    lazy_fca_pat_structures=(
        MyPatternBinaryClassifier(),
        dict(
            alpha=[0.5, 0.9],
            method=['standard', 'standard-support', 'ratio-support'],
        ),
    ),
)

In [None]:
from sklearn.pipeline import FunctionTransformer


# Run a grid search for every (dataset, model) pair, score the refitted best
# estimator on the held-out test split, and persist one JSON file per pair.
results = []

# str(datetime.now()) looks like '2024-01-01 12:00:00.123456': the space and the
# colons make an awkward (and, on Windows, invalid) directory name, so build a
# filesystem-safe timestamp instead.
experiment_dir = f"./output/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S_%f')}"
os.makedirs(experiment_dir, exist_ok=True)

for dataset_name, dataset in tqdm(datasets.items(), desc='datasets'):
    for model_name, (model, param_grid) in tqdm(models_and_param_grids.items(), desc='models'):
        print(dataset_name, model_name)

        if model_name == 'lazy_fca_pat_structures':
            # The pattern-structure classifier is the only model fed the raw
            # (non-binarized) features; it is told which column positions are
            # categorical, i.e. the columns pandas parsed as bool.
            X_train = dataset['X_train']
            X_test = dataset['X_test']
            model.categorical = [
                X_train.columns.get_loc(col)
                for col in X_train.select_dtypes(bool).columns
            ]
            print(model.categorical)
            target_dtype = bool
        else:
            # 'lazy_fca' consumes boolean features; every other model gets the
            # same binarized features cast to float.
            feature_dtype = bool if model_name == 'lazy_fca' else float
            X_train = dataset['bin_X_train'].astype(feature_dtype)
            X_test = dataset['bin_X_test'].astype(feature_dtype)
            target_dtype = feature_dtype

        # Targets always come from the binarized files, whatever the feature set.
        y_train = dataset['bin_y_train'].astype(target_dtype)
        y_test = dataset['bin_y_test'].astype(target_dtype)

        # Estimators receive plain arrays; targets are flattened to 1-D.
        X_train = np.asarray(X_train)
        X_test = np.asarray(X_test)
        y_train = np.asarray(y_train).ravel()
        y_test = np.asarray(y_test).ravel()

        grid_search = GridSearchCV(
            estimator=model,
            cv=StratifiedKFold(n_splits=5),
            param_grid=param_grid,
            n_jobs=3,
            verbose=1,
            scoring=['accuracy', 'f1', 'precision', 'recall'],
            refit='f1',  # the best-F1 configuration is refitted on the full train split
            error_score='raise',  # fail loudly instead of recording NaN scores
        )

        grid_search.fit(X=X_train, y=y_train)
        y_pred = grid_search.predict(X_test)

        res = {
            'dataset': dataset_name,
            'model': model_name,
            'f1': f1_score(y_test, y_pred),
            'accuracy': accuracy_score(y_test, y_pred),
            'params': grid_search.best_params_,
        }
        display(pd.DataFrame([res]))
        # Persist each result immediately so partial runs are not lost.
        with open(f'{experiment_dir}/{model_name}_{dataset_name}.json', 'wt') as f:
            json.dump(res, f)
        results.append(res)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

airline naive_bayes
Fitting 5 folds for each of 1 candidates, totalling 5 fits


Unnamed: 0,dataset,model,f1,accuracy,params
0,airline,naive_bayes,0.745098,0.74,{'alpha': 1.0}


airline xgboost
Fitting 5 folds for each of 2 candidates, totalling 10 fits


Unnamed: 0,dataset,model,f1,accuracy,params
0,airline,xgboost,0.752688,0.77,{'n_estimators': 100}


airline random_forest
Fitting 5 folds for each of 2 candidates, totalling 10 fits


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,dataset,model,f1,accuracy,params
0,airline,random_forest,0.736842,0.75,{'n_estimators': 200}


airline logreg
Fitting 5 folds for each of 4 candidates, totalling 20 fits


Unnamed: 0,dataset,model,f1,accuracy,params
0,airline,logreg,0.77551,0.78,"{'C': 1, 'class_weight': 'balanced'}"


airline knn
Fitting 5 folds for each of 2 candidates, totalling 10 fits


Unnamed: 0,dataset,model,f1,accuracy,params
0,airline,knn,0.744681,0.76,"{'n_neighbors': 10, 'weights': 'distance'}"


airline catboost
Fitting 5 folds for each of 2 candidates, totalling 10 fits


Unnamed: 0,dataset,model,f1,accuracy,params
0,airline,catboost,0.8125,0.82,{'n_estimators': 100}


airline lazy_fca
Fitting 5 folds for each of 15 candidates, totalling 75 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,dataset,model,f1,accuracy,params
0,airline,lazy_fca,0.683333,0.62,"{'alpha': 0.1, 'method': 'standard'}"


airline lazy_fca_pat_structures
[8, 9, 10, 11]
Fitting 5 folds for each of 6 candidates, totalling 30 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,dataset,model,f1,accuracy,params
0,airline,lazy_fca_pat_structures,0.75,0.76,"{'alpha': 0.9, 'method': 'ratio-support'}"


  0%|          | 0/8 [00:00<?, ?it/s]

hotel naive_bayes
Fitting 5 folds for each of 1 candidates, totalling 5 fits


Unnamed: 0,dataset,model,f1,accuracy,params
0,hotel,naive_bayes,0.666667,0.79,{'alpha': 1.0}


hotel xgboost
Fitting 5 folds for each of 2 candidates, totalling 10 fits


Unnamed: 0,dataset,model,f1,accuracy,params
0,hotel,xgboost,0.666667,0.78,{'n_estimators': 200}


hotel random_forest
Fitting 5 folds for each of 2 candidates, totalling 10 fits


Unnamed: 0,dataset,model,f1,accuracy,params
0,hotel,random_forest,0.625,0.76,{'n_estimators': 200}


hotel logreg
Fitting 5 folds for each of 4 candidates, totalling 20 fits


Unnamed: 0,dataset,model,f1,accuracy,params
0,hotel,logreg,0.607595,0.69,"{'C': 1, 'class_weight': 'balanced'}"


hotel knn
Fitting 5 folds for each of 2 candidates, totalling 10 fits


Unnamed: 0,dataset,model,f1,accuracy,params
0,hotel,knn,0.625,0.76,"{'n_neighbors': 10, 'weights': 'distance'}"


hotel catboost
Fitting 5 folds for each of 2 candidates, totalling 10 fits


Unnamed: 0,dataset,model,f1,accuracy,params
0,hotel,catboost,0.634921,0.77,{'n_estimators': 200}


hotel lazy_fca
Fitting 5 folds for each of 15 candidates, totalling 75 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,dataset,model,f1,accuracy,params
0,hotel,lazy_fca,0.688889,0.72,"{'alpha': 0.1, 'method': 'standard'}"


hotel lazy_fca_pat_structures
[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
Fitting 5 folds for each of 6 candidates, totalling 30 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,dataset,model,f1,accuracy,params
0,hotel,lazy_fca_pat_structures,0.598291,0.53,"{'alpha': 0.9, 'method': 'ratio-support'}"


  0%|          | 0/8 [00:00<?, ?it/s]

heart naive_bayes
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Unnamed: 0,dataset,model,f1,accuracy,params
0,heart,naive_bayes,0.307692,0.82,{'alpha': 1.0}


heart xgboost
Fitting 5 folds for each of 2 candidates, totalling 10 fits


Unnamed: 0,dataset,model,f1,accuracy,params
0,heart,xgboost,0.222222,0.86,{'n_estimators': 200}


heart random_forest
Fitting 5 folds for each of 2 candidates, totalling 10 fits




Unnamed: 0,dataset,model,f1,accuracy,params
0,heart,random_forest,0.153846,0.89,{'n_estimators': 200}


heart logreg
Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))




Unnamed: 0,dataset,model,f1,accuracy,params
0,heart,logreg,0.363636,0.72,"{'C': 0.1, 'class_weight': 'balanced'}"


heart knn
Fitting 5 folds for each of 2 candidates, totalling 10 fits


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,dataset,model,f1,accuracy,params
0,heart,knn,0.4,0.91,"{'n_neighbors': 5, 'weights': 'distance'}"


heart catboost
Fitting 5 folds for each of 2 candidates, totalling 10 fits


Unnamed: 0,dataset,model,f1,accuracy,params
0,heart,catboost,0.142857,0.88,{'n_estimators': 200}


heart lazy_fca
Fitting 5 folds for each of 15 candidates, totalling 75 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))




Unnamed: 0,dataset,model,f1,accuracy,params
0,heart,lazy_fca,0.311111,0.69,"{'alpha': 0.1, 'method': 'standard'}"


heart lazy_fca_pat_structures
[7, 8, 9]
Fitting 5 folds for each of 6 candidates, totalling 30 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,dataset,model,f1,accuracy,params
0,heart,lazy_fca_pat_structures,0.311111,0.69,"{'alpha': 0.5, 'method': 'ratio-support'}"


In [None]:
# Reshape the flat list of per-run results into a model x (dataset, metric) table.
results_long = pd.DataFrame(results)
metrics_wide = results_long.pivot(
    index='model',
    columns='dataset',
    values=['f1', 'accuracy'],
)
# pivot() puts the metric on the outer column level; move the dataset outward
# and sort so that each dataset's metrics sit side by side.
df_results = metrics_wide.swaplevel(0, 1, axis=1).sort_index(axis=1)

# Display with the best value of every column highlighted.
highlighted = df_results.style.highlight_max(color='red')
highlighted.format(precision=3)

dataset,airline,airline,heart,heart,hotel,hotel
Unnamed: 0_level_1,accuracy,f1,accuracy,f1,accuracy,f1
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
catboost,0.82,0.812,0.88,0.143,0.77,0.635
knn,0.76,0.745,0.91,0.4,0.76,0.625
lazy_fca,0.62,0.683,0.69,0.311,0.72,0.689
lazy_fca_pat_structures,0.76,0.75,0.69,0.311,0.53,0.598
logreg,0.78,0.776,0.72,0.364,0.69,0.608
naive_bayes,0.74,0.745,0.82,0.308,0.79,0.667
random_forest,0.75,0.737,0.89,0.154,0.76,0.625
xgboost,0.77,0.753,0.86,0.222,0.78,0.667


In [None]:
# Write the summary table to table.tex, marking each column's maximum with the
# 'textbf:--rwrap;' LaTeX style and printing values with 3 digits of precision.
latex_styler = df_results.style.highlight_max(props='textbf:--rwrap;')
latex_styler = latex_styler.format(precision=3)
latex_styler.to_latex('table.tex')