In [1]:
import json
from datetime import datetime
from pprint import pprint
import numpy as np
import pandas as pd

import os
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import make_pipeline
from tqdm.notebook import tqdm

In [2]:
import heart, hotel, airline

# Sizes handed to every dataset loader below
# (presumably the number of train / test rows — confirm in the loader modules).
train_size = 700
test_size = 300

# One (dataset name, loader output, binarizer output) triple per dataset.
# The experiment loop unpacks the loader output as
# (X_train, X_test, y_train, y_test) and splats the binarizer output into
# make_pipeline as the leading pipeline steps.
data_and_transformers = [
    (dataset_name, dataset_module.load(train_size, test_size), dataset_module.binarizer())
    for dataset_name, dataset_module in (
        ('heart', heart),
        ('airline', airline),
        ('hotel', hotel),
    )
]

In [3]:
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lazy_fca_sklearn_wrapper import MyBinarizedBinaryClassifier, MyPatternBinaryClassifier

# Registry of the compared models: short model name -> (estimator, param_grid).
# The grid keys carry a `<lowercased class name>__` prefix because each
# estimator is later wrapped with make_pipeline, which names its steps after
# the lowercased class name of the step.
models_and_param_grids = dict(
    naive_bayes=(
        BernoulliNB(binarize=False),
        {'bernoullinb__alpha': [1.0]},
    ),
    xgboost=(
        XGBClassifier(random_state=0, objective='binary:logistic'),
        {'xgbclassifier__n_estimators': [100]},
    ),
    random_forest=(
        RandomForestClassifier(random_state=0),
        {'randomforestclassifier__n_estimators': [10]},
    ),
    logreg=(
        LogisticRegression(random_state=0),
        {'logisticregression__C': [0.1]},
    ),
    knn=(
        KNeighborsClassifier(),
        {
            'kneighborsclassifier__n_neighbors': [5],
            'kneighborsclassifier__weights': ['distance'],
        },
    ),
    catboost=(
        CatBoostClassifier(verbose=0, random_state=0),
        {'catboostclassifier__n_estimators': [100]},
    ),
    # The only model with a real search space: 5 alphas x 3 methods = 15 candidates.
    lazy_fca=(
        MyBinarizedBinaryClassifier(),
        {
            'mybinarizedbinaryclassifier__alpha': [0, 0.1, 0.5, 0.9, 1.0],
            'mybinarizedbinaryclassifier__method': ['standard', 'standard-support', 'ratio-support'],
        },
    ),
    # Disabled: pattern-structure variant of the lazy FCA classifier.
    # NOTE(review): the grid keys below still use the
    # `mybinarizedbinaryclassifier__` prefix; with MyPatternBinaryClassifier the
    # pipeline step would presumably be named `mypatternbinaryclassifier` —
    # confirm before re-enabling.
    # lazy_fca_pat_structures=(
    #     MyPatternBinaryClassifier(),
    #     {
    #         'mybinarizedbinaryclassifier__alpha': [0, 0.1, 0.5, 0.9, 1.0],
    #         'mybinarizedbinaryclassifier__method': ['standard', 'standard-support', 'ratio-support'],
    #     },
    # ),
)

In [4]:
# FunctionTransformer's public home is sklearn.preprocessing; sklearn.pipeline
# only happens to import it internally, so importing it from there is fragile.
from sklearn.preprocessing import FunctionTransformer


def _to_bool_array(features):
    """Return `features` converted to a boolean NumPy array."""
    return np.asarray(features).astype(bool)


# Flat list of per-(dataset, model) result records; pivoted into a table later.
results = []

# Use a filesystem-safe timestamp: str(datetime.now()) contains spaces and
# colons, and colons are not allowed in Windows path names.
experiment_dir = os.path.join('.', 'output', datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
# The timestamp has one-second resolution, so tolerate a quick re-run.
os.makedirs(experiment_dir, exist_ok=True)

for data_name, data, transformers in tqdm(data_and_transformers):
    # The split depends only on the dataset, so unpack it once per dataset.
    X_train, X_test, y_train, y_test = data

    for model_name, (model, param_grid) in tqdm(models_and_param_grids.items()):
        print(data_name, model_name)

        if model_name == 'lazy_fca':
            # The lazy FCA classifier is fed boolean features and boolean
            # labels (presumably a requirement of the wrapper — confirm).
            to_bool = FunctionTransformer(
                _to_bool_array,
                feature_names_out='one-to-one'
            )
            pipeline = make_pipeline(*transformers, to_bool, model)
            # Separate names keep the boolean cast from leaking into the
            # labels used by the other models of the same dataset.
            y_fit = np.asarray(y_train).astype(bool)
            y_eval = np.asarray(y_test).astype(bool)
        else:
            pipeline = make_pipeline(*transformers, model)
            y_fit, y_eval = y_train, y_test

        # 5-fold stratified CV over the model's grid; the candidate with the
        # best mean F1 is refitted on the whole training split.
        grid_search = GridSearchCV(
            estimator=pipeline,
            cv=StratifiedKFold(n_splits=5),
            param_grid=param_grid,
            n_jobs=3,
            verbose=1,
            scoring=['accuracy', 'f1', 'precision', 'recall'],
            refit='f1',
            error_score='raise'  # fail loudly instead of scoring a broken fit as NaN
        )

        grid_search.fit(X=X_train, y=y_fit)
        # predict() uses the refitted best estimator.
        y_pred = grid_search.predict(X_test)

        # Held-out test metrics plus the selected hyper-parameters.
        res = {
            'dataset': data_name,
            'model': model_name,
            'f1': f1_score(y_eval, y_pred),
            'accuracy': accuracy_score(y_eval, y_pred),
            'params': grid_search.best_params_
        }
        display(pd.DataFrame([res]))

        # Persist every record immediately so partial results survive a crash.
        result_path = os.path.join(experiment_dir, f'{model_name}_{data_name}.json')
        with open(result_path, 'w', encoding='utf-8') as f:
            json.dump(res, f)
        results.append(res)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

heart naive_bayes
Fitting 5 folds for each of 1 candidates, totalling 5 fits


Unnamed: 0,dataset,model,f1,accuracy,params
0,heart,naive_bayes,0.266667,0.816667,{'bernoullinb__alpha': 1.0}


heart xgboost
Fitting 5 folds for each of 1 candidates, totalling 5 fits


Unnamed: 0,dataset,model,f1,accuracy,params
0,heart,xgboost,0.26087,0.886667,{'xgbclassifier__n_estimators': 100}


heart random_forest
Fitting 5 folds for each of 1 candidates, totalling 5 fits


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,dataset,model,f1,accuracy,params
0,heart,random_forest,0.125,0.906667,{'randomforestclassifier__n_estimators': 10}


heart logreg
Fitting 5 folds for each of 1 candidates, totalling 5 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,dataset,model,f1,accuracy,params
0,heart,logreg,0.129032,0.91,{'logisticregression__C': 0.1}


heart knn
Fitting 5 folds for each of 1 candidates, totalling 5 fits


Unnamed: 0,dataset,model,f1,accuracy,params
0,heart,knn,0.058824,0.893333,"{'kneighborsclassifier__n_neighbors': 5, 'knei..."


heart catboost
Fitting 5 folds for each of 1 candidates, totalling 5 fits


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,dataset,model,f1,accuracy,params
0,heart,catboost,0.117647,0.9,{'catboostclassifier__n_estimators': 100}


heart lazy_fca
Fitting 5 folds for each of 15 candidates, totalling 75 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,dataset,model,f1,accuracy,params
0,heart,lazy_fca,0.308943,0.716667,"{'mybinarizedbinaryclassifier__alpha': 0.5, 'm..."


  0%|          | 0/7 [00:00<?, ?it/s]

airline naive_bayes
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Unnamed: 0,dataset,model,f1,accuracy,params
0,airline,naive_bayes,0.855072,0.866667,{'bernoullinb__alpha': 1.0}


airline xgboost
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Unnamed: 0,dataset,model,f1,accuracy,params
0,airline,xgboost,0.901818,0.91,{'xgbclassifier__n_estimators': 100}


airline random_forest
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Unnamed: 0,dataset,model,f1,accuracy,params
0,airline,random_forest,0.85283,0.87,{'randomforestclassifier__n_estimators': 10}


airline logreg
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Unnamed: 0,dataset,model,f1,accuracy,params
0,airline,logreg,0.88968,0.896667,{'logisticregression__C': 0.1}


airline knn
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Unnamed: 0,dataset,model,f1,accuracy,params
0,airline,knn,0.897527,0.903333,"{'kneighborsclassifier__n_neighbors': 5, 'knei..."


airline catboost
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Unnamed: 0,dataset,model,f1,accuracy,params
0,airline,catboost,0.916364,0.923333,{'catboostclassifier__n_estimators': 100}


airline lazy_fca
Fitting 5 folds for each of 15 candidates, totalling 75 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,dataset,model,f1,accuracy,params
0,airline,lazy_fca,0.891304,0.9,"{'mybinarizedbinaryclassifier__alpha': 0, 'myb..."


  0%|          | 0/7 [00:00<?, ?it/s]

hotel naive_bayes
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Unnamed: 0,dataset,model,f1,accuracy,params
0,hotel,naive_bayes,0.643902,0.756667,{'bernoullinb__alpha': 1.0}


hotel xgboost
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Unnamed: 0,dataset,model,f1,accuracy,params
0,hotel,xgboost,0.648148,0.746667,{'xgbclassifier__n_estimators': 100}




hotel random_forest
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Unnamed: 0,dataset,model,f1,accuracy,params
0,hotel,random_forest,0.613065,0.743333,{'randomforestclassifier__n_estimators': 10}




hotel logreg
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Unnamed: 0,dataset,model,f1,accuracy,params
0,hotel,logreg,0.628571,0.783333,{'logisticregression__C': 0.1}




hotel knn
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Unnamed: 0,dataset,model,f1,accuracy,params
0,hotel,knn,0.570048,0.703333,"{'kneighborsclassifier__n_neighbors': 5, 'knei..."


hotel catboost
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Unnamed: 0,dataset,model,f1,accuracy,params
0,hotel,catboost,0.639175,0.766667,{'catboostclassifier__n_estimators': 100}




hotel lazy_fca
Fitting 5 folds for each of 15 candidates, totalling 75 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,dataset,model,f1,accuracy,params
0,hotel,lazy_fca,0.651376,0.746667,"{'mybinarizedbinaryclassifier__alpha': 1.0, 'm..."


In [5]:
# Reshape the flat per-run records into one row per model, with a column for
# every (metric, dataset) pair.
metrics_by_model = pd.DataFrame(results).pivot(
    index='model',
    columns='dataset',
    values=['f1', 'accuracy'],
)

# pivot() puts the metric on the outer column level; swap the two levels so
# the dataset is outermost, then sort the columns.
df_results = metrics_by_model.swaplevel(axis=1).sort_index(axis=1)

# Show the table with each column's maximum highlighted and three decimals.
df_results.style.highlight_max(color='red').format(precision=3)

dataset,airline,airline,heart,heart,hotel,hotel
Unnamed: 0_level_1,accuracy,f1,accuracy,f1,accuracy,f1
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
catboost,0.923,0.916,0.9,0.118,0.767,0.639
knn,0.903,0.898,0.893,0.059,0.703,0.57
lazy_fca,0.9,0.891,0.717,0.309,0.747,0.651
logreg,0.897,0.89,0.91,0.129,0.783,0.629
naive_bayes,0.867,0.855,0.817,0.267,0.757,0.644
random_forest,0.87,0.853,0.907,0.125,0.743,0.613
xgboost,0.91,0.902,0.887,0.261,0.747,0.648


In [6]:
# Export the results table to LaTeX: each column's maximum is wrapped in
# \textbf{...} and values are rendered with three decimals.
latex_styler = df_results.style.highlight_max(props='textbf:--rwrap;')
latex_styler = latex_styler.format(precision=3)
latex_styler.to_latex('table.tex')