# LazyPredict Model Screening

Inspired by https://towardsdatascience.com/how-to-run-30-machine-learning-models-with-2-lines-of-code-d0f94a537e52

In [1]:
import pandas as pd
import numpy as np

import datetime as dt
from typing import Tuple, List, Dict

from sklearn.metrics import auc, roc_curve


from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler, LabelEncoder

import pyforest

from functools import partial

import warnings
warnings.filterwarnings('ignore')

C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.IPBC74C7KURV7CB2PKT5Z5FNR3SIBV4J.gfortran-win_amd64.dll
C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
  stacklevel=1)


In [2]:
# main flow
# Record the wall-clock start time; the last cell subtracts it from the finish
# time to report how long the whole notebook took.
start_time = dt.datetime.now()
print("Started at ", start_time)

Started at  2021-03-22 12:13:19.905929


In [3]:
# read data
# Environment switch passed to get_data_file_path: True selects the Kaggle
# competition input paths, False selects the local data/ directory.
in_kaggle = False


def get_data_file_path(is_in_kaggle: bool) -> Tuple[str, str, str]:
    """
    Resolve the locations of the competition CSV files.
    :param is_in_kaggle: truthy when running inside the Kaggle competition, falsy when running locally
    :return: tuple of (train path, test path, sample submission path)
    """
    # One entry per environment, each ordered as (train, test, sample submission)
    paths_by_environment = {
        # running in Kaggle, inside the competition
        True: (
            '../input/tabular-playground-series-mar-2021/train.csv',
            '../input/tabular-playground-series-mar-2021/test.csv',
            '../input/tabular-playground-series-mar-2021/sample_submission.csv',
        ),
        # running locally
        False: (
            'data/train.csv',
            'data/test.csv',
            'data/sample_submission.csv',
        ),
    }

    # bool() keeps the original truthiness semantics for non-bool arguments
    return paths_by_environment[bool(is_in_kaggle)]


In [4]:
%%time
# get the training set and labels
train_set_path, test_set_path, sample_subm_path = get_data_file_path(in_kaggle)

train = pd.read_csv(train_set_path)
test = pd.read_csv(test_set_path)
# Keep a separate handle on the label column; it is used as `y` further down.
target = train.target

# Test-set row ids, saved before preprocessing drops the 'id' column.
# NOTE(review): presumably kept for building a submission; not used in the cells visible here.
test_id = test.id

# Sample submission file.
# NOTE(review): not used in the cells visible here.
subm = pd.read_csv(sample_subm_path)

Wall time: 2.32 s


In [5]:
# Show the column names, non-null counts and dtypes of the raw training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 32 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      300000 non-null  int64  
 1   cat0    300000 non-null  object 
 2   cat1    300000 non-null  object 
 3   cat2    300000 non-null  object 
 4   cat3    300000 non-null  object 
 5   cat4    300000 non-null  object 
 6   cat5    300000 non-null  object 
 7   cat6    300000 non-null  object 
 8   cat7    300000 non-null  object 
 9   cat8    300000 non-null  object 
 10  cat9    300000 non-null  object 
 11  cat10   300000 non-null  object 
 12  cat11   300000 non-null  object 
 13  cat12   300000 non-null  object 
 14  cat13   300000 non-null  object 
 15  cat14   300000 non-null  object 
 16  cat15   300000 non-null  object 
 17  cat16   300000 non-null  object 
 18  cat17   300000 non-null  object 
 19  cat18   300000 non-null  object 
 20  cont0   300000 non-null  float64
 21  cont1   30

In [6]:
# Columns removed from both frames during preprocessing
cols_to_drop = ['id', 'cat5', 'cat7', 'cat8', 'cat10']

# Retained categorical / continuous features, selected by column-name substring
cat_cols = [
    column for column in train.columns
    if column not in cols_to_drop and 'cat' in column
]
cont_cols = [
    column for column in train.columns
    if column not in cols_to_drop and 'cont' in column
]

# Model inputs: categorical features first, then continuous ones
all_features = [*cat_cols, *cont_cols]

def preprocess(df, encoder=None,
               scaler=None, cols_to_drop=None,
               cols_to_encode=None, cols_to_scale=None):
    """
    Preprocess input data without modifying the caller's DataFrame.

    The encoder and scaler are re-fitted (``fit_transform``) on every column
    of every call, so a mapping learned in one call is not reused in another.

    :param df: DataFrame with data (left untouched; a copy is processed)
    :param encoder: encoder object with fit_transform method, or None to skip encoding
    :param scaler: scaler object with fit_transform method, or None to skip scaling
    :param cols_to_drop: columns to be removed (None or empty: nothing is dropped)
    :param cols_to_encode: columns to be encoded (None: nothing is encoded)
    :param cols_to_scale: columns to be scaled (None: nothing is scaled)
    :return: new DataFrame with the requested columns encoded, scaled and dropped
    """
    # Work on a copy: the column assignments below would otherwise overwrite
    # the frame the caller passed in.
    df = df.copy()

    if encoder is not None:
        # `or []` tolerates an encoder being supplied without a column list
        for col in cols_to_encode or []:
            df[col] = encoder.fit_transform(df[col])

    if scaler is not None:
        for col in cols_to_scale or []:
            # the scaler is handed the column as a single-column 2-D array
            df[col] = scaler.fit_transform(df[col].values.reshape(-1, 1))

    # explicit emptiness check so any sized collection of labels is accepted
    if cols_to_drop is not None and len(cols_to_drop) > 0:
        df = df.drop(cols_to_drop, axis=1)

    return df

In [7]:
# preprocess the data
# NOTE(review): train and test each receive a fresh LabelEncoder/StandardScaler
# and preprocess() calls fit_transform, so the test frame is encoded and scaled
# with mappings fitted on the test data alone; they are not guaranteed to match
# the mappings used for train. Confirm before predicting on `test`.
# NOTE(review): 'id' is dropped here, so re-running this cell on the already
# processed frames is expected to fail; reload the data first.
train = preprocess(train, encoder=LabelEncoder(), scaler=StandardScaler(),
                  cols_to_drop=cols_to_drop, cols_to_encode=cat_cols,
                  cols_to_scale=cont_cols)

test = preprocess(test, encoder=LabelEncoder(), scaler=StandardScaler(),
                 cols_to_drop=cols_to_drop, cols_to_encode=cat_cols,
                 cols_to_scale=cont_cols)

In [8]:
# pre-requisite setup
# Feature matrix limited to the retained categorical + continuous columns;
# labels are the `target` series captured when the data was loaded.
X = train[all_features]
y = target

# view accuracy
# NOTE(review): accuracy_score is not referenced in the cells visible here.
from sklearn.metrics import accuracy_score

In [9]:
# train-test split
# Hold out part of the labelled data (default split size) for scoring;
# random_state fixes the split so the screening is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# model screening with the lazypredict
import lazypredict
from lazypredict.Supervised import LazyClassifier

# Fit LazyClassifier's built-in set of classifiers on the training split and
# score each on the hold-out split. The recorded run below fitted 29 models in
# roughly 3.5 hours, with SVC alone taking about 6600 s.
clf = LazyClassifier(verbose=0,ignore_warnings=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
# last expression of the cell: display the per-model score table
models

100%|███████████████████████████████████████████████████████████████████████████████| 29/29 [3:28:51<00:00, 432.13s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NearestCentroid,0.82,0.78,0.78,0.82,0.53
BernoulliNB,0.83,0.78,0.78,0.83,0.63
XGBClassifier,0.85,0.77,0.77,0.84,16.57
LGBMClassifier,0.85,0.77,0.77,0.84,2.46
ExtraTreesClassifier,0.85,0.77,0.77,0.84,51.18
RandomForestClassifier,0.85,0.77,0.77,0.84,111.23
LinearDiscriminantAnalysis,0.84,0.77,0.77,0.83,1.38
AdaBoostClassifier,0.84,0.76,0.76,0.84,25.17
GaussianNB,0.81,0.76,0.76,0.82,0.67
SVC,0.84,0.76,0.76,0.84,6629.62


In [10]:
# Report the finish time and the total wall-clock duration of the notebook,
# measured from the start_time recorded in the "main flow" cell.
print('We are done. That is all, folks!')
finish_time = dt.datetime.now()
print("Finished at ", finish_time)
elapsed = finish_time - start_time
print("Elapsed time: ", elapsed)

We are done. That is all, folks!
Finished at  2021-03-22 15:42:18.069457
Elapsed time:  3:28:58.163528


# Takeaways from LazyClassifier Screening

The screening suggests that, in addition to GBDT models, the following models are worth trying as members of the ensemble:

- RF classifier (see RF hyperopt example per https://www.kaggle.com/virajbagal/eda-xgb-random-forest-parameter-tuning-hyperopt)
- ExtraTreesClassifier

Less accurate, but still potentially worth exploring:
- AdaBoost classifier
- LinearDiscriminantAnalysis classifier
- Ridge
- Logistic Regression
- SVC
- CalibratedClassifier
- SGDClassifier