In [1]:
import numpy as np
import pandas as pd
import pandas_profiling

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [2]:
def evalue_model(modelo, target, base, nome_modelo):
    """Score a fitted classifier on one dataset and return its metrics.

    Parameters
    ----------
    modelo : object
        Fitted estimator (or pipeline) exposing ``predict_proba``.
    target : array-like
        True labels, aligned row by row with ``base``.
    base : array-like or DataFrame
        Features passed unchanged to ``modelo.predict_proba``.
    nome_modelo : str
        Label stored under the ``'modelo'`` key of the result.

    Returns
    -------
    dict
        Keys ``'modelo'``, ``'auc'``, ``'aucpr'`` and ``'logloss'``.
    """
    # Take column 1 of the probability matrix in a single vectorised slice
    # instead of looping over rows in Python. np.asarray keeps this working
    # when predict_proba returns a nested list rather than an ndarray.
    # Assumes a binary problem where column 1 is the positive class.
    yhat_prob = np.asarray(modelo.predict_proba(base))[:, 1]

    return {
        'modelo': nome_modelo,
        'auc': roc_auc_score(y_true=target, y_score=yhat_prob),
        'aucpr': average_precision_score(y_true=target, y_score=yhat_prob),
        'logloss': log_loss(target, yhat_prob),
    }

In [3]:
# Load the training data straight from the zipped CSV.
df = pd.read_csv(filepath_or_buffer="train.csv.zip")

In [4]:
# Optional: build a pandas-profiling HTML report of the training data (left disabled).
#profile = df.profile_report(title="Profile train.csv", explorative=True)
#profile.to_file(output_file="profile_report.html")

In [5]:
# Columns that pandas loaded with object dtype are treated as categorical features.
categorical_cols = [col for col, dtype in df.dtypes.items() if dtype == "object"]

# Columns dropped from the frame before modelling.
to_remove = ["id"]

# NOTE(review): not referenced in any cell visible here; presumably the
# categorical columns with many distinct levels. Confirm or remove.
high_cardinality = ["cat5", "cat7", "cat8", "cat10"]

In [6]:
# Reassign instead of mutating in place, and ignore labels that are already
# gone, so re-running this cell does not raise KeyError after the first run.
df = df.drop(columns=to_remove, errors="ignore")

In [7]:
# Separate the label column from the feature matrix.
y = df["target"]
X = df.drop(columns="target")

# Hold out 30% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [8]:
# Encode each categorical column as integer codes. Categories that were not
# seen during fit are mapped to the sentinel value 999 at transform time.
categorical_transformer = Pipeline(
    steps=[
        (
            "OrdinalEncoder",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=999),
        ),
    ]
)

In [9]:
# Route every categorical column through the ordinal-encoding pipeline.
# NOTE(review): no `remainder` is passed, so ColumnTransformer's default
# ("drop") applies and columns outside `categorical_cols` never reach the
# model. Confirm that leaving the numeric branch disabled is intended.
preprocessor = ColumnTransformer(
    transformers=[
        # ("num", numerical_transformer, numerical_cols),  # disabled
        ("cat", categorical_transformer, categorical_cols),
    ]
)

In [10]:
%%time
# Candidate models, keyed by their class name and all seeded with 42 so the
# comparison is repeatable. Insertion order is the order they are trained in.
classifiers = {
    type(estimator).__name__: estimator
    for estimator in (
        DecisionTreeClassifier(random_state=42),
        RandomForestClassifier(random_state=42),
        XGBClassifier(random_state=42),
        LGBMClassifier(random_state=42),
        CatBoostClassifier(random_state=42, verbose=False),
    )
}

# Collect one metrics dict per model and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# copies it on every iteration; a list of records avoids both problems and
# gives the rows a proper 0..n-1 index.
metric_records = []

for key, classifier in classifiers.items():
    print("Running", key)
    # Each candidate gets its own pipeline: preprocessing followed by the model.
    pipe = Pipeline([('preprocessor', preprocessor),
                     ('clf', classifier)])
    pipe = pipe.fit(X_train, y_train)
    # Metrics are computed on the held-out test split.
    metric_records.append(evalue_model(pipe, y_test, X_test, key))

# Test-set metrics, one row per model.
results = pd.DataFrame(metric_records, columns=["modelo", "auc", "aucpr", "logloss"])

Running DecisionTreeClassifier
Running RandomForestClassifier
Running XGBClassifier
Running LGBMClassifier
Running CatBoostClassifier
CPU times: user 5min 17s, sys: 22.3 s, total: 5min 39s
Wall time: 2min 52s


In [11]:
# Show the test-set metrics table, one row per model.
results

Unnamed: 0,modelo,auc,aucpr,logloss
0,DecisionTreeClassifier,0.712094,0.45037,7.370102
0,RandomForestClassifier,0.859432,0.731141,0.551236
0,XGBClassifier,0.882735,0.771407,0.360983
0,LGBMClassifier,0.881049,0.7688,0.363469
0,CatBoostClassifier,0.883906,0.774461,0.358983


In [None]:
# `modelo` and `base` exist only as parameters inside evalue_model, so they are
# undefined at notebook scope; use the notebook-level objects instead.
# NOTE(review): assumes the intent was the positive-class scores of the last
# fitted pipeline (`pipe`) on the test split. Confirm, or delete this cell.
yhat_prob = pipe.predict_proba(X_test)[:, 1]