# Import dependencies

In [1]:
import numpy as np
import pandas as pd
import pandas_profiling

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss

from hyperopt import fmin, hp, tpe, Trials, space_eval
from hyperopt.pyll import scope as ho_scope
from hyperopt.pyll.stochastic import sample as ho_sample
from functools import partial

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import OneSidedSelection, NeighbourhoodCleaningRule, TomekLinks

import category_encoders as ce

In [2]:
def evalue_model(model, y_test, X_test, model_name):
    """Score a fitted binary classifier on a hold-out set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    y_test : true labels aligned with ``X_test``.
    X_test : features handed to ``model.predict_proba``.
    model_name : label stored under the ``'model'`` key of the result.

    Returns
    -------
    dict
        Keys ``'model'``, ``'auc'`` (ROC AUC), ``'aucpr'`` (average
        precision) and ``'logloss'``.
    """
    # The entry at index 1 of each predict_proba row is used as the score.
    positive_scores = []
    for proba_row in model.predict_proba(X_test):
        positive_scores.append(proba_row[1])

    auc = roc_auc_score(y_true=y_test, y_score=positive_scores)
    aucpr = average_precision_score(y_true=y_test, y_score=positive_scores)
    logloss = log_loss(y_test, positive_scores)

    return {'model': model_name, 'auc': auc, 'aucpr': aucpr, 'logloss': logloss}

In [3]:
# Disabled loads of the sample submission and the test set.
# NOTE(review): the last cell of this notebook uses `submission` and `new_data`,
# which stay undefined while the two lines below are commented out.
#submission = pd.read_csv('../input/tabular-playground-series-mar-2021/sample_submission.csv')
#new_data = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv')

# Training data is read from a zipped CSV given by a relative path.
train_csv = 'train.csv.zip'
df = pd.read_csv(train_csv)

In [4]:
#profile = df.profile_report(title="Profile train.csv", explorative=True)
#profile.to_file(output_file="profile_report.html")

# Prepare Data

In [5]:

# Columns dropped from the frame before modelling.
to_remove = ["id"]

# Categorical columns that get the James-Stein (target-based) encoder
# instead of plain ordinal codes.
high_cardinality = ["cat5", "cat7", "cat8", "cat10"]

# All remaining object-dtype columns, in their original frame order.
# An order-preserving filter is used instead of a set difference: set
# iteration order for strings can change between kernel restarts, which
# would make the encoded feature order (and thus model results)
# non-reproducible.
categorical_cols = [
    col
    for col in df.columns[df.dtypes == "object"]
    if col not in high_cardinality
]

In [6]:
# Re-bind instead of mutating in place, and ignore already-missing columns,
# so re-running this cell does not raise KeyError once "id" is gone.
df = df.drop(columns=to_remove, errors="ignore")
#new_data = new_data.drop(columns=to_remove, errors="ignore")

In [7]:
# Separate the label column from the feature columns.
y = df['target']
X = df.drop(columns='target')

# Seeded, stratified split that keeps 20% of the rows as the hold-out test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Baseline Models

In [8]:
%%time

# Two-stage categorical encoding: ordinal codes for `categorical_cols`,
# James-Stein target encoding for the `high_cardinality` columns.
categorical_transformer = Pipeline(steps=[
    ('OrdinalEncoder', ce.OrdinalEncoder(cols=categorical_cols)),
    ('JamesSteinEncoder', ce.JamesSteinEncoder(cols=high_cardinality))
])

# NOTE(review): only the listed categorical columns are forwarded to the
# classifier; ColumnTransformer's default remainder='drop' discards every
# other column of X. Confirm this is intended, or add the numeric
# transformer / remainder='passthrough'.
preprocessor = ColumnTransformer(
    transformers=[
        #('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols + high_cardinality)
])

# Untuned baselines, all seeded with the same random_state.
classifiers = {
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state = 42),
    "RandomForestClassifier": RandomForestClassifier(random_state = 42),
    "XGBClassifier": XGBClassifier(random_state = 42, use_label_encoder=False),
    "LGBMClassifier": LGBMClassifier(random_state = 42),
    "CatBoostClassifier": CatBoostClassifier(random_state = 42, verbose = False)
}

# Collect one metrics dict per model and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and growing a frame inside a
# loop is quadratic.
baseline_records = []

for key, classifier in classifiers.items():
    print("Running", key)
    pipe = Pipeline([('preprocessor', preprocessor),
                     ('clf', classifier)])
    pipe = pipe.fit(X_train, y_train)
    baseline_records.append(evalue_model(pipe, y_test, X_test, key))

results = pd.DataFrame(baseline_records, columns=["model", "auc", "aucpr", "logloss"])

Running DecisionTreeClassifier


  elif pd.api.types.is_categorical(cols):


Running RandomForestClassifier


  elif pd.api.types.is_categorical(cols):


Running XGBClassifier


  elif pd.api.types.is_categorical(cols):


Parameters: { use_label_encoder } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Running LGBMClassifier


  elif pd.api.types.is_categorical(cols):


Running CatBoostClassifier


  elif pd.api.types.is_categorical(cols):


CPU times: user 6min 2s, sys: 27.7 s, total: 6min 30s
Wall time: 3min 24s


In [9]:
# Baseline hold-out metrics, one row per classifier.
results

Unnamed: 0,model,auc,aucpr,logloss
0,DecisionTreeClassifier,0.71416,0.454221,7.269377
0,RandomForestClassifier,0.861844,0.73783,0.544299
0,XGBClassifier,0.883809,0.776103,0.358358
0,LGBMClassifier,0.882859,0.773769,0.360421
0,CatBoostClassifier,0.88518,0.778127,0.356693


# Tuning

In [10]:
# Prepare the validation data.
# The train portion is re-derived from X / y (same seed and parameters as the
# hold-out split) instead of re-splitting the current X_train, so re-running
# this cell does not shrink the training set a little more each time.
# NOTE: these parameters must mirror the hold-out split cell.
X_train_full, _, y_train_full, _ = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42, stratify=y_train_full)

In [11]:
# Resampling strategy applied before the classifier (None = no resampling).
undersample_choice = hp.choice(label = 'sample', options = [None, 'tomek', 'ncr', 'oss'])

# LightGBM hyperparameters explored by the search.
lgbm_param_space = {
    #'n_estimators': ho_scope.int(hp.quniform('n_estimators',100,600,100)), # auto
    #'learning_rate': hp.loguniform('learning_rate',np.log(1e-5),np.log(0.05)), # auto

    'max_depth': ho_scope.int(hp.quniform('max_depth', 2, 63, 1)),
    # 2**k - 1 for k = 4..11, i.e. 15, 31, ..., 2047
    'num_leaves': hp.choice(label = 'power', options = [2 ** k - 1 for k in range(4, 12)]),
    #'min_child_weight':  ho_scope.int(hp.quniform('min_child_weight',0,X_train.shape[0]/100,1)),
    'max_delta_step': ho_scope.int(hp.quniform('max_delta_step', 1, 10, 1)),

    # sampling
    'feature_fraction': hp.uniform('colsample_bytree', 0.4, 0.9),

    # regularisation, sampled on a log scale
    'reg_lambda': hp.loguniform('reg_lambda', np.log(1e-4), np.log(5)),
    'reg_alpha': hp.loguniform('reg_alpha', np.log(1e-4), np.log(5)),
    'min_gain_to_split': hp.loguniform('min_gain_to_split', np.log(1e-4), np.log(2)),

    #'scale_pos_weight' : ho_scope.int(hp.loguniform('scale_pos_weight',np.log(1),np.log(scale_pos_weight_max))),
}

# Full search space: resampler choice plus the nested LightGBM parameters.
hp_space_lgbm = {
    'undersample': undersample_choice,
    'lgbm': lgbm_param_space,
}

In [12]:
# Trials object handed to fmin (as `trials=`) in the optimisation cell.
iteracoes_lgbm = Trials()

In [13]:
def instancia_modelo(hiperparametros):
    """Build the unfitted preprocessing + resampling + LightGBM pipeline.

    Parameters
    ----------
    hiperparametros : dict
        ``hiperparametros['lgbm']`` holds keyword arguments forwarded to
        ``LGBMClassifier``. ``hiperparametros['undersample']`` selects the
        resampler: ``'adasyn'``, ``'tomek'``, ``'ncr'`` or ``'oss'``; any
        other value (including ``None``) disables resampling.

    Returns
    -------
    imblearn.pipeline.Pipeline
        Steps ``'preprocessor'``, ``'undersample'`` and ``'clf'``.
    """
    clf = LGBMClassifier(**hiperparametros['lgbm'],
                         n_jobs = -1,
                         random_state = 42,
                         objective = "binary",
                         n_estimators = 100,
                         bagging_freq = 1,
                         learning_rate = 0.1)

    # Factories (not instances) so only the selected resampler is constructed.
    sampler_factories = {
        # ADASYN: adaptive synthetic (over)sampling
        'adasyn': lambda: ADASYN(random_state=42),
        # Tomek Links: remove ambiguous examples
        'tomek': lambda: TomekLinks(),
        # Neighbourhood Cleaning Rule: Condensed Nearest Neighbour (drops
        # redundant examples) + Edited Nearest Neighbours (drops noisy or
        # ambiguous examples)
        'ncr': lambda: NeighbourhoodCleaningRule(n_neighbors=3, threshold_cleaning=0.5),
        # One-Sided Selection: Tomek Links + Condensed Nearest Neighbour.
        # Seeded because it draws random seed samples; without random_state
        # two runs of the same trial could give different results.
        'oss': lambda: OneSidedSelection(n_neighbors=1, n_seeds_S=200, random_state=42),
    }
    make_sampler = sampler_factories.get(hiperparametros['undersample'])
    undersample = make_sampler() if make_sampler is not None else None

    categorical_transformer = Pipeline(steps=[
        ('OrdinalEncoder', ce.OrdinalEncoder(cols=categorical_cols)),
        ('JamesSteinEncoder', ce.JamesSteinEncoder(cols=high_cardinality))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            #('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, high_cardinality + categorical_cols)
    ])

    modelo = Pipeline([('preprocessor', preprocessor),
                       ('undersample', undersample),
                       ('clf', clf)])

    # Return the assembled (unfitted) model object.
    return modelo

In [14]:
## objective function: cross-validates the model built from one hyperparameter sample
def funcao_para_minimizar(hiperparametros, features, target):
    """Loss for hyperopt: negative mean ROC AUC over 5 stratified folds.

    Parameters
    ----------
    hiperparametros : dict
        One sample from the search space, passed to ``instancia_modelo``.
    features : feature matrix used for cross-validation.
    target : labels aligned with ``features``.

    Returns
    -------
    float
        ``-mean(AUC)`` across the folds (lower is better).

    Raises
    ------
    Any exception raised while fitting or scoring a fold
    (``error_score="raise"``).
    """
    # Build a fresh model for this hyperparameter combination.
    modelo = instancia_modelo(hiperparametros)

    cv = StratifiedKFold(n_splits=5)

    # Train and score the model with cross-validation.
    resultado = cross_val_score(estimator = modelo,
                                X = features,
                                y = target,
                                scoring = "roc_auc",
                                cv = cv,
                                error_score = "raise",
                                n_jobs = -1)

    # hyperopt's fmin minimises its objective, so the AUC (higher is better)
    # is negated; returning the raw mean would select the worst model.
    return -resultado.mean()

In [15]:
%%time

## run the optimisation
# Bind the training data so fmin only has to supply the hyperparameters.
objetivo = partial(funcao_para_minimizar, features=X_train, target=y_train)

otimizacao = fmin(
    fn=objetivo,
    space=hp_space_lgbm,
    algo=tpe.suggest,
    max_evals=10,
    trials=iteracoes_lgbm,
    rstate=np.random.RandomState(42),
)

  0%|          | 0/10 [03:06<?, ?trial/s, best loss=?]


KeyboardInterrupt: 

In [None]:
# Final estimator: CatBoost had the highest AUC in the baseline results table above.
final_model = CatBoostClassifier(random_state = 42, verbose = False)

In [None]:
# Refit the preprocessing + final classifier on all labelled rows (X, y).
final_steps = [('preprocessor', preprocessor), ('clf', final_model)]
pipe = Pipeline(final_steps).fit(X, y)

In [None]:
# NOTE(review): `submission` and `new_data` are only created by read_csv
# lines that are commented out in the data-loading cell; this cell raises
# NameError on a fresh kernel until they are loaded.
test_scores = pipe.predict_proba(new_data)[:, 1]
submission.loc[:, 'target'] = test_scores
submission.to_csv('submission.csv', index=False)