# Modeling - ethereum - v5 - Comparação

# Setup

## Library import
We import all the required Python libraries

In [2]:
import os
import time

# Data manipulation
from feature_engine.encoding import RareLabelEncoder, CountFrequencyEncoder
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np

# Visualizations
import plotly
import plotly.graph_objs as go
import plotly.offline as ply
plotly.offline.init_notebook_mode(connected=True)
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
from lightgbm import (
    LGBMClassifier, plot_importance, create_tree_digraph, plot_tree
)
import missingno as msno
# from pycaret.classification import ClassificationExperiment
from sklearn.experimental import enable_halving_search_cv
from sklearn.feature_selection import RFECV
from sklearn.model_selection import (
    train_test_split, RandomizedSearchCV, GridSearchCV, HalvingGridSearchCV, cross_validate, KFold
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from ydata_profiling import ProfileReport
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve,
    accuracy_score, precision_score, recall_score, f1_score
)
from sklearn import set_config

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
    
%autoreload 2

# Options for pandas
set_config(transform_output = "pandas")
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.5f}'.format)
# pd.options.display.float_format = '{:.5f}'.format
# pd.options.display.max_rows = 120

sns.set_context(context='paper', font_scale=2, rc=None)
sns.set_style("ticks")
sns.set_palette(sns.color_palette())

## Local library import
We import all the required local libraries libraries

In [3]:
os.chdir('../')
from src.utils.data_describe import breve_descricao, serie_nulos, cardinalidade, check_for_equal_columns
os.chdir('./notebooks/')

def round_4(x):
    return x.round(4)

# Parameter definition
We set all relevant parameters for our notebook. By convention, parameters are uppercase, while all the 
other variables follow Python's guidelines.

In [4]:
RAW_FOLDER = '../data/raw/'
INTERIM_FOLDER = '../data/interim/'
PROCESSED_FOLDER = '../data/processed/'
REPORTS_FOLDER = '../reports/'
RANDOM_STATE = 42

train_size = 0.15

# palette = sns.color_palette("Spectral", as_cmap=True)
palette = sns.color_palette("husl", 10)
palette

# Experience 01 - Only numerical attributes

## Data import

In [5]:
df_train_exp_01 = pd.read_parquet(INTERIM_FOLDER + 'df_train_exp_01.pqt')
df_test_exp_01 = pd.read_parquet(INTERIM_FOLDER + 'df_test_exp_01.pqt')

display(df_train_exp_01.head(3))

Unnamed: 0,Address,FLAG,Avg_min_between_sent_tnx,Avg_min_between_received_tnx,Time_Diff_between_first_and_last_(Mins),Sent_tnx,Received_Tnx,Number_of_Created_Contracts,Unique_Received_From_Addresses,Unique_Sent_To_Addresses,min_value_received,max_value_received,avg_val_received,min_val_sent,max_val_sent,avg_val_sent,total_transactions_(including_tnx_to_create_contract),total_Ether_sent,total_ether_received,total_ether_balance,Total_ERC20_tnxs,ERC20_total_Ether_received,ERC20_total_ether_sent,ERC20_total_Ether_sent_contract,ERC20_uniq_sent_addr,ERC20_uniq_rec_addr,ERC20_uniq_rec_contract_addr,ERC20_avg_time_between_contract_tnx,ERC20_min_val_rec,ERC20_max_val_rec,ERC20_avg_val_rec,ERC20_min_val_sent,ERC20_max_val_sent,ERC20_avg_val_sent,ERC20_uniq_sent_token_name,ERC20_uniq_rec_token_name
0,0x0020731604c882cf7bf8c444be97d17b19ea4316,1,1457.31,34.12,4815.43,3,13,0,10,3,1.0,2.50105,1.34844,1.00087,11.27787,5.84292,16,17.52875,17.52978,0.00104,,,,,,,,,,,,,,,,
1,0x002bf459dc58584d58886169ea0e80f3ca95ffaf,1,3976.5,834.77,9622.53,2,2,0,1,2,0.58627,0.94751,0.76689,0.58541,0.94728,0.76635,4,1.53269,1.53378,0.00109,1.0,1.337,0.0,0.0,0.0,1.0,1.0,0.0,1.337,1.337,1.337,0.0,0.0,0.0,0.0,1.0
2,0x002f0c8119c16d310342d869ca8bf6ace34d9c39,1,112.9,31.87,321.42,2,3,0,3,1,0.00102,0.8178,0.43961,0.50039,0.81751,0.65895,5,1.3179,1.31882,0.00092,1.0,1.337,0.0,0.0,0.0,1.0,1.0,0.0,1.337,1.337,1.337,0.0,0.0,0.0,0.0,1.0


# Categorical features encoded by frequency

## Data import

In [6]:
df_raw = pd.read_parquet(INTERIM_FOLDER + 'ethereum_complete.pqt')

df_train_exp_02 = pd.read_parquet(INTERIM_FOLDER + 'df_cleaned_train_exp_02_transformed.pqt')
df_test_exp_02 = pd.read_parquet(INTERIM_FOLDER + 'df_cleaned_test_exp_02_transformed.pqt')

X_train_exp_02 = df_train_exp_02.drop(columns=['FLAG'])
y_train_exp_02 = df_train_exp_02[['FLAG']]

X_test_exp_02 = df_test_exp_02.drop(columns=['FLAG'])
y_test_exp_02 = df_test_exp_02[['FLAG']]

print(f"""
X_train_exp_02: {X_train_exp_02.shape}
y_train_exp_02: {y_train_exp_02.shape}

X_test_exp_02: {X_test_exp_02.shape}
y_test_exp_02: {y_test_exp_02.shape}
""")

display(X_train_exp_02.head(3))


X_train_exp_02: (3974, 36)
y_train_exp_02: (3974, 1)

X_test_exp_02: (702, 36)
y_test_exp_02: (702, 1)



Unnamed: 0,Avg_min_between_sent_tnx,Avg_min_between_received_tnx,Time_Diff_between_first_and_last_(Mins),Sent_tnx,Received_Tnx,Number_of_Created_Contracts,Unique_Received_From_Addresses,Unique_Sent_To_Addresses,min_value_received,max_value_received,avg_val_received,min_val_sent,max_val_sent,avg_val_sent,total_transactions_(including_tnx_to_create_contract),total_Ether_sent,total_ether_received,total_ether_balance,Total_ERC20_tnxs,ERC20_total_Ether_received,ERC20_total_ether_sent,ERC20_total_Ether_sent_contract,ERC20_uniq_sent_addr,ERC20_uniq_rec_addr,ERC20_uniq_rec_contract_addr,ERC20_avg_time_between_contract_tnx,ERC20_min_val_rec,ERC20_max_val_rec,ERC20_avg_val_rec,ERC20_min_val_sent,ERC20_max_val_sent,ERC20_avg_val_sent,ERC20_uniq_sent_token_name,ERC20_uniq_rec_token_name,ERC20_most_sent_token_type,ERC20_most_rec_token_type
3692,0.0,0.0,537.6,1,1,0,1,1,1.99,1.99,1.99,1.98975,1.98975,1.98975,2,1.98975,1.99,0.00025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.89155,0.51636
152,23488.73,33.36,48178.53,2,36,0,26,2,0.01038,9.99,1.33005,0.07579,48.30435,24.19007,38,48.38014,47.88187,-0.49828,2.0,1.337,0.0,0.0,0.0,2.0,2.0,0.0,0.0,1.337,0.6685,0.0,0.0,0.0,0.0,2.0,0.89155,0.16709
1175,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,1.337,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.337,0.76024,0.0,0.0,0.0,0.0,1.0,0.89155,0.51636


In [7]:
df_raw = pd.read_parquet(INTERIM_FOLDER + 'ethereum_complete.pqt')

df_train_exp_02 = pd.read_parquet(INTERIM_FOLDER + 'df_cleaned_train_exp_02_transformed.pqt')
df_test_exp_02 = pd.read_parquet(INTERIM_FOLDER + 'df_cleaned_test_exp_02_transformed.pqt')

X_train_exp_02 = df_train_exp_02.drop(columns=['FLAG'])
y_train_exp_02 = df_train_exp_02[['FLAG']]

X_test_exp_02 = df_test_exp_02.drop(columns=['FLAG'])
y_test_exp_02 = df_test_exp_02[['FLAG']]

print(f"""
X_train_exp_02: {X_train_exp_02.shape}
y_train_exp_02: {y_train_exp_02.shape}

X_test_exp_02: {X_test_exp_02.shape}
y_test_exp_02: {y_test_exp_02.shape}
""")

display(X_train_exp_02.head(3))


X_train_exp_02: (3974, 36)
y_train_exp_02: (3974, 1)

X_test_exp_02: (702, 36)
y_test_exp_02: (702, 1)



Unnamed: 0,Avg_min_between_sent_tnx,Avg_min_between_received_tnx,Time_Diff_between_first_and_last_(Mins),Sent_tnx,Received_Tnx,Number_of_Created_Contracts,Unique_Received_From_Addresses,Unique_Sent_To_Addresses,min_value_received,max_value_received,avg_val_received,min_val_sent,max_val_sent,avg_val_sent,total_transactions_(including_tnx_to_create_contract),total_Ether_sent,total_ether_received,total_ether_balance,Total_ERC20_tnxs,ERC20_total_Ether_received,ERC20_total_ether_sent,ERC20_total_Ether_sent_contract,ERC20_uniq_sent_addr,ERC20_uniq_rec_addr,ERC20_uniq_rec_contract_addr,ERC20_avg_time_between_contract_tnx,ERC20_min_val_rec,ERC20_max_val_rec,ERC20_avg_val_rec,ERC20_min_val_sent,ERC20_max_val_sent,ERC20_avg_val_sent,ERC20_uniq_sent_token_name,ERC20_uniq_rec_token_name,ERC20_most_sent_token_type,ERC20_most_rec_token_type
3692,0.0,0.0,537.6,1,1,0,1,1,1.99,1.99,1.99,1.98975,1.98975,1.98975,2,1.98975,1.99,0.00025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.89155,0.51636
152,23488.73,33.36,48178.53,2,36,0,26,2,0.01038,9.99,1.33005,0.07579,48.30435,24.19007,38,48.38014,47.88187,-0.49828,2.0,1.337,0.0,0.0,0.0,2.0,2.0,0.0,0.0,1.337,0.6685,0.0,0.0,0.0,0.0,2.0,0.89155,0.16709
1175,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,1.337,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.337,0.76024,0.0,0.0,0.0,0.0,1.0,0.89155,0.51636


## Configurando modelos e dicionários de modelos

In [8]:
max_depth = np.arange(3, 36, 5).tolist(),
num_leaves = np.arange(2**np.min(max_depth), 70, 5).tolist()

lightgbm = LGBMClassifier(boosting_type='gbdt', random_state=RANDOM_STATE, n_jobs=-1, objective='binary', importance_type='gain', verbosity=-1)
catboost = CatBoostClassifier( random_state=RANDOM_STATE, verbose=0, early_stopping_rounds=20)
xgboost = XGBClassifier(random_state = RANDOM_STATE, objective = 'binary:logistic', eval_metric = 'logloss', early_stopping_rounds = 20, n_jobs = -1,)

dct_models = {
    'lightgbm': {
        'model': lightgbm,
        'dct_params': dict(
            max_depth = [3, 8, 13, 18, 23, 28, 33],
            num_leaves = [8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68],
            learning_rate = [0.0001, 0.001, 0.01, 0.1],
            n_estimators = np.arange(50, 600, 100).tolist(),
        )
    },
    'catboost': {
        'model': catboost,
        'dct_params': dict(
            depth = [4, 6, 8, 10],
            learning_rate = [0.001, 0.01, 0.1],
            iterations = [100, 200, 300],
            l2_leaf_reg = [1, 3, 5],
            border_count = [32, 64, 128],
            random_strength = [0.1, 1, 10]
        )
    },
    'xgboost': {
        'model': xgboost,
        'dct_params': dict(
            max_depth = np.arange(3, 16, 4).tolist(),
            learning_rate = [0.0001, 0.001, 0.01, 0.1],
            n_estimators = np.arange(50, 600, 100).tolist(),
            reg_lambda = [0.1, 1, 3, 5, 10]
        )
    },
}

for key, item in dct_models.items():
    combinations = 1
    for param, values in item['dct_params'].items():
        combinations *= len(values)
    print(f"""{key}:

dct_params: {item['dct_params']}

combinations: {combinations}
""")
    print(30*'-')

lightgbm:

dct_params: {'max_depth': [3, 8, 13, 18, 23, 28, 33], 'num_leaves': [8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68], 'learning_rate': [0.0001, 0.001, 0.01, 0.1], 'n_estimators': [50, 150, 250, 350, 450, 550]}

combinations: 2184

------------------------------
catboost:

dct_params: {'depth': [4, 6, 8, 10], 'learning_rate': [0.001, 0.01, 0.1], 'iterations': [100, 200, 300], 'l2_leaf_reg': [1, 3, 5], 'border_count': [32, 64, 128], 'random_strength': [0.1, 1, 10]}

combinations: 972

------------------------------
xgboost:

dct_params: {'max_depth': [3, 7, 11, 15], 'learning_rate': [0.0001, 0.001, 0.01, 0.1], 'n_estimators': [50, 150, 250, 350, 450, 550], 'reg_lambda': [0.1, 1, 3, 5, 10]}

combinations: 480

------------------------------


## Random search

In [9]:
dct_hyperparams = {}

for model in dct_models.keys():
    print(f'Iniciar {model}.')
    start_time = time.time()
    dct_params = dct_models[model]['dct_params']
    classifier = dct_models[model]['model']

    rscv = RandomizedSearchCV(
        classifier, dct_params, error_score='raise', random_state=RANDOM_STATE, n_iter=200,
        return_train_score=True, scoring='roc_auc', cv=3, verbose=False
    )

    if model == 'xgboost':
        search = rscv.fit(X_train_exp_02, y_train_exp_02['FLAG'].values, eval_set=[(X_test_exp_02, y_test_exp_02)])
    else:
        search = rscv.fit(X_train_exp_02, y_train_exp_02['FLAG'])

    dct_hyperparams[model] = {
        'best_rscv': search.best_params_,
        'results': rscv.cv_results_
        }
    end_time = time.time()
    execution_time = end_time - start_time
    print(f'O modelo {model} passou pela busca aleatória em {int(execution_time//60)} minutos e {execution_time%60:.1f} segundos.\n')


Iniciar lightgbm.



invalid value encountered in cast



O modelo lightgbm passou pela busca aleatória em 4 minutos e 3.8 segundos.

Iniciar catboost.



invalid value encountered in cast



O modelo catboost passou pela busca aleatória em 15 minutos e 6.7 segundos.

Iniciar xgboost.
[0]	validation_0-logloss:0.68879
[1]	validation_0-logloss:0.68871
[2]	validation_0-logloss:0.68863
[3]	validation_0-logloss:0.68855
[4]	validation_0-logloss:0.68847
[5]	validation_0-logloss:0.68839
[6]	validation_0-logloss:0.68831
[7]	validation_0-logloss:0.68822
[8]	validation_0-logloss:0.68814
[9]	validation_0-logloss:0.68806
[10]	validation_0-logloss:0.68798
[11]	validation_0-logloss:0.68790
[12]	validation_0-logloss:0.68782
[13]	validation_0-logloss:0.68774
[14]	validation_0-logloss:0.68766
[15]	validation_0-logloss:0.68758
[16]	validation_0-logloss:0.68750
[17]	validation_0-logloss:0.68742
[18]	validation_0-logloss:0.68734
[19]	validation_0-logloss:0.68726
[20]	validation_0-logloss:0.68717
[21]	validation_0-logloss:0.68709
[22]	validation_0-logloss:0.68701
[23]	validation_0-logloss:0.68693
[24]	validation_0-logloss:0.68685
[25]	validation_0-logloss:0.68677
[26]	validation_0-logloss:0.6866


invalid value encountered in cast



[11]	validation_0-logloss:0.26422
[12]	validation_0-logloss:0.25061
[13]	validation_0-logloss:0.23723
[14]	validation_0-logloss:0.22464
[15]	validation_0-logloss:0.21425
[16]	validation_0-logloss:0.20531
[17]	validation_0-logloss:0.19675
[18]	validation_0-logloss:0.18922
[19]	validation_0-logloss:0.18107
[20]	validation_0-logloss:0.17500
[21]	validation_0-logloss:0.16810
[22]	validation_0-logloss:0.16210
[23]	validation_0-logloss:0.15750
[24]	validation_0-logloss:0.15294
[25]	validation_0-logloss:0.14768
[26]	validation_0-logloss:0.14293
[27]	validation_0-logloss:0.13935
[28]	validation_0-logloss:0.13600
[29]	validation_0-logloss:0.13185
[30]	validation_0-logloss:0.12903
[31]	validation_0-logloss:0.12637
[32]	validation_0-logloss:0.12293
[33]	validation_0-logloss:0.12063
[34]	validation_0-logloss:0.11857
[35]	validation_0-logloss:0.11559
[36]	validation_0-logloss:0.11274
[37]	validation_0-logloss:0.11121
[38]	validation_0-logloss:0.10890
[39]	validation_0-logloss:0.10749
[40]	validatio

In [10]:
df_search_results = pd.DataFrame()

for model in dct_hyperparams.keys():
    print(model)
    df_random_search_results = pd.DataFrame.from_dict(dct_hyperparams[model]['results'], orient='columns')[[
        'rank_test_score', 'mean_fit_time', 'mean_score_time', 'mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score'
    ]]

    df_random_search_results.insert(0, 'model', model)

    df_search_results = pd.concat([df_search_results, df_random_search_results], axis=0, ignore_index=True)

df_search_results.sort_values(by=['model', 'rank_test_score'], inplace=True, ascending=True)
display(df_search_results.loc[df_search_results['rank_test_score'] <= 3])

lightgbm
catboost
xgboost


Unnamed: 0,model,rank_test_score,mean_fit_time,mean_score_time,mean_test_score,std_test_score,mean_train_score,std_train_score
276,catboost,1,0.43674,0.005,0.99867,0.00044,1.0,0.0
349,catboost,2,0.63826,0.004,0.99865,0.00031,1.0,0.0
354,catboost,3,0.34633,0.005,0.99865,0.00042,0.99997,1e-05
102,lightgbm,1,0.113,0.00567,0.99869,0.00021,1.0,0.0
38,lightgbm,2,0.14868,0.00765,0.99862,0.00037,1.0,0.0
125,lightgbm,3,0.12767,0.007,0.9986,0.00019,1.0,0.0
129,lightgbm,3,0.27502,0.00933,0.9986,0.00019,1.0,0.0
542,xgboost,1,1.10501,0.0113,0.9987,0.0004,1.0,0.0
584,xgboost,1,1.1626,0.01485,0.9987,0.0004,1.0,0.0
407,xgboost,3,0.7999,0.00667,0.99865,0.0005,0.99999,1e-05


## Grid search

In [11]:
for key, item in dct_hyperparams.items():
    print(f"{key}: {item['best_rscv']}")
    print()

lightgbm: {'num_leaves': 13, 'n_estimators': 150, 'max_depth': 13, 'learning_rate': 0.1}

catboost: {'random_strength': 10, 'learning_rate': 0.1, 'l2_leaf_reg': 1, 'iterations': 200, 'depth': 4, 'border_count': 64}

xgboost: {'reg_lambda': 1, 'n_estimators': 250, 'max_depth': 3, 'learning_rate': 0.1}



In [12]:
dct_best_hyperparams = {
    'lightgbm': {
        'model': lightgbm,
        'dct_params': dict(
            max_depth = np.arange(dct_hyperparams['lightgbm']['best_rscv']['max_depth'] - 2, dct_hyperparams['lightgbm']['best_rscv']['max_depth'] + 2, 1).tolist(),
            num_leaves = np.arange(dct_hyperparams['lightgbm']['best_rscv']['num_leaves'] - 2, dct_hyperparams['lightgbm']['best_rscv']['num_leaves'] + 2, 1).tolist(),
            learning_rate = np.arange(dct_hyperparams['lightgbm']['best_rscv']['learning_rate'] - 0.01, dct_hyperparams['lightgbm']['best_rscv']['learning_rate'] + 0.01, 0.005).tolist(),
            n_estimators = np.arange(np.max([dct_hyperparams['lightgbm']['best_rscv']['n_estimators'] - 50, 0]), dct_hyperparams['lightgbm']['best_rscv']['n_estimators'] + 100, 50).tolist(),
        )
    },
    'catboost': {
        'model': catboost,
        'dct_params': dict(
            depth = np.arange(dct_hyperparams['catboost']['best_rscv']['depth']-2, dct_hyperparams['catboost']['best_rscv']['depth']+2, 1).tolist(),
            learning_rate = np.arange(dct_hyperparams['catboost']['best_rscv']['learning_rate'] - 0.01, dct_hyperparams['catboost']['best_rscv']['learning_rate'] + 0.01, 0.005).tolist(),
            iterations = np.arange(np.max([dct_hyperparams['catboost']['best_rscv']['iterations'] - 50, 0]), dct_hyperparams['catboost']['best_rscv']['iterations'] + 100, 50).tolist(),
            l2_leaf_reg = np.arange(dct_hyperparams['catboost']['best_rscv']['l2_leaf_reg'] - 1, dct_hyperparams['catboost']['best_rscv']['l2_leaf_reg'] + 1, 0.5).tolist(),
            # border_count = np.arange(dct_hyperparams['catboost']['best_rscv']['border_count'] - 1, dct_hyperparams['catboost']['best_rscv']['border_count'] + 1, 1).tolist(),
            # random_strength = np.arange(dct_hyperparams['catboost']['best_rscv']['random_strength'] - 0.1, dct_hyperparams['catboost']['best_rscv']['random_strength'] + 0.1, 0.05).tolist(),
        )
    },
    'xgboost': {
        'model': xgboost,
        'dct_params': dict(
            max_depth = np.arange(dct_hyperparams['xgboost']['best_rscv']['max_depth'] - 2, dct_hyperparams['xgboost']['best_rscv']['max_depth'] + 2, 1).tolist(),
            learning_rate = np.arange(dct_hyperparams['xgboost']['best_rscv']['learning_rate'] - 0.01, dct_hyperparams['xgboost']['best_rscv']['learning_rate'] + 0.01, 0.005).tolist(),
            n_estimators = np.arange(np.max([dct_hyperparams['xgboost']['best_rscv']['n_estimators'] - 50, 0]), dct_hyperparams['xgboost']['best_rscv']['n_estimators'] + 100, 50).tolist(),
            reg_lambda = np.arange(dct_hyperparams['xgboost']['best_rscv']['reg_lambda'] - 1, dct_hyperparams['xgboost']['best_rscv']['reg_lambda'] + 1, 0.5).tolist(),
        )
    },
}

for key, item in dct_best_hyperparams.items():
    combinations = 1
    for param, values in item['dct_params'].items():
        combinations *= len(values)
    print(f"""{key}:

dct_params: {item['dct_params']}

combinations: {combinations}
""")
    print(30*'-')

lightgbm:

dct_params: {'max_depth': [11, 12, 13, 14], 'num_leaves': [11, 12, 13, 14], 'learning_rate': [0.09000000000000001, 0.09500000000000001, 0.10000000000000002, 0.10500000000000002], 'n_estimators': [100, 150, 200]}

combinations: 192

------------------------------
catboost:

dct_params: {'depth': [2, 3, 4, 5], 'learning_rate': [0.09000000000000001, 0.09500000000000001, 0.10000000000000002, 0.10500000000000002], 'iterations': [150, 200, 250], 'l2_leaf_reg': [0.0, 0.5, 1.0, 1.5]}

combinations: 192

------------------------------
xgboost:

dct_params: {'max_depth': [1, 2, 3, 4], 'learning_rate': [0.09000000000000001, 0.09500000000000001, 0.10000000000000002, 0.10500000000000002], 'n_estimators': [200, 250, 300], 'reg_lambda': [0.0, 0.5, 1.0, 1.5]}

combinations: 192

------------------------------


In [13]:
dct_final_hyperparams = {}

for model in dct_best_hyperparams.keys():
    print(f'Iniciar {model}.')
    start_time = time.time()
    dct_params = dct_best_hyperparams[model]['dct_params']
    classifier = dct_best_hyperparams[model]['model']

    gscv = GridSearchCV(
        classifier, dct_params, error_score='raise', n_jobs=-1,
        return_train_score=True, scoring='roc_auc', cv=3, verbose=False
    )

    if model == 'xgboost':
        search = gscv.fit(X_train_exp_02, y_train_exp_02['FLAG'].values, eval_set=[(X_test_exp_02, y_test_exp_02)])
    else:
        search = gscv.fit(X_train_exp_02, y_train_exp_02['FLAG'])

    dct_final_hyperparams[model] = {
        'best_gscv': search.best_params_,
        'results': gscv.cv_results_
        }
    end_time = time.time()
    execution_time = end_time - start_time
    print(f'O modelo {model} passou pela busca em grade em {int(execution_time//60)} minutos e {execution_time%60:.1f} segundos.\n')

Iniciar lightgbm.



invalid value encountered in cast



O modelo lightgbm passou pela busca em grade em 3 minutos e 43.3 segundos.

Iniciar catboost.
O modelo catboost passou pela busca em grade em 2 minutos e 34.8 segundos.

Iniciar xgboost.
[0]	validation_0-logloss:0.61989
[1]	validation_0-logloss:0.56253
[2]	validation_0-logloss:0.51206
[3]	validation_0-logloss:0.47111
[4]	validation_0-logloss:0.43378
[5]	validation_0-logloss:0.40295
[6]	validation_0-logloss:0.37273
[7]	validation_0-logloss:0.34775
[8]	validation_0-logloss:0.32635
[9]	validation_0-logloss:0.30697
[10]	validation_0-logloss:0.28984
[11]	validation_0-logloss:0.27437
[12]	validation_0-logloss:0.25937
[13]	validation_0-logloss:0.24570
[14]	validation_0-logloss:0.23336
[15]	validation_0-logloss:0.22276
[16]	validation_0-logloss:0.21331
[17]	validation_0-logloss:0.20471
[18]	validation_0-logloss:0.19702
[19]	validation_0-logloss:0.18994
[20]	validation_0-logloss:0.18210
[21]	validation_0-logloss:0.17546
[22]	validation_0-logloss:0.16891
[23]	validation_0-logloss:0.16391
[24]	va


invalid value encountered in cast



[59]	validation_0-logloss:0.08067
[60]	validation_0-logloss:0.07957
[61]	validation_0-logloss:0.07896
[62]	validation_0-logloss:0.07816
[63]	validation_0-logloss:0.07752
[64]	validation_0-logloss:0.07684
[65]	validation_0-logloss:0.07600
[66]	validation_0-logloss:0.07478
[67]	validation_0-logloss:0.07397
[68]	validation_0-logloss:0.07330
[69]	validation_0-logloss:0.07237
[70]	validation_0-logloss:0.07118
[71]	validation_0-logloss:0.07119
[72]	validation_0-logloss:0.06994
[73]	validation_0-logloss:0.06959
[74]	validation_0-logloss:0.06902
[75]	validation_0-logloss:0.06783
[76]	validation_0-logloss:0.06771
[77]	validation_0-logloss:0.06722
[78]	validation_0-logloss:0.06679
[79]	validation_0-logloss:0.06579
[80]	validation_0-logloss:0.06573
[81]	validation_0-logloss:0.06464
[82]	validation_0-logloss:0.06479
[83]	validation_0-logloss:0.06444
[84]	validation_0-logloss:0.06391
[85]	validation_0-logloss:0.06351
[86]	validation_0-logloss:0.06327
[87]	validation_0-logloss:0.06280
[88]	validatio

In [14]:
for model, items in dct_final_hyperparams.items():
    print(f"{model}: {items['best_gscv']}")
    print()

lightgbm: {'learning_rate': 0.09000000000000001, 'max_depth': 11, 'n_estimators': 200, 'num_leaves': 12}

catboost: {'depth': 3, 'iterations': 150, 'l2_leaf_reg': 1.0, 'learning_rate': 0.09500000000000001}

xgboost: {'learning_rate': 0.09500000000000001, 'max_depth': 3, 'n_estimators': 250, 'reg_lambda': 1.0}



In [15]:
df_search_results = pd.DataFrame()

for model in dct_final_hyperparams.keys():
    print(model)
    df_random_search_results = pd.DataFrame.from_dict(dct_final_hyperparams[model]['results'], orient='columns')[[
        'rank_test_score', 'mean_fit_time', 'mean_score_time', 'mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score'
    ]]

    df_random_search_results.insert(0, 'model', model)

    df_search_results = pd.concat([df_search_results, df_random_search_results], axis=0, ignore_index=True)

df_search_results.sort_values(by=['model', 'rank_test_score'], inplace=True, ascending=True)
display(df_search_results.loc[df_search_results['rank_test_score'] <= 3])

lightgbm
catboost
xgboost


Unnamed: 0,model,rank_test_score,mean_fit_time,mean_score_time,mean_test_score,std_test_score,mean_train_score,std_train_score
249,catboost,1,1.24398,0.01989,0.99865,0.0005,0.99995,2e-05
247,catboost,2,1.25524,0.01619,0.99862,0.00057,0.99998,1e-05
281,catboost,3,2.01462,0.01869,0.99862,0.0004,1.0,0.0
9,lightgbm,1,3.99962,0.01765,0.99877,0.00023,1.0,0.0
21,lightgbm,1,4.20143,0.00951,0.99877,0.00023,1.0,0.0
33,lightgbm,1,2.67193,0.00885,0.99877,0.00023,1.0,0.0
45,lightgbm,1,1.82105,0.00935,0.99877,0.00023,1.0,0.0
462,xgboost,1,0.56032,0.0107,0.99874,0.0004,0.99999,0.0
466,xgboost,1,0.88837,0.0129,0.99874,0.0004,0.99999,0.0
458,xgboost,3,0.48224,0.0117,0.99874,0.0004,0.99999,1e-05
