# Modeling - ethereum - v5 - Comparação

# Setup

## Library import
We import all the required Python libraries

In [1]:
import os
import time

# Data manipulation
from feature_engine.encoding import RareLabelEncoder, CountFrequencyEncoder
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np

# Visualizations
import plotly
import plotly.graph_objs as go
import plotly.offline as ply
plotly.offline.init_notebook_mode(connected=True)
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
from lightgbm import (
    LGBMClassifier, plot_importance, create_tree_digraph, plot_tree
)
import missingno as msno
# from pycaret.classification import ClassificationExperiment
from sklearn.experimental import enable_halving_search_cv
from sklearn.feature_selection import RFECV
from sklearn.model_selection import (
    train_test_split, RandomizedSearchCV, GridSearchCV, HalvingGridSearchCV, cross_validate, KFold
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve,
    accuracy_score, precision_score, recall_score, f1_score
)
from sklearn import set_config

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
    
%autoreload 2

# Options for pandas
set_config(transform_output = "pandas")
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.5f}'.format)
# pd.options.display.float_format = '{:.5f}'.format
# pd.options.display.max_rows = 120

sns.set_context(context='paper', font_scale=2, rc=None)
sns.set_style("ticks")
sns.set_palette(sns.color_palette())

## Local library import
We import all the required local libraries libraries

In [2]:
os.chdir('../')
from src.utils.data_describe import breve_descricao, serie_nulos, cardinalidade, check_for_equal_columns
os.chdir('./notebooks/')

def round_4(x):
    return x.round(4)

def save_dict_to_txt(content: dict, file_path: str, mode: str = 'w', encoding: str = 'utf-8') -> None:
    try:
        with open(file_path, mode=mode, encoding=encoding) as f:
            for key, value in content.items():
                f.write(f"{key}: {value}\n")
        print(f"Dictionary successfully saved to {file_path}")
    except Exception as e:
        print(f"Error saving file: {str(e)}")

# Parameter definition
We set all relevant parameters for our notebook. By convention, parameters are uppercase, while all the 
other variables follow Python's guidelines.

In [3]:
RAW_FOLDER = '../data/raw/'
INTERIM_FOLDER = '../data/interim/'
PROCESSED_FOLDER = '../data/processed/'
REPORTS_FOLDER = '../reports/'
RANDOM_STATE = 42

train_size = 0.15
scoring_metric = 'roc_auc'

# palette = sns.color_palette("Spectral", as_cmap=True)
palette = sns.color_palette("husl", 10)
palette

# Experience 01 - Only numerical attributes

## Data import

In [4]:
df_train_exp_01 = pd.read_parquet(INTERIM_FOLDER + 'artigo_df_train_exp_01.pqt')
df_test_exp_01 = pd.read_parquet(INTERIM_FOLDER + 'artigo_df_test_exp_01.pqt')

display(df_train_exp_01.head(3))

Unnamed: 0,Address,FLAG,Avg_min_between_sent_tnx,Avg_min_between_received_tnx,Time_Diff_between_first_and_last_(Mins),Sent_tnx,Received_Tnx,Number_of_Created_Contracts,Unique_Received_From_Addresses,Unique_Sent_To_Addresses,min_value_received,max_value_received,avg_val_received,min_val_sent,max_val_sent,avg_val_sent,min_value_sent_to_contract,max_val_sent_to_contract,avg_value_sent_to_contract,total_transactions_(including_tnx_to_create_contract,total_Ether_sent,total_ether_received,total_ether_sent_contracts,total_ether_balance,Total_ERC20_tnxs,ERC20_total_Ether_received,ERC20_total_ether_sent,ERC20_total_Ether_sent_contract,ERC20_uniq_sent_addr,ERC20_uniq_rec_addr,ERC20_uniq_sent_addr.1,ERC20_uniq_rec_contract_addr,ERC20_min_val_rec,ERC20_max_val_rec,ERC20_avg_val_rec,ERC20_min_val_sent,ERC20_max_val_sent,ERC20_avg_val_sent,ERC20_uniq_sent_token_name,ERC20_uniq_rec_token_name
1,0x0002b44ddb1476db43c868bd494422ee4c136fed,0,12709.07,2958.44,1218216.73,94,8,0,5,14,0.0,2.61327,0.38569,0.0,1.8,0.03284,0.0,0.0,0.0,102,3.0873,3.08548,0.0,-0.00182,8.0,403.42831,2.26081,0.0,1.0,5.0,0.0,7.0,0.0,365.0,57.63262,2.26081,2.26081,2.26081,1.0,7.0
2,0x0002bda54cb772d040f779e88eb453cac0daa244,0,246194.54,2434.02,516729.3,2,10,0,10,2,0.11312,1.16545,0.35891,0.05,3.53862,1.79431,0.0,0.0,0.0,12,3.58862,3.58906,0.0,0.00044,8.0,521.51207,0.0,0.0,0.0,7.0,0.0,8.0,0.0,442.81984,65.18901,0.0,0.0,0.0,0.0,8.0
4,0x00062d1dd1afb6fb02540ddad9cdebfe568e0d89,0,36.61,10707.77,382472.42,4598,20,1,7,19,0.0,12.80241,2.6711,0.0,9.0,0.02269,0.0,0.0,0.0,4619,104.31888,53.4219,0.0,-50.89699,42.0,162829.6609,123539.9329,0.0,4.0,23.0,0.0,27.0,0.0,90000.0,4934.23215,0.0,45000.0,13726.65922,6.0,27.0


# Categorical features encoded by frequency

## Data import

In [5]:
df_raw = pd.read_parquet(INTERIM_FOLDER + 'transaction_dataset.pqt')

df_train_exp_02 = pd.read_parquet(INTERIM_FOLDER + 'artigo_df_cleaned_train_exp_02_transformed.pqt')
df_test_exp_02 = pd.read_parquet(INTERIM_FOLDER + 'artigo_df_cleaned_test_exp_02_transformed.pqt')

X_train_exp_02 = df_train_exp_02.drop(columns=['FLAG'])
y_train_exp_02 = df_train_exp_02[['FLAG']]

X_test_exp_02 = df_test_exp_02.drop(columns=['FLAG'])
y_test_exp_02 = df_test_exp_02[['FLAG']]

print(f"""
X_train_exp_02: {X_train_exp_02.shape}
y_train_exp_02: {y_train_exp_02.shape}

X_test_exp_02: {X_test_exp_02.shape}
y_test_exp_02: {y_test_exp_02.shape}
""")

display(X_train_exp_02.head(3))


X_train_exp_02: (8343, 40)
y_train_exp_02: (8343, 1)

X_test_exp_02: (1473, 40)
y_test_exp_02: (1473, 1)



Unnamed: 0,Avg_min_between_sent_tnx,Avg_min_between_received_tnx,Time_Diff_between_first_and_last_(Mins),Sent_tnx,Received_Tnx,Number_of_Created_Contracts,Unique_Received_From_Addresses,Unique_Sent_To_Addresses,min_value_received,max_value_received,avg_val_received,min_val_sent,max_val_sent,avg_val_sent,min_value_sent_to_contract,max_val_sent_to_contract,avg_value_sent_to_contract,total_transactions_(including_tnx_to_create_contract,total_Ether_sent,total_ether_received,total_ether_sent_contracts,total_ether_balance,Total_ERC20_tnxs,ERC20_total_Ether_received,ERC20_total_ether_sent,ERC20_total_Ether_sent_contract,ERC20_uniq_sent_addr,ERC20_uniq_rec_addr,ERC20_uniq_sent_addr.1,ERC20_uniq_rec_contract_addr,ERC20_min_val_rec,ERC20_max_val_rec,ERC20_avg_val_rec,ERC20_min_val_sent,ERC20_max_val_sent,ERC20_avg_val_sent,ERC20_uniq_sent_token_name,ERC20_uniq_rec_token_name,ERC20_most_sent_token_type,ERC20_most_rec_token_type
2859,163.07,0.17,326.47,2,2,0,2,2,35.49665,65.50334,50.5,3.0,97.99902,50.49951,0.0,0.0,0.0,4,100.99902,101.0,0.0,0.00098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.84238,0.53506
2149,0.0,4.64,2726.18,1,2,0,1,1,0.015,6.90985,3.46242,6.92436,6.92436,6.92436,0.0,0.0,0.0,3,6.92436,6.92485,0.0,0.00049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.84238,0.53506
2979,1050.5,887.86,1393223.92,689,754,0,405,448,0.0,81.8288,1.66417,0.0,59.98,1.52056,0.0,0.0,0.0,1443,1047.66474,1254.78214,0.0,207.1174,31.0,2547558.025,387.73596,0.0,1.0,19.0,0.0,23.0,0.0,2537935.0,87846.82844,0.7,387.03596,193.86798,1.0,23.0,0.15762,0.46494


In [6]:
df_raw = pd.read_parquet(INTERIM_FOLDER + 'transaction_dataset.pqt')

df_train_exp_02 = pd.read_parquet(INTERIM_FOLDER + 'artigo_df_cleaned_train_exp_02_transformed.pqt')
df_test_exp_02 = pd.read_parquet(INTERIM_FOLDER + 'artigo_df_cleaned_test_exp_02_transformed.pqt')

X_train_exp_02 = df_train_exp_02.drop(columns=['FLAG'])
y_train_exp_02 = df_train_exp_02[['FLAG']]

X_test_exp_02 = df_test_exp_02.drop(columns=['FLAG'])
y_test_exp_02 = df_test_exp_02[['FLAG']]

print(f"""
X_train_exp_02: {X_train_exp_02.shape}
y_train_exp_02: {y_train_exp_02.shape}

X_test_exp_02: {X_test_exp_02.shape}
y_test_exp_02: {y_test_exp_02.shape}
""")

display(X_train_exp_02.head(3))


X_train_exp_02: (8343, 40)
y_train_exp_02: (8343, 1)

X_test_exp_02: (1473, 40)
y_test_exp_02: (1473, 1)




Unnamed: 0,Avg_min_between_sent_tnx,Avg_min_between_received_tnx,Time_Diff_between_first_and_last_(Mins),Sent_tnx,Received_Tnx,Number_of_Created_Contracts,Unique_Received_From_Addresses,Unique_Sent_To_Addresses,min_value_received,max_value_received,avg_val_received,min_val_sent,max_val_sent,avg_val_sent,min_value_sent_to_contract,max_val_sent_to_contract,avg_value_sent_to_contract,total_transactions_(including_tnx_to_create_contract,total_Ether_sent,total_ether_received,total_ether_sent_contracts,total_ether_balance,Total_ERC20_tnxs,ERC20_total_Ether_received,ERC20_total_ether_sent,ERC20_total_Ether_sent_contract,ERC20_uniq_sent_addr,ERC20_uniq_rec_addr,ERC20_uniq_sent_addr.1,ERC20_uniq_rec_contract_addr,ERC20_min_val_rec,ERC20_max_val_rec,ERC20_avg_val_rec,ERC20_min_val_sent,ERC20_max_val_sent,ERC20_avg_val_sent,ERC20_uniq_sent_token_name,ERC20_uniq_rec_token_name,ERC20_most_sent_token_type,ERC20_most_rec_token_type
2859,163.07,0.17,326.47,2,2,0,2,2,35.49665,65.50334,50.5,3.0,97.99902,50.49951,0.0,0.0,0.0,4,100.99902,101.0,0.0,0.00098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.84238,0.53506
2149,0.0,4.64,2726.18,1,2,0,1,1,0.015,6.90985,3.46242,6.92436,6.92436,6.92436,0.0,0.0,0.0,3,6.92436,6.92485,0.0,0.00049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.84238,0.53506
2979,1050.5,887.86,1393223.92,689,754,0,405,448,0.0,81.8288,1.66417,0.0,59.98,1.52056,0.0,0.0,0.0,1443,1047.66474,1254.78214,0.0,207.1174,31.0,2547558.025,387.73596,0.0,1.0,19.0,0.0,23.0,0.0,2537935.0,87846.82844,0.7,387.03596,193.86798,1.0,23.0,0.15762,0.46494


## Configurando modelos e dicionários de modelos

In [7]:
max_depth = np.arange(3, 36, 5).tolist(),
num_leaves = np.arange(2**np.min(max_depth), 70, 5).tolist()

lightgbm = LGBMClassifier(boosting_type='gbdt', random_state=RANDOM_STATE, n_jobs=-1, objective='binary', importance_type='gain', verbosity=-1)
catboost = CatBoostClassifier( random_state=RANDOM_STATE, verbose=0, early_stopping_rounds=20)
xgboost = XGBClassifier(
    random_state=RANDOM_STATE, objective='binary:logistic', eval_metric='logloss', early_stopping_rounds=10, n_jobs=-1, verbose_eval=None
)
random_forest = RandomForestClassifier(
    max_features='sqrt', bootstrap=True, n_jobs=-1, random_state=RANDOM_STATE, verbose=0
)
decision_tree = DecisionTreeClassifier(random_state=RANDOM_STATE)
knn = KNeighborsClassifier(n_jobs=-1)

dct_models = {
    'lightgbm': {
        'model': lightgbm,
        'is_active': True,
        'dct_params': dict(
            max_depth = [3, 8, 13, 18, 23, 28, 33],
            num_leaves = [8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68],
            learning_rate = [0.0001, 0.001, 0.01, 0.1],
            n_estimators = np.arange(50, 600, 100).tolist(),
        )
    },
    'random_forest': {
        'model': random_forest,
        'is_active': True,
        'dct_params': dict(
            max_depth = [3, 8, 13, 18, 23, 28, 33],
            max_leaf_nodes = [8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68],
            n_estimators = np.arange(50, 600, 100).tolist(),
        )
    },
    'decision_tree': {
        'model': decision_tree,
        'is_active': True,
        'dct_params': dict(
            max_depth = [3, 8, 13, 18, 23, 28, 33],
            max_leaf_nodes = [8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68]
        )
    },
    'knn': {
        'model': knn,
        'is_active': True,
        'dct_params': dict(
            n_neighbors = [3, 8, 13, 18, 23, 28, 33]
        )
    },
    'catboost': {
        'model': catboost,
        'is_active': True,
        'dct_params': dict(
            depth = [4, 6, 8, 10],
            learning_rate = [0.001, 0.01, 0.1],
            iterations = [100, 200, 300],
            l2_leaf_reg = [1, 3, 5],
            border_count = [32, 64, 128],
            random_strength = [0.1, 1, 10]
        )
    },
    'xgboost': {
        'model': xgboost,
        'is_active': True,
        'dct_params': dict(
            max_depth = np.arange(3, 16, 4).tolist(),
            learning_rate = [0.0001, 0.001, 0.01, 0.1],
            n_estimators = np.arange(50, 600, 100).tolist(),
            reg_lambda = [0.1, 1, 3, 5, 10]
        )
    },
}

for key, item in dct_models.items():
    combinations = 1
    for param, values in item['dct_params'].items():
        combinations *= len(values)
    dct_models[key].update({'combinations': combinations})
    print(f"""{key}:
    dct_params: {item['dct_params']},
    combinations: {combinations}
    \n""")
    print(30*'-')

save_dict_to_txt(content=dct_models, file_path=REPORTS_FOLDER+f'random_search-params_combinations-{scoring_metric}.txt')


lightgbm:
    dct_params: {'max_depth': [3, 8, 13, 18, 23, 28, 33], 'num_leaves': [8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68], 'learning_rate': [0.0001, 0.001, 0.01, 0.1], 'n_estimators': [50, 150, 250, 350, 450, 550]},
    combinations: 2184
    

------------------------------
random_forest:
    dct_params: {'max_depth': [3, 8, 13, 18, 23, 28, 33], 'max_leaf_nodes': [8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68], 'n_estimators': [50, 150, 250, 350, 450, 550]},
    combinations: 546
    

------------------------------
decision_tree:
    dct_params: {'max_depth': [3, 8, 13, 18, 23, 28, 33], 'max_leaf_nodes': [8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68]},
    combinations: 91
    

------------------------------
knn:
    dct_params: {'n_neighbors': [3, 8, 13, 18, 23, 28, 33]},
    combinations: 7
    

------------------------------
catboost:
    dct_params: {'depth': [4, 6, 8, 10], 'learning_rate': [0.001, 0.01, 0.1], 'iterations': [100, 200, 300], 'l2_leaf_reg': [

## Random search

In [8]:
dct_hyperparams = {}

for model in dct_models.keys():
    if dct_models[model]['is_active']:
        try:
            print(f'Iniciar {model}.')
            start_time = time.time()
            dct_params = dct_models[model]['dct_params']
            classifier = dct_models[model]['model']

            rscv = RandomizedSearchCV(
                classifier, dct_params, error_score='raise', random_state=RANDOM_STATE, n_iter=100,
                return_train_score=True, scoring=scoring_metric, cv=3, verbose=False
            )

            if model == 'xgboost':
                search = rscv.fit(X_train_exp_02, y_train_exp_02['FLAG'].values, eval_set=[(X_test_exp_02, y_test_exp_02)])
            else:
                search = rscv.fit(X_train_exp_02, y_train_exp_02['FLAG'])

            dct_hyperparams[model] = {
                'best_rscv': search.best_params_,
                'results': rscv.cv_results_
                }
            end_time = time.time()
            execution_time = end_time - start_time
            print(f'O modelo {model} passou pela busca aleatória em {int(execution_time//60)} minutos e {execution_time%60:.1f} segundos.\n')
        except:
            print(f'Erro ao executar o modelo {model}.\n')

    else:
        print(f'O modelo {model} está desativado.\n')


Iniciar lightgbm.
O modelo lightgbm passou pela busca aleatória em 2 minutos e 3.3 segundos.

Iniciar random_forest.
O modelo lightgbm passou pela busca aleatória em 2 minutos e 3.3 segundos.

Iniciar random_forest.
O modelo random_forest passou pela busca aleatória em 3 minutos e 27.2 segundos.

Iniciar decision_tree.
O modelo random_forest passou pela busca aleatória em 3 minutos e 27.2 segundos.

Iniciar decision_tree.



The total space of parameters 91 is smaller than n_iter=100. Running 91 iterations. For exhaustive searches, use GridSearchCV.



O modelo decision_tree passou pela busca aleatória em 0 minutos e 14.7 segundos.

Iniciar knn.



The total space of parameters 7 is smaller than n_iter=100. Running 7 iterations. For exhaustive searches, use GridSearchCV.



O modelo knn passou pela busca aleatória em 0 minutos e 2.5 segundos.

Iniciar catboost.
O modelo catboost passou pela busca aleatória em 8 minutos e 59.9 segundos.

Iniciar xgboost.
[0]	validation_0-logloss:0.53020
[1]	validation_0-logloss:0.53012
[2]	validation_0-logloss:0.53004
O modelo catboost passou pela busca aleatória em 8 minutos e 59.9 segundos.

Iniciar xgboost.
[0]	validation_0-logloss:0.53020
[1]	validation_0-logloss:0.53012
[2]	validation_0-logloss:0.53004
[3]	validation_0-logloss:0.52996
[4]	validation_0-logloss:0.52988
[3]	validation_0-logloss:0.52996
[4]	validation_0-logloss:0.52988
[5]	validation_0-logloss:0.52980
[6]	validation_0-logloss:0.52972
[7]	validation_0-logloss:0.52964[5]	validation_0-logloss:0.52980
[6]	validation_0-logloss:0.52972
[7]	validation_0-logloss:0.52964
[8]	validation_0-logloss:0.52956
[9]	validation_0-logloss:0.52948

[8]	validation_0-logloss:0.52956
[9]	validation_0-logloss:0.52948
[10]	validation_0-logloss:0.52940
[11]	validation_0-logloss:0.5

In [9]:
df_search_results = pd.DataFrame()

for model in dct_hyperparams.keys():
    print(model)
    df_random_search_results = pd.DataFrame.from_dict(dct_hyperparams[model]['results'], orient='columns')[[
        'rank_test_score', 'mean_fit_time', 'mean_score_time', 'mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score'
    ]]

    df_random_search_results.insert(0, 'model', model)

    df_search_results = pd.concat([df_search_results, df_random_search_results], axis=0, ignore_index=True)

df_search_results.sort_values(by=['model', 'rank_test_score'], inplace=True, ascending=True)
display(df_search_results.loc[df_search_results['rank_test_score'] <= 3])

df_search_results.to_excel(REPORTS_FOLDER + f'artigo_random_search-{scoring_metric}.xlsx')

lightgbm

random_forest
decision_tree
knn
catboost
xgboost
random_forest
decision_tree
knn
catboost
xgboost


Unnamed: 0,model,rank_test_score,mean_fit_time,mean_score_time,mean_test_score,std_test_score,mean_train_score,std_train_score
380,catboost,1,1.67728,0.00585,0.99839,0.00042,1.0,0.0
307,catboost,2,2.3404,0.0078,0.99816,0.00042,1.0,0.0
354,catboost,3,1.78131,0.00732,0.99816,0.00047,1.0,0.0
217,decision_tree,1,0.05003,0.00082,0.98247,0.00137,0.99019,0.00047
229,decision_tree,2,0.04332,0.0067,0.98202,0.00135,0.98781,0.0022
242,decision_tree,2,0.04372,0.00067,0.98202,0.00135,0.98781,0.0022
255,decision_tree,2,0.05757,0.0,0.98202,0.00135,0.98781,0.0022
268,decision_tree,2,0.0406,0.01049,0.98202,0.00135,0.98781,0.0022
281,decision_tree,2,0.043,0.0053,0.98202,0.00135,0.98781,0.0022
297,knn,1,0.00231,0.04939,0.927,0.00744,0.94002,0.00323


## Grid search

In [10]:
for key, item in dct_hyperparams.items():
    print(f"{key}: {item['best_rscv']}")
    print()

lightgbm: {'num_leaves': 18, 'n_estimators': 250, 'max_depth': 23, 'learning_rate': 0.1}

random_forest: {'n_estimators': 350, 'max_leaf_nodes': 68, 'max_depth': 28}

decision_tree: {'max_leaf_nodes': 28, 'max_depth': 8}

knn: {'n_neighbors': 33}

catboost: {'random_strength': 1, 'learning_rate': 0.1, 'l2_leaf_reg': 3, 'iterations': 200, 'depth': 8, 'border_count': 128}

xgboost: {'reg_lambda': 5, 'n_estimators': 350, 'max_depth': 7, 'learning_rate': 0.1}



In [11]:
dct_best_hyperparams = {
    'lightgbm': {
        'model': lightgbm,
        'is_active': True,
        'dct_params': dict(
            max_depth = np.arange(dct_hyperparams['lightgbm']['best_rscv']['max_depth'] - 2, dct_hyperparams['lightgbm']['best_rscv']['max_depth'] + 2, 1).tolist(),
            num_leaves = np.arange(dct_hyperparams['lightgbm']['best_rscv']['num_leaves'] - 2, dct_hyperparams['lightgbm']['best_rscv']['num_leaves'] + 2, 1).tolist(),
            learning_rate = np.arange(dct_hyperparams['lightgbm']['best_rscv']['learning_rate'] - 0.01, dct_hyperparams['lightgbm']['best_rscv']['learning_rate'] + 0.01, 0.005).tolist(),
            n_estimators = np.arange(np.max([dct_hyperparams['lightgbm']['best_rscv']['n_estimators'] - 50, 0]), dct_hyperparams['lightgbm']['best_rscv']['n_estimators'] + 100, 50).tolist(),
        )
    },
    'random_forest': {
        'model': random_forest,
        'is_active': True,
        'dct_params': dict(
            max_depth = np.arange(dct_hyperparams['random_forest']['best_rscv']['max_depth'] - 2, dct_hyperparams['random_forest']['best_rscv']['max_depth'] + 2, 1).tolist(),
            max_leaf_nodes = np.arange(dct_hyperparams['random_forest']['best_rscv']['max_leaf_nodes'] - 2, dct_hyperparams['random_forest']['best_rscv']['max_leaf_nodes'] + 2, 1).tolist(),
            n_estimators = np.arange(np.max([dct_hyperparams['random_forest']['best_rscv']['n_estimators'] - 50, 0]), dct_hyperparams['random_forest']['best_rscv']['n_estimators'] + 100, 50).tolist(),
        )
    },
    'decision_tree': {
        'model': decision_tree,
        'is_active': True,
        'dct_params': dict(
            max_depth = np.arange(dct_hyperparams['decision_tree']['best_rscv']['max_depth'] - 2, dct_hyperparams['decision_tree']['best_rscv']['max_depth'] + 2, 1).tolist(),
            max_leaf_nodes = np.arange(dct_hyperparams['decision_tree']['best_rscv']['max_leaf_nodes'] - 2, dct_hyperparams['decision_tree']['best_rscv']['max_leaf_nodes'] + 2, 1).tolist(),
        )
    },
    'knn': {
        'model': knn,
        'is_active': True,
        'dct_params': dict(
            n_neighbors = np.arange(dct_hyperparams['knn']['best_rscv']['n_neighbors'] - 2, dct_hyperparams['knn']['best_rscv']['n_neighbors'] + 2, 1).tolist()
        )
    },
    'catboost': {
        'model': catboost,
        'is_active': True,
        'dct_params': dict(
            depth = np.arange(dct_hyperparams['catboost']['best_rscv']['depth']-2, dct_hyperparams['catboost']['best_rscv']['depth']+2, 1).tolist(),
            learning_rate = np.arange(dct_hyperparams['catboost']['best_rscv']['learning_rate'] - 0.01, dct_hyperparams['catboost']['best_rscv']['learning_rate'] + 0.01, 0.005).tolist(),
            iterations = np.arange(np.max([dct_hyperparams['catboost']['best_rscv']['iterations'] - 50, 0]), dct_hyperparams['catboost']['best_rscv']['iterations'] + 100, 50).tolist(),
            l2_leaf_reg = np.arange(dct_hyperparams['catboost']['best_rscv']['l2_leaf_reg'] - 1, dct_hyperparams['catboost']['best_rscv']['l2_leaf_reg'] + 1, 0.5).tolist(),
            # border_count = np.arange(dct_hyperparams['catboost']['best_rscv']['border_count'] - 1, dct_hyperparams['catboost']['best_rscv']['border_count'] + 1, 1).tolist(),
            # random_strength = np.arange(dct_hyperparams['catboost']['best_rscv']['random_strength'] - 0.1, dct_hyperparams['catboost']['best_rscv']['random_strength'] + 0.1, 0.05).tolist(),
        )
    },
    'xgboost': {
        'model': xgboost,
        'is_active': True,
        'dct_params': dict(
            max_depth = np.arange(dct_hyperparams['xgboost']['best_rscv']['max_depth'] - 2, dct_hyperparams['xgboost']['best_rscv']['max_depth'] + 2, 1).tolist(),
            learning_rate = np.arange(dct_hyperparams['xgboost']['best_rscv']['learning_rate'] - 0.01, dct_hyperparams['xgboost']['best_rscv']['learning_rate'] + 0.01, 0.005).tolist(),
            n_estimators = np.arange(np.max([dct_hyperparams['xgboost']['best_rscv']['n_estimators'] - 50, 0]), dct_hyperparams['xgboost']['best_rscv']['n_estimators'] + 100, 50).tolist(),
            reg_lambda = np.arange(dct_hyperparams['xgboost']['best_rscv']['reg_lambda'] - 1, dct_hyperparams['xgboost']['best_rscv']['reg_lambda'] + 1, 0.5).tolist(),
        )
    },
}

for key, item in dct_best_hyperparams.items():
    if dct_best_hyperparams[model]['is_active']:
        combinations = 1
        for param, values in item['dct_params'].items():
            combinations *= len(values)
        dct_best_hyperparams[key].update({'combinations': combinations})
        print(f"""{key}:

        dct_params: {item['dct_params']}

        combinations: {combinations}
            """)
    else:
        print(f"{key} is not active.\n")
    print(30*'-')

save_dict_to_txt(content=dct_best_hyperparams, file_path=REPORTS_FOLDER+f'grid_search-params_combinations-{scoring_metric}.txt')

lightgbm:

        dct_params: {'max_depth': [21, 22, 23, 24], 'num_leaves': [16, 17, 18, 19], 'learning_rate': [0.09000000000000001, 0.09500000000000001, 0.10000000000000002, 0.10500000000000002], 'n_estimators': [200, 250, 300]}

        combinations: 192
            
------------------------------
random_forest:

        dct_params: {'max_depth': [26, 27, 28, 29], 'max_leaf_nodes': [66, 67, 68, 69], 'n_estimators': [300, 350, 400]}

        combinations: 48
            
------------------------------
decision_tree:

        dct_params: {'max_depth': [6, 7, 8, 9], 'max_leaf_nodes': [26, 27, 28, 29]}

        combinations: 16
            
------------------------------
knn:

        dct_params: {'n_neighbors': [31, 32, 33, 34]}

        combinations: 4
            
------------------------------
catboost:

        dct_params: {'depth': [6, 7, 8, 9], 'learning_rate': [0.09000000000000001, 0.09500000000000001, 0.10000000000000002, 0.10500000000000002], 'iterations': [150, 200, 250], 'l2

In [12]:
dct_final_hyperparams = {}

for model in dct_best_hyperparams.keys():
    if dct_best_hyperparams[model]['is_active']:
        print(f'Iniciar {model}.')
        start_time = time.time()
        dct_params = dct_best_hyperparams[model]['dct_params']
        classifier = dct_best_hyperparams[model]['model']

        gscv = GridSearchCV(
            classifier, dct_params, error_score='raise', n_jobs=-1,
            return_train_score=True, scoring=scoring_metric, cv=3, verbose=False
        )

        if model == 'xgboost':
            search = gscv.fit(X_train_exp_02, y_train_exp_02['FLAG'].values, eval_set=[(X_test_exp_02, y_test_exp_02)])
        else:
            search = gscv.fit(X_train_exp_02, y_train_exp_02['FLAG'])

        dct_final_hyperparams[model] = {
            'best_gscv': search.best_params_,
            'results': gscv.cv_results_
            }
        end_time = time.time()
        execution_time = end_time - start_time
        print(f'O modelo {model} passou pela busca em grade em {int(execution_time//60)} minutos e {execution_time%60:.1f} segundos.\n')

    else:
        print(f'O modelo {model} está desativado.\n')

Iniciar lightgbm.


O modelo lightgbm passou pela busca em grade em 6 minutos e 18.2 segundos.

Iniciar random_forest.
O modelo random_forest passou pela busca em grade em 1 minutos e 28.0 segundos.

Iniciar decision_tree.
O modelo random_forest passou pela busca em grade em 1 minutos e 28.0 segundos.

Iniciar decision_tree.
O modelo decision_tree passou pela busca em grade em 0 minutos e 1.0 segundos.

Iniciar knn.
O modelo decision_tree passou pela busca em grade em 0 minutos e 1.0 segundos.

Iniciar knn.
O modelo knn passou pela busca em grade em 0 minutos e 1.8 segundos.

Iniciar catboost.
O modelo knn passou pela busca em grade em 0 minutos e 1.8 segundos.

Iniciar catboost.



invalid value encountered in cast



O modelo catboost passou pela busca em grade em 16 minutos e 40.7 segundos.

Iniciar xgboost.
[0]	validation_0-logloss:0.46012
[1]	validation_0-logloss:0.40913
[2]	validation_0-logloss:0.36921
[3]	validation_0-logloss:0.33568
[0]	validation_0-logloss:0.46012
[1]	validation_0-logloss:0.40913
[2]	validation_0-logloss:0.36921
[3]	validation_0-logloss:0.33568
[4]	validation_0-logloss:0.30687
[5]	validation_0-logloss:0.28247
[6]	validation_0-logloss:0.26105
[4]	validation_0-logloss:0.30687
[5]	validation_0-logloss:0.28247
[6]	validation_0-logloss:0.26105
[7]	validation_0-logloss:0.24256
[8]	validation_0-logloss:0.22665
[9]	validation_0-logloss:0.21250
[10]	validation_0-logloss:0.20021
[7]	validation_0-logloss:0.24256
[8]	validation_0-logloss:0.22665
[9]	validation_0-logloss:0.21250
[10]	validation_0-logloss:0.20021
[11]	validation_0-logloss:0.18884
[12]	validation_0-logloss:0.17751
[11]	validation_0-logloss:0.18884
[12]	validation_0-logloss:0.17751
[13]	validation_0-logloss:0.16821
[14]	val

In [13]:
for model, items in dct_final_hyperparams.items():
    print(f"{model}: {items['best_gscv']}\n")

lightgbm: {'learning_rate': 0.10000000000000002, 'max_depth': 21, 'n_estimators': 250, 'num_leaves': 18}

random_forest: {'max_depth': 26, 'max_leaf_nodes': 69, 'n_estimators': 300}

decision_tree: {'max_depth': 8, 'max_leaf_nodes': 29}

knn: {'n_neighbors': 31}

catboost: {'depth': 7, 'iterations': 250, 'l2_leaf_reg': 3.0, 'learning_rate': 0.09500000000000001}

xgboost: {'learning_rate': 0.09500000000000001, 'max_depth': 5, 'n_estimators': 300, 'reg_lambda': 4.5}


random_forest: {'max_depth': 26, 'max_leaf_nodes': 69, 'n_estimators': 300}

decision_tree: {'max_depth': 8, 'max_leaf_nodes': 29}

knn: {'n_neighbors': 31}

catboost: {'depth': 7, 'iterations': 250, 'l2_leaf_reg': 3.0, 'learning_rate': 0.09500000000000001}

xgboost: {'learning_rate': 0.09500000000000001, 'max_depth': 5, 'n_estimators': 300, 'reg_lambda': 4.5}



In [14]:
df_search_results = pd.DataFrame()

for model in dct_final_hyperparams.keys():
    print(model)
    df_random_search_results = pd.DataFrame.from_dict(dct_final_hyperparams[model]['results'], orient='columns')[[
        'rank_test_score', 'mean_fit_time', 'mean_score_time', 'mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score'
    ]]

    df_random_search_results.insert(0, 'model', model)

    df_search_results = pd.concat([df_search_results, df_random_search_results], axis=0, ignore_index=True)

df_search_results.sort_values(by=['model', 'rank_test_score'], inplace=True, ascending=True)
display(df_search_results.loc[df_search_results['rank_test_score'] <= 3])


df_search_results.to_excel(REPORTS_FOLDER + f'artigo_grid_search-{scoring_metric}.xlsx')

lightgbm
random_forest
decision_tree
knn
catboost
xgboost


Unnamed: 0,model,rank_test_score,mean_fit_time,mean_score_time,mean_test_score,std_test_score,mean_train_score,std_train_score
349,catboost,1,9.4601,0.02245,0.99828,0.00042,1.0,0.0
449,catboost,2,32.60853,0.04782,0.99826,0.00049,1.0,0.0
433,catboost,3,26.83412,0.07487,0.99826,0.00054,1.0,0.0
251,decision_tree,1,0.07849,0.00395,0.98258,0.00128,0.99037,0.00033
250,decision_tree,2,0.12027,0.00677,0.98247,0.00137,0.99019,0.00047
248,decision_tree,3,0.12679,0.00604,0.98207,0.00106,0.98872,0.00139
256,knn,1,0.01866,0.52589,0.92707,0.00743,0.94087,0.00288
257,knn,2,0.01532,0.56154,0.92701,0.00723,0.94046,0.00301
258,knn,3,0.01435,0.44617,0.927,0.00744,0.94002,0.00323
102,lightgbm,1,4.41085,0.01927,0.99866,0.0004,1.0,0.0
