# Modeling - ethereum - v5 - Comparação

# Setup

## Library import
We import all the required Python libraries

In [14]:
import os
import time

# Data manipulation
from feature_engine.encoding import RareLabelEncoder, CountFrequencyEncoder
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np

# Visualizations
import plotly
import plotly.graph_objs as go
import plotly.offline as ply
plotly.offline.init_notebook_mode(connected=True)
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
from lightgbm import (
    LGBMClassifier, plot_importance, create_tree_digraph, plot_tree
)
import missingno as msno
# from pycaret.classification import ClassificationExperiment
from sklearn.experimental import enable_halving_search_cv
from sklearn.feature_selection import RFECV
from sklearn.model_selection import (
    train_test_split, RandomizedSearchCV, GridSearchCV, HalvingGridSearchCV, cross_validate, KFold
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve,
    accuracy_score, precision_score, recall_score, f1_score
)
from sklearn import set_config

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
    
%autoreload 2

# Options for pandas
set_config(transform_output = "pandas")
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.5f}'.format)
# pd.options.display.float_format = '{:.5f}'.format
# pd.options.display.max_rows = 120

sns.set_context(context='paper', font_scale=2, rc=None)
sns.set_style("ticks")
sns.set_palette(sns.color_palette())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Local library import
We import all the required local libraries libraries

In [15]:
os.chdir('../')
from src.utils.data_describe import breve_descricao, serie_nulos, cardinalidade, check_for_equal_columns
os.chdir('./notebooks/')

def round_4(x):
    return x.round(4)

def save_dict_to_txt(content: dict, file_path: str, mode: str = 'w', encoding: str = 'utf-8') -> None:
    try:
        with open(file_path, mode=mode, encoding=encoding) as f:
            for key, value in content.items():
                f.write(f"{key}: {value}\n")
        print(f"Dictionary successfully saved to {file_path}")
    except Exception as e:
        print(f"Error saving file: {str(e)}")

# Parameter definition
We set all relevant parameters for our notebook. By convention, parameters are uppercase, while all the 
other variables follow Python's guidelines.

In [16]:
RAW_FOLDER = '../data/raw/'
INTERIM_FOLDER = '../data/interim/'
PROCESSED_FOLDER = '../data/processed/'
REPORTS_FOLDER = '../reports/'
RANDOM_STATE = 42

GENERATE_REPORTS = False

dct_active_models = {
    'lightgbm': True,
    'random_forest': False,
    'decision_tree': False,
    'knn': False,
    'catboost': False,
    'xgboost': False,
}

train_size = 0.15
scoring_metric = ['roc_auc', 'f1', 'recall']
refit_metric = 'roc_auc'


# palette = sns.color_palette("Spectral", as_cmap=True)
palette = sns.color_palette("husl", 10)
palette

# Experience 01 - Only numerical attributes

## Data import

In [17]:
df_train_exp_01 = pd.read_parquet(INTERIM_FOLDER + 'artigo_df_train_exp_01.pqt')
df_test_exp_01 = pd.read_parquet(INTERIM_FOLDER + 'artigo_df_test_exp_01.pqt')

display(df_train_exp_01.head(3))

Unnamed: 0,Address,FLAG,Avg_min_between_sent_tnx,Avg_min_between_received_tnx,Time_Diff_between_first_and_last_(Mins),Sent_tnx,Received_Tnx,Number_of_Created_Contracts,Unique_Received_From_Addresses,Unique_Sent_To_Addresses,min_value_received,max_value_received,avg_val_received,min_val_sent,max_val_sent,avg_val_sent,min_value_sent_to_contract,max_val_sent_to_contract,avg_value_sent_to_contract,total_transactions_(including_tnx_to_create_contract,total_Ether_sent,total_ether_received,total_ether_sent_contracts,total_ether_balance,Total_ERC20_tnxs,ERC20_total_Ether_received,ERC20_total_ether_sent,ERC20_total_Ether_sent_contract,ERC20_uniq_sent_addr,ERC20_uniq_rec_addr,ERC20_uniq_sent_addr.1,ERC20_uniq_rec_contract_addr,ERC20_min_val_rec,ERC20_max_val_rec,ERC20_avg_val_rec,ERC20_min_val_sent,ERC20_max_val_sent,ERC20_avg_val_sent,ERC20_uniq_sent_token_name,ERC20_uniq_rec_token_name
1,0x0002b44ddb1476db43c868bd494422ee4c136fed,0,12709.07,2958.44,1218216.73,94,8,0,5,14,0.0,2.61327,0.38569,0.0,1.8,0.03284,0.0,0.0,0.0,102,3.0873,3.08548,0.0,-0.00182,8.0,403.42831,2.26081,0.0,1.0,5.0,0.0,7.0,0.0,365.0,57.63262,2.26081,2.26081,2.26081,1.0,7.0
2,0x0002bda54cb772d040f779e88eb453cac0daa244,0,246194.54,2434.02,516729.3,2,10,0,10,2,0.11312,1.16545,0.35891,0.05,3.53862,1.79431,0.0,0.0,0.0,12,3.58862,3.58906,0.0,0.00044,8.0,521.51207,0.0,0.0,0.0,7.0,0.0,8.0,0.0,442.81984,65.18901,0.0,0.0,0.0,0.0,8.0
4,0x00062d1dd1afb6fb02540ddad9cdebfe568e0d89,0,36.61,10707.77,382472.42,4598,20,1,7,19,0.0,12.80241,2.6711,0.0,9.0,0.02269,0.0,0.0,0.0,4619,104.31888,53.4219,0.0,-50.89699,42.0,162829.6609,123539.9329,0.0,4.0,23.0,0.0,27.0,0.0,90000.0,4934.23215,0.0,45000.0,13726.65922,6.0,27.0


# Categorical features encoded by frequency

## Data import

In [18]:
df_raw = pd.read_parquet(INTERIM_FOLDER + 'transaction_dataset.pqt')

df_train_exp_02 = pd.read_parquet(INTERIM_FOLDER + 'artigo_df_cleaned_train_exp_02_transformed.pqt')
df_test_exp_02 = pd.read_parquet(INTERIM_FOLDER + 'artigo_df_cleaned_test_exp_02_transformed.pqt')

X_train_exp_02 = df_train_exp_02.drop(columns=['FLAG'])
y_train_exp_02 = df_train_exp_02[['FLAG']]

X_test_exp_02 = df_test_exp_02.drop(columns=['FLAG'])
y_test_exp_02 = df_test_exp_02[['FLAG']]

print(f"""
X_train_exp_02: {X_train_exp_02.shape}
y_train_exp_02: {y_train_exp_02.shape}

X_test_exp_02: {X_test_exp_02.shape}
y_test_exp_02: {y_test_exp_02.shape}
""")

display(X_train_exp_02.head(3))


X_train_exp_02: (8343, 40)
y_train_exp_02: (8343, 1)

X_test_exp_02: (1473, 40)
y_test_exp_02: (1473, 1)



Unnamed: 0,Avg_min_between_sent_tnx,Avg_min_between_received_tnx,Time_Diff_between_first_and_last_(Mins),Sent_tnx,Received_Tnx,Number_of_Created_Contracts,Unique_Received_From_Addresses,Unique_Sent_To_Addresses,min_value_received,max_value_received,avg_val_received,min_val_sent,max_val_sent,avg_val_sent,min_value_sent_to_contract,max_val_sent_to_contract,avg_value_sent_to_contract,total_transactions_(including_tnx_to_create_contract,total_Ether_sent,total_ether_received,total_ether_sent_contracts,total_ether_balance,Total_ERC20_tnxs,ERC20_total_Ether_received,ERC20_total_ether_sent,ERC20_total_Ether_sent_contract,ERC20_uniq_sent_addr,ERC20_uniq_rec_addr,ERC20_uniq_sent_addr.1,ERC20_uniq_rec_contract_addr,ERC20_min_val_rec,ERC20_max_val_rec,ERC20_avg_val_rec,ERC20_min_val_sent,ERC20_max_val_sent,ERC20_avg_val_sent,ERC20_uniq_sent_token_name,ERC20_uniq_rec_token_name,ERC20_most_sent_token_type,ERC20_most_rec_token_type
2859,163.07,0.17,326.47,2,2,0,2,2,35.49665,65.50334,50.5,3.0,97.99902,50.49951,0.0,0.0,0.0,4,100.99902,101.0,0.0,0.00098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.84238,0.53506
2149,0.0,4.64,2726.18,1,2,0,1,1,0.015,6.90985,3.46242,6.92436,6.92436,6.92436,0.0,0.0,0.0,3,6.92436,6.92485,0.0,0.00049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.84238,0.53506
2979,1050.5,887.86,1393223.92,689,754,0,405,448,0.0,81.8288,1.66417,0.0,59.98,1.52056,0.0,0.0,0.0,1443,1047.66474,1254.78214,0.0,207.1174,31.0,2547558.025,387.73596,0.0,1.0,19.0,0.0,23.0,0.0,2537935.0,87846.82844,0.7,387.03596,193.86798,1.0,23.0,0.15762,0.46494


In [19]:
df_raw = pd.read_parquet(INTERIM_FOLDER + 'transaction_dataset.pqt')

df_train_exp_02 = pd.read_parquet(INTERIM_FOLDER + 'artigo_df_cleaned_train_exp_02_transformed.pqt')
df_test_exp_02 = pd.read_parquet(INTERIM_FOLDER + 'artigo_df_cleaned_test_exp_02_transformed.pqt')

X_train_exp_02 = df_train_exp_02.drop(columns=['FLAG'])
y_train_exp_02 = df_train_exp_02[['FLAG']]

X_test_exp_02 = df_test_exp_02.drop(columns=['FLAG'])
y_test_exp_02 = df_test_exp_02[['FLAG']]

print(f"""
X_train_exp_02: {X_train_exp_02.shape}
y_train_exp_02: {y_train_exp_02.shape}

X_test_exp_02: {X_test_exp_02.shape}
y_test_exp_02: {y_test_exp_02.shape}
""")

display(X_train_exp_02.head(3))


X_train_exp_02: (8343, 40)
y_train_exp_02: (8343, 1)

X_test_exp_02: (1473, 40)
y_test_exp_02: (1473, 1)



Unnamed: 0,Avg_min_between_sent_tnx,Avg_min_between_received_tnx,Time_Diff_between_first_and_last_(Mins),Sent_tnx,Received_Tnx,Number_of_Created_Contracts,Unique_Received_From_Addresses,Unique_Sent_To_Addresses,min_value_received,max_value_received,avg_val_received,min_val_sent,max_val_sent,avg_val_sent,min_value_sent_to_contract,max_val_sent_to_contract,avg_value_sent_to_contract,total_transactions_(including_tnx_to_create_contract,total_Ether_sent,total_ether_received,total_ether_sent_contracts,total_ether_balance,Total_ERC20_tnxs,ERC20_total_Ether_received,ERC20_total_ether_sent,ERC20_total_Ether_sent_contract,ERC20_uniq_sent_addr,ERC20_uniq_rec_addr,ERC20_uniq_sent_addr.1,ERC20_uniq_rec_contract_addr,ERC20_min_val_rec,ERC20_max_val_rec,ERC20_avg_val_rec,ERC20_min_val_sent,ERC20_max_val_sent,ERC20_avg_val_sent,ERC20_uniq_sent_token_name,ERC20_uniq_rec_token_name,ERC20_most_sent_token_type,ERC20_most_rec_token_type
2859,163.07,0.17,326.47,2,2,0,2,2,35.49665,65.50334,50.5,3.0,97.99902,50.49951,0.0,0.0,0.0,4,100.99902,101.0,0.0,0.00098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.84238,0.53506
2149,0.0,4.64,2726.18,1,2,0,1,1,0.015,6.90985,3.46242,6.92436,6.92436,6.92436,0.0,0.0,0.0,3,6.92436,6.92485,0.0,0.00049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.84238,0.53506
2979,1050.5,887.86,1393223.92,689,754,0,405,448,0.0,81.8288,1.66417,0.0,59.98,1.52056,0.0,0.0,0.0,1443,1047.66474,1254.78214,0.0,207.1174,31.0,2547558.025,387.73596,0.0,1.0,19.0,0.0,23.0,0.0,2537935.0,87846.82844,0.7,387.03596,193.86798,1.0,23.0,0.15762,0.46494


## Configurando modelos e dicionários de modelos

In [21]:
max_depth = np.arange(3, 36, 5).tolist(),
num_leaves = np.arange(2**np.min(max_depth), 70, 5).tolist()

lightgbm = LGBMClassifier(boosting_type='gbdt', random_state=RANDOM_STATE, n_jobs=-1, objective='binary', importance_type='gain', verbosity=-1)
catboost = CatBoostClassifier( random_state=RANDOM_STATE, verbose=0, early_stopping_rounds=20)
xgboost = XGBClassifier(
    random_state=RANDOM_STATE, objective='binary:logistic', eval_metric='logloss', early_stopping_rounds=10, n_jobs=-1, verbose_eval=None
)
random_forest = RandomForestClassifier(
    max_features='sqrt', bootstrap=True, n_jobs=-1, random_state=RANDOM_STATE, verbose=0
)
decision_tree = DecisionTreeClassifier(random_state=RANDOM_STATE)
knn = KNeighborsClassifier(n_jobs=-1)

dct_models = {
    'lightgbm': {
        'model': lightgbm,
        'is_active': dct_active_models['lightgbm'],
        'dct_params': dict(
            max_depth = [3, 8, 13, 18, 23, 28, 33],
            num_leaves = [8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68],
            learning_rate = [0.0001, 0.001, 0.01, 0.1],
            n_estimators = np.arange(50, 600, 100).tolist(),
        )
    },
    'random_forest': {
        'model': random_forest,
        'is_active': dct_active_models['random_forest'],
        'dct_params': dict(
            max_depth = [3, 8, 13, 18, 23, 28, 33],
            max_leaf_nodes = [8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68],
            n_estimators = np.arange(50, 600, 100).tolist(),
        )
    },
    'decision_tree': {
        'model': decision_tree,
        'is_active': dct_active_models['decision_tree'],
        'dct_params': dict(
            max_depth = [3, 8, 13, 18, 23, 28, 33],
            max_leaf_nodes = [8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68]
        )
    },
    'knn': {
        'model': knn,
        'is_active': dct_active_models['knn'],
        'dct_params': dict(
            n_neighbors = [3, 8, 13, 18, 23, 28, 33]
        )
    },
    'catboost': {
        'model': catboost,
        'is_active': dct_active_models['catboost'],
        'dct_params': dict(
            depth = [4, 6, 8, 10],
            learning_rate = [0.001, 0.01, 0.1],
            iterations = [100, 200, 300],
            l2_leaf_reg = [1, 3, 5],
            border_count = [32, 64, 128],
            random_strength = [0.1, 1, 10]
        )
    },
    'xgboost': {
        'model': xgboost,
        'is_active': dct_active_models['xgboost'],
        'dct_params': dict(
            max_depth = np.arange(3, 16, 4).tolist(),
            learning_rate = [0.0001, 0.001, 0.01, 0.1],
            n_estimators = np.arange(50, 600, 100).tolist(),
            reg_lambda = [0.1, 1, 3, 5, 10]
        )
    },
}

for key, item in dct_models.items():
    combinations = 1
    for param, values in item['dct_params'].items():
        combinations *= len(values)
    dct_models[key].update({'combinations': combinations})
    print(f"""{key}:
    dct_params: {item['dct_params']},
    combinations: {combinations}
    \n""")
    print(30*'-')

if GENERATE_REPORTS:
    save_dict_to_txt(content=dct_models, file_path=REPORTS_FOLDER+f'random_search-params_combinations-{scoring_metric}.txt')
else:
    print(f'GENERATE_REPORTS: {GENERATE_REPORTS}')


lightgbm:
    dct_params: {'max_depth': [3, 8, 13, 18, 23, 28, 33], 'num_leaves': [8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68], 'learning_rate': [0.0001, 0.001, 0.01, 0.1], 'n_estimators': [50, 150, 250, 350, 450, 550]},
    combinations: 2184
    

------------------------------
random_forest:
    dct_params: {'max_depth': [3, 8, 13, 18, 23, 28, 33], 'max_leaf_nodes': [8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68], 'n_estimators': [50, 150, 250, 350, 450, 550]},
    combinations: 546
    

------------------------------
decision_tree:
    dct_params: {'max_depth': [3, 8, 13, 18, 23, 28, 33], 'max_leaf_nodes': [8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68]},
    combinations: 91
    

------------------------------
knn:
    dct_params: {'n_neighbors': [3, 8, 13, 18, 23, 28, 33]},
    combinations: 7
    

------------------------------
catboost:
    dct_params: {'depth': [4, 6, 8, 10], 'learning_rate': [0.001, 0.01, 0.1], 'iterations': [100, 200, 300], 'l2_leaf_reg': [

## Random search

In [22]:
dct_hyperparams = {}

for model in dct_models.keys():
    if dct_models[model]['is_active']:
        try:
            print(f'Iniciar {model}.')
            start_time = time.time()
            dct_params = dct_models[model]['dct_params']
            classifier = dct_models[model]['model']

            rscv = RandomizedSearchCV(
                classifier, dct_params, error_score='raise', random_state=RANDOM_STATE, n_iter=100,
                return_train_score=True, scoring=scoring_metric, cv=3, verbose=False, refit=refit_metric
            )

            if model == 'xgboost':
                search = rscv.fit(X_train_exp_02, y_train_exp_02['FLAG'].values, eval_set=[(X_test_exp_02, y_test_exp_02)])
            else:
                search = rscv.fit(X_train_exp_02, y_train_exp_02['FLAG'])

            dct_hyperparams[model] = {
                'best_rscv': search.best_params_,
                'results': rscv.cv_results_
                }
            end_time = time.time()
            execution_time = end_time - start_time
            print(f'O modelo {model} passou pela busca aleatória em {int(execution_time//60)} minutos e {execution_time%60:.1f} segundos.\n')
        except:
            print(f'Erro ao executar o modelo {model}.\n')

    else:
        print(f'O modelo {model} está desativado.\n')


Iniciar lightgbm.
O modelo lightgbm passou pela busca aleatória em 2 minutos e 23.6 segundos.

O modelo random_forest está desativado.

O modelo decision_tree está desativado.

O modelo knn está desativado.

O modelo catboost está desativado.

O modelo xgboost está desativado.



In [23]:
df_search_results = pd.DataFrame()

for model in dct_hyperparams.keys():
    print(model)
    df_random_search_results = pd.DataFrame.from_dict(dct_hyperparams[model]['results'], orient='columns')

    df_random_search_results.insert(0, 'model', model)

    df_search_results = pd.concat([df_search_results, df_random_search_results], axis=0, ignore_index=True)

df_search_results.sort_values(by=['model', 'rank_test_roc_auc'], inplace=True, ascending=True)
display(df_search_results.loc[df_search_results['rank_test_roc_auc'] <= 3])

if GENERATE_REPORTS:
    df_search_results.to_excel(REPORTS_FOLDER + f'artigo_random_search-{scoring_metric}.xlsx')
else:
    print(f'GENERATE_REPORTS: {GENERATE_REPORTS}')

lightgbm


Unnamed: 0,model,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_num_leaves,param_n_estimators,param_max_depth,param_learning_rate,params,split0_test_roc_auc,split1_test_roc_auc,split2_test_roc_auc,mean_test_roc_auc,std_test_roc_auc,rank_test_roc_auc,split0_train_roc_auc,split1_train_roc_auc,split2_train_roc_auc,mean_train_roc_auc,std_train_roc_auc,split0_test_f1,split1_test_f1,split2_test_f1,mean_test_f1,std_test_f1,rank_test_f1,split0_train_f1,split1_train_f1,split2_train_f1,mean_train_f1,std_train_f1,split0_test_recall,split1_test_recall,split2_test_recall,mean_test_recall,std_test_recall,rank_test_recall,split0_train_recall,split1_train_recall,split2_train_recall,mean_train_recall,std_train_recall
33,lightgbm,0.24853,0.08223,0.02371,0.001,18,250,23,0.1,"{'num_leaves': 18, 'n_estimators': 250, 'max_depth': 23, 'learning_rate': 0.1}",0.99872,0.99912,0.99814,0.99866,0.0004,1,1.0,1.0,1.0,1.0,0.0,0.97204,0.97792,0.96895,0.97297,0.00372,4,1.0,1.0,1.0,1.0,0.0,0.95786,0.96921,0.9611,0.96272,0.00477,6,1.0,1.0,1.0,1.0,0.0
92,lightgbm,0.37427,0.09806,0.03254,0.00361,23,250,18,0.1,"{'num_leaves': 23, 'n_estimators': 250, 'max_depth': 18, 'learning_rate': 0.1}",0.99859,0.99897,0.99783,0.99846,0.00047,2,1.0,1.0,1.0,1.0,0.0,0.97288,0.97959,0.96648,0.97298,0.00536,2,1.0,1.0,1.0,1.0,0.0,0.95948,0.97245,0.95786,0.96326,0.00653,4,1.0,1.0,1.0,1.0,0.0
24,lightgbm,0.2087,0.02626,0.02001,0.00116,43,150,8,0.1,"{'num_leaves': 43, 'n_estimators': 150, 'max_depth': 8, 'learning_rate': 0.1}",0.99857,0.99876,0.99794,0.99843,0.00035,3,1.0,1.0,1.0,1.0,0.0,0.9688,0.97465,0.96321,0.96889,0.00467,20,1.0,1.0,1.0,1.0,0.0,0.95624,0.96596,0.95462,0.95894,0.00501,17,1.0,1.0,1.0,1.0,0.0


GENERATE_REPORTS: False


## Grid search

In [24]:
for key, item in dct_hyperparams.items():
    print(f"{key}: {item['best_rscv']}")
    print()

lightgbm: {'num_leaves': 18, 'n_estimators': 250, 'max_depth': 23, 'learning_rate': 0.1}



In [25]:
dct_best_hyperparams = {
    'lightgbm': {
        'model': lightgbm,
        'is_active': dct_active_models['lightgbm'],
        'dct_params': dict(
            max_depth = np.arange(dct_hyperparams['lightgbm']['best_rscv']['max_depth'] - 2, dct_hyperparams['lightgbm']['best_rscv']['max_depth'] + 2, 1).tolist(),
            num_leaves = np.arange(dct_hyperparams['lightgbm']['best_rscv']['num_leaves'] - 2, dct_hyperparams['lightgbm']['best_rscv']['num_leaves'] + 2, 1).tolist(),
            learning_rate = np.arange(dct_hyperparams['lightgbm']['best_rscv']['learning_rate'] - 0.01, dct_hyperparams['lightgbm']['best_rscv']['learning_rate'] + 0.01, 0.005).tolist(),
            n_estimators = np.arange(np.max([dct_hyperparams['lightgbm']['best_rscv']['n_estimators'] - 50, 0]), dct_hyperparams['lightgbm']['best_rscv']['n_estimators'] + 100, 50).tolist(),
        )
    },
    'random_forest': {
        'model': random_forest,
        'is_active': dct_active_models['random_forest'],
        'dct_params': dict(
            max_depth = np.arange(dct_hyperparams['random_forest']['best_rscv']['max_depth'] - 2, dct_hyperparams['random_forest']['best_rscv']['max_depth'] + 2, 1).tolist(),
            max_leaf_nodes = np.arange(dct_hyperparams['random_forest']['best_rscv']['max_leaf_nodes'] - 2, dct_hyperparams['random_forest']['best_rscv']['max_leaf_nodes'] + 2, 1).tolist(),
            n_estimators = np.arange(np.max([dct_hyperparams['random_forest']['best_rscv']['n_estimators'] - 50, 0]), dct_hyperparams['random_forest']['best_rscv']['n_estimators'] + 100, 50).tolist(),
        )
    },
    'decision_tree': {
        'model': decision_tree,
        'is_active': dct_active_models['decision_tree'],
        'dct_params': dict(
            max_depth = np.arange(dct_hyperparams['decision_tree']['best_rscv']['max_depth'] - 2, dct_hyperparams['decision_tree']['best_rscv']['max_depth'] + 2, 1).tolist(),
            max_leaf_nodes = np.arange(dct_hyperparams['decision_tree']['best_rscv']['max_leaf_nodes'] - 2, dct_hyperparams['decision_tree']['best_rscv']['max_leaf_nodes'] + 2, 1).tolist(),
        )
    },
    'knn': {
        'model': knn,
        'is_active': dct_active_models['knn'],
        'dct_params': dict(
            n_neighbors = np.arange(dct_hyperparams['knn']['best_rscv']['n_neighbors'] - 2, dct_hyperparams['knn']['best_rscv']['n_neighbors'] + 2, 1).tolist()
        )
    },
    'catboost': {
        'model': catboost,
        'is_active': dct_active_models['catboost'],
        'dct_params': dict(
            depth = np.arange(dct_hyperparams['catboost']['best_rscv']['depth']-2, dct_hyperparams['catboost']['best_rscv']['depth']+2, 1).tolist(),
            learning_rate = np.arange(dct_hyperparams['catboost']['best_rscv']['learning_rate'] - 0.01, dct_hyperparams['catboost']['best_rscv']['learning_rate'] + 0.01, 0.005).tolist(),
            iterations = np.arange(np.max([dct_hyperparams['catboost']['best_rscv']['iterations'] - 50, 0]), dct_hyperparams['catboost']['best_rscv']['iterations'] + 100, 50).tolist(),
            l2_leaf_reg = np.arange(dct_hyperparams['catboost']['best_rscv']['l2_leaf_reg'] - 1, dct_hyperparams['catboost']['best_rscv']['l2_leaf_reg'] + 1, 0.5).tolist(),
            # border_count = np.arange(dct_hyperparams['catboost']['best_rscv']['border_count'] - 1, dct_hyperparams['catboost']['best_rscv']['border_count'] + 1, 1).tolist(),
            # random_strength = np.arange(dct_hyperparams['catboost']['best_rscv']['random_strength'] - 0.1, dct_hyperparams['catboost']['best_rscv']['random_strength'] + 0.1, 0.05).tolist(),
        )
    },
    'xgboost': {
        'model': xgboost,
        'is_active': dct_active_models['xgboost'],
        'dct_params': dict(
            max_depth = np.arange(dct_hyperparams['xgboost']['best_rscv']['max_depth'] - 2, dct_hyperparams['xgboost']['best_rscv']['max_depth'] + 2, 1).tolist(),
            learning_rate = np.arange(dct_hyperparams['xgboost']['best_rscv']['learning_rate'] - 0.01, dct_hyperparams['xgboost']['best_rscv']['learning_rate'] + 0.01, 0.005).tolist(),
            n_estimators = np.arange(np.max([dct_hyperparams['xgboost']['best_rscv']['n_estimators'] - 50, 0]), dct_hyperparams['xgboost']['best_rscv']['n_estimators'] + 100, 50).tolist(),
            reg_lambda = np.arange(dct_hyperparams['xgboost']['best_rscv']['reg_lambda'] - 1, dct_hyperparams['xgboost']['best_rscv']['reg_lambda'] + 1, 0.5).tolist(),
        )
    },
}

for key, item in dct_best_hyperparams.items():
    if dct_best_hyperparams[model]['is_active']:
        combinations = 1
        for param, values in item['dct_params'].items():
            combinations *= len(values)
        dct_best_hyperparams[key].update({'combinations': combinations})
        print(f"""{key}:

        dct_params: {item['dct_params']}

        combinations: {combinations}
            """)
    else:
        print(f"{key} is not active.\n")
    print(30*'-')

if GENERATE_REPORTS:
    save_dict_to_txt(content=dct_best_hyperparams, file_path=REPORTS_FOLDER+f'grid_search-params_combinations-{scoring_metric}.txt')
else:
    print(f'GENERATE_REPORTS: {GENERATE_REPORTS}')

KeyError: 'random_forest'

In [None]:
dct_final_hyperparams = {}

for model in dct_best_hyperparams.keys():
    if dct_best_hyperparams[model]['is_active']:
        print(f'Iniciar {model}.')
        start_time = time.time()
        dct_params = dct_best_hyperparams[model]['dct_params']
        classifier = dct_best_hyperparams[model]['model']

        gscv = GridSearchCV(
            classifier, dct_params, error_score='raise', n_jobs=-1,
            return_train_score=True, scoring=scoring_metric, cv=3, verbose=False, refit=refit_metric
        )

        if model == 'xgboost':
            search = gscv.fit(X_train_exp_02, y_train_exp_02['FLAG'].values, eval_set=[(X_test_exp_02, y_test_exp_02)])
        else:
            search = gscv.fit(X_train_exp_02, y_train_exp_02['FLAG'])

        dct_final_hyperparams[model] = {
            'best_gscv': search.best_params_,
            'results': gscv.cv_results_
            }
        end_time = time.time()
        execution_time = end_time - start_time
        print(f'O modelo {model} passou pela busca em grade em {int(execution_time//60)} minutos e {execution_time%60:.1f} segundos.\n')

    else:
        print(f'O modelo {model} está desativado.\n')

Iniciar lightgbm.



invalid value encountered in cast



O modelo lightgbm passou pela busca em grade em 3 minutos e 25.4 segundos.

Iniciar random_forest.
O modelo random_forest passou pela busca em grade em 1 minutos e 30.1 segundos.

Iniciar decision_tree.
O modelo decision_tree passou pela busca em grade em 0 minutos e 1.1 segundos.

Iniciar knn.
O modelo knn passou pela busca em grade em 0 minutos e 3.4 segundos.

Iniciar catboost.
O modelo catboost passou pela busca em grade em 17 minutos e 3.1 segundos.

Iniciar xgboost.
[0]	validation_0-logloss:0.46012
[1]	validation_0-logloss:0.40913
[2]	validation_0-logloss:0.36921
[3]	validation_0-logloss:0.33568
[4]	validation_0-logloss:0.30687
[5]	validation_0-logloss:0.28247
[6]	validation_0-logloss:0.26105
[7]	validation_0-logloss:0.24256
[8]	validation_0-logloss:0.22665



invalid value encountered in cast



[9]	validation_0-logloss:0.21250
[10]	validation_0-logloss:0.20021
[11]	validation_0-logloss:0.18884
[12]	validation_0-logloss:0.17751
[13]	validation_0-logloss:0.16821
[14]	validation_0-logloss:0.16001
[15]	validation_0-logloss:0.15233
[16]	validation_0-logloss:0.14540
[17]	validation_0-logloss:0.13929
[18]	validation_0-logloss:0.13264
[19]	validation_0-logloss:0.12699
[20]	validation_0-logloss:0.12202
[21]	validation_0-logloss:0.11762
[22]	validation_0-logloss:0.11338
[23]	validation_0-logloss:0.10966
[24]	validation_0-logloss:0.10596
[25]	validation_0-logloss:0.10276
[26]	validation_0-logloss:0.09962
[27]	validation_0-logloss:0.09690
[28]	validation_0-logloss:0.09313
[29]	validation_0-logloss:0.08992
[30]	validation_0-logloss:0.08717
[31]	validation_0-logloss:0.08529
[32]	validation_0-logloss:0.08273
[33]	validation_0-logloss:0.08071
[34]	validation_0-logloss:0.07900
[35]	validation_0-logloss:0.07747
[36]	validation_0-logloss:0.07586
[37]	validation_0-logloss:0.07450
[38]	validation

In [None]:
for model, items in dct_final_hyperparams.items():
    print(f"{model}: {items['best_gscv']}\n")

lightgbm: {'learning_rate': 0.10000000000000002, 'max_depth': 21, 'n_estimators': 250, 'num_leaves': 18}

random_forest: {'max_depth': 26, 'max_leaf_nodes': 69, 'n_estimators': 300}

decision_tree: {'max_depth': 8, 'max_leaf_nodes': 29}

knn: {'n_neighbors': 31}

catboost: {'depth': 7, 'iterations': 250, 'l2_leaf_reg': 3.0, 'learning_rate': 0.09500000000000001}

xgboost: {'learning_rate': 0.09500000000000001, 'max_depth': 5, 'n_estimators': 300, 'reg_lambda': 4.5}



In [None]:
df_search_results = pd.DataFrame()

for model in dct_final_hyperparams.keys():
    print(model)
    df_random_search_results = pd.DataFrame.from_dict(dct_final_hyperparams[model]['results'], orient='columns')

    df_random_search_results.insert(0, 'model', model)

    df_search_results = pd.concat([df_search_results, df_random_search_results], axis=0, ignore_index=True)

df_search_results.sort_values(by=['model', 'rank_test_roc_auc'], inplace=True, ascending=True)
display(df_search_results.loc[df_search_results['rank_test_roc_auc'] <= 3])


if GENERATE_REPORTS:
    df_search_results.to_excel(REPORTS_FOLDER + f'artigo_grid_search-{scoring_metric}.xlsx')
else:
    print(f'GENERATE_REPORTS: {GENERATE_REPORTS}')

lightgbm
random_forest
decision_tree
knn
catboost
xgboost


Unnamed: 0,model,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,param_num_leaves,params,split0_test_roc_auc,split1_test_roc_auc,split2_test_roc_auc,mean_test_roc_auc,std_test_roc_auc,rank_test_roc_auc,split0_train_roc_auc,split1_train_roc_auc,split2_train_roc_auc,mean_train_roc_auc,std_train_roc_auc,split0_test_f1,split1_test_f1,split2_test_f1,mean_test_f1,std_test_f1,rank_test_f1,split0_train_f1,split1_train_f1,split2_train_f1,mean_train_f1,std_train_f1,split0_test_recall,split1_test_recall,split2_test_recall,mean_test_recall,std_test_recall,rank_test_recall,split0_train_recall,split1_train_recall,split2_train_recall,mean_train_recall,std_train_recall,param_max_leaf_nodes,param_n_neighbors,param_depth,param_iterations,param_l2_leaf_reg,param_reg_lambda
349,catboost,10.26053,0.09321,0.06608,0.00725,0.095,,,,"{'depth': 7, 'iterations': 250, 'l2_leaf_reg': 3.0, 'learning_rate': 0.09500000000000001}",0.99824,0.99881,0.99779,0.99828,0.00042,1,1.0,1.0,1.0,1.0,0.0,0.96529,0.97862,0.96473,0.96954,0.00642,12,1.0,1.0,1.0,1.0,0.0,0.94652,0.96434,0.953,0.95462,0.00737,38,1.0,1.0,1.0,1.0,0.0,,,7.0,250.0,3.0,
449,catboost,34.99455,1.18058,0.13642,0.02334,0.095,,,,"{'depth': 9, 'iterations': 250, 'l2_leaf_reg': 3.5, 'learning_rate': 0.09500000000000001}",0.99849,0.99872,0.99758,0.99826,0.00049,2,1.0,1.0,1.0,1.0,0.0,0.96603,0.9803,0.96314,0.96982,0.0075,6,1.0,1.0,1.0,1.0,0.0,0.94489,0.96759,0.953,0.95516,0.00939,22,1.0,1.0,1.0,1.0,0.0,,,9.0,250.0,3.5,
433,catboost,29.29753,0.61419,0.14365,0.02653,0.095,,,,"{'depth': 9, 'iterations': 200, 'l2_leaf_reg': 3.5, 'learning_rate': 0.09500000000000001}",0.99845,0.99879,0.99752,0.99826,0.00054,3,1.0,1.0,1.0,1.0,0.0,0.96694,0.97865,0.9623,0.9693,0.00688,20,1.0,1.0,1.0,1.0,0.0,0.94814,0.96596,0.95138,0.95516,0.00775,22,1.0,1.0,1.0,1.0,0.0,,,9.0,200.0,3.5,
251,decision_tree,0.09077,0.0137,0.01546,0.00156,,8.0,,,"{'max_depth': 8, 'max_leaf_nodes': 29}",0.98104,0.98254,0.98417,0.98258,0.00128,1,0.99079,0.99,0.99031,0.99037,0.00033,0.91653,0.92129,0.90998,0.91594,0.00464,5,0.94814,0.94629,0.95659,0.95034,0.00449,0.92545,0.90113,0.90113,0.90924,0.01146,1,0.94814,0.93517,0.94652,0.94327,0.00577,29.0,,,,,
250,decision_tree,0.07801,0.00733,0.01388,0.00023,,8.0,,,"{'max_depth': 8, 'max_leaf_nodes': 28}",0.98099,0.98213,0.98429,0.98247,0.00137,2,0.99073,0.98959,0.99025,0.99019,0.00047,0.91566,0.91833,0.90805,0.91401,0.00436,6,0.94643,0.94006,0.95531,0.94727,0.00625,0.92382,0.89303,0.89627,0.90438,0.01382,2,0.94489,0.92139,0.94408,0.93679,0.01089,28.0,,,,,
248,decision_tree,0.10486,0.01369,0.01809,0.00541,,8.0,,,"{'max_depth': 8, 'max_leaf_nodes': 26}",0.98091,0.98182,0.98347,0.98207,0.00106,3,0.98966,0.98675,0.98975,0.98872,0.00139,0.91512,0.91937,0.90507,0.91319,0.00599,7,0.94415,0.93899,0.95258,0.94524,0.0056,0.91734,0.89627,0.89627,0.9033,0.00993,4,0.93841,0.92301,0.94408,0.93517,0.0089,26.0,,,,,
256,knn,0.00853,0.00071,0.89683,0.06755,,,,,{'n_neighbors': 31},0.92352,0.93742,0.92028,0.92707,0.00743,1,0.9415,0.93707,0.94404,0.94087,0.00288,0.70862,0.73466,0.71416,0.71915,0.0112,2,0.74045,0.72244,0.73825,0.73371,0.00802,0.66613,0.68882,0.66613,0.67369,0.0107,1,0.69125,0.6718,0.69368,0.68558,0.00979,,31.0,,,,
257,knn,0.00752,0.00041,0.93637,0.0596,,,,,{'n_neighbors': 32},0.92348,0.93709,0.92046,0.92701,0.00723,2,0.94121,0.93645,0.94372,0.94046,0.00301,0.71018,0.73016,0.70888,0.71641,0.00974,4,0.73707,0.71686,0.73109,0.72834,0.00848,0.66126,0.67099,0.65316,0.6618,0.00729,4,0.68152,0.65964,0.67747,0.67288,0.0095,,32.0,,,,
258,knn,0.00838,0.00105,0.76548,0.19815,,,,,{'n_neighbors': 33},0.92355,0.93733,0.92012,0.927,0.00744,3,0.941,0.93566,0.94339,0.94002,0.00323,0.70873,0.73484,0.7159,0.71982,0.01101,1,0.7373,0.72143,0.73625,0.73166,0.00725,0.66451,0.6872,0.66775,0.67315,0.01002,2,0.68801,0.67261,0.68882,0.68314,0.00746,,33.0,,,,
102,lightgbm,2.62278,0.22946,0.03872,0.00029,0.1,21.0,250.0,18.0,"{'learning_rate': 0.10000000000000002, 'max_depth': 21, 'n_estimators': 250, 'num_leaves': 18}",0.99872,0.99912,0.99814,0.99866,0.0004,1,1.0,1.0,1.0,1.0,0.0,0.97204,0.97792,0.96895,0.97297,0.00372,21,1.0,1.0,1.0,1.0,0.0,0.95786,0.96921,0.9611,0.96272,0.00477,73,1.0,1.0,1.0,1.0,0.0,,,,,,
