In [23]:
import time
import optuna
import mlflow
import numpy as np
import pandas as pd
import category_encoders as ce
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC

# Definindo a seed para o random state
rs = 840

In [24]:
# Lendo os dados
data_fraud = pd.read_csv("../data/processed/data_fraud.csv", index_col=0)

In [25]:
# Dividindo em variáveis explicativas e target
x = data_fraud.drop(["score_fraude_modelo", "fraude"], axis = 1)
y = data_fraud["fraude"]

Vamos relembrar pontos importantes que descobrimos na etapa de análise:

- As variáveis **pais** e **categoria_produto** possuem uma alta cardinalidade.
- As variáveis **pais** e **categoria_produto** possuem muitos valores com contagem de categorias iguais.
- Ainda existem variáveis com valores ausentes, tanto categóricas, como numéricas.
- O target está desbalanceado.

Sabendo disso, vamos desenhar como a etapa de experimentação irá se desenrolar:

1. Os dados serão divididos em treino, dev e teste. Iremos treinar o algoritmo
com os dados de treino, fazer a tunagem com os dados de dev, e, por fim, validar
com os dados de teste.
2. Será criado um esqueleto para o pipeline de transformação, consistindo
em um imputer e scaler(quando necessário) para as variáveis numéricas e um imputer 
e um encoder para as categóricas.
    
    2.1. Não usaremos o OneHotEncoder para as colunas com uma alta quantidade de
    categorias únicas, pois isso elevaria a dimensionalidade dos dados.

    2.2. Também não será utilizado o CountEncoder nas colunas com uma alta quantidade
    de categorias únicas, pois algumas categorias apresentam a mesma quantidade de registros.

3. A princípio, testaremos alguns modelos base com o StandardScaler (quando necessário),
OneHotEncoder para as features de baixa dimensão e CatBoostEncoder para as de alta
dimensão.
3. As métricas avaliadas serão o Recall e a Latência média.
4. Os modelos mais promissores entrarão em outra rodada de experimentos, dessa
vez para testar outras combinações de encoders e scalers (se necessário).


In [26]:
# Define o local para salvar os exoerimentos
mlflow.set_tracking_uri('../mlruns')

In [27]:
# Divindo os dados em treino, dev e teste
x_treino, x_teste, y_treino, y_teste = train_test_split(x,
                                                        y,
                                                        test_size=30,
                                                        stratify=y,
                                                        random_state=rs)

x_dev, x_teste, y_dev, y_teste = train_test_split(x_teste, 
                                                  y_teste,
                                                  stratify=y_teste,
                                                  test_size=0.5,
                                                  random_state=rs)

# Dividindo features numéricas de categóricas
cat_cols_high_dim = ["pais", "categoria_produto"]
cat_cols = [col for col in x_treino.select_dtypes("object").columns if col not in cat_cols_high_dim]
num_cols = x_treino.select_dtypes(["int", "float"]).columns

# Setando o KFold
kf = StratifiedKFold(shuffle=True, random_state=rs)

In [28]:
# Definindo os dicionarios com os modelos e etapas de pre-processamento que serã utilizadas

dict_models_scale_sensitive = {"LR": LogisticRegression(random_state=rs,
                                                        class_weight='balanced')}

dict_models_tree_based = {"LGBM": LGBMClassifier(is_unbalance=True,
                                                 objective= 'binary',
                                                 random_state=rs),
                          "XGB": XGBClassifier(random_state=rs,
                                               objective='binary:hinge'),
                          "RF": RandomForestClassifier(class_weight='balanced',
                                                       random_state=rs)}

# Criando dicionário com os encoders
dict_encoders = {"OHE": OneHotEncoder(drop='first'),
                 "TE": ce.TargetEncoder(),
                 "BE": ce.BinaryEncoder(),
                 "ME": ce.MEstimateEncoder(),
                 "CE": ce.CatBoostEncoder(),
                 "GE":ce.GrayEncoder(),
                 "CTE":ce.CountEncoder()}

dict_imputers_num = {"SIAVG": SimpleImputer(strategy='mean'),
                     "SIMEDIAN": SimpleImputer(strategy='median')}

dict_scalers = {"SS": StandardScaler(),
                "RS": RobustScaler()}

# Criando dicionário com os transformers
dict_transformers = {"PT": PowerTransformer()}

<Experiment: artifact_location='/home/daniel/Documents/preditor_fraude/notebooks/../mlruns/582570540056830123', creation_time=1711647899434, experiment_id='582570540056830123', last_update_time=1711647899434, lifecycle_stage='active', name='Comparando modelos base', tags={}>

In [30]:
## Criando/acessando o experimento
#mlflow.set_experiment('Comparando modelos base')
#
## Iniciando os experimentos sem transformers
#for tag, model in dict_models_scale_sensitive.items():
#
#    # Gerando a tag de identificação do modelo
#    nome_modelo = f'{tag}'
#
#    with mlflow.start_run(run_name=nome_modelo):
#
#        # Criando os pipeline com os transformers
#        pipe_cat = Pipeline([("imputer_cat", SimpleImputer(strategy='most_frequent')),
#                            ('encoder', OneHotEncoder(drop='first'))])
#    
#        # Criando os pipeline com os transformers
#        pipe_cat_high_dim = Pipeline([("imputer_cat", SimpleImputer(strategy='most_frequent')),
#                                        ('encoder_hd', ce.CatBoostEncoder())])
#    
#        pipe_num = Pipeline([('imputer_num', SimpleImputer(strategy="median")),
#                            ('scaler', StandardScaler())])
#    
#        # Criando o transformador
#        transformer = ColumnTransformer([('cat', pipe_cat, cat_cols),
#                                        ('num', pipe_num, num_cols),
#                                        ('cat_hd', pipe_cat_high_dim, cat_cols_high_dim)],
#                                        remainder="passthrough")
#    
#        # Criando o pipeline final
#        pipe = Pipeline([('transformer', transformer),
#                        ('model', model)])
#    
#        # Executando o cross validation
#        cross_val_scores = cross_val_score(pipe, x_treino, y_treino, cv=kf, scoring='recall')
#    
#        # Calculando a média das métricas
#        mean_score = cross_val_scores.mean()           
#    
#        # Salvando a métrica da folder 1
#        mlflow.log_metric('recall_fold_1', cross_val_scores[0])
#    
#        # Salvando a métrica da folder 2
#        mlflow.log_metric('recall_fold_2', cross_val_scores[1])
#    
#        # Salvando a métrica da folder 3
#        mlflow.log_metric('recall_fold_3', cross_val_scores[2])
#    
#        # Salvando a métrica da folder 4
#        mlflow.log_metric('recall_fold_4', cross_val_scores[3])
#    
#        # Salvando a métrica da folder 5
#        mlflow.log_metric('recall_fold_5', cross_val_scores[4])
#    
#        # Salvando as métricas
#        mlflow.log_metric('recall_mean', mean_score)
#    
#        # Treinando o algoritmo
#        pipe.fit(x_treino, y_treino)
#    
#        # Calculando a latência média
#        latency_list = []
#    
#        for _, row in x_treino[:1000].iterrows():
#        
#            # Início da contagem de tempo
#            start_time = time.time()
#    
#            # Extrair os recursos da linha
#            features = row.values.reshape(1, -1)
#    
#            # Fazer a previsão para a linha individual
#            prediction = pipe.predict(pd.DataFrame(features, columns = x_treino.columns.to_list()))
#    
#            # Encerra a contagem
#            end_time = time.time()
#            atomic_time = end_time - start_time
#    
#            # Transforma segundo em milissegundo
#            atomic_milissec = atomic_time * 1000
#    
#            # Adiciona o tempo em uma lista
#            latency_list.append(atomic_milissec)
#    
#        # calcula a média 
#        mlflow.log_metric("Latência média", np.mean(latency_list))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
## Iniciando os experimentos sem transformers
#for tag, model in dict_models_tree_based.items():
#
#    # Gerando a tag de identificação do modelo
#    nome_modelo = f'{tag}'
#
#    with mlflow.start_run(run_name=nome_modelo):
#
#        # Criando os pipeline com os transformers
#        pipe_cat = Pipeline([("imputer_cat", SimpleImputer(strategy='most_frequent')),
#                            ('encoder', OneHotEncoder(drop='first'))])
#    
#        # Criando os pipeline com os transformers
#        pipe_cat_high_dim = Pipeline([("imputer_cat", SimpleImputer(strategy='most_frequent')),
#                                        ('encoder_hd', ce.CatBoostEncoder())])
#    
#        pipe_num = Pipeline([('imputer_num', SimpleImputer(strategy="median"))])
#    
#        # Criando o transformador
#        transformer = ColumnTransformer([('cat', pipe_cat, cat_cols),
#                                        ('num', pipe_num, num_cols),
#                                        ('cat_hd', pipe_cat_high_dim, cat_cols_high_dim)],
#                                        remainder="passthrough")
#    
#        # Criando o pipeline final
#        pipe = Pipeline([('transformer', transformer),
#                        ('model', model)])
#    
#        # Executando o cross validation
#        cross_val_scores = cross_val_score(pipe, x_treino, y_treino, cv=kf, scoring='recall')
#    
#        # Calculando a média das métricas
#        mean_score = cross_val_scores.mean()           
#    
#        # Salvando a métrica da folder 1
#        mlflow.log_metric('recall_fold_1', cross_val_scores[0])
#    
#        # Salvando a métrica da folder 2
#        mlflow.log_metric('recall_fold_2', cross_val_scores[1])
#    
#        # Salvando a métrica da folder 3
#        mlflow.log_metric('recall_fold_3', cross_val_scores[2])
#    
#        # Salvando a métrica da folder 4
#        mlflow.log_metric('recall_fold_4', cross_val_scores[3])
#    
#        # Salvando a métrica da folder 5
#        mlflow.log_metric('recall_fold_5', cross_val_scores[4])
#    
#        # Salvando as métricas
#        mlflow.log_metric('recall_mean', mean_score)
#    
#        # Treinando o algoritmo
#        pipe.fit(x_treino, y_treino)
#    
#        # Calculando a latência média
#        latency_list = []
#    
#        for _, row in x_treino[:1000].iterrows():
#        
#            # Início da contagem de tempo
#            start_time = time.time()
#    
#            # Extrair os recursos da linha
#            features = row.values.reshape(1, -1)
#    
#            # Fazer a previsão para a linha individual
#            prediction = pipe.predict(pd.DataFrame(features, columns = x_treino.columns.to_list()))
#    
#            # Encerra a contagem
#            end_time = time.time()
#            atomic_time = end_time - start_time
#    
#            # Transforma segundo em milissegundo
#            atomic_milissec = atomic_time * 1000
#    
#            # Adiciona o tempo em uma lista
#            latency_list.append(atomic_milissec)
#    
#        # calcula a média 
#        mlflow.log_metric("Latência média", np.mean(latency_list))

[LightGBM] [Info] Number of positive: 5999, number of negative: 113977
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010823 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2753
[LightGBM] [Info] Number of data points in the train set: 119976, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.050002 -> initscore=-2.944404
[LightGBM] [Info] Start training from score -2.944404


[LightGBM] [Info] Number of positive: 5999, number of negative: 113977
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015386 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2753
[LightGBM] [Info] Number of data points in the train set: 119976, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.050002 -> initscore=-2.944404
[LightGBM] [Info] Start training from score -2.944404
[LightGBM] [Info] Number of positive: 5998, number of negative: 113978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011248 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2753
[LightGBM] [Info] Number of data points in the train set: 119976, number of used features: 31
[LightGBM] [Info

In [33]:
# Definindo as colunas de interesse
colunas_para_buscar = ["tags.mlflow.runName", 'metrics.recall_mean',
                       'metrics.Latência média', 'metrics.recall_fold_1', 
                       'metrics.recall_fold_2', 'metrics.recall_fold_3', 
                       'metrics.recall_fold_4', 'metrics.recall_fold_5']

# Buscando os melhores modelos
runs = mlflow.search_runs()[colunas_para_buscar]

# Ordenando por recall médio
runs.sort_values(by="metrics.recall_mean", ascending=False)

Unnamed: 0,tags.mlflow.runName,metrics.recall_mean,metrics.Latência média,metrics.recall_fold_1,metrics.recall_fold_2,metrics.recall_fold_3,metrics.recall_fold_4,metrics.recall_fold_5
2,LGBM,0.673514,37.477145,0.667111,0.687792,0.662667,0.678,0.672
3,LR,0.67018,21.193055,0.667779,0.683789,0.654,0.676667,0.668667
1,XGB,0.106163,54.676076,0.106738,0.114076,0.108,0.102,0.1
0,RF,0.065485,27.256877,0.063376,0.073382,0.070667,0.064667,0.055333


Dos modelos testados, apenas o **LightGBM** e a **Regressão Logística** tiveram
um bom resultado. 

- O **Recall** do **LightGBM** é cerca de **0.5% superior** ao da **Regressão**.
- A **latência** do **LightGBM** é cerca de **76% maior** do que a da **Regressão**.

Dado esses aspectos, a Regressão Logística é a melhor opção.

In [37]:
## Criando/acessando o experimento
#mlflow.set_experiment('Comparando regressões')
#
## Iniciando os experimentos com regressões
#for tag_encoder, encoder in dict_encoders.items():
#    for tag_scaler, scaler in dict_scalers.items():
#        for tag_imputer, imputer in dict_imputers_num.items():
#        
#            # Gerando a tag de identificação do modelo
#            nome_modelo = f'LR_{tag_encoder}_{tag_scaler}_{tag_imputer}'
#
#            with mlflow.start_run(run_name=nome_modelo):
#            
#                # Criando os pipeline com os transformers
#                pipe_cat = Pipeline([("imputer_cat", SimpleImputer(strategy='most_frequent')),
#                                    ('encoder', encoder)])
#
#                # Criando os pipeline com os transformers
#                pipe_cat_high_dim = Pipeline([("imputer_cat", SimpleImputer(strategy='most_frequent')),
#                                                ('encoder_hd', ce.CatBoostEncoder())])
#
#                pipe_num = Pipeline([('imputer_num', imputer),
#                                     ('scaler', scaler)])
#
#                # Criando o transformador
#                transformer = ColumnTransformer([('cat', pipe_cat, cat_cols),
#                                                ('num', pipe_num, num_cols),
#                                                ('cat_hd', pipe_cat_high_dim, cat_cols_high_dim)],
#                                                remainder="passthrough")
#
#                # Criando o pipeline final
#                pipe = Pipeline([('transformer', transformer),
#                                ('model', LogisticRegression(class_weight='balanced',
#                                                             random_state=rs))])
#
#                # Executando o cross validation
#                cross_val_scores = cross_val_score(pipe, x_treino, y_treino, cv=kf, scoring='recall')
#
#                # Calculando a média das métricas
#                mean_score = cross_val_scores.mean()           
#
#                # Salvando a métrica da folder 1
#                mlflow.log_metric('recall_fold_1', cross_val_scores[0])
#
#                # Salvando a métrica da folder 2
#                mlflow.log_metric('recall_fold_2', cross_val_scores[1])
#
#                # Salvando a métrica da folder 3
#                mlflow.log_metric('recall_fold_3', cross_val_scores[2])
#
#                # Salvando a métrica da folder 4
#                mlflow.log_metric('recall_fold_4', cross_val_scores[3])
#
#                # Salvando a métrica da folder 5
#                mlflow.log_metric('recall_fold_5', cross_val_scores[4])
#
#                # Salvando as métricas
#                mlflow.log_metric('recall_mean', mean_score)
#
#                # Treinando o algoritmo
#                pipe.fit(x_treino, y_treino)
#
#                # Calculando a latência média
#                latency_list = []
#
#                for _, row in x_treino[:1000].iterrows():
#                
#                    # Início da contagem de tempo
#                    start_time = time.time()
#
#                    # Extrair os recursos da linha
#                    features = row.values.reshape(1, -1)
#
#                    # Fazer a previsão para a linha individual
#                    prediction = pipe.predict(pd.DataFrame(features, columns = x_treino.columns.to_list()))
#
#                    # Encerra a contagem
#                    end_time = time.time()
#                    atomic_time = end_time - start_time
#
#                    # Transforma segundo em milissegundo
#                    atomic_milissec = atomic_time * 1000
#
#                    # Adiciona o tempo em uma lista
#                    latency_list.append(atomic_milissec)
#
#                # calcula a média 
#                mlflow.log_metric("Latência média", np.mean(latency_list))
#

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [38]:
# Definindo as colunas de interesse
colunas_para_buscar = ["tags.mlflow.runName", 'metrics.recall_mean',
                       'metrics.Latência média', 'metrics.recall_fold_1', 
                       'metrics.recall_fold_2', 'metrics.recall_fold_3', 
                       'metrics.recall_fold_4', 'metrics.recall_fold_5']

# Buscando os melhores modelos
runs = mlflow.search_runs()[colunas_para_buscar]

# Ordenando por recall médio
runs.sort_values(by="metrics.recall_mean", ascending=False)

Unnamed: 0,tags.mlflow.runName,metrics.recall_mean,metrics.Latência média,metrics.recall_fold_1,metrics.recall_fold_2,metrics.recall_fold_3,metrics.recall_fold_4,metrics.recall_fold_5
6,LR_GE_SS_SIMEDIAN,0.671114,43.481706,0.667779,0.686458,0.652,0.678667,0.670667
26,LR_OHE_SS_SIMEDIAN,0.67018,26.611778,0.667779,0.683789,0.654,0.676667,0.668667
19,LR_BE_SS_SIAVG,0.670047,61.067499,0.667779,0.683789,0.652667,0.678,0.668
27,LR_OHE_SS_SIAVG,0.670047,25.670715,0.669113,0.681788,0.652667,0.678,0.668667
7,LR_GE_SS_SIAVG,0.669514,49.177293,0.669113,0.682455,0.652,0.674,0.67
18,LR_BE_SS_SIMEDIAN,0.669513,50.902795,0.667111,0.683789,0.651333,0.677333,0.668
24,LR_OHE_RS_SIMEDIAN,0.666846,26.665359,0.667779,0.678452,0.651333,0.672,0.664667
5,LR_GE_RS_SIAVG,0.666845,40.565927,0.663776,0.677118,0.649333,0.674667,0.669333
25,LR_OHE_RS_SIAVG,0.666046,26.34743,0.665777,0.677118,0.651333,0.672667,0.663333
4,LR_GE_RS_SIMEDIAN,0.665646,61.110228,0.663109,0.681121,0.648667,0.672,0.663333


In [42]:
(26.611778*100)/43.481706	

61.20223985691821

A combinação **Logistic Regression** + **OneHotEncoder** + **StandardSaler** + **SimpleImputer(Mediana)** que obteve o segundo lugar, possui um score 0.14% menor do que o primeiro e uma latência quase 40% menor que o mesmo.