In [1]:
import time
import optuna
import mlflow
import numpy as np
import pandas as pd
import category_encoders as ce
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC

# Definindo a seed para o random state
rs = 840

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Lendo os dados
data_fraud = pd.read_csv("../data/processed/data_fraud.csv", index_col=0)

In [3]:
# Dividindo em variáveis explicativas e target
x = data_fraud.drop(["score_fraude_modelo", "fraude"], axis = 1)
y = data_fraud["fraude"]

Vamos relembrar pontos importantes que descobrimos na etapa de análise:

- As variáveis **pais** e **categoria_produto** possuem uma alta cardinalidade.
- As variáveis **pais** e **categoria_produto** possuem muitos valores com contagem de categorias iguais.
- Ainda existem variáveis com valores ausentes, tanto categóricas, como numéricas.
- O target está desbalanceado.

Sabendo disso, vamos desenhar como a etapa de experimentação irá se desenrolar:

1. Os dados serão divididos em treino, dev e teste. Iremos treinar o algoritmo
com os dados de treino, fazer a tunagem com os dados de dev, e, por fim, validar
com os dados de teste.
2. Será criado um esqueleto para o pipeline de transformação, consistindo
em um imputer e scaler(quando necessário) para as variáveis numéricas e um imputer 
e um encoder para as categóricas.
    
    2.1. Não usaremos o OneHotEncoder para as colunas com uma alta quantidade de
    categorias únicas, pois isso elevaria a dimensionalidade dos dados.

    2.2. Também não será utilizado o CountEncoder nas colunas com uma alta quantidade
    de categorias únicas, pois algumas categorias apresentam a mesma quantidade de registros.

3. A princípio, testaremos alguns modelos base com o StandardScaler (quando necessário),
OneHotEncoder para as features de baixa dimensão e CatBoostEncoder para as de alta
dimensão.
3. As métricas avaliadas serão o Recall e a Latência média.
4. Os modelos mais promissores entrarão em outra rodada de experimentos, dessa
vez para testar outras combinações de encoders e scalers (se necessário).

## Realizando os experimentos


In [4]:
# Define o local para salvar os exoerimentos
mlflow.set_tracking_uri('../mlruns')

In [5]:
# Divindo os dados em treino, dev e teste
x_treino, x_teste, y_treino, y_teste = train_test_split(x,
                                                        y,
                                                        test_size=0.3,
                                                        stratify=y,
                                                        random_state=rs)

x_dev, x_teste, y_dev, y_teste = train_test_split(x_teste, 
                                                  y_teste,
                                                  stratify=y_teste,
                                                  test_size=0.5,
                                                  random_state=rs)

# Dividindo features numéricas de categóricas
cat_cols_high_dim = ["pais", "categoria_produto"]
cat_cols = [col for col in x_treino.select_dtypes("object").columns if col not in cat_cols_high_dim]
num_cols = x_treino.select_dtypes(["int", "float"]).columns

# Setando o KFold
kf = StratifiedKFold(shuffle=True, random_state=rs)

In [6]:
# Definindo os dicionarios com os modelos e etapas de pre-processamento que serã utilizadas

dict_models_scale_sensitive = {"LR": LogisticRegression(random_state=rs,
                                                        class_weight='balanced')}

dict_models_tree_based = {"LGBM": LGBMClassifier(is_unbalance=True,
                                                 objective= 'binary',
                                                 random_state=rs),
                          "XGB": XGBClassifier(random_state=rs,
                                               objective='binary:hinge'),
                          "RF": RandomForestClassifier(class_weight='balanced',
                                                       random_state=rs)}

# Criando dicionário com os encoders
dict_encoders = {"OHE": OneHotEncoder(drop='first'),
                 "TE": ce.TargetEncoder(),
                 "BE": ce.BinaryEncoder(),
                 "ME": ce.MEstimateEncoder(),
                 "CE": ce.CatBoostEncoder(),
                 "GE":ce.GrayEncoder(),
                 "CTE":ce.CountEncoder()}

dict_imputers_num = {"SIAVG": SimpleImputer(strategy='mean'),
                     "SIMEDIAN": SimpleImputer(strategy='median')}

dict_scalers = {"SS": StandardScaler(),
                "RS": RobustScaler()}

# Criando dicionário com os transformers
dict_transformers = {"PT": PowerTransformer()}

In [7]:
## Criando/acessando o experimento
mlflow.set_experiment('Comparando modelos base')

# Iniciando os experimentos sem transformers
for tag, model in dict_models_scale_sensitive.items():

    # Gerando a tag de identificação do modelo
    nome_modelo = f'{tag}'

    with mlflow.start_run(run_name=nome_modelo):

        # Criando os pipeline com os transformers
        pipe_cat = Pipeline([("imputer_cat", SimpleImputer(strategy='most_frequent')),
                            ('encoder', OneHotEncoder(drop='first'))])
    
        # Criando os pipeline com os transformers
        pipe_cat_high_dim = Pipeline([("imputer_cat", SimpleImputer(strategy='most_frequent')),
                                        ('encoder_hd', ce.CatBoostEncoder())])
    
        pipe_num = Pipeline([('imputer_num', SimpleImputer(strategy="median")),
                            ('scaler', StandardScaler())])
    
        # Criando o transformador
        transformer = ColumnTransformer([('cat', pipe_cat, cat_cols),
                                        ('num', pipe_num, num_cols),
                                        ('cat_hd', pipe_cat_high_dim, cat_cols_high_dim)],
                                        remainder="passthrough")
    
        # Criando o pipeline final
        pipe = Pipeline([('transformer', transformer),
                        ('model', model)])
    
        # Executando o cross validation
        cross_val_scores = cross_val_score(pipe, x_treino, y_treino, cv=kf, scoring='recall')
    
        # Calculando a média das métricas
        mean_score = cross_val_scores.mean()           
    
        # Salvando a métrica da folder 1
        mlflow.log_metric('recall_fold_1', cross_val_scores[0])
    
        # Salvando a métrica da folder 2
        mlflow.log_metric('recall_fold_2', cross_val_scores[1])
    
        # Salvando a métrica da folder 3
        mlflow.log_metric('recall_fold_3', cross_val_scores[2])
    
        # Salvando a métrica da folder 4
        mlflow.log_metric('recall_fold_4', cross_val_scores[3])
    
        # Salvando a métrica da folder 5
        mlflow.log_metric('recall_fold_5', cross_val_scores[4])
    
        # Salvando as métricas
        mlflow.log_metric('recall_mean', mean_score)
    
        # Treinando o algoritmo
        pipe.fit(x_treino, y_treino)
    
        # Calculando a latência média
        latency_list = []
    
        for _, row in x_treino[:1000].iterrows():
        
            # Início da contagem de tempo
            start_time = time.time()
    
            # Extrair os recursos da linha
            features = row.values.reshape(1, -1)
    
            # Fazer a previsão para a linha individual
            prediction = pipe.predict(pd.DataFrame(features, columns = x_treino.columns.to_list()))
    
            # Encerra a contagem
            end_time = time.time()
            atomic_time = end_time - start_time
    
            # Transforma segundo em milissegundo
            atomic_milissec = atomic_time * 1000
    
            # Adiciona o tempo em uma lista
            latency_list.append(atomic_milissec)
    
        # calcula a média 
        mlflow.log_metric("Latência média", np.mean(latency_list))

2024/04/01 16:23:57 INFO mlflow.tracking.fluent: Experiment with name 'Comparando modelos base' does not exist. Creating a new experiment.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# Iniciando os experimentos sem transformers
for tag, model in dict_models_tree_based.items():

    # Gerando a tag de identificação do modelo
    nome_modelo = f'{tag}'

    with mlflow.start_run(run_name=nome_modelo):

        # Criando os pipeline com os transformers
        pipe_cat = Pipeline([("imputer_cat", SimpleImputer(strategy='most_frequent')),
                            ('encoder', OneHotEncoder(drop='first'))])
    
        # Criando os pipeline com os transformers
        pipe_cat_high_dim = Pipeline([("imputer_cat", SimpleImputer(strategy='most_frequent')),
                                        ('encoder_hd', ce.CatBoostEncoder())])
    
        pipe_num = Pipeline([('imputer_num', SimpleImputer(strategy="median"))])
    
        # Criando o transformador
        transformer = ColumnTransformer([('cat', pipe_cat, cat_cols),
                                        ('num', pipe_num, num_cols),
                                        ('cat_hd', pipe_cat_high_dim, cat_cols_high_dim)],
                                        remainder="passthrough")
    
        # Criando o pipeline final
        pipe = Pipeline([('transformer', transformer),
                        ('model', model)])
    
        # Executando o cross validation
        cross_val_scores = cross_val_score(pipe, x_treino, y_treino, cv=kf, scoring='recall')
    
        # Calculando a média das métricas
        mean_score = cross_val_scores.mean()           
    
        # Salvando a métrica da folder 1
        mlflow.log_metric('recall_fold_1', cross_val_scores[0])
    
        # Salvando a métrica da folder 2
        mlflow.log_metric('recall_fold_2', cross_val_scores[1])
    
        # Salvando a métrica da folder 3
        mlflow.log_metric('recall_fold_3', cross_val_scores[2])
    
        # Salvando a métrica da folder 4
        mlflow.log_metric('recall_fold_4', cross_val_scores[3])
    
        # Salvando a métrica da folder 5
        mlflow.log_metric('recall_fold_5', cross_val_scores[4])
    
        # Salvando as métricas
        mlflow.log_metric('recall_mean', mean_score)
    
        # Treinando o algoritmo
        pipe.fit(x_treino, y_treino)
    
        # Calculando a latência média
        latency_list = []
    
        for _, row in x_treino[:1000].iterrows():
        
            # Início da contagem de tempo
            start_time = time.time()
    
            # Extrair os recursos da linha
            features = row.values.reshape(1, -1)
    
            # Fazer a previsão para a linha individual
            prediction = pipe.predict(pd.DataFrame(features, columns = x_treino.columns.to_list()))
    
            # Encerra a contagem
            end_time = time.time()
            atomic_time = end_time - start_time
    
            # Transforma segundo em milissegundo
            atomic_milissec = atomic_time * 1000
    
            # Adiciona o tempo em uma lista
            latency_list.append(atomic_milissec)
    
        # calcula a média 
        mlflow.log_metric("Latência média", np.mean(latency_list))

[LightGBM] [Info] Number of positive: 4200, number of negative: 79800
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010529 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2753
[LightGBM] [Info] Number of data points in the train set: 84000, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.050000 -> initscore=-2.944439
[LightGBM] [Info] Start training from score -2.944439
[LightGBM] [Info] Number of positive: 4200, number of negative: 79800
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011045 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2753
[LightGBM] [Info] Number of data points in the train set: 84000, number of used features: 31
[LightGBM] [Info] [b

In [9]:
# Definindo as colunas de interesse
colunas_para_buscar = ["tags.mlflow.runName", 'metrics.recall_mean',
                       'metrics.Latência média', 'metrics.recall_fold_1', 
                       'metrics.recall_fold_2', 'metrics.recall_fold_3', 
                       'metrics.recall_fold_4', 'metrics.recall_fold_5']

# Buscando os melhores modelos
runs = mlflow.search_runs()[colunas_para_buscar]

# Ordenando por recall médio
runs.sort_values(by="metrics.recall_mean", ascending=False)

Unnamed: 0,tags.mlflow.runName,metrics.recall_mean,metrics.Latência média,metrics.recall_fold_1,metrics.recall_fold_2,metrics.recall_fold_3,metrics.recall_fold_4,metrics.recall_fold_5
3,LR,0.674476,23.068824,0.665714,0.672381,0.673333,0.673333,0.687619
2,LGBM,0.638857,38.432095,0.629524,0.622857,0.647619,0.644762,0.649524
1,XGB,0.10781,70.765401,0.108571,0.093333,0.115238,0.110476,0.111429
0,RF,0.058286,40.490916,0.064762,0.049524,0.06,0.058095,0.059048


Dos modelos testados, apenas o **LightGBM** e a **Regressão Logística** tiveram
um bom resultado. 

- O **Recall** do **LightGBM** é cerca de **0.5% superior** ao da **Regressão**.
- A **latência** do **LightGBM** é cerca de **76% maior** do que a da **Regressão**.

Dado esses aspectos, a Regressão Logística é a melhor opção.

In [10]:
# Criando/acessando o experimento
mlflow.set_experiment('Comparando regressões')

# Iniciando os experimentos com regressões
for tag_encoder, encoder in dict_encoders.items():
    for tag_scaler, scaler in dict_scalers.items():
        for tag_imputer, imputer in dict_imputers_num.items():
        
            # Gerando a tag de identificação do modelo
            nome_modelo = f'LR_{tag_encoder}_{tag_scaler}_{tag_imputer}'

            with mlflow.start_run(run_name=nome_modelo):
            
                # Criando os pipeline com os transformers
                pipe_cat = Pipeline([("imputer_cat", SimpleImputer(strategy='most_frequent')),
                                    ('encoder', encoder)])

                # Criando os pipeline com os transformers
                pipe_cat_high_dim = Pipeline([("imputer_cat", SimpleImputer(strategy='most_frequent')),
                                                ('encoder_hd', ce.CatBoostEncoder())])

                pipe_num = Pipeline([('imputer_num', imputer),
                                     ('scaler', scaler)])

                # Criando o transformador
                transformer = ColumnTransformer([('cat', pipe_cat, cat_cols),
                                                ('num', pipe_num, num_cols),
                                                ('cat_hd', pipe_cat_high_dim, cat_cols_high_dim)],
                                                remainder="passthrough")

                # Criando o pipeline final
                pipe = Pipeline([('transformer', transformer),
                                ('model', LogisticRegression(class_weight='balanced',
                                                             random_state=rs))])

                # Executando o cross validation
                cross_val_scores = cross_val_score(pipe, x_treino, y_treino, cv=kf, scoring='recall')

                # Calculando a média das métricas
                mean_score = cross_val_scores.mean()           

                # Salvando a métrica da folder 1
                mlflow.log_metric('recall_fold_1', cross_val_scores[0])

                # Salvando a métrica da folder 2
                mlflow.log_metric('recall_fold_2', cross_val_scores[1])

                # Salvando a métrica da folder 3
                mlflow.log_metric('recall_fold_3', cross_val_scores[2])

                # Salvando a métrica da folder 4
                mlflow.log_metric('recall_fold_4', cross_val_scores[3])

                # Salvando a métrica da folder 5
                mlflow.log_metric('recall_fold_5', cross_val_scores[4])

                # Salvando as métricas
                mlflow.log_metric('recall_mean', mean_score)

                # Treinando o algoritmo
                pipe.fit(x_treino, y_treino)

                # Calculando a latência média
                latency_list = []

                for _, row in x_treino[:1000].iterrows():
                
                    # Início da contagem de tempo
                    start_time = time.time()

                    # Extrair os recursos da linha
                    features = row.values.reshape(1, -1)

                    # Fazer a previsão para a linha individual
                    prediction = pipe.predict(pd.DataFrame(features, columns = x_treino.columns.to_list()))

                    # Encerra a contagem
                    end_time = time.time()
                    atomic_time = end_time - start_time

                    # Transforma segundo em milissegundo
                    atomic_milissec = atomic_time * 1000

                    # Adiciona o tempo em uma lista
                    latency_list.append(atomic_milissec)

                # calcula a média 
                mlflow.log_metric("Latência média", np.mean(latency_list))


2024/04/01 16:35:21 INFO mlflow.tracking.fluent: Experiment with name 'Comparando regressões' does not exist. Creating a new experiment.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [12]:
# Definindo as colunas de interesse
colunas_para_buscar = ["tags.mlflow.runName", 'metrics.recall_mean',
                       'metrics.Latência média', 'metrics.recall_fold_1', 
                       'metrics.recall_fold_2', 'metrics.recall_fold_3', 
                       'metrics.recall_fold_4', 'metrics.recall_fold_5']

# Buscando os melhores modelos
runs = mlflow.search_runs()[colunas_para_buscar]

# Ordenando por recall médio
runs.sort_values(by="metrics.recall_mean", ascending=False)

Unnamed: 0,tags.mlflow.runName,metrics.recall_mean,metrics.Latência média,metrics.recall_fold_1,metrics.recall_fold_2,metrics.recall_fold_3,metrics.recall_fold_4,metrics.recall_fold_5
6,LR_GE_SS_SIMEDIAN,0.675619,50.380469,0.673333,0.67619,0.674286,0.668571,0.685714
26,LR_OHE_SS_SIMEDIAN,0.674476,21.52731,0.665714,0.672381,0.673333,0.673333,0.687619
7,LR_GE_SS_SIAVG,0.674286,44.665262,0.662857,0.677143,0.675238,0.668571,0.687619
27,LR_OHE_SS_SIAVG,0.673905,24.908871,0.664762,0.671429,0.675238,0.673333,0.684762
18,LR_BE_SS_SIMEDIAN,0.673714,37.26149,0.665714,0.671429,0.675238,0.668571,0.687619
4,LR_GE_RS_SIMEDIAN,0.672952,37.361869,0.66,0.67619,0.673333,0.668571,0.686667
5,LR_GE_RS_SIAVG,0.672381,42.174859,0.661905,0.672381,0.671429,0.667619,0.688571
19,LR_BE_SS_SIAVG,0.672381,37.976671,0.667619,0.668571,0.677143,0.666667,0.681905
25,LR_OHE_RS_SIAVG,0.672,22.176238,0.662857,0.673333,0.673333,0.664762,0.685714
24,LR_OHE_RS_SIMEDIAN,0.671238,22.642108,0.659048,0.670476,0.673333,0.664762,0.688571


EDITARRRRRRRRRRRRRRR

A combinação **Logistic Regression** + **OneHotEncoder** + **StandardSaler** + **SimpleImputer(Mediana)** que obteve o segundo lugar, possui um score 0.14% menor do que o primeiro e uma latência quase 40% menor que o mesmo.

## Tunando o modelo candidato

Agora que já temos a melhor combinação de preprocessors, vamos buscar a melhor
opção de hiperparâmetros.

Para tal, usaremos o **Optuna**.

In [17]:
# Criando função para tunar o modelo
def objective(trial):

    params = {
        'C': trial.suggest_float('C', 1e-4, 1e+4, log=True),
        'penalty': trial.suggest_categorical('penalty', [None, 'l2']),
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'saga', 'newton-cholesky']),
        'max_iter': trial.suggest_int('max_iter', 50, 1000),
        'fit_intercept': trial.suggest_categorical('fit_intercept', [True, False]),
        'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced']),
        'random_state': rs
    }
    
    # Criando os pipeline com os transformers
    pipe_cat = Pipeline([("imputer_cat", SimpleImputer(strategy='most_frequent')),
                        ('encoder', OneHotEncoder(drop='first'))])

    # Criando os pipeline com os transformers
    pipe_cat_high_dim = Pipeline([("imputer_cat", SimpleImputer(strategy='most_frequent')),
                                    ('encoder_hd', ce.CatBoostEncoder())])

    pipe_num = Pipeline([('imputer_num', SimpleImputer(strategy="median")),
                        ('scaler', StandardScaler())])

    # Criando o transformador
    transformer = ColumnTransformer([('cat', pipe_cat, cat_cols),
                                    ('num', pipe_num, num_cols),
                                    ('cat_hd', pipe_cat_high_dim, cat_cols_high_dim)],
                                    remainder="passthrough")
    
    # Criando o pipeline final
    pipe = Pipeline([('transformer', transformer),
                    ('model', LogisticRegression(**params))])

    # Treinando o modelo com os dados de treino
    pipe.fit(x_treino, y_treino)
   
    recall = recall_score(y_dev, pipe.predict(x_dev))
    
    return recall

# Criando o estudo de otimização
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 50)


[I 2024-04-01 17:14:31,732] A new study created in memory with name: no-name-d8084c9c-526d-4c87-a4f1-90f423f3ddc5


[I 2024-04-01 17:14:36,446] Trial 0 finished with value: 0.6764444444444444 and parameters: {'C': 0.05159891320663209, 'penalty': None, 'solver': 'lbfgs', 'max_iter': 246, 'fit_intercept': False, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.6764444444444444.
[I 2024-04-01 17:16:17,378] Trial 1 finished with value: 0.6622222222222223 and parameters: {'C': 19.165077884766948, 'penalty': None, 'solver': 'saga', 'max_iter': 758, 'fit_intercept': False, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.6764444444444444.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=7.50758e-18): result may not be accurate.
[I 2024-04-01 17:16:20,333] Trial 2 finished with value: 0.6728888888888889 and parameters: {'C': 26.796710997496334, 'penalty': None, 'solver': 'newton-chol

In [19]:
params = study.best_params
params['random_state'] = rs

In [21]:
# Criando os pipeline com os transformers
pipe_cat = Pipeline([("imputer_cat", SimpleImputer(strategy='most_frequent')),
                    ('encoder', OneHotEncoder(drop='first'))])

# Criando os pipeline com os transformers
pipe_cat_high_dim = Pipeline([("imputer_cat", SimpleImputer(strategy='most_frequent')),
                                ('encoder_hd', ce.CatBoostEncoder())])

pipe_num = Pipeline([('imputer_num', SimpleImputer(strategy="median")),
                    ('scaler', StandardScaler())])

# Criando o transformador
transformer = ColumnTransformer([('cat', pipe_cat, cat_cols),
                                ('num', pipe_num, num_cols),
                                ('cat_hd', pipe_cat_high_dim, cat_cols_high_dim)],
                                remainder="passthrough")

# Criando o pipeline final
pipe = Pipeline([('transformer', transformer),
                ('model', LogisticRegression(**params))])

# Treinando o modelo com os dados de treino
pipe.fit(x_treino, y_treino)

In [22]:
# Calculando o recall
recall_score(y_teste, pipe.predict(x_teste))

0.6835555555555556