In [1]:
# Manipulação e Visualização de Dados
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from scipy.stats import ks_2samp
from scipy.stats import ks_2samp
from sklearn.metrics import roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay,accuracy_score, f1_score, recall_score

from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

import optuna
import joblib

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
df_train = pd.read_csv('../data/fold1_no_scale.csv')
df_val = pd.read_csv('../data/fold2_no_scale.csv')
df_test = pd.read_csv('../data/fold3_no_scale.csv')

In [3]:
TARGET = "Churn"

X_train, y_train = df_train.drop(columns=[TARGET]), df_train[TARGET]
X_val,   y_val   = df_val.drop(columns=[TARGET]),   df_val[TARGET]
X_test,  y_test  = df_test.drop(columns=[TARGET]),  df_test[TARGET]

In [4]:
feature_column_names = X_train.columns.tolist()

In [5]:
# Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_val)
print("Acurácia Random Forest (validação):", accuracy_score(y_val, rf_preds))

# MLPClassifier
mlp_model = MLPClassifier(random_state=42,max_iter=1500)
mlp_model.fit(X_train, y_train)
mlp_preds = mlp_model.predict(X_val)
print("Acurácia MLP (validação):", accuracy_score(y_val, mlp_preds))

# XGBoost
xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_val)
print("Acurácia XGBoost (validação):", accuracy_score(y_val, xgb_preds))

Acurácia Random Forest (validação): 0.7768762677484787
Acurácia MLP (validação): 0.7870182555780934
Acurácia XGBoost (validação): 0.7834685598377282


In [6]:
# def compute_metric(metric, y_true, probs, preds):
#     """
#     Calcula o score de acordo com a métrica escolhida.
    
#     Parâmetros:
#       - metric: string ('ks', 'roc', 'accuracy', 'f1', 'recall')
#       - y_true: valores reais
#       - probs: probabilidades preditas (usadas para KS e ROC-AUC)
#       - preds: classes preditas (usadas para acurácia, F1 e recall)
      
#     Retorna:
#       - Valor da métrica escolhida.
#     """
#     if metric == 'ks':
#         return ks_2samp(probs[y_true == 1], probs[y_true == 0]).statistic
#     elif metric == 'roc':
#         fpr, tpr, thresholds = roc_curve(y_true, probs)
#         return auc(fpr, tpr)
#     elif metric == 'accuracy':
#         return accuracy_score(y_true, preds)
#     elif metric == 'f1':
#         return f1_score(y_true, preds)
#     elif metric == 'recall':
#         return recall_score(y_true, preds)
#     else:
#         raise ValueError("Métrica não suportada. Escolha entre 'ks', 'roc', 'accuracy', 'f1' ou 'recall'.")
    
# PARAM_SPACE = {
#     "rf": {
#         "n_estimators":           (50, 300,    "int"),
#         "max_depth":              (2, 80,     "int_log"),
#         "min_samples_split":      (2, 20,     "int"),
#         "min_samples_leaf":       (1, 20,     "int"),
#         "max_features":           (0.1, 1.0,  "float"),
#         "bootstrap":              [True, False],
#         "criterion":              ["gini", "entropy"],
#     },
#     "mlp": {
#         "hidden_layer_sizes":     [(50,), (100,), (50, 50), (100, 50), (100, 100)],
#         "activation":             ["relu", "tanh", "logistic"],
#         "solver":                 ["adam", "sgd"],
#         "alpha":                  (1e-5, 1e-2,    "float_log"),
#         "learning_rate_init":     (1e-7, 1e-1,    "float_log"),
#         "batch_size":             ["auto", 8, 16, 32, 64, 128],
#         "max_iter":               [ 500, 1000,1300,1500,2000],
#         "momentum":               (0.0, 0.99,     "float"),
#         "learning_rate":          ["constant", "invscaling", "adaptive"],
#         "tol":                    (1e-5, 1e-2,    "float_log"),
#     },
#     "xgb": {
#         "n_estimators":           (50, 300,    "int"),
#         "max_depth":              (2, 10,     "int_log"),
#         "learning_rate":          (1e-3, 0.3,  "float_log"),
#         "gamma":                  (0, 5,       "float"),
#         "subsample":              (0.5, 1.0,   "float"),
#         "colsample_bytree":       (0.5, 1.0,   "float"),
#         "reg_alpha":              (1e-5, 1e-1, "float_log"),
#         "reg_lambda":             (1e-5, 1e-1, "float_log"),
#     }
# }


# def get_objective(model_name, metric="ks"):
#     space = PARAM_SPACE[model_name]
#     def objective(trial):
#         # sugere tudo dinamicamente
#         kwargs = {}
#         for k,v in space.items():
#             if isinstance(v, tuple):
#                 low, high, kind = v
#                 if kind=="int":
#                     kwargs[k] = trial.suggest_int(k, low, high)
#                 elif kind=="int_log":
#                     kwargs[k] = trial.suggest_int(k, low, high, log=True)
#                 elif kind=="float":
#                     kwargs[k] = trial.suggest_float(k, low, high)
#                 elif kind=="float_log":
#                     kwargs[k] = trial.suggest_float(k, low, high, log=True)
#             else:
#                 # lista de choices
#                 kwargs[k] = trial.suggest_categorical(k, v)

#         # instancia o modelo
#         if model_name=="rf":
#             clf = RandomForestClassifier(random_state=42, **kwargs)
#             # fit & score
#             clf.fit(X_train, y_train)
#             probs = clf.predict_proba(X_val)[:,1]
#             preds = clf.predict(X_val)
#         elif model_name=="mlp":
#             clf = MLPClassifier(random_state=42, **kwargs)
#             scaler = StandardScaler()
#             X_train_scaled = scaler.fit_transform(X_train)
#             X_val_scaled = scaler.transform(X_val)
#             # fit & score
#             clf.fit(X_train_scaled, y_train)
#             probs = clf.predict_proba(X_val_scaled)[:,1]
#             preds = clf.predict(X_val_scaled)
#         else:  # xgb
#             clf = XGBClassifier(random_state=42, use_label_encoder=False,
#                                 eval_metric="logloss", **kwargs)
#             # fit & score
#             clf.fit(X_train, y_train)
#             probs = clf.predict_proba(X_val)[:,1]
#             preds = clf.predict(X_val)

#         return compute_metric(metric, y_val, probs, preds)
#     return objective

# for model in ["rf","mlp","xgb"]:
#     study = optuna.create_study(direction="maximize")
#     study.optimize(get_objective(model), n_trials=50)
#     print(model, "→ :best_params:", study.best_params, "| best_score:", study.best_value)
#     if model == "rf":
#         best_rf_params = study.best_params
#     elif model == "mlp":
#         best_mlp_params = study.best_params
#     elif model == "xgb":
#         best_xgb_params = study.best_params


In [7]:
best_rf_params = {'n_estimators': 181, 'max_depth': 80, 'min_samples_split': 9, 'min_samples_leaf': 10, 'max_features': 0.14433895866314056, 'bootstrap': True, 'criterion': 'gini'}
best_mlp_params = {'hidden_layer_sizes': (50,), 'activation': 'relu', 'solver': 'adam', 'alpha': 0.00014922136338028723, 'learning_rate_init': 0.0013999067855568553, 'batch_size': 'auto', 'max_iter': 1000, 'momentum': 0.9079471829305951, 'learning_rate': 'constant', 'tol': 0.00279005126458061}
best_xgb_params = {'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.06394426113769631, 'gamma': 4.250873077353954, 'subsample': 0.8098870915836598, 'colsample_bytree': 0.8381743726836992, 'reg_alpha': 0.001466473948922872, 'reg_lambda': 0.002877321235259522}

In [8]:
# best_rf_params = {'n_estimators': 132, 'max_depth': 3, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': 0.22290137215315986, 'bootstrap': True, 'criterion': 'entropy'}
# best_mlp_params = {'hidden_layer_sizes': (100, 50), 'activation': 'tanh', 'solver': 'adam', 'alpha': 9.378890046495715e-05, 'learning_rate_init': 9.351095668258579e-06, 'batch_size': 16, 'max_iter': 1500, 'momentum': 0.7731362527089873, 'learning_rate': 'invscaling', 'tol': 4.0359613960011186e-05}
# best_xgb_params = {'n_estimators': 181,
#  'max_depth': 2,
#  'learning_rate': 0.034086900335549314,
#  'gamma': 1.6478134039856276,
#  'subsample': 0.6339990719450903,
#  'colsample_bytree': 0.6094061736031922,
#  'reg_alpha': 0.004519046789752244,
#  'reg_lambda': 0.0011747913628717445}

In [9]:
import os
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier


SAVE_DIR = '../models/'
os.makedirs(SAVE_DIR, exist_ok=True)

X_train_df = pd.DataFrame(X_train)
X_valid_df = pd.DataFrame(X_val)
X_test_df  = pd.DataFrame(X_test)

y_train_df = pd.DataFrame(y_train)
y_valid_df = pd.DataFrame(y_val)
y_test_df  = pd.DataFrame(y_test)

models = {
    "Random Forest": RandomForestClassifier(random_state=42, **best_rf_params),
    "MLP":           MLPClassifier(random_state=42, **best_mlp_params),
    "XGBoost":       XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss', **best_xgb_params)
}

folds = [
    {
        "X_train": pd.concat([X_train_df, X_valid_df], axis=0).reset_index(drop=True),
        "y_train": pd.concat([y_train_df, y_valid_df], axis=0).reset_index(drop=True),
        "X_test":  X_test_df.reset_index(drop=True),
        "y_test":  y_test_df.reset_index(drop=True)
    },
    {
        "X_train": pd.concat([X_valid_df, X_test_df], axis=0).reset_index(drop=True),
        "y_train": pd.concat([y_valid_df, y_test_df], axis=0).reset_index(drop=True),
        "X_test":  X_train_df.reset_index(drop=True),
        "y_test":  y_train_df.reset_index(drop=True)
    },
    {
        "X_train": pd.concat([X_test_df, X_train_df], axis=0).reset_index(drop=True),
        "y_train": pd.concat([y_test_df, y_train_df], axis=0).reset_index(drop=True),
        "X_test":  X_valid_df.reset_index(drop=True),
        "y_test":  y_valid_df.reset_index(drop=True)
    }
]

for i, fold in enumerate(folds, start=1):
    X_tr = fold["X_train"]
    y_tr = fold["y_train"].squeeze().values.ravel()

    print(f"\n=== Fold {i} ===")
    for name, model in models.items():
        model.fit(X_tr, y_tr)
        filename = os.path.join(SAVE_DIR, f'{name}_fold_{i}_model.joblib')
        joblib.dump(model, filename)
        print(f"Modelo '{name}' do Fold {i} salvo em: {filename}")



=== Fold 1 ===




Modelo 'Random Forest' do Fold 1 salvo em: ../models/Random Forest_fold_1_model.joblib
Modelo 'MLP' do Fold 1 salvo em: ../models/MLP_fold_1_model.joblib
Modelo 'XGBoost' do Fold 1 salvo em: ../models/XGBoost_fold_1_model.joblib

=== Fold 2 ===
Modelo 'Random Forest' do Fold 2 salvo em: ../models/Random Forest_fold_2_model.joblib
Modelo 'MLP' do Fold 2 salvo em: ../models/MLP_fold_2_model.joblib
Modelo 'XGBoost' do Fold 2 salvo em: ../models/XGBoost_fold_2_model.joblib

=== Fold 3 ===
Modelo 'Random Forest' do Fold 3 salvo em: ../models/Random Forest_fold_3_model.joblib
Modelo 'MLP' do Fold 3 salvo em: ../models/MLP_fold_3_model.joblib
Modelo 'XGBoost' do Fold 3 salvo em: ../models/XGBoost_fold_3_model.joblib
