# 0. Imports

In [1]:
import sys

import joblib
import numpy as np

sys.path.insert(0, "../src/")

from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.discretisation import (
    EqualFrequencyDiscretiser,
    EqualWidthDiscretiser,
)
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.linear_model import LogisticRegression

from utils.utils import load_config_file
from data.data_load import DataLoad
from data.data_validation import DataValidation
from data.data_transformation import DataTransformation
from data.data_preprocess import DataPreprocess
from train.train import TrainModels
from evaluation.classifier_eval import ModelEvaluation

# 1. Data Load

In [2]:
# Load the training dataset through the project's DataLoad helper.
# "train_dataset_name" is the identifier passed to load_data
# (presumably resolved to a concrete file by the project config — TODO confirm).
dl = DataLoad()
df = dl.load_data("train_dataset_name")

[2m2025-05-19 20:58:52[0m [[32m[1minfo     [0m] [1mComeçando a carga dos dados com o nome: train_dataset_name[0m


# 2. Data Validation

In [3]:
# Run the project's validation step on the loaded frame and keep its result.
# NOTE(review): `is_valid` is never checked in the cells below, so a failed
# validation would not stop the notebook — confirm what `run` returns and
# consider guarding on it.
dv = DataValidation()
is_valid = dv.run(df)

[2m2025-05-19 20:58:56[0m [[32m[1minfo     [0m] [1mValidacao iniciou             [0m
[2m2025-05-19 20:58:56[0m [[32m[1minfo     [0m] [1mValidation columns passed...  [0m
[2m2025-05-19 20:58:56[0m [[32m[1minfo     [0m] [1mValidacao com sucesso.        [0m


# 3. Data Transformation

In [4]:
# Split the loaded frame into train/validation features and targets.
# The split parameters (ratio, seed, stratification) live inside
# DataTransformation and are not visible here — TODO confirm they are seeded.
dt = DataTransformation(df)
X_train, X_valid, y_train, y_valid = dt.train_test_spliting()

# 4. Experiments

In [5]:
import mlflow
from mlflow.tracking import MlflowClient

In [6]:
# Point MLflow at the local tracking server and select the experiment that
# every run started below is recorded under.
# NOTE(review): the URI is hard-coded to localhost; a tracking server must be
# listening on port 5000 before this cell is run.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("prob_loan")

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1747133663728, experiment_id='1', last_update_time=1747133663728, lifecycle_stage='active', name='prob_loan', tags={}>

## 4.1. Hyperparameters

In [7]:
from mlflow.models import MetricThreshold
from sklearn.dummy import DummyClassifier
from mlflow.models import infer_signature

In [8]:
from hyperopt import fmin, tpe, hp, STATUS_OK

In [9]:
# Preprocessing pipeline shared by every trial: impute -> discretise -> scale.
pipe = Pipeline(
    steps=[
        # Imputation restricted to the two listed columns.
        ("imputer", MeanMedianImputer(variables=["RendaMensal", "NumeroDeDependentes"])),
        # Equal-frequency binning of the three listed continuous columns.
        (
            "discretizer",
            EqualFrequencyDiscretiser(
                variables=[
                    "TaxaDeUtilizacaoDeLinhasNaoGarantidas",
                    "TaxaDeEndividamento",
                    "RendaMensal",
                ]
            ),
        ),
        # StandardScaler applied through the feature_engine wrapper; no
        # `variables` argument is given, so the wrapper's default column
        # selection applies.
        ("scaler", SklearnTransformerWrapper(StandardScaler())),
    ]
)

In [10]:
def objective(params):
    """Run one hyperopt trial and record it as an MLflow run.

    The trial fits the shared preprocessing pipeline (`pipe`) on the training
    split, evaluates a LogisticRegression built from `params` via the project's
    ModelEvaluation helper, refits the model on the whole training split, logs
    metrics, artifacts and models to MLflow, and finally runs `mlflow.evaluate`
    against a uniform DummyClassifier baseline.

    Args:
        params: dict of keyword arguments for `LogisticRegression`, as sampled
            by hyperopt from the search space.

    Returns:
        dict with ``"loss"`` (negative mean of the cross-validation scores, so
        that minimising the loss maximises the score) and ``"status"`` set to
        ``STATUS_OK``.
    """
    # The context manager closes the run when the block exits (also on an
    # exception), so no explicit mlflow.end_run() is required.
    with mlflow.start_run(run_name="with_discretizer_hyperopt"):
        mlflow.set_tag("model_name", "lr_discretizer_hyperopt")
        mlflow.log_params(params)

        # 1. Fit the preprocessor on the training split only, then apply it to
        #    both splits.
        preprocessador = DataPreprocess(pipe)
        preprocessador.train(X_train)

        X_train_processed = preprocessador.transform(X_train)
        X_valid_processed = preprocessador.transform(X_valid)
        joblib.dump(preprocessador, "../models/preprocess.joblib")

        ## 1.1. Log the fitted preprocessor as a run artifact
        mlflow.log_artifact("../models/preprocess.joblib")

        ## 1.2. Log the preprocessing steps as run parameters
        mlflow.log_params(
            params={
                "imputer": pipe["imputer"],
                "discretizer": pipe["discretizer"],
                "scaler": pipe["scaler"],
            }
        )

        # 2. Cross-validated evaluation on the training split
        model = LogisticRegression(**params)
        model_eval = ModelEvaluation(model, X_train_processed, y_train, n_splits=5)
        roc_auc_scores = model_eval.cross_val_evaluate()
        mean_cv_roc_auc = roc_auc_scores.mean()

        ## 2.1. Log the cross-validation performance
        mlflow.log_metric("train_roc_auc", mean_cv_roc_auc)

        # 3. Fit the model on the full training split
        model.fit(X_train_processed, y_train)

        # 4. Validation metric. Predict with the estimator that was just fitted
        #    and is logged below, so the metric always describes the logged
        #    model regardless of how ModelEvaluation stores its estimator.
        y_val_preds = model.predict_proba(X_valid_processed)[:, 1]
        val_roc_auc = model_eval.evaluate_predictions(y_valid, y_val_preds)

        mlflow.log_metric("valid_roc_auc", val_roc_auc)

        # 5. Log the candidate model with the same signature as the baseline.
        #    The signature is inferred before the label column is added to the
        #    evaluation frame.
        signature = infer_signature(X_valid_processed, y_valid)
        candidate_model_uri = mlflow.sklearn.log_model(
            model, "lr_model", signature=signature
        ).model_uri

        # 6. MLflow evaluation against a baseline.
        # Work on a copy so the processed validation features are not mutated
        # by the added label column.
        eval_data = X_valid_processed.copy()
        eval_data["label"] = y_valid

        thresholds = {
            "accuracy_score": MetricThreshold(
                # Minimum accuracy required for the candidate to pass.
                # NOTE(review): the original comment said 0.7 while the value
                # is 0.1 — confirm which one is intended.
                threshold=0.1,
                min_absolute_change=0.05,
                min_relative_change=0.05,
                greater_is_better=True,
            )
        }

        baseline_model = DummyClassifier(strategy="uniform").fit(
            X_train_processed, y_train
        )
        baseline_model_uri = mlflow.sklearn.log_model(
            baseline_model, "baseline_model", signature=signature
        ).model_uri

        # Let MLflow compute classifier metrics and check them against the
        # thresholds relative to the baseline model.
        # NOTE(review): this call emits a warning in the recorded output;
        # `baseline_model` / `validation_thresholds` may be deprecated in the
        # installed MLflow version — verify.
        mlflow.evaluate(
            candidate_model_uri,
            eval_data,
            targets="label",
            model_type="classifier",
            validation_thresholds=thresholds,
            baseline_model=baseline_model_uri,
        )

        return {"loss": -mean_cv_roc_auc, "status": STATUS_OK}

In [11]:
# Hyperopt search space; every key is forwarded as a LogisticRegression
# keyword argument by `objective`.
# NOTE(review): `multi_class` is a fixed value, not a sampled one; recent
# scikit-learn releases may warn about this argument — verify against the
# installed version.
search_space = dict(
    warm_start=hp.choice("warm_start", [True, False]),
    fit_intercept=hp.choice("fit_intercept", [True, False]),
    tol=hp.uniform("tol", 0.00001, 0.0001),
    C=hp.uniform("C", 0.05, 3),
    solver=hp.choice("solver", ["newton-cg", "lbfgs", "liblinear"]),
    max_iter=hp.choice("max_iter", range(100, 1000)),
    multi_class="auto",
    class_weight=hp.choice("class_weight", [None, "balanced"]),
)

In [12]:
# Run the TPE search. The random state is seeded so the sampled
# hyperparameters are the same on every Restart & Run All.
# For hp.choice dimensions the returned dict holds the index of the chosen
# option, not the option itself (hyperopt.space_eval maps it back).
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=5,
    rstate=np.random.default_rng(42),
)

[2m2025-05-19 20:59:08[0m [[32m[1minfo     [0m] [1mPreprocessamento iniciou...   [0m
[2m2025-05-19 20:59:09[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou...[0m
[2m2025-05-19 20:59:09[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou...[0m
[2m2025-05-19 20:59:09[0m [[32m[1minfo     [0m] [1mIniciou a validação cruzada...[0m
  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]









[2m2025-05-19 20:59:11[0m [[32m[1minfo     [0m] [1mIniciou a validação do modelo [0m
  0%|          | 0/5 [00:03<?, ?trial/s, best loss=?]



  from .autonotebook import tqdm as notebook_tqdm

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]
Downloading artifacts:  20%|##        | 1/5 [00:00<00:00, 618.54it/s]
Downloading artifacts:  40%|####      | 2/5 [00:00<00:00, 467.62it/s]
Downloading artifacts:  60%|######    | 3/5 [00:00<00:00, 403.32it/s]
Downloading artifacts:  80%|########  | 4/5 [00:00<00:00, 363.15it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 322.20it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 295.91it/s]


2025/05/19 20:59:20 INFO mlflow.models.evaluation.evaluators.classifier: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.

2025/05/19 20:59:20 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...

2025/05/19 20:59:23 INFO mlflow.models.evaluation.evaluators.shap: Shap explainer LinearExplainer is used.



  mlflow.evaluate(candidate_model_uri,

Downloading artifacts:   0%|          | 0/

🏃 View run with_discretizer_hyperopt at: http://127.0.0.1:5000/#/experiments/1/runs/ddfebcee207d41689c408817173e103e

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1

 20%|██        | 1/5 [00:18<01:13, 18.28s/trial, best loss: -0.8245984820451715]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.031858 seconds

INFO:hyperopt.tpe:TPE using 1/1 trials with best loss -0.824598



[2m2025-05-19 20:59:27[0m [[32m[1minfo     [0m] [1mPreprocessamento iniciou...   [0m
[2m2025-05-19 20:59:27[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou...[0m
[2m2025-05-19 20:59:27[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou...[0m
[2m2025-05-19 20:59:27[0m [[32m[1minfo     [0m] [1mIniciou a validação cruzada...[0m
 20%|██        | 1/5 [00:18<01:13, 18.28s/trial, best loss: -0.8245984820451715]









[2m2025-05-19 20:59:29[0m [[32m[1minfo     [0m] [1mIniciou a validação do modelo [0m
 20%|██        | 1/5 [00:20<01:13, 18.28s/trial, best loss: -0.8245984820451715]



Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]
Downloading artifacts:  20%|##        | 1/5 [00:00<00:00, 47.52it/s]
Downloading artifacts:  40%|####      | 2/5 [00:00<00:00, 77.04it/s]
Downloading artifacts:  60%|######    | 3/5 [00:00<00:00, 96.58it/s]
Downloading artifacts:  80%|########  | 4/5 [00:00<00:00, 87.87it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 99.36it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 95.28it/s]


2025/05/19 20:59:36 INFO mlflow.models.evaluation.evaluators.classifier: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.

2025/05/19 20:59:36 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...

2025/05/19 20:59:37 INFO mlflow.models.evaluation.evaluators.shap: Shap explainer LinearExplainer is used.



  mlflow.evaluate(candidate_model_uri,

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]
Downloading artifacts:  20%|##        

🏃 View run with_discretizer_hyperopt at: http://127.0.0.1:5000/#/experiments/1/runs/4c9c97b140a04200b980bf1a91eb695d

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                    

 40%|████      | 2/5 [00:32<00:48, 16.06s/trial, best loss: -0.8473541789127749]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.031034 seconds

INFO:hyperopt.tpe:TPE using 2/2 trials with best loss -0.847354



[2m2025-05-19 20:59:41[0m [[32m[1minfo     [0m] [1mPreprocessamento iniciou...   [0m
[2m2025-05-19 20:59:41[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou...[0m
[2m2025-05-19 20:59:41[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou...[0m
[2m2025-05-19 20:59:41[0m [[32m[1minfo     [0m] [1mIniciou a validação cruzada...[0m
 40%|████      | 2/5 [00:33<00:48, 16.06s/trial, best loss: -0.8473541789127749]









[2m2025-05-19 20:59:44[0m [[32m[1minfo     [0m] [1mIniciou a validação do modelo [0m
 40%|████      | 2/5 [00:35<00:48, 16.06s/trial, best loss: -0.8473541789127749]



Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]
Downloading artifacts:  20%|##        | 1/5 [00:00<00:00, 53.45it/s]
Downloading artifacts:  40%|####      | 2/5 [00:00<00:00, 73.95it/s]
Downloading artifacts:  60%|######    | 3/5 [00:00<00:00, 69.57it/s]
Downloading artifacts:  80%|########  | 4/5 [00:00<00:00, 77.31it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 90.48it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 86.82it/s]


2025/05/19 20:59:51 INFO mlflow.models.evaluation.evaluators.classifier: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.

2025/05/19 20:59:52 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...

2025/05/19 20:59:53 INFO mlflow.models.evaluation.evaluators.shap: Shap explainer LinearExplainer is used.



  mlflow.evaluate(candidate_model_uri,

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]
Downloading artifacts:  20%|##        

🏃 View run with_discretizer_hyperopt at: http://127.0.0.1:5000/#/experiments/1/runs/249a9b3d7a464d6e9f8ecdab3b4de3ef

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                    

 60%|██████    | 3/5 [00:49<00:32, 16.15s/trial, best loss: -0.8473541789127749]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.034949 seconds

INFO:hyperopt.tpe:TPE using 3/3 trials with best loss -0.847354



[2m2025-05-19 20:59:57[0m [[32m[1minfo     [0m] [1mPreprocessamento iniciou...   [0m
[2m2025-05-19 20:59:57[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou...[0m
[2m2025-05-19 20:59:57[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou...[0m
[2m2025-05-19 20:59:58[0m [[32m[1minfo     [0m] [1mIniciou a validação cruzada...[0m
 60%|██████    | 3/5 [00:49<00:32, 16.15s/trial, best loss: -0.8473541789127749]









[2m2025-05-19 21:00:00[0m [[32m[1minfo     [0m] [1mIniciou a validação do modelo [0m
 60%|██████    | 3/5 [00:52<00:32, 16.15s/trial, best loss: -0.8473541789127749]



Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]
Downloading artifacts:  20%|##        | 1/5 [00:00<00:00, 49.77it/s]
Downloading artifacts:  40%|####      | 2/5 [00:00<00:00, 47.27it/s]
Downloading artifacts:  60%|######    | 3/5 [00:00<00:00, 59.71it/s]
Downloading artifacts:  80%|########  | 4/5 [00:00<00:00, 74.88it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 85.89it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 80.81it/s]


2025/05/19 21:00:06 INFO mlflow.models.evaluation.evaluators.classifier: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.

2025/05/19 21:00:06 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...

2025/05/19 21:00:08 INFO mlflow.models.evaluation.evaluators.shap: Shap explainer LinearExplainer is used.



  mlflow.evaluate(candidate_model_uri,

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]
Downloading artifacts:  20%|##        

🏃 View run with_discretizer_hyperopt at: http://127.0.0.1:5000/#/experiments/1/runs/bb7a798b6bbd46229f1f46174a73ba50

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                    

 80%|████████  | 4/5 [01:02<00:15, 15.26s/trial, best loss: -0.8473541789127749]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.037081 seconds

INFO:hyperopt.tpe:TPE using 4/4 trials with best loss -0.847354



[2m2025-05-19 21:00:11[0m [[32m[1minfo     [0m] [1mPreprocessamento iniciou...   [0m
[2m2025-05-19 21:00:11[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou...[0m
[2m2025-05-19 21:00:11[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou...[0m
[2m2025-05-19 21:00:11[0m [[32m[1minfo     [0m] [1mIniciou a validação cruzada...[0m
 80%|████████  | 4/5 [01:03<00:15, 15.26s/trial, best loss: -0.8473541789127749]









[2m2025-05-19 21:00:14[0m [[32m[1minfo     [0m] [1mIniciou a validação do modelo [0m
 80%|████████  | 4/5 [01:06<00:15, 15.26s/trial, best loss: -0.8473541789127749]



Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]
Downloading artifacts:  20%|##        | 1/5 [00:00<00:00, 47.07it/s]
Downloading artifacts:  40%|####      | 2/5 [00:00<00:00, 70.97it/s]
Downloading artifacts:  60%|######    | 3/5 [00:00<00:00, 70.85it/s]
Downloading artifacts:  80%|########  | 4/5 [00:00<00:00, 80.06it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 95.09it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 91.58it/s]


2025/05/19 21:00:22 INFO mlflow.models.evaluation.evaluators.classifier: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.

2025/05/19 21:00:22 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...

2025/05/19 21:00:23 INFO mlflow.models.evaluation.evaluators.shap: Shap explainer LinearExplainer is used.



  mlflow.evaluate(candidate_model_uri,

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]
Downloading artifacts:  20%|##        

🏃 View run with_discretizer_hyperopt at: http://127.0.0.1:5000/#/experiments/1/runs/d3f0332c474948b89f065e2ba588fcd7

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                    

100%|██████████| 5/5 [01:18<00:00, 15.77s/trial, best loss: -0.8473541789127749]


<Figure size 1050x700 with 0 Axes>