# 0. Imports

In [7]:
import sys
import joblib
sys.path.insert(0, '../src/')

from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.linear_model import LogisticRegression

from utils.utils import load_config_file
from data.data_load import DataLoad
from data.data_validation import DataValidation
from data.data_transformation import DataTransformation
from data.data_preprocess import DataPreprocess
from train.train import TrainModels
from evaluation.classifier_eval import ModelEvaluation

# 1. Data Load

In [2]:
# Load the training dataset through the project's DataLoad helper.
# 'train_dataset_name' is the dataset name handed to load_data
# (presumably resolved to a concrete source by the project config — TODO confirm).
dl = DataLoad()
df = dl.load_data('train_dataset_name')

[2m2025-05-13 07:53:11[0m [[32m[1minfo     [0m] [1mComeçando a carga dos dados com o nome: train_dataset_name[0m


# 2. Data Validation

In [3]:
# Run the project's validation step on the loaded frame and keep its result.
# NOTE(review): `is_valid` is not checked in any of the cells visible here;
# consider failing fast on it once the return type of `run` is confirmed.
dv = DataValidation()
is_valid = dv.run(df)

[2m2025-05-13 07:53:13[0m [[32m[1minfo     [0m] [1mValidacao iniciou             [0m
[2m2025-05-13 07:53:13[0m [[32m[1minfo     [0m] [1mValidation columns passed...  [0m
[2m2025-05-13 07:53:13[0m [[32m[1minfo     [0m] [1mValidacao com sucesso.        [0m


# 3. Data Transformation

In [4]:
# Split the frame into train/validation features and targets.
# NOTE(review): the split ratio and random seed are not visible here —
# presumably fixed inside DataTransformation; confirm for reproducibility.
dt = DataTransformation(df)
X_train, X_valid, y_train, y_valid = dt.train_test_spliting()

# 4. Experiments

In [5]:
import mlflow
from mlflow.tracking import MlflowClient

In [6]:
# Point MLflow at the tracking server on localhost:5000 and select the
# 'prob_loan' experiment (MLflow creates it when it does not exist yet).
# NOTE(review): the tracking server must already be running at this address.
mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment('prob_loan')

2025/05/13 07:54:23 INFO mlflow.tracking.fluent: Experiment with name 'prob_loan' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1747133663728, experiment_id='1', last_update_time=1747133663728, lifecycle_stage='active', name='prob_loan', tags={}>

In [12]:
# Baseline experiment: mean/median imputation + standard scaling feeding a
# logistic regression. Logs the fitted preprocessor, its parameters, the
# cross-validated and hold-out ROC AUC, and the fitted model to MLflow.
with mlflow.start_run(run_name='baseline'):
    mlflow.set_tag('model_name', 'lr_baseline')

    # 1. Preprocess the data: impute the configured variables, then scale.
    pipe = Pipeline([
        ('imputer', MeanMedianImputer(variables=load_config_file().get('vars_imputer'))),
        ('scaler', SklearnTransformerWrapper(StandardScaler()))
    ])
    preprocessador = DataPreprocess(pipe)
    preprocessador.train(X_train)

    X_train_processed = preprocessador.transform(X_train)
    X_valid_processed = preprocessador.transform(X_valid)
    joblib.dump(preprocessador, '../models/preprocess.joblib')

    ## 1.1. Log the fitted preprocessor as a run artifact.
    mlflow.log_artifact('../models/preprocess.joblib')

    ## 1.2. Log the preprocessing steps as run parameters
    ##      (MLflow stores the string form of each step).
    mlflow.log_params(params={'imputer': pipe['imputer'], 'scaler': pipe['scaler']})

    # 2. Cross-validate the model on the processed training data.
    model = LogisticRegression()
    model_eval = ModelEvaluation(model,
                                 X_train_processed,
                                 y_train,
                                 n_splits=5)
    roc_auc_scores = model_eval.cross_val_evaluate()

    ## 2.1. Log the mean cross-validated ROC AUC.
    mlflow.log_metric('train_roc_auc', roc_auc_scores.mean())

    # 3. Fit the model on the full processed training set.
    model.fit(X_train_processed, y_train)

    # 4. Score the hold-out set with the estimator that was just fitted (and
    #    that is logged below), so the metric describes the logged model.
    y_val_preds = model.predict_proba(X_valid_processed)[:, 1]
    val_roc_auc = model_eval.evaluate_predictions(y_valid, y_val_preds)

    mlflow.log_metric('valid_roc_auc', val_roc_auc)

    # 5. Log the model; the pyfunc flavour serves predict_proba.
    mlflow.sklearn.log_model(model, 'lr_model', pyfunc_predict_fn='predict_proba')

    # No explicit mlflow.end_run(): the `with` block ends the run on exit and
    # marks it FAILED if an exception escapes.


[2m2025-05-13 08:29:23[0m [[32m[1minfo     [0m] [1mPreprocessamento iniciou...   [0m
[2m2025-05-13 08:29:23[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou...[0m
[2m2025-05-13 08:29:24[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou...[0m
[2m2025-05-13 08:29:24[0m [[32m[1minfo     [0m] [1mIniciou a validação cruzada...[0m
[2m2025-05-13 08:29:37[0m [[32m[1minfo     [0m] [1mIniciou a validação do modelo [0m




🏃 View run baseline at: http://127.0.0.1:5000/#/experiments/1/runs/14b0108dc5d34315b94cc6cc5a40aa71
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


## 4.1. Experiment 01

In [13]:
from mlflow.models import MetricThreshold
from sklearn.dummy import DummyClassifier
from mlflow.models import infer_signature

In [None]:
# Experiment 01: same flow as the baseline run, plus equal-frequency
# discretisation of the configured variables. The candidate is then validated
# with mlflow.evaluate against a DummyClassifier baseline, and SHAP
# explanations are logged.
with mlflow.start_run(run_name='with_discretizer'):
    mlflow.set_tag('model_name', 'lr_discretizer')

    # 1. Preprocess the data: impute, discretise, then scale.
    config = load_config_file()  # read the project config once for both steps
    pipe = Pipeline([
        ('imputer', MeanMedianImputer(variables=config.get('vars_imputer'))),
        ('discretizer', EqualFrequencyDiscretiser(variables=config.get('vars_discretizer'))),
        ('scaler', SklearnTransformerWrapper(StandardScaler()))
    ])
    preprocessador = DataPreprocess(pipe)
    preprocessador.train(X_train)

    X_train_processed = preprocessador.transform(X_train)
    X_valid_processed = preprocessador.transform(X_valid)
    joblib.dump(preprocessador, '../models/preprocess.joblib')

    ## 1.1. Log the fitted preprocessor as a run artifact.
    mlflow.log_artifact('../models/preprocess.joblib')

    ## 1.2. Log the preprocessing steps as run parameters
    ##      (MLflow stores the string form of each step).
    mlflow.log_params(params={'imputer': pipe['imputer'], 'discretizer': pipe['discretizer'], 'scaler': pipe['scaler']})

    # 2. Cross-validate the model on the processed training data.
    model = LogisticRegression()
    model_eval = ModelEvaluation(model,
                                 X_train_processed,
                                 y_train,
                                 n_splits=5)
    roc_auc_scores = model_eval.cross_val_evaluate()

    ## 2.1. Log the mean cross-validated ROC AUC.
    mlflow.log_metric('train_roc_auc', roc_auc_scores.mean())

    # 3. Fit the model on the full processed training set.
    model.fit(X_train_processed, y_train)

    # 4. Score the hold-out set with the estimator that was just fitted (and
    #    that is logged below), so the metric describes the logged model.
    y_val_preds = model.predict_proba(X_valid_processed)[:, 1]
    val_roc_auc = model_eval.evaluate_predictions(y_valid, y_val_preds)

    mlflow.log_metric('valid_roc_auc', val_roc_auc)

    # 5. Log the candidate model and keep its URI for evaluation.
    candidate_model_uri = mlflow.sklearn.log_model(model, 'lr_model').model_uri

    # 6. Validate the candidate against a dummy baseline.
    signature = infer_signature(X_valid_processed, y_valid)

    # Build the evaluation frame as a copy: assigning the label onto
    # X_valid_processed itself would add the target column to the feature
    # frame and break any later use (or re-run) that expects features only.
    eval_data = X_valid_processed.assign(label=y_valid)

    thresholds = {
        "accuracy_score": MetricThreshold(
            threshold=0.7,  # accuracy must be > 0.7 for the candidate to pass validation
            min_absolute_change=0.05,
            min_relative_change=0.05,
            greater_is_better=True
        )
    }

    # NOTE(review): strategy='uniform' predicts at random and no random_state
    # is set, so the baseline metrics vary between runs — consider seeding it.
    baseline_model = DummyClassifier(strategy='uniform').fit(X_train_processed, y_train)
    baseline_model_uri = mlflow.sklearn.log_model(baseline_model, 'baseline_model', signature=signature).model_uri

    # Evaluate the candidate with MLflow and compare it to the baseline.
    # NOTE(review): `baseline_model` / `validation_thresholds` appear to be
    # deprecated in recent MLflow releases in favour of
    # `mlflow.validate_evaluation_results` — confirm against the installed version.
    mlflow.evaluate(candidate_model_uri,
                    eval_data,
                    targets='label',
                    model_type='classifier',
                    validation_thresholds=thresholds,
                    baseline_model=baseline_model_uri)

    # Model explainability with SHAP, computed on the feature-only frame.
    mlflow.shap.log_explanation(model.predict, X_valid_processed)

    # No explicit mlflow.end_run(): the `with` block ends the run on exit and
    # marks it FAILED if an exception escapes.


[2m2025-05-13 08:54:17[0m [[32m[1minfo     [0m] [1mPreprocessamento iniciou...   [0m
[2m2025-05-13 08:54:17[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou...[0m
[2m2025-05-13 08:54:17[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou...[0m
[2m2025-05-13 08:54:17[0m [[32m[1minfo     [0m] [1mIniciou a validação cruzada...[0m
