# 0.0. Imports

In [1]:
import sys
import joblib

sys.path.insert(0, '../src/')

from sklearn.pipeline import Pipeline
from feature_engine.discretisation import EqualFrequencyDiscretiser
from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from data.data_load import DataLoad
from data.data_validation import DataValidation
from data.data_transformation import DataTransformation
from utils.utils import load_config_file
from data.data_preprocess import DataPreprocess
from train.train import TrainModels
from evaluation.classifier_eval import ModelEvaluation

# 1.0. Data Load

In [2]:
# Load the training data through the project's DataLoad helper.
# 'train_dataset_name' is passed as the dataset name — presumably a key that
# DataLoad resolves to an actual file/table; verify against src/data/data_load.
dl = DataLoad()
df = dl.load_data('train_dataset_name')

[2m2024-02-05 15:13:41[0m [[32m[1minfo     [0m] [1mComeçando a carga dos dados com o nome train_dataset_name[0m


# 2.0. Data Validation

In [3]:
# Run the project's validation checks on the loaded frame and keep the result.
# NOTE(review): `is_valid` is not inspected by any later cell visible here —
# consider asserting on it before training (confirm what `run` returns first).
dv  = DataValidation()
is_valid = dv.run(df)

[2m2024-02-05 15:13:41[0m [[32m[1minfo     [0m] [1mValidacao iniciou             [0m
[2m2024-02-05 15:13:41[0m [[32m[1minfo     [0m] [1mValidation columns passed...  [0m
[2m2024-02-05 15:13:41[0m [[32m[1minfo     [0m] [1mValidação com sucesso.        [0m


# 3.0. Data Transformation

In [4]:
# Split the validated frame into train/validation features and targets.
# These four names are read as globals by `objective` further down.
# NOTE(review): split ratio, target column and seed live inside
# DataTransformation and are not visible from this notebook.
dt = DataTransformation(df)
X_train, X_valid, y_train, y_valid = dt.train_test_spliting()

# 4.0. Experiments

In [5]:
import mlflow
from mlflow.tracking import MlflowClient

In [6]:
# Send all runs to the tracking server at http://127.0.0.1:5000 and group them
# under the "prob_loan" experiment. The server must already be running at that
# address for the cells below to work.
mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment("prob_loan")

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1707081553655, experiment_id='1', last_update_time=1707081553655, lifecycle_stage='active', name='prob_loan', tags={}>

# 4.1. Hyperparameters

In [7]:
from mlflow.models import MetricThreshold
from sklearn.dummy import DummyClassifier
from mlflow.models import infer_signature

In [8]:
from hyperopt import fmin, tpe, hp, STATUS_OK

In [9]:
# Load the project configuration once and reuse it for every step, instead of
# re-reading it for each transformer.
config = load_config_file()

# Preprocessing pipeline shared by every hyperopt trial:
#   1. impute the columns listed under 'vars_imputer',
#   2. bin the columns listed under 'vars_discretizer' into equal-frequency intervals,
#   3. standard-scale the result via the feature_engine sklearn wrapper.
pipe = Pipeline([
    ('imputer', MeanMedianImputer(variables=config.get('vars_imputer'))),
    ('discretizer', EqualFrequencyDiscretiser(variables=config.get('vars_discretizer'))),
    ('scaler', SklearnTransformerWrapper(StandardScaler())),
])

In [10]:
def objective(params):
    """Hyperopt objective: train, log and evaluate one LogisticRegression candidate.

    Inside a single MLflow run this function:
      1. fits the shared preprocessing pipeline (`pipe`) on `X_train`, transforms
         both splits and logs the fitted preprocessor as an artifact;
      2. scores the candidate with 5-split cross-validation on the training split;
      3. refits the candidate on the whole training split;
      4. scores it on the validation split;
      5. logs the candidate and a DummyClassifier baseline, then runs
         `mlflow.evaluate` with an accuracy validation threshold.

    Relies on notebook globals: `pipe`, `X_train`, `X_valid`, `y_train`, `y_valid`.

    Args:
        params: dict of LogisticRegression keyword arguments sampled by hyperopt.

    Returns:
        dict with ``loss`` (negated mean of the cross-validation scores, because
        hyperopt minimises) and ``status`` (``STATUS_OK``).
    """
    # The context manager closes the run on exit, so no explicit end_run() is needed.
    with mlflow.start_run(run_name='with_discretizer_hyperopt'):
        mlflow.set_tag('model_name', 'lr_hyperopt')
        mlflow.log_params(params)

        # 1.0. Fit the preprocessor on the training split, then transform both splits.
        preprocessor = DataPreprocess(pipe)
        preprocessor.train(X_train)

        X_train_processed = preprocessor.transform(X_train)
        X_valid_processed = preprocessor.transform(X_valid)
        joblib.dump(preprocessor, '../models/preprocess.joblib')

        # 1.1. Log the fitted preprocessor as a run artifact.
        mlflow.log_artifact('../models/preprocess.joblib')

        # 1.2. Log the preprocessing steps as run parameters.
        mlflow.log_params(params={'imputer': pipe['imputer'],
                                  'discretizer': pipe['discretizer'],
                                  'scaler': pipe['scaler']})

        # 2.0. Cross-validate the candidate on the training split.
        model = LogisticRegression(**params)
        model_eval = ModelEvaluation(model,
                                     X_train_processed,
                                     y_train,
                                     n_splits=5)
        roc_auc_scores = model_eval.cross_val_evaluate()
        mean_cv_roc_auc = roc_auc_scores.mean()

        # 2.1. Log the cross-validation performance.
        mlflow.log_metric('train_roc_auc', mean_cv_roc_auc)

        # 3.0. Train the model on the full training split.
        model.fit(X_train_processed, y_train)

        # 4.0. Score the validation split and log the metric.
        y_val_preds = model_eval.model.predict_proba(X_valid_processed)[:, 1]
        val_roc_auc = model_eval.evaluate_predictions(y_valid, y_val_preds)
        mlflow.log_metric('valid_roc_auc', val_roc_auc)

        # 5.0. Log the candidate model. The signature is inferred first so that
        # the candidate and the baseline are logged with the same schema.
        signature = infer_signature(X_valid_processed, y_valid)
        candidate_model_uri = mlflow.sklearn.log_model(model,
                                                       'lr_model',
                                                       signature=signature).model_uri

        # Build the evaluation frame as a new object: `assign` returns a copy,
        # so the feature frame is not mutated with the label column.
        eval_data = X_valid_processed.assign(label=y_valid)

        thresholds = {
            "accuracy_score": MetricThreshold(
                # Accuracy must exceed 0.1.
                # NOTE(review): the previous comment said "> 0.7", which did not
                # match this value — confirm the intended threshold.
                threshold=0.1,
                min_absolute_change=0.05,
                min_relative_change=0.05,
                greater_is_better=True
            )
        }

        # Seeded so the random baseline (and therefore the validation outcome)
        # is reproducible between runs.
        baseline_model = DummyClassifier(strategy='uniform',
                                         random_state=42).fit(X_train_processed, y_train)
        baseline_model_uri = mlflow.sklearn.log_model(baseline_model,
                                                      'baseline_model',
                                                      signature=signature).model_uri

        # Let MLflow evaluate the candidate against the baseline and thresholds.
        mlflow.evaluate(candidate_model_uri,
                        eval_data,
                        targets='label',
                        model_type='classifier',
                        validation_thresholds=thresholds,
                        baseline_model=baseline_model_uri)

        return {'loss': -mean_cv_roc_auc,
                'status': STATUS_OK}



In [11]:
# Hyperopt search space; every key is a LogisticRegression keyword argument and
# each hp.* node is labelled with the same name as its key.
#   - tol is drawn uniformly from [1e-5, 1e-4], C uniformly from [0.05, 3]
#   - max_iter is picked among the integers 100..999
#   - multi_class is fixed (not searched)
search_space = dict(
    warm_start=hp.choice('warm_start', [True, False]),
    fit_intercept=hp.choice('fit_intercept', [True, False]),
    tol=hp.uniform('tol', 1e-5, 1e-4),
    C=hp.uniform('C', 0.05, 3),
    solver=hp.choice('solver', ['newton-cg', 'lbfgs', 'liblinear']),
    max_iter=hp.choice('max_iter', range(100, 1000)),
    multi_class='auto',
    class_weight=hp.choice('class_weight', [None, 'balanced']),
)

In [12]:
# Minimise `objective` over `search_space` with the TPE algorithm, 5 trials.
# NOTE(review): no `rstate` is passed, so the sampled trials are presumably not
# reproducible between runs — confirm and seed if reproducibility matters.
best_result = fmin(objective, search_space, algo=tpe.suggest, max_evals=5)

[2m2024-02-05 15:13:49[0m [[32m[1minfo     [0m] [1mPré-processamento iniciou.    [0m
[2m2024-02-05 15:13:49[0m [[32m[1minfo     [0m] [1mpré-processamento terminou    [0m
[2m2024-02-05 15:13:49[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou.[0m
[2m2024-02-05 15:13:49[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador terminou.[0m
[2m2024-02-05 15:13:49[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou.[0m
[2m2024-02-05 15:13:49[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador terminou.[0m
[2m2024-02-05 15:13:50[0m [[32m[1minfo     [0m] [1mIniciou a validação cruzada.  [0m
[2m2024-02-05 15:13:51[0m [[32m[1minfo     [0m] [1mIniciou a validação do modelo.[0m
  0%|          | 0/5 [00:04<?, ?trial/s, best loss=?]




  from .autonotebook import tqdm as notebook_tqdm

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]
Downloading artifacts:  20%|##        | 1/5 [00:00<00:00, 19.80it/s]
Downloading artifacts:  40%|####      | 2/5 [00:00<00:00, 36.31it/s]
Downloading artifacts:  60%|######    | 3/5 [00:00<00:00, 48.30it/s]
Downloading artifacts:  80%|########  | 4/5 [00:00<00:00, 59.25it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 72.09it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 68.67it/s]
Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]
Downloading artifacts:  20%|##        | 1/5 [00:00<00:00, 58.26it/s]
Downloading artifacts:  40%|####      | 2/5 [00:00<00:00, 96.69it/s]
Downloading artifacts:  60%|######    | 3/5 [00:00<00:00, 123.60it/s]
Downloading artifacts:  80%|########  | 4/5 [00:00<00:00, 113.69it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 133.11it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 126.

[2m2024-02-05 15:14:20[0m [[32m[1minfo     [0m] [1mPré-processamento iniciou.    [0m
[2m2024-02-05 15:14:21[0m [[32m[1minfo     [0m] [1mpré-processamento terminou    [0m
[2m2024-02-05 15:14:21[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou.[0m
[2m2024-02-05 15:14:21[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador terminou.[0m
[2m2024-02-05 15:14:21[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou.[0m
[2m2024-02-05 15:14:21[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador terminou.[0m
[2m2024-02-05 15:14:21[0m [[32m[1minfo     [0m] [1mIniciou a validação cruzada.  [0m
[2m2024-02-05 15:14:23[0m [[32m[1minfo     [0m] [1mIniciou a validação do modelo.[0m
 20%|██        | 1/5 [00:36<02:10, 32.74s/trial, best loss: -0.8062423189854571]

Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.

Setuptools is replacing distutils.

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/doc

[2m2024-02-05 15:14:35[0m [[32m[1minfo     [0m] [1mPré-processamento iniciou.    [0m
[2m2024-02-05 15:14:36[0m [[32m[1minfo     [0m] [1mpré-processamento terminou    [0m
[2m2024-02-05 15:14:36[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou.[0m
[2m2024-02-05 15:14:36[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador terminou.[0m
[2m2024-02-05 15:14:36[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou.[0m
[2m2024-02-05 15:14:36[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador terminou.[0m
[2m2024-02-05 15:14:36[0m [[32m[1minfo     [0m] [1mIniciou a validação cruzada.  [0m
[2m2024-02-05 15:14:38[0m [[32m[1minfo     [0m] [1mIniciou a validação do modelo.[0m
 40%|████      | 2/5 [00:50<01:07, 22.48s/trial, best loss: -0.8062426560063856]

Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.

Setuptools is replacing distutils.

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/doc

[2m2024-02-05 15:14:50[0m [[32m[1minfo     [0m] [1mPré-processamento iniciou.    [0m
[2m2024-02-05 15:14:50[0m [[32m[1minfo     [0m] [1mpré-processamento terminou    [0m
[2m2024-02-05 15:14:50[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou.[0m
[2m2024-02-05 15:14:50[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador terminou.[0m
[2m2024-02-05 15:14:50[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou.[0m
[2m2024-02-05 15:14:51[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador terminou.[0m
[2m2024-02-05 15:14:51[0m [[32m[1minfo     [0m] [1mIniciou a validação cruzada.  [0m
[2m2024-02-05 15:14:52[0m [[32m[1minfo     [0m] [1mIniciou a validação do modelo.[0m
 60%|██████    | 3/5 [01:04<00:37, 18.89s/trial, best loss: -0.8062426560063856]

Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.

Setuptools is replacing distutils.

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/doc

[2m2024-02-05 15:15:05[0m [[32m[1minfo     [0m] [1mPré-processamento iniciou.    [0m
[2m2024-02-05 15:15:05[0m [[32m[1minfo     [0m] [1mpré-processamento terminou    [0m
[2m2024-02-05 15:15:05[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou.[0m
[2m2024-02-05 15:15:05[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador terminou.[0m
[2m2024-02-05 15:15:05[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador iniciou.[0m
[2m2024-02-05 15:15:05[0m [[32m[1minfo     [0m] [1mTransformação dos dados com preprocessador terminou.[0m
[2m2024-02-05 15:15:05[0m [[32m[1minfo     [0m] [1mIniciou a validação cruzada.  [0m
[2m2024-02-05 15:15:07[0m [[32m[1minfo     [0m] [1mIniciou a validação do modelo.[0m
 80%|████████  | 4/5 [01:19<00:17, 17.12s/trial, best loss: -0.8062426560063856]

Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.

Setuptools is replacing distutils.

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/doc

100%|██████████| 5/5 [01:31<00:00, 18.31s/trial, best loss: -0.8066865626572994]
