# Imports

In [10]:
import os, sys, warnings

# Suppress all Python warnings for this kernel session.
warnings.filterwarnings('ignore')

# Repository root = two directories above the working directory. The quoted
# '__file__' is a plain string (a notebook defines no __file__), so its
# dirname is '' and the path is resolved relative to the current directory.
_here = os.path.dirname('__file__')
ROOT_DIR = os.path.abspath(os.path.join(_here, os.pardir, os.pardir))

# Put the root on the import path (once) so the `level_2` package resolves.
if not any(entry == ROOT_DIR for entry in sys.path):
    sys.path.append(ROOT_DIR)
    
import mlflow
import joblib

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from feature_engine.imputation import MeanMedianImputer
from feature_engine.discretisation import EqualFrequencyDiscretiser
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from mlflow.tracking import MlflowClient
from mlflow.models import infer_signature
from mlflow.models import MetricThreshold
from hyperopt import fmin, hp, STATUS_OK, tpe

from level_2.src.utils import utils
from level_2.src.data.data_load import DataLoad
from level_2.src.data.data_validation import DataValidation
from level_2.src.data.data_transform import DataTransform
from level_2.src.data.data_preprocess import DataPreprocess
from level_2.src.train.model_training import ModelTraining
from level_2.src.evaluation.classifier_eval import ClassifierEvaluation

# Data Load

In [11]:
# Path of the raw training CSV, resolved under the repository root.
train_data_file = os.path.join(ROOT_DIR, 'level_2', 'data', 'raw', 'train.csv')

# Read the file through the project's DataLoad helper; `index_col=0` is
# forwarded to run() (presumably the first CSV column becomes the index —
# TODO confirm against DataLoad).
data_load = DataLoad()
df = data_load.run(train_data_file, index_col=0)

[2m2023-12-11 10:35:09[0m [[32m[1minfo     [0m] [1mReading data from CSV file...[0m
[2m2023-12-11 10:35:10[0m [[32m[1minfo     [0m] [1mData read successfully.[0m


# Data Validation

In [12]:
# Run the project's validation step on the freshly loaded frame.
data_validation = DataValidation()

# NOTE(review): `is_valid` is never checked in the cells visible here — confirm
# whether run() raises on failure or whether the flag must be asserted.
is_valid = data_validation.run(df)

[2m2023-12-11 10:35:10[0m [[32m[1minfo     [0m] [1mValidation started[0m
[2m2023-12-11 10:35:10[0m [[32m[1minfo     [0m] [1mValidation passed[0m
[2m2023-12-11 10:35:10[0m [[32m[1minfo     [0m] [1mValidation successeful[0m


# Data Transformation

In [13]:
# Wrap the loaded frame in the project's DataTransform helper.
data_transform = DataTransform(df)

# Split into train/test features and targets.
# NOTE(review): split ratio, stratification and seed are decided inside
# DataTransform and are not visible from this notebook.
x_train, x_test, y_train, y_test = data_transform.train_test_split()

# Experiments

## Baseline

In [14]:
# Point the MLflow client at the tracking server and select the experiment
# under which every run below is recorded.
# NOTE(review): the URI is hard-coded; this assumes a tracking server is
# reachable at localhost:5000 — confirm for other environments.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("prob_loan")

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1701960364665, experiment_id='1', last_update_time=1701960364665, lifecycle_stage='active', name='prob_loan', tags={}>

In [15]:
# Baseline experiment: imputation + scaling followed by a default
# LogisticRegression, tracked as a single MLflow run. The `with` block closes
# the run on exit, so no explicit mlflow.end_run() is needed.
with mlflow.start_run(run_name='baseline'):
    mlflow.set_tag('model_name', 'baseline')

    # Load the project configuration once and reuse it for every lookup.
    config = utils.load_config()
    preprocessor_path = os.path.join(ROOT_DIR, 'level_2', 'models', 'preprocessor.joblib')

    # 1. preprocessing
    pipe = Pipeline(
        [
            ('imputer', MeanMedianImputer(variables=config.get('imputer_variables'))),
            ('scaler', SklearnTransformerWrapper(StandardScaler()))
        ]
    )

    # Fit the preprocessing pipeline on the training split only, then apply
    # it to both splits.
    preprocessor = DataPreprocess(pipe)
    preprocessor.train(x_train)

    x_train_processed = preprocessor.transform(x_train)
    x_test_processed = preprocessor.transform(x_test)

    # 1.1. artifact logging - preprocessor (persisted locally, then uploaded)
    joblib.dump(preprocessor, preprocessor_path)
    mlflow.log_artifact(preprocessor_path)

    # 1.2. artifact params - config (the pipeline steps are logged as params)
    log_params = {
        'imputer': pipe['imputer'],
        'scaler': pipe['scaler']
    }
    mlflow.log_params(params=log_params)

    # 2. begin with cross validation
    model = LogisticRegression(random_state=config.get('random_state'))
    model_eval = ClassifierEvaluation(model, x_train_processed, y_train, k_fold=5)
    roc_auc_scores = model_eval.cross_val_eval()

    # 2.1 log metrics (mean of the per-fold scores)
    mlflow.log_metric('roc_auc_scores', roc_auc_scores.mean())

    # 3. train model on the full training split
    model.fit(x_train_processed, y_train)

    # 4. evaluate model under test data
    # NOTE(review): this predicts through `model_eval.model`; it presumably is
    # the same object as `model` fitted in step 3 — confirm in ClassifierEvaluation.
    y_pred = model_eval.model.predict_proba(x_test_processed)[:, 1]
    val_roc_auc_score = model_eval.evaluate_predictions(y_test, y_pred)

    # 4.1 log metrics
    mlflow.log_metric('val_roc_auc_score', val_roc_auc_score)

    # 5. log model (served through predict_proba when loaded as pyfunc)
    mlflow.sklearn.log_model(model, 'lr_model', pyfunc_predict_fn='predict_proba')

[2m2023-12-11 10:35:10[0m [[32m[1minfo     [0m] [1mPreprocessing started[0m
[2m2023-12-11 10:35:10[0m [[32m[1minfo     [0m] [1mTransforming data[0m


[2m2023-12-11 10:35:10[0m [[32m[1minfo     [0m] [1mPreprocessing finished[0m
[2m2023-12-11 10:35:10[0m [[32m[1minfo     [0m] [1mTransforming data[0m
[2m2023-12-11 10:35:10[0m [[32m[1minfo     [0m] [1mPreprocessing finished[0m
[2m2023-12-11 10:35:10[0m [[32m[1minfo     [0m] [1mCross validation evaluation for model LogisticRegression started.[0m
[2m2023-12-11 10:35:11[0m [[32m[1minfo     [0m] [1mCross validation evaluation for model LogisticRegression finished.[0m
[2m2023-12-11 10:35:11[0m [[32m[1minfo     [0m] [1mEvaluation of predictions started.[0m
[2m2023-12-11 10:35:11[0m [[32m[1minfo     [0m] [1mROC AUC score: 0.7076126693231048[0m


## With Discretiser (Hyperopt Search)

In [16]:
def objective(params):
    """Hyperopt objective: train, evaluate and log one LogisticRegression trial.

    Each call opens its own MLflow run named ``'hyperopt'``. It fits the
    imputer -> discretiser -> scaler pipeline on the training split,
    cross-validates a ``LogisticRegression`` built from ``params``, refits it
    on the whole training split, scores it on the test split, and validates it
    against a uniform ``DummyClassifier`` with ``mlflow.evaluate``.

    Args:
        params: Mapping of ``LogisticRegression`` keyword arguments sampled by
            Hyperopt. The configured ``random_state`` is applied unless
            ``params`` already provides one.

    Returns:
        dict: ``{'loss': <negated mean cross-validation score>, 'status': STATUS_OK}``.
        The score is negated because Hyperopt minimises the loss.
    """
    # Load the project configuration once and reuse it for every lookup.
    config = utils.load_config()
    seed = config.get('random_state')
    preprocessor_path = os.path.join(ROOT_DIR, 'level_2', 'models', 'preprocessor.joblib')

    # The context manager closes the run on exit (also when an exception is
    # raised), so no explicit mlflow.end_run() is required.
    with mlflow.start_run(run_name='hyperopt'):
        mlflow.set_tag('model_name', 'lr_hyperopt')
        mlflow.log_params(params)

        # 1. preprocessing
        pipe = Pipeline(
            [
                ('imputer', MeanMedianImputer(variables=config.get('imputer_variables'))),
                ('discretiser', EqualFrequencyDiscretiser(variables=config.get('discretiser_variables'))),
                ('scaler', SklearnTransformerWrapper(StandardScaler()))
            ]
        )

        preprocessor = DataPreprocess(pipe)
        preprocessor.train(x_train)

        x_train_processed = preprocessor.transform(x_train)
        x_test_processed = preprocessor.transform(x_test)

        # 1.1. artifact logging - preprocessor
        # NOTE(review): every trial overwrites the same local file; only the
        # copy uploaded to the run is trial-specific.
        joblib.dump(preprocessor, preprocessor_path)
        mlflow.log_artifact(preprocessor_path)

        # 1.2. artifact params - config (the pipeline steps are logged as params)
        log_params = {
            'imputer': pipe['imputer'],
            'discretiser': pipe['discretiser'],
            'scaler': pipe['scaler']
        }
        mlflow.log_params(params=log_params)

        # 2. begin with cross validation
        # Seed the estimator like the baseline run does so trials using
        # stochastic solvers are repeatable; an explicit value in `params` wins.
        model = LogisticRegression(**{'random_state': seed, **params})
        model_eval = ClassifierEvaluation(model, x_train_processed, y_train, k_fold=5)
        roc_auc_scores = model_eval.cross_val_eval()
        mean_cv_score = roc_auc_scores.mean()

        # 2.1 log metrics
        mlflow.log_metric('roc_auc_scores', mean_cv_score)

        # 3. train model on the full training split
        model.fit(x_train_processed, y_train)

        # 4. evaluate model under test data
        # NOTE(review): this predicts through `model_eval.model`; it presumably
        # is the same object as `model` fitted in step 3 — confirm.
        y_pred = model_eval.model.predict_proba(x_test_processed)[:, 1]
        val_roc_auc_score = model_eval.evaluate_predictions(y_test, y_pred)

        # 4.1 log metrics
        mlflow.log_metric('val_roc_auc_score', val_roc_auc_score)

        # 5. infer the signature from the feature frame before it is extended
        # with the label, and attach it to BOTH logged models.
        signature = infer_signature(x_test_processed, y_test)
        candidate_model_uri = mlflow.sklearn.log_model(
            model, 'lr_hyperopt', signature=signature
        ).model_uri

        # 6. evaluation frame: work on a copy so the processed features are
        # not mutated when the target column is added.
        eval_data = x_test_processed.copy()
        eval_data['label'] = y_test

        threshold = {
            'accuracy_score': MetricThreshold(
                threshold=0.1,
                min_absolute_change=0.05,
                min_relative_change=0.05,
                greater_is_better=True
            )
        }

        # Reference model the candidate has to beat: uniform random guesses.
        baseline_model = DummyClassifier(
            strategy='uniform',
            random_state=seed
        ).fit(x_train_processed, y_train)
        baseline_model_uri = mlflow.sklearn.log_model(
            baseline_model, 'baseline_model', signature=signature
        ).model_uri

        # 7. validate the candidate against the thresholds and the baseline
        mlflow.evaluate(
            candidate_model_uri,
            eval_data,
            targets='label',
            model_type='classifier',
            validation_thresholds=threshold,
            baseline_model=baseline_model_uri
        )

    return {'loss': -mean_cv_score, 'status': STATUS_OK}

In [17]:
# Hyperopt search space for the logistic regression. `objective` forwards the
# sampled dict unchanged to LogisticRegression(**params), so every key must be
# a valid constructor argument.
# NOTE(review): for hp.choice dimensions, fmin presumably reports the index of
# the selected option rather than its value — confirm before reading
# `best_result` directly.
search_space = {
    'warm_start': hp.choice('warm_start', [True, False]),
    'fit_intercept': hp.choice('fit_intercept', [True, False]),
    'tol': hp.uniform('tol', 0.00001, 0.0001),  # uniform between 1e-5 and 1e-4
    'C': hp.uniform('C', 0.05, 3),  # uniform between 0.05 and 3
    'solver': hp.choice('solver', ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']),
    'max_iter': hp.choice('max_iter', range(100, 1000)),  # any integer in 100..999
    'multi_class': 'auto',  # constant: not searched, passed through as-is
    'class_weight': hp.choice('class_weight', [None, 'balanced'])
}

In [18]:
# Minimise `objective` over `search_space` with the TPE algorithm for five
# trials; each trial records its own MLflow run from inside `objective`.
# NOTE(review): no `rstate` is passed, so the sampled trials presumably differ
# between executions — confirm whether reproducibility is required.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=5
)

[2m2023-12-11 10:35:14[0m [[32m[1minfo     [0m] [1mPreprocessing started[0m
[2m2023-12-11 10:35:14[0m [[32m[1minfo     [0m] [1mTransforming data[0m
[2m2023-12-11 10:35:14[0m [[32m[1minfo     [0m] [1mPreprocessing finished[0m
[2m2023-12-11 10:35:14[0m [[32m[1minfo     [0m] [1mTransforming data[0m
[2m2023-12-11 10:35:14[0m [[32m[1minfo     [0m] [1mPreprocessing finished[0m
[2m2023-12-11 10:35:14[0m [[32m[1minfo     [0m] [1mCross validation evaluation for model LogisticRegression started.[0m
[2m2023-12-11 10:35:17[0m [[32m[1minfo     [0m] [1mCross validation evaluation for model LogisticRegression finished.[0m
[2m2023-12-11 10:35:18[0m [[32m[1minfo     [0m] [1mEvaluation of predictions started.[0m
[2m2023-12-11 10:35:18[0m [[32m[1minfo     [0m] [1mROC AUC score: 0.7987182283105457[0m
  0%|          | 0/5 [00:03<?, ?trial/s, best loss=?]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]
Downloading artifacts:  20%|##        | 1/5 [00:00<00:00, 25.27it/s]
Downloading artifacts:  40%|####      | 2/5 [00:00<00:00, 41.89it/s]
Downloading artifacts:  60%|######    | 3/5 [00:00<00:00, 57.84it/s]
Downloading artifacts:  80%|########  | 4/5 [00:00<00:00, 71.40it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 84.90it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 82.33it/s]
Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]
Downloading artifacts:  20%|##        | 1/5 [00:00<00:00, 30.83it/s]
Downloading artifacts:  40%|####      | 2/5 [00:00<00:00, 55.31it/s]
Downloading artifacts:  60%|######    | 3/5 [00:00<00:00, 69.99it/s]
Downloading artifacts:  80%|########  | 4/5 [00:00<00:00, 88.77it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 104.28it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 101.32it/s]
2023/12/11 10:35:23 INFO mlflow.models.evaluatio

[2m2023-12-11 10:35:28[0m [[32m[1minfo     [0m] [1mPreprocessing started[0m
[2m2023-12-11 10:35:28[0m [[32m[1minfo     [0m] [1mTransforming data[0m  
[2m2023-12-11 10:35:28[0m [[32m[1minfo     [0m] [1mPreprocessing finished[0m
[2m2023-12-11 10:35:28[0m [[32m[1minfo     [0m] [1mTransforming data[0m  
[2m2023-12-11 10:35:28[0m [[32m[1minfo     [0m] [1mPreprocessing finished[0m
[2m2023-12-11 10:35:28[0m [[32m[1minfo     [0m] [1mCross validation evaluation for model LogisticRegression started.[0m
[2m2023-12-11 10:35:31[0m [[32m[1minfo     [0m] [1mCross validation evaluation for model LogisticRegression finished.[0m
[2m2023-12-11 10:35:31[0m [[32m[1minfo     [0m] [1mEvaluation of predictions started.[0m
[2m2023-12-11 10:35:31[0m [[32m[1minfo     [0m] [1mROC AUC score: 0.7986326519894509[0m
 20%|██        | 1/5 [00:17<00:54, 13.70s/trial, best loss: -0.7922994429115655]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]
Downloading artifacts:  20%|##        | 1/5 [00:00<00:00, 46.88it/s]
Downloading artifacts:  40%|####      | 2/5 [00:00<00:00, 57.14it/s]
Downloading artifacts:  60%|######    | 3/5 [00:00<00:00, 76.27it/s]
Downloading artifacts:  80%|########  | 4/5 [00:00<00:00, 94.65it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 110.99it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 104.23it/s]
Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]
Downloading artifacts:  20%|##        | 1/5 [00:00<00:00, 37.56it/s]
Downloading artifacts:  40%|####      | 2/5 [00:00<00:00, 58.05it/s]
Downloading artifacts:  60%|######    | 3/5 [00:00<00:00, 81.36it/s]
Downloading artifacts:  80%|########  | 4/5 [00:00<00:00, 97.79it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 113.18it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 108.01it/s]
2023/12/11 10:35:36 INFO mlflow.models.evaluat

[2m2023-12-11 10:35:41[0m [[32m[1minfo     [0m] [1mPreprocessing started[0m
[2m2023-12-11 10:35:41[0m [[32m[1minfo     [0m] [1mTransforming data[0m  
[2m2023-12-11 10:35:41[0m [[32m[1minfo     [0m] [1mPreprocessing finished[0m
[2m2023-12-11 10:35:41[0m [[32m[1minfo     [0m] [1mTransforming data[0m  
[2m2023-12-11 10:35:42[0m [[32m[1minfo     [0m] [1mPreprocessing finished[0m
[2m2023-12-11 10:35:42[0m [[32m[1minfo     [0m] [1mCross validation evaluation for model LogisticRegression started.[0m
[2m2023-12-11 10:35:42[0m [[32m[1minfo     [0m] [1mCross validation evaluation for model LogisticRegression finished.[0m
[2m2023-12-11 10:35:43[0m [[32m[1minfo     [0m] [1mEvaluation of predictions started.[0m
[2m2023-12-11 10:35:43[0m [[32m[1minfo     [0m] [1mROC AUC score: 0.7985288678012609[0m
 40%|████      | 2/5 [00:28<00:40, 13.50s/trial, best loss: -0.7922994429115655]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]
Downloading artifacts:  20%|##        | 1/5 [00:00<00:00, 33.42it/s]
Downloading artifacts:  40%|####      | 2/5 [00:00<00:00, 53.99it/s]
Downloading artifacts:  60%|######    | 3/5 [00:00<00:00, 75.95it/s]
Downloading artifacts:  80%|########  | 4/5 [00:00<00:00, 93.90it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 111.63it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 107.73it/s]
Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]
Downloading artifacts:  20%|##        | 1/5 [00:00<00:00, 194.26it/s]
Downloading artifacts:  40%|####      | 2/5 [00:00<00:00, 137.08it/s]
Downloading artifacts:  60%|######    | 3/5 [00:00<00:00, 164.93it/s]
Downloading artifacts:  80%|########  | 4/5 [00:00<00:00, 175.20it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 197.45it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 186.15it/s]
2023/12/11 10:35:48 INFO mlflow.models.eva

[2m2023-12-11 10:35:52[0m [[32m[1minfo     [0m] [1mPreprocessing started[0m
[2m2023-12-11 10:35:52[0m [[32m[1minfo     [0m] [1mTransforming data[0m  
[2m2023-12-11 10:35:53[0m [[32m[1minfo     [0m] [1mPreprocessing finished[0m
[2m2023-12-11 10:35:53[0m [[32m[1minfo     [0m] [1mTransforming data[0m  
[2m2023-12-11 10:35:53[0m [[32m[1minfo     [0m] [1mPreprocessing finished[0m
[2m2023-12-11 10:35:53[0m [[32m[1minfo     [0m] [1mCross validation evaluation for model LogisticRegression started.[0m
[2m2023-12-11 10:35:54[0m [[32m[1minfo     [0m] [1mCross validation evaluation for model LogisticRegression finished.[0m
[2m2023-12-11 10:35:54[0m [[32m[1minfo     [0m] [1mEvaluation of predictions started.[0m
[2m2023-12-11 10:35:54[0m [[32m[1minfo     [0m] [1mROC AUC score: 0.838976892162301[0m
 60%|██████    | 3/5 [00:39<00:24, 12.40s/trial, best loss: -0.7922994429115655]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]
Downloading artifacts:  20%|##        | 1/5 [00:00<00:00, 37.92it/s]
Downloading artifacts:  40%|####      | 2/5 [00:00<00:00, 59.07it/s]
Downloading artifacts:  60%|######    | 3/5 [00:00<00:00, 78.27it/s]
Downloading artifacts:  80%|########  | 4/5 [00:00<00:00, 98.24it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 116.22it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 110.19it/s]
Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]
Downloading artifacts:  20%|##        | 1/5 [00:00<00:00, 40.07it/s]
Downloading artifacts:  40%|####      | 2/5 [00:00<00:00, 70.67it/s]
Downloading artifacts:  60%|######    | 3/5 [00:00<00:00, 95.79it/s]
Downloading artifacts:  80%|########  | 4/5 [00:00<00:00, 118.11it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 138.01it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 132.68it/s]
2023/12/11 10:35:58 INFO mlflow.models.evalua

[2m2023-12-11 10:36:03[0m [[32m[1minfo     [0m] [1mPreprocessing started[0m
[2m2023-12-11 10:36:03[0m [[32m[1minfo     [0m] [1mTransforming data[0m  
[2m2023-12-11 10:36:03[0m [[32m[1minfo     [0m] [1mPreprocessing finished[0m
[2m2023-12-11 10:36:03[0m [[32m[1minfo     [0m] [1mTransforming data[0m  
[2m2023-12-11 10:36:03[0m [[32m[1minfo     [0m] [1mPreprocessing finished[0m
[2m2023-12-11 10:36:03[0m [[32m[1minfo     [0m] [1mCross validation evaluation for model LogisticRegression started.[0m
[2m2023-12-11 10:36:13[0m [[32m[1minfo     [0m] [1mCross validation evaluation for model LogisticRegression finished.[0m
[2m2023-12-11 10:36:16[0m [[32m[1minfo     [0m] [1mEvaluation of predictions started.[0m
[2m2023-12-11 10:36:16[0m [[32m[1minfo     [0m] [1mROC AUC score: 0.8513431481653442[0m
 80%|████████  | 4/5 [01:01<00:11, 11.76s/trial, best loss: -0.8349269358560152]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]
Downloading artifacts:  20%|##        | 1/5 [00:00<00:00, 65.81it/s]
Downloading artifacts:  40%|####      | 2/5 [00:00<00:00, 79.60it/s]
Downloading artifacts:  60%|######    | 3/5 [00:00<00:00, 61.74it/s]
Downloading artifacts:  80%|########  | 4/5 [00:00<00:00, 78.21it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 93.75it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 91.37it/s]
Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]
Downloading artifacts:  20%|##        | 1/5 [00:00<00:00, 30.66it/s]
Downloading artifacts:  40%|####      | 2/5 [00:00<00:00, 50.15it/s]
Downloading artifacts:  60%|######    | 3/5 [00:00<00:00, 66.47it/s]
Downloading artifacts:  80%|########  | 4/5 [00:00<00:00, 78.85it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 89.67it/s]
Downloading artifacts: 100%|##########| 5/5 [00:00<00:00, 82.44it/s]
2023/12/11 10:36:21 INFO mlflow.models.evaluation.

100%|██████████| 5/5 [01:11<00:00, 14.23s/trial, best loss: -0.8473732319757177]


# Select Best Model

In [21]:
# Resolve the experiment first: get_experiment_by_name returns None for an
# unknown name, which would otherwise surface as an opaque TypeError in dict().
experiment = mlflow.get_experiment_by_name('prob_loan')
if experiment is None:
    raise RuntimeError("MLflow experiment 'prob_loan' was not found on the tracking server.")

current_experiment = dict(experiment)
experiment_id = current_experiment['experiment_id']

# Only runs that logged a test-set ROC AUC below 1 are considered.
# `experiment_ids` is documented as a list of ids, so the id is wrapped.
runs = mlflow.search_runs(
    experiment_ids=[experiment_id],
    filter_string='metrics.val_roc_auc_score < 1'
)
if runs.empty:
    raise RuntimeError("No run with a logged 'val_roc_auc_score' was found in experiment 'prob_loan'.")

# Best run first.
# NOTE(review): the ranking uses the test-split metric, so the reported score
# of the selected run is optimistically biased — confirm this is intended.
df_mlflow = runs.sort_values(by='metrics.val_roc_auc_score', ascending=False)

best_run_id = df_mlflow.iloc[0]['run_id']

In [24]:
# Parameter columns shown for the winning run. They are declared in this cell
# so it also works on a fresh kernel executed top to bottom.
col_params = [
    'params.imputer',
    'params.discretiser',
    'params.scaler',
    'params.class_weight',
    'params.warm_start',
    'params.solver',
    'params.max_iter',
    'params.fit_intercept',
    'params.tol',
    'params.C',
    'params.multi_class'
]

# Display the logged parameters of the best run.
df_mlflow.loc[df_mlflow['run_id'] == best_run_id, col_params]

Unnamed: 0,params.imputer,params.discretiser,params.scaler,params.class_weight,params.warm_start,params.solver,params.max_iter,params.fit_intercept,params.tol,params.C,params.multi_class
0,"MeanMedianImputer(variables=['RendaMensal', 'N...",EqualFrequencyDiscretiser(variables=['TaxaDeUt...,SklearnTransformerWrapper(transformer=Standard...,,False,saga,852,False,4.157161747165617e-05,0.6849956166046255,auto


In [30]:
# Query the 'prob_loan' experiment explicitly instead of relying on whichever
# experiment happens to be active in the kernel.
runs = mlflow.search_runs(
    experiment_names=['prob_loan'],
    filter_string='metrics.val_roc_auc_score < 1'
)
if runs.empty:
    raise RuntimeError("No run with a logged 'val_roc_auc_score' was found in experiment 'prob_loan'.")

# Best run first, ranked by the ROC AUC measured on the test split.
df_mlflow = runs.sort_values(by='metrics.val_roc_auc_score', ascending=False)

best_run_id = df_mlflow.iloc[0]['run_id']

# Parameter columns of interest for the selected run.
col_params = [
    'params.imputer',
    'params.discretiser',
    'params.scaler',
    'params.class_weight',
    'params.warm_start',
    'params.solver',
    'params.max_iter',
    'params.fit_intercept',
    'params.tol',
    'params.C',
    'params.multi_class'
]

# NOTE(review): every column is kept here although the name suggests only the
# `col_params` subset was intended — confirm before narrowing the selection.
df_best_params = df_mlflow.loc[df_mlflow['run_id'] == best_run_id, :]

best_roc_auc_score = df_mlflow.iloc[0]['metrics.val_roc_auc_score']

# Last expression of the cell: renders the score as the cell output.
best_roc_auc_score

0.8513431481653442
