# Imports

In [1]:
import os, sys, warnings

warnings.filterwarnings('ignore')

# Notebooks have no usable __file__, so the repository root is resolved
# relative to the kernel's working directory: two levels above it.
ROOT_DIR = os.path.abspath(os.path.join(os.pardir, os.pardir))

# Make the project packages importable, without adding a duplicate entry
# when this cell is re-run.
if sys.path.count(ROOT_DIR) == 0:
    sys.path.append(ROOT_DIR)
    
import mlflow
import joblib

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from feature_engine.imputation import MeanMedianImputer
from feature_engine.discretisation import EqualFrequencyDiscretiser
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from mlflow.tracking import MlflowClient
from mlflow.models import infer_signature
from mlflow.models import MetricThreshold

from level_2.src.utils import utils
from level_2.src.data.data_load import DataLoad
from level_2.src.data.data_validation import DataValidation
from level_2.src.data.data_transform import DataTransform
from level_2.src.data.data_preprocess import DataPreprocess
from level_2.src.train.model_training import ModelTraining
from level_2.src.evaluation.classifier_eval import ClassifierEvaluation

# Data Load

In [2]:
# Raw inputs live under <ROOT_DIR>/level_2/data/raw.
raw_data_dir = os.path.join(ROOT_DIR, 'level_2', 'data', 'raw')
train_data_file = os.path.join(raw_data_dir, 'train.csv')

# Read the training CSV through the project's DataLoad helper.
# index_col=0 is forwarded as-is (presumably on to pandas.read_csv -- confirm in DataLoad).
data_load = DataLoad()
df = data_load.run(train_data_file, index_col=0)

[2m2023-12-07 14:26:59[0m [[32m[1minfo     [0m] [1mReading data from CSV file...[0m
[2m2023-12-07 14:26:59[0m [[32m[1minfo     [0m] [1mData read successfully.[0m


# Data Validation

In [3]:
# Run the project's validation step over the loaded frame.
data_validation = DataValidation()

# NOTE(review): the result is stored but never checked in the cells shown here;
# whether `run` returns a bool or raises on failure is not visible -- confirm in
# DataValidation before relying on `is_valid` to guard later steps.
is_valid = data_validation.run(df)

[2m2023-12-07 14:26:59[0m [[32m[1minfo     [0m] [1mValidation started[0m
[2m2023-12-07 14:26:59[0m [[32m[1minfo     [0m] [1mValidation passed[0m
[2m2023-12-07 14:26:59[0m [[32m[1minfo     [0m] [1mValidation successeful[0m


# Data Transformation

In [4]:
# Wrap the loaded frame in the project's DataTransform helper.
data_transform = DataTransform(df)

# Split into train/test features and targets. The target column, split ratio
# and seed are decided inside DataTransform and are not visible from this
# notebook -- TODO confirm they are read from the project config.
x_train, x_test, y_train, y_test = data_transform.train_test_split()

# Experiments

## Baseline

In [5]:
# Point MLflow at the tracking server. The standard MLFLOW_TRACKING_URI
# environment variable takes precedence so the notebook can run against a
# remote server without being edited; the local server remains the default.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000"))

# Select the experiment that the runs below are recorded under. Left as the
# last expression so the resulting Experiment is shown as the cell output.
mlflow.set_experiment("prob_loan")

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1701960364665, experiment_id='1', last_update_time=1701960364665, lifecycle_stage='active', name='prob_loan', tags={}>

In [6]:
# Baseline experiment: impute + standardise, then a logistic regression.
# The `with` block owns the run lifecycle: MLflow closes the run when the
# block exits (marking it failed if an exception escapes), so no explicit
# mlflow.end_run() is needed.
with mlflow.start_run(run_name='baseline'):
    mlflow.set_tag('model_name', 'baseline')

    # Read the project configuration once instead of re-loading it per lookup.
    config = utils.load_config()
    # Single definition of where the fitted preprocessor is written and logged from.
    preprocessor_path = os.path.join(ROOT_DIR, 'level_2', 'models', 'preprocessor.joblib')

    # 1. preprocessing
    pipe = Pipeline(
        [
            ('imputer', MeanMedianImputer(variables=config.get('imputer_variables'))),
            ('scaler', SklearnTransformerWrapper(StandardScaler()))
        ]
    )

    preprocessor = DataPreprocess(pipe)
    # Presumably fits the pipeline on the training split only -- confirm in DataPreprocess.
    preprocessor.train(x_train)

    x_train_processed = preprocessor.transform(x_train)
    x_test_processed = preprocessor.transform(x_test)

    # 1.1. artifact logging - preprocessor
    # NOTE: every run writes to the same local file; the copy attached to the
    # run is the durable one.
    joblib.dump(preprocessor, preprocessor_path)
    mlflow.log_artifact(preprocessor_path)

    # 1.2. params logging - the pipeline steps (recorded by MLflow as their string form)
    log_params = {
        'imputer': pipe['imputer'],
        'scaler': pipe['scaler']
    }
    mlflow.log_params(params=log_params)

    # 2. begin with cross validation
    model = LogisticRegression(random_state=config.get('random_state'))
    model_eval = ClassifierEvaluation(model, x_train_processed, y_train, k_fold=5)
    roc_auc_scores = model_eval.cross_val_eval()

    # 2.1 log metrics (mean over the folds; metric key kept for comparability with earlier runs)
    mlflow.log_metric('roc_auc_scores', roc_auc_scores.mean())

    # 3. train model on the full training split
    model.fit(x_train_processed, y_train)

    # 4. evaluate model under test data
    # Score with the estimator fitted in step 3 -- the exact object logged in
    # step 5 -- rather than reaching through `model_eval.model`.
    y_pred = model.predict_proba(x_test_processed)[:, 1]
    val_roc_auc_score = model_eval.evaluate_predictions(y_test, y_pred)

    # 4.1 log metrics
    mlflow.log_metric('val_roc_auc_score', val_roc_auc_score)

    # 5. log model (served through predict_proba so consumers get probabilities)
    mlflow.sklearn.log_model(model, 'lr_model', pyfunc_predict_fn='predict_proba')

[2m2023-12-07 14:26:59[0m [[32m[1minfo     [0m] [1mPreprocessing started[0m
[2m2023-12-07 14:26:59[0m [[32m[1minfo     [0m] [1mTransforming data[0m
[2m2023-12-07 14:26:59[0m [[32m[1minfo     [0m] [1mPreprocessing finished[0m
[2m2023-12-07 14:26:59[0m [[32m[1minfo     [0m] [1mTransforming data[0m
[2m2023-12-07 14:26:59[0m [[32m[1minfo     [0m] [1mPreprocessing finished[0m
[2m2023-12-07 14:26:59[0m [[32m[1minfo     [0m] [1mCross validation evaluation for model LogisticRegression started.[0m
[2m2023-12-07 14:27:00[0m [[32m[1minfo     [0m] [1mCross validation evaluation for model LogisticRegression finished.[0m
[2m2023-12-07 14:27:01[0m [[32m[1minfo     [0m] [1mEvaluation of predictions started.[0m
[2m2023-12-07 14:27:01[0m [[32m[1minfo     [0m] [1mROC AUC score: 0.7076126693231048[0m


## With Discretiser

In [10]:
# Candidate experiment: same as the baseline plus an equal-frequency
# discretiser, validated against a dummy classifier with mlflow.evaluate.
# The `with` block owns the run lifecycle, so no explicit mlflow.end_run()
# is needed.
with mlflow.start_run(run_name='with_discretiser'):
    mlflow.set_tag('model_name', 'lr_discretiser')

    # Read the project configuration once instead of re-loading it per lookup.
    config = utils.load_config()
    # Single definition of where the fitted preprocessor is written and logged from.
    preprocessor_path = os.path.join(ROOT_DIR, 'level_2', 'models', 'preprocessor.joblib')

    # 1. preprocessing
    pipe = Pipeline(
        [
            ('imputer', MeanMedianImputer(variables=config.get('imputer_variables'))),
            ('discretiser', EqualFrequencyDiscretiser(variables=config.get('discretiser_variables'))),
            ('scaler', SklearnTransformerWrapper(StandardScaler()))
        ]
    )

    preprocessor = DataPreprocess(pipe)
    # Presumably fits the pipeline on the training split only -- confirm in DataPreprocess.
    preprocessor.train(x_train)

    x_train_processed = preprocessor.transform(x_train)
    x_test_processed = preprocessor.transform(x_test)

    # 1.1. artifact logging - preprocessor
    # NOTE: this overwrites the local file written by other runs; the copy
    # attached to the run is the durable one.
    joblib.dump(preprocessor, preprocessor_path)
    mlflow.log_artifact(preprocessor_path)

    # 1.2. params logging - the pipeline steps (recorded by MLflow as their string form)
    log_params = {
        'imputer': pipe['imputer'],
        'discretiser': pipe['discretiser'],
        'scaler': pipe['scaler']
    }
    mlflow.log_params(params=log_params)

    # 2. begin with cross validation
    model = LogisticRegression(random_state=config.get('random_state'))
    model_eval = ClassifierEvaluation(model, x_train_processed, y_train, k_fold=5)
    roc_auc_scores = model_eval.cross_val_eval()

    # 2.1 log metrics (mean over the folds; metric key kept for comparability with earlier runs)
    mlflow.log_metric('roc_auc_scores', roc_auc_scores.mean())

    # 3. train model on the full training split
    model.fit(x_train_processed, y_train)

    # 4. evaluate model under test data
    # Score with the estimator fitted in step 3 -- the exact object logged
    # below -- rather than reaching through `model_eval.model`.
    y_pred = model.predict_proba(x_test_processed)[:, 1]
    val_roc_auc_score = model_eval.evaluate_predictions(y_test, y_pred)

    # 4.1 log metrics
    mlflow.log_metric('val_roc_auc_score', val_roc_auc_score)

    # 5. infer signature
    # Inferred while x_test_processed holds features only; shared by the
    # candidate and the dummy baseline so both are logged with the same schema.
    signature = infer_signature(x_test_processed, y_test)

    # 6. log candidate model
    candidate_model_uri = mlflow.sklearn.log_model(model, 'lr_discretiser', signature=signature).model_uri

    # Evaluation frame = features + target column. `assign` returns a new
    # DataFrame, so x_test_processed itself is left untouched (plain assignment
    # would alias it and silently add a 'label' column to the features).
    eval_data = x_test_processed.assign(label=y_test)

    # The candidate must reach 0.7 accuracy and beat the baseline by at least
    # 0.05 in both absolute and relative terms.
    threshold = {
        'accuracy_score': MetricThreshold(
            threshold=0.7,
            min_absolute_change=0.05,
            min_relative_change=0.05,
            greater_is_better=True
        )
    }

    # Reference model for the comparison: uniformly random predictions.
    baseline_model = DummyClassifier(
        strategy='uniform',
        random_state=config.get('random_state')
    ).fit(x_train_processed, y_train)
    baseline_model_uri = mlflow.sklearn.log_model(baseline_model, 'baseline_model', signature=signature).model_uri

    # 7. validate the candidate against the thresholds and the baseline
    mlflow.evaluate(
        candidate_model_uri,
        eval_data,
        targets='label',
        model_type='classifier',
        validation_thresholds=threshold,
        baseline_model=baseline_model_uri
    )

    # 8. SHAP explanation of the candidate on the (unmodified) test features.
    # NOTE(review): this runs over the whole test split and can be slow;
    # consider passing a sample if the cell takes too long.
    mlflow.shap.log_explanation(model.predict, x_test_processed)

[2m2023-12-07 14:30:00[0m [[32m[1minfo     [0m] [1mPreprocessing started[0m
[2m2023-12-07 14:30:00[0m [[32m[1minfo     [0m] [1mTransforming data[0m
[2m2023-12-07 14:30:00[0m [[32m[1minfo     [0m] [1mPreprocessing finished[0m
[2m2023-12-07 14:30:00[0m [[32m[1minfo     [0m] [1mTransforming data[0m
[2m2023-12-07 14:30:00[0m [[32m[1minfo     [0m] [1mPreprocessing finished[0m
[2m2023-12-07 14:30:00[0m [[32m[1minfo     [0m] [1mCross validation evaluation for model LogisticRegression started.[0m
[2m2023-12-07 14:30:01[0m [[32m[1minfo     [0m] [1mCross validation evaluation for model LogisticRegression finished.[0m
[2m2023-12-07 14:30:01[0m [[32m[1minfo     [0m] [1mEvaluation of predictions started.[0m
[2m2023-12-07 14:30:01[0m [[32m[1minfo     [0m] [1mROC AUC score: 0.7985133358298327[0m


Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 472.67it/s] 
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 153.10it/s]
2023/12/07 14:30:06 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/12/07 14:30:06 INFO mlflow.models.evaluation.default_evaluator: Evaluating candidate model:
2023/12/07 14:30:06 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/12/07 14:30:06 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/12/07 14:30:06 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2023/12/07 14:30:09 INFO mlflow.models.evaluation.default_evaluator: Shap explainer LinearExplainer is used.
2023/12/07 14:30:12 INFO mlflow.models.evaluation.default_evaluator: Evaluating baseline model:
2023/12/07 14:30:12 INFO mlflow.models.evaluation.default_evaluator: Computing model pre

KeyboardInterrupt: 