# 🧾 Table of Contents

- [5. Fine-tuning the Model](#5.-Fine-tuning-the-Model)
  - [Preparing our toolbox](#Preparing-our-toolbox)
  - [5.1 CatBoost](#5.1-CatBoost)
  - [5.2 LightGBM](#5.2-LightGBM)
  - [5.3 Final Model](#5.3-Final-Model)
    - [Validation Score](#Validation-Score)
    - [Test Submission](#Test-Submission)

# 5. Fine-tuning the Model

CatBoost and LightGBM will be fine-tuned using RandomizedSearchCV and then combined in an ensemble to generate the final model. These 3 classifiers will be validated, saved to the ```models/``` folder, and used to generate the submission data in ```data/processed```. To reproduce this process, use:

```make models```

## Preparing our toolbox

In [1]:
# Notebook setup: load the autoreload extension in mode 2 and use the inline matplotlib backend
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Libraries created for this project
from src.data.explore_data import (
        list_datasets, 
        describe_feature, 
        overview_data, 
        create_dataframe, 
        describe_features, 
        create_exploratory_dataset
)
from src.features.pipeline import (
        NanTransformer,
        trim_outliers, 
        run_preprocessing_pipeline,
        create_classifiers_dataset
)

#Basic Imports
import pandas as pd
import numpy as np
from pathlib import Path

# Sklearn Base
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
from sklearn.base import clone

# Models
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

# Models Persistence
from joblib import dump, load

# Imblearn
from imblearn import FunctionSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline

# Feature Engine
from feature_engine.imputation import  MeanMedianImputer, CategoricalImputer, RandomSampleImputer
from feature_engine.encoding import OrdinalEncoder
from feature_engine.outliers import OutlierTrimmer
from feature_engine.creation import MathFeatures, RelativeFeatures
from feature_engine.selection import DropFeatures

In [2]:
# Output folders, both located under the parent of the current working directory
project_root = Path.cwd().resolve().parent
models_directory = project_root.joinpath('models')
processed_data_directory = project_root.joinpath("data", "processed")

# Cross-validation splitter shared by both searches: 3 stratified, shuffled folds, fixed seed
cv = StratifiedKFold(
    n_splits=3,
    shuffle=True,
    random_state=42,
)

# Load the raw training applications and the interim per-column decision table
df = create_dataframe("raw", "application_train.csv")
df_decision = create_dataframe("interim", "application_decision.csv")

# Separate the predictors from the TARGET column
X = df.drop(columns="TARGET").copy()
y = df["TARGET"].copy()

# Hold out 20% of the rows for validation, stratified on the target
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Feature groups consumed by the pipeline steps, derived from the decision dataframe.

# A column is dropped when any cell of its decision row equals "drop".
drop_cols = df_decision["Column"][df_decision.isin(["drop"]).any(axis=1)].to_list()

# Columns whose missing values are hidden behind the placeholder strings 'XNA' / 'Unknown'
# (see the NanTransformer steps of the pipeline).
xna_cols = ["CODE_GENDER", "ORGANIZATION_TYPE"]
unknown_cols = ["NAME_FAMILY_STATUS"]

# na=False keeps the mask strictly boolean: rows with a missing TypeDecision are left out
# instead of putting NaN in the mask, which would make the boolean indexing raise.
ordinal_encoder_cols = df_decision["Column"][
    df_decision["TypeDecision"].str.contains("OrdinalEncoder", na=False)
].to_list()

# Columns selected by an exact match on their decision value
nan_to_frequent_cols = df_decision["Column"][df_decision["NanDecision"] == "frequent"].to_list()
nan_to_random_cols = df_decision["Column"][df_decision["NanDecision"] == "random"].to_list()
outliers_cols = df_decision["Column"][df_decision["OutliersDecision"] == "remove"].to_list()
group_cols = df_decision["Column"][df_decision["CorrDecision"] == "group"].to_list()

# [variable, reference] pairs used by the 'div' feature-creation steps of the pipeline
rate_annuity_cols = ["AMT_CREDIT", "AMT_ANNUITY"]
rate_credit_cols = ["AMT_INCOME_TOTAL", "AMT_CREDIT"]
rate_income_cols = ["AMT_INCOME_TOTAL", "CNT_FAM_MEMBERS"]

# Pre-processing pipeline; each model section below clones it and appends a classifier
preprocessing_pipeline = imbpipeline(steps=[
    # 1. Drop columns with NanPercentage > 30% and SK_ID_CURR
    ('drop_features', DropFeatures(features_to_drop=drop_cols)),

    # 2. Turn the placeholder strings into real NaNs
    ('xna_transformer', NanTransformer(variables=xna_cols, nan_string='XNA')),
    ('unknown_transformer', NanTransformer(variables=unknown_cols, nan_string='Unknown')),

    # 3. Fill the NaN values
    ('num_imputer', MeanMedianImputer()),
    ('cat1_imputer', CategoricalImputer(variables=nan_to_frequent_cols, imputation_method='frequent')),
    ('cat2_imputer', RandomSampleImputer(variables=nan_to_random_cols, random_state=42)),

    # 4. Encode the categorical features
    ('ordinal_encoder', OrdinalEncoder(variables=ordinal_encoder_cols)),

    # 5. Trim outliers (with Z-score > +-3)
    (
        'trim_outliers',
        FunctionSampler(
            func=trim_outliers,
            validate=False,
            kw_args=dict(
                capping_method='gaussian',
                tail='both',
                variables=outliers_cols,
            ),
        ),
    ),

    # 6. Sum the grouped document columns into one feature, discarding the originals
    (
        'grouper',
        MathFeatures(
            variables=group_cols,
            func=[np.sum],
            new_variables_names=['DOCUMENTS_PROVIDED'],
            drop_original=True,
        ),
    ),

    # 7. Create the ratio features (first column of each pair divided by the second)
    (
        'AMT_CREDIT_div_AMT_ANNUITY_creator',
        RelativeFeatures(
            variables=rate_annuity_cols[:1],
            reference=rate_annuity_cols[1:],
            func=['div'],
            drop_original=False,
        ),
    ),
    (
        'AMT_INCOME_TOTAL_div_AMT_CREDIT_creator',
        RelativeFeatures(
            variables=rate_credit_cols[:1],
            reference=rate_credit_cols[1:],
            func=['div'],
            drop_original=False,
        ),
    ),
    (
        'AMT_INCOME_TOTAL_div_CNT_FAM_MEMBERS_creator',
        RelativeFeatures(
            variables=rate_income_cols[:1],
            reference=rate_income_cols[1:],
            func=['div'],
            drop_original=False,
        ),
    ),

    # 8. Scale the features
    ('scaler', MinMaxScaler()),
])


## 5.1 CatBoost 

In [None]:
# CatBoost pipeline: a clone of the shared pre-processing steps plus the classifier
catboost_pipeline = clone(preprocessing_pipeline)
catboost_pipeline.steps.append(('classifier', CatBoostClassifier(random_state=42, verbose=False)))

# Tuning parameters: 5 evenly spaced candidate values per hyper-parameter
catboost_params = {
    'classifier__iterations': np.linspace(10, 1000, 5, dtype=int),
    'classifier__learning_rate': np.linspace(0.01, 0.8, 5, dtype=float),
    'classifier__depth': np.linspace(6, 10, 5, dtype=int),
}

# Randomized search scored by ROC-AUC on the shared stratified folds.
# n_iter is not set, so the library default number of candidates is sampled.
catboost = RandomizedSearchCV(
    estimator=catboost_pipeline,
    param_distributions=catboost_params,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    random_state=42
)

# Create the output folder before the long-running search starts, so the fitted
# search is not lost to a failed dump when the folder is missing
models_directory.mkdir(parents=True, exist_ok=True)

# Fitting and saving model
catboost.fit(X_train, y_train)
dump(catboost, (models_directory / 'catboost.joblib'))

In [32]:
# Tabulate the cross-validation results of the CatBoost search (one row per sampled candidate)
catboost_cv_results = pd.DataFrame(catboost.cv_results_)
display(catboost_cv_results)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__iterations,param_classifier__depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,296.20911,5.384989,86.362103,1.998849,0.2075,505,9,{'classifier__learning_rate': 0.20750000000000...,0.713496,0.715297,0.714467,0.71442,0.000736,3
1,112.102418,1.609196,94.723661,0.986087,0.6025,10,9,"{'classifier__learning_rate': 0.6025, 'classif...",0.717755,0.715683,0.706403,0.71328,0.004936,4
2,622.57326,29.725158,106.599369,8.442459,0.8,1000,10,"{'classifier__learning_rate': 0.8, 'classifier...",0.659058,0.66299,0.65373,0.658593,0.003795,9
3,225.160938,27.780598,128.983145,10.912766,0.8,10,10,"{'classifier__learning_rate': 0.8, 'classifier...",0.70928,0.704835,0.704143,0.706086,0.002276,6
4,322.756382,13.74253,100.906166,5.162454,0.2075,257,7,{'classifier__learning_rate': 0.20750000000000...,0.741095,0.745928,0.741572,0.742865,0.002175,1
5,525.59065,5.997923,103.356982,12.564448,0.2075,1000,8,{'classifier__learning_rate': 0.20750000000000...,0.712035,0.706621,0.708479,0.709045,0.002246,5
6,304.474179,5.385816,112.651954,4.693916,0.8,257,8,"{'classifier__learning_rate': 0.8, 'classifier...",0.662293,0.669268,0.666217,0.665926,0.002855,8
7,463.617831,32.844576,157.799687,53.964925,0.405,1000,6,"{'classifier__learning_rate': 0.405, 'classifi...",0.696082,0.701891,0.697781,0.698585,0.002439,7
8,406.300391,21.345398,114.135451,30.463676,0.8,257,10,"{'classifier__learning_rate': 0.8, 'classifier...",0.650038,0.658004,0.651109,0.653051,0.00353,10
9,324.873014,74.621244,48.157219,21.828678,0.405,257,6,"{'classifier__learning_rate': 0.405, 'classifi...",0.728468,0.736478,0.730125,0.73169,0.003452,2


## 5.2 LightGBM

In [3]:
# LightGBM pipeline: a clone of the shared pre-processing steps plus the classifier
lgbm_pipeline = clone(preprocessing_pipeline)
lgbm_pipeline.steps.append(('classifier', LGBMClassifier(random_state=42)))

# Tuning parameters: 5 evenly spaced candidate values per hyper-parameter
lgbm_params = {
    'classifier__learning_rate': np.linspace(0.01, 0.8, 5, dtype=float),
    'classifier__max_depth': np.linspace(2, 16, 5, dtype=int),
    'classifier__n_estimators': np.linspace(100, 6000, 5, dtype=int),
    'classifier__num_leaves': np.linspace(20, 3000, 5, dtype=int)
}

# Randomized search over 15 sampled candidates, scored by ROC-AUC on the shared stratified folds
lgbm = RandomizedSearchCV(
    estimator=lgbm_pipeline,
    param_distributions=lgbm_params,
    n_iter=15,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    random_state=42
)

# Create the output folder before the long-running search starts, so the fitted
# search is not lost to a failed dump when the folder is missing
models_directory.mkdir(parents=True, exist_ok=True)

# Fitting and saving model
lgbm.fit(X_train, y_train)
dump(lgbm, (models_directory / 'lgbm.joblib'))

['/Users/ewerthon/Documents/Materiais/[Curso] Data Science Degree/Módulo 11/ml-credit-default-risk/models/lgbm.joblib']

In [5]:
# Tabulate the cross-validation results of the LightGBM search (one row per sampled candidate)
lgbm_cv_results = pd.DataFrame(lgbm.cv_results_)
display(lgbm_cv_results)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__num_leaves,param_classifier__n_estimators,param_classifier__max_depth,param_classifier__learning_rate,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,1926.218041,0.836559,1234.121669,7.514487,1510,8000,16,0.01,"{'classifier__num_leaves': 1510, 'classifier__...",0.719827,0.726793,0.724546,0.723722,0.002903,3
1,197.066629,0.856642,99.14377,1.293485,1510,2037,12,0.8,"{'classifier__num_leaves': 1510, 'classifier__...",0.693122,0.689131,0.69242,0.691558,0.001739,9
2,242.017041,16.562043,75.585676,2.031068,765,8000,2,0.6025,"{'classifier__num_leaves': 765, 'classifier__n...",0.703917,0.710228,0.706095,0.706747,0.002617,6
3,417.947717,61.06294,215.375987,3.220026,3000,8000,5,0.01,"{'classifier__num_leaves': 3000, 'classifier__...",0.745903,0.75077,0.742915,0.746529,0.003238,1
4,293.236993,75.377523,102.881106,64.774933,3000,8000,5,0.8,"{'classifier__num_leaves': 3000, 'classifier__...",0.541482,0.680921,0.568845,0.597083,0.060326,14
5,395.914094,1.256572,157.133748,1.668404,765,6012,16,0.8,"{'classifier__num_leaves': 765, 'classifier__n...",0.690265,0.690225,0.688998,0.689829,0.000588,10
6,518.644343,1.362663,205.986101,5.006974,3000,8000,12,0.6025,"{'classifier__num_leaves': 3000, 'classifier__...",0.697812,0.692867,0.700167,0.696949,0.003042,7
7,122.025583,1.923021,57.216039,4.5987,2255,50,12,0.01,"{'classifier__num_leaves': 2255, 'classifier__...",0.672236,0.676092,0.671206,0.673178,0.002103,13
8,221.631046,11.900131,71.092915,7.602598,2255,4025,2,0.2075,"{'classifier__num_leaves': 2255, 'classifier__...",0.743401,0.748097,0.743164,0.744887,0.002272,2
9,349.986683,4.437315,195.295102,3.891278,3000,6012,5,0.6025,"{'classifier__num_leaves': 3000, 'classifier__...",0.685729,0.682968,0.684639,0.684445,0.001135,12


## 5.3 Final Model

Creating a voting-based Ensemble of the classifiers as the **final model**.

In [33]:
# Final model: a clone of the shared pre-processing steps followed by a soft-voting
# ensemble of the classifiers selected by the two saved searches.
# `best_estimator_[-1]` takes the last step (the classifier) of each best pipeline.
# NOTE(review): joblib `load` unpickles the files - only load artifacts produced by this project.
model = clone(preprocessing_pipeline)
model.steps.append(
    ('classifier', VotingClassifier(
        # `estimators` is documented as a list of (name, estimator) tuples; a tuple
        # container is rejected by scikit-learn's parameter validation
        estimators=[
            ('LightGBM', load(models_directory / 'lgbm.joblib').best_estimator_[-1]),
            ('CatBoost', load(models_directory / 'catboost.joblib').best_estimator_[-1]),
        ],
        voting='soft'
    )
))

# Fitting and saving model
model.fit(X_train, y_train)
dump(model, (models_directory / 'final_model.joblib'))

['/Users/ewerthon/Documents/Materiais/[Curso] Data Science Degree/Módulo 11/ml-credit-default-risk/models/final_model.joblib']

### Validation Score

In [34]:
# ROC-AUC of the final model on the held-out validation split, using the second predict_proba column
valid_probas = model.predict_proba(X_valid)[:, 1]
roc_auc_score(y_valid, valid_probas)

0.7558946697342068

### Test Submission

In [46]:
# Score the raw test applications with the final model, using the second predict_proba column
X_test = create_dataframe('raw', 'application_test_student.csv')
predict_probas = model.predict_proba(X_test)[:,1]

# Submission file: one row per application id with its predicted probability
submission = pd.DataFrame({
        'SK_ID_CURR': X_test['SK_ID_CURR'],
        'TARGET': predict_probas
})

# Create the output folder if needed so to_csv does not fail when it is missing
processed_data_directory.mkdir(parents=True, exist_ok=True)
submission.to_csv(processed_data_directory / 'submission.csv', index=False)


The final model achieved a ROC-AUC score of 75.6% on the validation set. For more details, conclusions, the roadmap, and lessons learned, see the project report: [LINK]