In [None]:
import pandas as pd 
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
import random 
import numpy as np
import optuna
from optuna.samplers import TPESampler
from sasviya.ml.linear_model import LogisticRegression
from sasviya.ml.tree import ForestClassifier
import matplotlib.pyplot as plt
import os

# Seed Python's and NumPy's global random generators with one shared constant.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# A single seeded TPE sampler instance; every Optuna study created below is handed this same object.
sampler=TPESampler(seed=SEED)
# Only emit Optuna messages at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

### Data Splitting and Modelling
In this section we start by defining key variables such as:
- data_path = here we pass the path to the modelling_data we have previously prepared 
- features = we specify which of the available columns are used when fitting the models (every other column is excluded)
- target = specify the target variable
- train_frac = portion of data for training
- valid_frac = portion of data for validation
- test_frac = portion of data for testing

Next, we split the data into train, validation and test sets (stratified on the target variable) using the `_PartInd_` partition indicator column (1 = train, 2 = validation, 3 = test).

In [None]:
# CSV holding the prepared modelling data.
data_path = '../data/cleaned_data/train_valid_test.csv'

# Predictor columns handed to every model.
features = [
    'CreditPolicy',
    'PublicRecord',
    'Purpose',
    'InterestRate',
    'Installment',
    'Delinquencies2Yrs',
    'BIN_CreditLineAge',
    'BIN_DebtIncRatio',
    'BIN_FICOScore',
    'BIN_Inquiries6Mnths',
    'BIN_LogAnnualInc',
    'BIN_RevBalance',
    'BIN_RevUtilization',
]

# Column the models are trained to predict.
target = 'Default'

In [None]:
woe_transform_credit_data = pd.read_csv(data_path)

# Rows are routed to a partition by their `_PartInd_` value: 1 -> train, 2 -> valid, 3 -> test.
partition_id = woe_transform_credit_data['_PartInd_']
train = woe_transform_credit_data.loc[partition_id == 1].reset_index(drop=True)
valid = woe_transform_credit_data.loc[partition_id == 2].reset_index(drop=True)
test = woe_transform_credit_data.loc[partition_id == 3].reset_index(drop=True)

# Sum of the target column per partition.
train_defaults, valid_defaults, test_defaults = (
    part[target].sum() for part in (train, valid, test)
)

# Print each partition's row count and its target sum as a percentage of rows.
for part_name, part, n_defaults in (
    ('Train', train, train_defaults),
    ('Valid', valid, valid_defaults),
    ('Test', test, test_defaults),
):
    print(f'{part_name} Size:', part.shape[0], f'--- {target} Frequency:',
          f'{round(100*n_defaults/part.shape[0],2)}%')

Here we are defining some key functions to automate the hyperparameter search and the entire model fitting.

For instance:
- objective_logistic() => is a function that defines the search space for the logistic regression hyperparameters. In a given trial, we will then fit the model and return the performance metric we are trying to maximize/minimize.
- optimize_model() => creates the optimization study, produces validation metrics and returns the train F1 score, the validation F1 score, the best hyperparameters and the best probability cutoff to include.
- test_fit() => refits the tuned model on the combined train and validation data and evaluates it on the test data
- optimize_p_threshold() => iterates over a set of possible probability cutoffs and identifies the one that maximizes performance on validation data.

In [None]:
def objective_logistic(trial, train, valid):
    """Optuna objective for the logistic regression.

    Asks the trial for a variable-selection method, fits a
    ``LogisticRegression`` with it on ``train`` and returns the F1 score of
    its predictions on ``valid``.

    Parameters
    ----------
    trial : optuna trial used to sample the ``selection`` hyperparameter.
    train, valid : DataFrames containing the ``features`` and ``target`` columns.

    Returns
    -------
    The validation F1 score for this trial.
    """
    selection_method = trial.suggest_categorical(
        'selection', ["backward", "forward", "lasso", "stepwise", None])

    candidate = LogisticRegression(selection=selection_method)
    candidate.fit(train[features], train[target])

    valid_preds = candidate.predict(valid[features])
    valid_labels = valid[target].to_numpy()
    return f1_score(valid_labels, valid_preds)

def objective_forest(trial, train, valid):
    """Optuna objective for the forest classifier.

    Samples the number of trees, tree depth, features per split and bin
    count, fits a ``ForestClassifier`` on ``train`` with the notebook seed
    and returns the F1 score of its predictions on ``valid``.

    Parameters
    ----------
    trial : optuna trial used to sample the hyperparameters.
    train, valid : DataFrames containing the ``features`` and ``target`` columns.

    Returns
    -------
    The validation F1 score for this trial.
    """
    n_features = len(features)
    # Candidates for max_features: all features, floor(sqrt(n)) and floor(log2(n)).
    max_features_choices = [n_features, int(n_features**0.5), int(np.log2(n_features))]

    # Dict entries are evaluated top to bottom, so parameters are requested
    # from the trial in this order.
    search_point = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300, step=50),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'max_features': trial.suggest_categorical('max_features', max_features_choices),
        'n_bins': trial.suggest_int('n_bins', 10, 100, step=10),
    }

    forest = ForestClassifier(**search_point, random_state=SEED, oob_score=False)
    forest.fit(train[features], train[target])

    valid_preds = forest.predict(valid[features])
    valid_labels = valid[target].to_numpy()
    return f1_score(valid_labels, valid_preds)

def optimize_model(model, n_trials, train, valid):
    """Tune one model type with Optuna, refit it and tune its probability cutoff.

    The hyperparameter search maximises the validation F1 score returned by
    the matching objective function. The best configuration is refitted on
    ``train``, the probability cutoff is chosen on ``valid`` via
    ``optimize_p_threshold`` and the same cutoff is used to score ``train``.

    Parameters
    ----------
    model : str
        ``'Logistic'`` or ``'Forest'``.
    n_trials : int
        Number of Optuna trials to run.
    train, valid : DataFrames containing the ``features`` and ``target`` columns.

    Returns
    -------
    tuple
        ``(train_f1, valid_f1, best_params, best_prob_thresh)``.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    # Resolve the model name first so an unsupported value fails immediately
    # with a clear message instead of an unbound `optimized_model` later on.
    if model == 'Logistic':
        objective, estimator_cls, fixed_params = objective_logistic, LogisticRegression, {}
    elif model == 'Forest':
        objective, estimator_cls = objective_forest, ForestClassifier
        # Same non-tuned settings as in objective_forest, so the refit uses the
        # configuration that was actually evaluated during the search.
        fixed_params = {'random_state': SEED, 'oob_score': False}
    else:
        raise ValueError(f"Unsupported model {model!r}; expected 'Logistic' or 'Forest'")

    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(lambda trial: objective(trial, train, valid), n_trials=n_trials)

    optimized_model = estimator_cls(**fixed_params, **study.best_params)
    optimized_model.fit(train[features], train[target])

    # Second predict_proba column is used as the score that gets thresholded
    # (assumed to be the positive-class probability -- TODO confirm).
    valid_probs = optimized_model.predict_proba(valid[features]).iloc[:,1].to_numpy()
    valid_f1, best_prob_thresh = optimize_p_threshold(valid_probs, valid[target])

    train_probs = optimized_model.predict_proba(train[features]).iloc[:,1].to_numpy()
    # Single comparison instead of two sequential in-place masks.
    train_preds = (train_probs >= best_prob_thresh).astype(int)
    train_f1 = f1_score(train[target].to_numpy(), train_preds)

    print(f'Finished {model} optimization')
    return train_f1, valid_f1, study.best_params, best_prob_thresh

def test_fit(model, hyperparams, train_valid, best_prob_thresh):
    """Refit a tuned model on ``train_valid`` and score it on the test partition.

    Parameters
    ----------
    model : str
        ``'Logistic'`` or ``'Forest'``.
    hyperparams : dict
        Keyword arguments for the estimator (e.g. the best Optuna parameters).
    train_valid : DataFrame
        Rows to fit on; must contain the ``features`` and ``target`` columns.
    best_prob_thresh : float
        Scores greater than or equal to this cutoff are predicted as 1.

    Returns
    -------
    tuple
        ``(fitted_model, test_f1, test_preds)`` where ``test_preds`` is a
        0/1 array for the module-level ``test`` frame.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    # The frame passed by the caller is used as-is; it is no longer silently
    # replaced by a concat of the global `train` and `valid`.
    if model == 'Logistic':
        optimized_model = LogisticRegression(**hyperparams)
    elif model == 'Forest':
        # Defaults mirror the fixed settings used during the search; values
        # supplied in `hyperparams` take precedence.
        forest_params = {'random_state': SEED, 'oob_score': False, **hyperparams}
        optimized_model = ForestClassifier(**forest_params)
    else:
        raise ValueError(f"Unsupported model {model!r}; expected 'Logistic' or 'Forest'")

    optimized_model.fit(train_valid[features], train_valid[target])

    test_probs = optimized_model.predict_proba(test[features]).iloc[:,1].to_numpy()
    # Binarise with one comparison; keep the float dtype the function has always returned.
    prob_preds = (test_probs >= best_prob_thresh).astype(test_probs.dtype)
    test_f1 = f1_score(test[target].to_numpy(), prob_preds)
    return optimized_model, test_f1, prob_preds

def optimize_p_threshold(prob_preds, true_y, thresholds=None):
    """Find the probability cutoff that maximises the F1 score.

    Parameters
    ----------
    prob_preds : array-like
        Predicted scores; values greater than or equal to a cutoff are
        classified as 1, the rest as 0. The input is not modified.
    true_y : array-like
        True labels aligned with ``prob_preds``.
    thresholds : iterable of float, optional
        Candidate cutoffs. Defaults to 0.01, 0.02, ..., 0.99.

    Returns
    -------
    tuple
        ``(best_f1, best_threshold)``. If no cutoff achieves an F1 above 0,
        ``(0, 0)`` is returned.
    """
    if thresholds is None:
        thresholds = np.arange(0.01, 1, 0.01)

    scores = np.asarray(prob_preds)
    best_prob_thresh = 0
    best_f1_achieved = 0
    for cutoff in thresholds:
        hard_preds = (scores >= cutoff).astype(int)
        # zero_division=0 keeps the score at 0 for cutoffs with no predicted
        # positives without emitting a warning for each of them.
        curr_f1 = f1_score(true_y, hard_preds, zero_division=0)
        # Strict '>' keeps the lowest cutoff among ties.
        if curr_f1 > best_f1_achieved:
            best_prob_thresh = cutoff
            best_f1_achieved = curr_f1
    return best_f1_achieved, best_prob_thresh

In [None]:
# Number of Optuna trials spent on each model.
N_TRIALS = 20

logistic_search = optimize_model('Logistic', N_TRIALS, train, valid)
train_f1_logistic, valid_f1_logistic, logistic_hp, prob_thresh_logistic = logistic_search

forest_search = optimize_model('Forest', N_TRIALS, train, valid)
train_f1_forest, valid_f1_forest, forest_hp, prob_thresh_forest = forest_search

# Final models are refitted on train + valid before being scored on test.
train_valid = pd.concat([train, valid])

logistic_test = test_fit('Logistic', logistic_hp, train_valid, prob_thresh_logistic)
optimized_logistic, test_f1_logistic, test_logistic_preds = logistic_test

forest_test = test_fit('Forest', forest_hp, train_valid, prob_thresh_forest)
optimized_forest, test_f1_forest, test_forest_preds = forest_test

In [None]:
# One column per model, one row per metric.
metric_rows = ['Train F1', 'Valid F1', 'Test F1', 'Prob Threshold']
logistic_column = [train_f1_logistic, valid_f1_logistic, test_f1_logistic, prob_thresh_logistic]
forest_column = [train_f1_forest, valid_f1_forest, test_f1_forest, prob_thresh_forest]

model_comparison = pd.DataFrame(
    data={'Logistic': logistic_column, 'Forest': forest_column},
    index=metric_rows,
)
# Displayed scaled by 100 after rounding to 4 decimals (every row, threshold included).
100*model_comparison.round(4)

In [None]:
fig, axs = plt.subplots(ncols=2, figsize=(20,5))

# One row-normalised test confusion matrix per model, drawn left to right.
test_predictions = {'Logistic': test_logistic_preds, 'Forest': test_forest_preds}
for ax, (model_name, model_preds) in zip(axs, test_predictions.items()):
    disp = ConfusionMatrixDisplay(confusion_matrix(test[target], model_preds, normalize='true'))
    disp.plot(cmap=plt.cm.Blues, ax=ax)
    ax.set_title(f'Test Confusion Matrix - {model_name}')

plt.show()

### Your Task

Develop a Gradient Boosting model, optimize it and compare its performance against the Logistic and Forest models trained above.

![GB Classifier Overview](../img/GB_Details_Python.png)

For further guidance: https://go.documentation.sas.com/doc/en/workbenchcdc/v_001/explore/n1kiea90s0276wn1xr0ig0hvkix6.htm

In [None]:
from sasviya.ml.tree import GradientBoostingClassifier

In [None]:
# Instantiate the model, fit it and evaluate it
# Note: Optuna can suggest the following: suggest_int, suggest_float, suggest_categorical
def objective_gb(trial, train, valid):
    # Optuna objective template for the gradient boosting model: fits the model
    # on `train` and returns the F1 score of its predictions on `valid`.
    # TODO (exercise): sample each hyperparameter from `trial` here, e.g. with
    # trial.suggest_int / trial.suggest_float / trial.suggest_categorical.
    #Your Hyperparameters definition

    # TODO (exercise): replace the placeholder below with the sampled hyperparameters;
    # the cell does not run until this is filled in.
    model = GradientBoostingClassifier(YOUR HYPERPARAMETERS)
    model.fit(train[features], train[target])
    preds = model.predict(valid[features])
    return f1_score(valid[target].to_numpy(), preds)

def optimize_gb_model(model, n_trials, train, valid):
    """Tune the gradient boosting model, refit it and tune its probability cutoff.

    Parameters
    ----------
    model : label that is only used in the completion message.
    n_trials : number of Optuna trials to run with ``objective_gb``.
    train, valid : DataFrames containing the ``features`` and ``target`` columns.

    Returns
    -------
    tuple
        ``(train_f1, valid_f1, best_params, best_prob_thresh)``.
    """
    def _run_trial(trial):
        return objective_gb(trial, train, valid)

    gb_study = optuna.create_study(direction='maximize', sampler = sampler)
    gb_study.optimize(_run_trial, n_trials=n_trials)
    tuned_params = gb_study.best_params

    # Refit the best configuration on the training partition.
    optimized_model = GradientBoostingClassifier(**tuned_params)
    optimized_model.fit(train[features], train[target])

    # The cutoff is picked on the validation scores ...
    valid_scores = optimized_model.predict_proba(valid[features]).iloc[:,1].to_numpy()
    valid_f1, best_prob_thresh = optimize_p_threshold(valid_scores, valid[target])

    # ... and then applied unchanged to the training scores.
    train_scores = optimized_model.predict_proba(train[features]).iloc[:,1].to_numpy()
    train_scores[train_scores>=best_prob_thresh] = 1
    train_scores[train_scores<best_prob_thresh] = 0
    train_f1 = f1_score(train[target].to_numpy(), train_scores)

    print(f'Finished {model} optimization')
    return train_f1, valid_f1, tuned_params, best_prob_thresh

def test_fit_gb(model, hyperparams, train_valid, best_prob_thresh):
    """Refit the gradient boosting model on ``train_valid`` and score it on test.

    Parameters
    ----------
    model : unused; kept so the signature matches ``test_fit``.
    hyperparams : dict
        Keyword arguments for ``GradientBoostingClassifier``.
    train_valid : DataFrame
        Rows to fit on; must contain the ``features`` and ``target`` columns.
    best_prob_thresh : float
        Scores greater than or equal to this cutoff are predicted as 1.

    Returns
    -------
    tuple
        ``(fitted_model, test_f1, test_preds)`` where ``test_preds`` is a
        0/1 array for the module-level ``test`` frame.
    """
    # Fit on exactly the frame the caller passed in. Rebuilding it from the
    # global `train`/`valid` would ignore the argument and would silently pick
    # up whatever `train` currently holds (e.g. after synthetic rows are added).
    optimized_model = GradientBoostingClassifier(**hyperparams)
    optimized_model.fit(train_valid[features], train_valid[target])

    test_probs = optimized_model.predict_proba(test[features]).iloc[:,1].to_numpy()
    # Binarise with one comparison; keep the float dtype of the scores.
    prob_preds = (test_probs >= best_prob_thresh).astype(test_probs.dtype)
    test_f1 = f1_score(test[target].to_numpy(), prob_preds)
    return optimized_model, test_f1, prob_preds

In [None]:
# Instantiate the model, fit it and evaluate it


In [None]:
# Make sure to compute the Train F1, Valid F1, Test F1 and Prob Threshold
# TODO (exercise): replace the `_____` placeholder with the four GB values,
# in the same order as the row labels passed as the index below.
model_comparison = pd.DataFrame(
    {'Logistic': [train_f1_logistic, valid_f1_logistic, test_f1_logistic, prob_thresh_logistic], 
    'Forest': [train_f1_forest, valid_f1_forest, test_f1_forest, prob_thresh_forest],
    'GB': [_____]},
    ['Train F1', 'Valid F1', 'Test F1', 'Prob Threshold'])
# Displayed scaled by 100 after rounding to 4 decimals.
100*model_comparison.round(4)

In [None]:
# Produce confusion matrices to compare the Gradient Boosting model against the Logistic Regression and the Forest
fig, axs = plt.subplots(ncols=3, figsize=(20,5))
disp = ConfusionMatrixDisplay(confusion_matrix(test[target], test_logistic_preds, normalize='true'))
disp.plot(cmap=plt.cm.Blues, ax=axs[0])
axs[0].set_title('Test Confusion Matrix - Logistic')

disp = ConfusionMatrixDisplay(confusion_matrix(test[target], test_forest_preds, normalize='true'))
disp.plot(cmap=plt.cm.Blues, ax=axs[1])
axs[1].set_title('Test Confusion Matrix - Forest')

# TODO (exercise): replace the `____` placeholder with the GB model's 0/1 test predictions.
disp = ConfusionMatrixDisplay(confusion_matrix(test[target], ____, normalize='true'))
disp.plot(cmap=plt.cm.Blues, ax=axs[2])
axs[2].set_title('Test Confusion Matrix - GB')

plt.show()

### Train with Synthetic Data

In [None]:
# CSV with the synthetic rows that are appended to the training partition.
synthetic_data_path = '../data/cleaned_data/synthetic_data.csv'

In [None]:
synthetic_data = pd.read_csv(synthetic_data_path)

# Rebuild the training partition from the raw frame instead of from `train`
# itself, so re-running this cell never appends the synthetic rows twice.
train_original = woe_transform_credit_data[woe_transform_credit_data['_PartInd_']==1]
# ignore_index gives the augmented frame a clean 0..n-1 index without duplicate labels.
train = pd.concat([train_original, synthetic_data], ignore_index=True)

# Only the training partition changes; valid and test are reported again for comparison.
train_defaults = train[target].sum()
valid_defaults = valid[target].sum()
test_defaults = test[target].sum()
print('Train Size:', train.shape[0], f'--- {target} Frequency:', f'{round(100*train_defaults/train.shape[0],2)}%')
print('Valid Size:', valid.shape[0], f'--- {target} Frequency:', f'{round(100*valid_defaults/valid.shape[0],2)}%')
print('Test Size:', test.shape[0], f'--- {target} Frequency:', f'{round(100*test_defaults/test.shape[0],2)}%')

### Your Task

Train a GradientBoosting Classifier with the newly augmented train dataset, optimize it and evaluate its performance.

- Are there any deltas in the Fit Metrics?
- Produce a table to directly compare fit metrics with and without synthetic data
- Produce the newly achieved confusion matrices

In [None]:
# TODO (exercise): replace both placeholders with the four values for the GB model
# trained without and with synthetic data, in the order of the row labels below.
model_comparison = pd.DataFrame(
    {'Logistic': [train_f1_logistic, valid_f1_logistic, test_f1_logistic, prob_thresh_logistic], 
    'Forest': [train_f1_forest, valid_f1_forest, test_f1_forest, prob_thresh_forest],
    'GB': [_______],
    'GB Synth': [_____]},
    ['Train F1', 'Valid F1', 'Test F1', 'Prob Threshold'])
# Displayed scaled by 100 after rounding to 4 decimals.
100*model_comparison.round(4)

In [None]:
# Row-normalised test confusion matrices for all four models, side by side.
fig, axs = plt.subplots(ncols=4, figsize=(20,5))
disp = ConfusionMatrixDisplay(confusion_matrix(test[target], test_logistic_preds, normalize='true'))
disp.plot(cmap=plt.cm.Blues, ax=axs[0])
axs[0].set_title('Test Confusion Matrix - Logistic')

disp = ConfusionMatrixDisplay(confusion_matrix(test[target], test_forest_preds, normalize='true'))
disp.plot(cmap=plt.cm.Blues, ax=axs[1])
axs[1].set_title('Test Confusion Matrix - Forest')

# TODO (exercise): replace the placeholder with the test predictions of the GB model trained without synthetic data.
disp = ConfusionMatrixDisplay(confusion_matrix(test[target], ______, normalize='true'))
disp.plot(cmap=plt.cm.Blues, ax=axs[2])
axs[2].set_title('Test Confusion Matrix - GB')

# TODO (exercise): replace the placeholder with the test predictions of the GB model trained with synthetic data.
disp = ConfusionMatrixDisplay(confusion_matrix(test[target], ______, normalize='true'))
disp.plot(cmap=plt.cm.Blues, ax=axs[3])
axs[3].set_title('Test Confusion Matrix - GB+Synth')

plt.show()

### Saving Models
We create an artifacts folder and save the models

In [None]:
mypath = 'artifacts'
# exist_ok makes the cell safe to re-run without a separate existence check.
os.makedirs(mypath, exist_ok=True)

# All outputs go under `mypath`, so the folder name is defined in one place only.
optimized_logistic.save(f'{mypath}/logistics_model.pkl')
optimized_forest.save(f'{mypath}/forest_model.pkl')
# Your GB Model
# Your GB+Synth Model
# Keep the index when exporting: it holds the metric names ('Train F1', ...),
# without which the rows of the CSV cannot be told apart.
model_comparison.to_csv(f'{mypath}/model_comparison.csv')