In [None]:
import pandas as pd 
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
import random 
import numpy as np

from sasviya.ml.linear_model import LogisticRegression
from sasviya.ml.tree import ForestClassifier
import matplotlib.pyplot as plt
import os

# One seed value, applied to both Python's and NumPy's global random generators.
SEED = 42
for seed_generator in (random.seed, np.random.seed):
    seed_generator(SEED)

### Data Splitting and Modelling
In this section we start by defining the key variables:
- `data_path`: path to the modelling data we prepared previously
- `features`: the columns, out of the set of available ones, that are used as inputs when fitting the models
- `target`: the target variable

The train, validation and test proportions were fixed when the data was prepared, so no split fractions are set here.

Next, we split the data into train, validation and test sets using the `_PartInd_` partition indicator stored in the data (1 = train, 2 = validation, 3 = test). The partitions were prepared by stratifying on the target variable, so we print the size and target frequency of each one as a check.

In [None]:
# Pre-partitioned modelling data (train / valid / test rows in a single CSV).
data_path = '../data/cleaned_data/train_valid_test.csv'

# Model inputs: columns used under their original names ...
raw_features = [
    'CreditPolicy',
    'PublicRecord',
    'Purpose',
    'InterestRate',
    'Installment',
    'Delinquencies2Yrs',
]
# ... followed by the 'BIN_'-prefixed columns (presumably binned versions
# produced during data preparation -- confirm against the preparation step).
binned_features = [
    'BIN_CreditLineAge',
    'BIN_DebtIncRatio',
    'BIN_FICOScore',
    'BIN_Inquiries6Mnths',
    'BIN_LogAnnualInc',
    'BIN_RevBalance',
    'BIN_RevUtilization',
]
features = raw_features + binned_features

# Column the models are trained to predict.
target = 'Default'

In [None]:
# Read the prepared modelling data; the '_PartInd_' column says which partition
# each row belongs to (1 -> train, 2 -> valid, 3 -> test, as used below).
woe_transform_credit_data = pd.read_csv(data_path)

partition_indicator = woe_transform_credit_data['_PartInd_']
train = woe_transform_credit_data.loc[partition_indicator == 1].reset_index(drop=True)
valid = woe_transform_credit_data.loc[partition_indicator == 2].reset_index(drop=True)
test = woe_transform_credit_data.loc[partition_indicator == 3].reset_index(drop=True)

# Sum of the target column per partition (the number of defaults when the target is 0/1).
train_defaults, valid_defaults, test_defaults = (
    part[target].sum() for part in (train, valid, test)
)

# Report the row count and the target frequency (in percent) of every partition.
for part_label, part, n_defaults in (
    ('Train', train, train_defaults),
    ('Valid', valid, valid_defaults),
    ('Test', test, test_defaults),
):
    print(f'{part_label} Size:', part.shape[0], f'--- {target} Frequency:', f'{round(100*n_defaults/part.shape[0],2)}%')

In [None]:
# Stage 1: fit on the training partition only, then score train and valid.
logistic_model = LogisticRegression()
logistic_model.fit(train[features], train[target])
train_preds_logistic, valid_preds_logistic = (
    logistic_model.predict(part[features]) for part in (train, valid)
)
train_f1_logistic = f1_score(train[target], train_preds_logistic)
valid_f1_logistic = f1_score(valid[target], valid_preds_logistic)

# Stage 2: refit the same estimator on train + valid and score the test partition.
# After this cell `logistic_model` holds the train + valid fit.
train_valid = pd.concat([train, valid])
logistic_model.fit(train_valid[features], train_valid[target])
test_preds_logistic = logistic_model.predict(test[features])
test_f1_logistic = f1_score(test[target], test_preds_logistic)

In [None]:
# Stage 1: fit on the training partition only, then score train and valid.
forest_model = ForestClassifier()
forest_model.fit(train[features], train[target])
train_preds_forest, valid_preds_forest = (
    forest_model.predict(part[features]) for part in (train, valid)
)
train_f1_forest = f1_score(train[target], train_preds_forest)
valid_f1_forest = f1_score(valid[target], valid_preds_forest)

# Stage 2: refit the same estimator on train + valid and score the test partition.
# After this cell `forest_model` holds the train + valid fit.
train_valid = pd.concat([train, valid])
forest_model.fit(train_valid[features], train_valid[target])
test_preds_forest = forest_model.predict(test[features])
test_f1_forest = f1_score(test[target], test_preds_forest)

In [None]:
# One column per model, one row per partition.
f1_by_model = {
    'Logistic': [train_f1_logistic, valid_f1_logistic, test_f1_logistic],
    'Forest': [train_f1_forest, valid_f1_forest, test_f1_forest],
}
model_comparison = pd.DataFrame(f1_by_model, index=['Train F1', 'Valid F1', 'Test F1'])

# Display the scores as percentages (rounded to 4 decimals before scaling).
model_comparison.round(4) * 100

In [None]:
# Side-by-side test-set confusion matrices, one panel per model.
fig, axs = plt.subplots(ncols=2, figsize=(20,5))
panels = (
    (test_preds_logistic, 'Test Confusion Matrix - Logistic'),
    (test_preds_forest, 'Test Confusion Matrix - Forest'),
)
for ax, (test_preds, panel_title) in zip(axs, panels):
    # normalize='true' scales each row (true class) so that it sums to 1.
    matrix = confusion_matrix(test[target], test_preds, normalize='true')
    disp = ConfusionMatrixDisplay(matrix)
    disp.plot(cmap=plt.cm.Blues, ax=ax)
    ax.set_title(panel_title)

plt.show()

### Your Task

Develop a Gradient Boosting model, try different hyperparameters, and compare its performance against the Logistic and Forest models trained above.

![GB Classifier Overview](../img/GB_Details_Python.png)

For further guidance: https://go.documentation.sas.com/doc/en/workbenchcdc/v_001/explore/n1kiea90s0276wn1xr0ig0hvkix6.htm

In [None]:
from sasviya.ml.tree import GradientBoostingClassifier

In [None]:
# Instantiate the model, fit it and evaluate it

In [None]:
# Exercise template: the comparison table from above with an extra 'GB' column.
# Make sure to compute the Train F1, Valid F1 and Test F1 of your Gradient
# Boosting model first, then replace the blank with those three scores in the
# same order as the row labels. The cell will not run until the blank is filled in.
model_comparison = pd.DataFrame(
    {'Logistic': [train_f1_logistic, valid_f1_logistic, test_f1_logistic], 
    'Forest': [train_f1_forest, valid_f1_forest, test_f1_forest],
    'GB': [_________]},
    ['Train F1', 'Valid F1', 'Test F1'])
# Round to 4 decimals, then scale by 100 to show the scores as percentages.
100*model_comparison.round(4)

In [None]:
# Exercise template: produce test-set confusion matrices to compare the Gradient
# Boosting model against the Logistic Regression and the Forest.
# Replace the blank in the third panel with your Gradient Boosting test
# predictions; the cell will not run until the blank is filled in.
fig, axs = plt.subplots(ncols=3, figsize=(20,5))
# Panel 1: Logistic Regression (normalize='true' scales each true-class row to sum to 1)
disp = ConfusionMatrixDisplay(confusion_matrix(test[target], test_preds_logistic, normalize='true'))
disp.plot(cmap=plt.cm.Blues, ax=axs[0])
axs[0].set_title('Test Confusion Matrix - Logistic')

# Panel 2: Forest
disp = ConfusionMatrixDisplay(confusion_matrix(test[target], test_preds_forest, normalize='true'))
disp.plot(cmap=plt.cm.Blues, ax=axs[1])
axs[1].set_title('Test Confusion Matrix - Forest')

# Panel 3: Gradient Boosting (fill in the blank)
disp = ConfusionMatrixDisplay(confusion_matrix(test[target], _______, normalize='true'))
disp.plot(cmap=plt.cm.Blues, ax=axs[2])
axs[2].set_title('Test Confusion Matrix - GB')

plt.show()

### Load Synthetic Data

In [None]:
# CSV with the synthetic records that the next cell appends to the training partition.
synthetic_data_path = '../data/cleaned_data/synthetic_data.csv'

In [None]:
# Load the synthetic records used to augment the training partition.
synthetic_data = pd.read_csv(synthetic_data_path)

# Rebuild the un-augmented training partition from the raw frame instead of
# appending to `train` itself: `train = pd.concat([train, synthetic_data])`
# grows on every re-run of this cell and silently duplicates the synthetic rows.
train_original = woe_transform_credit_data[woe_transform_credit_data['_PartInd_']==1].reset_index(drop=True)

# From here on `train` is the augmented training set. ignore_index=True gives
# it a fresh 0..n-1 index, so the two sources do not share duplicate labels.
train = pd.concat([train_original, synthetic_data], ignore_index=True)

# Only the training partition changed; valid and test are reported for comparison.
train_defaults = train[target].sum()
valid_defaults = valid[target].sum()
test_defaults = test[target].sum()
print('Train Size:', train.shape[0], f'--- {target} Frequency:', f'{round(100*train_defaults/train.shape[0],2)}%')
print('Valid Size:', valid.shape[0], f'--- {target} Frequency:', f'{round(100*valid_defaults/valid.shape[0],2)}%')
print('Test Size:', test.shape[0], f'--- {target} Frequency:', f'{round(100*test_defaults/test.shape[0],2)}%')

### Your Task

Train a GradientBoosting Classifier with the newly augmented train dataset and evaluate its performance.

- Are there any deltas in the Fit Metrics?
- Produce a table to directly compare fit metrics with and without synthetic data
- Produce the newly achieved confusion matrices

In [None]:
# Exercise template: the comparison table with two extra columns, 'GB' (trained
# without synthetic data) and 'GB+Synth' (trained on the augmented train set).
# Make sure to compute the Train F1, Valid F1 and Test F1 of both models first,
# then replace each blank with the three scores in the same order as the row
# labels. The cell will not run until both blanks are filled in.
model_comparison = pd.DataFrame(
    {'Logistic': [train_f1_logistic, valid_f1_logistic, test_f1_logistic], 
    'Forest': [train_f1_forest, valid_f1_forest, test_f1_forest],
    'GB': [________],
    'GB+Synth':[________]},
    ['Train F1', 'Valid F1', 'Test F1'])
# Round to 4 decimals, then scale by 100 to show the scores as percentages.
100*model_comparison.round(4)

In [None]:
# Exercise template: produce test-set confusion matrices to compare both Gradient
# Boosting models against the Logistic Regression and the Forest.
# Replace the blanks in panels 3 and 4 with the test predictions of the
# corresponding model; the cell will not run until both blanks are filled in.
fig, axs = plt.subplots(ncols=4, figsize=(20,5))
# Panel 1: Logistic Regression (normalize='true' scales each true-class row to sum to 1)
disp = ConfusionMatrixDisplay(confusion_matrix(test[target], test_preds_logistic, normalize='true'))
disp.plot(cmap=plt.cm.Blues, ax=axs[0])
axs[0].set_title('Test Confusion Matrix - Logistic')

# Panel 2: Forest
disp = ConfusionMatrixDisplay(confusion_matrix(test[target], test_preds_forest, normalize='true'))
disp.plot(cmap=plt.cm.Blues, ax=axs[1])
axs[1].set_title('Test Confusion Matrix - Forest')

# Panel 3: Gradient Boosting trained without synthetic data (fill in the blank)
disp = ConfusionMatrixDisplay(confusion_matrix(test[target], ______, normalize='true'))
disp.plot(cmap=plt.cm.Blues, ax=axs[2])
axs[2].set_title('Test Confusion Matrix - GB')

# Panel 4: Gradient Boosting trained on the augmented train set (fill in the blank)
disp = ConfusionMatrixDisplay(confusion_matrix(test[target], ________, normalize='true'))
disp.plot(cmap=plt.cm.Blues, ax=axs[3])
axs[3].set_title('Test Confusion Matrix - GB+Synth')

plt.show()

### Saving Models
We create an artifacts folder and save the models

In [None]:
# Folder that receives the saved models. exist_ok=True makes the call a no-op
# when the folder is already there, replacing the separate isdir() check.
mypath = 'artifacts'
os.makedirs(mypath, exist_ok=True)

# Build every output path from `mypath` so that renaming the folder only needs
# one change. Both models were last fitted on train + valid in the cells above.
logistic_model.save(os.path.join(mypath, 'logistic_model.pkl'))
forest_model.save(os.path.join(mypath, 'forest_model.pkl'))
#Your GB Model
#Your GB + Synth Model