# Credit Default Classification

* SeriousDlqin2yrs
* RevolvingUtilizationOfUnsecuredLines
* age
* NumberOfTime30-59DaysPastDueNotWorse
* DebtRatio
* MonthlyIncome
* NumberOfOpenCreditLinesAndLoans
* NumberOfTimes90DaysLate
* NumberRealEstateLoansOrLines
* NumberOfTime60-89DaysPastDueNotWorse
* NumberOfDependents

Source: https://www.kaggle.com/c/GiveMeSomeCredit/data

# Import required packages

In [85]:
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import average_precision_score, log_loss, accuracy_score, precision_score
import mlflow


# Set the parameters for the run

In [93]:
# File name used when the fitted model is persisted; note that it already carries the `.joblib` extension.
model_name = "credit_default.joblib"


# Get the data

In [25]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv('cs-training.csv')

In [26]:
# Show the raw column names; the unnamed first column is renamed to `id` in the next cell.
df.columns

Index(['Unnamed: 0', 'SeriousDlqin2yrs',
       'RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents'],
      dtype='object')

In [27]:
# Give the unnamed leading CSV column a meaningful name.
# Rebinding (rather than mutating in place) keeps the cell safe to re-run.
df = df.rename(columns={'Unnamed: 0': 'id'})

In [28]:
# Preview the first rows to sanity-check the load and the column rename.
df.head()

Unnamed: 0,id,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [29]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

id                                        int64
SeriousDlqin2yrs                          int64
RevolvingUtilizationOfUnsecuredLines    float64
age                                       int64
NumberOfTime30-59DaysPastDueNotWorse      int64
DebtRatio                               float64
MonthlyIncome                           float64
NumberOfOpenCreditLinesAndLoans           int64
NumberOfTimes90DaysLate                   int64
NumberRealEstateLoansOrLines              int64
NumberOfTime60-89DaysPastDueNotWorse      int64
NumberOfDependents                      float64
dtype: object

In [30]:
# Number of rows in the full data set.
df.shape[0]

150000

# Train Test Split

In [31]:
# Separate the predictors from the binary target column.
# NOTE(review): only the target is dropped, so the `id` column stays in X and is
# passed to the models as a feature — confirm this is intended.
X = df.drop(columns=['SeriousDlqin2yrs'])
y = df.loc[:, 'SeriousDlqin2yrs']

In [32]:
# Hold out 30% of the rows as a test split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

# Exploratory Data Analysis

In [33]:
# Peek at the training labels; the index values show the original row labels kept by the split.
y_train.head()

33237     0
98431     0
13250     0
60278     0
145808    0
Name: SeriousDlqin2yrs, dtype: int64

In [34]:
# Inspect the scalar type of one training label.
# Positional access is used on purpose: y_train keeps the original (shuffled) row
# labels, so `y_train[1]` is a label lookup that raises KeyError whenever row
# label 1 ended up in the test split.
type(y_train.iloc[0])

numpy.int64

In [35]:
# Summary statistics of the training target; for a 0/1 label the mean is the share of positives.
y_train.describe()

count    105000.000000
mean          0.066857
std           0.249776
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: SeriousDlqin2yrs, dtype: float64

In [36]:
# Number of rows in the training split.
X_train.shape[0]

105000

# Set up the model pipeline

In [37]:
# Gradient-boosted-trees pipeline with a two-stage preprocessing front end:
#   'impute' - SimpleImputer (default strategy) applied to every column of X_train;
#              the larger grids override the strategy through
#              'impute__scalar imputing mean__strategy'.
#   'scale'  - MinMaxScaler to [0, 1]; columns are addressed by position here,
#              presumably because the previous step no longer yields named columns.
gbt_pipeline = Pipeline(
    steps=[
        (
            'impute',
            ColumnTransformer(
                transformers=[
                    ('scalar imputing mean', SimpleImputer(), X_train.columns),
                ],
                remainder='drop',
            ),
        ),
        (
            'scale',
            ColumnTransformer(
                transformers=[
                    ('scalar scaling', MinMaxScaler(feature_range=(0, 1)), np.arange(X_train.shape[1])),
                ],
                remainder='drop',
            ),
        ),
        ('GBT', GradientBoostingClassifier()),
    ]
)

In [38]:
# Random-forest pipeline: impute every column of X_train, min-max scale the result
# (columns addressed by position), then fit the forest.
# The forest is seeded with the same value as the train/test split so that repeated
# runs of the notebook give the same trees and therefore comparable scores.
rf_pipeline = Pipeline(steps=[
    ('impute', ColumnTransformer(transformers=[
                        ('scalar imputing mean', SimpleImputer(), X_train.columns),
                        ], remainder='drop')),
    ('scale', ColumnTransformer(transformers=[
                        ('scalar scaling', MinMaxScaler(feature_range=(0, 1)), np.arange(0, len(X_train.columns))),
                        ], remainder='drop')),
    ('RF', RandomForestClassifier(random_state=123))
    ])

In [39]:
# Logistic-regression pipeline: impute every column of X_train, min-max scale the
# result (columns addressed by position), then fit the classifier.
# The solver is pinned to 'liblinear' because lr_grid searches both 'l1' and 'l2'
# penalties: liblinear supports both, whereas the 'lbfgs' default of newer
# scikit-learn releases rejects 'l1' and would make half of the grid fail.
lr_pipeline = Pipeline(steps=[
    ('impute', ColumnTransformer(transformers=[
                        ('scalar imputing mean', SimpleImputer(), X_train.columns),
                        ], remainder='drop')),
    ('scale', ColumnTransformer(transformers=[
                        ('scalar scaling', MinMaxScaler(feature_range=(0, 1)), np.arange(0, len(X_train.columns))),
                        ], remainder='drop')),
    ('LR', LogisticRegression(solver='liblinear'))
    ])

In [40]:
# List the parameter names exposed by the RF pipeline; these are the keys a search grid can use.
rf_pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'impute', 'scale', 'RF', 'impute__n_jobs', 'impute__remainder', 'impute__sparse_threshold', 'impute__transformer_weights', 'impute__transformers', 'impute__verbose', 'impute__scalar imputing mean', 'impute__scalar imputing mean__add_indicator', 'impute__scalar imputing mean__copy', 'impute__scalar imputing mean__fill_value', 'impute__scalar imputing mean__missing_values', 'impute__scalar imputing mean__strategy', 'impute__scalar imputing mean__verbose', 'scale__n_jobs', 'scale__remainder', 'scale__sparse_threshold', 'scale__transformer_weights', 'scale__transformers', 'scale__verbose', 'scale__scalar scaling', 'scale__scalar scaling__copy', 'scale__scalar scaling__feature_range', 'RF__bootstrap', 'RF__class_weight', 'RF__criterion', 'RF__max_depth', 'RF__max_features', 'RF__max_leaf_nodes', 'RF__min_impurity_decrease', 'RF__min_impurity_split', 'RF__min_samples_leaf', 'RF__min_samples_split', 'RF__min_weight_fraction_leaf', 'RF__n_estimators', '

# Set up the Grid Search

Set the parameters for a grid search over the selected family of models.

In [41]:
# Small smoke-test grid: 2 x 2 = 4 parameter combinations,
# i.e. 20 cross-validation fits when searched with 5 folds.
gbt_grid = dict(
    GBT__n_estimators=[25, 50],
    GBT__max_depth=[2, 5],
)

In [None]:
# Full GBT grid: 2 * 3 * 3 * 2 * 2 = 72 parameter combinations,
# i.e. 360 cross-validation fits when searched with 5 folds.
# Running this cell rebinds `gbt_grid`, replacing the smaller grid defined above.
gbt_grid = {
    'impute__scalar imputing mean__strategy': ['mean', 'median'],
    'GBT__n_estimators': [25, 50, 100],
    'GBT__max_depth': [2, 5, 9],
    'GBT__learning_rate': [0.1, 0.5],
    'GBT__loss': ['deviance', 'exponential'],
}

In [None]:
# Random-forest search space: 2 * 3 * 3 * 2 = 36 parameter combinations.
rf_grid = {
    'impute__scalar imputing mean__strategy': ['mean', 'median'],
    'RF__n_estimators': [25, 50, 100],
    'RF__max_depth': [2, 5, 9],
    'RF__min_samples_split': [2, 20],
}

In [None]:
# Logistic-regression search space: two imputation strategies, two penalties and
# ten regularisation strengths spaced logarithmically from 10**0 to 10**4.
lr_grid = {
    'impute__scalar imputing mean__strategy': ['mean', 'median'],
    'LR__penalty': ['l1', 'l2'],
    'LR__C': np.logspace(start=0, stop=4, num=10),
}

## Train the Models

In [42]:
# 5-fold grid search over gbt_grid. Four metrics are recorded for every candidate;
# the candidate with the best average precision is the one that gets refit.
# n_jobs=-1 requests all available cores.
gbt_grid_search = GridSearchCV(
    gbt_pipeline,
    gbt_grid,
    scoring=['accuracy', 'precision', 'average_precision', 'neg_log_loss'],
    refit='average_precision',
    cv=5,
    n_jobs=-1,
    return_train_score=False,
)
# Later cells use the value returned by fit() under the name gbt_model.
gbt_model = gbt_grid_search.fit(X_train, y_train)

In [None]:
# 5-fold grid search over rf_grid, tracking four metrics and refitting on the
# candidate with the best average precision.
rf_scoring = None  # placeholder removed below; see the explicit list in the call
del rf_scoring
rf_grid_search = GridSearchCV(
    rf_pipeline,
    rf_grid,
    cv=5,
    scoring=['accuracy', 'precision', 'average_precision', 'neg_log_loss'],
    refit='average_precision',
    return_train_score=False,
    n_jobs=-1,
)
rf_model = rf_grid_search.fit(X_train, y_train)

In [None]:
# 5-fold grid search over lr_grid; the same four metrics are recorded as for the
# other model families and average precision decides which candidate is refit.
lr_grid_search = GridSearchCV(
    lr_pipeline,
    lr_grid,
    n_jobs=-1,
    cv=5,
    return_train_score=False,
    refit='average_precision',
    scoring=['accuracy', 'precision', 'average_precision', 'neg_log_loss'],
)
lr_model = lr_grid_search.fit(X_train, y_train)

## Candidate Model Evaluation

In [43]:
# Average precision of the refit GBT search on the held-out 30% split
# (column 1 of predict_proba is the probability of the positive class).
average_precision_score(
    y_test,
    gbt_model.predict_proba(X_test)[:, 1],
)

0.39911498461656275

In [None]:
# Average precision of the refit RF search on the held-out 30% split.
average_precision_score(
    y_test,
    rf_model.predict_proba(X_test)[:, 1],
)

In [None]:
# Average precision of the refit LR search on the held-out 30% split.
average_precision_score(
    y_test,
    lr_model.predict_proba(X_test)[:, 1],
)

# Log metrics to MLFlow

In [None]:
# Pipeline refit with the best parameter combination (selected by refit='average_precision').
gbt_model.best_estimator_

In [64]:
# Mean cross-validated score of the best candidate on the refit metric (average precision).
gbt_model.best_score_

0.3768689526131899

In [63]:
# Parameter combination that produced best_score_.
gbt_model.best_params_

{'GBT__max_depth': 5, 'GBT__n_estimators': 50}

In [99]:
# MLflow destination: the tracking-server URL and the experiment the runs are filed under.
mlflow_tracking_uri = "http://lx8527:5000/"
experiment_name = "kenney_sandbox|credit_default"

In [108]:
# Log every grid-search candidate to MLflow: one outer run for the search and one
# nested run per parameter combination, each with its parameters and mean CV metrics.
mlflow.set_tracking_uri(mlflow_tracking_uri)
mlflow.set_experiment(experiment_name)

cv_results = gbt_model.cv_results_
logged_metrics = [
    'mean_test_accuracy',
    'mean_test_average_precision',
    'mean_test_neg_log_loss',
    'mean_test_precision',
]

with mlflow.start_run(nested=True) as r:
    for i, candidate_params in enumerate(cv_results['params']):
        with mlflow.start_run(nested=True) as inner_run:
            # Take parameter names from the fitted search itself, not from the global
            # `gbt_grid`: that name is rebound by a later cell, and any key missing
            # from the fitted candidates would raise KeyError here.
            for param, value in candidate_params.items():
                mlflow.log_param(param, value)
            # mean_fit_time is seconds per fold; multiply by the actual number of folds
            # used by the search instead of a hard-coded 5, then convert to minutes.
            mlflow.log_metric('total_fit_minutes', (cv_results['mean_fit_time'][i] * gbt_model.n_splits_) / 60)
            for metric in logged_metrics:
                mlflow.log_metric(metric, cv_results[metric][i])



In [None]:
# TODO: unfinished cell. It previously held only the dangling expression `mlflow.`,
# which is a SyntaxError and stops "Restart & Run All" at this point.

In [62]:
# Full cross-validation results, transposed so that each column is one parameter combination.
pd.DataFrame(gbt_model.cv_results_).T

Unnamed: 0,0,1,2,3
mean_fit_time,2.80063,4.73953,6.82559,10.0826
mean_score_time,0.120357,0.130629,0.136692,0.141059
mean_test_accuracy,0.935905,0.936733,0.936552,0.936457
mean_test_average_precision,0.343732,0.36158,0.37154,0.376869
mean_test_neg_log_loss,-0.194938,-0.191339,-0.18861,-0.18609
mean_test_precision,0.615802,0.609755,0.609355,0.579012
param_GBT__max_depth,2,2,5,5
param_GBT__n_estimators,25,50,25,50
params,"{'GBT__max_depth': 2, 'GBT__n_estimators': 25}","{'GBT__max_depth': 2, 'GBT__n_estimators': 50}","{'GBT__max_depth': 5, 'GBT__n_estimators': 25}","{'GBT__max_depth': 5, 'GBT__n_estimators': 50}"
rank_test_accuracy,4,1,2,3


In [None]:



# Held-out test metrics for the GBT grid search.
# The cell used to reference `grid_search`, which is never defined in this notebook
# (NameError on a fresh kernel); the fitted GBT search is `gbt_model`.
# Predictions are computed once and shared by all metrics.
test_proba = gbt_model.predict_proba(X_test)[:, 1]
test_pred = gbt_model.predict(X_test)

print('Test ({} samples) performance metrics'.format(len(y_test)))
print('average precision: {}'.format(average_precision_score(y_test, test_proba)))
print('log loss: {}'.format(log_loss(y_test, test_proba)))
print('accuracy: {}'.format(accuracy_score(y_test, test_pred)))
print('precision: {}'.format(precision_score(y_test, test_pred)))

# mean_fit_time is reported in seconds per fold, so scale by the number of folds and
# convert to hours; previously the value stored under this name was in seconds.
time_of_run_in_hours = (pd.DataFrame(gbt_model.cv_results_)['mean_fit_time'] * gbt_model.n_splits_).sum() / 60 / 60
gbt_model.best_estimator_

# Time of Run

In [None]:
# Mean fit time (seconds per fold) of every GBT candidate.
# `grid_search` is not defined anywhere in this notebook; the fitted GBT search is `gbt_model`.
pd.DataFrame(gbt_model.cv_results_)['mean_fit_time']

In [None]:
# Rough training-cost estimate for the GBT grid search.
# Fixes relative to the earlier version of this cell:
#   * `grid_search` was undefined; the fitted search is `gbt_model`;
#   * the fold count was hard-coded as 10 although the search uses cv=5, so it is now
#     read from the fitted search;
#   * the record count was hard-coded as 20000 although the search is fit on X_train.
# mean_fit_time is in seconds per fold, hence the / 60 / 60 to get hours.
time_of_run_in_hours = (pd.DataFrame(gbt_model.cv_results_)['mean_fit_time'] * gbt_model.n_splits_).sum() / 60 / 60
print('time of run in hours: {}'.format(time_of_run_in_hours))
hours_per_record = time_of_run_in_hours / len(X_train)
print('hours per record: {}'.format(hours_per_record))
records_in_an_hour = 1 / hours_per_record
print('number of records in 1 hour {}'.format(records_in_an_hour))

# Use Dimensionality Reduction to Reduce Training Time

In [15]:
# Same impute -> scale front end as the other pipelines, with a PCA projection
# inserted before the gradient-boosted trees; the number of components is chosen
# by the grid search.
pca_pipeline = Pipeline(
    steps=[
        (
            'impute',
            ColumnTransformer(
                transformers=[
                    ('scalar imputing mean', SimpleImputer(), X_train.columns),
                ],
                remainder='drop',
            ),
        ),
        (
            'scale',
            ColumnTransformer(
                transformers=[
                    ('scalar scaling', MinMaxScaler(feature_range=(0, 1)), np.arange(X_train.shape[1])),
                ],
                remainder='drop',
            ),
        ),
        ('PCA', PCA()),
        ('GBT', GradientBoostingClassifier()),
    ]
)

In [16]:
# List the parameter names exposed by the PCA pipeline; these are the keys a search grid can use.
pca_pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'impute', 'scale', 'PCA', 'GBT', 'impute__n_jobs', 'impute__remainder', 'impute__sparse_threshold', 'impute__transformer_weights', 'impute__transformers', 'impute__verbose', 'impute__scalar imputing mean', 'impute__scalar imputing mean__add_indicator', 'impute__scalar imputing mean__copy', 'impute__scalar imputing mean__fill_value', 'impute__scalar imputing mean__missing_values', 'impute__scalar imputing mean__strategy', 'impute__scalar imputing mean__verbose', 'scale__n_jobs', 'scale__remainder', 'scale__sparse_threshold', 'scale__transformer_weights', 'scale__transformers', 'scale__verbose', 'scale__scalar scaling', 'scale__scalar scaling__copy', 'scale__scalar scaling__feature_range', 'PCA__copy', 'PCA__iterated_power', 'PCA__n_components', 'PCA__random_state', 'PCA__svd_solver', 'PCA__tol', 'PCA__whiten', 'GBT__criterion', 'GBT__init', 'GBT__learning_rate', 'GBT__loss', 'GBT__max_depth', 'GBT__max_features', 'GBT__max_leaf_nodes', 'GBT__min_

In [17]:
# PCA + GBT search space: 3 * 3 * 3 * 2 = 54 parameter combinations.
pca_grid = dict(
    PCA__n_components=[2, 3, 5],
    GBT__n_estimators=[25, 50, 100],
    GBT__max_depth=[2, 5, 9],
    GBT__loss=['deviance', 'exponential'],
)

In [18]:
# 5-fold grid search over pca_grid with the same four metrics as the other searches;
# average precision decides which candidate is refit.
pca_grid_search = GridSearchCV(
    pca_pipeline,
    pca_grid,
    scoring=['accuracy', 'precision', 'average_precision', 'neg_log_loss'],
    refit='average_precision',
    cv=5,
    return_train_score=False,
    n_jobs=-1,
)
pca_model = pca_grid_search.fit(X_train, y_train)

In [21]:
# Average precision of the refit PCA + GBT search on the held-out 30% split.
average_precision_score(
    y_test,
    pca_model.predict_proba(X_test)[:, 1],
)

0.36358262394456076

# Analyze the Model Results

In [22]:
# Full cross-validation results of the PCA search, transposed so each column is one candidate.
pd.DataFrame(pca_model.cv_results_).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
mean_fit_time,1.91998,2.34541,3.2018,3.05865,3.82781,5.50863,5.55917,7.41248,11.3408,3.83147,...,30.0279,8.3814,13.0224,26.5845,16.6562,25.0572,46.893,33.4631,51.413,69.2342
mean_score_time,0.115892,0.125998,0.122672,0.134209,0.125429,0.128329,0.182808,0.185658,0.194964,0.148442,...,0.348052,0.201453,0.211037,0.234167,0.294702,0.322113,0.366753,0.525166,0.57697,0.456517
mean_test_accuracy,0.933143,0.933219,0.933943,0.933124,0.933343,0.934695,0.933095,0.933352,0.935143,0.933038,...,0.935524,0.933095,0.933162,0.934648,0.933057,0.933229,0.934829,0.933057,0.93319,0.934505
mean_test_average_precision,0.10121,0.124701,0.249261,0.10191,0.127642,0.27824,0.104114,0.129832,0.305048,0.101717,...,0.331888,0.100373,0.128519,0.321264,0.100555,0.128934,0.322573,0.0988681,0.125953,0.323218
mean_test_neg_log_loss,-0.237852,-0.235702,-0.21887,-0.237777,-0.235166,-0.21313,-0.237857,-0.23495,-0.207141,-0.238189,...,-0.197649,-0.239207,-0.235788,-0.199606,-0.239956,-0.236546,-0.19979,-0.241848,-0.238636,-0.201811
mean_test_precision,0,0.58553,0.60407,0.3,0.609425,0.635885,0.376923,0.580609,0.611525,0.317749,...,0.560719,0.419444,0.504184,0.587937,0.384444,0.537209,0.55331,0.384127,0.522008,0.539141
param_GBT__loss,deviance,deviance,deviance,deviance,deviance,deviance,deviance,deviance,deviance,deviance,...,exponential,exponential,exponential,exponential,exponential,exponential,exponential,exponential,exponential,exponential
param_GBT__max_depth,2,2,2,2,2,2,2,2,2,5,...,5,9,9,9,9,9,9,9,9,9
param_GBT__n_estimators,25,25,25,50,50,50,100,100,100,25,...,100,25,25,25,50,50,50,100,100,100
param_PCA__n_components,2,3,5,2,3,5,2,3,5,2,...,5,2,3,5,2,3,5,2,3,5


In [None]:
# Compact view of the GBT search: fit time, the two tree parameters and the mean CV metrics,
# transposed so each column is one candidate.
# `grid_search` is not defined anywhere in this notebook; the fitted GBT search is `gbt_model`.
gbt_summary_columns = ['mean_fit_time', 'param_GBT__max_depth', 'param_GBT__n_estimators',
                       'mean_test_accuracy', 'mean_test_precision', 'mean_test_average_precision',
                       'mean_test_neg_log_loss']
pd.DataFrame(gbt_model.cv_results_)[gbt_summary_columns].T  # .to_csv('grid_search_cv_results.csv')

In [None]:
#pd.DataFrame(grid_sesarch.cv_results_).columns

In [None]:
# Pipeline refit with the best GBT parameters.
# `grid_search` is not defined anywhere in this notebook; the fitted GBT search is `gbt_model`.
gbt_model.best_estimator_

# Feature Importance

In [None]:
# Pair each input column with the importance assigned by the best GBT model and
# rank the columns from most to least important.
# `grid_search` is not defined anywhere in this notebook; the fitted GBT search is `gbt_model`.
# The pipeline is fit on every column of X_train, so the importances line up with X_train.columns.
gbt_importances = gbt_model.best_estimator_.named_steps["GBT"].feature_importances_
feature_imp = pd.DataFrame({'column': X_train.columns,
                            'feature_importance': gbt_importances})
feature_imp = feature_imp.sort_values('feature_importance', ascending=False)
#feature_imp.to_csv('feature_importance_90k_lapse.csv')

In [None]:
# Show the top 15 rows; the frame was sorted by importance (descending) in the previous cell.
feature_imp.head(15)

# Save the Model Output

In [None]:
# Persist the fitted GBT search to disk.
# Two fixes: `grid_search` is never defined in this notebook (the fitted search is
# `gbt_model`), and `model_name` already ends in '.joblib', so appending the extension
# unconditionally produced 'credit_default.joblib.joblib'.
model_path = model_name if model_name.endswith('.joblib') else '{}.joblib'.format(model_name)
dump(gbt_model, model_path)

# Write the Predictions

Evaluate on the Test set

In [None]:
# Final evaluation of the GBT grid search on the held-out test split.
# `grid_search` is never defined in this notebook (NameError on a fresh kernel); the
# fitted search is `gbt_model`. The metric functions are already imported in the
# imports cell at the top, so the duplicate import was dropped. Predictions are
# computed once and reused by every metric.
test_proba = gbt_model.predict_proba(X_test)[:, 1]
test_pred = gbt_model.predict(X_test)

print('Test ({} samples) performance metrics'.format(len(y_test)))
print('average precision: {}'.format(average_precision_score(y_test, test_proba)))
print('log loss: {}'.format(log_loss(y_test, test_proba)))
print('accuracy: {}'.format(accuracy_score(y_test, test_pred)))
print('precision: {}'.format(precision_score(y_test, test_pred)))


Save the results

In [None]:
# One row per test record: its id, the predicted probability of the positive class,
# the predicted label and the true label.
# `grid_search` is never defined in this notebook; the fitted GBT search is `gbt_model`.
test_scores = pd.DataFrame({'id': X_test['id'],
                            'probability': gbt_model.predict_proba(X_test)[:, 1],
                            'prediction': gbt_model.predict(X_test),
                            'actual': y_test})

In [None]:
# Preview the scored test records.
test_scores.head()

In [None]:
#test_scores_with_data = pd.concat([test_scores, X_train.drop('id', axis=1)], axis=1)

In [None]:
#test_scores_with_data.head()

In [None]:
#test_scores_with_data.to_csv('{}_scores_with_data.csv'.format(model_name))

# Predictions Confusion Matrix

In [None]:
# Confusion matrix of true labels (rows) against predicted labels (columns).
cm = confusion_matrix(
    test_scores['actual'],
    test_scores['prediction'],
)
cm

In [None]:
# Express every confusion-matrix cell as a percentage of all scored test records.
total = test_scores.shape[0]
perc_vals = [[count / total * 100 for count in row] for row in cm]
perc_vals