In [141]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import normalize
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

from scipy.stats import randint

In [142]:
df = pd.read_excel('default.xls', header = 1) 

In [143]:
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [144]:
df.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month'],
      dtype='object')

From the above, we notice that most of our variables are numeric, and the rest are categorical. We'll need to normalize the numeric variables and change the categorical variables into dummy variables.

In [145]:
df2 = pd.get_dummies(df, columns = ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6'])

In [146]:
target = df2['default payment next month']

df2 = df2.drop(['default payment next month'], axis = 1)

In [147]:
numeric_vars = df2[['AGE', 'LIMIT_BAL', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']]

numeric_normalized = normalize(numeric_vars)

In [148]:
# all non-numeric columns
cat_vars = df2.drop(['AGE', 'LIMIT_BAL', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'], axis = 1)

df_normalized = pd.DataFrame(numeric_normalized)
df_normalized['ID'] = cat_vars['ID']

In [149]:
# Merge the data back together with 'ID'
df_merged = pd.merge(cat_vars, df_normalized, on = 'ID')

In [150]:
keys = list(range(15))
names = numeric_vars.columns
columns_dict = dict(zip(keys, names))

df_merged = df_merged.rename(columns = columns_dict)

In [151]:
df_merged.head()

Unnamed: 0,ID,SEX_1,SEX_2,EDUCATION_0,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,EDUCATION_5,EDUCATION_6,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,1,0,1,0,0,1,0,0,0,0,...,0.033387,0.0,0.0,0.0,0.0,0.033387,0.0,0.0,0.0,0.0
1,2,0,1,0,0,1,0,0,0,0,...,0.022305,0.027212,0.028734,0.027121,0.0,0.008317,0.008317,0.008317,0.0,0.016633
2,3,0,1,0,0,1,0,0,0,0,...,0.135329,0.143034,0.149192,0.15519,0.015151,0.014971,0.009981,0.009981,0.009981,0.049904
3,4,0,1,0,0,1,0,0,0,0,...,0.450145,0.258575,0.264465,0.269835,0.018265,0.018438,0.010959,0.010046,0.009763,0.009132
4,5,1,0,0,0,1,0,0,0,0,...,0.441331,0.25789,0.235795,0.235611,0.024631,0.45175,0.123157,0.110841,0.008485,0.008362


Now we have our full DataFrame appropriately transformed: all categorical variables have appropriate dummy variables, and all numerical variables have been normalized to have values between 0 and 1. 

In [152]:
df_merged.columns

Index(['ID', 'SEX_1', 'SEX_2', 'EDUCATION_0', 'EDUCATION_1', 'EDUCATION_2',
       'EDUCATION_3', 'EDUCATION_4', 'EDUCATION_5', 'EDUCATION_6',
       'MARRIAGE_0', 'MARRIAGE_1', 'MARRIAGE_2', 'MARRIAGE_3', 'PAY_0_-2',
       'PAY_0_-1', 'PAY_0_0', 'PAY_0_1', 'PAY_0_2', 'PAY_0_3', 'PAY_0_4',
       'PAY_0_5', 'PAY_0_6', 'PAY_0_7', 'PAY_0_8', 'PAY_2_-2', 'PAY_2_-1',
       'PAY_2_0', 'PAY_2_1', 'PAY_2_2', 'PAY_2_3', 'PAY_2_4', 'PAY_2_5',
       'PAY_2_6', 'PAY_2_7', 'PAY_2_8', 'PAY_3_-2', 'PAY_3_-1', 'PAY_3_0',
       'PAY_3_1', 'PAY_3_2', 'PAY_3_3', 'PAY_3_4', 'PAY_3_5', 'PAY_3_6',
       'PAY_3_7', 'PAY_3_8', 'PAY_4_-2', 'PAY_4_-1', 'PAY_4_0', 'PAY_4_1',
       'PAY_4_2', 'PAY_4_3', 'PAY_4_4', 'PAY_4_5', 'PAY_4_6', 'PAY_4_7',
       'PAY_4_8', 'PAY_5_-2', 'PAY_5_-1', 'PAY_5_0', 'PAY_5_2', 'PAY_5_3',
       'PAY_5_4', 'PAY_5_5', 'PAY_5_6', 'PAY_5_7', 'PAY_5_8', 'PAY_6_-2',
       'PAY_6_-1', 'PAY_6_0', 'PAY_6_2', 'PAY_6_3', 'PAY_6_4', 'PAY_6_5',
       'PAY_6_6', 'PAY_6_7', 'PAY_6_8', '

In [153]:
X_train, X_test, Y_train, Y_test = train_test_split(df_merged, target, test_size = 0.2, random_state = 10)

# Logistic Regression

In [154]:
logistic = LogisticRegression(solver = 'liblinear')

In [155]:
logistic.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [156]:
predictions = dict()

predictions['Logistic'] = logistic.predict(X_test)
accuracy_score(Y_test, predictions['Logistic'])

0.799

In [157]:
# Hyperparameter tuning

# Set up a "grid" of values we'd like to test to find out which results in the best performance
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space}

# Perform a grid search for the logistic regression classifier, then re-fit the data
logistic_cv = GridSearchCV(logistic, param_grid, cv = 5)
logistic_cv.fit(X_train, Y_train)

# Print best parameters and score
print("Tuned Logistic Regression Parameters: {}".format(logistic_cv.best_params_)) 
print("Best score is {}".format(logistic_cv.best_score_))

Tuned Logistic Regression Parameters: {'C': 0.4393970560760795}
Best score is 0.7995833333333333


In [158]:
predictions['LogisticCV'] = logistic_cv.predict(X_test)
accuracy_score(Y_test, predictions['LogisticCV'])

0.799

In [159]:
print(classification_report(Y_test, predictions['LogisticCV']), sep='\n')

              precision    recall  f1-score   support

           0       0.80      0.98      0.88      4683
           1       0.69      0.15      0.25      1317

   micro avg       0.80      0.80      0.80      6000
   macro avg       0.75      0.57      0.57      6000
weighted avg       0.78      0.80      0.75      6000



# Decision Tree Classifier

In [160]:
tree = DecisionTreeClassifier()
tree.fit(X_train, Y_train)

param_dist = {"max_depth": [20, 10, 5, 3, None],
              "max_features": randint(1, 50),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}

tree_cv = RandomizedSearchCV(tree, param_dist, cv = 5)
tree_cv.fit(X_train, Y_train)
predictions['TreeCV'] = tree_cv.predict(X_test)

# Score predictions
accuracy_score(Y_test, predictions['TreeCV'])

0.8211666666666667

In [161]:
print("Tuned Tree Parameters: {}".format(tree_cv.best_params_)) 
print("Best score is {}".format(tree_cv.best_score_))

Tuned Tree Parameters: {'criterion': 'gini', 'max_depth': 3, 'max_features': 49, 'min_samples_leaf': 8}
Best score is 0.8170416666666667


In [162]:
print(classification_report(Y_test, predictions['TreeCV']), sep='\n')

              precision    recall  f1-score   support

           0       0.83      0.96      0.89      4683
           1       0.70      0.32      0.44      1317

   micro avg       0.82      0.82      0.82      6000
   macro avg       0.77      0.64      0.67      6000
weighted avg       0.81      0.82      0.79      6000



# Naive Bayes

In [163]:
nb = GaussianNB()
nb.fit(X_train, Y_train)

# Score predictions
predictions['NB'] = nb.predict(X_test)
accuracy_score(Y_test, predictions['NB'])

0.8115

In [164]:
print(classification_report(Y_test, predictions['NB']), sep='\n')

              precision    recall  f1-score   support

           0       0.85      0.92      0.88      4683
           1       0.60      0.41      0.49      1317

   micro avg       0.81      0.81      0.81      6000
   macro avg       0.73      0.67      0.69      6000
weighted avg       0.79      0.81      0.80      6000



It seems that none of these models are getting especially strong scores on predicting when someone will default. For defaults, the most important stat is recall, because it measures the ratio of true positives to (true positives + false negatives). A low ratio would mean that many instances of default were missed, and were predicted not to be at risk.

An ideal classifier would catch defaults almost always (a recall close to 1 for the Default class), even if it produced many false positives.

The best score was given by the Naive Bayes, which had a recall of 41% for default. The worst performer was the Logistic Regression, which had a recall of 15% for default.