# Base Model for Client Default Payment Prediction

### Importing libraries

In [2]:
import numpy as np
import pandas as pd
import scipy
from math import sqrt
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import train_test_split

### Loading data into memory

In [3]:
creditDataPath = "../dataset/default_of_credit_card_clients.csv"
creditData = pd.read_csv(creditDataPath, header = 1, index_col = 0)

### Selecting the dependent and independent variables

In [4]:
creditFeatures = creditData.drop('default payment next month', axis = 1)
creditTarget = creditData.loc[:, 'default payment next month']

### Data sample

In [5]:
#Sampling 40% of the data using the 'train_test_split' method to keep the proportion in the dependent variable.
_, sampleFeatures, _, sampleTarget = train_test_split(creditFeatures, creditTarget, 
                                                       stratify = creditTarget, test_size = 0.4, random_state = 0)

### Split into training and test set

In [6]:
x_train, x_test, y_train, y_test = train_test_split(sampleFeatures, sampleTarget, 
                                                    stratify = sampleTarget, test_size = 0.3, random_state = 0)

### Model training

In [7]:
# Random Forest
rf = RandomForestClassifier( random_state = 0)
rf = rf.fit(x_train, y_train)

# SVM
svm = SVC()
svm = svm.fit(x_train, y_train)

# K-NN
knn = KNeighborsClassifier()
knn = knn.fit(x_train, y_train)




### Cross-Validation Scores

In [8]:
rfCVScores = cross_val_score(rf, x_train, y_train, cv = 10)
svmCVScores = cross_val_score(svm, x_train, y_train, cv = 10)
knnCVScores = cross_val_score(knn, x_train, y_train, cv = 10)



In [9]:
# Random Forest
rfCVScoresMean = rfCVScores.mean()
rfCVScoresStd = rfCVScores.std()
print("Random Forest mean Cross-Validation Score: %.6f" % rfCVScoresMean)
print("Random Forest Standard Deviation Cross-Validation Score: %.6f \n" % rfCVScoresStd)

# SVM
svmCVScoresMean = svmCVScores.mean()
svmCVScoresStd = svmCVScores.std()
print("SVM mean Cross-Validation Score: %.6f" % svmCVScoresMean)
print("SVM Standard Deviation Cross-Validation Score: %.6f \n" % svmCVScoresStd)

# K-NN
knnCVScoresMean = knnCVScores.mean()
knnCVScoresStd = knnCVScores.std()
print("K-NN mean Cross-Validation Score: %.6f" % knnCVScoresMean)
print("K-NN Standard Deviation Cross-Validation Score: %.6f" % knnCVScoresStd)

Random Forest mean Cross-Validation Score: 0.809050
Random Forest Standart Deviation Cross-Validation Score: 0.010441 

SVM mean Cross-Validation Score: 0.779762
SVM Standart Deviation Cross-Validation Score: 0.002207 

K-NN mean Cross-Validation Score: 0.748333
K-NN Standart Deviation Cross-Validation Score: 0.007476


### Predictions and metrics

In [10]:
# Random Forest
rfPredictions = rf.predict(x_test)

accuracy_rf = accuracy_score(y_test, rfPredictions)
kappa_rf = cohen_kappa_score(y_test, rfPredictions)

print("Random Forest Accuracy Score: %.6f" % accuracy_rf)
print("Random Forest Kappa Score: %.6f \n" % kappa_rf)

# SVM
svmPredictions = svm.predict(x_test)

accuracy_svm = accuracy_score(y_test, svmPredictions)
kappa_svm = cohen_kappa_score(y_test, svmPredictions)

print("SVM Accuracy Score: %.6f" % accuracy_svm)
print("SVM Kappa Score: %.6f \n" % kappa_svm)

# K-NN
knnPredictions = knn.predict(x_test)

accuracy_knn = accuracy_score(y_test, knnPredictions)
kappa_knn = cohen_kappa_score(y_test, knnPredictions)

print("K-NN Accuracy Score: %.6f" % accuracy_knn)
print("K-NN Kappa Score: %.6f" % kappa_knn)

Random Forest Accuracy Score: 0.801389
Random Forest Kappa Score: 0.288675 

SVM Accuracy Score: 0.776389
SVM Kappa Score: 0.003359 

K-NN Accuracy Score: 0.747778
K-NN Kappa Score: 0.102148


### Variables Importance

In [19]:
var_imp = pd.DataFrame({'Variable' : creditData.drop('default payment next month', axis = 1).columns, 
                        'Score' : rf.feature_importances_})

var_imp.sort_values('Score', ascending = False)

Unnamed: 0,Variable,Score
5,PAY_0,0.122411
4,AGE,0.066388
0,LIMIT_BAL,0.058434
11,BILL_AMT1,0.054421
12,BILL_AMT2,0.053625
14,BILL_AMT4,0.051683
17,PAY_AMT1,0.051183
15,BILL_AMT5,0.050359
19,PAY_AMT3,0.047939
13,BILL_AMT3,0.047512


## Feature Selection

In [22]:
# Removing variables with high colinearity
x_train_fs = x_train.drop(['BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6'], axis = 1)

# Removing variables least important
x_train_fs = x_train_fs.drop(['SEX', 'MARRIAGE', 'PAY_6', 'PAY_5', 'PAY_4', 'PAY_3'], axis = 1)