# Importing libraries

In [2]:
#Importing libraries
import numpy as np
import pandas as pd
import scipy
from math import sqrt
import matplotlib.pyplot as plt
import random 
import sklearn 
import imblearn.over_sampling
from imblearn.over_sampling import SMOTE

ModuleNotFoundError: No module named 'imblearn'

In [25]:
#Suppressing warnings 
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Importing the dataset

In [49]:
credit = pd.read_csv('Preprocess2.csv', header=0)

In [4]:
credit.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29466 entries, 0 to 29465
Data columns (total 9 columns):
Unnamed: 0      29466 non-null int64
MeanBill        29466 non-null float64
MeanPaid        29466 non-null float64
CurBill_Sept    29466 non-null int64
Risky           29466 non-null object
CREDIT_RATIO    29466 non-null float64
AGE             29466 non-null int64
EDUCATION       29466 non-null object
DEFAULT         29466 non-null object
dtypes: float64(3), int64(3), object(3)
memory usage: 2.0+ MB


In [50]:
#convert all categories into numerical variables so they can be properly used to model with
credit.EDUCATION = pd.CategoricalIndex(credit.EDUCATION)
credit.Risky = pd.CategoricalIndex(credit.Risky)

credit['EDUCATION'] = credit.EDUCATION.cat.codes
credit['Risky'] = credit.Risky.cat.codes

In [51]:
#Selecting the independent variables
credit_indep = credit.iloc[:, 1:8]
credit_indep.head()

Unnamed: 0,MeanBill,MeanPaid,CurBill_Sept,Risky,CREDIT_RATIO,AGE,EDUCATION
0,1284.0,-0.333333,3913,1,0.1551,24,5
1,2846.166667,0.5,2682,1,0.014375,26,5
2,16942.166667,0.0,29239,0,0.155856,34,5
3,38555.666667,0.0,46990,0,0.96466,37,5
4,18223.166667,-0.333333,8617,0,0.1134,57,5


In [52]:
#Selecting the dependent variable
credit_dep = credit['DEFAULT']
credit_dep.head()

0    defaulted
1    defaulted
2         paid
3         paid
4         paid
Name: DEFAULT, dtype: object

# Importing models and model metrics 

In [82]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn import preprocessing

# Creating training set and standardizing

In [54]:
#Setting seed and creating training set
random.seed(123)
x_train, x_test, y_train, y_test = train_test_split(credit_indep, credit_dep, test_size = 0.5)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(14733, 7) (14733,)
(14733, 7) (14733,)


In [None]:
# balance the trainingdata
smote = SMOTE(random_state = 4)
X_sm, y_sm = smote.fit_sample(X_train, y_train)

print(X_sm.shape, y_sm.shape)

In [55]:
#Standardizing
names = credit_indep.columns
scaler = preprocessing.StandardScaler()
scaled_credit = scaler.fit_transform(credit_indep)
scaled_credit = pd.DataFrame(scaled_credit, columns=names)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


# Support Vector Machine

In [77]:
#Running SVM on training data
random.seed(123)
modelSVM = SVC()  
modelSVM.fit(x_train, y_train) 
pred_svm = cross_val_predict(modelSVM, x_train, y_train, cv = 5)
print('Accuracy SVM is:%.3f' % accuracy_score(y_train, pred_svm))
print('Kappa SVM is:%.3f' % cohen_kappa_score(y_train, pred_svm))

Accuracy SVM is:0.798
Kappa SVM is:0.012


# Random Forest

In [73]:
#Running Random Forest on training data
random.seed(123)
modelRF = RandomForestClassifier()
modelRF.fit(x_train, y_train) 
pred_rf = cross_val_predict(modelRF, x_train, y_train, cv = 5)
print('Accuracy RF is:%.3f' % accuracy_score(y_train, pred_rf))
print('Kappa RF is:%.3f' % cohen_kappa_score(y_train, pred_rf))

Accuracy RF is:0.796
Kappa RF is:0.308


# KNN

In [74]:
#Running KNN on training data
random.seed(123)
modelKNN = KNeighborsClassifier()
modelKNN.fit(x_train, y_train) 
pred_knn = cross_val_predict(modelKNN, x_train, y_train, cv = 5)
print('Accuracy KNN is:%.3f' % accuracy_score(y_train, pred_knn))
print('Kappa KNN is:%.3f' % cohen_kappa_score(y_train, pred_knn))

Accuracy KNN is:0.764
Kappa KNN is:0.066


# Making predictions

As Random Forest was the model that had the highest score, we will focus on making predictions with it

In [83]:
predictionsRF = modelRF.predict(x_test)
print('Accuracy is: %.3f' % accuracy_score(y_test, predictionsRF))
print('Kappa is: %.3f' % cohen_kappa_score(y_test, predictionsRF))
print(confusion_matrix(y_test, predictionsRF))
print(classification_report(y_test, predictionsRF))

Accuracy is: 0.792
Kappa is: 0.316
[[ 1214  1899]
 [ 1172 10448]]
              precision    recall  f1-score   support

   defaulted       0.51      0.39      0.44      3113
        paid       0.85      0.90      0.87     11620

   micro avg       0.79      0.79      0.79     14733
   macro avg       0.68      0.64      0.66     14733
weighted avg       0.77      0.79      0.78     14733



In [84]:
predictionsSVM = modelSVM.predict(x_test)
print('Accuracy: %.3f' % accuracy_score(y_test, predictionsSVM))
print('Kappa: %.3f' % cohen_kappa_score(y_test, predictionsSVM))
print(confusion_matrix(y_test, predictionsSVM))
print(classification_report(y_test, predictionsSVM))

Accuracy: 0.789
Kappa: 0.010
[[   25  3088]
 [   18 11602]]
              precision    recall  f1-score   support

   defaulted       0.58      0.01      0.02      3113
        paid       0.79      1.00      0.88     11620

   micro avg       0.79      0.79      0.79     14733
   macro avg       0.69      0.50      0.45     14733
weighted avg       0.75      0.79      0.70     14733



In [87]:
predictionsKNN = modelKNN.predict(x_test)
print('Accuracy: %.3f' % accuracy_score(y_test, predictionsKNN))
print('Kappa: %.3f' % cohen_kappa_score(y_test, predictionsKNN))
print(confusion_matrix(y_test, predictionsKNN))
print(classification_report(y_test, predictionsKNN))

Accuracy: 0.761
Kappa: 0.071
[[  383  2730]
 [  790 10830]]
              precision    recall  f1-score   support

   defaulted       0.33      0.12      0.18      3113
        paid       0.80      0.93      0.86     11620

   micro avg       0.76      0.76      0.76     14733
   macro avg       0.56      0.53      0.52     14733
weighted avg       0.70      0.76      0.72     14733

