In [40]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

df = pd.read_csv('data/CreditcardDefaults.csv')
df.rename(columns={'default payment next month':'default'}, inplace=True)

#set index to the "ID" value and remove the ID column
df.index = df.ID
del df['ID']

#Create Lists for Analysis
continuous_features = ['LIMIT_BAL', 'BILL_AMT1', 'BILL_AMT2','BILL_AMT3',
                       'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
                       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5',
                       'PAY_AMT6']
ordinal_features = ['EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0','PAY_2', 'PAY_3',
                    'PAY_4', 'PAY_5', 'PAY_6','default']

#Convert datatypes
df[continuous_features] = df[continuous_features].astype(np.float64)
df[ordinal_features] = df[ordinal_features].astype(np.int64)

#convert any non-identified education categories to 'OTHER'
df['EDUCATION'] = df['EDUCATION'].replace(to_replace=(0,5,6),value=4)

#convert any non-identified marriage categories to 'OTHER'
df['MARRIAGE'] = df['MARRIAGE'].replace(to_replace=(0),value=3)

#Log transform continuous variables; as they each have a mostly 
##exponential distribution
df["log_LIMIT_BAL"]=np.log(df.LIMIT_BAL)
df["log_PAY_AMT1"]=np.log(df.PAY_AMT1+1)

# #bin the ages based on various age groups 
bins = [18, 25, 35, 45, 55, 65, 100]
labels = [0,1,2,3,4,5]
df['AGEGROUP'] = pd.cut(df['AGE'], bins=bins, labels=labels)


# One-hot encoding of "EDUCATION" and "MARRIAGE".
tmp_df_1 = pd.get_dummies(df.EDUCATION,prefix='EDUCATION')
tmp_df_2 = pd.get_dummies(df.MARRIAGE,prefix='MARRIAGE')
tmp_df_3 = pd.get_dummies(df.AGEGROUP,prefix='AGEGROUP')
df = pd.concat((df,tmp_df_1,tmp_df_2,tmp_df_3),axis=1)


# flag all the payment histor to late vs not late
payments = ['PAY_0','PAY_2','PAY_3','PAY_4','PAY_5','PAY_6']
bins = [-10, 2, 10]
labels = [0,1]
for fi,feature in enumerate(payments):
    df[feature] = pd.cut(df[feature], bins=bins, labels=labels).astype(np.int)
#count how many total late payments have been made
df['TotalLatePayments'] = df[payments].sum(axis=1)

# Creating an Attribute for % of billed Amounts Paid.  Cards not used have a rate of 1000
# Charts showing relationship of this variable to Default is in the Appendix.
df['TotalBilled'] = df.BILL_AMT1+df.BILL_AMT2+df.BILL_AMT3+df.BILL_AMT4+df.BILL_AMT5+df.BILL_AMT5
df['TotalPaid'] = df.PAY_AMT1+df.PAY_AMT2+df.PAY_AMT3+df.PAY_AMT4+df.PAY_AMT5+df.BILL_AMT5

df['PayRateCalc']  =  df['TotalPaid']/df['TotalBilled']
df['PayRateLimit'] = 0
df['PayRate'] = df['PayRateCalc'].where(df['PayRateCalc'] < 1.25, 1.25)
df['PayRate'] = df['PayRate'].where(df['TotalBilled'] > 0, 1000) # Approximately isolates Cards not used.
df['PayRate'] = df['PayRate'].where(df['PayRate'] > 0, 0)

df['PayrateGroup'] = df['PayRate']*100//5*5

#Create a separate dataset in case we need to come back to original
dfsub = df.copy()
#dfsub = pd.concat((df,tmp_df_1,tmp_df_2),axis=1)

#We will not need these attributes. We are using log of them instead.
deleteVar = ['LIMIT_BAL','PAY_AMT1','PAY_AMT2','PAY_AMT3','PAY_AMT4','PAY_AMT5','PAY_AMT6',
            'BILL_AMT1','BILL_AMT2','BILL_AMT3','BILL_AMT4','BILL_AMT5','BILL_AMT6',
            'EDUCATION','MARRIAGE','AGEGROUP','AGE','TotalBilled','TotalPaid',
            'PayRateCalc','PayRateLimit','PayRate','PAY_2','PAY_3','PAY_4','PAY_5','PAY_6','Unnamed: 25','Unnamed: 26']

for fi,feature in enumerate(deleteVar):
    del dfsub[feature]

dfsub.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30000 entries, 1 to 30000
Data columns (total 20 columns):
SEX                  30000 non-null int64
PAY_0                30000 non-null int32
default              30000 non-null int64
log_LIMIT_BAL        30000 non-null float64
log_PAY_AMT1         30000 non-null float64
EDUCATION_1          30000 non-null uint8
EDUCATION_2          30000 non-null uint8
EDUCATION_3          30000 non-null uint8
EDUCATION_4          30000 non-null uint8
MARRIAGE_1           30000 non-null uint8
MARRIAGE_2           30000 non-null uint8
MARRIAGE_3           30000 non-null uint8
AGEGROUP_0           30000 non-null uint8
AGEGROUP_1           30000 non-null uint8
AGEGROUP_2           30000 non-null uint8
AGEGROUP_3           30000 non-null uint8
AGEGROUP_4           30000 non-null uint8
AGEGROUP_5           30000 non-null uint8
TotalLatePayments    30000 non-null int64
PayrateGroup         30000 non-null float64
dtypes: float64(3), int32(1), int64(3), uint8

#### KNN to predict Default

In [21]:
# Target variable is unbalanced

target_count = dfsub.default.value_counts()
print('Class 0:', target_count[0])
print('Class 1:', target_count[1])
print('Proportion:', round(target_count[0] / target_count[1], 2), ': 1')

Class 0: 23364
Class 1: 6636
Proportion: 3.52 : 1


In [22]:
# Create new dataset by oversampling Defaults
#--------------------------------------------

# Class count
df_class_0, df_class_1 = dfsub.default.value_counts()

# Divide by class
df_class_0 = dfsub[dfsub['default'] == 0]
df_class_1 = dfsub[dfsub['default'] == 1]

df_class_1_over = df_class_1.sample(frac=target_count[0]/target_count[1], replace=True)
df_OverSampled = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(df_OverSampled.default.value_counts())

Random over-sampling:
1    23364
0    23364
Name: default, dtype: int64


In [23]:
# Load Oversampled Set into CV Object, Scale X Variables

from sklearn.model_selection import ShuffleSplit

#Isolate the "default" variable into y and keep everythign else in X to use for predictions:
if 'default' in df_OverSampled:
    y = df_OverSampled['default'].values
    del df_OverSampled['default'] 
    X = df_OverSampled.values

#Create a reuseable cv_object:  random_state keeps the seed.
num_cv_iterations = 10
num_instances = len(y)
cv_object = ShuffleSplit(n_splits=num_cv_iterations,test_size  = 0.2, random_state=0)

#train_indices
for train_indices, test_indices in cv_object.split(X,y): 
    X_train = X[train_indices]
    y_train = y[train_indices] 
    X_test = X[test_indices]
    y_test = y[test_indices]

# Just commenting out the size-check to cut down on the number of cells.    
#print("Dimensions of training features are " , X_train.shape)
#print("Dimensions of training target are " , y_train.shape)
#print("Dimensions of testing features are " , X_test.shape)
#print("Dimensions of testing target are " , y_test.shape)

from sklearn.preprocessing import StandardScaler
# scale attributes by the training set
scl_obj = StandardScaler()
scl_obj.fit(X_train)

X_train_scaled = scl_obj.transform(X_train) # apply to training
X_test_scaled = scl_obj.transform(X_test) # apply those means and std to the test set (without snooping at the test set values)



In [24]:
%%time
from sklearn.neighbors import KNeighborsClassifier
svcEstimator = KNeighborsClassifier()
cv = 5

#compare various values of C, kernels (rbf vs linear vs poly),decision_function_shape (ovo vs ovr) 
parameters = {'n_neighbors': [3,5,7,11]}

#Create a grid search object using the  
from sklearn.model_selection import GridSearchCV
svcGridSearch = GridSearchCV(estimator=svcEstimator
                    , n_jobs=8 # jobs to run in parallel
                    , verbose=1 # low verbosity
                    , param_grid=parameters
                    , cv=cv # KFolds = 5
                    , scoring='accuracy')

svcGridSearch.fit(X_train_scaled, y_train)


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=8)]: Done  20 out of  20 | elapsed:  8.3min finished


Wall time: 8min 25s


In [25]:
#Display the best estimator parameters
svcGridSearch.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [29]:
%%time
from sklearn import metrics as mt
clf = svcGridSearch.best_estimator_

clf.fit(X_train_scaled,y_train)  # train object

y_hat = clf.predict(X_test_scaled) # get test set precitions

acc = mt.accuracy_score(y_test,y_hat)
conf = mt.confusion_matrix(y_test,y_hat)

print('accuracy:', acc )
print(conf )
print('f1_score:', mt.f1_score(y_test, y_hat))
print('Precision:', mt.precision_score(y_test, y_hat))
print('Recall:', mt.recall_score(y_test, y_hat))  

accuracy: 0.762572223411085
[[3019 1587]
 [ 632 4108]]
f1_score: 0.7873502635361763
Precision: 0.7213345039508341
Recall: 0.8666666666666667
Wall time: 18.5 s


#### KNN To Predict Education

In [41]:
# Class count
target_count = dfsub.EDUCATION_2.value_counts()
print('Class 0:', target_count[0])
print('Class 1:', target_count[1])
print('Proportion:', round(target_count[0] / target_count[1], 2), ': 1')

#Making judgement call that 14% imbalance is OK.  Will proceed without oversampling.

Class 0: 15970
Class 1: 14030
Proportion: 1.14 : 1


In [42]:
# Create Target.  We will focus on University vs all Others.
# Since Education is one-hot encoded, deleting the other Education One-Hots, leaving target only.
dfEduc = dfsub

del dfEduc['EDUCATION_1']
del dfEduc['EDUCATION_3']
del dfEduc['EDUCATION_4']


In [43]:
#Isolate the "default" variable into y and keep everythign else in X to use for predictions:
if 'EDUCATION_2' in dfEduc:
    y = dfEduc['EDUCATION_2'].values
    del dfEduc['EDUCATION_2'] 
    X = dfEduc.values

#Create a reuseable cv_object:  random_state keeps the seed.
num_cv_iterations = 10
num_instances = len(y)
cv_object = ShuffleSplit(n_splits=num_cv_iterations,test_size  = 0.2, random_state=0)

#train_indices
for train_indices, test_indices in cv_object.split(X,y): 
    X_train = X[train_indices]
    y_train = y[train_indices] 
    X_test = X[test_indices]
    y_test = y[test_indices]

# Just commenting out the size-check to cut down on the number of cells.    
#print("Dimensions of training features are " , X_train.shape)
#print("Dimensions of training target are " , y_train.shape)
#print("Dimensions of testing features are " , X_test.shape)
#print("Dimensions of testing target are " , y_test.shape)

from sklearn.preprocessing import StandardScaler
# scale attributes by the training set
scl_obj = StandardScaler()
scl_obj.fit(X_train)

X_train_scaled = scl_obj.transform(X_train) # apply to training
X_test_scaled = scl_obj.transform(X_test) # apply those means and std to the test set (without snooping at the test set values)


In [44]:
%%time
from sklearn.neighbors import KNeighborsClassifier
svcEstimator = KNeighborsClassifier()
cv = 5

#compare various values of C, kernels (rbf vs linear vs poly),decision_function_shape (ovo vs ovr) 
parameters = {'n_neighbors': [3,5,7,11]}

#Create a grid search object using the  
from sklearn.model_selection import GridSearchCV
svcGridSearch = GridSearchCV(estimator=svcEstimator
                    , n_jobs=8 # jobs to run in parallel
                    , verbose=1 # low verbosity
                    , param_grid=parameters
                    , cv=cv # KFolds = 5
                    , scoring='accuracy')

svcGridSearch.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=8)]: Done  20 out of  20 | elapsed:  3.1min finished


Wall time: 3min 8s


In [45]:
#Display the best estimator parameters
svcGridSearch.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=11, p=2,
           weights='uniform')

In [46]:
%%time
from sklearn import metrics as mt
clf = svcGridSearch.best_estimator_

clf.fit(X_train_scaled,y_train)  # train object

y_hat = clf.predict(X_test_scaled) # get test set precitions

acc = mt.accuracy_score(y_test,y_hat)
conf = mt.confusion_matrix(y_test,y_hat)

print('accuracy:', acc )
print(conf )
print('f1_score:', mt.f1_score(y_test, y_hat))
print('Precision:', mt.precision_score(y_test, y_hat))
print('Recall:', mt.recall_score(y_test, y_hat))  

accuracy: 0.5666666666666667
[[1978 1208]
 [1392 1422]]
f1_score: 0.5224099926524615
Precision: 0.5406844106463878
Recall: 0.5053304904051172
Wall time: 6.36 s


# Appendix

## Data Meaning Type

#### Attribute Information
The data used is "Default of Credit Card Clients" from UCI. It was attained by I-Cheng Yeh with Chung Hua University and Tamkang University in Taiwan. The original goal was to predict default rates.

The data has a 6 month history of 30,000 Taiwanese credit account balances and transactions. Each observation contains a binary reponse variable "default" with values 1 indicating a default occured and 0 indicating no default occured.

The following explanatory variables are included:

 - LIMIT_BAL = Total credit amount allowed
 
 - SEX
     -  1 = Male
     -  2 = Female
 
 - EDUCATION
     - 1 = Graduate School
     - 2 = University
     - 3 = High School
     - 4 = Other
   
 - MARRIAGE
     - 1 = Married
     - 2 = Single
     - 3 = Other
       
 - AGE = Credit holder age in years
 
Payment history (2005)
 - PAY_0 = September
 - PAY_2 = August
 - PAY_3 = July
 - PAY_4 = June
 - PAY_5 = May
 - PAY_6 = April
      -  -1 = payment received on time
      -   1 = payment received one month late
      -   2 = payment received two months late
      -   "......"
      -   9 = payment received nine months late or more
         
Statement amount (NT dollars, 2005)
 - BILL_AMT1 = September
 - BILL_AMT2 = August
 - BILL_AMT3 = July
 - BILL_AMT4 = June
 - BILL_AMT5 = May
 - BILL_AMT6 = April
 
Payment amount (NT dollars, 2005).
 - PAY_AMT1 = September
 - PAY_AMT2 = August
 - PAY_AMT3 = July
 - PAY_AMT4 = June
 - PAY_AMT5 = May
 - PAY_AMT6 = April

Original Source Data Set Information  
https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients#