In [20]:
#import all packages for this notebook
import pandas as pd
import numpy as np

df = pd.read_csv('DefaultCreditcardClients.csv')
df.rename(columns={'default payment next month':'default'}, inplace=True)

#set index to the "ID" value and remove the ID column
df.index = df.ID
del df['ID']

#Create Lists for Analysis
continuous_features = ['LIMIT_BAL', 'BILL_AMT1', 'BILL_AMT2','BILL_AMT3',
                       'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
                       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5',
                       'PAY_AMT6']
ordinal_features = ['EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0','PAY_2', 'PAY_3',
                    'PAY_4', 'PAY_5', 'PAY_6','default']

#Convert datatypes
df[continuous_features] = df[continuous_features].astype(np.float64)
df[ordinal_features] = df[ordinal_features].astype(np.int64)

#convert any non-identified education categories to 'OTHER'
df['EDUCATION'] = df['EDUCATION'].replace(to_replace=(0,5,6),value=4)

#convert any non-identified marriage categories to 'OTHER'
df['MARRIAGE'] = df['MARRIAGE'].replace(to_replace=(0),value=3)

#Log transform continuous variables; as they each have a mostly 
##exponential distribution
df["log_LIMIT_BAL"]=np.log(df.LIMIT_BAL)
df["log_PAY_AMT1"]=np.log(df.PAY_AMT1+1)

# #bin the ages based on various age groups 
bins = [18, 25, 35, 45, 55, 65, 100]
labels = [0,1,2,3,4,5]
df['AGEGROUP'] = pd.cut(df['AGE'], bins=bins, labels=labels)


# One-hot encoding of "EDUCATION" and "MARRIAGE".
tmp_df_1 = pd.get_dummies(df.EDUCATION,prefix='EDUCATION')
tmp_df_2 = pd.get_dummies(df.MARRIAGE,prefix='MARRIAGE')
tmp_df_3 = pd.get_dummies(df.AGEGROUP,prefix='AGEGROUP')
df = pd.concat((df,tmp_df_1,tmp_df_2,tmp_df_3),axis=1)


# flag all the payment histor to late vs not late
payments = ['PAY_0','PAY_2','PAY_3','PAY_4','PAY_5','PAY_6']
bins = [-10, 2, 10]
labels = [0,1]
for fi,feature in enumerate(payments):
    df[feature] = pd.cut(df[feature], bins=bins, labels=labels).astype(np.int)
#count how many total late payments have been made
df['TotalLatePayments'] = df[payments].sum(axis=1)

# Creating an Attribute for % of billed Amounts Paid.  Cards not used have a rate of 1000
# Charts showing relationship of this variable to Default is in the Appendix.
df['TotalBilled'] = df.BILL_AMT1+df.BILL_AMT2+df.BILL_AMT3+df.BILL_AMT4+df.BILL_AMT5+df.BILL_AMT5
df['TotalPaid'] = df.PAY_AMT1+df.PAY_AMT2+df.PAY_AMT3+df.PAY_AMT4+df.PAY_AMT5+df.BILL_AMT5

df['PayRateCalc']  =  df['TotalPaid']/df['TotalBilled']
df['PayRateLimit'] = 0
df['PayRate'] = df['PayRateCalc'].where(df['PayRateCalc'] < 1.25, 1.25)
df['PayRate'] = df['PayRate'].where(df['TotalBilled'] > 0, 1000) # Approximately isolates Cards not used.
df['PayRate'] = df['PayRate'].where(df['PayRate'] > 0, 0)

df['PayrateGroup'] = df['PayRate']*100//5*5

#Create a separate dataset in case we need to come back to original
dfsub = df.copy()
#dfsub = pd.concat((df,tmp_df_1,tmp_df_2),axis=1)

#We will not need these attributes. We are using log of them instead.
deleteVar = ['LIMIT_BAL','PAY_AMT1','PAY_AMT2','PAY_AMT3','PAY_AMT4','PAY_AMT5','PAY_AMT6',
            'BILL_AMT1','BILL_AMT2','BILL_AMT3','BILL_AMT4','BILL_AMT5','BILL_AMT6',
            'EDUCATION','MARRIAGE','AGEGROUP','AGE','TotalBilled','TotalPaid',
            'PayRateCalc','PayRateLimit','PayRate','PAY_2','PAY_3','PAY_4','PAY_5','PAY_6']

for fi,feature in enumerate(deleteVar):
    del dfsub[feature]

dfsub.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30000 entries, 1 to 30000
Data columns (total 20 columns):
SEX                  30000 non-null int64
PAY_0                30000 non-null int32
default              30000 non-null int64
log_LIMIT_BAL        30000 non-null float64
log_PAY_AMT1         30000 non-null float64
EDUCATION_1          30000 non-null uint8
EDUCATION_2          30000 non-null uint8
EDUCATION_3          30000 non-null uint8
EDUCATION_4          30000 non-null uint8
MARRIAGE_1           30000 non-null uint8
MARRIAGE_2           30000 non-null uint8
MARRIAGE_3           30000 non-null uint8
AGEGROUP_0           30000 non-null uint8
AGEGROUP_1           30000 non-null uint8
AGEGROUP_2           30000 non-null uint8
AGEGROUP_3           30000 non-null uint8
AGEGROUP_4           30000 non-null uint8
AGEGROUP_5           30000 non-null uint8
TotalLatePayments    30000 non-null int64
PayrateGroup         30000 non-null float64
dtypes: float64(3), int32(1), int64(3), uint8

In [21]:
#Isolate the "default" variable into y and keep everythign else in X to use for predictions:
if 'default' in dfsub:
    y = dfsub['default'].values
    del dfsub['default'] 
    X = dfsub.values

#Create a reuseable cv_object:  Random State keeps the seed.
num_cv_iterations = 10
num_instances = len(y)
cv_object = ShuffleSplit(n_splits=num_cv_iterations,test_size  = 0.2, random_state=0)

#train_indices
for train_indices, test_indices in cv_object.split(X,y): 
    X_train = X[train_indices]
    y_train = y[train_indices] 
    X_test = X[test_indices]
    y_test = y[test_indices]

print("Dimensions of training features are " , X_train.shape)
print("Dimensions of training target are " , y_train.shape)
print("Dimensions of testing features are " , X_test.shape)
print("Dimensions of testing target are " , y_test.shape)

Dimensions of training features are  (24000, 19)
Dimensions of training target are  (24000,)
Dimensions of testing features are  (6000, 19)
Dimensions of testing target are  (6000,)


In [22]:
from sklearn.preprocessing import StandardScaler
# scale attributes by the training set
scl_obj = StandardScaler()
scl_obj.fit(X_train)

X_train_scaled = scl_obj.transform(X_train) # apply to training
X_test_scaled = scl_obj.transform(X_test) # apply those means and std to the test set (without snooping at the test set values)


In [24]:
from sklearn.neighbors import KNeighborsClassifier
svcEstimator = KNeighborsClassifier()
cv = 5

#compare various values of C, kernels (rbf vs linear vs poly),decision_function_shape (ovo vs ovr) 
parameters = {'n_neighbors': [3,5,7,10]}

#Create a grid search object using the  
from sklearn.model_selection import GridSearchCV
svcGridSearch = GridSearchCV(estimator=svcEstimator
                    , n_jobs=8 # jobs to run in parallel
                    , verbose=1 # low verbosity
                    , param_grid=parameters
                    , cv=cv # KFolds = 5
                    , scoring='accuracy')

svcGridSearch.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=8)]: Done  20 out of  20 | elapsed:  2.1min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=8,
       param_grid={'n_neighbors': [3, 5, 7, 10]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='accuracy',
       verbose=1)

In [25]:
#Display the best estimator parameters
svcGridSearch.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

In [26]:
clf = svcGridSearch.best_estimator_

clf.fit(X_train_scaled,y_train)  # train object

y_hat = clf.predict(X_test_scaled) # get test set precitions

acc = mt.accuracy_score(y_test,y_hat)
conf = mt.confusion_matrix(y_test,y_hat)

print('accuracy:', acc )
print(conf )

accuracy: 0.7868333333333334
[[4565  148]
 [1131  156]]
