# Run Multiple Classifiers

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.float_format = '{:,.2f}'.format

In [2]:
import os
import urllib.request

data_url = 'https://raw.githubusercontent.com/elephantscale/datasets/master/credit-card-default/default2.csv'
data_location = os.path.basename(data_url)

if not os.path.exists (data_location):
    print("Downloading : ", data_url)
    urllib.request.urlretrieve(data_url, data_location)
print('data_location:', data_location)

Downloading :  https://raw.githubusercontent.com/elephantscale/datasets/master/credit-card-default/default2.csv
data_location: default2.csv


In [3]:
data = pd.read_csv(data_location)
data.sample(5)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
11130,11131,50000,2,2,1,43,1,2,0,0,...,48020,47120,50319,0,3000,4000,0,4000,2000,0
371,372,160000,1,1,2,30,-1,-1,-1,-1,...,8578,13028,21712,2977,15086,9123,13028,29712,50000,0
18361,18362,130000,2,2,2,22,0,-1,-1,0,...,7816,16578,16129,527,11249,1200,16578,0,33232,0
17523,17524,200000,2,2,1,37,1,2,0,0,...,10155,12470,13571,0,2000,1155,2470,1265,0,0
23514,23515,120000,2,2,1,30,2,-1,3,2,...,1410,479,3158,1701,0,9,0,3158,0,0


In [4]:
label_col = 'default'

feature_columns = data.columns
feature_columns = feature_columns.drop (['ID', 'default'])
#print (feature_columns)

In [5]:

X = data[feature_columns]
y = data[[label_col]]

print (X.shape)
print (y.shape)

(30000, 23)
(30000, 1)


In [6]:
from sklearn.model_selection import train_test_split

## TODO : split data into train / test, with 20% for test
## hint : 20% is 0.2

X_train,X_test,y_train, y_test = train_test_split(X,y,  test_size=0.2, random_state=123)


print ("x_train :" , X_train.shape )
print ("x_test :", X_test.shape)
print ("y_train :", y_train.shape)
print ("y_test :", y_test.shape)

x_train : (24000, 23)
x_test : (6000, 23)
y_train : (24000, 1)
y_test : (6000, 1)


## Setup Algorithms

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

algorithms = [
    LogisticRegression(max_iter=500),
    RandomForestClassifier(),
    LinearSVC(),
    GradientBoostingClassifier(),
    KNeighborsClassifier(n_neighbors=5),
    SGDClassifier(),
    DecisionTreeClassifier()
]

## Run Algorithms and print stats

In [8]:
import time
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

cm_labels = np.unique(y)
    
for algo in algorithms:
    print ()
    print ("============== Running {} ======".format(algo))
    t1 = time.perf_counter()
    model = algo.fit (X_train, np.ravel(y_train))
    t2 = time.perf_counter()
    y_pred = model.predict(X_test)
    t3 = time.perf_counter()
    
    cm_array = confusion_matrix(y_test, y_pred)
    cm_df = pd.DataFrame(cm_array, index=cm_labels, columns=cm_labels)
    
    print ("Training time : {:,.1f} secs ({:,.1f} ms)".format ( (t2-t1), (t2-t1)*1e3))
    print ("Prediction time : {:,.1f} secs ({:,.1f} ms)".format ( (t3-t2), (t3-t2)*1e3))
    print ("Training score : ", model.score(X_train, y_train))
    print ("Testing score : ", model.score(X_test, y_test))
    print ("Confusion matrix : ")
    print (cm_df)
    
    



Training time : 0.4 secs (417.3 ms)
Prediction time : 0.0 secs (3.1 ms)
Training score :  0.77775
Testing score :  0.7825
Confusion matrix : 
      0  1
0  4695  1
1  1304  0

Training time : 4.0 secs (3,990.6 ms)
Prediction time : 0.1 secs (93.8 ms)
Training score :  0.9992916666666667
Testing score :  0.818
Confusion matrix : 
      0    1
0  4394  302
1   790  514





Training time : 1.7 secs (1,675.7 ms)
Prediction time : 0.0 secs (1.3 ms)
Training score :  0.7430833333333333
Testing score :  0.7511666666666666
Confusion matrix : 
      0    1
0  4397  299
1  1194  110

Training time : 5.7 secs (5,650.5 ms)
Prediction time : 0.0 secs (6.9 ms)
Training score :  0.8256666666666667
Testing score :  0.8228333333333333
Confusion matrix : 
      0    1
0  4440  256
1   807  497

Training time : 0.0 secs (41.6 ms)
Prediction time : 0.4 secs (375.9 ms)
Training score :  0.8160416666666667
Testing score :  0.7586666666666667
Confusion matrix : 
      0    1
0  4325  371
1  1077  227

Training time : 0.3 secs (258.4 ms)
Prediction time : 0.0 secs (1.3 ms)
Training score :  0.27070833333333333
Testing score :  0.2655
Confusion matrix : 
     0     1
0  346  4350
1   57  1247

Training time : 0.4 secs (376.9 ms)
Prediction time : 0.0 secs (2.0 ms)
Training score :  0.9993333333333333
Testing score :  0.7201666666666666
Confusion matrix : 
      0    1
0  3769 