In [30]:
%matplotlib inline

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.datasets import make_classification
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn.cross_validation import cross_val_score, KFold

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

In [31]:
df = pd.read_csv('bank_sam.csv')

In [32]:
#Output value must be numerical, so I replaced categorical 'yes'/'no' with 1/0
#df2 = df.copy()
df.y.replace(['yes','no'],[1,0], inplace=True)
df.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,0
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,0
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,0
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,0
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,0


In [7]:
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y             int64
dtype: object

In [33]:
#convert categorical data to numeric dummy variables
df = pd.get_dummies(df)

In [37]:
#df.dtypes

In [34]:
#Important note: this attribute highly affects the output target (e.g., if duration=0 
#then y='no'). Yet, the duration is not known before a call is performed. 
df['output'] = df.y
del df['duration']
del df['y']

In [38]:
df.head(5)

Unnamed: 0,age,balance,day,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,output
0,30,1787,19,1,-1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,33,4789,11,1,339,4,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
2,35,1350,16,1,330,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,30,1476,3,4,-1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,59,0,5,1,-1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0


In [41]:
print df

      age  balance  day  campaign  pdays  previous  job_admin.  \
0      30     1787   19         1     -1         0           0   
1      33     4789   11         1    339         4           0   
2      35     1350   16         1    330         1           0   
3      30     1476    3         4     -1         0           0   
4      59        0    5         1     -1         0           0   
5      35      747   23         2    176         3           0   
6      36      307   14         1    330         2           0   
7      39      147    6         2     -1         0           0   
8      41      221   14         2     -1         0           0   
9      43      -88   17         1    147         2           0   
10     39     9374   20         1     -1         0           0   
11     43      264   17         2     -1         0           1   
12     36     1109   13         2     -1         0           0   
13     20      502   30         1     -1         0           0   
14     31 

In [78]:
train_split, test_split = train_test_split(df)
#train, test, train_t, test_t = train_test_split

X = df.iloc[:,:50]
# Scale the X values
X_scaled = preprocessing.scale(X.astype(float), copy=False)
Y = df.iloc[:,50]

train = train_split.iloc[:, :50]
train = preprocessing.scale(train.astype(float), copy=False)
train_t = train_split.iloc[:, 50]
test =  test_split.iloc[:, :50]
test = preprocessing.scale(test.astype(float), copy=False)
test_t = test_split.iloc[:, 50]

In [79]:
kf = cross_validation.KFold(len(X_scaled), n_folds=10, shuffle=True)

In [12]:
print "%d columns:" % len(X.columns), [str(c) for c in X.columns]
print "%d values:" % len(X_scaled[0]), X_scaled[0]

50 columns: ['age', 'balance', 'day', 'campaign', 'pdays', 'previous', 'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired', 'job_self-employed', 'job_services', 'job_student', 'job_technician', 'job_unemployed', 'job_unknown', 'marital_divorced', 'marital_married', 'marital_single', 'education_primary', 'education_secondary', 'education_tertiary', 'education_unknown', 'default_no', 'default_yes', 'housing_no', 'housing_yes', 'loan_no', 'loan_yes', 'contact_cellular', 'contact_telephone', 'contact_unknown', 'month_apr', 'month_aug', 'month_dec', 'month_feb', 'month_jan', 'month_jul', 'month_jun', 'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep', 'poutcome_failure', 'poutcome_other', 'poutcome_success', 'poutcome_unknown']
50 values: [-1.05626965  0.12107186  0.37405206 -0.57682947 -0.4072183  -0.32041282
 -0.3438445  -0.5144078  -0.19645374 -0.15938189 -0.52230648 -0.23151795
 -0.2053907  -0.31876039 -0.13759255 -0.4523674

In [80]:
svc_model = SVC()
svc_model.fit(train, train_t)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [127]:
#scores before scaling x values
mean_svc_precision = np.mean(cross_val_score(svc_model, X, Y, cv=10,scoring='precision'))
mean_svc_accuracy = np.mean(cross_val_score(svc_model, X, Y, cv=10,scoring='accuracy'))
mean_svc_recall = np.mean(cross_val_score(svc_model, X, Y, cv=10,scoring='recall'))
mean_svc_f1 = np.mean(cross_val_score(svc_model, X, Y, cv=10,scoring='f1'))

print mean_svc_precision, mean_svc_accuracy,mean_svc_recall,mean_svc_f1

0.35 0.885202875618 0.00961538461538 0.60917938135


In [137]:
#scores post scaling
mean_svc_precision = np.mean(cross_val_score(svc_model, X_scaled, Y, cv=10,scoring='precision'))
mean_svc_accuracy = np.mean(cross_val_score(svc_model, X_scaled, Y, cv=10,scoring='accuracy'))
mean_svc_recall = np.mean(cross_val_score(svc_model, X_scaled, Y, cv=10,scoring='recall'))
mean_svc_f1 = np.mean(cross_val_score(svc_model, X_scaled, Y, cv=10,scoring='f1'))
mean_svc_roc = np.mean(cross_val_score(svc_model, X, Y, cv=10,scoring='roc_auc'))
print mean_svc_precision, mean_svc_accuracy,mean_svc_recall,mean_svc_f1,mean_sv

0.619393939394 0.889847428158 0.120827285922 0.60917938135


In [141]:
#scores post scaling with shuffle
mean_svc_precision = np.mean(cross_val_score(svc_model, X_scaled, Y, cv=kf,scoring='precision'))
mean_svc_accuracy = np.mean(cross_val_score(svc_model, X_scaled, Y, cv=kf,scoring='accuracy'))
mean_svc_recall = np.mean(cross_val_score(svc_model, X_scaled, Y, cv=kf,scoring='recall'))
mean_svc_f1 = np.mean(cross_val_score(svc_model, X_scaled, Y, cv=kf,scoring='f1'))
mean_svc_auc = np.mean(cross_val_score(svc_model, X, Y, cv=kf,scoring='roc_auc'))
print mean_svc_precision, mean_svc_accuracy,mean_svc_recall,mean_svc_f1,mean_svc_auc


 0.596551226551 0.888742698627 0.118186527907 0.194392317926 0.614353363644


In [81]:
models = {'logistic': LogisticRegression(),
          'rf': RandomForestClassifier(n_estimators=200),
          'knn': KNeighborsClassifier(n_neighbors=30),
          'svc': SVC(probability=True),
          'tree': DecisionTreeClassifier(),
          'gbm1': GradientBoostingClassifier(learning_rate=0.4,  n_estimators=200, subsample=1.0, random_state=1, verbose=1),
          'gbm2': GradientBoostingClassifier(learning_rate=0.04, n_estimators=200, subsample=0.9, random_state=1, verbose=1),
         }
stage_preds = {}
final_preds = {}
scoring = ['precision','accuracy','recall','f1','roc_auc']
scores = []

In [83]:
for mname, m in models.iteritems():
    print "*** %s" % mname
    m.fit(train, train_t)
    for s in scoring:
        score = np.mean(cross_val_score(m, X_scaled, Y, cv=kf,scoring=s))
        scores.append([mname, s, score])
    
    if hasattr(m, "staged_predict_proba"):
        stage_preds[mname] = {'train': list(m.staged_predict_proba(train)),  'test': list(m.staged_predict_proba(test))}
    #if hasattr(m, "predict_proba"):
    final_preds[mname] = {'train': m.predict_proba(train),  'test': m.predict_proba(test)}

*** knn
*** tree
*** svc
*** gbm2
      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.7116           0.0119            2.15s
         2           0.6999           0.0019            2.14s
         3           0.6962           0.0048            2.07s
         4           0.6856           0.0041            1.97s
         5           0.6820           0.0048            1.96s
         6           0.6831           0.0032            1.99s
         7           0.6750           0.0021            1.97s
         8           0.6628           0.0028            1.99s
         9           0.6778           0.0032            1.96s
        10           0.6674           0.0009            1.87s
        20           0.6327           0.0014            1.41s
        30           0.6155           0.0003            1.31s
        40           0.6194           0.0001            1.22s
        50           0.5998           0.0002            1.16s
        60           0.5787        

In [98]:
scores_cols = ['Precision','Accuracy','Recall','F1','ROC_AUC']
scoresdf = pd.DataFrame(scores)
score_df = pd.DataFrame()
scoresdf.head()
#score_df.columns = scores_cols


Unnamed: 0,0,1,2
0,knn,precision,0.571667
1,knn,accuracy,0.886307
2,knn,recall,0.058666
3,knn,f1,0.104504
4,knn,roc_auc,0.704883


In [100]:
for i in range(5):
    print i
    scoresdf[]
    

1
2
3
4
5


In [67]:
sc = []
mname = 'knn'
test = .99
sc.append([mname, test])
print sc

[['knn', 0.99]]
