In [80]:
%matplotlib inline
# print(__doc__)
# from __future__ import print_function

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.datasets import make_classification
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

In [81]:
df = pd.read_csv('bank-additional.csv')

In [82]:
#Output value must be numerical, so I replaced categorical 'yes'/'no' with 1/0
df.replace(['yes','no', 'unknown'],[1,0, np.nan], inplace=True)
df = df.dropna()

In [83]:
def ed_preproc(ed):
    ed_new = []
    for e in ed:
        if e == 'basic.4y' or e == 'basic.6y' or e == 'basic.9y' or e == 'illiterate' or e == 'high.school':
            e = 'some'
            ed_new.append(e)
        else:
            ed_new.append(e)
    return ed_new
df.education = ed_preproc(df.education)

In [84]:
def job_proc(job):
    job_new = []
    for j in job:
        if j == 'entrepreneur':
            j = 'self-employed'
            job_new.append(j)
        elif j == 'admin.' or j == 'services' or j == 'technician':
            j = 'professional'
            job_new.append(j)
        elif j == 'housemaid':
            j = 'unemployed'
            job_new.append(j)
        else:
            job_new.append(j)
    return job_new
df.job = job_proc(df.job)

In [85]:
df.head(3)


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,30,blue-collar,married,some,0,1,0,cellular,may,fri,...,2,999,0,nonexistent,-1.8,92.893,-46.2,1.313,5099.1,0
1,39,professional,single,some,0,0,0,telephone,may,fri,...,4,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,0
2,25,professional,married,some,0,1,0,telephone,jun,wed,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.962,5228.1,0


In [86]:
#convert categorical data to numeric dummy variables
#Important note: this attribute highly affects the output target (e.g., if duration=0 
#then y='no'). Yet, the duration is not known before a call is performed. 
df = pd.get_dummies(df)
df['output'] = df.y
del df['duration']
del df['y']

In [87]:
df.head()

Unnamed: 0,age,default,housing,loan,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,...,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,output
0,30,0,1,0,2,999,0,-1.8,92.893,-46.2,...,0,1,0,0,0,0,0,1,0,0
1,39,0,0,0,4,999,0,1.1,93.994,-36.4,...,0,1,0,0,0,0,0,1,0,0
2,25,0,1,0,1,999,0,1.4,94.465,-41.8,...,0,0,0,0,0,1,0,1,0,0
4,47,0,1,0,1,999,0,-0.1,93.2,-42.0,...,0,0,1,0,0,0,0,1,0,0
5,32,0,0,0,3,999,2,-1.1,94.199,-37.5,...,1,0,0,1,0,0,1,0,0,0


In [88]:
df = df.reindex(np.random.permutation(df.index))
df.head()

Unnamed: 0,age,default,housing,loan,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,...,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,output
1424,30,0,1,0,2,999,0,1.4,93.918,-42.7,...,0,0,0,0,0,1,0,1,0,0
2857,27,0,1,0,1,3,1,-1.1,94.199,-37.5,...,1,0,0,1,0,0,0,0,1,1
1807,57,0,1,0,1,999,0,1.4,94.465,-41.8,...,0,0,0,1,0,0,0,1,0,0
3891,31,0,0,0,1,999,0,1.1,93.994,-36.4,...,0,0,0,0,0,1,0,1,0,0
2721,50,0,1,0,1,999,0,-0.1,93.2,-42.0,...,0,0,0,0,1,0,0,1,0,0


In [89]:
X = df.iloc[:,:45]
y = df.iloc[:,45]

x_train, x_test, y_train, y_test  = train_test_split(X, y, test_size = 0.2, random_state = 42)



std_scale = preprocessing.StandardScaler().fit(x_train)
x_train_std = std_scale.transform(x_train)
x_test_std = std_scale.transform(x_test)

# Scale the X values
X_scaled = preprocessing.scale(X.astype(float), copy=False)
Y = df.iloc[:,45]

print "training set size: " + str(len(x_train_std))
print "test set size: " + str(len(x_test_std))
print "training_t set size: " + str(len(y_train))
print "test_t set size: " + str(len(y_test))

training set size: 2472
test set size: 618
training_t set size: 2472
test_t set size: 618


In [90]:
kf = cross_validation.KFold(len(X_scaled), n_folds=10, shuffle=True)

In [91]:
#find best number for KNN
def best_neighbor(N):
    best_k = 1
    best_score = 0
    for i in range(1,N+1):
        neigh = KNeighborsClassifier(n_neighbors=i)
        neigh.fit(x_train_std, y_train)
        new_score = accuracy_score(y_test,neigh.predict(x_test_std))
        if new_score > best_score:
            best_score = new_score
            best_k = i
    
    return best_k
best_neighbor(30)

11

In [92]:
models = [LogisticRegression(penalty='l2',C=0.001), SVC(C=100,gamma=.0001,probability = True), GaussianNB(), DecisionTreeClassifier(max_features='log2',max_depth=3,criterion='entropy'), RandomForestClassifier(n_estimators=30,criterion='entropy'), KNeighborsClassifier(best_neighbor(30))]
scores = [accuracy_score, precision_score, recall_score, f1_score, roc_auc_score]

# create empty lists
LogisticReg = []
SVM = []
GaussNB = []
DecisionTree = []
RandomForest = []
kNN = []
ROC_AUC = []

# list of lists
lists = [LogisticReg, SVM, GaussNB, DecisionTree, RandomForest, kNN, ROC_AUC]

# populate lists with scores of each scoring method
for i, model in enumerate(models):
    for score in scores:
        est = model
        est.fit(x_train_std, y_train)
        pred = est.predict(x_test_std)
        lists[i].append(score(y_test, pred))

# create a dataframe which aggregates the lists
    scores_df = pd.DataFrame(data = [LogisticReg, SVM, GaussNB, DecisionTree, RandomForest, kNN])
    scores_df.index = ["LogisticReg", "SVM", "GaussNB", "DecisionTree", "RandomForest", "kNN"]
    scores_df.columns = ["Accuracy", "Precision", "Recall", "F1","ROC_AUC"]
    
print scores_df
scores = [accuracy_score, precision_score, recall_score, f1_score, ROC_AUC]


              Accuracy  Precision    Recall        F1   ROC_AUC
LogisticReg   0.875405   0.520833  0.316456  0.393701  0.636892
SVM           0.881877   0.625000  0.189873  0.291262  0.586588
GaussNB       0.716828   0.269231  0.708861  0.390244  0.713429
DecisionTree  0.877023   0.250000  0.037975  0.244898  0.577640
RandomForest  0.880259   0.608696  0.215190  0.290909  0.584733
kNN           0.885113   0.653846  0.215190  0.323810  0.599246


In [93]:
models = {'logistic': LogisticRegression(),
          'rf': RandomForestClassifier(n_estimators=200),
          'knn': KNeighborsClassifier(n_neighbors=30),
          'svc': SVC(probability=True),
          'tree': DecisionTreeClassifier(),
          #'gbm1': GradientBoostingClassifier(learning_rate=0.4,  n_estimators=200, subsample=1.0, random_state=1, verbose=1),
          #'gbm2': GradientBoostingClassifier(learning_rate=0.04, n_estimators=200, subsample=0.9, random_state=1, verbose=1),
         }
stage_preds = {}
final_preds = {}


In [94]:
for mname, m in models.iteritems():
    print "*** %s" % mname
    m.fit(train_std, train_t)
 
    
    if hasattr(m, "staged_predict_proba"):
        stage_preds[mname] = {'train': list(m.staged_predict_proba(train_std)),  'test': list(m.staged_predict_proba(test_std))}
    #if hasattr(m, "predict_proba"):
    final_preds[mname] = {'train': m.predict_proba(train),  'test': m.predict_proba(test)}

*** knn


NameError: name 'train_std' is not defined

In [None]:
def model_search(estimator, tuned_params, scores, X_train, y_train, X_test, y_test):  
    cv = cross_validation.ShuffleSplit(len(X_train), n_iter=3, test_size=0.30, random_state=0)
    for score in scores:
        print"# Tuning hyper-parameters for %s" % score
        print

        clf = GridSearchCV(estimator, tuned_params, cv=cv,scoring='%s' % score)
        clf.fit(X_train, y_train)

        print"Best parameters set found on development set:"
        print
        print clf.best_params_
        print
        print "Grid scores on development set:"
        print
        for params, mean_score, scores in clf.grid_scores_:
            print "%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params)
        print

        print "Detailed classification report:"
        print
        print "The model is trained on the full development set."
        print "The scores are computed on the full evaluation set."
        print
        y_true, y_pred = y_test, clf.predict(X_test)
        print classification_report(y_true, y_pred)
        print

In [None]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]}]
                    #{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
scores = ['precision', 'recall']
model_search(SVC(), tuned_parameters, scores, x_train_std, y_train, x_test_std, y_test)

In [None]:
tuned_parameters = [{'n_estimators': [10,20,30,40,50], 'max_features': ['auto','sqrt','log2'],
                     'criterion': ['gini', 'entropy']}]
scores = ['precision', 'recall']
model_search(RandomForestClassifier(), tuned_parameters, scores, x_train_std, y_train, x_test_std, y_test)

In [None]:
tuned_parameters = [{'penalty': ['l1','l2'], 'C': [0.001,0.01,1.0,10,100,1000]}]
scores = ['precision', 'recall']
model_search(LogisticRegression(), tuned_parameters, scores, x_train_std, y_train, x_test_std, y_test)

In [None]:
tuned_parameters = [{'criterion': ['gini', 'entropy'],'max_depth': [3,4,5,6,7], 'max_features': ['auto','log2']}]
scores = ['precision', 'recall']
model_search(DecisionTreeClassifier(), tuned_parameters, scores, x_train_std, y_train, x_test_std, y_test)

In [95]:
from sklearn.ensemble import ExtraTreesClassifier
clf = ExtraTreesClassifier()
X_new = clf.fit(x_train_std, y_train).transform(x_train_std)
clf.feature_importances_  
# X_new.shape               

array([  1.16478745e-01,   9.75632859e-05,   3.92699523e-02,
         2.36682161e-02,   7.73930535e-02,   3.42172182e-02,
         2.12343219e-02,   4.53909754e-02,   1.60276181e-02,
         2.39171768e-02,   9.86863477e-02,   6.73824278e-02,
         1.27716762e-02,   1.19737273e-02,   2.43646079e-02,
         1.23682852e-02,   7.79424596e-03,   5.33385439e-03,
         1.24094786e-02,   1.37798785e-02,   2.41323362e-02,
         2.19709634e-02,   2.01355989e-02,   1.98991812e-02,
         2.06785571e-02,   1.11492764e-02,   8.61876783e-03,
         4.46245805e-03,   5.25975234e-03,   3.06256701e-03,
         7.22579682e-03,   5.54524074e-03,   1.03339997e-02,
         1.00349720e-02,   6.32406575e-03,   8.42426175e-03,
         4.29539199e-03,   2.10968270e-02,   1.98241941e-02,
         1.86560507e-02,   1.95733004e-02,   1.94812761e-02,
         1.30892799e-02,   9.39970138e-03,   2.27668132e-02])

In [96]:
df.dtypes

age                                int64
default                          float64
housing                          float64
loan                             float64
campaign                           int64
pdays                              int64
previous                           int64
emp.var.rate                     float64
cons.price.idx                   float64
cons.conf.idx                    float64
euribor3m                        float64
nr.employed                      float64
job_blue-collar                  float64
job_management                   float64
job_professional                 float64
job_retired                      float64
job_self-employed                float64
job_student                      float64
job_unemployed                   float64
marital_divorced                 float64
marital_married                  float64
marital_single                   float64
education_professional.course    float64
education_some                   float64
education_univer