In [2]:
%matplotlib inline
print(__doc__)
from __future__ import print_function

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV

from sklearn.datasets import make_classification
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

Automatically created module for IPython interactive environment


In [3]:
df = pd.read_csv('bank-additional.csv')

In [3]:
#Output value must be numerical, so I replaced categorical 'yes'/'no' with 1/0
#df2 = df.copy()
#df.y.replace(['yes','no'],[1,0], inplace=True)
df.replace(['yes','no', 'unknown'],[1,0, np.nan], inplace=True)
#df.replace('unknown', np.nan)
df = df.dropna()

In [4]:
def ed_preproc(ed):
    ed_new = []
    for e in ed:
        if e == 'basic.4y' or e == 'basic.6y' or e == 'basic.9y' or e == 'illiterate' or e == 'high.school':
            e = 'some'
            ed_new.append(e)
        else:
            ed_new.append(e)
    return ed_new
df.education = ed_preproc(df.education)

In [5]:
def job_proc(job):
    job_new = []
    for j in job:
        if j == 'entrepreneur':
            j = 'self-employed'
            job_new.append(j)
        elif j == 'admin.' or j == 'services' or j == 'technician':
            j = 'professional'
            job_new.append(j)
        elif j == 'housemaid':
            j = 'unemployed'
            job_new.append(j)
        else:
            job_new.append(j)
    return job_new
df.job = job_proc(df.job)

In [6]:
#convert categorical data to numeric dummy variables
#Important note: this attribute highly affects the output target (e.g., if duration=0 
#then y='no'). Yet, the duration is not known before a call is performed. 
df = pd.get_dummies(df)
df['output'] = df.y
del df['duration']
del df['y']

In [8]:
X = df.iloc[:,:45]
y = df.iloc[:,45]

x_train, x_test, y_train, y_test  = train_test_split(X, y, test_size = 0.2, random_state = 42)



std_scale = preprocessing.StandardScaler().fit(x_train)
x_train_std = std_scale.transform(x_train)
x_test_std = std_scale.transform(x_test)

# Scale the X values

X_scaled = preprocessing.scale(X.astype(float), copy=False)
Y = df.iloc[:,45]

print "training set size: " + str(len(x_train_std))
print "test set size: " + str(len(x_test_std))
print "training_t set size: " + str(len(y_train))
print "test_t set size: " + str(len(y_test))

training set size: 2472
test set size: 618
training_t set size: 2472
test_t set size: 618


In [8]:
kf = cross_validation.KFold(len(X_scaled), n_folds=10, shuffle=True)

In [9]:
#find best number for KNN
def best_neighbor(N):
    best_k = 1
    best_score = 0
    for i in range(1,N+1):
        neigh = KNeighborsClassifier(n_neighbors=i)
        neigh.fit(x_train_std, y_train)
        new_score = accuracy_score(y_test,neigh.predict(x_test_std))
        if new_score > best_score:
            best_score = new_score
            best_k = i
    
    return best_k
best_neighbor(30)

8

In [12]:
models = [LogisticRegression(), SVC(probability = True), GaussianNB(), DecisionTreeClassifier(max_depth = 4), RandomForestClassifier(), KNeighborsClassifier(best_neighbor(30))]
scores = [accuracy_score, precision_score, recall_score, f1_score, roc_auc_score]

# create empty lists
LogisticReg = []
SVMC = []
GaussNB = []
DecisionTree = []
RandomForest = []
kNN = []
ROC_AUC = []

# list of lists
lists = [LogisticReg, SVMC, GaussNB, DecisionTree, RandomForest, kNN, ROC_AUC]

# populate lists with scores of each scoring method
for i, model in enumerate(models):
    for score in scores:
        est = model
        est.fit(x_train_std, y_train)
        pred = est.predict(x_test_std)
        lists[i].append(score(y_test, pred))

# create a dataframe which aggregates the lists
    scores_df = pd.DataFrame(data = [LogisticReg, SVMC, GaussNB, DecisionTree, RandomForest, kNN])
    scores_df.index = ["LogisticReg", "SVMC", "GaussNB", "DecisionTree", "RandomForest", "kNN"]
    scores_df.columns = ["Accuracy", "Precision", "Recall", "F1","ROC_AUC"]
    
print scores_df
scores = [accuracy_score, precision_score, recall_score, f1_score, ROC_AUC]


              Accuracy  Precision    Recall        F1   ROC_AUC
LogisticReg   0.911003   0.583333  0.237288  0.337349  0.609700
SVMC          0.907767   0.555556  0.169492  0.259740  0.577590
GaussNB       0.666667   0.200000  0.830508  0.322368  0.739941
DecisionTree  0.907767   0.541667  0.220339  0.313253  0.600330
RandomForest  0.911003   0.441176  0.305085  0.373626  0.627073
kNN           0.912621   0.666667  0.169492  0.270270  0.580273


In [None]:
models = {'logistic': LogisticRegression(),
          'rf': RandomForestClassifier(n_estimators=200),
          'knn': KNeighborsClassifier(n_neighbors=30),
          'svc': SVC(probability=True),
          'tree': DecisionTreeClassifier(),
          #'gbm1': GradientBoostingClassifier(learning_rate=0.4,  n_estimators=200, subsample=1.0, random_state=1, verbose=1),
          #'gbm2': GradientBoostingClassifier(learning_rate=0.04, n_estimators=200, subsample=0.9, random_state=1, verbose=1),
         }
stage_preds = {}
final_preds = {}


In [13]:
for mname, m in models.iteritems():
    print "*** %s" % mname
    m.fit(train_std, train_t)
 
    
    if hasattr(m, "staged_predict_proba"):
        stage_preds[mname] = {'train': list(m.staged_predict_proba(train_std)),  'test': list(m.staged_predict_proba(test_std))}
    #if hasattr(m, "predict_proba"):
    final_preds[mname] = {'train': m.predict_proba(train),  'test': m.predict_proba(test)}

*** knn
*** tree
*** svc
*** gbm2
      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.7009           0.0086            2.50s
         2           0.6934           0.0049            1.78s
         3           0.6880           0.0047            1.54s
         4           0.6866           0.0054            1.41s
         5           0.6851           0.0030            1.33s
         6           0.6747           0.0025            1.27s
         7           0.6799           0.0029            1.26s
         8           0.6600           0.0037            1.22s
         9           0.6698           0.0014            1.27s
        10           0.6618           0.0012            1.24s
        20           0.6321           0.0004            1.08s
        30           0.6327           0.0001            1.01s
        40           0.6028          -0.0009            0.98s
        50           0.5960          -0.0003            0.91s
        60           0.5758        

In [17]:
def model_search(estimator, tuned_params, scores, X_train, y_train, X_test, y_test):  
   
   cv = cross_validation.ShuffleSplit(len(X_train), n_iter=3, test_size=0.30, random_state=0)

   for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5,
                       scoring='%s_weighted' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

In [20]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]}]
                    #{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
scores = ['precision', 'recall']
model_search(SVC, tuned_parameters, scores, x_train_std, y_train, x_test_std, y_test)

# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'kernel': 'rbf', 'C': 10, 'gamma': 0.0001}

Grid scores on development set:

0.866 (+/-0.023) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.001}
0.764 (+/-0.002) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.0001}
0.867 (+/-0.027) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.001}
0.867 (+/-0.024) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.0001}
0.861 (+/-0.021) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.001}
0.866 (+/-0.021) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.0001}
0.853 (+/-0.015) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.001}
0.867 (+/-0.019) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.0001}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recall  f1-score   support

          0       0.92      0.99      0.95       559
          1       0.65      0.22      0.33        59

avg / total     

In [None]:
tuned_parameters = [{'n_estimators': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    #{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
scores = ['precision', 'recall']
                    
                    

In [1]:
from math import log
#nrEstimators = [pow(2,x) for x in range(7,9)]
#maxFeatures = [pow(2,x) for x in range(1,int(log(len(x_train.columns.values.tolist()))+2))]

#rfConfigurations = list(itertools.product(*[nrEstimators,maxFeatures]))
#print "configurations: ", rfConfigurations
print 'hello'

hello
