In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
# Load the banknote measurements. The text file has no header row, so the
# five column names are supplied explicitly.
banknote_data = pd.read_csv(
    "data_banknote_authentication.txt",
    header=None,
    names=['variance', 'skewness', 'curtosis', 'entropy', 'class'],
)

In [None]:
# Flip the sign of each measurement column; the 'class' label is not touched.
# NOTE: banknote_data is overwritten in place, so running this cell a second
# time flips the signs back.
for column in ['entropy', 'variance', 'skewness', 'curtosis']:
    banknote_data[column] = -banknote_data[column]

In [None]:
# Display the 'entropy' column as it stands after the sign flip in the previous cell.
banknote_data['entropy']

In [None]:
# shuffle data
# A seeded generator keeps the row order identical across "Restart & Run All".
SHUFFLE_SEED = 0
new_index = np.random.RandomState(SHUFFLE_SEED).permutation(len(banknote_data))
banknote_data_shuffled = banknote_data.reindex(new_index)
# reset_index returns a new frame rather than modifying in place, so the
# result has to be assigned for the index to actually become 0..n-1 again.
banknote_data_shuffled = banknote_data_shuffled.reset_index(drop=True)
banknote_data_shuffled.head()

In [None]:
# Separate predictors from the label by position: the first four columns are
# the features, the fifth column is the target.
n_feature_columns = 4
banknote_features = banknote_data_shuffled.iloc[:, :n_feature_columns]
banknote_target = banknote_data_shuffled.iloc[:, n_feature_columns]

In [None]:
# Names of the predictor columns.
feature_list = list(banknote_features.columns)
# print() as a function call works on both Python 2 and Python 3 and matches
# the print(...) calls used in the rest of the notebook.
print(feature_list)

# Display names for the classifiers; the order must match model_list below.
model_name_list = ['Decision Tree', 'Random Forest',
                   'Gradient Boosted', 'Logistic Regression', 'SVM']
# DECISION TREE
dt = DecisionTreeClassifier()
# RANDOM FOREST
rf = RandomForestClassifier()
# GRADIENT BOOSTED TREES
gb = GradientBoostingClassifier()
# LOGISTIC REGRESSION
lr = LogisticRegression()
# SUPPORT VECTOR MACHINE
# probability=True is required so that predict_proba is available on the SVC.
sv = SVC(probability=True, kernel='linear', C =0.5)
model_list = [dt, rf, gb, lr, sv]




In [None]:
# split data for every fold
def split_by_folds(features, target, number, total_folds = 10):
    """Hold out one contiguous fold as test data and return the rest as training data.

    The rows are divided by position into `total_folds` contiguous chunks;
    chunk `number` becomes the test set and all other rows form the training
    set. No shuffling happens here.

    Parameters
    ----------
    features : object supporting two-dimensional ``.iloc`` (e.g. pandas.DataFrame)
        Predictor rows.
    target : object supporting one-dimensional ``.iloc`` (e.g. pandas.Series)
        Labels, positionally aligned with `features`.
    number : int
        Zero-based index of the fold to hold out.
    total_folds : int, default 10
        Number of folds the rows are divided into.

    Returns
    -------
    tuple
        ``(test_features, test_target, train_features, train_target)``.

    Raises
    ------
    ValueError
        If `total_folds` is smaller than 1.
    """
    if total_folds < 1:
        raise ValueError("total_folds must be at least 1")

    n_rows = len(features)
    # Integer arithmetic keeps the boundaries exact on both Python 2 and 3.
    start_number = number * n_rows // total_folds
    end_number = (number + 1) * n_rows // total_folds

    # Positions of every row outside the held-out chunk. Built with numpy
    # because range objects cannot be concatenated with "+" on Python 3.
    train_rows = np.concatenate([np.arange(start_number),
                                 np.arange(end_number, n_rows)])

    test_features = features.iloc[start_number:end_number, :]
    test_target = target.iloc[start_number:end_number]
    train_features = features.iloc[train_rows, :]
    train_target = target.iloc[train_rows]
    return test_features, test_target, train_features, train_target
    

In [None]:
def cross_viladition(model_list, features, target,
                     model_name_list = model_name_list, total_folds = 10,
                     feature_list=feature_list):
    """Cross-validate several classifiers and plot one ROC figure per fold.

    For every fold each model is fitted on the training rows, its accuracy
    and AUC on the held-out rows are printed, and its ROC curve is drawn.
    ROC curves obtained by using each raw column in `feature_list` directly
    as a score are added to the same figure for comparison.

    Parameters
    ----------
    model_list : list
        Estimators exposing ``fit``, ``predict`` and ``predict_proba``.
        They are fitted in place, so after the call each one holds the fit
        from the last fold.
    features : pandas.DataFrame
        Predictor rows (indexed by column name for the per-feature curves).
    target : pandas.Series
        Labels aligned with `features`; 1 is treated as the positive class.
    model_name_list : list of str
        Display names, in the same order as `model_list`.
    total_folds : int, default 10
        Number of folds.
    feature_list : list of str
        Columns of `features` whose raw values get their own ROC curve.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(total_folds, len(model_list))`` with the AUC of
        every model on every fold.

    Side effects
    ------------
    Prints fold sizes and scores, shows one figure per fold and saves it
    under the name ``"fold <i>"``.
    """
    auc = np.zeros((total_folds, len(model_list)))
    for i in range(total_folds):
        # Forward total_folds so the fold width matches the number of
        # iterations instead of always assuming ten folds.
        test_features, test_target, train_features, train_target = \
            split_by_folds(features, target, i, total_folds=total_folds)
        print(len(test_features))
        print(len(test_target))
        print(len(train_features))
        print(len(train_target))

        # A dedicated figure per fold keeps curves from earlier folds out of
        # this plot even when plt.show() does not clear the current figure.
        fig, ax = plt.subplots()

        for j, jmodel in enumerate(model_list):
            jmodel.fit(train_features, train_target)
            hard_predict = jmodel.predict(test_features)
            acc = np.isclose(hard_predict, test_target).sum() / float(len(test_target))
            print("ACC from model {} of test fold {} is {}".format(
                model_name_list[j], i, acc))

            # use predicted probabilities to construct ROC curve and AUC score
            soft_predict = jmodel.predict_proba(test_features)
            fpr, tpr, _ = roc_curve(test_target, soft_predict[:, 1], pos_label=1)
            auc[i, j] = roc_auc_score(test_target, soft_predict[:, 1])
            print("AUC from model {} of test fold {} is {}".format(
                model_name_list[j], i, auc[i, j]))
            ax.plot(fpr, tpr, '-v', label=model_name_list[j])

        # Baseline curves: each raw feature used directly as the score.
        for ifeature in feature_list:
            fpr, tpr, _ = roc_curve(test_target, test_features[ifeature])
            ax.plot(fpr, tpr, '-o', label=ifeature)

        # Diagonal of a random classifier for reference.
        ax.plot([0, 1], [0, 1], 'r--', alpha=.5)
        ax.legend(bbox_to_anchor=(1.05, 1), loc=2)
        ax.set_xlim([-0.1, 1.1])
        ax.set_ylim([-0.1, 1.1])
        ax.set_xlabel('fpr')
        ax.set_ylabel('tpr')
        fig.savefig("fold {}".format(i), bbox_inches='tight')
        plt.show()
        # Release the figure so that ten folds do not keep ten figures alive.
        plt.close(fig)

    return auc

In [None]:
# Cross-validate every model on the full feature set (default fold count)
# and keep the per-fold AUC matrix for the summary further down.
auc = cross_viladition(model_list, features=banknote_features, target=banknote_target)

In [None]:
# Same evaluation restricted to two predictors. The call refits the shared
# estimators in model_list, so they no longer hold the four-feature fit.
two_feature_names = ['curtosis', 'variance']
cross_viladition(model_list, banknote_features[two_feature_names],
                 banknote_target, feature_list=two_feature_names)

In [None]:
def nested_cross_viladition(model, features, target, C= [ 1.0,0.5,0.1,0.01,0.001],
                            total_folds = 10, feature_list=feature_list):
    """Nested cross-validation that tunes the `C` attribute of one model.

    Outer loop: each of the `total_folds` folds is held out once as test data.
    Inner loop: the remaining rows are split into ``total_folds - 1`` folds
    and every candidate in `C` is scored by mean validation accuracy. The
    best candidate is refitted on all outer-training rows and its AUC on the
    held-out fold is recorded.

    Parameters
    ----------
    model : estimator
        Must expose a writable ``C`` attribute plus ``fit``, ``predict`` and
        ``predict_proba``. It is modified and fitted in place.
    features : pandas.DataFrame
        Predictor rows.
    target : pandas.Series
        Labels aligned with `features`.
    C : sequence of float
        Candidate regularisation values; the sequence is only read.
    total_folds : int, default 10
        Number of outer folds (the inner loop uses one fewer).
    feature_list : list of str
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    tuple
        ``(auc, best_C)`` where ``auc`` has one entry per outer fold and
        ``best_C`` is the value selected in the LAST outer fold only.

    Raises
    ------
    ValueError
        If `total_folds` is smaller than 2 or `C` is empty.
    """
    if total_folds < 2:
        raise ValueError("nested cross-validation needs total_folds >= 2")
    if len(C) == 0:
        raise ValueError("C must contain at least one candidate value")

    auc = np.zeros((total_folds))
    best_C = None
    for i in range(total_folds):
        # Forward total_folds so the outer split matches the loop count.
        test_features, test_target, train_features, train_target = \
            split_by_folds(features, target, i, total_folds=total_folds)

        # Start below every reachable accuracy so that some candidate is
        # always selected; starting at 0 could leave the invalid C = 0 chosen.
        best_acc = -np.inf
        best_C = None
        for jc in C:
            model.C = jc
            local_acc = np.zeros((total_folds - 1,))
            for k in range(total_folds - 1):
                local_test_features, local_test_target, \
                    local_train_features, local_train_target = \
                    split_by_folds(train_features, train_target, k,
                                   total_folds=total_folds - 1)
                model.fit(local_train_features, local_train_target)
                hard_predict = model.predict(local_test_features)
                local_acc[k] = np.isclose(hard_predict, local_test_target).sum() \
                    / float(len(local_test_target))
                print("ACC of {} of test fold {} is {}".format(jc, k, local_acc[k]))
            mean_acc = local_acc.mean()
            if mean_acc > best_acc:
                best_acc = mean_acc
                best_C = jc
        print("best C is {}, and best acc is {}".format(best_C,best_acc))

        # use predicted probabilities to construct ROC curve and AUC score
        model.C = best_C
        model.fit(train_features, train_target)
        soft_predict = model.predict_proba(test_features)
        auc[i] = roc_auc_score(test_target, soft_predict[:, 1])
        print("AUC of test fold {} is {}".format( i, auc[i]))

    return auc, best_C

In [None]:
# Tune C for the logistic regression with nested cross-validation.
nested_cross_viladition(model=lr, features=banknote_features, target=banknote_target)

In [None]:
# Tune C for the linear-kernel SVM with nested cross-validation.
nested_cross_viladition(model=sv, features=banknote_features, target=banknote_target)

In [None]:
# summarise the fold AUCs of every model
def mean_variance_auc(auc, model_name_list=model_name_list):
    """Print the mean and variance of each column of `auc`.

    `auc` is indexed as ``auc[:, i]``, one column per entry of
    `model_name_list`; the column's mean and variance over its rows are
    printed next to the corresponding name. Nothing is returned.
    """
    for column, name in enumerate(model_name_list):
        fold_scores = auc[:, column]
        score_mean = fold_scores.mean()
        score_var = fold_scores.var()
        print( "mean and variance of model {} is {}, {}".format(name, score_mean, score_var))

In [None]:
# Print mean and variance of the AUC matrix stored from the full-feature run.
mean_variance_auc(auc=auc)

In [None]:
# Feature importances of the gradient-boosting model from its most recent fit.
# NOTE(review): with cells run top to bottom, gb was last fitted by the
# two-feature cross_viladition call (last fold), so this presumably covers
# only ['curtosis', 'variance'] — confirm.
gb.feature_importances_

In [None]:
# Coefficients of the logistic regression from its most recent fit.
# NOTE(review): with cells run top to bottom, that is the final refit inside
# nested_cross_viladition(lr, ...) on the last outer fold — confirm.
lr.coef_

In [None]:
# Feature importances of the decision tree from its most recent fit.
# NOTE(review): with cells run top to bottom, dt was last fitted by the
# two-feature cross_viladition call (last fold), so this presumably covers
# only ['curtosis', 'variance'] — confirm.
dt.feature_importances_

In [None]:
# Feature importances of the random forest from its most recent fit.
# NOTE(review): with cells run top to bottom, rf was last fitted by the
# two-feature cross_viladition call (last fold), so this presumably covers
# only ['curtosis', 'variance'] — confirm.
rf.feature_importances_

In [None]:
# Coefficients of the linear-kernel SVM from its most recent fit.
# NOTE(review): with cells run top to bottom, that is the final refit inside
# nested_cross_viladition(sv, ...) on the last outer fold — confirm.
sv.coef_