In [1]:
#Import Statements - Numpy, Pandas, Classifiers, Scorers, Metrics
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from sklearn.model_selection import KFold
import xgboost as xgb

In [2]:
#Read data in pandas dataframe
data = pd.read_csv("spambase/spambase.data",header=None)

In [3]:
#View rows of data
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [4]:
#Creating Classifier objects to test
LogReg = LogisticRegression()
SVM = SVC()
DTC = DecisionTreeClassifier()
MNB = MultinomialNB()
RFC = RandomForestClassifier(n_estimators=100)
XGC = xgb.XGBClassifier(n_estimators = 100,max_depth=5)

In [5]:
#Function that accepts a list of models, data and cross validation turns cv
'''Shuffle data and form feature matrix X and labels Y. Perform train-test split on the data. Create a KFold object to
segment data into k folds. Here k = cv. Try each model in the list of models. Fit and predict using each model.
Print False Positive Rate, False Negative Rate, Overal Error Rate and Average Accuracy across all folds.'''

def cross_validate(models,data,cv):
    dataShuffled = shuffle(data).reset_index(drop=True)
    X = dataShuffled.iloc[:,0:56]
    Y = dataShuffled.iloc[:,-1]
    X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2)
    KF = KFold(n_splits=cv)
    for model in models:
        print('Model: ',model)
        i = 0
        sum_accuracy = 0
        for train_index, test_index in KF.split(X_train):
            print("TRAIN:", len(train_index), "TEST:", len(test_index))
            xtrain, xtest = X_train.iloc[train_index,:], X_train.iloc[test_index,:]
            ytrain,ytest = Y_train.iloc[train_index],Y_train.iloc[test_index]
            model.fit(xtrain,ytrain)
            y_pred = model.predict(xtest)
            tn, fp, fn, tp = confusion_matrix(ytest,y_pred).ravel()
            print('Fold {} : False Positive Rate = '.format(i),fp/(fp+tn))
            print('Fold {} : False Negative Rate = '.format(i),fn/(fn+tp))
            print('Fold {} : Overall Error Rate = '.format(i),(fn+fp)/(tn + fp + fn + tp))
#             print('Fold {} : F1 Score = '.format(i),f1_score(ytest,y_pred))
#             print('Fold {} : Accuracy = '.format(i),accuracy_score(ytest,y_pred))
            sum_accuracy += accuracy_score(ytest,y_pred)
            i+=1
        mean_accuracy = sum_accuracy/cv
        print("Average Accuracy: ",mean_accuracy)
        print('-----------------------------------------------------------')

In [6]:
#Enlist models to test
models = [LogReg,SVM,MNB,DTC,RFC,XGC]

In [7]:
#call cross-validate to test all models
cross_validate(models,data,4)

Model:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
TRAIN: 2760 TEST: 920
Fold 0 : False Positive Rate =  0.06261510128913444
Fold 0 : False Negative Rate =  0.1246684350132626
Fold 0 : Overall Error Rate =  0.08804347826086957
TRAIN: 2760 TEST: 920
Fold 1 : False Positive Rate =  0.04121863799283154
Fold 1 : False Negative Rate =  0.10220994475138122
Fold 1 : Overall Error Rate =  0.06521739130434782
TRAIN: 2760 TEST: 920
Fold 2 : False Positive Rate =  0.046125461254612546
Fold 2 : False Negative Rate =  0.12698412698412698
Fold 2 : Overall Error Rate =  0.07934782608695652
TRAIN: 2760 TEST: 920
Fold 3 : False Positive Rate =  0.0625
Fold 3 : False Negative Rate =  0.11337209302325581
Fold 3 : Overall Error Rate =  0.08152173913043478
Average Accuracy:  0.921467391304



Fold 0 : False Positive Rate =  0.07366482504604052
Fold 0 : False Negative Rate =  0.10344827586206896
Fold 0 : Overall Error Rate =  0.08586956521739131
TRAIN: 2760 TEST: 920




Fold 1 : False Positive Rate =  0.06810035842293907
Fold 1 : False Negative Rate =  0.11602209944751381
Fold 1 : Overall Error Rate =  0.08695652173913043
TRAIN: 2760 TEST: 920




Fold 2 : False Positive Rate =  0.06273062730627306
Fold 2 : False Negative Rate =  0.1164021164021164
Fold 2 : Overall Error Rate =  0.08478260869565217
TRAIN: 2760 TEST: 920




Fold 3 : False Positive Rate =  0.10590277777777778
Fold 3 : False Negative Rate =  0.125
Fold 3 : Overall Error Rate =  0.11304347826086956
Average Accuracy:  0.9073369565217391
-----------------------------------------------------------
Model:  MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
TRAIN: 2760 TEST: 920
Fold 0 : False Positive Rate =  0.17679558011049723
Fold 0 : False Negative Rate =  0.11140583554376658
Fold 0 : Overall Error Rate =  0.15
TRAIN: 2760 TEST: 920
Fold 1 : False Positive Rate =  0.14336917562724014
Fold 1 : False Negative Rate =  0.1850828729281768
Fold 1 : Overall Error Rate =  0.15978260869565217
TRAIN: 2760 TEST: 920
Fold 2 : False Positive Rate =  0.14391143911439114
Fold 2 : False Negative Rate =  0.21693121693121692
Fold 2 : Overall Error Rate =  0.17391304347826086
TRAIN: 2760 TEST: 920
Fold 3 : False Positive Rate =  0.16145833333333334
Fold 3 : False Negative Rate =  0.2005813953488372
Fold 3 : Overall Error Rate =  0.17608695652173914
Ave

In [8]:
#Random Forest Classifier is chosen due to its performance on different metrics across many folds against other models
RFC

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [9]:
#Shuffle data and Split in X and Y
dataShuffled = shuffle(data).reset_index(drop=True)
X = dataShuffled.iloc[:,0:56]
Y = dataShuffled.iloc[:,-1]

In [10]:
#No. of folds
cv = 5

In [11]:
#Use data to evaluate RFC across many folds and print results
KF = KFold(n_splits=cv)
i = 0
sum_accuracy = 0
for train_index, test_index in KF.split(X):
    print("TRAIN:", len(train_index), "TEST:", len(test_index))
    xtrain, xtest = X.iloc[train_index,:], X.iloc[test_index,:]
    ytrain,ytest = Y.iloc[train_index],Y.iloc[test_index]
    RFC.fit(xtrain,ytrain)
    y_pred = RFC.predict(xtest)
    tn, fp, fn, tp = confusion_matrix(ytest,y_pred).ravel()
    print('Fold {} : False Positive Rate = '.format(i),fp/(fp+tn))
    print('Fold {} : False Negative Rate = '.format(i),fn/(fn+tp))
    print('Fold {} : Overall Error Rate = '.format(i),(fn+fp)/(tn + fp + fn + tp))

    sum_accuracy += accuracy_score(ytest,y_pred)
    i+=1
mean_accuracy = sum_accuracy/cv
print("Average Accuracy: ",mean_accuracy)
print('-----------------------------------------------------------')


TRAIN: 3680 TEST: 921
Fold 0 : False Positive Rate =  0.02831858407079646
Fold 0 : False Negative Rate =  0.07303370786516854
Fold 0 : Overall Error Rate =  0.04560260586319218
TRAIN: 3681 TEST: 920
Fold 1 : False Positive Rate =  0.030357142857142857
Fold 1 : False Negative Rate =  0.07777777777777778
Fold 1 : Overall Error Rate =  0.04891304347826087
TRAIN: 3681 TEST: 920
Fold 2 : False Positive Rate =  0.03231597845601436
Fold 2 : False Negative Rate =  0.07162534435261708
Fold 2 : Overall Error Rate =  0.04782608695652174
TRAIN: 3681 TEST: 920
Fold 3 : False Positive Rate =  0.02460456942003515
Fold 3 : False Negative Rate =  0.06552706552706553
Fold 3 : Overall Error Rate =  0.04021739130434782
TRAIN: 3681 TEST: 920
Fold 4 : False Positive Rate =  0.037243947858473
Fold 4 : False Negative Rate =  0.08093994778067885
Fold 4 : Overall Error Rate =  0.05543478260869565
Average Accuracy:  0.9524012179577964
-----------------------------------------------------------
