### Dataset 1

In [134]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split, cross_val_score, StratifiedShuffleSplit 
from sklearn import preprocessing, metrics
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')


In [135]:
## Reading the file
def read_file(trainF,testF, Directory):
    #Directory = "./Data Set 1/splits/"
    train = pd.read_csv(Directory + trainF)
    test =  pd.read_csv(Directory + testF)
    return train, test

In [138]:
## SVM classifier
def svm_classifier(train, test, accuracy, roc_auc, Target_col,pos_label=None):
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    clf = svm.SVC(probability=True)
    clf.fit(X,y)
    y_pred = clf.predict(test_X)
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    probas_ = clf.predict_proba(test_X)
    # Compute ROC curve and area the curve
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, probas_[:, 1],pos_label=pos_label)
    roc_auc.append(auc(false_positive_rate, true_positive_rate))
    return accuracy, roc_auc

In [139]:
## RF classifier
def RF_classifier(train, test, accuracy, roc_auc, Target_col,pos_label=None):
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    RF = RandomForestClassifier(n_estimators = 200, random_state = 100, 
                    class_weight = 'balanced', oob_score = True)
    RF.fit(X,y)
    y_pred = RF.predict(test_X)
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    probas_ = RF.predict_proba(test_X)
    # Compute ROC curve and area the curve
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, probas_[:, 1],pos_label=pos_label)
    roc_auc.append(auc(false_positive_rate, true_positive_rate))
    return accuracy, roc_auc

In [140]:
## Logistic Regression
def log_classifier(train, test, accuracy, roc_auc, Target_col,pos_label=None):
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    logreg = linear_model.LogisticRegression()
    logreg.fit(X,y)
    y_pred = logreg.predict(test_X)
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    probas_ = logreg.predict_proba(test_X)
    # Compute ROC curve and area the curve
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, probas_[:, 1],pos_label=pos_label)
    roc_auc.append(auc(false_positive_rate, true_positive_rate))
    return accuracy, roc_auc

In [141]:
## Decision Tree
def tree_classifier(train, test, accuracy, roc_auc, Target_col,pos_label=None):
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    tree = DecisionTreeClassifier(random_state=0)
    tree.fit(X,y)
    y_pred = tree.predict(test_X)
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    probas_ = tree.predict_proba(test_X)
    # Compute ROC curve and area the curve
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, probas_[:, 1],pos_label=pos_label)
    roc_auc.append(auc(false_positive_rate, true_positive_rate))
    return accuracy, roc_auc

In [142]:
def model_build(filenum,Target_column, df_train, df_test, Directory,pos_label=None):
    accuracy_svm = []; roc_auc_svm = []
    accuracy_RF = []; roc_auc_RF = []
    accuracy_log = []; roc_auc_log = []
    accuracy_tree = []; roc_auc_tree = []
    Target_col = Target_column
    for i in range(1,11):
        trainF = df_train+ str(i) + '.csv'
        testF = df_test + str(i) + '.csv'
        train, test = read_file(trainF,testF,Directory)
        accuracy_svm, roc_auc_svm = svm_classifier(train, test, accuracy_svm, roc_auc_svm, Target_col,pos_label)
        accuracy_RF, roc_auc_RF = RF_classifier(train, test, accuracy_RF, roc_auc_RF, Target_col,pos_label)
        accuracy_log, roc_auc_log = log_classifier(train, test, accuracy_log, roc_auc_log, Target_col,pos_label)
        accuracy_tree, roc_auc_tree = tree_classifier(train, test, accuracy_tree, roc_auc_tree, Target_col,pos_label)
    print('Data set# ' + str(filenum))
    print('********** SVM classifier ***********')
    print('Accuracy mean   ' + 'Accuracy Stdev  ')
    print(np.array(accuracy_svm).mean(), np.array(accuracy_svm).std())
    print('AUC mean        ' + 'AUC      Stdev  ')
    print(np.array(roc_auc_svm).mean(), np.array(roc_auc_svm).std())
    print()
    print('********** RF classifier ************')
    print('Accuracy mean   ' + 'Accuracy Stdev  ')
    print(np.array(accuracy_RF).mean(), np.array(accuracy_RF).std())
    print('AUC mean        ' + 'AUC      Stdev  ')
    print(np.array(roc_auc_RF).mean(), np.array(roc_auc_RF).std())
    print()
    print('********** Logistic regression ******')
    print('Accuracy mean   ' + 'Accuracy Stdev  ')
    print(np.array(accuracy_log).mean(), np.array(accuracy_log).std())
    print('AUC mean        ' + 'AUC      Stdev  ')
    print(np.array(roc_auc_log).mean(), np.array(roc_auc_log).std())
    print()
    print('****** Decision Tree classifier *****')
    print('Accuracy mean   ' + 'Accuracy Stdev  ')
    print(np.array(accuracy_tree).mean(), np.array(accuracy_tree).std())
    print('AUC mean        ' + 'AUC      Stdev  ')
    print(np.array(roc_auc_tree).mean(), np.array(roc_auc_tree).std())
    print()

In [143]:
model_build(filenum=1,Target_column='Occupancy', df_train='d1_train', df_test='d1_test', Directory = "./Data Set 1/splits/")

Data set# 1
********** SVM classifier ***********
Accuracy mean   Accuracy Stdev  
0.877522564983 0.00510046226089
AUC mean        AUC      Stdev  
0.995785211979 0.000429191663448

********** RF classifier ************
Accuracy mean   Accuracy Stdev  
0.994209205785 0.00089930765821
AUC mean        AUC      Stdev  
0.999195619241 0.000233721065954

********** Logistic regression ******
Accuracy mean   Accuracy Stdev  
0.994026383508 0.000464904881194
AUC mean        AUC      Stdev  
0.996859217707 0.00037948927549

****** Decision Tree classifier *****
Accuracy mean   Accuracy Stdev  
0.991938114514 0.000964479038163
AUC mean        AUC      Stdev  
0.987173809891 0.0020925579289



In [144]:
model_build(filenum=3,Target_column='Class', df_train='d3_train', df_test='d3_test', Directory = "./Data Set 3/splits/",pos_label=1)

Data set# 3
********** SVM classifier ***********
Accuracy mean   Accuracy Stdev  
0.652113220864 0.00298484648335
AUC mean        AUC      Stdev  
0.371075740646 0.00711962078198

********** RF classifier ************
Accuracy mean   Accuracy Stdev  
0.871773643624 0.0017192476779
AUC mean        AUC      Stdev  
0.0712797093288 0.00129733205358

********** Logistic regression ******
Accuracy mean   Accuracy Stdev  
0.788136294373 0.00171746483583
AUC mean        AUC      Stdev  
0.161871481968 0.00156875652727

****** Decision Tree classifier *****
Accuracy mean   Accuracy Stdev  
0.809035565783 0.00425791593122
AUC mean        AUC      Stdev  
0.210574994051 0.00442274098834



In [149]:
model_build(filenum=5,Target_column='y', df_train='d5_train', df_test='d5_test', Directory = "./Data Set 5/splits/",pos_label=None)

Data set# 5
********** SVM classifier ***********
Accuracy mean   Accuracy Stdev  
0.883044455408 0.00133096376778
AUC mean        AUC      Stdev  
0.649459938988 0.0029217127845

********** RF classifier ************
Accuracy mean   Accuracy Stdev  
0.90116098434 0.00197204619918
AUC mean        AUC      Stdev  
0.926160645789 0.00220518553102

********** Logistic regression ******
Accuracy mean   Accuracy Stdev  
0.897261533395 0.00154966327141
AUC mean        AUC      Stdev  
0.878634676006 0.00266516832465

****** Decision Tree classifier *****
Accuracy mean   Accuracy Stdev  
0.872642525882 0.00162016116868
AUC mean        AUC      Stdev  
0.698964138213 0.00599464338353



In [160]:
model_build(filenum=9,Target_column='Income level', df_train='d9_train', df_test='d9_test', Directory = "./Data Set 9/splits/",pos_label=None)

Data set# 9
********** SVM classifier ***********
Accuracy mean   Accuracy Stdev  
0.81168356612 0.00178617472735
AUC mean        AUC      Stdev  
0.804264139279 0.00552444391591

********** RF classifier ************
Accuracy mean   Accuracy Stdev  
0.81266112612 0.00232561711037
AUC mean        AUC      Stdev  
0.859825038436 0.00180585459387

********** Logistic regression ******
Accuracy mean   Accuracy Stdev  
0.818570003596 0.00175720271485
AUC mean        AUC      Stdev  
0.864518128108 0.00148562203504

****** Decision Tree classifier *****
Accuracy mean   Accuracy Stdev  
0.770385156241 0.00210503493306
AUC mean        AUC      Stdev  
0.706391864289 0.00373292249475



In [166]:
model_build(filenum=11,Target_column='C7130', df_train='amlall_train', df_test='amlall_test', Directory = "./data11_amlalll/",pos_label='AML')

Data set# 11
********** SVM classifier ***********
Accuracy mean   Accuracy Stdev  
0.640140166605 0.0684837819889
AUC mean        AUC      Stdev  
0.5 0.0

********** RF classifier ************
Accuracy mean   Accuracy Stdev  
0.928418651785 0.0595627205114
AUC mean        AUC      Stdev  
0.999044117647 0.0019743708209

********** Logistic regression ******
Accuracy mean   Accuracy Stdev  
0.968803784382 0.0304010215156
AUC mean        AUC      Stdev  
0.99257918552 0.0158851243537

****** Decision Tree classifier *****
Accuracy mean   Accuracy Stdev  
0.885792156723 0.0460739755803
AUC mean        AUC      Stdev  
0.88696360339 0.0465043143248



In [167]:
model_build(filenum=13,Target_column='C7130', df_train='central_train', df_test='central_test', Directory = "./data13_central/",pos_label='Class1')

Data set# 13
********** SVM classifier ***********
Accuracy mean   Accuracy Stdev  
0.608064209519 0.139917957238
AUC mean        AUC      Stdev  
0.5 0.0

********** RF classifier ************
Accuracy mean   Accuracy Stdev  
0.583390362834 0.115612493992
AUC mean        AUC      Stdev  
0.54543077021 0.122658713957

********** Logistic regression ******
Accuracy mean   Accuracy Stdev  
0.60669917075 0.0900803543827
AUC mean        AUC      Stdev  
0.610881236819 0.101854537442

****** Decision Tree classifier *****
Accuracy mean   Accuracy Stdev  
0.554942127353 0.101606942254
AUC mean        AUC      Stdev  
0.542047658224 0.0602287538524



In [168]:
model_build(filenum=16,Target_column='C122', df_train='pros_train', df_test='pros_test', Directory = "./data16_pros/",pos_label=None)

Data set# 16
********** SVM classifier ***********
Accuracy mean   Accuracy Stdev  
0.921276547374 0.037188373285
AUC mean        AUC      Stdev  
0.965841648478 0.030981690192

********** RF classifier ************
Accuracy mean   Accuracy Stdev  
0.915864609462 0.0427633174354
AUC mean        AUC      Stdev  
0.960508190473 0.0289887651813

********** Logistic regression ******
Accuracy mean   Accuracy Stdev  
0.938578486627 0.0367554930389
AUC mean        AUC      Stdev  
0.981968332426 0.0203006221191

****** Decision Tree classifier *****
Accuracy mean   Accuracy Stdev  
0.834347346238 0.082435142252
AUC mean        AUC      Stdev  
0.836040684462 0.0850752664866

