### Dataset 1

In [41]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split, cross_val_score, StratifiedShuffleSplit 
from sklearn import preprocessing, metrics
from sklearn.metrics import roc_curve, auc
import timeit
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')


In [42]:
## Reading the file
def read_file(trainF,testF, Directory):
    #Directory = "./Data Set 1/splits/"
    train = pd.read_csv(Directory + trainF)
    test =  pd.read_csv(Directory + testF)
    return train, test

In [43]:
## SVM classifier
def svm_classifier(train, test, accuracy, roc_auc, Target_col,pos_label=None):
    start_time = timeit.default_timer()
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    clf = svm.SVC(probability=True)
    clf.fit(X,y)
    y_pred = clf.predict(test_X)
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    probas_ = clf.predict_proba(test_X)
    # Compute ROC curve and area the curve
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, probas_[:, 1],pos_label=pos_label)
    roc_auc.append(auc(false_positive_rate, true_positive_rate))
    elapsed = (timeit.default_timer() - start_time)/60
    return accuracy, roc_auc, elapsed

In [44]:
## RF classifier
def RF_classifier(train, test, accuracy, roc_auc, Target_col,pos_label=None):
    start_time = timeit.default_timer()
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    RF = RandomForestClassifier(n_estimators=10)
    RF.fit(X,y)
    y_pred = RF.predict(test_X)
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    probas_ = RF.predict_proba(test_X)
    # Compute ROC curve and area the curve
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, probas_[:, 1],pos_label=pos_label)
    roc_auc.append(auc(false_positive_rate, true_positive_rate))
    elapsed = (timeit.default_timer() - start_time)/60
    return accuracy, roc_auc, elapsed

In [45]:
## Logistic Regression
def log_classifier(train, test, accuracy, roc_auc, Target_col,pos_label=None):
    start_time = timeit.default_timer()
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    logreg = linear_model.LogisticRegression()
    logreg.fit(X,y)
    y_pred = logreg.predict(test_X)
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    probas_ = logreg.predict_proba(test_X)
    # Compute ROC curve and area the curve
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, probas_[:, 1],pos_label=pos_label)
    roc_auc.append(auc(false_positive_rate, true_positive_rate))
    elapsed = (timeit.default_timer() - start_time)/60
    return accuracy, roc_auc, elapsed

In [46]:
## Decision Tree
def tree_classifier(train, test, accuracy, roc_auc, Target_col,pos_label=None):
    start_time = timeit.default_timer()
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    tree = DecisionTreeClassifier(random_state=0)
    tree.fit(X,y)
    y_pred = tree.predict(test_X)
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    probas_ = tree.predict_proba(test_X)
    # Compute ROC curve and area the curve
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, probas_[:, 1],pos_label=pos_label)
    roc_auc.append(auc(false_positive_rate, true_positive_rate))
    elapsed = (timeit.default_timer() - start_time)/60
    return accuracy, roc_auc, elapsed

In [47]:
def model_build(filenum,Target_column, df_train, df_test, Directory,pos_label=None):
    accuracy_svm = []; roc_auc_svm = []
    accuracy_RF = []; roc_auc_RF = []
    accuracy_log = []; roc_auc_log = []
    accuracy_tree = []; roc_auc_tree = []
    Target_col = Target_column
    elapsed_time_svm = [];elapsed_time_RF = []; elapsed_time_log = []; elapsed_time_tree = []
    for i in range(1,11):
        trainF = df_train+ str(i) + '.csv'
        testF = df_test + str(i) + '.csv'
        train, test = read_file(trainF,testF,Directory)
        accuracy_svm, roc_auc_svm, elapsed = svm_classifier(train, test, accuracy_svm, roc_auc_svm, Target_col,pos_label)
        elapsed_time_svm.append(elapsed)
        accuracy_RF, roc_auc_RF, elapsed = RF_classifier(train, test, accuracy_RF, roc_auc_RF, Target_col,pos_label)
        elapsed_time_RF.append(elapsed)
        accuracy_log, roc_auc_log, elapsed = log_classifier(train, test, accuracy_log, roc_auc_log, Target_col,pos_label)
        elapsed_time_log.append(elapsed)
        accuracy_tree, roc_auc_tree, elapsed = tree_classifier(train, test, accuracy_tree, roc_auc_tree, Target_col,pos_label)
        elapsed_time_tree.append(elapsed)
    print('Data set# ' + str(filenum))
    print('********** SVM classifier ***********')
    print('Individual file accuracy for SVM')
    print(np.array(accuracy_svm))
    print('Individual time taken for SVM')
    print(np.array(elapsed_time_svm))
    print('Accuracy mean   ' + 'Accuracy Stdev  ')
    print(np.array(accuracy_svm).mean(), np.array(accuracy_svm).std())
    print('Individual file AUC for SVM')
    print(np.array(roc_auc_svm))
    print('AUC mean        ' + 'AUC      Stdev  ')
    print(np.array(roc_auc_svm).mean(), np.array(roc_auc_svm).std())
    print()
    print('********** RF classifier ************')
    print('Individual file accuracy for RF')
    print(np.array(accuracy_RF))
    print('Individual time taken for RF')
    print(np.array(elapsed_time_RF))
    print('Accuracy mean   ' + 'Accuracy Stdev  ')
    print(np.array(accuracy_RF).mean(), np.array(accuracy_RF).std())
    print('Individual file AUC for RF')
    print(np.array(roc_auc_RF))
    print('AUC mean        ' + 'AUC      Stdev  ')
    print(np.array(roc_auc_RF).mean(), np.array(roc_auc_RF).std())
    print()
    print('********** Logistic regression ******')
    print('Individual file accuracy for log')
    print(np.array(accuracy_log))
    print('Individual time taken for log')
    print(np.array(elapsed_time_log))
    print('Accuracy mean   ' + 'Accuracy Stdev  ')
    print(np.array(accuracy_log).mean(), np.array(accuracy_log).std())
    print('Individual file AUC for log')
    print(np.array(roc_auc_log))
    print('AUC mean        ' + 'AUC      Stdev  ')
    print(np.array(roc_auc_log).mean(), np.array(roc_auc_log).std())
    print()
    print('****** Decision Tree classifier *****')
    print('Individual file accuracy for Tree')
    print(np.array(accuracy_tree))
    print('Individual time taken for tree')
    print(np.array(elapsed_time_tree))
    print('Accuracy mean   ' + 'Accuracy Stdev  ')
    print(np.array(accuracy_tree).mean(), np.array(accuracy_tree).std())
    print('Individual file AUC for tree')
    print(np.array(roc_auc_tree))
    print('AUC mean        ' + 'AUC      Stdev  ')
    print(np.array(roc_auc_tree).mean(), np.array(roc_auc_tree).std())
    print()

In [48]:
model_build(filenum=1,Target_column='Occupancy', df_train='d1_train', df_test='d1_test', Directory = "./Data Set 1/splits/")

Data set# 1
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.8736428   0.8814401   0.87593867  0.86993769  0.88263208  0.87268232
  0.87398061  0.88678665  0.88199721  0.87618751]
Individual time taken for SVM
[ 0.04675329  0.04620554  0.05277796  0.04446925  0.04760308  0.05691431
  0.04533729  0.04217618  0.04237784  0.04456828]
Accuracy mean   Accuracy Stdev  
0.877522564983 0.00510046226089
Individual file AUC for SVM
[ 0.9957393   0.99466022  0.99555393  0.9962568   0.99610718  0.99633515
  0.99601765  0.99558157  0.99604188  0.99549901]
AUC mean        AUC      Stdev  
0.995779266483 0.000468254654886

********** RF classifier ************
Individual file accuracy for RF
[ 0.99323367  0.99317194  0.99405507  0.99392523  0.99335653  0.99443758
  0.99430682  0.9924627   0.9945728   0.99283601]
Individual time taken for RF
[ 0.00066757  0.00056231  0.00097797  0.00055944  0.00058969  0.00099507
  0.00058577  0.00059423  0.00055853  0.00055498]
Accuracy mean

In [49]:
model_build(filenum=3,Target_column='Class', df_train='d3_train', df_test='d3_test', Directory = "./Data Set 3/splits/",pos_label=1)

Data set# 3
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.65435127  0.65266598  0.64939317  0.65668568  0.6514536   0.65125684
  0.64670517  0.64893702  0.65464289  0.65504061]
Individual time taken for SVM
[ 0.28335663  0.30746987  0.28940549  0.2850046   0.29149578  0.27810452
  0.27704248  0.28168563  0.30506648  0.29733796]
Accuracy mean   Accuracy Stdev  
0.652113220864 0.00298484648335
Individual file AUC for SVM
[ 0.37294313  0.37304327  0.3653244   0.37226842  0.35723433  0.37715698
  0.36799928  0.3754123   0.36061487  0.37923272]
AUC mean        AUC      Stdev  
0.370122969916 0.00682336841507

********** RF classifier ************
Individual file accuracy for RF
[ 0.8602057   0.85741612  0.85611626  0.86280101  0.85862206  0.85964634
  0.86005854  0.85874672  0.86483244  0.85953177]
Individual time taken for RF
[ 0.00393801  0.00422306  0.00395204  0.00520507  0.00389029  0.00395871
  0.00378911  0.00425921  0.00496832  0.00433515]
Accuracy mean 

In [50]:
model_build(filenum=5,Target_column='y', df_train='d5_train', df_test='d5_test', Directory = "./Data Set 5/splits/",pos_label=None)

Data set# 5
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.88200946  0.88379154  0.88233526  0.88332886  0.88207405  0.8854829
  0.88086147  0.88217422  0.88371859  0.88466821]
Individual time taken for SVM
[ 4.01382222  4.18249362  2.92707519  3.68961618  3.29216198  3.77185245
  3.27954877  3.97864     4.31341266  3.08516301]
Accuracy mean   Accuracy Stdev  
0.883044455408 0.00133096376778
Individual file AUC for SVM
[ 0.6485799   0.65346539  0.6506092   0.64961236  0.65034491  0.64661011
  0.65630505  0.64866974  0.65456289  0.64668176]
AUC mean        AUC      Stdev  
0.650544131562 0.0031072573009

********** RF classifier ************
Individual file accuracy for RF
[ 0.89637236  0.8996063   0.89733142  0.89471386  0.89779974  0.90060906
  0.89533092  0.89636406  0.8974402   0.89960808]
Individual time taken for RF
[ 0.00579463  0.00491222  0.00455308  0.00442722  0.00514482  0.00477882
  0.00562586  0.00556904  0.00491376  0.00471096]
Accuracy mean   

In [51]:
model_build(filenum=9,Target_column='Income level', df_train='d9_train', df_test='d9_test', Directory = "./Data Set 9/splits/",pos_label=None)

Data set# 9
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.81138371  0.81308733  0.81314879  0.81264717  0.81048567  0.81048567
  0.80903491  0.81437599  0.81320708  0.80897935]
Individual time taken for SVM
[ 0.60599818  0.60534496  0.6003244   0.61465725  0.63792905  0.64799291
  0.62162154  0.63000625  0.64237411  0.63505469]
Accuracy mean   Accuracy Stdev  
0.81168356612 0.00178617472735
Individual file AUC for SVM
[ 0.80556292  0.80504737  0.8011208   0.80566687  0.80115782  0.80115725
  0.79518648  0.81614439  0.81046384  0.80113786]
AUC mean        AUC      Stdev  
0.804264559446 0.00552453300227

********** RF classifier ************
Individual file accuracy for RF
[ 0.80897939  0.81170972  0.80711814  0.80906593  0.80553493  0.80850537
  0.80355921  0.80958728  0.80946923  0.80656449]
Individual time taken for RF
[ 0.00281206  0.00315481  0.00324888  0.00331544  0.00335002  0.00327815
  0.00320811  0.00318202  0.00327671  0.00338907]
Accuracy mean  

In [52]:
model_build(filenum=11,Target_column='C7130', df_train='amlall_train', df_test='amlall_test', Directory = "./data11_amlalll/",pos_label='AML')

Data set# 11
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.77272727  0.65384615  0.73529412  0.58333333  0.6         0.56521739
  0.68        0.55172414  0.62962963  0.62962963]
Individual time taken for SVM
[ 0.00345043  0.00266828  0.00218912  0.00280312  0.00301105  0.00274211
  0.00258337  0.00263196  0.00245584  0.00271044]
Accuracy mean   Accuracy Stdev  
0.640140166605 0.0684837819889
Individual file AUC for SVM
[ 0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5]
AUC mean        AUC      Stdev  
0.5 0.0

********** RF classifier ************
Individual file accuracy for RF
[ 1.          0.73076923  0.94117647  0.70833333  0.85        0.82608696
  0.96        0.75862069  0.85185185  0.85185185]
Individual time taken for RF
[ 0.00051153  0.00048832  0.00036571  0.0004217   0.0005151   0.00046628
  0.00039455  0.00061939  0.00047988  0.00037891]
Accuracy mean   Accuracy Stdev  
0.847869038457 0.0927804021935
Individual file AUC for RF
[ 1.          0.88

In [53]:
model_build(filenum=13,Target_column='C7130', df_train='central_train', df_test='central_test', Directory = "./data13_central/",pos_label='Class1')

Data set# 13
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.76470588  0.80952381  0.65217391  0.61538462  0.58333333  0.47619048
  0.66666667  0.35294118  0.72222222  0.4375    ]
Individual time taken for SVM
[ 0.00228073  0.00171291  0.00180211  0.00225212  0.00206153  0.00163843
  0.00246381  0.00184369  0.00185882  0.00190199]
Accuracy mean   Accuracy Stdev  
0.608064209519 0.139917957238
Individual file AUC for SVM
[ 0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5]
AUC mean        AUC      Stdev  
0.5 0.0

********** RF classifier ************
Individual file accuracy for RF
[ 0.64705882  0.61904762  0.65217391  0.53846154  0.66666667  0.61904762
  0.58333333  0.29411765  0.5         0.5625    ]
Individual time taken for RF
[ 0.00049568  0.00055952  0.00042692  0.0004385   0.00063506  0.00066838
  0.00060843  0.00044734  0.00056144  0.00040717]
Accuracy mean   Accuracy Stdev  
0.568240716019 0.104429507697
Individual file AUC for RF
[ 0.64423077  0.4705

In [54]:
model_build(filenum=16,Target_column='C122', df_train='pros_train', df_test='pros_test', Directory = "./data16_pros/",pos_label=None)

Data set# 16
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.9047619   0.91666667  0.93333333  1.          0.91666667  0.88636364
  0.86486486  0.95121951  0.88888889  0.95      ]
Individual time taken for SVM
[ 0.00013107  0.00016779  0.00012704  0.00014523  0.00014577  0.00012404
  0.00013696  0.00020284  0.00010011  0.00010583]
Accuracy mean   Accuracy Stdev  
0.921276547374 0.037188373285
Individual file AUC for SVM
[ 0.97685185  0.90909091  0.98529412  1.          0.9845679   0.9465812
  0.9122807   0.99761905  0.97119342  0.97493734]
AUC mean        AUC      Stdev  
0.965841648478 0.030981690192

********** RF classifier ************
Individual file accuracy for RF
[ 0.71428571  0.83333333  0.8         0.88888889  0.88888889  0.79545455
  0.64864865  0.90243902  0.88888889  0.9       ]
Individual time taken for RF
[ 0.00041003  0.00038356  0.00039032  0.00030679  0.00029837  0.00029044
  0.00026162  0.00027012  0.00032769  0.00022628]
Accuracy mean   Ac