### Dataset 2

In [1]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split, cross_val_score, StratifiedShuffleSplit 
from sklearn import preprocessing, metrics
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
import timeit
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')


In [2]:
## Reading the file
def read_file(trainF,testF, Directory, Target_col,transform,drop_cols=None,categ_transform=None):
    train = pd.read_csv(Directory + trainF)
    test =  pd.read_csv(Directory + testF)
    if transform:
        lbl_enc = preprocessing.LabelEncoder()
        labels = train[Target_col].values
        labels = lbl_enc.fit_transform(labels)
        labels_test = test[Target_col].values
        labels_test = lbl_enc.fit_transform(labels_test)
        train.drop([Target_col],axis=1)
        test.drop([Target_col],axis=1)
        train[Target_col] = labels
        test[Target_col] = labels_test
    if drop_cols is not None:
        for i in drop_cols:
            train.drop([i],axis=1,inplace=True)
            test.drop([i],axis=1,inplace=True)
    if categ_transform is not None:
        for j in categ_transform:
            lbl_enc = preprocessing.LabelEncoder()
            labels = train[j].values
            labels = lbl_enc.fit_transform(labels)
            labels_test = test[j].values
            labels_test = lbl_enc.fit_transform(labels_test)
            train.drop([j],axis=1)
            test.drop([j],axis=1)
            train[j] = labels
            test[j] = labels_test
            
    return train, test

In [3]:
## SVM classifier
def svm_classifier(train, test, accuracy, roc_auc, Target_col):
    start_time = timeit.default_timer()
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    random_state = np.random.RandomState(0)
    # Binarize the output
    y = label_binarize(y, classes=np.unique(y))
    test_labels = label_binarize(test_labels, classes=np.unique(test_labels))
    n_classes = y.shape[1]
    classifier = OneVsRestClassifier(svm.SVC(probability=True))
    y_score = classifier.fit(X, y).decision_function(test_X)
    y_pred = classifier.predict(test_X)

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc_dict = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(test_labels[:, i], y_score[:, i])
        roc_auc_dict[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(test_labels.ravel(), y_score.ravel())
    roc_auc_dict["micro"] = auc(fpr["micro"], tpr["micro"])
    roc_auc.append(roc_auc_dict["micro"])
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    elapsed = (timeit.default_timer() - start_time)/60
    return accuracy, roc_auc, elapsed


    """
    clf = svm.SVC(probability=True)
    clf.fit(X,y)
    y_pred = clf.predict(test_X)
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    probas_ = clf.predict_proba(test_X)
    # Compute ROC curve and area the curve
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, probas_[:, 1])
    roc_auc.append(auc(false_positive_rate, true_positive_rate))
    return accuracy, roc_auc
    """

In [4]:
## RF classifier
def RF_classifier(train, test, accuracy, roc_auc, Target_col):
    start_time = timeit.default_timer()
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    
    ##Binarize the output
    y = label_binarize(y, classes=np.unique(y))
    test_labels = label_binarize(test_labels, classes=np.unique(test_labels))
    n_classes = y.shape[1]
    RF = OneVsRestClassifier(RandomForestClassifier(n_estimators = 200))
    #RF = RandomForestClassifier(n_estimators = 200, random_state = 0, 
    #            oob_score = True)
    
    RF.fit(X,y)
    y_score = RF.fit(X, y).predict_proba(test_X)
    y_pred = RF.predict(test_X)
    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc_dict = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(test_labels[:, i], y_score[:, i])
        roc_auc_dict[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(test_labels.ravel(), y_score.ravel())
    roc_auc_dict["micro"] = auc(fpr["micro"], tpr["micro"])
    roc_auc.append(roc_auc_dict["micro"])
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    elapsed = (timeit.default_timer() - start_time)/60
    return accuracy, roc_auc, elapsed
    """
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    elapsed = (timeit.default_timer() - start_time)/60
    #return accuracy, roc_auc, elapsed
    #y_score = RF.predict_proba(test_X)
    
    probas_ = RF.predict_proba(test_X)
    
    # Compute ROC curve and area the curve
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, probas_)[:, 1]
    roc_auc.append(auc(false_positive_rate, true_positive_rate))
    return accuracy, roc_auc, elapsed
    """
    

In [5]:
## Logistic Regression
def log_classifier(train, test, accuracy, roc_auc, Target_col):
    start_time = timeit.default_timer()
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    ##Binarize the output
    y = label_binarize(y, classes=np.unique(y))
    test_labels = label_binarize(test_labels, classes=np.unique(test_labels))
    n_classes = y.shape[1]
    #logreg = linear_model.LogisticRegression()
    logreg = OneVsRestClassifier(linear_model.LogisticRegression())
    logreg.fit(X,y)
    y_score = logreg.fit(X, y).predict_proba(test_X)
    y_pred = logreg.predict(test_X)
    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc_dict = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(test_labels[:, i], y_score[:, i])
        roc_auc_dict[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(test_labels.ravel(), y_score.ravel())
    roc_auc_dict["micro"] = auc(fpr["micro"], tpr["micro"])
    roc_auc.append(roc_auc_dict["micro"])
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    elapsed = (timeit.default_timer() - start_time)/60
    return accuracy, roc_auc, elapsed
    """
    logreg.fit(X,y)
    y_pred = logreg.predict(test_X)
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))

    probas_ = logreg.predict_proba(test_X)
    # Compute ROC curve and area the curve
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, probas_[:, 1])
    roc_auc.append(auc(false_positive_rate, true_positive_rate))

    elapsed = (timeit.default_timer() - start_time)/60
    return accuracy, roc_auc, elapsed
    """

In [6]:
## Decision Tree
def tree_classifier(train, test, accuracy, roc_auc, Target_col):
    start_time = timeit.default_timer()
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    ##Binarize the output
    y = label_binarize(y, classes=np.unique(y))
    test_labels = label_binarize(test_labels, classes=np.unique(test_labels))
    n_classes = y.shape[1]
    tree = OneVsRestClassifier(DecisionTreeClassifier(random_state=0))
    tree.fit(X,y)
    y_score = tree.fit(X, y).predict_proba(test_X)
    y_pred = tree.predict(test_X)
    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc_dict = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(test_labels[:, i], y_score[:, i])
        roc_auc_dict[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(test_labels.ravel(), y_score.ravel())
    roc_auc_dict["micro"] = auc(fpr["micro"], tpr["micro"])
    roc_auc.append(roc_auc_dict["micro"])
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    elapsed = (timeit.default_timer() - start_time)/60
    return accuracy, roc_auc, elapsed
    """
    #tree = DecisionTreeClassifier(random_state=0)
    tree.fit(X,y)
    y_pred = tree.predict(test_X)
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    
    probas_ = tree.predict_proba(test_X)
    # Compute ROC curve and area the curve
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, probas_)[:, 1]
    roc_auc.append(auc(false_positive_rate, true_positive_rate))
    
    elapsed = (timeit.default_timer() - start_time)/60
    return accuracy, roc_auc, elapsed
    """

In [7]:
def model_build(filenum,Target_column, df_train, df_test, Directory,drop_cols=None,categ_transform=None):
    accuracy_svm = []; roc_auc_svm = []
    accuracy_RF = []; roc_auc_RF = []
    accuracy_log = []; roc_auc_log = []
    accuracy_tree = []; roc_auc_tree = []
    elapsed_time_svm = [];elapsed_time_RF = []; elapsed_time_log = []; elapsed_time_tree = []
    Target_col = Target_column
    for i in range(1,11):
        trainF = df_train+ str(i) + '.csv'
        testF = df_test + str(i) + '.csv'
        train, test = read_file(trainF,testF,Directory, Target_col,transform=True,drop_cols=drop_cols,categ_transform=categ_transform)
        accuracy_svm, roc_auc_svm, elapsed = svm_classifier(train, test, accuracy_svm, roc_auc_svm, Target_col)
        elapsed_time_svm.append(elapsed)
        accuracy_RF, roc_auc_RF, elapsed = RF_classifier(train, test, accuracy_RF, roc_auc_RF, Target_col)
        elapsed_time_RF.append(elapsed)
        accuracy_log, roc_auc_log, elapsed = log_classifier(train, test, accuracy_log, roc_auc_log, Target_col)
        elapsed_time_log.append(elapsed)
        accuracy_tree, roc_auc_tree, elapsed = tree_classifier(train, test, accuracy_tree, roc_auc_tree, Target_col)
        elapsed_time_tree.append(elapsed)
    
    print('Data set# ' + str(filenum))
    print('********** SVM classifier ***********')
    print('Individual file accuracy for SVM')
    print(np.array(accuracy_svm))
    print('Individual time taken for SVM')
    print(np.array(elapsed_time_svm))
    print('Accuracy mean   ' + 'Accuracy Stdev  ')
    print(np.array(accuracy_svm).mean(), np.array(accuracy_svm).std())
    print('Individual file AUC for SVM')
    print(np.array(roc_auc_svm))
    print('AUC mean        ' + 'AUC      Stdev  ')
    print(np.array(roc_auc_svm).mean(), np.array(roc_auc_svm).std())
    print()
    print('********** RF classifier ************')
    print('Individual file accuracy for RF')
    print(np.array(accuracy_RF))
    print('Individual time taken for RF')
    print(np.array(elapsed_time_RF))
    print('Accuracy mean   ' + 'Accuracy Stdev  ')
    print(np.array(accuracy_RF).mean(), np.array(accuracy_RF).std())
    print('Individual file AUC for RF')
    print(np.array(roc_auc_RF))
    print('AUC mean        ' + 'AUC      Stdev  ')
    print(np.array(roc_auc_RF).mean(), np.array(roc_auc_RF).std())
    print()
    
    print('********** Logistic regression ******')
    print('Individual file accuracy for log')
    print(np.array(accuracy_log))
    print('Individual time taken for log')
    print(np.array(elapsed_time_log))
    print('Accuracy mean   ' + 'Accuracy Stdev  ')
    print(np.array(accuracy_log).mean(), np.array(accuracy_log).std())
    print('Individual file AUC for log')
    print(np.array(roc_auc_log))
    print('AUC mean        ' + 'AUC      Stdev  ')
    print(np.array(roc_auc_log).mean(), np.array(roc_auc_log).std())
    print()
    print('****** Decision Tree classifier *****')
    print('Individual file accuracy for Tree')
    print(np.array(accuracy_tree))
    print('Individual time taken for tree')
    print(np.array(elapsed_time_tree))
    print('Accuracy mean   ' + 'Accuracy Stdev  ')
    print(np.array(accuracy_tree).mean(), np.array(accuracy_tree).std())
    print('Individual file AUC for tree')
    print(np.array(roc_auc_tree))
    print('AUC mean        ' + 'AUC      Stdev  ')
    print(np.array(roc_auc_tree).mean(), np.array(roc_auc_tree).std())
    print()
    


In [8]:
model_build(filenum=2,Target_column='letter', df_train='data2_train', df_test='data2_test', Directory = "./Data Set 2/splits/")

Data set# 2
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.84315063  0.84329819  0.84483803  0.84574914  0.84754248  0.84330613
  0.84451868  0.84993179  0.84549713  0.83630458]
Individual time taken for SVM
[ 1.80775594  1.7919381   1.83458662  1.87202981  1.87198792  1.78481381
  1.88033565  1.86869355  1.84165476  1.83986859]
Accuracy mean   Accuracy Stdev  
0.844413677446 0.00336468542117
Individual file AUC for SVM
[ 0.99633476  0.99523137  0.99644127  0.99576869  0.99452019  0.99564822
  0.99656245  0.99586034  0.99495973  0.99544933]
AUC mean        AUC      Stdev  
0.995677634202 0.0006282305925

********** RF classifier ************
Individual file accuracy for RF
[ 0.78538397  0.78245482  0.7897366   0.7891727   0.78739381  0.78785131
  0.79183116  0.78603911  0.78966455  0.78440852]
Individual time taken for RF
[ 1.17536817  1.1742386   1.21403605  1.22372262  1.22805581  1.17875928
  1.19271632  1.21153584  1.1981397   1.21390193]
Accuracy mean  

In [None]:
model_build(filenum=4,Target_column='Activity', df_train='data4_train', df_test='data4_test', Directory = "./Data Set 4/splits/",drop_cols=['Tag_Identificator'],categ_transform=['Sequence_Name'])

In [None]:
model_build(filenum=6,Target_column='Class', df_train='d6_train', df_test='d6_test', Directory = "./Data Set 6/splits/",drop_cols=None,categ_transform=None)

In [23]:
model_build(filenum=7,Target_column='Class', df_train='d7_train', df_test='d7_test', Directory = "./Data Set 7/splits/",drop_cols=None,categ_transform=None)

Data set# 7
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
Individual time taken for SVM
[ 1.80806701  1.78822973  1.7746461   1.78489719  1.77833806  1.83683679
  1.83002356  1.9618278   1.99587842  1.89564617]
Accuracy mean   Accuracy Stdev  
0.0 0.0
Individual file AUC for SVM
[ 0.53402186  0.53967774  0.53528371  0.54013237  0.53685979  0.53474391
  0.53211638  0.5353484   0.54112735  0.53704218]
AUC mean        AUC      Stdev  
0.536635369139 0.00276027966934

********** RF classifier ************
Individual file accuracy for RF
[ 0.96674058  0.96991664  0.96648965  0.96632265  0.96678163  0.96758748
  0.9690408   0.96832332  0.96706696  0.96878383]
Individual time taken for RF
[ 0.42493262  0.41933029  0.43414287  0.41214833  0.42866112  0.43097503
  0.42579859  0.46625403  0.45141974  0.40434878]
Accuracy mean   Accuracy Stdev  
0.967705353729 0.00117530807564
Individual file AUC for RF
[ 0.99957075  0.99950162  0

In [22]:
model_build(filenum=12,Target_column='C118', df_train='brain_train', df_test='brain_test', Directory = "./data12_brain/",drop_cols=None,categ_transform=None)

Data set# 12
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.61111111  0.44444444  0.58333333  0.29411765  0.73333333  0.46666667
  0.64285714  0.66666667  0.5         0.8       ]
Individual time taken for SVM
[ 0.00049301  0.00041339  0.00032148  0.00043569  0.0003898   0.00032332
  0.00034136  0.00034452  0.00032489  0.00035014]
Accuracy mean   Accuracy Stdev  
0.574253034547 0.142608012281
Individual file AUC for SVM
[ 1.          0.99691358  0.99305556  0.71020761  0.99666667  0.99444444
  0.99659864  1.          0.94628906  0.98875   ]
AUC mean        AUC      Stdev  
0.962292556133 0.0853720316103

********** RF classifier ************
Individual file accuracy for RF
[ 0.88888889  0.77777778  0.83333333  0.35294118  0.93333333  0.73333333
  0.92857143  0.83333333  0.5625      0.85      ]
Individual time taken for RF
[ 0.03822374  0.03650697  0.03758509  0.03847351  0.03679219  0.03535749
  0.02835335  0.03671864  0.03588072  0.03584132]
Accuracy mean   

In [21]:
model_build(filenum=14,Target_column='C2309', df_train='srbct_train', df_test='srbct_test', Directory = "./data14_srbct/",drop_cols=None,categ_transform=None)

Data set# 14
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.57894737  0.57894737  0.69565217  0.73684211  0.6         0.52173913
  0.55172414  0.66666667  0.60869565  0.68421053]
Individual time taken for SVM
[ 0.00243665  0.00136244  0.00210215  0.00236285  0.00247499  0.00218088
  0.00186131  0.00240589  0.00211565  0.00231347]
Accuracy mean   Accuracy Stdev  
0.622342512954 0.0662090945461
Individual file AUC for SVM
[ 1.          0.98891967  0.99873976  0.98522622  0.9925      0.99558916
  0.99088387  0.99897119  1.          0.98799631]
AUC mean        AUC      Stdev  
0.993882618192 0.00521880530596

********** RF classifier ************
Individual file accuracy for RF
[ 0.78947368  0.73684211  0.65217391  0.68421053  0.75        0.65217391
  0.65517241  1.          0.73913043  0.63157895]
Individual time taken for RF
[ 0.03578797  0.02527542  0.03571479  0.03513989  0.03540471  0.03593001
  0.03416823  0.03587704  0.03386847  0.03405723]
Accuracy mean 

In [20]:
model_build(filenum=15,Target_column='C94', df_train='lymph_train', df_test='lymph_test', Directory = "./data15_lymph/",drop_cols=None,categ_transform=None)

Data set# 15
********** SVM classifier ***********
Individual file accuracy for SVM
[ 0.96153846  1.          0.94736842  0.89473684  0.95833333  0.94736842
  0.95652174  0.9         0.92592593  0.94117647]
Individual time taken for SVM
[ 0.00031356  0.00028601  0.0002832   0.00028192  0.00026745  0.00028294
  0.00028612  0.00026408  0.00028209  0.00028966]
Accuracy mean   Accuracy Stdev  
0.943296961473 0.0291909046027
Individual file AUC for SVM
[ 1.          1.          0.99445983  1.          1.          0.99861496
  1.          0.99388889  0.99931413  0.9982699 ]
AUC mean        AUC      Stdev  
0.998454770627 0.00222584367881

********** RF classifier ************
Individual file accuracy for RF
[ 0.92307692  0.94117647  0.89473684  0.94736842  0.91666667  1.
  0.95652174  0.9         0.92592593  0.94117647]
Individual time taken for RF
[ 0.02175677  0.02273735  0.02189296  0.02130092  0.02184882  0.02151644
  0.02314869  0.02342209  0.02224945  0.02201629]
Accuracy mean   Accura