### Dataset 2

In [1]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split, cross_val_score, StratifiedShuffleSplit 
from sklearn import preprocessing, metrics
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')


In [2]:
## Reading the file
def read_file(trainF,testF, Directory, Target_col,transform,drop_cols=None,categ_transform=None):
    train = pd.read_csv(Directory + trainF)
    test =  pd.read_csv(Directory + testF)
    if transform:
        lbl_enc = preprocessing.LabelEncoder()
        labels = train[Target_col].values
        labels = lbl_enc.fit_transform(labels)
        labels_test = test[Target_col].values
        labels_test = lbl_enc.fit_transform(labels_test)
        train.drop([Target_col],axis=1)
        test.drop([Target_col],axis=1)
        train[Target_col] = labels
        test[Target_col] = labels_test
    if drop_cols is not None:
        for i in drop_cols:
            train.drop([i],axis=1,inplace=True)
            test.drop([i],axis=1,inplace=True)
    if categ_transform is not None:
        for j in categ_transform:
            lbl_enc = preprocessing.LabelEncoder()
            labels = train[j].values
            labels = lbl_enc.fit_transform(labels)
            labels_test = test[j].values
            labels_test = lbl_enc.fit_transform(labels_test)
            train.drop([j],axis=1)
            test.drop([j],axis=1)
            train[j] = labels
            test[j] = labels_test
            
    return train, test

In [3]:
## SVM classifier
def svm_classifier(train, test, accuracy, roc_auc, Target_col):
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    random_state = np.random.RandomState(0)
    # Binarize the output
    y = label_binarize(y, classes=np.unique(y))
    test_labels = label_binarize(test_labels, classes=np.unique(test_labels))
    n_classes = y.shape[1]
    classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
                                 random_state=random_state))
    y_score = classifier.fit(X, y).decision_function(test_X)
    y_pred = classifier.predict(test_X)

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(test_labels[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(test_labels.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    return accuracy, roc_auc["micro"]


    """
    clf = svm.SVC(probability=True)
    clf.fit(X,y)
    y_pred = clf.predict(test_X)
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    probas_ = clf.predict_proba(test_X)
    # Compute ROC curve and area the curve
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, probas_[:, 1])
    roc_auc.append(auc(false_positive_rate, true_positive_rate))
    return accuracy, roc_auc
    """

In [4]:
## RF classifier
def RF_classifier(train, test, accuracy, roc_auc, Target_col):
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    
    ##Binarize the output
    y = label_binarize(y, classes=np.unique(y))
    test_labels = label_binarize(test_labels, classes=np.unique(test_labels))
    n_classes = y.shape[1]
    #RF = OneVsRestClassifier(RandomForestClassifier(n_estimators = 200, random_state = 100, 
    #                class_weight = 'balanced', oob_score = True))
    RF = RandomForestClassifier(n_estimators = 200, random_state = 0, 
                oob_score = True)
    RF.fit(X,y)
    y_pred = RF.predict(test_X)
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    return accuracy, roc_auc
    #y_score = RF.predict_proba(test_X)
    """
    probas_ = RF.predict_proba(test_X)
    
    # Compute ROC curve and area the curve
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, probas_)[:, 1]
    roc_auc.append(auc(false_positive_rate, true_positive_rate))
    return accuracy, roc_auc
    """

In [5]:
## Logistic Regression
def log_classifier(train, test, accuracy, roc_auc, Target_col):
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    logreg = linear_model.LogisticRegression()
    logreg.fit(X,y)
    y_pred = logreg.predict(test_X)
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    """
    probas_ = logreg.predict_proba(test_X)
    # Compute ROC curve and area the curve
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, probas_[:, 1])
    roc_auc.append(auc(false_positive_rate, true_positive_rate))
    """
    return accuracy, roc_auc

In [6]:
## Decision Tree
def tree_classifier(train, test, accuracy, roc_auc, Target_col):
    y = train[Target_col]
    X = train.drop([Target_col],axis=1)
    test_labels = test[Target_col]
    test_X = test.drop([Target_col],axis=1)
    tree = DecisionTreeClassifier(random_state=0)
    tree.fit(X,y)
    y_pred = tree.predict(test_X)
    accuracy.append(metrics.accuracy_score(test_labels, y_pred))
    """
    probas_ = tree.predict_proba(test_X)
    # Compute ROC curve and area the curve
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, probas_)[:, 1]
    roc_auc.append(auc(false_positive_rate, true_positive_rate))
    """
    return accuracy, roc_auc

In [7]:
def model_build(filenum,Target_column, df_train, df_test, Directory,drop_cols=None,categ_transform=None):
    accuracy_svm = []; roc_auc_svm = []
    accuracy_RF = []; roc_auc_RF = []
    accuracy_log = []; roc_auc_log = []
    accuracy_tree = []; roc_auc_tree = []
    Target_col = Target_column
    for i in range(1,11):
        trainF = df_train+ str(i) + '.csv'
        testF = df_test + str(i) + '.csv'
        train, test = read_file(trainF,testF,Directory, Target_col,transform=True,drop_cols=drop_cols,categ_transform=categ_transform)
        accuracy_svm, roc_auc_svm = svm_classifier(train, test, accuracy_svm, roc_auc_svm, Target_col)
        accuracy_RF, roc_auc_RF = RF_classifier(train, test, accuracy_RF, roc_auc_RF, Target_col)
        accuracy_log, roc_auc_log = log_classifier(train, test, accuracy_log, roc_auc_log, Target_col)
        accuracy_tree, roc_auc_tree = tree_classifier(train, test, accuracy_tree, roc_auc_tree, Target_col)
    print('Data set# ' + str(filenum))
    print('********** SVM classifier ***********')
    print('Accuracy mean   ' + 'Accuracy Stdev  ')
    print(np.array(accuracy_svm).mean(), np.array(accuracy_svm).std())
    print('AUC mean        ' + 'AUC      Stdev  ')
    print(np.array(roc_auc_svm).mean(), np.array(roc_auc_svm).std())
    print()

    print('********** RF classifier ************')
    print('Accuracy mean   ' + 'Accuracy Stdev  ')
    print(np.array(accuracy_RF).mean(), np.array(accuracy_RF).std())
    print('AUC mean        ' + 'AUC      Stdev  ')
    print(np.array(roc_auc_RF).mean(), np.array(roc_auc_RF).std())
    print()
    print('********** Logistic regression ******')
    print('Accuracy mean   ' + 'Accuracy Stdev  ')
    print(np.array(accuracy_log).mean(), np.array(accuracy_log).std())
    print('AUC mean        ' + 'AUC      Stdev  ')
    print(np.array(roc_auc_log).mean(), np.array(roc_auc_log).std())
    print()
    print('****** Decision Tree classifier *****')
    print('Accuracy mean   ' + 'Accuracy Stdev  ')
    print(np.array(accuracy_tree).mean(), np.array(accuracy_tree).std())
    print('AUC mean        ' + 'AUC      Stdev  ')
    print(np.array(roc_auc_tree).mean(), np.array(roc_auc_tree).std())
    print()
    


In [133]:
model_build(filenum=2,Target_column='letter', df_train='data2_train', df_test='data2_test', Directory = "./Data Set 2/splits/")

[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    2.4s
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed:    9.4s
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    9.4s finished
[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    2.3s
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed:    8.4s
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    8.4s finished
[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    2.2s
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed:    9.0s
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    9.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    1.8s
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed:    7.8s
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    7.8s finished
[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    2.3s
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed:    9.1s
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    9.2s finished
[Parallel(n_jobs=1)]: Done  49

Data set# 2
********** SVM classifier ***********
Accuracy mean   Accuracy Stdev  
0.353667709088 0.00684817781504
AUC mean        AUC      Stdev  
0.906650486049 0.0

********** RF classifier ************
Accuracy mean   Accuracy Stdev  
0.78412086085 0.00314022025978
AUC mean        AUC      Stdev  
nan nan

********** Logistic regression ******
Accuracy mean   Accuracy Stdev  
0.710795087387 0.00482357695207
AUC mean        AUC      Stdev  
nan nan

****** Decision Tree classifier *****
Accuracy mean   Accuracy Stdev  
0.820478288835 0.00518612409593
AUC mean        AUC      Stdev  
nan nan





In [None]:
model_build(filenum=4,Target_column='Activity', df_train='data4_train', df_test='data4_test', Directory = "./Data Set 4/splits/",drop_cols=['Tag_Identificator'],categ_transform=['Sequence_Name'])

In [None]:
model_build(filenum=6,Target_column='Class', df_train='d6_train', df_test='d6_test', Directory = "./Data Set 6/splits/",drop_cols=None,categ_transform=None)

In [None]:
model_build(filenum=7,Target_column='Class', df_train='d7_train', df_test='d7_test', Directory = "./Data Set 7/splits/",drop_cols=None,categ_transform=None)

In [None]:
model_build(filenum=12,Target_column='C118', df_train='brain_train', df_test='brain_test', Directory = "./data12_brain/",drop_cols=None,categ_transform=None)

In [None]:
model_build(filenum=14,Target_column='C2309', df_train='srbct_train', df_test='srbct_test', Directory = "./data14_srbct/",drop_cols=None,categ_transform=None)

In [None]:
model_build(filenum=15,Target_column='C94', df_train='lymph_train', df_test='lymph_test', Directory = "./data15_lymph/",drop_cols=None,categ_transform=None)