In [None]:
import scipy.io as sio
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import svm
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn import preprocessing
import pandas as pd

In [None]:
# 4) Draw heatmaps for result of grid search and find 
#    best C for validation set.

def draw_heatmap_linear(acc, acc_desc, C_list):
    """Show an annotated heatmap of scores with one row per value of C.

    acc      -- 2-D array of scores passed straight to ``sns.heatmap``
                (callers in this notebook pass an (n, 1) column).
    acc_desc -- text placed in front of ' w.r.t $C$' in the figure title.
    C_list   -- values used as the row tick labels.
    """
    plt.figure(figsize=(2, 4))
    heatmap_axes = sns.heatmap(
        acc,
        annot=True,
        fmt='.3f',
        yticklabels=C_list,
        xticklabels=[],
    )
    colorbar = heatmap_axes.collections[0].colorbar
    colorbar.set_label("accuracy")
    heatmap_axes.set(ylabel='$C$')
    plt.title(acc_desc + ' w.r.t $C$')
    # The seaborn style is set after the heatmap has been drawn, as before.
    sns.set_style("whitegrid", {'axes.grid': False})
    plt.show()

In [None]:
def draw_heatmap_RF(acc, acc_desc, D_list):
    """Show an annotated heatmap of scores with one row per value of D.

    acc      -- 2-D array of scores handed to ``sns.heatmap``.
    acc_desc -- prefix of the figure title (' w.r.t $D$' is appended).
    D_list   -- values used as the row tick labels.
    """
    heatmap_options = dict(annot=True, fmt='.3f', yticklabels=D_list, xticklabels=[])

    plt.figure(figsize=(2, 4))
    axes = sns.heatmap(acc, **heatmap_options)
    axes.collections[0].colorbar.set_label("accuracy")
    axes.set(ylabel='$D$')
    plt.title(acc_desc + ' w.r.t $D$')
    # Style change is applied after drawing, same ordering as before.
    sns.set_style("whitegrid", {'axes.grid': False})
    plt.show()

In [None]:
def runLogisticRegression(X_train,Y_train,X_test,Y_test):
    """Tune C for logistic regression by 3-fold grid search and score on the test split.

    Draws heatmaps of the mean cross-validation train and validation scores
    for every C tried, then refits a LogisticRegression with the selected C
    on the whole training split.

    Returns:
        (test_acc, best_C): ``score`` of the refit model on (X_test, Y_test)
        and the C chosen by the grid search.
    """
    C_list = [10**-6, 10**-5, 10**-4, 10**-3, 10**-2, 10**-1] # Different C to try.
    params = {'C': C_list}
    # return_train_score must be a real boolean; the string 'true' only worked
    # by being truthy and is rejected by scikit-learn's parameter validation.
    grid_search = GridSearchCV(LogisticRegression(), params, cv=3, return_train_score=True, n_jobs=-1)
    grid_search.fit(X_train, Y_train)

    # One score per C value; reshape to a column so it plots as an (n, 1) heatmap.
    train_acc = (grid_search.cv_results_['mean_train_score']).reshape(-1,1)
    draw_heatmap_linear(train_acc, 'train accuracy', C_list)

    val_acc = (grid_search.cv_results_['mean_test_score']).reshape(-1,1)
    draw_heatmap_linear(val_acc, 'val accuracy', C_list)

    best_C = grid_search.best_params_['C']
    new_classifier = LogisticRegression(C = best_C)
    new_classifier.fit(X_train, Y_train)
    test_acc = new_classifier.score(X_test, Y_test)

    return test_acc, best_C

In [None]:
def runSVMlinear(X_train,Y_train,X_test,Y_test):
    """Tune C for a linear-kernel SVM by 3-fold grid search and score on the test split.

    Draws heatmaps of the mean cross-validation train and validation scores
    for every C tried, then refits a linear SVC with the selected C on the
    whole training split.

    Returns:
        (test_acc, best_C): ``score`` of the refit model on (X_test, Y_test)
        and the C chosen by the grid search.
    """
    SVM_classifier = svm.SVC(kernel = 'linear')
    C_list = [10**-6, 10**-5, 10**-4, 10**-3, 10**-2, 10**-1] # Different C to try.
    params = {'C': C_list}
    # return_train_score must be a real boolean; the string 'true' only worked
    # by being truthy and is rejected by scikit-learn's parameter validation.
    grid_search = GridSearchCV(SVM_classifier, params, cv=3, return_train_score=True, n_jobs=-1)
    grid_search.fit(X_train, Y_train)

    # One score per C value; reshape to a column so it plots as an (n, 1) heatmap.
    train_acc = (grid_search.cv_results_['mean_train_score']).reshape(-1,1)
    draw_heatmap_linear(train_acc, 'train accuracy', C_list)

    val_acc = (grid_search.cv_results_['mean_test_score']).reshape(-1,1)
    draw_heatmap_linear(val_acc, 'val accuracy', C_list)

    best_C = grid_search.best_params_['C']
    new_classifier = svm.SVC(kernel = 'linear', C = best_C)
    new_classifier.fit(X_train, Y_train)
    test_acc = new_classifier.score(X_test, Y_test)

    return test_acc, best_C

In [None]:
def runSVMrbf(X_train,Y_train,X_test,Y_test):
    """Tune C for an RBF-kernel SVM by 3-fold grid search and score on the test split.

    Draws heatmaps of the mean cross-validation train and validation scores
    for every C tried, then refits an RBF SVC with the selected C on the
    whole training split.

    Returns:
        (test_acc, best_C): ``score`` of the refit model on (X_test, Y_test)
        and the C chosen by the grid search.
    """
    kernel = 'rbf'
    SVM_classifier = svm.SVC(kernel = kernel)
    C_list = [10**-6, 10**-5, 10**-4, 10**-3, 10**-2, 10**-1] # Different C to try.
    params = {'C': C_list}
    # return_train_score must be a real boolean; the string 'true' only worked
    # by being truthy and is rejected by scikit-learn's parameter validation.
    grid_search = GridSearchCV(SVM_classifier, params, cv=3, return_train_score=True, n_jobs=-1)
    grid_search.fit(X_train, Y_train)

    # One score per C value; reshape to a column so it plots as an (n, 1) heatmap.
    train_acc = (grid_search.cv_results_['mean_train_score']).reshape(-1,1)
    draw_heatmap_linear(train_acc, 'train accuracy', C_list)

    val_acc = (grid_search.cv_results_['mean_test_score']).reshape(-1,1)
    draw_heatmap_linear(val_acc, 'val accuracy', C_list)

    best_C = grid_search.best_params_['C']
    # The final model must use the same kernel that was tuned; it previously
    # used 'linear', so the reported score belonged to a different model.
    new_classifier = svm.SVC(kernel = kernel, C = best_C)
    new_classifier.fit(X_train, Y_train)
    test_acc = new_classifier.score(X_test, Y_test)

    return test_acc, best_C

In [None]:
def runRandomForest(X_train, Y_train, X_test, Y_test):
    """Tune max_depth for a random forest by 5-fold grid search and score on the test split.

    Draws heatmaps of the mean cross-validation train and validation scores
    for every depth tried, then refits a forest with the selected depth on
    the whole training split.

    Returns:
        (test_acc, best_D): ``score`` of the refit forest on (X_test, Y_test)
        and the max_depth chosen by the grid search.
    """
    # Shared by the search and the final model so the reported test score
    # belongs to the same configuration that was validated (the final model
    # previously used 10 trees while the search used 1024).
    forest_options = dict(criterion="entropy", n_estimators=1024)

    D_list = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    parameters = {'max_depth':D_list}
    clf = RandomForestClassifier(**forest_options)
    grid_search = GridSearchCV(clf, parameters, cv=5, return_train_score=True, n_jobs=-1)
    grid_search.fit(X_train, Y_train)

    # One score per depth; reshape to a column so it plots as an (n, 1) heatmap.
    train_acc = (grid_search.cv_results_['mean_train_score']).reshape(-1,1)
    draw_heatmap_RF(train_acc, 'RF train accuracy', D_list)

    val_acc = grid_search.cv_results_['mean_test_score'].reshape(-1,1)
    draw_heatmap_RF(val_acc, 'RF val accuracy', D_list)

    # Refit with the best depth and evaluate on the held-out rows.
    best_D = grid_search.best_params_['max_depth']
    new_clf = RandomForestClassifier(max_depth=best_D, **forest_options)
    new_clf.fit(X_train, Y_train)
    test_acc = new_clf.score(X_test, Y_test)

    return test_acc, best_D

In [None]:
def runDecisionTree(X_train, Y_train, X_test, Y_test):
    """Tune max_depth for an entropy decision tree by 5-fold grid search and score on the test split.

    Draws heatmaps of the mean cross-validation train and validation scores
    for every depth tried, then refits a tree with the selected depth on the
    whole training split.

    Returns:
        (test_acc, best_D): ``score`` of the refit tree on (X_test, Y_test)
        and the max_depth chosen by the grid search.
    """
    criterion = "entropy"
    estimator = tree.DecisionTreeClassifier()
    D_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    params = {
                  "max_depth": D_list,
                  "criterion": [criterion]
                 }
    # return_train_score must be a real boolean; the string 'true' only worked
    # by being truthy and is rejected by scikit-learn's parameter validation.
    clf = GridSearchCV(estimator, param_grid=params, return_train_score=True, cv=5)
    clf.fit(X_train, Y_train)

    # The swept parameter is depth, so use the heatmap helper labelled $D$.
    train_acc = clf.cv_results_['mean_train_score'].reshape(-1,1)
    draw_heatmap_RF(train_acc, 'train accuracy', D_list)

    val_acc = clf.cv_results_['mean_test_score'].reshape(-1,1)
    draw_heatmap_RF(val_acc, 'val accuracy', D_list)

    best_D = clf.best_params_['max_depth']
    # Build the final tree with the selected depth and the same criterion.
    # Previously these were put in an unused dict and the tree was fit with
    # default settings, so the tuning had no effect on the reported score.
    new_estimator = tree.DecisionTreeClassifier(max_depth=best_D, criterion=criterion)
    new_estimator.fit(X_train, Y_train)
    test_acc = new_estimator.score(X_test, Y_test)
    return test_acc, best_D

In [None]:
#partitions the dataset and runs the models
def RunModels(X_shuffled, Y_shuffled, n_trials=3, partitions=(0.8, 0.5, 0.2)):
    """Run every classifier on several train/test partitions and average over trials.

    For each training fraction in ``partitions`` the data is reshuffled once
    per trial, split at that fraction, and passed to the five ``run*``
    functions. The per-model test accuracies are averaged over the trials,
    printed, and collected.

    Args:
        X_shuffled: feature rows (indexable, same length as ``Y_shuffled``).
        Y_shuffled: labels aligned with ``X_shuffled``.
        n_trials: number of reshuffled repetitions per partition.
        partitions: training fractions to evaluate, in order.

    Returns:
        A flat list with five averages per partition, in the order
        RBF SVM, linear SVM, logistic regression, random forest, decision tree.

    Raises:
        ValueError: if ``n_trials`` is smaller than 1.
    """
    if n_trials < 1:
        raise ValueError("n_trials must be at least 1")

    # (label used in the printout, function to call); order fixes both the
    # execution order and the order of the returned averages.
    runners = (
        ('SVM RBF', runSVMrbf),
        ('SVM Linear', runSVMlinear),
        ('LR', runLogisticRegression),
        ('RF', runRandomForest),
        ('DT', runDecisionTree),
    )

    all_results = []

    for split_num in partitions:
        scores = {label: [] for label, _ in runners}

        for trial in range(n_trials):
            # Reshuffle for every trial; without this each trial sliced the
            # same rows and the "average" repeated one split. Seeding with
            # the trial index keeps the runs reproducible.
            X_trial, Y_trial = shuffle(X_shuffled, Y_shuffled, random_state=trial)

            cutoff = int(split_num*len(X_trial))
            X_train, Y_train = X_trial[:cutoff], Y_trial[:cutoff]
            X_test, Y_test = X_trial[cutoff:], Y_trial[cutoff:]

            for label, run_model in runners:
                # Each runner returns (test accuracy, selected hyper-parameter).
                test_acc, _best_param = run_model(X_train, Y_train, X_test, Y_test)
                scores[label].append(test_acc)

        averages = [(label, sum(scores[label])/n_trials) for label, _ in runners]
        all_results.extend(avg for _, avg in averages)

        for label, avg in averages:
            print(label + ' - ','Partition:', split_num, '; avg accuracy:', avg)

    return all_results

### 1st Dataset (Iris)

In [None]:
# Load Iris and binarise the target: class index above 1.5 versus the rest.
iris = datasets.load_iris()
X = iris.data
Y = (iris.target > 1.5)[:, np.newaxis]

# Stack features and label side by side so one shuffle keeps rows aligned.
X_and_Y = np.hstack((X, Y))
np.random.seed(1)
np.random.shuffle(X_and_Y)

# Columns 0-3 are the features, column 4 is the label.
X_shuffled_iris, Y_shuffled_iris = X_and_Y[:, :4], X_and_Y[:, 4]
print(X_shuffled_iris.shape, Y_shuffled_iris.shape)

In [None]:
# Manual 80/20 split of the shuffled Iris rows; prints the four shapes.
cutoff = int(0.8*len(X))

X_train, X_test = X_shuffled_iris[:cutoff], X_shuffled_iris[cutoff:]
Y_train, Y_test = Y_shuffled_iris[:cutoff], Y_shuffled_iris[cutoff:]
print(*(part.shape for part in (X_train, Y_train, X_test, Y_test)), sep='\n')

In [None]:
# Run all five classifiers on the shuffled Iris data; RunModels also prints the per-partition averages.
iris_results = RunModels(X_shuffled_iris, Y_shuffled_iris)

In [None]:
# Flat list of averaged accuracies returned by RunModels (five values per partition).
print(iris_results)

### 2nd Dataset (letter recognition)

In [None]:
def letterToLabel(val):
    """Map a letter to a binary label: 1 for 'A'..'M', 0 for anything else.

    Accepts the letter either as ``bytes`` (e.g. ``b'A'``) or as ``str``
    (``'A'``), because ``np.loadtxt`` passes one or the other to converters
    depending on its ``encoding`` setting. Previously only ``bytes`` matched,
    so ``str`` input was silently labelled 0 for every letter.
    """
    first_half_alphabet = tuple('ABCDEFGHIJKLM')
    if isinstance(val, bytes):
        # latin1 maps every byte to a character, so decoding cannot fail.
        val = val.decode('latin1')
    return 1 if val in first_half_alphabet else 0

In [None]:
# Read the letter data; column 0 (the letter) is converted to a 0/1 label while loading.
letter_data = np.loadtxt(
    './letter-recognition.data',
    delimiter=',',
    converters={0: letterToLabel},
)
X_and_Y = letter_data  # same array object, so the shuffle below reorders letter_data too
np.random.seed(1)
np.random.shuffle(X_and_Y)

# Keep the first 5000 shuffled rows: label in column 0, features in columns 1-16.
X_shuffled_letters, Y_shuffled_letters = X_and_Y[:5000, 1:17], X_and_Y[:5000, 0]
print(X_shuffled_letters.shape, Y_shuffled_letters.shape)

In [None]:
# Run all five classifiers on the 5000-row letter subset; RunModels also prints the per-partition averages.
letter_results = RunModels(X_shuffled_letters,Y_shuffled_letters)

In [None]:
# Flat list of averaged accuracies returned by RunModels (five values per partition).
print(letter_results)

### 3rd Dataset (Adult)

In [None]:
# Read the headerless adult file, naming its 15 columns; '?' is treated as missing.
# NOTE(review): if the file separates fields with ", " the raw values carry a
# leading space and will not match '?' exactly — confirm against the data file.
adult_data = pd.read_csv(
    './adult.data',
    names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'above-50k',
    ],
    header=None,
    index_col=False,
    na_values='?',
)

In [None]:
# Remove every row that contains a missing value, modifying adult_data in place.
adult_data.dropna(inplace=True)

In [None]:
# (rows, columns) remaining after the dropna step.
adult_data.shape

In [None]:
# Per-column flag: True where np.isreal holds for every value in that column.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases
# (DataFrame.map replaces it) — confirm against the pandas version in use.
bool_mask = adult_data.applymap(np.isreal).all(0)

In [None]:
# Preview the first rows before the columns are encoded.
adult_data.head()

In [None]:
# Encode the adult frame into a numeric matrix:
#   [one-hot categorical features | numeric features | 0/1 label]
# The label stays in the LAST column because later cells read it from there.
#
# The previous version relied on OneHotEncoder(categorical_features=...), which
# current scikit-learn no longer accepts, and passed it a mask of the
# real-valued columns, so the numeric columns were the ones being one-hot
# encoded. It also tested `dtype != int`, which depends on the platform's
# default integer width.
le = preprocessing.LabelEncoder()
onehot = preprocessing.OneHotEncoder()

target_name = 'above-50k'
feature_frame = adult_data.drop(target_name, axis=1)

# dtype.kind is one of b/i/u/f for bool/int/unsigned/float columns; anything
# else (e.g. object strings) is treated as categorical.
categorical_columns = [name for name in feature_frame
                       if feature_frame[name].dtype.kind not in 'biuf']
numeric_columns = [name for name in feature_frame
                   if name not in categorical_columns]

# Integer-code each categorical column first so the one-hot step receives
# contiguous integer codes (0..k-1) regardless of scikit-learn version.
categorical_codes = np.hstack(
    [le.fit_transform(feature_frame[name]).reshape(-1, 1)
     for name in categorical_columns]
)
one_hot_features = onehot.fit_transform(categorical_codes).toarray()
numeric_features = feature_frame[numeric_columns].values.astype(float)

# Encode the label last so `le` ends up fitted on the target column.
encoded_target = le.fit_transform(adult_data[target_name]).reshape(-1, 1)

# adult_data is rebound from a DataFrame to a float ndarray here, as before.
adult_data = np.hstack((one_hot_features, numeric_features, encoded_target))
adult_data.shape

In [None]:
# Draw up to 2000 distinct rows from the encoded adult matrix.
# - The row count comes from the data instead of a hard-coded 32561, so the
#   cell keeps working if earlier filtering changes the number of rows.
# - replace=False avoids duplicate rows, which could otherwise end up in both
#   the training and the test split.
# - Seeding first makes the drawn subsample reproducible.
np.random.seed(1)
sample_size = min(2000, len(adult_data))
X_and_Y = adult_data[np.random.choice(len(adult_data), sample_size, replace=False), :]

In [None]:
# Shuffle the sampled adult rows with a fixed seed, then separate features from the label.
np.random.seed(1)
np.random.shuffle(X_and_Y)

# Every column but the last is a feature; the last column is the label.
X_shuffled_adult, Y_shuffled_adult = X_and_Y[:, :-1], X_and_Y[:, -1]
print(X_shuffled_adult.shape, Y_shuffled_adult.shape)

In [None]:
# Run all five classifiers on the sampled adult rows; RunModels also prints the per-partition averages.
adult_results = RunModels(X_shuffled_adult,Y_shuffled_adult)

In [None]:
# Flat list of averaged accuracies returned by RunModels (five values per partition).
print(adult_results)