# Data Competition - Milestone 3

In [1]:
import numpy as np
import pandas as pd
import pickle
import urllib.request

from IPython.core.interactiveshell import InteractiveShell
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsOneClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

In [2]:
# load data
# NOTE(security): pickle.load can execute arbitrary code embedded in the payload,
# so these URLs must only ever point at a bucket you trust.
DATA_URL = 'https://s3.amazonaws.com/stat558drjordankaggle/'


def load_pickle_from_url(url):
    """Download `url` and unpickle its payload, closing the HTTP response afterwards."""
    with urllib.request.urlopen(url) as response:
        return pickle.load(response)


x = load_pickle_from_url(DATA_URL + 'train_features')
test_set = load_pickle_from_url(DATA_URL + 'test_features')
# labels are converted to a float array
y = np.array(load_pickle_from_url(DATA_URL + 'train_labels'))
y = y.astype(float)

# image transformations not used for this milestone
# color_set = load_pickle_from_url(DATA_URL + 'color_features')
# compress_set = load_pickle_from_url(DATA_URL + 'compress_features')
# crop_set = load_pickle_from_url(DATA_URL + 'crop_features')
# crop_to_corner_set = load_pickle_from_url(DATA_URL + 'crop_to_corner_features')
# homography_set = load_pickle_from_url(DATA_URL + 'homography_features')
# mirror_set = load_pickle_from_url(DATA_URL + 'mirror_features')
# rotate30_set = load_pickle_from_url(DATA_URL + 'rotate30_features')
# scale_set = load_pickle_from_url(DATA_URL + 'scale_features')

# subset the data if looking to analyze a smaller set of classes
# (np.unique(y) keeps every class; replace it with an explicit array of labels to restrict)
classes = np.unique(y)
# positions of the samples whose label is one of `classes`
index = np.flatnonzero(np.isin(y, classes))
x_subset = x[index]
y_subset = y[index]

In [3]:
def split_data_equal(x, y, test_set, train_size=0.75, random_state=0):
    """Stratified train/test split followed by standardization.

    The scaler is fitted on the training split only and then applied to the
    held-out split and to `test_set`, so the scaling statistics never see
    held-out data.

    Parameters
    ----------
    x : feature matrix to split.
    y : labels for `x`; also used to stratify the split.
    test_set : additional feature matrix transformed with the same scaler.
    train_size : forwarded to `train_test_split`.
    random_state : seed forwarded to `train_test_split`; the default of 0
        reproduces the split this notebook has always used.

    Returns
    -------
    x_train, x_test, y_train, y_test, test_set
        The two scaled splits of `x`, the matching label splits, and the
        scaled `test_set`.
    """
    # split into train and test sets, keeping class proportions equal in both
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, train_size=train_size, random_state=random_state, stratify=y
    )

    # center and standardize x values using training-split statistics only
    x_scaler = StandardScaler().fit(x_train)
    x_train = x_scaler.transform(x_train)
    x_test = x_scaler.transform(x_test)
    test_set = x_scaler.transform(test_set)

    return x_train, x_test, y_train, y_test, test_set


def decomp_PCA(train, test, test_set, explained_var_threshold=0.95):
    """Project the data onto the leading principal components of `train`.

    The number of components kept is the smallest k whose first k components
    together explain at least `explained_var_threshold` of the variance of
    `train`. If the threshold cannot be reached (for example 1.0 when the
    ratios sum to slightly less because of round-off), every component is
    kept instead of running past the end of the spectrum. At least one
    component is always kept.

    Parameters
    ----------
    train : matrix the PCA is fitted on.
    test, test_set : matrices projected with the PCA fitted on `train`.
    explained_var_threshold : target cumulative explained-variance ratio.

    Returns
    -------
    x_train, x_test, test_set
        The three inputs transformed by the fitted PCA.
    """
    # first pass: fit with all components to obtain the explained-variance spectrum
    cumulative_var = np.cumsum(PCA().fit(train).explained_variance_ratio_)

    # index of the first cumulative value >= threshold, converted to a count
    num_component_vectors = int(
        np.searchsorted(cumulative_var, explained_var_threshold, side='left')
    ) + 1
    # searchsorted returns len(cumulative_var) when the threshold is never reached
    num_component_vectors = min(num_component_vectors, len(cumulative_var))
    #print('# Component Vectors: %d    Explained Var: %f' % (num_component_vectors, cumulative_var[num_component_vectors - 1]))

    # second pass: refit with only the selected number of components
    pca = PCA(n_components=num_component_vectors).fit(train)
    x_train = pca.transform(train)
    x_test = pca.transform(test)
    test_set = pca.transform(test_set)

    return x_train, x_test, test_set


def accuracy_misclass_error(predict, actual):
    """Return (accuracy, misclassification error), both in percent.

    Parameters
    ----------
    predict : array-like of predicted labels.
    actual : array-like of true labels, same shape as `predict`.

    Returns
    -------
    accuracy, misclass_error
        Percentage of positions where the labels agree / differ.

    Raises
    ------
    ValueError
        If `predict` and `actual` do not have the same shape.
    """
    # coerce first: on plain lists `!=` yields a single bool, not an elementwise result
    predict = np.asarray(predict)
    actual = np.asarray(actual)
    if predict.shape != actual.shape:
        raise ValueError(
            'predict and actual must have the same shape, got %s and %s'
            % (predict.shape, actual.shape)
        )

    # calculate misclassification error
    misclass_error = np.mean(predict != actual) * 100
    accuracy = 100 - misclass_error

    return accuracy, misclass_error


def class_diff(classes, conf_matrix):
    """Rank classes by their per-class misclassification rate, worst first.

    Parameters
    ----------
    classes : 1-D array-like of class labels, in the same order as the rows
        of `conf_matrix`.
    conf_matrix : square confusion matrix; row i holds the counts for true
        class i and column j the counts for predicted class j.

    Returns
    -------
    numpy.ndarray of shape (n_classes, 2)
        Column 0 is the class label, column 1 the fraction (0-1) of that
        class's true samples that were predicted as another class. Rows are
        sorted by that fraction in descending order. A class with no true
        samples has an undefined rate, reported as NaN and placed last.

    Raises
    ------
    ValueError
        If `conf_matrix` is not square or `classes` does not have one entry
        per row.
    """
    classes = np.asarray(classes)
    conf_matrix = np.asarray(conf_matrix)
    if conf_matrix.ndim != 2 or conf_matrix.shape[0] != conf_matrix.shape[1]:
        raise ValueError('conf_matrix must be square, got shape %s' % (conf_matrix.shape,))
    if classes.shape != (conf_matrix.shape[0],):
        raise ValueError(
            'classes must have one label per confusion-matrix row, got %s labels for %d rows'
            % (classes.shape, conf_matrix.shape[0])
        )

    # per true class: total samples, and samples that landed off the diagonal
    class_counts = conf_matrix.sum(axis=1)
    misclass_counts = class_counts - np.diag(conf_matrix)

    # divide only where the class has samples; the rest stays NaN without a 0/0 warning
    misclass_rate = np.full(class_counts.shape, np.nan, dtype=float)
    np.divide(misclass_counts, class_counts, out=misclass_rate, where=class_counts > 0)

    # calculate ordered list of multi-class misclassification error
    ordered_class_diff = np.vstack((classes, misclass_rate)).T
    order = misclass_rate.argsort()[::-1]
    # argsort puts NaN last, so the reversal puts it first; move those rows to the end
    has_support = ~np.isnan(misclass_rate[order])
    order = np.concatenate((order[has_support], order[~has_support]))

    return ordered_class_diff[order]

In [4]:
# Stratified 75/25 split with standardization fitted on the training part.
# NOTE: x_train, x_test and test_set are rebound in place by this cell, so
# re-running it without first re-running the data-loading cell feeds
# already-transformed data back in.
x_train, x_test, y_train, y_test, test_set = split_data_equal(
    x_subset, y_subset, test_set, train_size=0.75
)

# n rows and d columns of the scaled training split, taken before PCA
n, d = x_train.shape[:2]

# run PCA to reduce dimensionality and speed up processing time
x_train, x_test, test_set = decomp_PCA(x_train, x_test, test_set)

## Please Note:

Fitting the models below takes a significant amount of time. The fitted models have therefore been saved and are loaded here with pickle, and the fitting code in each cell is commented out. Should you want to refit the models yourself instead of using the pickled versions, uncomment and run the fitting code in the cells below.

In [5]:
# Load the pre-fitted models instead of refitting them (fitting is slow).
# NOTE(security): pickle.load can execute arbitrary code embedded in the payload,
# so MODEL_URL must only ever point at a bucket you trust.
# NOTE(review): the commented-out fit calls in the cells below train on the
# PCA-reduced x_train, so these models presumably expect x_test to have the
# same number of components -- confirm if the preprocessing changes.
MODEL_URL = 'https://s3.amazonaws.com/stat558drjordankaggle/HW7+Milestone/'


def load_fitted_model(name):
    """Fetch the pickled estimator `name` from MODEL_URL, closing the HTTP response afterwards."""
    with urllib.request.urlopen(MODEL_URL + name) as response:
        return pickle.load(response)


# warm-up: linear SVMs with a fixed regularization parameter
ovo_linSVC = load_fitted_model('ovo_linSVC')
ovr_linSVC = load_fitted_model('ovr_linSVC')
mult_linSVC = load_fitted_model('mult_linSVC')
# linear SVMs with the regularization parameter tuned by grid search
ovo_gridCV = load_fitted_model('ovo_gridCV')
ovr_gridCV = load_fitted_model('ovr_gridCV')
mult_gridCV = load_fitted_model('mult_gridCV')
# degree-2 polynomial-kernel SVMs tuned by grid search
ovo_kernelgridCV = load_fitted_model('ovo_kernelgridCV')
ovr_kernelgridCV = load_fitted_model('ovr_kernelgridCV')
mult_kernelgridCV = load_fitted_model('mult_kernelgridCV')

#### Warm-up

- In a one-vs-one fashion, for each pair of classes, train a linear SVM classifier using scikit-learn’s function `LinearSVC`, with the default value for the regularization parameter. Compute the _multi-class misclassification error_ obtained using these classifiers trained in a one-vs-one fashion.

In [6]:
# specify estimator/fit model
# (left commented out: the fitted `ovo_linSVC` is loaded from pickle in an
# earlier cell; uncomment these two lines to refit from scratch)
# linSVC_estimator = LinearSVC()
# ovo_linSVC = OneVsOneClassifier(linSVC_estimator, n_jobs=-1).fit(x_train, y_train)

# predict labels for the held-out split
ovo_linSVC_predictions = ovo_linSVC.predict(x_test)

In [7]:
# compute multi-class misclassification error (in percent) on the held-out split;
# accuracy is returned as well but not displayed in this cell
accuracy, misclass_error = accuracy_misclass_error(ovo_linSVC_predictions, y_test)
print('Misclassification Error: %f%%' % (misclass_error))

# per-class error table, worst class first ('misclass error' is a 0-1 fraction).
# labels=classes pins the confusion-matrix row order to the labels handed to
# class_diff, even if a class is absent from both y_test and the predictions.
conf_mat = confusion_matrix(y_true=y_test, y_pred=ovo_linSVC_predictions, labels=classes)
pd.DataFrame(class_diff(classes, conf_mat), columns=['class', 'misclass error'])

Misclassification Error: 30.925926%


Unnamed: 0,class,misclass error
0,59.0,0.875000
1,66.0,0.750000
2,102.0,0.750000
3,193.0,0.750000
4,172.0,0.750000
5,38.0,0.714286
6,131.0,0.714286
7,144.0,0.714286
8,67.0,0.714286
9,183.0,0.714286


- In a one-vs-rest fashion, for each class, train a linear SVM classifier using scikit-learn’s function `LinearSVC`, with the default value for $\lambda_c$. Compute the multi-class misclassification error obtained using these classifiers trained in a one-vs-rest fashion.

In [8]:
# fit model
# (left commented out: the fitted `ovr_linSVC` is loaded from pickle in an
# earlier cell; uncomment to refit from scratch)
# NOTE(review): class_weight='balanced' is not LinearSVC's default, and the
# one-vs-one warm-up cell uses a plain LinearSVC() -- keep this in mind when
# comparing the two error rates.
# ovr_linSVC = LinearSVC(multi_class='ovr', class_weight='balanced').fit(x_train, y_train)

# predict labels for the held-out split
ovr_linSVC_predictions = ovr_linSVC.predict(x_test)

In [9]:
# compute multi-class misclassification error (in percent) on the held-out split;
# accuracy is returned as well but not displayed in this cell
accuracy, misclass_error = accuracy_misclass_error(ovr_linSVC_predictions, y_test)
print('Misclassification Error: %f%%' % (misclass_error))

# per-class error table, worst class first ('misclass error' is a 0-1 fraction).
# labels=classes pins the confusion-matrix row order to the labels handed to
# class_diff, even if a class is absent from both y_test and the predictions.
conf_mat = confusion_matrix(y_true=y_test, y_pred=ovr_linSVC_predictions, labels=classes)
pd.DataFrame(class_diff(classes, conf_mat), columns=['class', 'misclass error'])

Misclassification Error: 43.333333%


Unnamed: 0,class,misclass error
0,59.0,1.000000
1,102.0,0.875000
2,118.0,0.875000
3,193.0,0.875000
4,11.0,0.875000
5,29.0,0.875000
6,103.0,0.857143
7,185.0,0.857143
8,131.0,0.857143
9,27.0,0.857143


- Using the option `multi_class='crammer_singer'` in scikit-learn’s function `LinearSVC`, train a multi-class linear SVM classifier using the default value for the regularization parameter. Compute the multi-class misclassification error obtained using this multi-class linear SVM classifier.

In [10]:
# fit model
# (left commented out: the fitted `mult_linSVC` is loaded from pickle in an
# earlier cell; uncomment to refit from scratch)
# NOTE(review): class_weight='balanced' is not LinearSVC's default, and the
# one-vs-one warm-up cell uses a plain LinearSVC().
# mult_linSVC = LinearSVC(multi_class='crammer_singer', class_weight='balanced').fit(x_train, y_train)

# predict labels for the held-out split
mult_linSVC_predictions = mult_linSVC.predict(x_test)

In [11]:
# compute multi-class misclassification error (in percent) on the held-out split;
# accuracy is returned as well but not displayed in this cell
accuracy, misclass_error = accuracy_misclass_error(mult_linSVC_predictions, y_test)
print('Misclassification Error: %f%%' % (misclass_error))

# per-class error table, worst class first ('misclass error' is a 0-1 fraction).
# labels=classes pins the confusion-matrix row order to the labels handed to
# class_diff, even if a class is absent from both y_test and the predictions.
conf_mat = confusion_matrix(y_true=y_test, y_pred=mult_linSVC_predictions, labels=classes)
pd.DataFrame(class_diff(classes, conf_mat), columns=['class', 'misclass error'])

Misclassification Error: 30.925926%


Unnamed: 0,class,misclass error
0,59.0,0.875000
1,128.0,0.875000
2,91.0,0.857143
3,144.0,0.857143
4,186.0,0.750000
5,11.0,0.750000
6,66.0,0.750000
7,127.0,0.714286
8,71.0,0.714286
9,197.0,0.714286


#### Linear SVMs for multi-class classification

- Redo all questions above now tuning the regularization parameters using cross-validation.

In [12]:
# specify estimator/regularization parameters/fit model
# (left commented out: the fitted `ovo_gridCV` is loaded from pickle in an
# earlier cell; uncomment to rerun the grid search)
# The grid covers C in {0.01, 0.1, 1, 10}; 'estimator__C' targets the LinearSVC
# wrapped by the OneVsOneClassifier.
# NOTE(review): uncommenting rebinds `ovo_linSVC` to an unfitted classifier,
# which breaks re-running the warm-up prediction cell that uses that name.
# linSVC_estimator = LinearSVC()
# ovo_linSVC = OneVsOneClassifier(linSVC_estimator, n_jobs=-1)
# parameters = {'estimator__C':[10**i for i in range(-2, 2)]}
# ovo_gridCV = GridSearchCV(ovo_linSVC, parameters).fit(x_train, y_train)

# predict labels for the held-out split
ovo_linSVC_predictions_CV = ovo_gridCV.predict(x_test)

In [13]:
# compute multi-class misclassification error (in percent) on the held-out split;
# accuracy is returned as well but not displayed in this cell
accuracy, misclass_error = accuracy_misclass_error(ovo_linSVC_predictions_CV, y_test)
print('Misclassification Error: %f%%' % (misclass_error))

# per-class error table, worst class first ('misclass error' is a 0-1 fraction).
# labels=classes pins the confusion-matrix row order to the labels handed to
# class_diff, even if a class is absent from both y_test and the predictions.
conf_mat = confusion_matrix(y_true=y_test, y_pred=ovo_linSVC_predictions_CV, labels=classes)
pd.DataFrame(class_diff(classes, conf_mat), columns=['class', 'misclass error'])

Misclassification Error: 30.740741%


Unnamed: 0,class,misclass error
0,59.0,0.875000
1,144.0,0.857143
2,102.0,0.750000
3,193.0,0.750000
4,172.0,0.750000
5,173.0,0.714286
6,38.0,0.714286
7,183.0,0.714286
8,116.0,0.625000
9,11.0,0.625000


In [10]:
# specify regularization parameters/fit model
# (left commented out: the fitted `ovr_gridCV` is loaded from pickle in an
# earlier cell; uncomment to rerun the grid search)
# The grid covers C in {0.01, 0.1, 1, 10}.
# NOTE(review): uncommenting rebinds `ovr_linSVC` to an unfitted classifier,
# which breaks re-running the warm-up prediction cell that uses that name.
# ovr_linSVC = LinearSVC(multi_class='ovr', class_weight='balanced')
# parameters = {'C':[10**i for i in range(-2, 2)]}
# ovr_gridCV = GridSearchCV(ovr_linSVC, parameters).fit(x_train, y_train)

# predict labels for the held-out split
ovr_linSVC_predictions_CV = ovr_gridCV.predict(x_test)

In [11]:
# compute multi-class misclassification error (in percent) on the held-out split;
# accuracy is returned as well but not displayed in this cell
accuracy, misclass_error = accuracy_misclass_error(ovr_linSVC_predictions_CV, y_test)
print('Misclassification Error: %f%%' % (misclass_error))

# per-class error table, worst class first ('misclass error' is a 0-1 fraction).
# labels=classes pins the confusion-matrix row order to the labels handed to
# class_diff, even if a class is absent from both y_test and the predictions.
conf_mat = confusion_matrix(y_true=y_test, y_pred=ovr_linSVC_predictions_CV, labels=classes)
pd.DataFrame(class_diff(classes, conf_mat), columns=['class', 'misclass error'])

Misclassification Error: 35.555556%


Unnamed: 0,class,misclass error
0,59.0,1.000000
1,102.0,0.875000
2,118.0,0.875000
3,103.0,0.857143
4,27.0,0.857143
5,71.0,0.857143
6,144.0,0.857143
7,91.0,0.857143
8,193.0,0.750000
9,11.0,0.750000


In [16]:
# specify regularization parameters/fit model
# (left commented out: the fitted `mult_gridCV` is loaded from pickle in an
# earlier cell; uncomment to rerun the grid search)
# The grid covers C in {0.01, 0.1, 1, 10}.
# NOTE(review): uncommenting rebinds `mult_linSVC` to an unfitted classifier,
# which breaks re-running the warm-up prediction cell that uses that name.
# mult_linSVC = LinearSVC(multi_class='crammer_singer', class_weight='balanced')
# parameters = {'C':[10**i for i in range(-2, 2)]}
# mult_gridCV = GridSearchCV(mult_linSVC, parameters).fit(x_train, y_train)

# predict labels for the held-out split
mult_linSVC_predictions_CV = mult_gridCV.predict(x_test)

In [17]:
# compute multi-class misclassification error (in percent) on the held-out split;
# accuracy is returned as well but not displayed in this cell
accuracy, misclass_error = accuracy_misclass_error(mult_linSVC_predictions_CV, y_test)
print('Misclassification Error: %f%%' % (misclass_error))

# per-class error table, worst class first ('misclass error' is a 0-1 fraction).
# labels=classes pins the confusion-matrix row order to the labels handed to
# class_diff, even if a class is absent from both y_test and the predictions.
conf_mat = confusion_matrix(y_true=y_test, y_pred=mult_linSVC_predictions_CV, labels=classes)
pd.DataFrame(class_diff(classes, conf_mat), columns=['class', 'misclass error'])

Misclassification Error: 30.833333%


Unnamed: 0,class,misclass error
0,186.0,0.875000
1,59.0,0.875000
2,91.0,0.857143
3,49.0,0.750000
4,128.0,0.750000
5,197.0,0.714286
6,144.0,0.714286
7,127.0,0.714286
8,152.0,0.625000
9,142.0,0.625000


#### Kernel SVMs for multi-class classification

- Redo all questions above now using the polynomial kernel of order 2 (and tuning the regularization parameters using cross-validation).

In [5]:
# specify regularization parameters/fit model
# (left commented out: the fitted `ovo_kernelgridCV` is loaded from pickle in
# an earlier cell; uncomment to rerun the grid search)
# Degree-2 polynomial-kernel SVC wrapped in an explicit one-vs-one classifier;
# the grid covers C in {0.01, 0.1, 1, 10} via 'estimator__C'.
# kernelSVC_estimator = SVC(kernel='poly', degree=2, class_weight='balanced')
# ovo_kernelSVC = OneVsOneClassifier(kernelSVC_estimator, n_jobs=-1)
# parameters = {'estimator__C':[10**i for i in range(-2, 2)]}
# ovo_kernelgridCV = GridSearchCV(ovo_kernelSVC, parameters).fit(x_train, y_train)

# predict labels for the held-out split
ovo_kernelSVC_predictions = ovo_kernelgridCV.predict(x_test)

In [6]:
# compute multi-class misclassification error (in percent) on the held-out split;
# accuracy is returned as well but not displayed in this cell
accuracy, misclass_error = accuracy_misclass_error(ovo_kernelSVC_predictions, y_test)
print('Misclassification Error: %f%%' % (misclass_error))

# per-class error table, worst class first ('misclass error' is a 0-1 fraction).
# labels=classes pins the confusion-matrix row order to the labels handed to
# class_diff, even if a class is absent from both y_test and the predictions.
conf_mat = confusion_matrix(y_true=y_test, y_pred=ovo_kernelSVC_predictions, labels=classes)
pd.DataFrame(class_diff(classes, conf_mat), columns=['class', 'misclass error'])

Misclassification Error: 32.777778%


Unnamed: 0,class,misclass error
0,59.0,0.875000
1,71.0,0.857143
2,11.0,0.750000
3,118.0,0.750000
4,102.0,0.750000
5,172.0,0.750000
6,143.0,0.714286
7,137.0,0.714286
8,131.0,0.714286
9,130.0,0.714286


In [20]:
# specify regularization parameters/fit model
# (left commented out: the fitted `ovr_kernelgridCV` is loaded from pickle in
# an earlier cell; uncomment to rerun the grid search)
# The grid covers C in {0.01, 0.1, 1, 10}.
# NOTE(review): per the scikit-learn SVC documentation, SVC always trains
# one-vs-one internally; decision_function_shape='ovr' only reshapes the output
# of decision_function. This is therefore not a true one-vs-rest fit (wrapping
# the SVC in OneVsRestClassifier would be), which presumably explains why the
# results match the decision_function_shape='ovo' model further down.
# ovr_kernelSVC = SVC(kernel='poly', degree=2, class_weight='balanced', decision_function_shape='ovr')
# parameters = {'C':[10**i for i in range(-2, 2)]}
# ovr_kernelgridCV = GridSearchCV(ovr_kernelSVC, parameters).fit(x_train, y_train)

# predict labels for the held-out split
ovr_kernelSVC_predictions = ovr_kernelgridCV.predict(x_test)

In [21]:
# compute multi-class misclassification error (in percent) on the held-out split;
# accuracy is returned as well but not displayed in this cell
accuracy, misclass_error = accuracy_misclass_error(ovr_kernelSVC_predictions, y_test)
print('Misclassification Error: %f%%' % (misclass_error))

# per-class error table, worst class first ('misclass error' is a 0-1 fraction).
# labels=classes pins the confusion-matrix row order to the labels handed to
# class_diff, even if a class is absent from both y_test and the predictions.
conf_mat = confusion_matrix(y_true=y_test, y_pred=ovr_kernelSVC_predictions, labels=classes)
pd.DataFrame(class_diff(classes, conf_mat), columns=['class', 'misclass error'])

Misclassification Error: 32.500000%


Unnamed: 0,class,misclass error
0,59.0,0.875000
1,172.0,0.875000
2,71.0,0.857143
3,102.0,0.750000
4,11.0,0.750000
5,118.0,0.750000
6,131.0,0.714286
7,130.0,0.714286
8,173.0,0.714286
9,144.0,0.714286


In [22]:
# specify regularization parameters/fit model
# (left commented out: the fitted `mult_kernelgridCV` is loaded from pickle in
# an earlier cell; uncomment to rerun the grid search)
# The grid covers C in {0.01, 0.1, 1, 10}.
# NOTE(review): per the scikit-learn SVC documentation, SVC always trains
# one-vs-one internally and decision_function_shape only changes the shape of
# decision_function's output, so this is a one-vs-one kernel SVM rather than a
# joint multi-class formulation such as Crammer-Singer.
# mult_kernelSVC = SVC(kernel='poly', degree=2, class_weight='balanced', decision_function_shape='ovo')
# parameters = {'C':[10**i for i in range(-2, 2)]}
# mult_kernelgridCV = GridSearchCV(mult_kernelSVC, parameters).fit(x_train, y_train)

# predict labels for the held-out split
mult_kernelSVC_predictions = mult_kernelgridCV.predict(x_test)

In [23]:
# compute multi-class misclassification error (in percent) on the held-out split;
# accuracy is returned as well but not displayed in this cell
accuracy, misclass_error = accuracy_misclass_error(mult_kernelSVC_predictions, y_test)
print('Misclassification Error: %f%%' % (misclass_error))

# per-class error table, worst class first ('misclass error' is a 0-1 fraction).
# labels=classes pins the confusion-matrix row order to the labels handed to
# class_diff, even if a class is absent from both y_test and the predictions.
conf_mat = confusion_matrix(y_true=y_test, y_pred=mult_kernelSVC_predictions, labels=classes)
pd.DataFrame(class_diff(classes, conf_mat), columns=['class', 'misclass error'])

Misclassification Error: 32.500000%


Unnamed: 0,class,misclass error
0,59.0,0.875000
1,172.0,0.875000
2,71.0,0.857143
3,102.0,0.750000
4,11.0,0.750000
5,118.0,0.750000
6,131.0,0.714286
7,130.0,0.714286
8,173.0,0.714286
9,144.0,0.714286
