# Dane Jordan

## Homework 8 Data Competition Milestones

In [11]:
import numpy as np
import pandas as pd
import pickle
import urllib.request

from IPython.core.interactiveshell import InteractiveShell
from sklearn.decomposition import PCA
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsOneClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

In [2]:
# load data
# NOTE(review): pickle.load can execute arbitrary code embedded in the payload;
# only keep loading these files this way while the S3 bucket is trusted.
def _load_pickle(url):
    """Download `url` and unpickle its payload, closing the HTTP response afterwards."""
    with urllib.request.urlopen(url) as response:
        return pickle.load(response)


x = _load_pickle('https://s3.amazonaws.com/stat558drjordankaggle/train_features')
test_set = _load_pickle('https://s3.amazonaws.com/stat558drjordankaggle/test_features')
y = np.array(_load_pickle('https://s3.amazonaws.com/stat558drjordankaggle/train_labels'))
y = y.astype(float)

# image transformations not used for this milestone
# color_set = pickle.load(urllib.request.urlopen('https://s3.amazonaws.com/stat558drjordankaggle/color_features'))
# compress_set = pickle.load(urllib.request.urlopen('https://s3.amazonaws.com/stat558drjordankaggle/compress_features'))
# crop_set = pickle.load(urllib.request.urlopen('https://s3.amazonaws.com/stat558drjordankaggle/crop_features'))
# crop_to_corner_set = pickle.load(urllib.request.urlopen('https://s3.amazonaws.com/stat558drjordankaggle/crop_to_corner_features'))
# homography_set = pickle.load(urllib.request.urlopen('https://s3.amazonaws.com/stat558drjordankaggle/homography_features'))
# mirror_set = pickle.load(urllib.request.urlopen('https://s3.amazonaws.com/stat558drjordankaggle/mirror_features'))
# rotate30_set = pickle.load(urllib.request.urlopen('https://s3.amazonaws.com/stat558drjordankaggle/rotate30_features'))
# scale_set = pickle.load(urllib.request.urlopen('https://s3.amazonaws.com/stat558drjordankaggle/scale_features'))

# subset the data if looking to analyze a smaller set of classes
# `classes` currently holds every distinct label, so the mask below keeps all rows;
# replace it with a shorter list of labels to work on fewer classes.
classes = np.unique(y)
index = np.ravel(np.nonzero(np.in1d(y, classes)))
x_subset = x[index]
y_subset = y[index]

In [3]:
def split_data_equal(x, y, test_set, train_size=0.75, random_state=0):
    """Make a stratified train/held-out split and standardize every feature matrix.

    The scaler is fitted on the training part only and then applied to the
    training part, the held-out part and `test_set`.

    Parameters
    ----------
    x : features of the labelled examples.
    y : labels of the labelled examples; also used as the stratification key.
    test_set : features of the unlabelled examples, scaled with the same scaler.
    train_size : forwarded to `train_test_split` (default 0.75).
    random_state : seed forwarded to `train_test_split`; the default of 0 keeps
        the split that was previously hard-coded.

    Returns
    -------
    (x_train, x_test, y_train, y_test, test_set) with the three feature
    matrices standardized.
    """
    # split into train and test sets
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, train_size=train_size, random_state=random_state, stratify=y)

    # center and standardize x values using statistics from the training part only
    x_scaler = StandardScaler().fit(x_train)
    x_train = x_scaler.transform(x_train)
    x_test = x_scaler.transform(x_test)
    test_set = x_scaler.transform(test_set)

    return x_train, x_test, y_train, y_test, test_set


def decomp_PCA(train, test, test_set, explained_var_threshold=0.95):
    """Project three feature matrices onto the leading principal components of `train`.

    The number of components kept is the smallest count whose cumulative
    explained-variance ratio (measured on `train`) reaches
    `explained_var_threshold`.

    Parameters
    ----------
    train : matrix the PCA is fitted on.
    test, test_set : matrices projected with the PCA fitted on `train`.
    explained_var_threshold : target cumulative explained-variance ratio
        (default 0.95). If it is never reached (for example 1.0 when rounding
        leaves the ratios summing to just under one) every component is kept
        instead of indexing past the end of the ratio array. At least one
        component is always kept.

    Returns
    -------
    (x_train, x_test, test_set) projected onto the selected components.
    """
    # Full decomposition, used only to read off the per-component variance ratios
    cumulative_var = np.cumsum(PCA().fit(train).explained_variance_ratio_)

    # Index of the first cumulative ratio >= threshold, converted to a count
    num_component_vectors = int(np.searchsorted(cumulative_var, explained_var_threshold)) + 1
    # Clamp so an unreachable threshold falls back to "keep everything"
    num_component_vectors = min(num_component_vectors, cumulative_var.size)

    # Refit with the reduced number of components and project all three matrices
    pca = PCA(n_components=num_component_vectors).fit(train)
    x_train = pca.transform(train)
    x_test = pca.transform(test)
    test_set = pca.transform(test_set)

    return x_train, x_test, test_set

In [4]:
# stratified 75/25 split of the labelled data, with all feature matrices standardized
# NOTE: this cell rebinds test_set / x_train / x_test to their transformed versions,
# so re-running it without re-running the load cell transforms them a second time.
x_train, x_test, y_train, y_test, test_set = split_data_equal(x_subset, y_subset, test_set, train_size=0.75)
n, d = x_train.shape

# PCA projection to shrink the feature dimension and shorten training time
x_train, x_test, test_set = decomp_PCA(x_train, x_test, test_set)

- Compare the performance of SVMs for different types of kernels, tuning the kernel parameters using cross-validation. You may train the SVMs for multi-class classification in any fashion you wish (one-vs-one, one-vs-rest, multiclass). You may also use scikit-learn’s built-in functions to perform cross-validation over all parameters.

In [8]:
# one search grid shared by both kernels: C in {0.01, 0.1, 1, 10}
parameters = {'C': [10**exponent for exponent in (-2, -1, 0, 1)]}

# rbf kernel ovo
rbfSVC = SVC(kernel='rbf')
rbfCV = GridSearchCV(estimator=rbfSVC, param_grid=parameters, n_jobs=-1)
rbfCV_fitted = rbfCV.fit(x_train, y_train)

# polynomial (order 2) kernel ovo
poly2SVC = SVC(kernel='poly', degree=2)
poly2CV = GridSearchCV(estimator=poly2SVC, param_grid=parameters, n_jobs=-1)
poly2CV_fitted = poly2CV.fit(x_train, y_train)

In [10]:
# predict the held-out split with each fitted grid search
y_predict_rbf = rbfCV_fitted.predict(x_test)
y_predict_poly2 = poly2CV_fitted.predict(x_test)

# share of exact label matches, expressed as a percentage
rbf_accuracy = 100 * np.mean(y_predict_rbf == y_test)
poly2_accuracy = 100 * np.mean(y_predict_poly2 == y_test)

print('rbf Accuracy: %f%%' % rbf_accuracy)
print('poly2 Accuracy: %f%%' % poly2_accuracy)

rbf Accuracy: 57.870370%
poly2 Accuracy: 67.500000%


- Experiment with several ensembles of classifiers, using any technique you wish. You may consider bagging (ensembles of classifiers learnt from random subsamples of examples), ensemble of classifiers learnt from random subsets of features, ensembles of classifiers with different kernels, etc.

In [14]:
# The wrapped estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was removed
# in 1.4, while the first positional slot is the estimator on both sides of the rename.
# random_state pins the bootstrap resampling so the reported accuracy is reproducible.
# Each bagged member is a GridSearchCV, so a grid search is run per bootstrap sample.

# use rbf kernel with bagging classifier
rbf_bag = BaggingClassifier(rbfCV, n_jobs=-1, random_state=0).fit(x_train, y_train)

# use polynomial (order 2) kernel with bagging classifier
poly2_bag = BaggingClassifier(poly2CV, n_jobs=-1, random_state=0).fit(x_train, y_train)

In [15]:
# predict the held-out split with each bagged ensemble
y_predict_rbf_bag = rbf_bag.predict(x_test)
y_predict_poly2_bag = poly2_bag.predict(x_test)

# share of exact label matches, expressed as a percentage
rbf_bag_accuracy = 100 * np.mean(y_predict_rbf_bag == y_test)
poly2_bag_accuracy = 100 * np.mean(y_predict_poly2_bag == y_test)

print('rbf bagging Accuracy: %f%%' % rbf_bag_accuracy)
print('poly2 bagging Accuracy: %f%%' % poly2_bag_accuracy)

rbf bagging Accuracy: 54.444444%
poly2 bagging Accuracy: 65.555556%
