In [119]:
from __future__ import print_function

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import os
import sys
import time

import sknn
import theano
import theano.tensor as T
import lasagne
from lasagne import layers
from lasagne.updates import nesterov_momentum
from nolearn.lasagne import NeuralNet
from nolearn.lasagne import visualize

from functools import reduce

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc
from sklearn import svm
from six.moves import cPickle as pickle

os.environ['KERAS_BACKEND']='theano'
from keras.models import Sequential
from keras.layers import Dense, Activation, Convolution2D, Flatten
from keras.utils import np_utils

from sklearn.neural_network import MLPClassifier
from collections import Counter

%matplotlib inline

Using Theano backend.


Import libraries

In [104]:
files = [
    'data_batch_1',
    'data_batch_2',
    'data_batch_3',
    'data_batch_4',
    'data_batch_5',
    'test_batch'
]
data = []
labels = []
start = time.time()
for file in files:
    with open(file, 'rb') as f:
        d = pickle.load(f, encoding='bytes')
        if file == 'test_batch':
            test_data = d[b'data']
            test_labels = d[b'labels']
        else:
            data.append(d[b'data'])
            labels.append(d[b'labels'])
end = time.time()
print('Time to load data: {:.3f}s'.format(end - start))
for i in range(len(data)):
    print('Train data {}:'.format(i), data[i].shape, len(labels[i]))
print('Test data:', test_data.shape, len(test_labels))

merged_data = reduce(lambda a,b: np.vstack((a,b)), data)
merged_labels = reduce(lambda a,b: a+b, labels)
print('Merged train data:', merged_data.shape, len(merged_labels))

Time to load data: 0.517s
Train data 0: (10000, 3072) 10000
Train data 1: (10000, 3072) 10000
Train data 2: (10000, 3072) 10000
Train data 3: (10000, 3072) 10000
Train data 4: (10000, 3072) 10000
Test data: (10000, 3072) 10000
Merged train data: (50000, 3072) 50000


Load data and print out time- this code is taken directly from section

In [105]:
scaler = StandardScaler()
scaler.fit(merged_data) 
#Train our scaler based on our training data

train_data = merged_data[:]
train_labels = merged_labels[:]
orig_data = test_data[:]
train_data = scaler.transform(train_data)
test_data = scaler.transform(test_data)
#Apply our scaling to our training and test data, creating a copy of our merged data called 'train_data'



Scaling all the data so the Neural Network will work as expected

In [4]:
mlp = MLPClassifier(solver='lbgfs', alpha=1e-5, hidden_layer_sizes=(5,2), random_state=1)
start = time.time()
mlp.fit(train_data, train_labels)
end = time.time()
print('Time to build: {:.3f}s'.format(end - start))
#Create and train a basic Multi Layer Perceptron model

Time to build: 168.079s


In [5]:
predictions = mlp.predict(train_data) #Store the predictions for this basic model
correct = merged_labels - predictions #If the numbers are the same, they'll be 0, otherwise- any other number
accuracy = (correct == 0).sum() / len(correct)
accuracy * 100
#This is the accuracy of the base model on the training data

32.013999999999996

In [6]:
test_pred = mlp.predict(test_data)
correct = test_labels - test_pred #If the numbers are the same, they'll be 0, otherwise- any other number
test_accuracy = (correct == 0).sum() / len(correct)
test_accuracy * 100
#Accuracy of the base model on the test data

30.34

Our basic MLP model seems to have an accuracy of roughly 30-32% to start- looks like there's a lot of optimization we can do on this

http://lasagne.readthedocs.io/en/latest/user/tutorial.html

The documentation for the Lasagne package was extensively utilized for creating the following Neural Network.

In [7]:
start = time.time()
svmc = svm.SVC()
svmfull = svm.SVC()
svmfull.fit(train_data, train_labels)
end = time.time()
print('Time to build: {:.3f}s'.format(end - start))

Time to build: 7249.084s


In [9]:
predictions = svmfull.predict(test_data)
correct = test_labels - predictions #If the numbers are the same, they'll be 0, otherwise- any other number
test_accuracy = (correct == 0).sum() / len(correct)
test_accuracy * 100

54.810000000000002

SVM at 54.8% accuracy- much higher than the 30.34% of our MLP Neural Network

In [125]:
img_size = 32
num_labels = 10
num_channels = 3

train_labels_np = np.array(merged_labels)
test_labels_np = np.array(test_labels)

# modified section's reformatting function a bit so it doesn't use global var
# although i suppose using the global var is not the worst thing in the world
def reformat(dataset, labels, img_size, num_channels=1):
    dataset = dataset.reshape((-1, img_size, img_size, num_channels)).astype(np.float32)
    labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
    return dataset, labels

cnn_train_data, cnn_train_labels = reformat(train_data, train_labels_np, img_size, num_channels)
cnn_test_data, test_labels_np_onehot = reformat(test_data, test_labels_np, img_size, num_channels)
print('Training set', cnn_train_data.shape, train_labels_np.shape)
print('Test set', cnn_test_data.shape, test_labels_np_onehot.shape)

Training set (50000, 32, 32, 3) (50000,)
Test set (10000, 32, 32, 3) (10000, 10)


In [129]:
# Using the Keras way to translate the labels to one-hot format rather than the section's reformatting method
Y_train = np_utils.to_categorical(train_labels_np, num_labels)

# initialize model and add layers
model = Sequential()
model.add(Convolution2D(3, 3, 3, input_shape=cnn_train_data.shape[1:], name='name'))
model.add(Flatten())  # need to flatten to get correct input dimensions (2-d) for dense layer
model.add(Dense(num_labels))

model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
start = time.time()
model.fit(x=cnn_train_data, y=cnn_train_labels, nb_epoch=3, batch_size=256)
end = time.time()
predictions = model.predict_classes(cnn_test_data, batch_size=256)
correct = (predictions[:,None] == test_labels_np[:,None]).astype(int)
print('\n\nTest set accuracy: {:.2f}%'.format((100.*sum(correct)/len(correct))[0]))
print('Time to build: {:.3f}s'.format(end - start))

Epoch 1/3
Epoch 2/3
Epoch 3/3

Test set accuracy: 14.61%
Time to build: 19.190s


Currently the CNN is struggling with accuracy- this is likely due to a normalization issue, but I'm not sure where the mistake is.

Now we're going to try varying the size of the training set.

In [10]:
Counter(train_labels)
#We have 5,000 of each lable in our data

bucket = []
indices = []
for i in range(0,10):
    temp = [z for z, x in enumerate(train_labels) if x == i]
    indices.append(temp)
    bucket.append(indices[i])
#This finds all the indexes for each value in our training data, and stores them into the 'bucket' list of lists

In [11]:
t1 = []
l1 = []

t2 = []
l2 = []

t3 = []
l3 = []
#Create our training and label subsets

c12 = []
c13 = []
c23 = []
#Create our combined training sets

v12 = []
v13 = []
v23 = []
#Create our combined label sets

for i in range(0,10):
    t1.extend(bucket[i][:1700])
    l1.extend(bucket[i][:1700])
    
    t2.extend(bucket[i][1701:3400])
    l2.extend(bucket[i][1701:3400])
    
    t3.extend(bucket[i][3401:5000])
    l3.extend(bucket[i][3401:5000])
    #Now we append those index subsets together. This is for our basic sets.
    
    c12.extend(bucket[i][:3400])
    c13.extend(bucket[i][:1700])
    c13.extend(bucket[i][3401:5000])
    c23.extend(bucket[i][1701:5000])
    
    v12.extend(bucket[i][:3400])
    v13.extend(bucket[i][:1700])
    v13.extend(bucket[i][3401:5000])
    v23.extend(bucket[i][1701:5000])
    #Now for the combined sets

t1 = train_data[np.array(t1)]
l1 = [train_labels[i] for i in l1]

t2 = train_data[np.array(t2)]
l2 = [train_labels[i] for i in l2]

t3 = train_data[np.array(t3)]
l3 = [train_labels[i] for i in l3]
#And now we convert them so they're actually lists of values rather than indices
#Above is simple sets

c12 = train_data[np.array(c12)]
v12 = [train_labels[i] for i in v12]

c13 = train_data[np.array(c13)]
v13 = [train_labels[i] for i in v13]

c23 = train_data[np.array(c23)]
v23 = [train_labels[i] for i in v23]

Now we've split our data into the following sets:

T1/T2/T3: Training data, in thirds, of the original set- each containing the same number of each class
C12/C13/C23: Combined versions of the above three sets. We've already used ALL the data in a run, so there's no point to C123

In [12]:
start = time.time()
mlp.fit(t1, l1)
end = time.time()
print('Time to build: {:.3f}s'.format(end - start))
#Basic MLP model with the first test set

Time to build: 61.733s


In [13]:
test_pred = mlp.predict(test_data)
correct = test_labels - test_pred #If the numbers are the same, they'll be 0, otherwise- any other number
test_accuracy = (correct == 0).sum() / len(correct)
test_accuracy * 100
#Accuracy of the base model on the test data

27.350000000000001

In [18]:
start = time.time()
mlp.fit(c12, v12)
end = time.time()
print('Time to build: {:.3f}s'.format(end - start))
#Basic MLP model with the first combined set

Time to build: 116.076s


In [19]:
test_pred = mlp.predict(test_data)
correct = test_labels - test_pred #If the numbers are the same, they'll be 0, otherwise- any other number
test_accuracy = (correct == 0).sum() / len(correct)
test_accuracy * 100
#Accuracy of the base model on the test data

29.210000000000001

This finishes our comparison of the MLP classifier- next we have to compare our SVM classifier. Generally speaking the classifiers took about 1-2 minutes to build- looks like my computer runs at almost 1 minute/17,000 records for the training data.

In [24]:
start = time.time()
svmc.fit(t1, l1)
end = time.time()
print('Time to build: {:.3f}s'.format(end - start))

Time to build: 908.899s


In [25]:
predictions = svmc.predict(test_data)
correct = test_labels - predictions #If the numbers are the same, they'll be 0, otherwise- any other number
test_accuracy = (correct == 0).sum() / len(correct)
test_accuracy * 100
#First set

49.919999999999995

In [32]:
start = time.time()
svmc.fit(c12, v12)
end = time.time()
print('Time to build: {:.3f}s'.format(end - start))

Time to build: 3382.673s


In [33]:
predictions = svmc.predict(test_data)
correct = test_labels - predictions #If the numbers are the same, they'll be 0, otherwise- any other number
test_accuracy = (correct == 0).sum() / len(correct)
test_accuracy * 100
#First Combo set

52.810000000000002

The SVM algorithms can take up to an hour for the combined data sets. With that said, their accuracy is quite high- at 52-53% they outperform the unoptimized MLP handily.

Below we'll modify our test data, then rerun the algorithms on it and see how our classifiers hold up on this new modified test data.

In [106]:
distort_data = orig_data[:]
for i in range(0, len(distort_data)):
    distort_data[i][:31] = round(np.mean(distort_data))
distort_data = scaler.transform(distort_data)
#This distortion replaces the first 32 numbers with the mean- essentially averaging out the top row
#The data is then rescaled



In [107]:
mlp.fit(train_data, train_labels)
#Refit the full model onto this data

test_pred = mlp.predict(distort_data)
correct = test_labels - test_pred #If the numbers are the same, they'll be 0, otherwise- any other number
test_accuracy = (correct == 0).sum() / len(correct)
test_accuracy * 100
#Accuracy of the base model on the distorted data

29.370000000000001

Some accuracy loss- by a few percentage points, but still better than random (10%)

In [None]:
predictions = svmfull.predict(distort_data)
correct = test_labels - predictions #If the numbers are the same, they'll be 0, otherwise- any other number
test_accuracy = (correct == 0).sum() / len(correct)
test_accuracy * 100
#Accuracy of the base model on the distorted data