# Load data

In [2]:
%matplotlib inline

In [3]:
import numpy as np 
import pandas as pd
import sklearn 
import scipy.misc
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import KFold
%matplotlib inline

In [4]:
#if data stored locally
# x_train = np.loadtxt("train_x.csv", delimiter=",")
# y_train = np.loadtxt("train_y.csv", delimiter=",")
# x_test = np.loadtxt("test_x.csv", delimiter=",")

#if load from class web source using np
# URL_ENDPOINT = "http://cs.mcgill.ca/~ksinha4/datasets/kaggle/"
# x_train = np.loadtxt(URL_ENDPOINT+"train_x.csv", delimiter=",")
# y_train = np.loadtxt(URL_ENDPOINT+"train_y.csv", delimiter=",")
# x_test = np.loadtxt(URL_ENDPOINT+"test_x.csv", delimiter=",")

#if previous load not work, load using pd
# Download the data set from the class web source with pandas.
URL_ENDPOINT = "http://cs.mcgill.ca/~ksinha4/datasets/kaggle/"
df1 = pd.read_csv(URL_ENDPOINT+"train_x.csv", delimiter=",", header=None, dtype="uint8")
df2 = pd.read_csv(URL_ENDPOINT+"train_y.csv", delimiter=",", header=None, dtype="uint8")
# The test images are needed further down (x_test is reshaped in the image cell),
# so they are loaded here as well.
df3 = pd.read_csv(URL_ENDPOINT+"test_x.csv", delimiter=",", header=None)

# DataFrame -> numpy. `.values` is used because `DataFrame.as_matrix()` no longer
# exists in current pandas releases; the arrays keep the dtype of the frames.
x_train = np.ascontiguousarray(df1.values)
y_train = np.ascontiguousarray(df2.values[:, 0])
x_test = np.ascontiguousarray(df3.values)

# loading data
# x_train = np.loadtxt("data/train_x.csv", delimiter=",")
# y_train = np.loadtxt("data/train_y.csv", delimiter=",")
# x_test = np.loadtxt("data/test_x.csv", delimiter=",")


## Methods to convert y from one hot encoding to numerical representation and vice versa

In [6]:
# encode y_train data in one-hot encoding
def to_one_hot(y, num_classes=10):
    """
    Encode a sequence of class labels as a one-hot matrix.

    Parameters:
        y: sequence or array of labels; each label is truncated to an int and
           used as a column index.
        num_classes: number of columns of the result (defaults to the 10 digits).

    Returns:
        Integer array of shape (len(y), num_classes) with a single 1 per row.
        An empty input yields an array of shape (0, num_classes).

    Raises:
        IndexError: if a label does not fit into num_classes columns.
    """
    labels = np.asarray(y).astype(int).reshape(-1)
    one_hot = np.zeros((labels.shape[0], num_classes), dtype=int)
    # One fancy-indexed assignment sets column `labels[i]` of every row i.
    one_hot[np.arange(labels.shape[0]), labels] = 1
    return one_hot

def from_one_hot(one_hot_data):
    """
    Convert rows of scores / one-hot vectors back to class indices.

    Returns a Python list holding, for every row, the index of its largest entry.
    """
    return [np.argmax(scores) for scores in one_hot_data]

## Method to show image

In [7]:
def show_img(img):
    """
    Display a single image with the reversed gray colormap.

    Any figure that is still open is closed first, then the image is drawn
    and shown.
    """
    plt.close()
    colormap = 'gray_r'
    plt.imshow(img, cmap=colormap)
    plt.show()

# View every flat 4096-pixel row as a 64x64 image and the labels as a column vector.
# NOTE(review): x_test must have been loaded by one of the loading options at the
# top of the notebook, otherwise the last line raises a NameError.
x_train_reshaped = x_train.reshape(-1, 64, 64)
y_train_reshaped = y_train.reshape(-1, 1) 
x_test_reshaped = x_test.reshape(-1, 64, 64)


## Data preprocessing step

In [8]:
def clean_data(x):
    """
    Binarise the images: pixels equal to 255 become 1.0, all others -1.0.

    Because the only thing that matters is the numbers in the picture, which are
    stored with value 255, everything else is treated as background. This reduces
    noise and limits overflow/underflow in later stages.

    Parameters:
        x: array-like of pixel values (one image per row).

    Returns:
        float64 array with the same shape as x containing only 1.0 and -1.0.
    """
    # Vectorised comparison instead of a Python loop over every pixel.
    return np.where(np.asarray(x) == 255, 1.0, -1.0)

# NOTE: this overwrites the raw pixels. Re-running the cell on already cleaned data
# maps every value to -1.0, because no cleaned value equals 255.
x_train = clean_data(x_train)

## Split training data into train / valid sets

In [9]:
from sklearn.model_selection import train_test_split
# 80% of the samples are used for training, the remaining 20% for validation.
x_train_s, x_valid_s, y_train_s, y_valid_s = train_test_split(
    x_train,
    y_train,
    train_size=0.8,
    test_size=0.2,
)

# Bundle both splits; labels are kept one-hot encoded and in their original
# ("_og") numerical form.
data = dict(
    x_train=x_train_s,
    x_valid=x_valid_s,
    y_train=to_one_hot(y_train_s),
    y_valid=to_one_hot(y_valid_s),
    y_train_og=y_train_s,
    y_valid_og=y_valid_s,
)

# Baseline Linear Classifier: Linear SVM

In [23]:

from sklearn.svm import LinearSVC

Run the linear SVM baseline on the training data with the validation split.

In [25]:
def baseline_linear_svm(data):
    """
    Using out-of-the-box linear SVM to classify data.

    Parameters:
        data: dict with the keys "x_train", "x_valid", "y_train_og" and
              "y_valid_og" (numerical, non one-hot labels).

    Returns:
        Tuple (accuracy on the validation split, predicted validation labels).
    """
    clf = LinearSVC()

    # LinearSVC needs 1-D class labels, so the numerical labels are used rather
    # than the one-hot encoded "y_train" / "y_valid" entries.
    clf.fit(data["x_train"], data["y_train_og"])
    y_pred = clf.predict(data["x_valid"])

    # accuracy_score takes no `average` argument (that belongs to f1/precision/recall).
    return metrics.accuracy_score(data["y_valid_og"], y_pred), y_pred
    
# score, y_pred = baseline_linear_svm(data)
# print(score)

# Neural Network

In [10]:
from scipy.special import expit
import torch

In [20]:
class Layer:
    """
    Represents a (hidden) layer in the neural net.

    Layers form a doubly linked list through `next` / `prev`. The input of a layer
    is expected to carry a bias column of ones as its last column, so the last row
    of the weight matrix `w` holds the bias weights.
    """
    def __init__(self, input_rows, input_cols, learning_rate=0.01, num_nodes=200, activation_func="sigmoid"):
        """
        Parameters:
            input_rows: number of training rows, only used to scale the initial weights.
            input_cols: number of input columns (including the bias column).
            learning_rate: step size of the weight update.
            num_nodes: number of units in this layer.
            activation_func: "sigmoid", "tanh" or "relu".

        Raises:
            ValueError: for an unknown activation_func.
        """
        self.input_rows = input_rows
        self.input_cols = input_cols
        self.num_nodes = num_nodes
        self.next = None
        self.prev = None
        input_range = 1.0 / input_rows ** 0.5
        self.w = np.random.normal(loc=0, scale=input_range, size=(self.input_cols,num_nodes))
        self.learning_rate = learning_rate

        # feedforward() evaluates d_activation_func on the PRE-activation values,
        # so every derivative below is written as a function of the pre-activation.
        if activation_func == "sigmoid":
            def d_sigmoid(z):
                s = expit(z)
                return s * (1. - s)

            self.activation_func = expit
            self.d_activation_func = d_sigmoid

        elif activation_func == "tanh":
            self.activation_func = lambda z: np.tanh(z)
            self.d_activation_func = lambda z: 1. - np.square(np.tanh(z))

        elif activation_func == "relu":
            self.activation_func = lambda z: np.maximum(0, z)
            # 1 where the unit is active, 0 elsewhere (including exactly 0).
            self.d_activation_func = lambda z: np.greater(z, 0).astype(float)
        else:
            raise ValueError("Unknown activation function: {!r}".format(activation_func))

    def feedforward(self, x):
        """
        Compute this layer's output for x and pass it on to the next layer.

        Stores the input, the output and the activation derivative on the
        instance for the following backprop step. Returns nothing.
        """
        self.input = x
        before_activation = np.dot(x, self.w)
        self.output = self.activation_func(before_activation)
        self.derivative = self.d_activation_func(before_activation)

        # if there's a next layer
        if self.next:
            # add bias to the end of each row of self.output
            # need to check if the input was a vector or a matrix
            if self.output.ndim == 2:
                passed_output = np.append(self.output, np.ones((self.output.shape[0], 1)), axis=-1)
            else:
                passed_output = np.append(self.output, 1)

            # call next layer's feedforward step
            self.next.feedforward(passed_output)

    def backprop(self, prev_deltas):
        """
        Compute this layer's deltas, propagate them backwards and adjust w.

        prev_deltas is the error signal handed over by the following layer.
        """
        deltas = prev_deltas * self.derivative
        if self.prev:
            # w[:-1] drops the bias row: the bias has no counterpart in the
            # previous layer's output. The old weights are used on purpose,
            # the update happens afterwards.
            self.prev.backprop(np.dot(self.w[:-1], deltas.T).T)
        self.w = self.w - (self.learning_rate * np.dot(self.input.T, deltas))


class OutputLayer(Layer):
    """
    Similar to the Layer class, but designed to handle the start of the backprop algorithm.
    """

    def __init__(self, input_rows, input_cols, learning_rate=0.01, num_nodes=200, activation_func="softmax"):
        """
        Parameters:
            input_rows: number of training rows, only used to scale the initial weights.
            input_cols: number of input columns (including the bias column).
            learning_rate: step size of the weight update.
            num_nodes: number of output units.
            activation_func: "softmax" or "sigmoid".

        Raises:
            ValueError: for an unknown activation_func.
        """
        self.input_rows = input_rows
        self.input_cols = input_cols
        self.num_nodes = num_nodes
        self.next = None
        self.prev = None
        input_range = 1.0 / input_rows ** 0.5
        self.w = np.random.normal(loc=0, scale=input_range, size=(self.input_cols,num_nodes))
        self.learning_rate = learning_rate

        # backprop_func receives the layer OUTPUT and the targets.
        if activation_func == "softmax":
            self.activation_func = self.softmax
            self.backprop_func = lambda x, target: x - target

        elif activation_func == "sigmoid":
            self.activation_func = expit
            # x is already the sigmoid output, so its derivative is x * (1 - x).
            self.backprop_func = lambda x, target: x * (1. - x) * (x - target)

        else:
            raise ValueError("Unknown activation function: {!r}".format(activation_func))

        # The inherited feedforward() always evaluates d_activation_func; the
        # output layer does not use the result.
        self.d_activation_func = lambda x: None

    def backprop(self, targets):
        """
        Start backpropagation from the targets and adjust w.
        """
        deltas = self.backprop_func(self.output, targets)
        if self.prev:
            # w[:-1] drops the bias row, which has no counterpart in the
            # previous layer's output.
            self.prev.backprop(np.dot(self.w[:-1], deltas.T).T)
        self.w = self.w - self.learning_rate * np.dot(self.input.T, deltas)

    def softmax(self, x):
        """
        Row-wise softmax; the row maximum is subtracted first for numerical stability.
        """
        x = np.asarray(x)
        row_exp = np.exp(x - np.amax(x, axis=-1, keepdims=True))
        return row_exp / np.sum(row_exp, axis=-1, keepdims=True)
        
    
class NeuralNet:
    """
    Keeps the head and tail of the linked list of layers and drives training
    and prediction through them.
    """

    def __init__(self, learning_rate, num_epochs):
        self.first = None
        self.last = None
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs

    def add_layer(self, layer):
        """
        Append a layer to the end of the list.
        """
        if self.first:
            # Link the new layer behind the current tail.
            tail = self.last
            tail.next = layer
            layer.prev = tail
        else:
            self.first = layer
        self.last = layer

    def fit(self, x_train, y_train, x_valid, y_valid):
        """
        Train with mini-batch gradient descent (batches of 1000 rows) and report
        the validation accuracy before and after training.

        Returns the final validation accuracy.
        """
        bias_column = np.ones((x_train.shape[0], 1))
        x_input = np.append(x_train, bias_column, axis=-1)
        init_acc = self.get_accuracy(x_valid, y_valid)

        batch_size = 1000
        n_rows = len(x_input)
        for _ in range(self.num_epochs):
            for start in range(0, n_rows, batch_size):
                stop = start + batch_size
                self.first.feedforward(x_input[start:stop])
                self.last.backprop(y_train[start:stop])

        final_acc = self.get_accuracy(x_valid, y_valid)
        print("Initial accuracy, final accuracy", init_acc, final_acc)
        return final_acc

    def predict(self, x):
        """
        Run x (without bias column) through the net and return the last layer's output.
        """
        with_bias = np.append(x, np.ones((x.shape[0], 1)), axis=-1)
        self.first.feedforward(with_bias)
        return self.last.output

    def get_accuracy(self, x_valid, y_valid):
        """
        Accuracy of the predictions for x_valid against the one-hot targets y_valid.
        """
        predicted = from_one_hot(self.predict(x_valid))
        expected = from_one_hot(y_valid)
        return metrics.accuracy_score(expected, predicted)

### k fold validation

In [None]:
# Training split; in this cell only the commented-out experiment below refers to it.
x_tr = data["x_train"]
y_tr = data["y_train"]


# neural_net = NeuralNet(1e-5, 2)
# # neural_net.k_fold(x_train, y_train)
# neural_net.add_layer(Layer(x_tr.shape[0], x_tr.shape[1] + 1, 1e-5, 3500, activation_func="relu"))
# # neural_net.add_layer(Layer(x_tr.shape[0], 300 + 1, 1e-6, 200))
# neural_net.add_layer(OutputLayer(x_tr.shape[0], 3500 + 1, 1e-5, 10))
# # neural_net.fit(x_tr, y_tr, data["x_valid"], data["y_valid"])
# # print("Done")
        
def k_fold(x, y):
    """
    5-fold cross validation of a one-hidden-layer network.

    x holds the samples, y the numerical labels (one-hot encoded internally).
    Prints and returns the list of per-fold validation accuracies.
    """
    y = to_one_hot(y)
    kf = KFold(n_splits=5, shuffle=True)
    kf.get_n_splits(x)  # return value is not used
    accuracy_scores = []

    for train_index, test_index in kf.split(x):
        # Materialise the train / validation arrays of this fold.
        new_xt = np.array([x[ind] for ind in train_index])
        new_yt = np.array([y[ind] for ind in train_index])
        new_xv = np.array([x[ind] for ind in test_index])
        new_yv = np.array([y[ind] for ind in test_index])
        print(new_xt.shape, new_yt.shape, new_xv.shape, new_yv.shape)

        # A fresh net per fold: 3000 ReLU units followed by 10 output units.
        n_rows = new_xt.shape[0]
        nn = NeuralNet(1e-5, 2)
        nn.add_layer(Layer(n_rows, new_xt.shape[1] + 1, 1e-5, 3000, activation_func="relu"))
        nn.add_layer(OutputLayer(n_rows, 3000 + 1, 1e-5, 10))
        accuracy_scores.append(nn.fit(new_xt, new_yt, new_xv, new_yv))

    print(accuracy_scores, np.average(accuracy_scores))
    return accuracy_scores

# Cross-validate on the complete (cleaned) training set; k_fold one-hot encodes y_train itself.
k_fold(x_train, y_train)

# Neural Net with PyTorch

In [None]:
"""
These classes are meant to be equivalent to the previous classes.
The only difference is that this uses the pytorch Tensor libraries to perform the computation on the GPU.
Only functions for matrix computations were used, it served only to do the work of numpy on the GPU.
"""

class TorchLayer:
    """
    Torch counterpart of a hidden layer: the same computation on torch tensors.

    The weights live on the GPU when CUDA is available and on the CPU otherwise.
    The input is expected to carry a bias column of ones as its last column.
    """
    def __init__(self, input_rows, input_cols, learning_rate=0.01, num_nodes=200, activation_func="sigmoid"):
        """
        Parameters:
            input_rows: number of training rows, only used to scale the initial weights.
            input_cols: number of input columns (including the bias column).
            learning_rate: step size of the weight update.
            num_nodes: number of units in this layer.
            activation_func: "sigmoid", "tanh" or "relu".

        Raises:
            ValueError: for an unknown activation_func.
        """
        self.input_rows = input_rows
        self.input_cols = input_cols
        self.num_nodes = num_nodes
        self.next = None
        self.prev = None
        input_range = 1.0 / input_rows ** 0.5
        self.w = self._to_device(
            torch.from_numpy(np.random.normal(loc=0, scale=input_range, size=(self.input_cols,num_nodes)))
        )
        self.learning_rate = learning_rate

        # feedforward() evaluates d_activation_func on the PRE-activation values.
        if activation_func == "sigmoid":
            self.activation_func = torch.sigmoid
            # d_sigmoid is written in terms of the sigmoid output, so the
            # pre-activation is squashed first.
            self.d_activation_func = lambda z: self.d_sigmoid(torch.sigmoid(z))

        elif activation_func == "tanh":
            self.activation_func = self.tanh
            self.d_activation_func = self.d_tanh

        elif activation_func == "relu":
            self.activation_func = self.relu
            self.d_activation_func = self.d_relu
        else:
            raise ValueError("Unknown activation function: {!r}".format(activation_func))

    @staticmethod
    def _to_device(tensor):
        """Move a tensor to the GPU when one is available, else leave it on the CPU."""
        return tensor.cuda() if torch.cuda.is_available() else tensor

    def feedforward(self, x):
        """
        Compute this layer's output for x and pass it on to the next layer.

        Stores the input, the output and the activation derivative on the
        instance for the following backprop step. Returns nothing.
        """
        self.input = x
        before_activation = torch.mm(x, self.w)
        self.output = self.activation_func(before_activation)
        self.derivative = self.d_activation_func(before_activation)

        # if there's a next layer
        if self.next:
            # add bias to the end of each row of the output; type_as keeps the
            # column on the same device and dtype as the output.
            bias = torch.ones(self.output.size()[0], 1).type_as(self.output)
            passed_output = torch.cat((self.output, bias), 1)

            # call next layer's feedforward step
            self.next.feedforward(passed_output)

    def backprop(self, prev_deltas):
        """
        Compute this layer's deltas, propagate them backwards and adjust w.
        """
        deltas = prev_deltas * self.derivative
        if self.prev:
            # w[:-1] drops the bias row, which has no counterpart in the
            # previous layer's output.
            self.prev.backprop(torch.mm(self.w[:-1], deltas.t()).t())
        self.w = self.w - (self.learning_rate * torch.mm(self.input.t(), deltas))

    def tanh(self, x):
        """tanh activation."""
        return torch.tanh(x)

    def d_tanh(self, x):
        """Derivative of tanh at the pre-activation x."""
        return 1 - (torch.tanh(x))**2

    def relu(self, x):
        """ReLU activation."""
        return torch.clamp(x, min=0)

    def d_relu(self, x):
        """Derivative of ReLU at the pre-activation x (1 where x > 0, else 0)."""
        return torch.gt(x, 0).double()

    def sigmoid(self, x):
        """
        sigmoid function
        """
        # torch.sigmoid works for CPU and GPU tensors; np.exp does not accept GPU tensors.
        return torch.sigmoid(x)

    def d_sigmoid(self, x):
        """
        derivative of sigmoid, expressed in terms of the sigmoid OUTPUT x
        """
        return x * (1. - x)
   
    
class TorchOutputLayer(TorchLayer):
    """
    Torch counterpart of the output layer: starts the backprop pass from the targets.
    """
    def __init__(self, input_rows, input_cols, learning_rate=0.01, num_nodes=200, activation_func="softmax"):
        """
        Parameters:
            input_rows: stored, not used for the weight initialisation here.
            input_cols: number of input columns (including the bias column).
            learning_rate: step size of the weight update.
            num_nodes: number of output units.
            activation_func: "softmax" or "sigmoid".

        Raises:
            ValueError: for an unknown activation_func.
        """
        self.input_rows = input_rows
        self.input_cols = input_cols
        self.num_nodes = num_nodes
        self.next = None
        self.prev = None
        weights = torch.from_numpy(np.random.uniform(size=(self.input_cols,num_nodes)) / np.sqrt(self.input_cols))
        # Use the GPU when there is one, otherwise stay on the CPU.
        self.w = weights.cuda() if torch.cuda.is_available() else weights
        self.learning_rate = learning_rate

        # backprop_func receives the layer OUTPUT and the targets.
        if activation_func == "softmax":
            self.activation_func = self.softmax
            self.backprop_func = lambda x, target: x - target

        elif activation_func == "sigmoid":
            self.activation_func = torch.sigmoid
            # x is already the sigmoid output, so its derivative is x * (1 - x).
            self.backprop_func = lambda x, target: x * (1. - x) * (x - target)

        else:
            raise ValueError("Unknown activation function: {!r}".format(activation_func))

        # Value is unused.
        self.d_activation_func = lambda x: None

    def backprop(self, targets):
        """
        Start backpropagation from the targets and adjust w.
        """
        deltas = self.backprop_func(self.output, targets)
        if self.prev:
            # w[:-1] drops the bias row, which has no counterpart in the
            # previous layer's output.
            self.prev.backprop(torch.mm(self.w[:-1], deltas.t()).t())
        self.w = self.w - self.learning_rate * torch.mm(self.input.t(), deltas)


    def softmax(self, x):
        """Row-wise softmax of x."""
        # This is technically from the neural net package, but it's really just a way to compute Softmax.
        return torch.nn.functional.softmax(torch.autograd.Variable(x), 1).data
        
    
class TorchNeuralNet:
    """
    Stores a linked list of torch layers, and starts the method calls.

    Inputs are moved to the device the first layer's weights live on, so the
    net runs on the GPU when the layers are there and on the CPU otherwise.
    """
    def __init__(self, learning_rate, num_epochs):
        self.first = None
        self.last = None
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs

    def add_layer(self, layer):
        """
        Add layer to the end
        """
        if not self.first:
            self.first = layer
            self.last = layer

        else:
            temp = self.last
            temp.next = layer
            layer.prev = temp
            self.last = layer

    def _to_model_device(self, tensor):
        """Move a tensor to the device of the first layer's weights."""
        weights = getattr(self.first, "w", None)
        if weights is not None:
            use_cuda = weights.is_cuda
        else:
            use_cuda = torch.cuda.is_available()
        return tensor.cuda() if use_cuda else tensor

    def fit(self, x_train, y_train):
        """
        Train with mini-batch gradient descent (batches of 500 rows).

        x_train and y_train are numpy arrays; y_train holds the one-hot targets.
        """
        x_input = np.append(x_train, np.ones((x_train.shape[0], 1)), axis=-1)

        for _ in range(self.num_epochs):

            for j in range(0, len(x_input), 500):
                batch_x = self._to_model_device(torch.from_numpy(x_input[j:j+500]))
                batch_y = self._to_model_device(torch.from_numpy(y_train[j:j+500]).double())
                self.first.feedforward(batch_x)
                self.last.backprop(batch_y)

    def predict(self, x):
        """
        Run x (numpy, without bias column) through the net; returns a numpy array.
        """
        x_input = self._to_model_device(torch.from_numpy(np.append(x, np.ones((x.shape[0], 1)), axis=-1)))
        self.first.feedforward(x_input)
        return self.last.output.cpu().numpy()

    def _accuracy(self, x, y_true):
        """Predict x in batches of 1000 rows and score against the numerical labels y_true."""
        y_pred = np.empty((len(x), 10), dtype=x.dtype)
        for i in range(0, len(x), 1000):
            y_pred[i:i+1000] = self.predict(x[i:i+1000])

        return metrics.accuracy_score(y_true, from_one_hot(y_pred))

    def get_accuracy(self):
        """Accuracy on the validation split of the global `data` dict."""
        return self._accuracy(data["x_valid"], data["y_valid_og"])

    def get_train_accuracy(self):
        """Accuracy on the training split of the global `data` dict."""
        return self._accuracy(data["x_train"], data["y_train_og"])


# Training split with one-hot encoded targets.
x_tr = data["x_train"]
y_tr = data["y_train"]

# Run a sample Neural Net.

# 400 epochs; the learning rate actually applied is the one given to each layer.
neural_net = TorchNeuralNet(1e-5, 400)

# pixels (+1 bias) -> 25 tanh units -> 75 ReLU units -> 10 output units.
# The "+ 1" accounts for the bias column appended to every layer's input.
neural_net.add_layer(TorchLayer(x_tr.shape[0], x_tr.shape[1] + 1, 1e-4, 25, activation_func="tanh"))
neural_net.add_layer(TorchLayer(x_tr.shape[0], 25 + 1, 1e-4, 75, activation_func="relu"))
neural_net.add_layer(TorchOutputLayer(x_tr.shape[0], 75 + 1, 1e-4, 10))

neural_net.fit(x_tr, y_tr)
# Accuracy on the validation split.
print(neural_net.get_accuracy())
print("Done")
        
        

# Convolutional Neural Network

In [11]:
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

In [12]:

class CNN(nn.Module):
    """
    Sequential container: layers are registered one by one and applied in the
    order in which they were added.
    """

    def __init__(self):
        super().__init__()
        self.layers = []
        self.num_layers = 0

    def add_layer(self, layer):
        """Register `layer` as a sub-module named after its 1-based position."""
        self.num_layers += 1
        module_name = str(self.num_layers)
        self.add_module(module_name, layer)
        self.layers.append(layer)

    def forward(self, x):
        """Feed x through every layer in insertion order and return the result."""
        out = x
        for module in self.layers:
            out = module(out)
        return out

    
class View_Layer(nn.Module):
    """
    Wraps Tensor.view() in a module so the reshape can be used as a layer of the
    network. It is needed before the linear layers.
    """
    def __init__(self, param1, param2):
        super().__init__()
        self.param1 = param1
        self.param2 = param2

    def forward(self, x):
        """Return x viewed with shape (param1, param2)."""
        target_shape = (self.param1, self.param2)
        return x.view(*target_shape)
    
    
def fit(cnn, x_train, y_train, epochs, lr=0.01, momentum=0.9):
    """
    Train the model on the given inputs x_train and y_train, stored in numpy arrays.

    Parameters:
        cnn: the network to train; it is updated in place.
        x_train: numpy array of images; every batch is viewed as (-1, 1, 64, 64).
        y_train: numpy array of numerical class labels.
        epochs: number of passes over the data.
        lr, momentum: settings of the SGD optimizer.

    The data is processed in batches of 500 rows. Batches are moved to the GPU
    only when the model itself lives there, so a CPU model trains on the CPU.
    """
    crit = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(cnn.parameters(), lr=lr, momentum=momentum)
    use_cuda = next(cnn.parameters()).is_cuda

    for _ in range(0, epochs):
        for j in range(0, x_train.shape[0], 500):
            x_batch = torch.from_numpy(x_train[j:j+500])
            y_batch = torch.from_numpy(y_train[j:j+500]).long()
            if use_cuda:
                x_batch = x_batch.cuda()
                y_batch = y_batch.cuda()
            x_in = Variable(x_batch).view(-1, 1, 64, 64)
            y_in = Variable(y_batch)

            optimizer.zero_grad()

            y_pred = cnn(x_in)
            loss = crit(y_pred, y_in)
            loss.backward()
            optimizer.step()
            
            
def fit_loader(cnn, dataloader, epochs, lr=0.01, momentum=0.9):
    """
    Same as the fit method, but it takes a dataloader for the data instead of numpy arrays.

    Parameters:
        cnn: the network to train; it is updated in place.
        dataloader: iterable yielding (inputs, labels) batches of tensors.
        epochs: number of passes over the dataloader.
        lr, momentum: settings of the SGD optimizer.

    Batches are moved to the GPU only when the model itself lives there.
    """
    crit = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(cnn.parameters(), lr=lr, momentum=momentum)
    use_cuda = next(cnn.parameters()).is_cuda

    for _ in range(0, epochs):
        for x_batch, y_batch in dataloader:
            y_batch = y_batch.long()
            if use_cuda:
                x_batch = x_batch.cuda()
                y_batch = y_batch.cuda()
            x_in = Variable(x_batch)
            y_in = Variable(y_batch)

            optimizer.zero_grad()

            y_pred = cnn(x_in)
            loss = crit(y_pred, y_in)
            loss.backward()
            optimizer.step()
        
        

    
def predict(cnn, x):
    """
    Return the network's raw output scores for the images in x as a numpy array.

    x is a numpy array that is viewed as (-1, 1, 64, 64). The input is moved to
    the GPU only when the model itself lives there.
    """
    inputs = torch.from_numpy(x)
    if next(cnn.parameters()).is_cuda:
        inputs = inputs.cuda()
    return cnn(Variable(inputs.view(-1, 1, 64, 64))).cpu().data.numpy()

def predict_batch(cnn, x):
    """
    Predict the scores for all rows of x, 1000 rows at a time.

    Returns an array of shape (len(x), 10) with the dtype of x.
    """
    total = len(x)
    chunk = 1000
    scores = np.empty((total, 10), dtype=x.dtype)
    for start in range(0, total, chunk):
        stop = start + chunk
        scores[start:stop] = predict(cnn, x[start:stop])

    return scores

def get_accuracy(cnn, x_valid, y_valid):
    """
    Accuracy of the network on x_valid against the one-hot targets y_valid.

    The predictions are computed in batches of 100 rows. The predicted class is
    taken from the float scores of each batch directly: copying the scores into
    a buffer with the (integer) dtype of the one-hot targets would truncate them
    before the argmax.
    """
    predicted_labels = []
    for i in range(0, len(x_valid), 100):
        scores = predict(cnn, x_valid[i:i+100])
        predicted_labels.extend(from_one_hot(scores))

    return metrics.accuracy_score(from_one_hot(y_valid), predicted_labels)
    



In [13]:
import time
import datetime
import logging

"""
This is used to log outputs to both stdout, and to a log file.
This is necessary when running the notebook remotely.
It makes it possible to close the browser, and keep the output.
"""

# getLogger() without a name returns the root logger.
logger = logging.getLogger()

def setup_file_logger(log_file):
    """
    Attach a file handler writing to log_file to the root logger and set its
    level to INFO.

    Calling the function again with the same file (e.g. when the cell is re-run)
    does not add a second handler, so messages are not written twice.
    """
    import os

    # Same object as the module-level `logger`, which is the root logger.
    root_logger = logging.getLogger()
    target = os.path.abspath(log_file)

    already_attached = any(
        isinstance(handler, logging.FileHandler) and handler.baseFilename == target
        for handler in root_logger.handlers
    )
    if not already_attached:
        hdlr = logging.FileHandler(log_file)
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        hdlr.setFormatter(formatter)
        root_logger.addHandler(hdlr)
    root_logger.setLevel(logging.INFO)

def log(message):
    """
    Write message, prefixed with the current time, to the Jupyter console and
    send it to the logger at INFO level.
    """
    timestamp = datetime.datetime.now()
    # console output
    print('{} {}'.format(timestamp, message))
    # log output
    logger.info(message)

# Messages passed to log() are also written to out.log.
setup_file_logger('out.log')



In [14]:
import torchvision
import torchvision.transforms as transforms

In [15]:

from torch.utils.data import Dataset

class ImageArrayDataset(Dataset):
    """
    This is a Dataset used to hold the training data in numpy arrays.
    It also allows for transformation to the data.
    """

    def __init__(self, x, y, transform=None):
        """
        Parameters:
            x: array of samples, indexed along the first axis.
            y: array of labels, aligned with x.
            transform: optional callable applied to a sample when it is fetched.
        """
        self.x = x
        self.y = y
        self.transform = transform

    def __len__(self):
        """Number of samples."""
        return self.x.shape[0]

    def __getitem__(self, idx):
        """
        Return the pair (sample, label) at position idx.

        The sample is passed through `transform` when one was given and is
        returned unchanged otherwise.
        """
        sample = self.x[idx]
        if self.transform:
            sample = self.transform(sample)

        return (sample, self.y[idx])
    

Run CNN on training data with the validation split.

In [None]:
log("Testing with 9 conv layers.")

x_tr = data["x_train"].reshape(-1, 1, 64, 64)
y_tr = data["y_train_og"]


# Add a random rotation to the data.
# The -1/1 image is converted to a 0/255 uint8 image, rotated by up to 40 degrees
# as a PIL image, and converted back to a -1/1 double tensor.
trans = transforms.Compose([
    transforms.Lambda(lambda x: np.array([255 if i == 1.0 else 0 for i in x.reshape(-1)]).astype(np.uint8).reshape(64, 64, 1)),
    transforms.ToPILImage(),
    transforms.RandomRotation(40),
    transforms.ToTensor(),
    transforms.Lambda(lambda x: x.view(1, 64, 64).double().apply_(lambda a: 1.0 if a == 1 else -1.0))
])

train = ImageArrayDataset(x_tr, y_tr, transform=trans)

loader = torch.utils.data.DataLoader(train, batch_size=500, shuffle=True, num_workers=2)

# Hyperparameters of the 9 convolution layers, one list entry per layer.
conv_pads = [2, 2, 2, 1, 1, 0, 0, 0, 0]                  # padding
conv_strides = [1, 1, 1, 1, 1, 1, 1, 1, 1]               # stride
conv_kernels = [3, 3, 3, 3, 3, 3, 3, 3, 3]               # filter size
# Probability of data being dropped before each convolution layer
# (None: no dropout in front of the first convolution).
conv_drops = [None, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
conv_outs = [25, 35, 50, 65, 80, 90, 100, 110, 120]      # output channels
# Max pooling follows only the first three convolutions; set further entries to
# True to pool after later layers as well.
conv_pools = [True, True, True, False, False, False, False, False, False]

# Fully connected part: dropout probability in front of, and output size of,
# each linear layer.
fc_drops = [0.5, 0.5, 0.5]
fc_sizes = [120, 84, 10]


# Initialize CNN.
cnn = CNN()

# Convolution groups: batch normalization, (dropout,) convolution, ReLU and,
# where configured, max pooling. side_len tracks the side length of the image.
in_channels = 1
side_len = 64
for conv_pad, conv_stride, conv_kernel, conv_drop, conv_out, conv_pool in zip(
        conv_pads, conv_strides, conv_kernels, conv_drops, conv_outs, conv_pools):
    cnn.add_layer(nn.BatchNorm2d(in_channels))
    if conv_drop is not None:
        cnn.add_layer(nn.Dropout2d(conv_drop))
    cnn.add_layer(nn.Conv2d(in_channels, conv_out, conv_kernel, padding=conv_pad, stride=conv_stride))
    side_len = int((side_len - conv_kernel + (2 * conv_pad)) / conv_stride) + 1

    cnn.add_layer(nn.ReLU())
    if conv_pool:
        cnn.add_layer(nn.MaxPool2d(2, 2, padding=1))
        side_len = int((side_len + 2) / 2)

    in_channels = conv_out


# Bring the data into a line.
in_features = conv_outs[-1] * side_len**2
cnn.add_layer(View_Layer(-1, in_features))

# Run a fully connected neural network. Use Dropout layers to set some activations to zero during training.
# ReLU follows every linear layer except the last one, which produces the class scores.
for fc_index, (fc_drop, fc_size) in enumerate(zip(fc_drops, fc_sizes)):
    cnn.add_layer(nn.BatchNorm1d(in_features))
    cnn.add_layer(nn.Dropout(fc_drop))
    cnn.add_layer(nn.Linear(in_features, fc_size))
    if fc_index < len(fc_sizes) - 1:
        cnn.add_layer(nn.ReLU())
    in_features = fc_size

# Convert to a Double network, prepare to train, and move to the GPU.
cnn.double()
cnn.train(True)
cnn.cuda()

# Fit on the training data.
fit_loader(cnn, loader, 25, lr=0.2)
cnn.train(False)
acc = get_accuracy(cnn, data["x_valid"], data["y_valid"])

# Log the configuration (output channels of every conv layer, then the filter
# size of the first one), and the resulting accuracy.
log("\t".join(str(value) for value in conv_outs + [conv_kernels[0]]))
log("Accuracy: " + str(acc))
log("Train Accuracy: " + str(get_accuracy(cnn, data["x_train"], data["y_train"])))



Run the CNN on the full training set, and make predictions for the test set.

In [27]:
# loading test data
# Load the full training set and the test set from the local data/ directory.
x_train_full = clean_data(np.loadtxt("data/train_x.csv", delimiter=","))
y_train_full = np.loadtxt("data/train_y.csv", delimiter=",")
# One-hot encode the labels that were just loaded.
y_train_one_hot = to_one_hot(y_train_full)
x_test_full = clean_data(np.loadtxt("data/test_x.csv", delimiter=","))

Done


In [28]:
def save_results(filename, y_pred):
    """
    Write the predictions to `filename` as CSV.

    The file starts with the header "Id,Label", followed by one "index,label"
    line per prediction; there is no trailing newline.
    """
    rows = ["\n" + str(index) + "," + str(label) for index, label in enumerate(y_pred)]
    with open(filename, 'w') as out_file:
        out_file.write("Id,Label" + "".join(rows))

In [None]:
log("Testing with 9 conv layers.")

x_tr = x_train_full.reshape(-1, 1, 64, 64)
y_tr = y_train_full

# Add a random rotation to the data.
# The -1/1 image is converted to a 0/255 uint8 image, rotated by up to 40 degrees
# as a PIL image, and converted back to a -1/1 double tensor.
trans = transforms.Compose([
    transforms.Lambda(lambda x: np.array([255 if i == 1.0 else 0 for i in x.reshape(-1)]).astype(np.uint8).reshape(64, 64, 1)),
    transforms.ToPILImage(),
    transforms.RandomRotation(40),
    transforms.ToTensor(),
    transforms.Lambda(lambda x: x.view(1, 64, 64).double().apply_(lambda a: 1.0 if a == 1 else -1.0))
])

train = ImageArrayDataset(x_tr, y_tr, transform=trans)

loader = torch.utils.data.DataLoader(train, batch_size=500, shuffle=True, num_workers=2)

# Hyperparameters of the 9 convolution layers, one list entry per layer.
conv_pads = [2, 2, 2, 1, 1, 0, 0, 0, 0]                  # padding
conv_strides = [1, 1, 1, 1, 1, 1, 1, 1, 1]               # stride
conv_kernels = [3, 3, 3, 3, 3, 3, 3, 3, 3]               # filter size
# Probability of data being dropped before each convolution layer
# (None: no dropout in front of the first convolution).
conv_drops = [None, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
conv_outs = [25, 35, 50, 65, 80, 90, 100, 110, 120]      # output channels
# Max pooling follows only the first three convolutions; set further entries to
# True to pool after later layers as well.
conv_pools = [True, True, True, False, False, False, False, False, False]

# Fully connected part: dropout probability in front of, and output size of,
# each linear layer.
fc_drops = [0.5, 0.5, 0.5]
fc_sizes = [120, 84, 10]


# Initialize CNN.
cnn = CNN()

# Convolution groups: batch normalization, (dropout,) convolution, ReLU and,
# where configured, max pooling. side_len tracks the side length of the image.
in_channels = 1
side_len = 64
for conv_pad, conv_stride, conv_kernel, conv_drop, conv_out, conv_pool in zip(
        conv_pads, conv_strides, conv_kernels, conv_drops, conv_outs, conv_pools):
    cnn.add_layer(nn.BatchNorm2d(in_channels))
    if conv_drop is not None:
        cnn.add_layer(nn.Dropout2d(conv_drop))
    cnn.add_layer(nn.Conv2d(in_channels, conv_out, conv_kernel, padding=conv_pad, stride=conv_stride))
    side_len = int((side_len - conv_kernel + (2 * conv_pad)) / conv_stride) + 1

    cnn.add_layer(nn.ReLU())
    if conv_pool:
        cnn.add_layer(nn.MaxPool2d(2, 2, padding=1))
        side_len = int((side_len + 2) / 2)

    in_channels = conv_out


# Bring the data into a line.
in_features = conv_outs[-1] * side_len**2
cnn.add_layer(View_Layer(-1, in_features))

# Run a fully connected neural network. Use Dropout layers to set some activations to zero during training.
# ReLU follows every linear layer except the last one, which produces the class scores.
for fc_index, (fc_drop, fc_size) in enumerate(zip(fc_drops, fc_sizes)):
    cnn.add_layer(nn.BatchNorm1d(in_features))
    cnn.add_layer(nn.Dropout(fc_drop))
    cnn.add_layer(nn.Linear(in_features, fc_size))
    if fc_index < len(fc_sizes) - 1:
        cnn.add_layer(nn.ReLU())
    in_features = fc_size

# Convert to a Double network, prepare to train, and move to the GPU.
cnn.double()
cnn.train(True)
cnn.cuda()

# Fit on the training data.
fit_loader(cnn, loader, 25, lr=0.2)
cnn.train(False)

# Log the configuration (output channels of every conv layer, then the filter
# size of the first one), and the resulting accuracy.
log("\t".join(str(value) for value in conv_outs + [conv_kernels[0]]))
log("Train Accuracy: " + str(get_accuracy(cnn, data["x_train"], data["y_train"])))
# Predict the test set and store the labels for submission.
save_results("test_y_94.csv", from_one_hot(predict_batch(cnn, x_test_full)))
