# Deep Learning Assignment 1

In [2]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import seaborn as sns
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import time

## Logistic Regression implementation.
The implementation of logistic regression here is as described in the lecture notes. The weights are created as a vector, each uniformly distributed on $[0,1]$. The 'back propogation' step takes place in the stochastic_gradient_descent function. 

In [3]:
class LogisticRegression:
    def __init__(self, n_inputs, sgd_thresh = 10e-8, sgd_max_iters = 2500, alpha = 0.0001):
        self.n_inputs = n_inputs
        self.weights = np.random.rand(self.n_inputs, 1).astype('f')
        #print(self.weights)
        self.bias = 1.0
        self.alpha = alpha
        self.sgd_thresh = sgd_thresh
        self.sgd_max_iters = sgd_max_iters
        
    def sigmoid(self, x):
        return 1/(1+np.exp(-1*x))
    
    def predict(self, x):
        x = np.array(x).reshape(-1,1)
        return int(np.round(self.sigmoid(np.dot(self.weights.T, x) + self.bias)))
    
    def log_loss(self, X, Y):
        #dot = [np.dot(self.weights.T, x) for x in X]
        
        y_hats = [self.sigmoid(np.dot(self.weights.T, x) + self.bias) for x in X]
        self.yhats = y_hats
        #print(np.sum(y_hats))
        J_w_b = (-1/len(Y))*np.sum([y*np.log(y_hat)+(1-y)*np.log(1-y_hat) for y,y_hat in zip(Y,y_hats)])
        return J_w_b

    def stochastic_gradient_descent(self,X,Y):
        X = np.array(X)
        Y = np.array(Y)
        J = self.log_loss(X,Y)
        for i in range(self.sgd_max_iters):
            idx = np.random.randint(len(X))
            x_sample, y_sample = X[idx], Y[idx]
            diff = self.predict(x_sample) - y_sample

            self.weights = self.weights - (self.alpha*diff*x_sample).reshape(-1,1)

            self.bias = self.bias - self.alpha*diff

            
            if self.log_loss(X,Y) - J < self.sgd_thresh:
                break
            #print(self.log_loss(X,Y), J)
            J = self.log_loss(X,Y)
            
    def score(self, x_test, y_test):
        x_test = np.array(x_test)
        y_test = np.array(y_test)
        correct = 0
        total = len(x_test)
        for x,y in zip(x_test, y_test):
            if self.predict(x) ==  y:
                correct += 1
        return correct/total

## Performance on the moons400 dataset

The data is loaded in using pandas and is split into training and test sets using the train_test_split function from the model_selection module in the sci-kit learn package. On the harder of the two problems it achieves an accuracy of 

In [4]:
path = "/home/oisin/MAI_work/ongoing_assignments/DeepLearning/data/"

data = pd.read_csv(path+"moons400.csv")

x_train, x_test, y_train, y_test = train_test_split(data.drop(['Class'], axis = 1), data['Class']) 

lr = LogisticRegression(x_train.shape[1],sgd_max_iters=200,sgd_thresh=-np.inf,alpha=10e-2)
lr.stochastic_gradient_descent(x_train,y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)
lr.score(x_test, y_test)

0.86

## Performance on the blobs250 dataset

We observe then on the easiest of all the data sets that logistic regression perfectly splits the datasets and scores 100% on the test set.

In [17]:
data = pd.read_csv(path+"blobs250.csv")

x_train, x_test, y_train, y_test = train_test_split(data.drop(['Class'], axis = 1), data['Class']) 

lr = LogisticRegression(x_train.shape[1],sgd_max_iters=200,sgd_thresh=-np.inf,alpha=10e-2)
lr.stochastic_gradient_descent(x_train,y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)
lr.score(x_test, y_test)

1.0

## Shallow Neural Network Performance
Below is the code for the implementation of the neural network. Since one of my enhancements was to implement deep neural networks (support for multiple layers), the code is identical to the enhancement code. This is simply because there was no point implementing code specific to one hidden layer and then implementing code for many layers when just generalising first does the trick. 

### Layer Class
This implementation has a Layer class, where the weights and layer activation function are stored. The weights are initially $\mathcal{N}(0,1)$ as are the biases. The class implements "fwdprop_ouput" which calculates a and z for the Layer, as seen in the notes.

### Network Class
The Network class creates the layers, as directed by the inputs. These inputs are in the form of 2 lists, m and n say, where the $i^{th}$ member of m is the number of nodes in layer $i$, and the $i^{th}$ member of n is the activation function class for that layer (see below).

### Activation Function Class
The activation functions are created with an "f" method and a "deriv" method. These represent the function itself and the derivative of the function, to be used in the back propagation step, as required by the chain rule

In [188]:
class Layer:
    
    def __init__(self, prev_no_nodes, no_nodes, activation_function, is_input=False):
        if not(is_input):
            self.weights = np.random.normal(0.01,1,size = (prev_no_nodes,no_nodes))
            self.bias = np.random.normal(0.01,1, size = (no_nodes,1))
            self.act_f = activation_function()
            
        else:
            self.act_f = activation_function()
        self.no_nodes = no_nodes
        self.is_input = is_input

    def fwdprop_output(self, X):
        if self.is_input:
            self.a = X
            self.z = self.a
            return X
        X = X.reshape(-1,1)
        self.z = np.dot(self.weights.T,X) + self.bias
        self.a = self.act_f.f(self.z)
        return self.a
    
    
    
class IdentityActivation:
    def __init__(self):
        pass
    
    def f(self,x):
        return x
    
    def deriv(self, x):
        return x
    
    
class ReluActivation:
    def __init__(self):
        pass
    
    def f(self, x):
        return np.maximum(x,0)
        
    def deriv(self, x):
        x[x>0] = 1
        x[x<0] = 0
        return x
        
class SigmoidActivation:
    def __init__(self):
        pass
    
    def f(self,x):
        return 1/(1+np.exp(-x))
    
    def deriv(self, x):
        return self.f(x)*(1-self.f(x))

class Network:
    
    def __init__(self, no_nodes_layer, activation_function, loss_function = "log_loss", lamb = 0):
        '''
        TODO: Sort out the fact that the first layer doesn't have an activation function (DOESNT FUCKIN NEED ONE AHAHHA)
        TODO: Add the loss_functions
        TODO: Finish train and predict/score
        '''
        self.no_nodes_layer = no_nodes_layer
        self.activation_function = activation_function
        self.input_size = no_nodes_layer[0]
        self.no_layers = len(self.no_nodes_layer)
        self.lamb = lamb
        self.no_params = sum([self.no_nodes_layer[i]*self.no_nodes_layer[i-1] for i in range(1,len(self.no_nodes_layer))])
        
        if isinstance(no_nodes_layer, list):
            '''
            TODO: add functionality so that differnet layers can have different activation functions 
            '''
            assert(self.no_nodes_layer[-1] == 1 or self.no_nodes_layer[-1] == 2)
            self.layers = [Layer(self.no_nodes_layer[i-1],self.no_nodes_layer[i],activation_function[i]) for i in range(1,len(self.no_nodes_layer))]
            input_layer = Layer(0,no_nodes=self.no_nodes_layer[0],activation_function=activation_function[0], is_input=True)
            self.layers = [input_layer] + self.layers

        else:
            #Come up with a better default
            self.layers = [Layer(no_nodes_layer,1)]

        self.W = np.zeros(0)
        #turn all the weight matrices into one long weight vector so one can find the l_p norm of it.
        self.W = np.concatenate([self.W] +[layer.weights.flatten() for layer in self.layers[1:]])
        #print(np.linalg.norm(self.W, 2))
    
    def sigmoid(self, x):
        return 1/(1+np.exp(-x))
    
    def log_loss(self,X,Y):
        y_hats = [self.fwdpropagate(x) for x in X]
        J_w_b = (-1)*sum([y*np.log(y_hat)+(1-y)*np.log(1-y_hat) for y,y_hat in zip(Y,y_hats)])
        J_w_b += self.lamb*np.linalg.norm(self.W, 2)
        return J_w_b
        
        
    def fwdpropagate(self, _input):
        
        if len(_input) != self.layers[0].no_nodes:
            print(f"Input must be of length {self.layers[0].no_nodes}, it is of length {len(_input)}")
        
        self.a_s = []   
        a = _input

        
        for layer in self.layers:
            a = layer.fwdprop_output(a)
            self.a_s.append(a)
        
        #self.layer_outputs = []
        y_hat = a
        #self.layer_outputs.append(y_hat)
        
        self.prediction = int(np.round(y_hat))

        
    def d_dout_sig(self, x):
        return self.sigmoid(x)*(1.-self.sigmoid(x))
    
    
    def d_dout_loss(self,X):
        pass
    
    def train(self, x_train, y_train):
        raise NotImplementedError

    def predict(self, x_test):
        self.fwdpropagate(x_test)
        return self.prediction

    def stochastic_gradient_descent(self, x_train, y_train, alpha = 0.001, n_iter = 10):
        '''
        Implementation of stochastic gradient descent with regularization. 
        '''
        
        
        x_train = np.array(x_train)
        y_train = np.array(y_train)
        
        #combine the x and ys into one array to make suffling more straightforward
        comb = np.c_[x_train.reshape(len(x_train), -1), y_train.reshape(len(y_train), -1)]
        
        #keep track of total training time and print initial accuracy sans training.
        start = time.time()
        ita = score = self.score(x_train,y_train)
        print(f"Initial training accuracy: {ita:.4f}%")
        for epoch in range(1,n_iter+1):
            print(f"Running epoch {epoch} of {n_iter}")
            
            #This is just a fancy way of shuffling the x and ys together.
            x_train_c = comb[:, :x_train.size//len(x_train)].reshape(x_train.shape)
            y_train_c = comb[:, x_train.size//len(x_train):].reshape(y_train.shape)
            np.random.shuffle(comb)

        
            for x,y in tqdm(zip(x_train_c, y_train_c)):
                self.backward_propagate_error(x,y)
                for i,layer in enumerate(self.layers):
                    if layer.is_input:
                        continue
                    else:
                        layer.weights -= (alpha*np.array(self.delta_w[i]))
                        layer.bias -= alpha*np.array(self.delta_b[i])
            
            
            score = self.score(x_train,y_train)
            print(f"Training accuracy: {score:.4f}%")
            #-----------------
            
            
        end = time.time()
        print(f"Ran {n_iter} epochs in {end-start}")
    
    
    def score(self, x_test, y_test):
        '''
        Simple score function that gets the accuracy for a given test set
        '''
        x_test = np.array(x_test)
        y_test = np.array(y_test)
        correct = 0
        total = len(x_test)
        for x,y in zip(x_test, y_test):
            self.fwdpropagate(x)
            if self.predict(x) ==  y:
                correct += 1
        return correct/total
    
    
    
    def backward_propagate_error(self, x, y):
        ''' The backward propogate error function is based on the algorithm from http://cs229.stanford.edu/notes2020spring/cs229-notes-deep_learning.pdf
            (on the 2nd last page). Note the difference in derivatives of cost function from these notes, as 
            the notes use 1/2 squared error as the cost function.
            
            The function fisrt does the forward propagtaion to ensure the z's and a's of each layer exist. Then,
            using the chain rule, we compute the gradients for each weight in each layer.
        '''
        self.fwdpropagate(x)
        delta = [None]*self.no_layers  
        delta_w = [None]*self.no_layers 
        delta_b = [None]*self.no_layers
   
        for i in range(self.no_layers - 1, -1, -1):
            if i == self.no_layers - 1:
                if y == 1:
                    q = (y/(self.layers[i].a+10e6))
                else:
                    q = -1*((1-y)/(1-(self.layers[i].a-10e-6)))
                delta[i] = -1*q*self.layers[i].act_f.deriv(self.layers[i].z)
            else:
                delta[i] = ((self.layers[i+1].weights)@(delta[i+1]))*self.layers[i].act_f.deriv(self.layers[i].z).reshape(-1,1)
                delta_w[i+1] = ((delta[i+1])@(self.layers[i].a.reshape(1,-1))).T + 2*self.lamb*self.layers[i+1].weights
                delta_b[i+1] = delta[i+1]

        self.delta_w = delta_w
        self.delta_b = delta_b
        self.delta = delta

In [33]:
class Layer:
    
    def __init__(self, prev_no_nodes, no_nodes, activation_function, is_input=False):
        if not(is_input):
            self.weights = np.random.normal(0.0,1,size = (prev_no_nodes,no_nodes))
            self.bias = np.random.normal(0.0,1, size = (no_nodes,1))
            self.act_f = activation_function
            
        self.no_nodes = no_nodes
        self.is_input = is_input

    def fwdprop_output(self, X):
        if self.is_input:
            self.a = X
            self.z = self.a
            return X
        X = X.reshape(-1,1)
        self.z = np.dot(self.weights.T,X) + self.bias
        self.a = self.act_f(self.z)
        return self.a

    
class Network:
    
    def __init__(self, no_nodes_layer, activation_function, loss_function = "log_loss", lamb = 0):
        '''
        TODO: Sort out the fact that the first layer doesn't have an activation function (DOESNT FUCKIN NEED ONE AHAHHA)
        TODO: Add the loss_functions
        TODO: Finish train and predict/score
        '''
        self.no_nodes_layer = no_nodes_layer
        self.activation_function = activation_function
        self.input_size = no_nodes_layer[0]
        self.no_layers = len(self.no_nodes_layer)
        self.lamb = lamb
        self.no_params = sum([self.no_nodes_layer[i]*self.no_nodes_layer[i-1] for i in range(1,len(self.no_nodes_layer))])

        
        if isinstance(no_nodes_layer, list):
            '''
            TODO: add functionality so that differnet layers can have different activation functions 
            '''
            assert(self.no_nodes_layer[-1] == 1 or self.no_nodes_layer[-1] == 2)
            self.layers = [Layer(self.no_nodes_layer[i-1],self.no_nodes_layer[i],activation_function) for i in range(1,len(self.no_nodes_layer))]
            input_layer = Layer(0,no_nodes=self.no_nodes_layer[0],activation_function=activation_function, is_input=True)
            self.layers = [input_layer] + self.layers

        else:
            #Come up with a better default
            self.layers = [Layer(no_nodes_layer,1)]

        self.W = np.zeros(0)
        #turn all the weight matrices into one long weight vector so one can find the l_p norm of it.
        self.W = np.concatenate([self.W] +[layer.weights.flatten() for layer in self.layers[1:]])
        #print(np.linalg.norm(self.W, 2))
    
    def sigmoid(self, x):
        return 1/(1+np.exp(-x))
    
    def log_loss(self,X,Y):
        y_hats = [self.fwdpropagate(x) for x in X]
        J_w_b = (-1)*sum([y*np.log(y_hat)+(1-y)*np.log(1-y_hat) for y,y_hat in zip(Y,y_hats)])
        J_w_b += self.lamb*np.linalg.norm(self.W, 2)
        return J_w_b
        
        
    def fwdpropagate(self, _input):
        
        if len(_input) != self.layers[0].no_nodes:
            print(f"Input must be of length {self.layers[0].no_nodes}, it is of length {len(_input)}")
        
        self.a_s = []   
        a = _input

        
        for layer in self.layers:
            a = layer.fwdprop_output(a)
            self.a_s.append(a)
        
        #self.layer_outputs = []
        y_hat = a
        #self.layer_outputs.append(y_hat)
        
        self.prediction = int(np.round(y_hat))

        
    def d_dout_sig(self, x):
        return self.sigmoid(x)*(1.-self.sigmoid(x))
    
    
    def d_dout_loss(self,X):
        pass
    
    def train(self, x_train, y_train):
        raise NotImplementedError

    def predict(self, x_test):
        self.fwdpropagate(x_test)
        return self.prediction

    def stochastic_gradient_descent(self, x_train, y_train, alpha = 0.001, n_iter = 10):
        start = time.time()
        x_train = np.array(x_train)
        y_train = np.array(y_train)
        comb = np.c_[x_train.reshape(len(x_train), -1), y_train.reshape(len(y_train), -1)]
        
        for _ in range(n_iter):
            x_train_c = comb[:, :x_train.size//len(x_train)].reshape(x_train.shape)
            y_train_c = comb[:, x_train.size//len(x_train):].reshape(y_train.shape)
            np.random.shuffle(comb)
            score = self.score(x_train,y_train)
            print(f"Training accuracy: {score}%")
            
            for x,y in tqdm(zip(x_train_c, y_train_c)):
                self.backward_propagate_error(x,y)
                for i,layer in enumerate(self.layers):
                    if layer.is_input:
                        continue
                    else:
                        layer.weights -= (alpha*np.array(self.delta_w[i]))
                        layer.bias -= alpha*np.array(self.delta_b[i])
        end = time.time()
        print(f"Ran {n_iter} epochs in {end-start}")
    
    def score(self, x_test, y_test):
        x_test = np.array(x_test)
        y_test = np.array(y_test)
        correct = 0
        total = len(x_test)
        for x,y in zip(x_test, y_test):
            self.fwdpropagate(x)
            if self.predict(x) ==  y:
                correct += 1
        return correct/total
    
    
    
    def backward_propagate_error(self, x, y):
        ''' tick  '''
        self.fwdpropagate(x)
        delta = [None]*self.no_layers  
        delta_w = [None]*self.no_layers 
        delta_b = [None]*self.no_layers
   
        for i in range(self.no_layers - 1, -1, -1):
            if i == self.no_layers - 1:
                if y == 1:
                    q = (y/self.layers[i].a)
                else:
                    q = -1*((1-y)/(1-self.layers[i].a))
                delta[i] = -1*q*self.d_dout_sig(self.layers[i].z)
            else:
                delta[i] = ((self.layers[i+1].weights)@(delta[i+1]))*self.d_dout_sig(self.layers[i].z).reshape(-1,1)
                delta_w[i+1] = ((delta[i+1])@(self.layers[i].a.reshape(1,-1))).T + 2*self.lamb*self.layers[i+1].weights
                delta_b[i+1] = delta[i+1]

        self.delta_w = delta_w
        self.delta_b = delta_b
        self.delta = delta

## Shallow Neural Network on blobs250 and moons400
To create the shallow neural network we must create a list with the number of notes in the input layer, number in the hidden layer, the output layer will always be 1. It is neccessary to create a list of the same length with the activation function in the respective layers.

In [189]:
data = pd.read_csv(path+"blobs250.csv")

x_train, x_test, y_train, y_test = train_test_split(data.drop(['Class'], axis = 1), data['Class']) 

acts = [SigmoidActivation, SigmoidActivation, SigmoidActivation]

net = Network([3, 10, 1],activation_function=acts, lamb = 0.001)

In [199]:
net = Network([3, 10, 1],activation_function=acts, lamb = 0.001)
net.stochastic_gradient_descent(x_train, y_train, alpha=0.001, n_iter=10)

187it [00:00, 7870.58it/s]
187it [00:00, 9632.37it/s]
187it [00:00, 9088.47it/s]
187it [00:00, 9445.72it/s]
187it [00:00, 9436.06it/s]
187it [00:00, 9201.70it/s]
0it [00:00, ?it/s]

Initial training accuracy: 0.5348%
Running epoch 1 of 10
Training accuracy: 0.5348%
Running epoch 2 of 10
Training accuracy: 0.5936%
Running epoch 3 of 10
Training accuracy: 0.7005%
Running epoch 4 of 10
Training accuracy: 0.8075%
Running epoch 5 of 10
Training accuracy: 0.9519%
Running epoch 6 of 10
Training accuracy: 1.0000%
Running epoch 7 of 10


187it [00:00, 7019.54it/s]
187it [00:00, 8150.12it/s]
187it [00:00, 8884.63it/s]
187it [00:00, 8545.72it/s]

Training accuracy: 1.0000%
Running epoch 8 of 10
Training accuracy: 1.0000%
Running epoch 9 of 10
Training accuracy: 1.0000%
Running epoch 10 of 10
Training accuracy: 1.0000%
Ran 10 epochs in 0.35856175422668457





In [200]:
net.score(x_test,y_test)

1.0

In [179]:
data = pd.read_csv(path+"moons400.csv")

x_train, x_test, y_train, y_test = train_test_split(data.drop(['Class'], axis = 1), data['Class']) 

acts = [SigmoidActivation, SigmoidActivation, SigmoidActivation]

net = Network([2, 100, 1],activation_function=acts, lamb = 0.001)

net.stochastic_gradient_descent(x_train, y_train, alpha=0.0001, n_iter=20)

net.score(x_test, y_test)

300it [00:00, 6971.72it/s]
300it [00:00, 7498.00it/s]
300it [00:00, 9256.57it/s]
0it [00:00, ?it/s]

Initial training accuracy: 0.6333%
Running epoch 1 of 20
Training accuracy: 0.6500%
Running epoch 2 of 20
Training accuracy: 0.7033%
Running epoch 3 of 20
Training accuracy: 0.7233%
Running epoch 4 of 20


300it [00:00, 7457.91it/s]
300it [00:00, 8523.22it/s]
300it [00:00, 8476.94it/s]
300it [00:00, 9239.71it/s]
300it [00:00, 9215.62it/s]


Training accuracy: 0.7467%
Running epoch 5 of 20
Training accuracy: 0.7733%
Running epoch 6 of 20
Training accuracy: 0.7867%
Running epoch 7 of 20
Training accuracy: 0.7833%
Running epoch 8 of 20
Training accuracy: 0.7833%
Running epoch 9 of 20


300it [00:00, 7981.75it/s]
300it [00:00, 9068.83it/s]
300it [00:00, 8322.91it/s]
300it [00:00, 8367.63it/s]
300it [00:00, 9192.59it/s]


Training accuracy: 0.8033%
Running epoch 10 of 20
Training accuracy: 0.8067%
Running epoch 11 of 20
Training accuracy: 0.8100%
Running epoch 12 of 20
Training accuracy: 0.8200%
Running epoch 13 of 20


300it [00:00, 9195.01it/s]
300it [00:00, 9167.74it/s]
300it [00:00, 9205.64it/s]
300it [00:00, 7235.34it/s]


Training accuracy: 0.8367%
Running epoch 14 of 20
Training accuracy: 0.8333%
Running epoch 15 of 20
Training accuracy: 0.8400%
Running epoch 16 of 20
Training accuracy: 0.8400%
Running epoch 17 of 20
Training accuracy: 0.8533%
Running epoch 18 of 20


300it [00:00, 9179.52it/s]
300it [00:00, 8236.40it/s]
300it [00:00, 8469.92it/s]


Training accuracy: 0.8467%
Running epoch 19 of 20
Training accuracy: 0.8433%
Running epoch 20 of 20
Training accuracy: 0.8433%
Ran 20 epochs in 1.075195550918579


0.91

## Results
There is naturally no improvement in the easier dataset as the logistic regression had 100% test accracy, as did the shallow net. The shallow net improved on the harder of the two data sets by about 6%.

|       | Logistic Regression | Shallow Neural Network |
|-------|---------------------|------------------------|
| Blobs | 100%                | 100%                   |
| Moons | 86%                 | 91%                    |

In [165]:
net.stochastic_gradient_descent(x_train, y_train, alpha=0.001, n_iter=100)

300it [00:00, 9102.09it/s]
300it [00:00, 8910.78it/s]
300it [00:00, 9568.46it/s]
300it [00:00, 7257.33it/s]

Initial training accuracy: 0.5033%
Running epoch 1 of 100
Training accuracy: 0.5033%
Running epoch 2 of 100
Training accuracy: 0.5033%
Running epoch 3 of 100
Training accuracy: 0.5033%
Running epoch 4 of 100
Training accuracy: 0.5033%
Running epoch 5 of 100



300it [00:00, 7700.73it/s]
300it [00:00, 8264.10it/s]
300it [00:00, 9478.58it/s]
300it [00:00, 9404.20it/s]
300it [00:00, 9242.76it/s]
0it [00:00, ?it/s]

Training accuracy: 0.5033%
Running epoch 6 of 100
Training accuracy: 0.5033%
Running epoch 7 of 100
Training accuracy: 0.5033%
Running epoch 8 of 100
Training accuracy: 0.5033%
Running epoch 9 of 100
Training accuracy: 0.5033%
Running epoch 10 of 100


300it [00:00, 9368.63it/s]
300it [00:00, 7228.98it/s]
300it [00:00, 8295.31it/s]
300it [00:00, 7924.40it/s]
0it [00:00, ?it/s]

Training accuracy: 0.5033%
Running epoch 11 of 100
Training accuracy: 0.5033%
Running epoch 12 of 100
Training accuracy: 0.5033%
Running epoch 13 of 100
Training accuracy: 0.5033%
Running epoch 14 of 100


300it [00:00, 9235.50it/s]
300it [00:00, 9638.61it/s]
300it [00:00, 7987.98it/s]
300it [00:00, 9452.95it/s]
300it [00:00, 9504.57it/s]
0it [00:00, ?it/s]

Training accuracy: 0.5033%
Running epoch 15 of 100
Training accuracy: 0.5033%
Running epoch 16 of 100
Training accuracy: 0.5033%
Running epoch 17 of 100
Training accuracy: 0.5033%
Running epoch 18 of 100
Training accuracy: 0.5033%
Running epoch 19 of 100


300it [00:00, 9393.25it/s]
300it [00:00, 9271.09it/s]
300it [00:00, 9432.11it/s]
300it [00:00, 8834.83it/s]
300it [00:00, 9045.75it/s]


Training accuracy: 0.5033%
Running epoch 20 of 100
Training accuracy: 0.5033%
Running epoch 21 of 100
Training accuracy: 0.5033%
Running epoch 22 of 100
Training accuracy: 0.5033%
Running epoch 23 of 100
Training accuracy: 0.5033%

300it [00:00, 7926.99it/s]
300it [00:00, 8300.40it/s]
300it [00:00, 9110.06it/s]
300it [00:00, 8938.82it/s]



Running epoch 24 of 100
Training accuracy: 0.5033%
Running epoch 25 of 100
Training accuracy: 0.5033%
Running epoch 26 of 100
Training accuracy: 0.5033%
Running epoch 27 of 100
Training accuracy: 0.5033%
Running epoch 28 of 100


300it [00:00, 9117.59it/s]
300it [00:00, 9131.15it/s]
300it [00:00, 9526.23it/s]
300it [00:00, 9627.47it/s]
300it [00:00, 9384.14it/s]
0it [00:00, ?it/s]

Training accuracy: 0.5033%
Running epoch 29 of 100
Training accuracy: 0.5033%
Running epoch 30 of 100
Training accuracy: 0.5033%
Running epoch 31 of 100
Training accuracy: 0.5033%
Running epoch 32 of 100
Training accuracy: 0.5033%
Running epoch 33 of 100


300it [00:00, 9445.14it/s]
300it [00:00, 9322.47it/s]
300it [00:00, 9579.24it/s]
300it [00:00, 9470.95it/s]
300it [00:00, 9486.44it/s]
0it [00:00, ?it/s]

Training accuracy: 0.5033%
Running epoch 34 of 100
Training accuracy: 0.5033%
Running epoch 35 of 100
Training accuracy: 0.5033%
Running epoch 36 of 100
Training accuracy: 0.5033%
Running epoch 37 of 100
Training accuracy: 0.5033%
Running epoch 38 of 100


300it [00:00, 9094.66it/s]
300it [00:00, 8185.60it/s]
300it [00:00, 9170.88it/s]
300it [00:00, 9056.95it/s]
300it [00:00, 8597.23it/s]

Training accuracy: 0.5033%
Running epoch 39 of 100
Training accuracy: 0.5033%
Running epoch 40 of 100
Training accuracy: 0.5033%
Running epoch 41 of 100
Training accuracy: 0.5033%
Running epoch 42 of 100
Training accuracy: 0.5033%



300it [00:00, 9179.18it/s]
300it [00:00, 9358.73it/s]
300it [00:00, 8932.22it/s]
300it [00:00, 9037.18it/s]
0it [00:00, ?it/s]

Running epoch 43 of 100
Training accuracy: 0.5033%
Running epoch 44 of 100
Training accuracy: 0.5033%
Running epoch 45 of 100
Training accuracy: 0.5033%
Running epoch 46 of 100
Training accuracy: 0.5033%
Running epoch 47 of 100


300it [00:00, 9311.09it/s]
300it [00:00, 8967.36it/s]
300it [00:00, 8383.58it/s]
300it [00:00, 9042.31it/s]
300it [00:00, 9166.74it/s]


Training accuracy: 0.5033%
Running epoch 48 of 100
Training accuracy: 0.5033%
Running epoch 49 of 100
Training accuracy: 0.5033%
Running epoch 50 of 100
Training accuracy: 0.5033%
Running epoch 51 of 100
Training accuracy: 0.5033%
Running epoch 52 of 100


300it [00:00, 9419.83it/s]
300it [00:00, 8745.36it/s]
300it [00:00, 9129.96it/s]
300it [00:00, 8676.55it/s]
300it [00:00, 9480.58it/s]


Training accuracy: 0.5033%
Running epoch 53 of 100
Training accuracy: 0.5033%
Running epoch 54 of 100
Training accuracy: 0.5033%
Running epoch 55 of 100
Training accuracy: 0.5033%
Running epoch 56 of 100
Training accuracy: 0.5033%
Running epoch 57 of 100


300it [00:00, 9591.73it/s]
300it [00:00, 9642.15it/s]
300it [00:00, 9632.93it/s]
300it [00:00, 7794.51it/s]
300it [00:00, 9163.00it/s]


Training accuracy: 0.5033%
Running epoch 58 of 100
Training accuracy: 0.5033%
Running epoch 59 of 100
Training accuracy: 0.5033%
Running epoch 60 of 100
Training accuracy: 0.5033%
Running epoch 61 of 100
Training accuracy: 0.5033%
Running epoch 62 of 100


300it [00:00, 9204.63it/s]
300it [00:00, 9189.97it/s]
300it [00:00, 9452.59it/s]
300it [00:00, 9388.13it/s]
300it [00:00, 9373.37it/s]
0it [00:00, ?it/s]

Training accuracy: 0.5033%
Running epoch 63 of 100
Training accuracy: 0.5033%
Running epoch 64 of 100
Training accuracy: 0.5033%
Running epoch 65 of 100
Training accuracy: 0.5033%
Running epoch 66 of 100
Training accuracy: 0.5033%
Running epoch 67 of 100


300it [00:00, 8332.67it/s]
300it [00:00, 7812.71it/s]
300it [00:00, 9236.59it/s]
300it [00:00, 8973.37it/s]
300it [00:00, 9253.98it/s]


Training accuracy: 0.5033%
Running epoch 68 of 100
Training accuracy: 0.5033%
Running epoch 69 of 100
Training accuracy: 0.5033%
Running epoch 70 of 100
Training accuracy: 0.5033%
Running epoch 71 of 100
Training accuracy: 0.5033%
Running epoch 72 of 100


300it [00:00, 7490.72it/s]
300it [00:00, 8267.24it/s]
300it [00:00, 8985.55it/s]
300it [00:00, 7904.19it/s]
300it [00:00, 9160.94it/s]

Training accuracy: 0.5033%
Running epoch 73 of 100
Training accuracy: 0.5033%
Running epoch 74 of 100
Training accuracy: 0.5033%
Running epoch 75 of 100
Training accuracy: 0.5033%
Running epoch 76 of 100
Training accuracy: 0.5033%



300it [00:00, 8557.82it/s]
300it [00:00, 9182.80it/s]
300it [00:00, 8215.75it/s]
300it [00:00, 9513.91it/s]


Running epoch 77 of 100
Training accuracy: 0.5033%
Running epoch 78 of 100
Training accuracy: 0.5033%
Running epoch 79 of 100
Training accuracy: 0.5033%
Running epoch 80 of 100
Training accuracy: 0.5033%
Running epoch 81 of 100


300it [00:00, 9510.75it/s]
300it [00:00, 9529.62it/s]
300it [00:00, 9231.78it/s]
300it [00:00, 8681.40it/s]
300it [00:00, 9288.27it/s]
0it [00:00, ?it/s]

Training accuracy: 0.5033%
Running epoch 82 of 100
Training accuracy: 0.5033%
Running epoch 83 of 100
Training accuracy: 0.5033%
Running epoch 84 of 100
Training accuracy: 0.5033%
Running epoch 85 of 100
Training accuracy: 0.5033%
Running epoch 86 of 100


300it [00:00, 9431.41it/s]
300it [00:00, 8849.49it/s]
300it [00:00, 9569.55it/s]
300it [00:00, 8300.46it/s]
300it [00:00, 9600.29it/s]
0it [00:00, ?it/s]

Training accuracy: 0.5033%
Running epoch 87 of 100
Training accuracy: 0.5033%
Running epoch 88 of 100
Training accuracy: 0.5033%
Running epoch 89 of 100
Training accuracy: 0.5033%
Running epoch 90 of 100
Training accuracy: 0.5033%
Running epoch 91 of 100


300it [00:00, 9491.74it/s]
300it [00:00, 8013.06it/s]
300it [00:00, 9608.21it/s]
300it [00:00, 9648.14it/s]
300it [00:00, 8573.33it/s]


Training accuracy: 0.5033%
Running epoch 92 of 100
Training accuracy: 0.5033%
Running epoch 93 of 100
Training accuracy: 0.5033%
Running epoch 94 of 100
Training accuracy: 0.5033%
Running epoch 95 of 100


300it [00:00, 8608.94it/s]
300it [00:00, 9529.40it/s]
300it [00:00, 9463.26it/s]
300it [00:00, 8765.71it/s]

Training accuracy: 0.5033%
Running epoch 96 of 100
Training accuracy: 0.5033%
Running epoch 97 of 100
Training accuracy: 0.5033%
Running epoch 98 of 100
Training accuracy: 0.5033%
Running epoch 99 of 100
Training accuracy: 0.5033%
Running epoch 100 of 100



300it [00:00, 7991.79it/s]

Training accuracy: 0.5033%
Ran 100 epochs in 5.048585891723633





In [None]:
nn.score(x_test, y_test)

In [180]:
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        d = pickle.load(fo, encoding='bytes')
    return d

path_to_images = "/home/oisin/MAI_work/ongoing_assignments/DeepLearning/cifar-10-batches-py/"
datas = [unpickle(path_to_images+"data_batch_"+str(i)) for i in range(1,6)]
x = [d[b'data'] for d in datas]
y = [d[b'labels'] for d in datas]
x_ = np.concatenate(x)
y_ = np.concatenate(y)
data =  [[imput_x, imput_y - 2] for imput_x, imput_y in zip(x_,y_) if imput_y == 2 or imput_y == 3]

_all_data = np.array(data)
x_train, x_test, y_train, y_test = train_test_split(_all_data[:,0],_all_data[:,1])
inp_size = len(x_train[0])

  _all_data = np.array(data)


In [187]:
'''
WRITE THE CODE TO CONVERT THESE TO GREYSCALE.

FORMULA IS L = R * 299/1000 + G * 587/1000 + B * 114/1000
where R,G,B are, obviously the RGB colour values.

In the data we have a vector of length 3*1024. Split this into a matrix of size 3x1024 
then convert using the formula

'''

'\nWRITE THE CODE TO CONVERT THESE TO GREYSCALE.\n\nFORMULA IS L = R * 299/1000 + G * 587/1000 + B * 114/1000\nwhere R,G,B are, obviously the RGB colour values.\n\nIn the data we have a vector of length 3*1024. Split this into a matrix of size 3x1024 \nthen convert using the formula\n\n'

In [186]:
datas[0]

{b'batch_label': b'training batch 1 of 5',
 b'labels': [6,
  9,
  9,
  4,
  1,
  1,
  2,
  7,
  8,
  3,
  4,
  7,
  7,
  2,
  9,
  9,
  9,
  3,
  2,
  6,
  4,
  3,
  6,
  6,
  2,
  6,
  3,
  5,
  4,
  0,
  0,
  9,
  1,
  3,
  4,
  0,
  3,
  7,
  3,
  3,
  5,
  2,
  2,
  7,
  1,
  1,
  1,
  2,
  2,
  0,
  9,
  5,
  7,
  9,
  2,
  2,
  5,
  2,
  4,
  3,
  1,
  1,
  8,
  2,
  1,
  1,
  4,
  9,
  7,
  8,
  5,
  9,
  6,
  7,
  3,
  1,
  9,
  0,
  3,
  1,
  3,
  5,
  4,
  5,
  7,
  7,
  4,
  7,
  9,
  4,
  2,
  3,
  8,
  0,
  1,
  6,
  1,
  1,
  4,
  1,
  8,
  3,
  9,
  6,
  6,
  1,
  8,
  5,
  2,
  9,
  9,
  8,
  1,
  7,
  7,
  0,
  0,
  6,
  9,
  1,
  2,
  2,
  9,
  2,
  6,
  6,
  1,
  9,
  5,
  0,
  4,
  7,
  6,
  7,
  1,
  8,
  1,
  1,
  2,
  8,
  1,
  3,
  3,
  6,
  2,
  4,
  9,
  9,
  5,
  4,
  3,
  6,
  7,
  4,
  6,
  8,
  5,
  5,
  4,
  3,
  1,
  8,
  4,
  7,
  6,
  0,
  9,
  5,
  1,
  3,
  8,
  2,
  7,
  5,
  3,
  4,
  1,
  5,
  7,
  0,
  4,
  7,
  5,
  5,
  1,
  0,
  9,
  6,
  9,
 

In [10]:
# acts = [ReluActivation, ReluActivation, ReluActivation, ReluActivation, SigmoidActivation]
acts = lambda x:1/(1+np.exp(-x))
nn = Network([inp_size,inp_size//2, inp_size//4, 1],activation_function=acts, lamb = 0.001)

In [11]:
nn.no_params

5899008

In [14]:
nn.stochastic_gradient_descent(np.array(x_train), np.array(y_train), alpha=0.001, n_iter=5)

  acts = lambda x:1/(1+np.exp(-x))
  return 1/(1+np.exp(-x))
1it [00:00,  7.53it/s]

Training accuracy: 0.602%


7500it [13:47,  9.07it/s]
1it [00:00,  9.53it/s]

Training accuracy: 0.6492%


35it [00:03,  9.03it/s]


KeyboardInterrupt: 

In [15]:
nn.score(x_test, y_test)

  acts = lambda x:1/(1+np.exp(-x))


0.6204

In [None]:
nn.delta[-1]

In [568]:
nn.score(x_test,y_test)

0.94

In [162]:
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        d = pickle.load(fo, encoding='bytes')
    return d

In [163]:
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        d = pickle.load(fo, encoding='bytes')
    return d

path_to_images = "/home/oisin/MAI_work/ongoing_assignments/DeepLearning/cifar-10-batches-py/"
datas = [unpickle(path_to_images+"data_batch_"+str(i)) for i in range(1,6)]
x = [d[b'data'] for d in datas]
y = [d[b'labels'] for d in datas]
x_ = np.concatenate(x)
y_ = np.concatenate(y)
data =  [[imput_x, imput_y - 2] for imput_x, imput_y in zip(x_,y_) if imput_y == 2 or imput_y == 3]

_all_data = np.array(data)
x_train, x_test, y_train, y_test = train_test_split(_all_data[:,0],_all_data[:,1])

In [164]:
_all_data = np.array(data)

  _all_data = np.array(data)


In [165]:
x_train, x_test, y_train, y_test = train_test_split(_all_data[:,0],_all_data[:,1])

In [137]:
len(x_train)

7500

In [138]:
imp_shape = len(x_train[0])

In [192]:
# acts = [ReluActivation, ReluActivation, ReluActivation, SigmoidActivation]
acts = lambda x : 1/(1+np.exp(-x))
net = Network([imp_shape,100,100,1],activation_function=acts, lamb = 0.001)

In [147]:
net.stochastic_gradient_descent(x_train,y_train, n_iter=20)

Running epoch 0 of 20


  return 1/(1+np.exp(-x))
17it [00:00, 166.15it/s]

Training accuracy: 0.49666666666666665%


7500it [00:38, 195.84it/s]


Running epoch 1 of 20


17it [00:00, 163.38it/s]

Training accuracy: 0.49666666666666665%


7500it [00:42, 177.35it/s]


Running epoch 2 of 20


17it [00:00, 168.95it/s]

Training accuracy: 0.49666666666666665%


7500it [00:39, 190.56it/s]


Running epoch 3 of 20


17it [00:00, 164.73it/s]

Training accuracy: 0.49666666666666665%


4130it [00:21, 190.54it/s]


KeyboardInterrupt: 

In [148]:
net.score(x_test, y_test)

  return 1/(1+np.exp(-x))


0.5096

In [150]:
net.delta[-1]


array([[0.]])