<a href="https://colab.research.google.com/github/emmelinetsen/deep_learning/blob/master/assignment_2/autodiff.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Autodiff Python Library for an MNIST Classifier

1. Write an autodiff Python library (as discussed in class) and, based on it, build an MNIST classifier in Colab (similar to the last exercise, but using the autodiff library that you write instead of direct NumPy).


In [0]:
import numpy as np

class Tensor(object):
    """NumPy array wrapper that records operations for reverse-mode autodiff.

    Every operation on a tensor with ``autograd=True`` returns a new tensor
    that remembers its inputs (``parent``) and the name of the operation that
    produced it (``operation``).  Calling :meth:`back` on a result walks that
    graph backwards and accumulates gradients in ``.grad``.

    Attributes:
        data: the wrapped ``numpy.ndarray``.
        autograd: whether gradients are tracked for this tensor.
        grad: accumulated gradient (a non-autograd ``Tensor``) or ``None``.
        id: integer key under which this tensor is registered in its parents.
        parent: list of input tensors, or ``None`` for a leaf.
        operation: name of the producing operation, or ``None`` for a leaf.
        children: maps child id -> number of gradients still expected from it.
    """

    def __init__(self, data,
                 autograd=False,
                 parent=None,
                 operation=None,
                 id=None):
        """Wrap ``data`` and register this tensor as a child of each parent.

        Args:
            data: anything accepted by ``numpy.array``.
            autograd: track gradients for this tensor.
            parent: list of tensors this one was computed from.
            operation: name of the operation that produced this tensor.
            id: explicit identifier; a random one is drawn when omitted.
        """
        self.data = np.array(data)
        self.autograd = autograd
        self.grad = None

        # NOTE: ids come from the global NumPy RNG with only 10000 possible
        # values, so two tensors can collide.  Kept unchanged because the
        # draw order affects results that follow a fixed np.random.seed().
        if id is None:
            self.id = np.random.randint(0, 10000)
        else:
            self.id = id

        self.parent = parent
        self.operation = operation
        self.children = {}

        if parent is not None:
            # Each parent expects one gradient per use of it by this tensor.
            for c in parent:
                c.children[self.id] = c.children.get(self.id, 0) + 1

    def accounted(self):
        """Return True when no registered child still owes a gradient."""
        return all(cnt == 0 for cnt in self.children.values())

    def back(self, grad=None, grad_origin=None):
        """Accumulate ``grad`` and propagate it to the parents.

        Args:
            grad: incoming gradient as a non-autograd ``Tensor``; defaults to
                ones with the shape of ``self.data``.
            grad_origin: the child tensor the gradient comes from.  When
                given, that child's pending count is decremented and
                propagation waits until all children are accounted for.  When
                omitted, propagation continues unconditionally.

        Raises:
            Exception: if ``grad_origin`` already delivered all its gradients.
            KeyError: if ``grad_origin`` is not a registered child.
            AssertionError: if ``grad`` itself tracks gradients.
        """
        if not self.autograd:
            return

        if grad is None:
            grad = Tensor(np.ones_like(self.data))

        if grad_origin is not None:
            if self.children[grad_origin.id] == 0:
                raise Exception("can't backprop more than once")
            self.children[grad_origin.id] -= 1

        if self.grad is None:
            self.grad = grad
        else:
            # Tensor defines no __iadd__, so this rebinds to a new Tensor.
            self.grad += grad

        # grads must not have their own grads
        assert not grad.autograd

        # Continue only if there is something to propagate to, and either all
        # children have reported or back() was called on this tensor directly.
        if self.parent is None:
            return
        if not (self.accounted() or grad_origin is None):
            return

        op = self.operation

        if op == "add":
            self.parent[0].back(self.grad, self)
            self.parent[1].back(self.grad, self)

        elif op == "sub":
            self.parent[0].back(Tensor(self.grad.data), self)
            self.parent[1].back(Tensor(self.grad.__neg__().data), self)

        elif op == "mul":
            new = self.grad * self.parent[1]
            self.parent[0].back(new, self)
            new = self.grad * self.parent[0]
            self.parent[1].back(new, self)

        elif op == "mm":
            c0 = self.parent[0]
            c1 = self.parent[1]
            # c1.transpose() registers a temporary child on c1 when c1 tracks
            # gradients, so the children bookkeeping is not used for this hop:
            # both inputs are propagated to without a grad_origin.
            new = self.grad.mm(c1.transpose())
            c0.back(new)
            new = self.grad.transpose().mm(c0).transpose()
            c1.back(new)

        elif op == "transpose":
            self.parent[0].back(self.grad.transpose())

        elif op is not None and op.startswith("sum_"):
            # The summed axis is encoded in the operation name.
            dim = int(op.split("_")[1])
            self.parent[0].back(self.grad.expand(dim,
                                                 self.parent[0].data.shape[dim]))

        elif op is not None and op.startswith("expand_"):
            dim = int(op.split("_")[1])
            self.parent[0].back(self.grad.sum(dim))

        elif op == "neg":
            self.parent[0].back(self.grad.__neg__())

        elif op == "sigmoid":
            # d/dx sigmoid(x) = y * (1 - y), with y the stored output.
            ones = Tensor(np.ones_like(self.grad.data))
            self.parent[0].back(self.grad * (self * (ones - self)))

        elif op == "relu":
            # d/dx relu(x) is 1 where x > 0 and 0 elsewhere; the stored output
            # is positive exactly where the input was positive.
            self.parent[0].back(self.grad * Tensor(self.data > 0))

        elif op == "tanh":
            # d/dx tanh(x) = 1 - y**2, with y the stored output.
            ones = Tensor(np.ones_like(self.grad.data))
            self.parent[0].back(self.grad * (ones - (self * self)))

        elif op == "softmax":
            # Jacobian-vector product of softmax along the last axis:
            # dx = y * (g - sum(g * y)).
            weighted = (self.grad.data * self.data).sum(axis=-1, keepdims=True)
            self.parent[0].back(Tensor(self.data * (self.grad.data - weighted)))

        elif op == "cross_entropy":
            # NOTE: the incoming gradient and the 1/N factor of the mean in
            # the forward pass are not applied here; kept as in the original
            # so existing learning rates behave the same.
            dx = self.softmax_output - self.target_dist
            self.parent[0].back(Tensor(dx))

        elif op == "index_select":
            new_grad = np.zeros_like(self.parent[0].data)
            indices_ = self.index_select_indices.data.flatten()
            # Use the accumulated gradient, not just the latest incoming one.
            grad_ = self.grad.data.reshape(len(indices_), -1)
            # Explicit loop so repeated indices accumulate.
            for idx, row in zip(indices_, grad_):
                new_grad[idx] += row
            self.parent[0].back(Tensor(new_grad))

    def __add__(self, other):
        """Element-wise sum; tracked only when both operands track gradients."""
        if(self.autograd and other.autograd):
            return Tensor(self.data + other.data,
                          autograd=True,
                          parent=[self, other],
                          operation="add")
        return Tensor(self.data + other.data)

    def __neg__(self):
        """Element-wise negation."""
        if(self.autograd):
            return Tensor(self.data * -1,
                          autograd=True,
                          parent=[self],
                          operation="neg")
        return Tensor(self.data * -1)

    def __sub__(self, other):
        """Element-wise difference; tracked only when both operands track gradients."""
        if(self.autograd and other.autograd):
            return Tensor(self.data - other.data,
                          autograd=True,
                          parent=[self, other],
                          operation="sub")
        return Tensor(self.data - other.data)

    def __mul__(self, other):
        """Element-wise product; tracked only when both operands track gradients."""
        if(self.autograd and other.autograd):
            return Tensor(self.data * other.data,
                          autograd=True,
                          parent=[self, other],
                          operation="mul")
        return Tensor(self.data * other.data)

    def sum(self, dim):
        """Sum ``data`` over axis ``dim``."""
        if(self.autograd):
            return Tensor(self.data.sum(dim),
                          autograd=True,
                          parent=[self],
                          operation="sum_"+str(dim))
        return Tensor(self.data.sum(dim))

    def expand(self, dim, copies):
        """Insert a new axis of length ``copies`` at position ``dim``.

        Every slice along the new axis is a copy of the original data.
        """
        # Repeat each element `copies` times into a new trailing axis, then
        # move that axis to position `dim`.
        trans_cmd = list(range(0, len(self.data.shape)))
        trans_cmd.insert(dim, len(self.data.shape))
        new_data = self.data.repeat(copies).reshape(list(self.data.shape) + [copies]).transpose(trans_cmd)

        if(self.autograd):
            return Tensor(new_data,
                          autograd=True,
                          parent=[self],
                          operation="expand_"+str(dim))
        return Tensor(new_data)

    def transpose(self):
        """Return the transposed tensor (all axes reversed)."""
        if(self.autograd):
            return Tensor(self.data.transpose(),
                          autograd=True,
                          parent=[self],
                          operation="transpose")

        return Tensor(self.data.transpose())

    def mm(self, x):
        """Matrix product ``self.data.dot(x.data)``; tracked when ``self`` tracks gradients."""
        if(self.autograd):
            return Tensor(self.data.dot(x.data),
                          autograd=True,
                          parent=[self, x],
                          operation="mm")
        return Tensor(self.data.dot(x.data))

    def sigmoid(self):
        """Element-wise logistic sigmoid."""
        if(self.autograd):
            return Tensor(1 / (1 + np.exp(-self.data)),
                          autograd=True,
                          parent=[self],
                          operation="sigmoid")
        return Tensor(1 / (1 + np.exp(-self.data)))

    def tanh(self):
        """Element-wise hyperbolic tangent."""
        if(self.autograd):
            return Tensor(np.tanh(self.data),
                          autograd=True,
                          parent=[self],
                          operation="tanh")
        return Tensor(np.tanh(self.data))

    def relu(self):
        """Element-wise rectified linear unit: positive values are kept, the rest become 0."""
        rectified = (self.data > 0) * self.data
        if(self.autograd):
            return Tensor(rectified,
                          autograd=True,
                          parent=[self],
                          operation="relu")
        # The non-tracking branch must compute the same function.
        return Tensor(rectified)

    def softmax(self):
        """Softmax over the last axis, computed with the max subtracted for stability."""
        shifted = self.data - np.max(self.data, axis=-1, keepdims=True)
        exp_shifted = np.exp(shifted)
        probs = exp_shifted / np.sum(exp_shifted, axis=-1, keepdims=True)
        if(self.autograd):
            return Tensor(probs,
                          autograd=True,
                          parent=[self],
                          operation="softmax")
        return Tensor(probs)

    def index_select(self, indices):
        """Select entries of ``data`` along the first axis using ``indices.data``."""
        if(self.autograd):
            new = Tensor(self.data[indices.data],
                         autograd=True,
                         parent=[self],
                         operation="index_select")
            # Remembered so the backward pass can scatter gradients back.
            new.index_select_indices = indices
            return new
        return Tensor(self.data[indices.data])

    def cross_entropy(self, target_indices):
        """Mean softmax cross-entropy between these logits and class indices.

        Args:
            target_indices: tensor whose flattened ``data`` holds one integer
                class index per row of logits.

        Returns:
            A scalar ``Tensor`` holding the mean negative log-likelihood.
        """
        last_axis = len(self.data.shape) - 1

        # Subtracting the row maximum leaves softmax unchanged but keeps
        # np.exp from overflowing on large logits.
        shifted = self.data - np.max(self.data, axis=last_axis, keepdims=True)
        exp_shifted = np.exp(shifted)
        normaliser = np.sum(exp_shifted, axis=last_axis, keepdims=True)
        softmax_output = exp_shifted / normaliser
        # Log-probabilities computed directly, avoiding log(0) on underflow.
        log_softmax = shifted - np.log(normaliser)

        t = target_indices.data.flatten()
        log_p = log_softmax.reshape(len(t), -1)
        # One-hot rows picked from an identity matrix.
        target_dist = np.eye(log_p.shape[1])[t]
        loss = -(log_p * target_dist).sum(1).mean()

        if(self.autograd):
            out = Tensor(loss,
                         autograd=True,
                         parent=[self],
                         operation="cross_entropy")
            # Cached for the backward pass.
            out.softmax_output = softmax_output
            out.target_dist = target_dist
            return out

        return Tensor(loss)

    def __repr__(self):
        return str(self.data.__repr__())

    def __str__(self):
        return str(self.data.__str__())

# Adding Layers
class Layer(object):
    """Base class for network components that may own trainable parameters."""

    def __init__(self):
        # Subclasses append their trainable tensors to this list.
        self.parameters = []

    def get_parameters(self):
        """Return the list of parameters owned by this layer (not a copy)."""
        return self.parameters


class Tanh(Layer):
    """Parameter-free layer that applies ``tanh`` to its input."""

    def __init__(self):
        super().__init__()

    def forward(self, input):
        """Return ``input.tanh()``."""
        activated = input.tanh()
        return activated
    
class Sigmoid(Layer):
    """Parameter-free layer that applies ``sigmoid`` to its input."""

    def __init__(self):
        super().__init__()

    def forward(self, input):
        """Return ``input.sigmoid()``."""
        activated = input.sigmoid()
        return activated

# Adding Optimization
class SGD(object):
    """Plain stochastic gradient descent over a list of parameters.

    Each parameter is expected to expose ``.data`` (a NumPy array updated in
    place) and ``.grad`` (``None`` or an object whose ``.data`` is the
    gradient array).
    """

    def __init__(self, parameters, alpha=0.1):
        """
        Args:
            parameters: iterable of parameters to update.
            alpha: learning rate applied to every gradient.
        """
        self.parameters = parameters
        self.alpha = alpha

    def zero(self):
        """Reset every existing gradient to zero in place."""
        for p in self.parameters:
            # A parameter that backprop never reached has no gradient yet.
            if p.grad is not None:
                p.grad.data *= 0

    def step(self, zero=True):
        """Apply one update ``data -= alpha * grad`` to every parameter.

        Args:
            zero: when truthy, clear each gradient right after using it.
        """
        for p in self.parameters:
            # Skip parameters without a gradient instead of raising
            # AttributeError on ``None.data``.
            if p.grad is None:
                continue

            p.data -= p.grad.data * self.alpha

            if zero:
                p.grad.data *= 0

# Linear Layer
class Linear(Layer):
    """Fully connected layer computing ``input.mm(weight) + bias``."""

    def __init__(self, n_inputs, n_outputs):
        """Create a ``(n_inputs, n_outputs)`` weight matrix and a zero bias.

        Weights are standard-normal samples scaled by ``sqrt(2 / n_inputs)``.
        """
        super().__init__()
        scale = np.sqrt(2.0 / (n_inputs))
        initial_weights = np.random.randn(n_inputs, n_outputs) * scale
        self.weight = Tensor(initial_weights, autograd=True)
        self.bias = Tensor(np.zeros(n_outputs), autograd=True)

        for param in (self.weight, self.bias):
            self.parameters.append(param)

    def forward(self, input):
        """Project ``input`` through the weights and add the bias to every row."""
        projected = input.mm(self.weight)
        # Tile the bias along a new leading axis, one copy per input row.
        tiled_bias = self.bias.expand(0, len(input.data))
        return projected + tiled_bias

# Sequential Layer
class Sequential(Layer):
    """Container that feeds its input through a list of layers in order."""

    def __init__(self, layers=None):
        """
        Args:
            layers: list of layers to chain.  The list is stored by reference.
                When omitted, a fresh empty list is created for this instance
                (a shared default list would leak layers added through
                ``add`` into every other default-constructed ``Sequential``).
        """
        super().__init__()

        self.layers = [] if layers is None else layers

    def add(self, layer):
        """Append ``layer`` to the end of the chain."""
        self.layers.append(layer)

    def forward(self, input):
        """Return the output of the last layer after applying all layers in order."""
        for layer in self.layers:
            input = layer.forward(input)
        return input

    def get_parameters(self):
        """Return a new list with the parameters of every contained layer."""
        params = []
        for layer in self.layers:
            params += layer.get_parameters()
        return params
  
# MSE Loss
class MSELoss(Layer):
    """Squared-error loss summed over axis 0."""

    def __init__(self):
        super().__init__()

    def forward(self, pred, target):
        """Return ``((pred - target) * (pred - target)).sum(0)``."""
        # Two separate differences are built on purpose, matching the graph
        # that the single-expression form creates.
        left_diff = pred - target
        right_diff = pred - target
        squared = left_diff * right_diff
        return squared.sum(0)


class Relu(Layer):
    """Parameter-free layer that applies ``relu`` to its input."""

    def __init__(self):
        super().__init__()

    def forward(self, input):
        """Return ``input.relu()``."""
        activated = input.relu()
        return activated
    

class Softmax(Layer):
    """Parameter-free layer that delegates to the input's ``softmax`` method."""

    def __init__(self):
        super().__init__()

    def forward(self, input):
        """Return ``input.softmax()``.

        NOTE(review): ``input`` must provide a ``softmax()`` method; confirm
        the tensor type passed in defines one, otherwise this raises
        AttributeError.
        """
        normalised = input.softmax()
        return normalised

class CrossEntropyLoss(object):
    """Loss object that delegates to the input's ``cross_entropy`` method."""

    def __init__(self):
        super().__init__()

    def forward(self, input, target):
        """Return ``input.cross_entropy(target)``."""
        loss = input.cross_entropy(target)
        return loss

In [0]:
# Scratch check of the one-hot trick used by cross_entropy: indexing an
# identity matrix with a class index yields that class's one-hot row.
p = np.array([[0,1,2,3], [4,5,6,7]])
t = 0
# p has 4 columns, so this picks row 0 of a 4x4 identity matrix.
np.eye(p.shape[1])[t]

array([1., 0., 0., 0.])

In [0]:
# Number of columns in p, i.e. the size of the identity matrix built in the previous cell.
p.shape[1]

4

In [0]:
import numpy as np
from tensorflow.keras.datasets import mnist

# Fix the NumPy RNG so the run is reproducible.
np.random.seed(1)

# Load MNIST as (train images, train labels), (test images, test labels).
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Keep the first 1000 training samples, flatten each image to a vector of
# 28*28 values and divide by 255 (presumably to bring pixel values into [0, 1]).
img, labels = (x_train[0:1000].reshape(1000,28*28) / 255), y_train[0:1000]

# One row of 10 zeros per training label.
one_hot_labels = np.zeros((len(labels), 10))


# One-hot encode: set a 1 in the column given by each label,
# e.g. label 3 becomes [0, 0, 0, 1, 0, 0, 0, 0, 0, 0].
for i,l in enumerate(labels):
  one_hot_labels[i][l] = 1
labels = one_hot_labels

# NOTE(review): numpy is already imported as np above and sys is not used in
# this cell.
import sys, numpy as np

# Flatten and scale the whole test set the same way, then one-hot encode its labels.
test_img = x_test.reshape(len(x_test), 28*28) / 255
test_label = np.zeros((len(y_test), 10))

for i,l in enumerate(y_test):
    test_label[i][l] = 1

In [0]:
# x_train[0]


# Fix the NumPy RNG so weight initialisation is reproducible.
np.random.seed(0)

# NOTE(review): this feeds the raw x_train[0] (not the flattened, /255-scaled
# `img` prepared earlier). Presumably it is one 28x28 image, so each of its
# 28 rows is treated as a separate 28-feature sample - confirm this is intended.
data = Tensor(np.array(x_train[0]), autograd=True)
t = y_train[0]
# The same label is repeated 28 times, once per row of `data`.
target = Tensor(np.array([t]*28), autograd=True)
# t = labels[0]
# target = Tensor(t*10, autograd=True)
         
# Three linear layers (28 -> 28 -> 15 -> 10) with ReLU in between; the 10
# outputs are the class scores passed to the cross-entropy loss.
model = Sequential([Linear(28,28), Relu(), 
                    Linear(28,15), Relu(),
                    Linear(15,10)])

criterion = CrossEntropyLoss()

optim = SGD(parameters=model.get_parameters(), alpha=0.1)

# Only a single training step is run here.
for i in range(1):
    
    # Predict
    pred = model.forward(data)

    # Compare
    loss = criterion.forward(pred, target)
  
    # Learn: backpropagate from the loss, then update (and zero) the parameters.
    loss.back(Tensor(np.ones_like(loss.data)))
    optim.step()
    # print(loss)

# Prints the scores computed before the parameter update above.
print(pred)
# print(y_train[0])

[[   0.            0.            0.            0.            0.
     0.            0.            0.            0.            0.        ]
 [   0.            0.            0.            0.            0.
     0.            0.            0.            0.            0.        ]
 [   0.            0.            0.            0.            0.
     0.            0.            0.            0.            0.        ]
 [   0.            0.            0.            0.            0.
     0.            0.            0.            0.            0.        ]
 [   0.            0.            0.            0.            0.
     0.            0.            0.            0.            0.        ]
 [ -48.28660289  -11.46317745  -59.09231186 -118.1390371    48.27313553
    68.22849653  -33.67332355  -63.30599138   11.70656736   20.52135836]
 [-101.00513955   59.12995848  -82.39388287 -193.20274748  136.46572602
    93.63813327  -38.90186824 -129.51993855  -45.10246337   33.08826985]
 [  58.9716159    93.1963

In [0]:
class Embedding(Layer):
    """Lookup table mapping integer indices to rows of a trainable weight matrix."""

    def __init__(self, vocab_size, dim):
        """Create a ``(vocab_size, dim)`` weight matrix.

        Entries are uniform samples shifted to be centred on zero and divided
        by ``dim``.
        """
        super().__init__()

        self.vocab_size = vocab_size
        self.dim = dim

        # this random initialisation style is just a convention from word2vec
        centred = np.random.rand(vocab_size, dim) - 0.5
        self.weight = Tensor(centred / dim, autograd=True)

        self.parameters.append(self.weight)

    def forward(self, input):
        """Return the weight rows selected by the indices in ``input``."""
        selected = self.weight.index_select(input)
        return selected

import numpy
# Fix the NumPy RNG so weight initialisation is reproducible.
np.random.seed(0)

# data indices: rows of the embedding table to look up
data = Tensor(np.array([1,2,1,2]), autograd=True)

# target indices: the class index expected for each data item
target = Tensor(np.array([0,1,0,1]), autograd=True)

# Embedding (3 rows of size 3) -> tanh -> linear layer producing 4 class scores.
model = Sequential([Embedding(3,3), Tanh(), Linear(3,4)])
criterion = CrossEntropyLoss()

optim = SGD(parameters=model.get_parameters(), alpha=0.1)

# Ten full-batch training steps; the loss is printed after each one.
for i in range(10):
    
    # Predict
    pred = model.forward(data)
    
    # Compare
    loss = criterion.forward(pred, target)
    
    # Learn: backpropagate from the loss, then update (and zero) the parameters.
    loss.back(Tensor(np.ones_like(loss.data)))
    optim.step()
    print(loss)

1.3885032434928422
0.9558181509266036
0.6823083585795604
0.509525996749312
0.39574491472895856
0.31752527285348275
0.2617222861964216
0.22061283923954234
0.18946427334830074
0.16527389263866668
