#This notebook is dedicated to the creation of a Mini-Deep Learning Framework (Based on *Grokking deep learning* <-Great book, check it out)


First we will write the whole code so we can test the functions later

In [0]:
import numpy as np

class Tensor (object):
    """NumPy-backed array with optional reverse-mode automatic differentiation.

    Every operation on a tensor created with ``autograd=True`` returns a new
    tensor that remembers its inputs (``creators``) and the operation name
    (``creation_op``). ``backward`` walks that graph from the output towards
    the inputs and accumulates gradients in ``grad``.

    Attributes:
        data: the values, stored as ``np.array(data)``.
        autograd: whether this tensor takes part in gradient tracking.
        grad: accumulated gradient (a non-autograd ``Tensor``) or ``None``.
        id: identifier used as key in the ``children`` dict of the creators.
        creators: list of input tensors, or ``None`` for a leaf.
        creation_op: name of the operation that produced this tensor.
        children: maps a child's id to the number of gradients still
            expected from that child.
    """

    def __init__(self,data,
                 autograd=False,
                 creators=None,
                 creation_op=None,
                 id=None):

        self.data = np.array(data)
        self.autograd = autograd
        self.grad = None

        if(id is None):
            self.id = np.random.randint(0,1000000000)
        else:
            self.id = id

        self.creators = creators
        self.creation_op = creation_op
        self.children = {}

        # Register this tensor with each creator. A creator that appears
        # twice (e.g. ``x * x``) expects two gradients from this child.
        if(creators is not None):
            for c in creators:
                c.children[self.id] = c.children.get(self.id, 0) + 1

    def all_children_grads_accounted_for(self):
        """Return True when no child still owes this tensor a gradient."""
        return all(cnt == 0 for cnt in self.children.values())

    def backward(self,grad=None, grad_origin=None):
        """Accumulate ``grad`` and propagate it to the creators.

        Args:
            grad: incoming gradient as a non-autograd ``Tensor``. Defaults to
                ones shaped like ``self.data``.
            grad_origin: the child tensor that sends the gradient, or ``None``
                when ``backward`` is called directly by the user.

        Propagation to the creators happens once every registered child has
        delivered its gradient, or immediately when ``grad_origin`` is
        ``None``. Every operation passes itself as ``grad_origin`` so that a
        tensor feeding several operations forwards the *sum* of their
        gradients exactly once. A gradient from a child that has already
        delivered all of its gradients is ignored silently.

        Raises:
            AssertionError: if ``grad`` itself has autograd enabled.
        """
        if(not self.autograd):
            return

        if(grad is None):
            grad = Tensor(np.ones_like(self.data))

        if(grad_origin is not None):
            if(self.children[grad_origin.id] == 0):
                # This child already delivered everything it owed; ignore
                # the extra gradient instead of failing.
                return
            self.children[grad_origin.id] -= 1

        if(self.grad is None):
            self.grad = grad
        else:
            # Builds a new tensor, so a gradient object shared with another
            # node is never modified in place.
            self.grad = self.grad + grad

        # grads must not have grads of their own
        assert grad.autograd == False

        if(self.creators is None):
            return

        # Wait for the remaining children unless backward() was called on
        # this tensor directly.
        if(grad_origin is not None and
           not self.all_children_grads_accounted_for()):
            return

        op = self.creation_op
        if(op is None):
            return

        if(op == "add"):
            self.creators[0].backward(self.grad, self)
            self.creators[1].backward(self.grad, self)

        elif(op == "sub"):
            self.creators[0].backward(Tensor(self.grad.data), self)
            self.creators[1].backward(Tensor(self.grad.data * -1), self)

        elif(op == "mul"):
            new = self.grad * self.creators[1]
            self.creators[0].backward(new, self)
            new = self.grad * self.creators[0]
            self.creators[1].backward(new, self)

        elif(op == "mm"):
            c0 = self.creators[0]
            c1 = self.creators[1]
            g = self.grad.data
            # Plain NumPy is used here: calling c1.transpose() would create
            # an autograd tensor and register a child on c1 that never
            # sends a gradient back.
            c0.backward(Tensor(g.dot(c1.data.transpose())), self)
            c1.backward(Tensor(g.transpose().dot(c0.data).transpose()), self)

        elif(op == "transpose"):
            self.creators[0].backward(self.grad.transpose(), self)

        elif(op.startswith("sum_")):
            dim = int(op.split("_")[1])
            source = self.creators[0]
            source.backward(self.grad.expand(dim, source.data.shape[dim]), self)

        elif(op.startswith("expand_")):
            dim = int(op.split("_")[1])
            self.creators[0].backward(self.grad.sum(dim), self)

        elif(op == "neg"):
            self.creators[0].backward(-self.grad, self)

        elif(op == "sigmoid"):
            ones = np.ones_like(self.grad.data)
            local = self.data * (ones - self.data)
            self.creators[0].backward(Tensor(self.grad.data * local), self)

        elif(op == "tanh"):
            ones = np.ones_like(self.grad.data)
            local = ones - self.data * self.data
            self.creators[0].backward(Tensor(self.grad.data * local), self)

        elif(op == "index_select"):
            source = self.creators[0]
            indices_ = self.index_select_indices.data.flatten()
            new_grad = np.zeros_like(source.data, dtype=self.grad.data.dtype)
            # Use the accumulated gradient, one slice per selected index.
            grad_ = self.grad.data.reshape((len(indices_),) + new_grad.shape[1:])
            # np.add.at adds repeatedly for indices that occur more than once.
            np.add.at(new_grad, indices_, grad_)
            source.backward(Tensor(new_grad), self)

        elif(op == "cross_entropy"):
            # softmax minus one-hot targets. The incoming gradient is not
            # applied and the result is not divided by the number of rows.
            dx = self.softmax_output - self.target_dist
            self.creators[0].backward(Tensor(dx), self)

    def __add__(self, other):
        """Element-wise ``self + other``; tracked when both use autograd."""
        if(self.autograd and other.autograd):
            return Tensor(self.data + other.data,
                          autograd=True,
                          creators=[self,other],
                          creation_op="add")
        return Tensor(self.data + other.data)

    def __neg__(self):
        """Return the tensor multiplied by -1."""
        if(self.autograd):
            return Tensor(self.data * -1,
                          autograd=True,
                          creators=[self],
                          creation_op="neg")
        return Tensor(self.data * -1)

    def __sub__(self, other):
        """Element-wise ``self - other``; tracked when both use autograd."""
        if(self.autograd and other.autograd):
            return Tensor(self.data - other.data,
                          autograd=True,
                          creators=[self,other],
                          creation_op="sub")
        return Tensor(self.data - other.data)

    def __mul__(self, other):
        """Element-wise ``self * other``; tracked when both use autograd."""
        if(self.autograd and other.autograd):
            return Tensor(self.data * other.data,
                          autograd=True,
                          creators=[self,other],
                          creation_op="mul")
        return Tensor(self.data * other.data)

    def sum(self, dim):
        """Sum the data over axis ``dim`` (that axis is removed)."""
        if(self.autograd):
            return Tensor(self.data.sum(dim),
                          autograd=True,
                          creators=[self],
                          creation_op="sum_"+str(dim))
        return Tensor(self.data.sum(dim))

    def expand(self, dim,copies):
        """Insert a new axis at position ``dim`` holding ``copies`` repeats."""
        # Repeat every element along a new trailing axis, then move that
        # axis to position ``dim``.
        trans_cmd = list(range(0,len(self.data.shape)))
        trans_cmd.insert(dim,len(self.data.shape))
        new_data = self.data.repeat(copies).reshape(list(self.data.shape) + [copies]).transpose(trans_cmd)

        if(self.autograd):
            return Tensor(new_data,
                          autograd=True,
                          creators=[self],
                          creation_op="expand_"+str(dim))
        return Tensor(new_data)

    def transpose(self):
        """Return the tensor with its axes reversed."""
        if(self.autograd):
            return Tensor(self.data.transpose(),
                          autograd=True,
                          creators=[self],
                          creation_op="transpose")

        return Tensor(self.data.transpose())

    def mm(self, x):
        """Matrix product ``self.data.dot(x.data)``; tracked when ``self`` uses autograd."""
        if(self.autograd):
            return Tensor(self.data.dot(x.data),
                          autograd=True,
                          creators=[self,x],
                          creation_op="mm")
        return Tensor(self.data.dot(x.data))

    def sigmoid(self):
        """Element-wise logistic function ``1 / (1 + exp(-x))``."""
        if(self.autograd):
            return Tensor(1 / (1 + np.exp(-self.data)),
                          autograd=True,
                          creators=[self],
                          creation_op="sigmoid")
        return Tensor(1 / (1 + np.exp(-self.data)))

    def tanh(self):
        """Element-wise hyperbolic tangent."""
        if(self.autograd):
            return Tensor(np.tanh(self.data),
                          autograd=True,
                          creators=[self],
                          creation_op="tanh")
        return Tensor(np.tanh(self.data))

    def index_select(self, indices):
        """Select entries along the first axis using the integer tensor ``indices``."""
        if(self.autograd):
            new = Tensor(self.data[indices.data],
                         autograd=True,
                         creators=[self],
                         creation_op="index_select")
            # Remembered so that backward can route gradients to the
            # selected rows.
            new.index_select_indices = indices
            return new
        return Tensor(self.data[indices.data])

    def _log_softmax(self):
        """Return log-softmax over the last axis as a NumPy array.

        The row maximum is subtracted before exponentiating so that large
        values cannot overflow ``np.exp``.
        """
        shifted = self.data - self.data.max(axis=-1, keepdims=True)
        return shifted - np.log(np.exp(shifted).sum(axis=-1, keepdims=True))

    def softmax(self):
        """Return softmax over the last axis as a NumPy array (not a Tensor)."""
        return np.exp(self._log_softmax())

    def cross_entropy(self, target_indices):
        """Mean cross-entropy between softmax(self) and integer class targets.

        Args:
            target_indices: tensor of integer class indices; it is flattened
                and the log-probabilities are reshaped to one row per target.

        Returns:
            A scalar ``Tensor``. With autograd it also carries
            ``softmax_output`` and ``target_dist`` (one-hot targets), which
            ``backward`` uses.
        """
        log_probs = self._log_softmax()
        softmax_output = np.exp(log_probs)

        t = target_indices.data.flatten()
        log_p = log_probs.reshape(len(t),-1)
        target_dist = np.eye(log_p.shape[1])[t]
        # Working with log-probabilities avoids log(0) when a probability
        # underflows.
        loss = -(log_p * (target_dist)).sum(1).mean()

        if(self.autograd):
            out = Tensor(loss,
                         autograd=True,
                         creators=[self],
                         creation_op="cross_entropy")
            out.softmax_output = softmax_output
            out.target_dist = target_dist
            return out

        return Tensor(loss)

    def __repr__(self):
        return str(self.data.__repr__())

    def __str__(self):
        return str(self.data.__str__())

class Layer(object):
    """Base class for layers: owns the list of trainable parameters."""

    def __init__(self):
        # Subclasses append their trainable tensors to this list.
        self.parameters = []

    def get_parameters(self):
        """Return the list of parameters held by this layer."""
        params = self.parameters
        return params

    
class SGD(object):
    """Plain stochastic gradient descent over a list of parameters.

    Args:
        parameters: iterable of objects exposing ``data`` and ``grad``
            (``grad`` is either ``None`` or has its own ``data`` array).
        alpha: learning rate applied to every gradient.
    """

    def __init__(self, parameters, alpha=0.1):
        self.parameters = parameters
        self.alpha = alpha

    def zero(self):
        """Reset every existing gradient to zero in place."""
        for p in self.parameters:
            # A parameter that has not received a gradient yet has nothing
            # to reset.
            if(p.grad is not None):
                p.grad.data *= 0

    def step(self, zero=True):
        """Apply ``data -= grad * alpha`` to every parameter that has a gradient.

        Args:
            zero: when True, each gradient is zeroed right after it is applied.
        """
        for p in self.parameters:

            # Skip parameters that took no part in the last backward pass
            # instead of failing on ``None.data``.
            if(p.grad is None):
                continue

            p.data -= p.grad.data * self.alpha

            if(zero):
                p.grad.data *= 0


class Linear(Layer):
    """Fully connected layer: ``input.mm(weight)`` plus an optional bias."""

    def __init__(self, n_inputs, n_outputs, bias=True):
        super().__init__()

        self.use_bias = bias

        # Random normal weights scaled by sqrt(2 / n_inputs).
        scale = np.sqrt(2.0/(n_inputs))
        W = np.random.randn(n_inputs, n_outputs) * scale
        self.weight = Tensor(W, autograd=True)
        self.parameters.append(self.weight)

        # The bias starts at zero and only exists when requested.
        if self.use_bias:
            self.bias = Tensor(np.zeros(n_outputs), autograd=True)
            self.parameters.append(self.bias)

    def forward(self, input):
        """Return the matrix product with the weights, plus the bias if enabled."""
        projected = input.mm(self.weight)
        if not self.use_bias:
            return projected
        # The bias gets one copy per entry along the first axis of the input.
        n_rows = len(input.data)
        return projected + self.bias.expand(0, n_rows)


class Sequential(Layer):
    """Container that applies its layers one after another."""

    def __init__(self, layers=None):
        """Create the container.

        Args:
            layers: optional list of layers. When omitted, every instance
                gets its own empty list (a ``list()`` default would be
                shared by all instances, so ``add`` on one model would
                change the others).
        """
        super().__init__()

        self.layers = [] if layers is None else layers

    def add(self, layer):
        """Append ``layer`` to the end of the pipeline."""
        self.layers.append(layer)

    def forward(self, input):
        """Feed ``input`` through every layer in order and return the result."""
        for layer in self.layers:
            input = layer.forward(input)
        return input

    def get_parameters(self):
        """Return the parameters of all layers, concatenated in layer order."""
        params = list()
        for l in self.layers:
            params += l.get_parameters()
        return params


class Embedding(Layer):
    """Lookup table holding one ``dim``-sized row per vocabulary entry."""

    def __init__(self, vocab_size, dim):
        super().__init__()

        self.vocab_size = vocab_size
        self.dim = dim

        # Uniform values in [-0.5, 0.5) divided by dim -- an initialisation
        # convention borrowed from word2vec.
        initial = (np.random.rand(vocab_size, dim) - 0.5) / dim
        self.weight = Tensor(initial, autograd=True)

        self.parameters.append(self.weight)

    def forward(self, input):
        """Return the weight rows selected by the index tensor ``input``."""
        rows = self.weight.index_select(input)
        return rows


class Tanh(Layer):
    """Parameter-free layer that applies ``tanh`` to its input."""

    # No constructor needed: Layer.__init__ already creates the (empty)
    # parameter list.

    def forward(self, input):
        """Return ``input.tanh()``."""
        activated = input.tanh()
        return activated


class Sigmoid(Layer):
    """Parameter-free layer that applies ``sigmoid`` to its input."""

    # No constructor needed: Layer.__init__ already creates the (empty)
    # parameter list.

    def forward(self, input):
        """Return ``input.sigmoid()``."""
        activated = input.sigmoid()
        return activated
    

class CrossEntropyLoss(object):
    """Loss object that delegates to the ``cross_entropy`` method of its input."""

    def __init__(self):
        super().__init__()

    def forward(self, input, target):
        """Return ``input.cross_entropy(target)``."""
        loss = input.cross_entropy(target)
        return loss

    
class RNNCell(Layer):
    """Single-step recurrent cell built from three ``Linear`` layers.

    Args:
        n_inputs: size of each input vector.
        n_hidden: size of the hidden state.
        n_output: size of each output vector.
        activation: ``'sigmoid'`` or ``'tanh'``, the non-linearity applied
            to the combined input and previous hidden state.

    Raises:
        Exception: if ``activation`` is not one of the supported names.
    """

    def __init__(self, n_inputs, n_hidden, n_output, activation='sigmoid'):
        super().__init__()

        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_output = n_output

        if(activation == 'sigmoid'):
            self.activation = Sigmoid()
        elif(activation == 'tanh'):
            # Assignment, not comparison: with ``==`` the attribute was never
            # set and forward() failed for tanh cells.
            self.activation = Tanh()
        else:
            raise Exception("Non-linearity not found")

        self.w_ih = Linear(n_inputs, n_hidden)
        self.w_hh = Linear(n_hidden, n_hidden)
        self.w_ho = Linear(n_hidden, n_output)

        self.parameters += self.w_ih.get_parameters()
        self.parameters += self.w_hh.get_parameters()
        self.parameters += self.w_ho.get_parameters()

    def forward(self, input, hidden):
        """Run one time step.

        Returns:
            ``(output, new_hidden)``: the projection of the new hidden state
            and the new hidden state itself.
        """
        from_prev_hidden = self.w_hh.forward(hidden)
        combined = self.w_ih.forward(input) + from_prev_hidden
        new_hidden = self.activation.forward(combined)
        output = self.w_ho.forward(new_hidden)
        return output, new_hidden

    def init_hidden(self, batch_size=1):
        """Return an all-zero hidden state of shape ``(batch_size, n_hidden)``."""
        return Tensor(np.zeros((batch_size,self.n_hidden)), autograd=True)
    
class LSTMCell(Layer):
    """Single-step LSTM cell with forget, input and output gates and a cell update."""

    def __init__(self, n_inputs, n_hidden, n_output):
        super().__init__()

        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_output = n_output

        # Input projections (with bias): forget, input, output, cell update.
        for name in ("xf", "xi", "xo", "xc"):
            setattr(self, name, Linear(n_inputs, n_hidden))

        # Hidden-state projections (no bias), in the same gate order.
        for name in ("hf", "hi", "ho", "hc"):
            setattr(self, name, Linear(n_hidden, n_hidden, bias=False))

        # Projection from the hidden state to the output.
        self.w_ho = Linear(n_hidden, n_output, bias=False)

        sublayers = (self.xf, self.xi, self.xo, self.xc,
                     self.hf, self.hi, self.ho, self.hc,
                     self.w_ho)
        for sublayer in sublayers:
            self.parameters += sublayer.get_parameters()

    def forward(self, input, hidden):
        """Run one time step.

        Args:
            input: input tensor for this step.
            hidden: pair ``(previous hidden state, previous cell state)``.

        Returns:
            ``(output, (h, c))`` with the new hidden and cell states.
        """
        prev_hidden = hidden[0]
        prev_cell = hidden[1]

        forget_gate = (self.xf.forward(input) + self.hf.forward(prev_hidden)).sigmoid()
        input_gate = (self.xi.forward(input) + self.hi.forward(prev_hidden)).sigmoid()
        output_gate = (self.xo.forward(input) + self.ho.forward(prev_hidden)).sigmoid()
        candidate = (self.xc.forward(input) + self.hc.forward(prev_hidden)).tanh()

        kept = forget_gate * prev_cell
        written = input_gate * candidate
        c = kept + written

        h = output_gate * c.tanh()

        output = self.w_ho.forward(h)
        return output, (h, c)

    def init_hidden(self, batch_size=1):
        """Return initial ``(hidden, cell)`` states: zeros with column 0 set to 1."""
        shape = (batch_size, self.n_hidden)
        init_hidden = Tensor(np.zeros(shape), autograd=True)
        init_cell = Tensor(np.zeros(shape), autograd=True)
        for state in (init_hidden, init_cell):
            state.data[:, 0] += 1
        return (init_hidden, init_cell)

## The Tensor() class
**The framework is based on the Tensor class, which can do:**
  - Backpropagation <- possibly the most important part.
  - Adding tensors `__add__`
  - Negating a tensor (flipping its sign) `__neg__`
  - Subtracting tensors `__sub__`
  - Element-wise multiplication `__mul__`
  - Summing tensors `sum`
    + Disambiguation: This function reduces a vector to a single value (the sum) in a dimension.
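  - expanding a Tensor `expand`
    + Adds a new dimension to a tensor by repeating its data a given number of times along that dimension
  - transposing tensors `transpose`
    + Transposes a tensor.
  - Matrix multiplication `mm`
    + Calculates the matrix (dot) product of two tensors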
  - Sigmoid `sigmoid`
    + Calculates sigmoid
  - Tanh `tanh`
   + Calculates tanh
  - index_select `index_select`
    + Selects the rows of the tensor at the given indices
  - softmax `softmax`
    + Calculates softmax probabilities
  - cross_entropy `cross_entropy`
    + Calculates cross entropy
  - repr `__repr__`
    + returns the data of the Tensor as string
  - string `__str__`
    + returns the data of the Tensor as string








### The backward function (Backpropagation)
This function is where the magic happens, and where the network learns.

The steps of this function are:
1. Check whether the call came from a child node (`grad_origin`). Tensors without autograd (e.g. the raw input data) are skipped entirely.
   + If that child has already delivered its gradient, the call returns silently instead of raising an exception.
2. Accumulate the incoming gradient into `self.grad`. If no gradient was passed in (this is the node `backward()` was called on directly, "the beginning of the road"), a gradient of ones is used.
3. Assert that gradients do not have gradients of their own.
4. Check if there is something to backpropagate into AND if all gradients (from children) are accounted for; waiting for children is overridden if `backward` was called on this variable directly.
  + Inside this part of the code we have the backpropagation math for every defined operation.
  + These rules "reverse" each forward operation (the chain rule): they turn the gradient of a node's output into the gradients of its inputs.




### Testing every operation from the Tensor class. and examples of backpropagation with each operation


#### add


In [0]:
a = Tensor([1,1,1], autograd=True)
b = Tensor([1,2,3], autograd=True)

print(a.__add__(b))
print(a + b) #Also works: defining __add__ overloads the '+' operator

[2 3 4]
[2 3 4]


In [0]:
#The backpropagation
c = a.__add__(b)
print(c)
print(c.grad)
c.backward()
print(c.grad) #The new calculated gradients (right now is 1 because we don't have a loss)
  # and the backward() function assigns it automatically
print(c.creators) #Where c comes from


[2 3 4]
None
[1 1 1]
[array([1, 1, 1]), array([1, 2, 3])]


#### Neg


In [0]:
a = Tensor([1,1,1], autograd=True)
a.__neg__()

array([-1, -1, -1])

In [0]:
#The backpropagation
c = a.__neg__()
print(c)
print(c.grad)
c.backward()
print(c.grad) #The new calculated gradients (right now is 1 because we don't have a loss)
  # and the backward() function assigns it automatically
print(c.creators) #Where c comes from

[-1 -1 -1]
None
[1 1 1]
[array([1, 1, 1])]


#### Sub


In [0]:
a = Tensor([1,1,1], autograd=True)
b = Tensor([1,2,3], autograd=True)

a.__sub__(b)

array([ 0, -1, -2])

In [0]:
#The backpropagation
c = a.__sub__(b)
print(c)
print(c.grad)
c.backward()
print(c.grad) #The new calculated gradients (right now is 1 because we don't have a loss)
  # and the backward() function assigns it automatically
print(c.creators) #Where c comes from


[ 0 -1 -2]
None
[1 1 1]
[array([1, 1, 1]), array([1, 2, 3])]


#### Mul


In [0]:
a = Tensor([1,1,1], autograd=True)
b = Tensor([1,2,3], autograd=True)

a.__mul__(b)

array([1, 2, 3])

In [0]:
#The backpropagation
c = a.__mul__(b)
print(c)
print(c.grad)
c.backward()
print(c.grad) #The new calculated gradients (right now is 1 because we don't have a loss)
  # and the backward() function assigns it automatically
print(c.creators) #Where c comes from


[1 2 3]
None
[1 1 1]
[array([1, 1, 1]), array([1, 2, 3])]


Before continuing note that the magic of backprop is how you can combine functions, and stack them and still get the proper gradients.

In [0]:
a = Tensor([1,1,1], autograd=True)
b = Tensor([1,2,3], autograd=True)

#The backpropagation
c = a.__add__(b)
d = b.__mul__(a)

e = c.__sub__(d)
f = e.__neg__()

In [0]:
print(a.grad)
print(a.children) #The id of the children nodes of a

None
{585014643: 1, 551494389: 1}


In [0]:
print(f)
print(f.grad)
f.backward()
print(f.grad)
print(f.creators) 

[-1 -1 -1]
None
[1 1 1]
[array([1, 1, 1])]


In [0]:
### Also notice how the gradients of a and b change when f.backward() is called
  # Because f comes from a chain of operations that starts with a and b
print(a)
print(a.grad)

[1 1 1]
[0 1 2]


#### Sum

In [0]:
a = Tensor([1,1,1], autograd = True)

In [0]:
a.sum(0)

array(3)

In [0]:
#The backpropagation
c = a.sum(0)
print(c)
print(c.grad)
c.backward()
print(c.grad)
print(c.creators)


3
None
1
[array([1, 1, 1])]


####Expand

In [0]:
a = Tensor([1,1,1], autograd = True)
print(a.expand(0,2)) #dimension to expand, copies
print("\n")
print(a.expand(1,4))

[[1 1 1]
 [1 1 1]]


[[1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]]


In [0]:
#The backpropagation
c = a.expand(0,2)
print(c)
print(c.grad)
c.backward()
print(c.grad)
print(c.creators)


[[1 1 1]
 [1 1 1]]
None
[[1 1 1]
 [1 1 1]]
[array([1, 1, 1])]


#### Transpose

In [0]:
a = Tensor([[0,0],[1,1],[2,2]], autograd = True)
a.transpose()

array([[0, 1, 2],
       [0, 1, 2]])

In [0]:
#The backpropagation
c = a.transpose()
print(c)
print(c.grad)
c.backward()
print(c.grad)
print(c.creators)


[[0 1 2]
 [0 1 2]]
None
[[1 1 1]
 [1 1 1]]
[array([[0, 0],
       [1, 1],
       [2, 2]])]


#### mm (dot product)


In [0]:
a = Tensor([1,1,1], autograd = True)
b = Tensor([1,2,3], autograd = True)

a.mm(b) #1*1 + 1*2 + 1*3

array(6)

In [0]:
#The backpropagation
c = a.mm(b)
print(c)
print(c.grad)
c.backward()
print(c.grad)
print(c.creators)


6
None
1
[array([1, 1, 1]), array([1, 2, 3])]


#### Sigmoid

In [0]:
a = Tensor([-1,1,3], autograd = True)
a.sigmoid()

array([0.26894142, 0.73105858, 0.95257413])

In [0]:
#The backpropagation
c = a.sigmoid()
print(c)
print(c.grad)
c.backward()
print(c.grad)
print(c.creators)

[0.26894142 0.73105858 0.95257413]
None
[1. 1. 1.]
[array([-1,  1,  3])]


####Tanh

In [0]:
a = Tensor([-1,1,3], autograd = True)
a.tanh()

array([-0.76159416,  0.76159416,  0.99505475])

In [0]:
#The backpropagation
c = a.tanh()
print(c)
print(c.grad)
c.backward()
print(c.grad)
print(c.creators)

[-0.76159416  0.76159416  0.99505475]
None
[1. 1. 1.]
[array([-1,  1,  3])]


####Index_select

In [0]:
a = Tensor([[0,0],[1,1],[2,2]], autograd = True)
a.index_select(Tensor([0,2]))


array([[0, 0],
       [2, 2]])

In [0]:
#The backpropagation
c = a.index_select(Tensor([0,2]))
print(c)
print(c.grad)
c.backward()
print(c.grad)
print(c.creators)

[[0 0]
 [2 2]]
None
[[1 1]
 [1 1]]
[array([[0, 0],
       [1, 1],
       [2, 2]])]


#### Softmax

In [0]:
a = Tensor([[0,0],[10,1],[0,2]], autograd = True)
a.softmax()

array([[5.00000000e-01, 5.00000000e-01],
       [9.99876605e-01, 1.23394576e-04],
       [1.19202922e-01, 8.80797078e-01]])

In [0]:
#The backpropagation
  #Here there are no gradients, c.grads does not exist
  # This is because it is a numpy array (because softmax transforms it)
"""
c = a.softmax()
print(c)
print(c.grad)
c.backward()
print(c.grad)
print(c.creators)
"""

'\nc = a.softmax()\nprint(c)\nprint(c.grad)\nc.backward()\nprint(c.grad)\nprint(c.creators)\n'

#### cross_entropy

In [0]:
a = Tensor([[0,0],[10,1],[0,2]], autograd = True)
targets = Tensor([0,1,0])
a.cross_entropy(targets)

array(3.9400662)

In [0]:
#The backpropagation
c = a.cross_entropy(targets)
print(c)
print(c.grad)
c.backward()
print(c.grad)
print(c.creators)

3.9400661979308804
None
1.0
[array([[ 0,  0],
       [10,  1],
       [ 0,  2]])]


Some things must be commented on:
If the tensor has more dimensions than the labels (e.g. the previous example), the softmax output is reshaped to one row per label, the labels are flattened and one-hot encoded, and the loss is the mean over rows of the negative log-probability of the correct class:
`loss = -(np.log(p) * (target_dist)).sum(1).mean()`


#### Repr and string


In [0]:
a = Tensor([[0,0],[10,1],[0,2]], autograd = True)
print(a.__repr__())
print(a.__str__())


array([[ 0,  0],
       [10,  1],
       [ 0,  2]])
[[ 0  0]
 [10  1]
 [ 0  2]]


## The Layer class
Other classes were created to support our main class, they are
  - Layer
  - SGD
  - Linear
  - Sequential
  - Embedding
  - Tanh
  - Sigmoid
  - CrossEntropyLoss
  - RNNCell
  - LSTMCell

The main function of every class (except Layer and SGD) is their respective .forward() function, which returns the prediction of that layer.

For the Layer() class, it simply provides a base for every other class (Except SGD)

For the SGD() class, it is an optimizer that applies the change in the weights to the model.



### Testing every class

Layer()

In [0]:
Layer()

<__main__.Layer at 0x7fc71de0f240>

#### SGD
SGD is an optimizer (stochastic gradient descent), not a layer: it updates the parameters using their gradients.
To illustrate it, a single training pass of a small network is shown below.



In [0]:
import numpy
import sys

# Four 2-feature examples and one target value per example.
data = Tensor(np.array([[0,0],[0,1],[1,0],[1,1]]), autograd=True)
target = Tensor(np.array([[0],[1],[0],[1]]), autograd=True)

# Two weight matrices: 2 inputs -> 3 hidden units -> 1 output.
w = list()
w.append(Tensor(np.random.rand(2,3), autograd=True))
w.append(Tensor(np.random.rand(3,1), autograd=True))

optim = SGD(parameters=w, alpha=0.1)

# To run several passes, wrap the Predict/Compare/Learn steps below in
# `for i in range(10):`

# Predict: two chained matrix products, no non-linearity
pred = data.mm(w[0]).mm(w[1])

# Compare: squared error summed over the examples
loss = ((pred - target)*(pred - target)).sum(0)

# Learn: backpropagate, then let the optimizer update the weights
loss.backward(Tensor(np.ones_like(loss.data)))
optim.step() # subtracts alpha * gradient from every parameter and zeroes the gradients

sys.stdout.write('\r'+str(loss))

[0.58359915]

#### Linear

In [0]:
Linear_layer = Linear(2,3)
print(Linear_layer.weight)
print(Linear_layer.bias)

[[ 1.33626584 -1.03141487 -0.18351639]
 [ 0.06167598  0.35537624  2.04473991]]
[0. 0. 0.]


In [0]:
#Prediction, it's the weights 
Linear_layer.forward(Tensor([1,2],autograd=True))
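#Returns two identical rows: the input is 1-D with length 2, so the bias is expanded to len(input.data) = 2 rows and broadcast against the length-3 product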

array([[-1.68639651, -2.47933601, -0.86747018],
       [-1.68639651, -2.47933601, -0.86747018]])

#### Sequential

In [0]:
Sequential_layer = Sequential([Linear(2,3), Linear(3,1)])
SLParams = Sequential_layer.get_parameters()
SLParams[0:1] #Weight matrix of the first Linear (its bias is SLParams[1]; the parameters of the second Linear follow)

[array([[ 2.32571573,  2.19716518, -1.18415004],
        [-0.76148084, -0.11429519,  1.45228121]])]

In [0]:
#NOTE THAT THE DIMENSIONS MUST MAKE SENSE, the tensor passed has 1x2 -> 
  # 2x3 (of first linear) ->3x1 (returns a single value per input)
Sequential_layer.forward(Tensor([1,0],autograd=True)) 

array([[-0.74233009],
       [-0.74233009]])

#### Embedding

In [0]:
Embedding_layer = Embedding(2,3)


#Parameters show all the parameters, in this case, the layer only has weights
print(Embedding_layer.parameters)
print("\n")
Embedding_layer.forward(Tensor([1,0],autograd=True)) 

[array([[ 0.05573594,  0.10320427, -0.01470895],
       [ 0.01508788, -0.08155138,  0.02532749]])]




array([[ 0.01508788, -0.08155138,  0.02532749],
       [ 0.05573594,  0.10320427, -0.01470895]])

#### Tanh

In [0]:
tanh_layer = Tanh()
tanh_layer.forward(Tensor([-1,2,3],autograd = True))

array([-0.76159416,  0.96402758,  0.99505475])

#### Sigmoid


In [0]:
sigmoid_layer = Sigmoid()
sigmoid_layer.forward(Tensor([1,2,3],autograd = True))

array([0.73105858, 0.88079708, 0.95257413])

#### Cross entropy loss

In [0]:
CrossEntopy_layer = CrossEntropyLoss()
CrossEntopy_layer.forward(Tensor([1,2,3],autograd = True),Tensor([1],autograd = True))

array(1.40760596)

#### RNNCell

In [0]:
#n_inputs must match with the second size of the embedding layer
model = RNNCell(n_inputs=3, n_hidden=2, n_output=1)
hidden = model.init_hidden(batch_size = 1)
rnn_input = Embedding(2,3).forward(Tensor([1,0],autograd=True)) #Taken from the embeding class 
## rnn_input is 2x3
output, hidden = model.forward(input=rnn_input, hidden=hidden)
print(hidden)
print("\n")
print(output)

[[0.49790149 0.51723242]
 [0.56408507 0.50974354]]


[[-0.02591581]
 [-0.02032943]]


#### LSTM Cell

In [0]:
current_vocab_size = 5
input_dimension = 3
#dim has to be equal to n_inputs
embed_lstm = Embedding(vocab_size=current_vocab_size,dim=input_dimension)
model = LSTMCell(n_inputs=input_dimension, n_hidden=4, n_output=current_vocab_size)

#n_hidden is the size of the hidden (and cell) state vector.
hidden = model.init_hidden(batch_size = 3) #batch_size has to match (or broadcast with) the number of input indices
lstm_input = embed_lstm.forward(Tensor([1,0,1],autograd=True))
# The output has one row per input index, so the number of indices passed to
  # embed_lstm must match (or broadcast with) the batch_size used in init_hidden
output, hidden = model.forward(input=lstm_input, hidden=hidden)
#Output is (number of inputs) X vocab_size (one row of predictions per input index)
print(hidden) #The hidden layers
print("\n")
print(output)#The predictions ()

(array([[ 0.20474022, -0.22035093, -0.22545262, -0.03007398],
       [ 0.20072954, -0.22994678, -0.22875616, -0.0770633 ],
       [ 0.20474022, -0.22035093, -0.22545262, -0.03007398]]), array([[ 0.62935339, -0.43628705, -0.54277915, -0.04061711],
       [ 0.58501956, -0.48393447, -0.54249725, -0.10305764],
       [ 0.62935339, -0.43628705, -0.54277915, -0.04061711]]))


[[-0.49274267 -0.21660056  0.22842868 -0.01471593 -0.20429764]
 [-0.47992161 -0.30172723  0.21678575 -0.015754   -0.23023531]
 [-0.49274267 -0.21660056  0.22842868 -0.01471593 -0.20429764]]


In [0]:
embed_lstm.parameters #returns vocab_size array X input_dimension

[array([[-0.14687184,  0.12223735, -0.10779355],
        [-0.16010357,  0.00816488, -0.00171421],
        [-0.14608273, -0.16573372, -0.03558867],
        [ 0.05641243, -0.00957594,  0.00776743],
        [ 0.06203665,  0.05508864, -0.09712408]])]

In [0]:
np.array(output.data).shape # (, vocab_size)

(3, 5)

----------

# Training and using the mini-framework

## Example and data taken from Grokking Deep Learning


In [0]:
import sys,random,math
from collections import Counter
import numpy as np
import sys

np.random.seed(0)

# dataset from http://karpathy.github.io/2015/05/21/rnn-effectiveness/
# (Can be found in repo as well)

# Load data. The context manager closes the file even if reading fails.
with open('shakespear.txt','r') as f:
    raw = f.read()

# Pre-process data: one vocabulary entry per distinct character.
# Sorting makes the character -> index mapping identical on every run;
# the iteration order of a set of strings can change between interpreter
# runs, which the NumPy seed above does not control.
vocab = sorted(set(raw))
word2index = {word: i for i, word in enumerate(vocab)}
indices = np.array([word2index[ch] for ch in raw])

# Set the layers and model
embed = Embedding(vocab_size=len(vocab),dim=512)
model = LSTMCell(n_inputs=512, n_hidden=512, n_output=len(vocab))
# Zero the output projection so the first predictions are uniform.
model.w_ho.weight.data *= 0

# Set the criterion (loss) and create the optimizer over all parameters
criterion = CrossEntropyLoss()
optim = SGD(parameters=model.get_parameters() + embed.get_parameters(), alpha=0.05)

# This function  generates an example of text predicted with the network
def generate_sample(n=30, init_char=' ', sample=False):
    """Generate ``n`` characters of text with the current model.

    Args:
        n: number of characters to generate.
        init_char: character fed to the model first; must be in the vocabulary.
        sample: when False (default) the most likely character is taken at
            every step, as before. When True the next character is drawn
            from the softmax of the scaled scores.

    Returns:
        The generated characters joined into one string.
    """
    chars = []
    hidden = model.init_hidden(batch_size=1)
    current = Tensor(np.array([word2index[init_char]]))
    for _ in range(n):
        rnn_input = embed.forward(current)
        output, hidden = model.forward(input=rnn_input, hidden=hidden)

        # Scaling by 15 sharpens the distribution used for sampling; it
        # does not change which score is the largest.
        scores = output.data.flatten() * 15

        if sample:
            # Softmax with the maximum subtracted so np.exp cannot overflow.
            probs = np.exp(scores - scores.max())
            probs /= probs.sum()
            m = np.random.choice(len(probs), p=probs)
        else:
            m = scores.argmax() # take the max prediction

        chars.append(vocab[m])
        current = Tensor(np.array([m]))
    return "".join(chars)

# Number of parallel text streams per step, and the truncated-BPTT window.
batch_size = 16
bptt = 25
n_batches = indices.shape[0] // batch_size

# `indices` is the whole text as integers. Drop the tail that does not fill
# a complete row, then lay the text out as (n_batches, batch_size): after the
# transpose each COLUMN is one contiguous section of the original text, so
# moving down a column moves forward in the text.
trimmed_indices = indices[:n_batches*batch_size]
batched_indices = trimmed_indices.reshape(batch_size, n_batches).T

# Inputs are every row except the last; targets are the same rows shifted
# by one (every row except the first).
input_batched_indices = batched_indices[:-1]
target_batched_indices = batched_indices[1:]

# Number of complete BPTT windows that fit into the available rows.
n_bptt = int(((n_batches-1) / bptt))
usable_rows = n_bptt*bptt

# Final shape for both arrays: (n_bptt, bptt, batch_size).
input_batches = input_batched_indices[:usable_rows].reshape(n_bptt, bptt, batch_size)
target_batches = target_batched_indices[:usable_rows].reshape(n_bptt, bptt, batch_size)

In [0]:
#The actual training
def train(iterations=400):
    """Train the global model and embedding with truncated BPTT.

    Args:
        iterations: number of passes over ``input_batches``.

    Progress is written to stdout after every batch; the learning rate of
    the global optimizer is multiplied by 0.99 after every pass.
    """
    min_loss = 1000
    for epoch in range(iterations):
        # Running sum of the per-step losses of this pass.
        total_loss = 0

        # Fresh hidden and cell state at the start of every pass.
        hidden = model.init_hidden(batch_size=batch_size)
        batches_to_train = len(input_batches)

        for batch_i in range(batches_to_train):

            # Re-wrap the states in new tensors so the graph of the previous
            # window is not backpropagated through again.
            prev_h, prev_c = hidden[0], hidden[1]
            hidden = (Tensor(prev_h.data, autograd=True), Tensor(prev_c.data, autograd=True))

            # losses[k] is the sum of the step losses 0..k of this window.
            losses = []

            for t in range(bptt):
                # input_batches[batch_i][t] holds one index per stream
                # (assumes the (n_bptt, bptt, batch_size) layout built above
                # this function -- verify if the batching changes).
                input = Tensor(input_batches[batch_i][t], autograd=True)
                rnn_input = embed.forward(input=input)

                # One model step: prediction plus the updated states.
                output, hidden = model.forward(input=rnn_input, hidden=hidden)

                target = Tensor(target_batches[batch_i][t], autograd=True)
                batch_loss = criterion.forward(output, target)

                # First step starts the running sum, later steps extend it.
                losses.append(batch_loss if not losses else batch_loss + losses[-1])

            # The last entry is the loss of the whole window.
            loss = losses[-1]

            loss.backward()
            optim.step()

            # Average loss per time step, accumulated over the batches.
            total_loss += loss.data / bptt

            # exp of the mean loss so far; remember the best value seen.
            epoch_loss = np.exp(total_loss / (batch_i+1))
            if(epoch_loss < min_loss):
                min_loss = epoch_loss

            pieces = [
                "\r Iter:", str(epoch),
                " - Alpha:", str(optim.alpha)[0:5],
                " - Batch ", str(batch_i+1), "/", str(len(input_batches)),
                " - Min Loss:", str(min_loss)[0:5],
                " - Loss:", str(epoch_loss),
            ]
            # Show a generated sample once per pass, after the first batch.
            if(batch_i == 0):
                pieces.append(" - " + generate_sample(n=70, init_char='T').replace("\n"," "))
            sys.stdout.write("".join(pieces))
        optim.alpha *= 0.99

In [0]:
train()