In [682]:
import torch
import math
#torch.set_grad_enabled(False)

In [683]:
class Module(object):
    def forward (self, *input):
        raise NotImplementedError
        
    def backward ( self , * gradwrtoutput ) :
        raise NotImplementedError
    
    def get_parameters( self ) :
        return []   

In [684]:
class Losses(object):        
    def forward():
        return NotImplementedError
    def backward():
        NotImplementedError

In [685]:
class Optimizers(object):
    # this is a SGD optimizer
    def __init__(self,lr,max_iter, parameters) :  # should we add a "tolerance_grad" argument ? 
        super().__init__()
        self.eta = lr
        self.maxStep = max_iter # maybe this shouldn't be put inside the module
        self.param = parameters
        self.number_step = 0

    def zero_grad(self):
        for parameter in self.param : 
            parameter.grad = 0

    def step(self): #batch de datapoint  --> confused : how can we do it stochastic ? ou alors on l'appelle step(batch)
        # right now, eta is considered constant 
        #print(self.param[1].data)
        if self.number_step <=self.maxStep:
          for parameter in self.param : 
              #print(parameter.data[1])
              parameter = parameter.data - self.eta * parameter.grad
              #print(parameter.data[1])
          self.number_step = self.number_step + 1
          #print('after',self.param[1].data)
        return self.param

In [686]:
class Parameter():
    def __init__(self):
        self.name = ''
        self.data = None
        self.grad = None

In [687]:
class Linear(Module):
    
    def __init__(self, input_dim, out_dim, bias = True):
        super().__init__()
        std = 1/math.sqrt(input_dim)
        self.weight = Parameter()
        self.parameters = []
        
        self.weight.data = torch.rand(out_dim, input_dim)
        self.weight.data = 2*std*self.weight.data - std
        self.weight.name = 'weight'
        self.parameters += [self.weight]
        
        self.with_bias = bias
        if bias :
            self.bias = Parameter()
            self.bias.data = torch.rand(out_dim)
            self.bias.data = 2*std*self.bias.data - std
            self.bias.name = 'bias'
            self.parameters +=[self.bias]
            
        self.x = None
              
    def forward(self, x):
        self.x = x
        return self.weight.data.mv(x) + self.bias.data
        
    def backward(self, prev_grad):
        
        prev_grad = prev_grad.view(-1, 1)
        if self.x is None:
            raise CallForwardFirst
        
        if self.weight.grad is None:
            self.weight.grad = torch.zeros_like(self.weight.data)
        
        self.weight.grad += prev_grad.view(-1, 1)*self.x.view(1, -1)
        
        if self.with_bias:
            if self.bias.grad is None:
                self.bias.grad = torch.zeros_like(self.bias.data)
            self.bias.grad += prev_grad.view(-1)
        
        next_grad = prev_grad.view(1, -1)@self.weight.data
        next_grad = next_grad.view(-1, 1)
        return next_grad
    
    def get_parameters(self):
        return self.parameters
    

In [688]:
class Tanh(Module):
    def __init__(self):
        self.x = None
    
    def forward (self, x):
        self.x = x
        return torch.tanh(x)
        
    def backward ( self, prev_grad) :
        if self.x is None:
            raise CallForwardFirst
            
        def d(x):
            return 4 * (x.exp() + x.mul(-1).exp()).pow(-2)
        
        return d(self.x)*prev_grad
    

In [689]:
class MSE(Losses):
    def __init__(self):
        self.x = None
    def forward(self, x, t):
        self.x = x
        self.t = t
        return (x - t).pow(2).mean()
    
    def backward(self):
        if self.x == None or self.t == None:
            raise CallForwardFirst
        return 2 * (self.x - self.t)/len(self.x)

In [690]:
class Sequential(object):
    def __init__(self, modules):
        super().__init__()
        self.modules=modules
        self.parameters = []
        for m in self.modules:
            param = m.get_parameters()
            if param:
                self.parameters += param
        
    def forward(self,x):
        for m in self.modules:
            x=m.forward(x)
        return x
    
#    def backward(self,weights,values,dl_dw,target):
#        xn=values[-1]
#        dl_dx=[]
#        dl_dx.append(dloss(x_n,t))
#        for i in range(len(self.modules)).reversed():
#            m=self.modules[i]
#            dl_dx.append(m.backwardoutput(dl_dx[-1],values[i]))#backward is implemented for each module
#        for i in range(len(self.modules)).reversed():
#            m=self.modules[i]
#            dl_dw=m.backwardweights(dl_dw,dl_dx[i+1],values[i])
    
    def backward(self, loss_grad):
        x = loss_grad
        for m in reversed(self.modules):
            x = m.backward(x)
            
    def get_parameters(self):
        return self.parameters

    def set_parameters(self , params):
        print(self.parameters[1].data)
        self.parameters = params
        print('after',self.parameters[1].data)
        #for i in range (len(new_par)):
         #   self.parameters[i] = params[i]

In [691]:
x = torch.randn(9, requires_grad = False)
y = torch.randn(6,requires_grad = False)

#handmade sequential linear + relu 
linear = Linear(9, 6, True)
sigma = Tanh()
loss = MSE()

net = Sequential([
    linear, 
    sigma
])


In [692]:
for param in reversed(net.get_parameters()):
    print(param.name)

bias
weight


In [693]:
output = net.forward(x)
loss.forward(output, y)

net.backward(loss.backward())

In [694]:
#comparing with builtin methods

In [695]:
b_linear = torch.nn.Linear(9, 6, True)
b_linear.weight.data = linear.weight.data
b_linear.bias.data = linear.bias.data
l = torch.nn.MSELoss()(torch.tanh(b_linear(x)), y)
l.backward()

In [696]:
b_linear.weight.grad, linear.weight.grad,  abs(b_linear.weight.grad - linear.weight.grad).max() 

(tensor([[ 4.8438e-04,  1.2142e-05, -8.7161e-04, -5.2760e-05, -3.7245e-04,
          -7.6598e-04,  5.6828e-04, -1.0298e-03, -1.2740e-03],
         [ 6.0204e-03,  1.5091e-04, -1.0833e-02, -6.5576e-04, -4.6292e-03,
          -9.5204e-03,  7.0632e-03, -1.2800e-02, -1.5835e-02],
         [ 1.8317e-03,  4.5914e-05, -3.2960e-03, -1.9951e-04, -1.4084e-03,
          -2.8965e-03,  2.1489e-03, -3.8943e-03, -4.8176e-03],
         [ 3.7853e-01,  9.4884e-03, -6.8114e-01, -4.1231e-02, -2.9106e-01,
          -5.9859e-01,  4.4409e-01, -8.0479e-01, -9.9559e-01],
         [ 2.7043e-01,  6.7788e-03, -4.8662e-01, -2.9456e-02, -2.0794e-01,
          -4.2765e-01,  3.1727e-01, -5.7496e-01, -7.1128e-01],
         [-7.7327e-03, -1.9383e-04,  1.3914e-02,  8.4227e-04,  5.9458e-03,
           1.2228e-02, -9.0721e-03,  1.6441e-02,  2.0338e-02]]),
 tensor([[ 4.8438e-04,  1.2142e-05, -8.7161e-04, -5.2760e-05, -3.7245e-04,
          -7.6598e-04,  5.6828e-04, -1.0298e-03, -1.2740e-03],
         [ 6.0204e-03,  1.5091e-

In [697]:
b_linear.bias.grad, linear.bias.grad,  abs(b_linear.bias.grad - linear.bias.grad).max()

(tensor([ 0.0007,  0.0089,  0.0027,  0.5620,  0.4015, -0.0115]),
 tensor([ 0.0007,  0.0089,  0.0027,  0.5620,  0.4015, -0.0115]),
 tensor(5.9605e-08))

In [698]:
# Declare model 
model = Sequential([
    linear, 
    sigma
])
# Choose loss
loss = MSE()

optimizer = Optimizers(lr = 1,max_iter = 100, parameters = model.get_parameters())

for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model.forward(x)

    # Compute and print loss.
    mse = loss.forward(y_pred, y)
    if t % 100 == 99:
        print(t, '   MSE loss = ' , mse.item())

    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    model.backward(loss.backward())

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    new_par = optimizer.step()
    #print(len(new_par))
    model.set_parameters(new_par)

tensor([ 0.1117,  0.2189, -0.3303,  0.0506,  0.0882, -0.1296])
after tensor([ 0.1117,  0.2189, -0.3303,  0.0506,  0.0882, -0.1296])
tensor([ 0.1117,  0.2189, -0.3303,  0.0506,  0.0882, -0.1296])
after tensor([ 0.1117,  0.2189, -0.3303,  0.0506,  0.0882, -0.1296])
tensor([ 0.1117,  0.2189, -0.3303,  0.0506,  0.0882, -0.1296])
after tensor([ 0.1117,  0.2189, -0.3303,  0.0506,  0.0882, -0.1296])
tensor([ 0.1117,  0.2189, -0.3303,  0.0506,  0.0882, -0.1296])
after tensor([ 0.1117,  0.2189, -0.3303,  0.0506,  0.0882, -0.1296])
tensor([ 0.1117,  0.2189, -0.3303,  0.0506,  0.0882, -0.1296])
after tensor([ 0.1117,  0.2189, -0.3303,  0.0506,  0.0882, -0.1296])
tensor([ 0.1117,  0.2189, -0.3303,  0.0506,  0.0882, -0.1296])
after tensor([ 0.1117,  0.2189, -0.3303,  0.0506,  0.0882, -0.1296])
tensor([ 0.1117,  0.2189, -0.3303,  0.0506,  0.0882, -0.1296])
after tensor([ 0.1117,  0.2189, -0.3303,  0.0506,  0.0882, -0.1296])
tensor([ 0.1117,  0.2189, -0.3303,  0.0506,  0.0882, -0.1296])
after tensor(