In [51]:
import torch
import math
#torch.set_grad_enabled(False)

In [52]:
class Module(object):
    def forward (self, *input):
        raise NotImplementedError
        
    def backward ( self , * gradwrtoutput ) :
        raise NotImplementedError
    
    def get_parameters( self ) :
        return []   

In [53]:
class Losses(object):        
    def forward():
        return NotImplementedError
    def backward():
        NotImplementedError

In [54]:
class Optimizers(object):
    # this is a SGD optimizer
    def __init__(self,lr,max_iter, parameters) :  # should we add a "tolerance_grad" argument ? 
        super().__init__()
        self.eta = lr
        self.maxStep = max_iter # maybe this shouldn't be put inside the module
        self.param = parameters
        self.number_step = 0

    def zero_grad(self):
        for parameter in self.param : 
            parameter.grad = 0

    def step(self): #batch de datapoint  --> confused : how can we do it stochastic ? ou alors on l'appelle step(batch)
        # right now, eta is considered constant 
        #print(self.param[1].data)
        #print('step')
        if self.number_step <=self.maxStep:
            for parameter in self.param :
                #print(parameter)
                #print(parameter.data[1])
                parameter.data = parameter.data - self.eta * parameter.grad
                #print(parameter.data[1])
            self.number_step = self.number_step + 1
            #print('after update',self.param[1].data)
        return self.param

In [55]:
class Parameter():
    def __init__(self):
        self.name = ''
        self.data = None
        self.grad = None

In [56]:
class Linear(Module):
    
    def __init__(self, input_dim, out_dim, bias = True):
        super().__init__()
        std = 1/math.sqrt(input_dim)
        self.weight = Parameter()
        self.parameters = []
        
        self.weight.data = torch.rand(out_dim, input_dim)
        self.weight.data = 2*std*self.weight.data - std
        self.weight.name = 'weight'
        self.parameters += [self.weight]
        
        self.with_bias = bias
        if bias :
            self.bias = Parameter()
            self.bias.data = torch.rand(out_dim)
            self.bias.data = 2*std*self.bias.data - std
            self.bias.name = 'bias'
            self.parameters +=[self.bias]
            
        self.x = None
              
    def forward(self, x):
        self.x = x
        return self.weight.data.mv(x) + self.bias.data
        
    def backward(self, prev_grad):
        
        prev_grad = prev_grad.view(-1, 1)
        if self.x is None:
            raise CallForwardFirst
        
        if self.weight.grad is None:
            self.weight.grad = torch.zeros_like(self.weight.data)
        
        self.weight.grad += prev_grad.view(-1, 1)*self.x.view(1, -1)
        
        if self.with_bias:
            if self.bias.grad is None:
                self.bias.grad = torch.zeros_like(self.bias.data)
            self.bias.grad += prev_grad.view(-1)
        
        next_grad = prev_grad.view(1, -1)@self.weight.data
        next_grad = next_grad.view(-1, 1)
        return next_grad
    
    def get_parameters(self):
        return self.parameters
    

In [57]:
class Tanh(Module):
    def __init__(self):
        self.x = None
    
    def forward (self, x):
        self.x = x
        return torch.tanh(x)
        
    def backward ( self, prev_grad) :
        if self.x is None:
            raise CallForwardFirst
            
        def d(x):
            return 4 * (x.exp() + x.mul(-1).exp()).pow(-2)
        
        return d(self.x)*prev_grad
    

In [58]:
class MSE(Losses):
    def __init__(self):
        self.x = None
    def forward(self, x, t):
        self.x = x
        self.t = t
        return (x - t).pow(2).mean()
    
    def backward(self):
        if self.x == None or self.t == None:
            raise CallForwardFirst
        return 2 * (self.x - self.t)/len(self.x)

In [59]:
class Sequential(object):
    def __init__(self, modules):
        super().__init__()
        self.modules=modules
        self.parameters = []
        for m in self.modules:
            param = m.get_parameters()
            if param:
                self.parameters += param
        
    def forward(self,x):
        for m in self.modules:
            x=m.forward(x)
        return x
    
#    def backward(self,weights,values,dl_dw,target):
#        xn=values[-1]
#        dl_dx=[]
#        dl_dx.append(dloss(x_n,t))
#        for i in range(len(self.modules)).reversed():
#            m=self.modules[i]
#            dl_dx.append(m.backwardoutput(dl_dx[-1],values[i]))#backward is implemented for each module
#        for i in range(len(self.modules)).reversed():
#            m=self.modules[i]
#            dl_dw=m.backwardweights(dl_dw,dl_dx[i+1],values[i])
    
    def backward(self, loss_grad):
        x = loss_grad
        for m in reversed(self.modules):
            x = m.backward(x)
            
    def get_parameters(self):
        return self.parameters

    def set_parameters(self , params):
        #print(self.parameters[1].data)
        self.parameters = params
        #print('after',self.parameters[1].data)
        #for i in range (len(new_par)):
         #   self.parameters[i] = params[i]

In [73]:
x = torch.randn(9, requires_grad = False)
y = torch.randn(6,requires_grad = False)

#handmade sequential linear + relu 
linear = Linear(9, 6, True)
sigma = Tanh()
loss = MSE()

net = Sequential([
    linear, 
    sigma
])


In [74]:
for param in reversed(net.get_parameters()):
    print(param.name)

bias
weight


In [75]:
output = net.forward(x)
loss.forward(output, y)

net.backward(loss.backward())

In [76]:
#comparing with builtin methods

In [77]:
b_linear = torch.nn.Linear(9, 6, True)
b_linear.weight.data = linear.weight.data
b_linear.bias.data = linear.bias.data
l = torch.nn.MSELoss()(torch.tanh(b_linear(x)), y)
l.backward()

In [78]:
b_linear.weight.grad, linear.weight.grad,  abs(b_linear.weight.grad - linear.weight.grad).max() 

(tensor([[-0.3711, -0.8117, -0.3824,  1.8273,  0.5904,  0.8132, -0.2082, -0.7149,
          -0.8794],
         [-0.1280, -0.2799, -0.1319,  0.6302,  0.2036,  0.2804, -0.0718, -0.2466,
          -0.3033],
         [-0.2584, -0.5651, -0.2662,  1.2721,  0.4110,  0.5661, -0.1449, -0.4977,
          -0.6122],
         [ 0.1353,  0.2960,  0.1394, -0.6663, -0.2153, -0.2965,  0.0759,  0.2607,
           0.3207],
         [ 0.0082,  0.0178,  0.0084, -0.0402, -0.0130, -0.0179,  0.0046,  0.0157,
           0.0193],
         [-0.0287, -0.0629, -0.0296,  0.1415,  0.0457,  0.0630, -0.0161, -0.0554,
          -0.0681]]),
 tensor([[-0.3711, -0.8117, -0.3824,  1.8273,  0.5904,  0.8132, -0.2082, -0.7149,
          -0.8794],
         [-0.1280, -0.2799, -0.1319,  0.6302,  0.2036,  0.2804, -0.0718, -0.2466,
          -0.3033],
         [-0.2584, -0.5651, -0.2662,  1.2721,  0.4110,  0.5661, -0.1449, -0.4977,
          -0.6122],
         [ 0.1353,  0.2960,  0.1394, -0.6663, -0.2153, -0.2965,  0.0759,  0.2607

In [79]:
b_linear.bias.grad, linear.bias.grad,  abs(b_linear.bias.grad - linear.bias.grad).max()

(tensor([ 0.8492,  0.2929,  0.5911, -0.3096, -0.0187,  0.0658]),
 tensor([ 0.8492,  0.2929,  0.5911, -0.3096, -0.0187,  0.0658]),
 tensor(1.1921e-07))

In [81]:
# Declare model 
model = Sequential([
    linear, 
    sigma
])
# Choose loss
loss = MSE()

optimizer = Optimizers(lr = 1e-4,max_iter = 100, parameters = model.get_parameters())

for t in range(10**3):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model.forward(x)

    # Compute and print loss.
    mse = loss.forward(y_pred, y)
    if t%99==0:
        print(t, '   MSE loss = ' , mse.item())

    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    model.backward(loss.backward())

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    new_par = optimizer.step()
    #print(len(new_par))
    model.set_parameters(new_par)

0    MSE loss =  0.5442897081375122
99    MSE loss =  0.5442832112312317
198    MSE loss =  0.5442830920219421
297    MSE loss =  0.5442830920219421
396    MSE loss =  0.5442830920219421
495    MSE loss =  0.5442830920219421
594    MSE loss =  0.5442830920219421
693    MSE loss =  0.5442830920219421
792    MSE loss =  0.5442830920219421
891    MSE loss =  0.5442830920219421
990    MSE loss =  0.5442830920219421
1089    MSE loss =  0.5442830920219421
1188    MSE loss =  0.5442830920219421
1287    MSE loss =  0.5442830920219421
1386    MSE loss =  0.5442830920219421
1485    MSE loss =  0.5442830920219421
1584    MSE loss =  0.5442830920219421
1683    MSE loss =  0.5442830920219421
1782    MSE loss =  0.5442830920219421
1881    MSE loss =  0.5442830920219421
1980    MSE loss =  0.5442830920219421
2079    MSE loss =  0.5442830920219421
2178    MSE loss =  0.5442830920219421
2277    MSE loss =  0.5442830920219421
2376    MSE loss =  0.5442830920219421
2475    MSE loss =  0.5442830920219421
