In [1]:
import torch
import math
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x206fe4a5f08>

In [6]:
class Parameter():
    def __init__(self):
        self.name = ''
        self.data = None
        self.grad = None

In [2]:
class Module(object):
    def forward (self, *input):
        raise NotImplementedError
        
    def backward ( self , * gradwrtoutput ) :
        raise NotImplementedError
    
    def get_parameters( self ) :
        return []   

In [3]:
class Losses(object):        
    def forward():
        return NotImplementedError
    def backward():
        NotImplementedError

In [4]:
class Optimizer(object):
    def zero_grad(self):
        for parameter in self.param : 
            parameter.grad = 0
            
    def step(self):
        raise NotImplementedError

In [5]:
class SGD(Optimizer):
    # this is a SGD optimizer
    def __init__(self,lr,max_iter, parameters) :  # should we add a "tolerance_grad" argument ? F: What do you mean with "tolerance_grad"
        super().__init__()
        self.eta = lr
        self.maxStep = max_iter # maybe this shouldn't be put inside the module F: Agree
        self.param = parameters
        self.number_step = 0

    def step(self): #batch de datapoint  --> confused : how can we do it stochastic ? ou alors on l'appelle step(batch)
        # right now, eta is considered constant 
        #print(self.param[1].data)
        #print('step')
        if self.number_step <=self.maxStep:
            for parameter in self.param :
                #print(parameter)
                #print(parameter.data[1])
                parameter.data = parameter.data - self.eta * parameter.grad
                #print(parameter.data[1])
            self.number_step = self.number_step + 1
            #print('after update',self.param[1].data)
        return self.param

In [7]:
class Linear(Module):
    
    def __init__(self, input_dim, out_dim, bias = True):
        super().__init__()
        std = 1/math.sqrt(input_dim)
        self.weight = Parameter()
        self.parameters = []
        
        self.weight.data = torch.rand(out_dim, input_dim)
        self.weight.data = 2*std*self.weight.data - std
        self.weight.name = 'weight'
        self.parameters += [self.weight]
        
        self.with_bias = bias
        if bias :
            self.bias = Parameter()
            self.bias.data = torch.rand(out_dim)
            self.bias.data = 2*std*self.bias.data - std
            self.bias.name = 'bias'
            self.parameters +=[self.bias]
            
        self.x = None
              
    def forward(self, x):
        self.x = x
        return self.weight.data.mv(x) + self.bias.data
        
    def backward(self, prev_grad):
        
        prev_grad = prev_grad.view(-1, 1)
        if self.x is None:
            raise CallForwardFirst
        
        if self.weight.grad is None:
            self.weight.grad = torch.zeros_like(self.weight.data)
        
        self.weight.grad += prev_grad.view(-1, 1)*self.x.view(1, -1)
        
        if self.with_bias:
            if self.bias.grad is None:
                self.bias.grad = torch.zeros_like(self.bias.data)
            self.bias.grad += prev_grad.view(-1)
        
        next_grad = prev_grad.view(1, -1)@self.weight.data
        next_grad = next_grad.view(-1, 1)
        return next_grad
    
    def get_parameters(self):
        return self.parameters
    

In [8]:
class Tanh(Module):
    def __init__(self):
        self.x = None
    
    def forward (self, x):
        self.x = x
        return torch.tanh(x)
        
    def backward ( self, prev_grad) :
        if self.x is None:
            raise CallForwardFirst
            
        def d(x):
            return 4 * (x.exp() + x.mul(-1).exp()).pow(-2)
        
        return d(self.x)*prev_grad
    

In [9]:
class MSE(Losses):
    def __init__(self):
        self.x = None
    def forward(self, x, t):
        self.x = x
        self.t = t
        return (x - t).pow(2).mean()
    
    def backward(self):
        if self.x == None or self.t == None:
            raise CallForwardFirst
        return 2 * (self.x - self.t)/len(self.x)

In [10]:
class Sequential(object):
    def __init__(self, modules):
        super().__init__()
        self.modules=modules
        self.parameters = []
        for m in self.modules:
            param = m.get_parameters()
            if param:
                self.parameters += param
        
    def forward(self,x):
        for m in self.modules:
            x=m.forward(x)
        return x
    
    def backward(self, loss_grad):
        x = loss_grad
        for m in reversed(self.modules):
            x = m.backward(x)
            
    def get_parameters(self):
        return self.parameters

    def set_parameters(self , params):
        #print(self.parameters[1].data)
        self.parameters = params
        #print('after',self.parameters[1].data)
        #for i in range (len(new_par)):
         #   self.parameters[i] = params[i]

In [11]:
start_norm = []
end_norm = []
for i in range(20):
    
    x = torch.randn(9, requires_grad = False)
    y = torch.randn(6,requires_grad = False)

    linear = Linear(9, 6, True)
    sigma = Tanh()
    loss = MSE()

    model = Sequential([
        linear, 
        sigma
    ])

    loss = MSE()
    optimizer = SGD(lr = 0.1,max_iter = 100, parameters = model.get_parameters())
    (model.forward(x) - y).norm()
    
    start_norm += [(model.forward(x) - y).norm()]
    for t in range(10**3):
        # Forward pass: compute predicted y by passing x to the model.
        y_pred = model.forward(x)

        # Compute and print loss.
        mse = loss.forward(y_pred, y)
        #if t%99==0:
        #    print(t, '   MSE loss = ' , mse.item())

        #optimizer.zero_grad()

        # Backward pass: compute gradient of the loss with respect to model
        # parameters
        model.backward(loss.backward())

        # Calling the step function on an Optimizer makes an update to its
        # parameters
        new_par = optimizer.step()
        #print(len(new_par))
        model.set_parameters(new_par)
        
    end_norm += [(model.forward(x) - y).norm()]

In [12]:
#how does it compare with torch builtinfunctions ? 
start_norm_b = []
end_norm_b = []
for i in range(20):
    x = torch.randn(9, requires_grad = False)
    y = torch.randn(6,requires_grad = False)
    torch.set_grad_enabled(True)

    builtin_model = torch.nn.Sequential(
              torch.nn.Linear(9,6),
              torch.nn.Tanh()
            )

    loss = torch.nn.MSELoss()
    optim = torch.optim.SGD(builtin_model.parameters(), momentum = 0., lr=0.1) #, momentum=None)
    
    
    start_norm_b += [(builtin_model(x) - y).norm().item()]
    for i in range(10**3):
        optim.zero_grad()
        l = loss(builtin_model(x), y)
        l.backward()
        optim.step()
    end_norm_b+= [(builtin_model(x) - y).norm().item()]

In [13]:
import numpy as np
start_norm = np.array(start_norm)
start_norm_b = np.array(start_norm_b)
end_norm = np.array(end_norm)
end_norm_b = np.array(end_norm_b)

In [14]:
start_norm.mean(), start_norm_b.mean(), end_norm.mean(), end_norm_b.mean()

(2.5046544, 2.6501628160476685, 1.0949566, 0.7341336557516456)

In [15]:
start_norm.std(), start_norm_b.std(), end_norm.std(), end_norm_b.std()

(0.73611104, 0.9338806749622279, 0.36315927, 0.6242520126243107)

In [16]:
#comment: the std of the builtin function seems to be larger 

In [17]:
end_norm

array([0.85009634, 2.163151  , 1.4973958 , 1.1858326 , 1.148578  ,
       1.1113453 , 0.70269215, 0.67118526, 0.9788051 , 1.5970316 ,
       1.1625462 , 1.2288423 , 1.2199734 , 0.8311082 , 1.3185073 ,
       0.63410884, 0.7690381 , 0.77949744, 1.2229536 , 0.8264441 ],
      dtype=float32)

In [18]:
end_norm_b

array([1.84584141e-01, 2.38434839e+00, 2.49344112e-07, 1.46917319e+00,
       5.40413260e-02, 4.61876392e-01, 9.27244961e-01, 2.35553205e-01,
       1.66754472e+00, 8.56634021e-01, 3.02995950e-01, 1.07201123e+00,
       1.48670539e-01, 9.99723002e-02, 9.83681560e-01, 6.59794569e-01,
       1.07395566e+00, 8.52353990e-01, 2.65626311e-02, 1.22167408e+00])