In [1]:
import torch
import math
#torch.set_grad_enabled(False)

In [2]:
class Module(object):
    """Abstract base class for every layer of the hand-written framework.

    Subclasses override ``forward`` and ``backward``; layers that own
    trainable state also override ``get_parameters``.
    """

    def forward(self, *input):
        """Compute the layer output; must be overridden by subclasses."""
        raise NotImplementedError()

    def backward(self, *gradwrtoutput):
        """Propagate the gradient w.r.t. the output; must be overridden."""
        raise NotImplementedError()

    def get_parameters(self):
        """Return the trainable parameters: a fresh empty list by default."""
        return list()

In [3]:
class Losses(object):
    """Abstract base class for loss functions.

    Concrete losses override ``forward`` (compute the loss value from its
    inputs) and ``backward`` (return the gradient of that loss).
    """

    def forward(self, *inputs):
        """Compute the loss value; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, on the base class.
        """
        raise NotImplementedError

    def backward(self, *args):
        """Return the gradient of the loss; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, on the base class.
        """
        raise NotImplementedError

In [4]:
class Optimizers(object):
    """Abstract base class for optimizers."""

    def step(self):
        """Apply one parameter update; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, on the base class.
        """
        raise NotImplementedError

In [5]:
class Parameter():
    """Named container pairing a value (``data``) with its gradient (``grad``).

    All three fields start empty; the owning layer fills them in.
    """

    def __init__(self):
        # name: label such as 'weight' or 'bias'
        # data: the parameter value, None until assigned
        # grad: the accumulated gradient, None until first written
        self.name, self.data, self.grad = '', None, None

In [6]:
class Linear(Module):
    """Fully connected layer computing ``W @ x (+ b)`` for one 1-D input.

    ``weight`` has shape ``(out_dim, input_dim)`` and ``bias`` (when enabled)
    has shape ``(out_dim,)``. Both are drawn uniformly from
    ``[-1/sqrt(input_dim), 1/sqrt(input_dim))``.

    Args:
        input_dim: number of input features.
        out_dim: number of output features.
        bias: when False the layer has no bias term and ``self.bias`` is None.
    """

    def __init__(self, input_dim, out_dim, bias = True):
        super().__init__()
        std = 1/math.sqrt(input_dim)
        self.weight = Parameter()
        self.parameters = []

        # torch.rand samples [0, 1); rescale to [-std, std).
        self.weight.data = torch.rand(out_dim, input_dim)
        self.weight.data = 2*std*self.weight.data - std
        self.weight.name = 'weight'
        self.parameters += [self.weight]

        self.with_bias = bias
        # Always define the attribute so code can test `layer.bias is None`.
        self.bias = None
        if bias :
            self.bias = Parameter()
            self.bias.data = torch.rand(out_dim)
            self.bias.data = 2*std*self.bias.data - std
            self.bias.name = 'bias'
            self.parameters +=[self.bias]

        # Input cached by forward() for use in backward().
        self.x = None

    def forward(self, x):
        """Return ``W @ x`` plus the bias when enabled, caching ``x``.

        Args:
            x: 1-D tensor with ``input_dim`` elements (required by ``mv``).
        """
        self.x = x
        out = self.weight.data.mv(x)
        # Only touch the bias when the layer was built with one.
        if self.with_bias:
            out = out + self.bias.data
        return out

    def backward(self, prev_grad):
        """Accumulate parameter gradients and return the input gradient.

        Args:
            prev_grad: gradient of the loss w.r.t. this layer's output, with
                ``out_dim`` elements in any shape.

        Returns:
            Gradient of the loss w.r.t. the input, as a column of shape
            ``(input_dim, 1)``.

        Raises:
            RuntimeError: if ``forward`` has not been called yet.
        """
        if self.x is None:
            raise RuntimeError("Linear.backward() called before forward()")

        prev_grad = prev_grad.view(-1, 1)

        # Gradients are accumulated, not overwritten, across backward calls.
        if self.weight.grad is None:
            self.weight.grad = torch.zeros_like(self.weight.data)

        # Outer product: dL/dW[i, j] = dL/dout[i] * x[j].
        self.weight.grad += prev_grad*self.x.view(1, -1)

        if self.with_bias:
            if self.bias.grad is None:
                self.bias.grad = torch.zeros_like(self.bias.data)
            self.bias.grad += prev_grad.view(-1)

        # dL/dx = W^T @ dL/dout, computed as a row vector times W.
        next_grad = prev_grad.view(1, -1)@self.weight.data
        next_grad = next_grad.view(-1, 1)
        return next_grad

    def get_parameters(self):
        """Return the list of trainable parameters (weight, then bias if any)."""
        return self.parameters
    

In [7]:
class Tanh(Module):
    """Element-wise hyperbolic tangent activation.

    ``forward`` caches its input so ``backward`` can evaluate the derivative
    ``tanh'(x) = 4 / (e^x + e^-x)^2`` at the same point.
    """

    def __init__(self):
        super().__init__()
        # Input cached by forward() for use in backward().
        self.x = None

    def forward(self, x):
        """Return ``tanh(x)`` and remember ``x`` for the backward pass."""
        self.x = x
        return torch.tanh(x)

    def backward(self, prev_grad):
        """Return the gradient of the loss w.r.t. the cached input.

        Args:
            prev_grad: gradient of the loss w.r.t. this layer's output.

        Raises:
            RuntimeError: if ``forward`` has not been called yet.
        """
        if self.x is None:
            raise RuntimeError("Tanh.backward() called before forward()")

        # A gradient with the same number of elements but a different layout
        # (e.g. an (n, 1) column against a cached (n,) input) would broadcast
        # into an (n, n) matrix; align it with the input shape first.
        if (torch.is_tensor(prev_grad)
                and prev_grad.shape != self.x.shape
                and prev_grad.numel() == self.x.numel()):
            prev_grad = prev_grad.reshape(self.x.shape)

        derivative = 4 * (self.x.exp() + self.x.mul(-1).exp()).pow(-2)
        return derivative*prev_grad
    

In [8]:
class MSE(Losses):
    """Mean squared error: ``mean((x - t)^2)`` over all elements."""

    def __init__(self):
        # Prediction and target cached by forward() for use in backward().
        self.x = None
        self.t = None

    def forward(self, x, t):
        """Return the mean squared error between prediction ``x`` and target ``t``."""
        self.x = x
        self.t = t
        return (x - t).pow(2).mean()

    def backward(self):
        """Return the gradient of the loss w.r.t. the cached prediction.

        Raises:
            RuntimeError: if ``forward`` has not been called yet.
        """
        # Identity check: `== None` on a tensor is a tensor comparison, not a
        # test for "missing".
        if self.x is None or self.t is None:
            raise RuntimeError("MSE.backward() called before forward()")
        diff = self.x - self.t
        # forward() averages over every element, so divide by the element
        # count rather than by the size of the first dimension.
        return 2 * diff/diff.numel()

In [9]:
class Sequential(object):
    """Container that applies a list of modules one after another.

    Args:
        modules: ordered sequence of objects exposing ``forward``,
            ``backward`` and ``get_parameters``.
    """

    def __init__(self, modules):
        super().__init__()
        self.modules=modules
        # Flat list of every child parameter, in module order.
        self.parameters = []
        for m in self.modules:
            param = m.get_parameters()
            if param:
                self.parameters += param

    def forward(self,x):
        """Feed ``x`` through every module in order and return the result."""
        for m in self.modules:
            x=m.forward(x)
        return x

    def backward(self, loss_grad):
        """Backpropagate ``loss_grad`` through the modules in reverse order.

        Returns:
            The gradient w.r.t. the container's input, so a ``Sequential``
            can itself be nested inside another container.
        """
        grad = loss_grad
        for m in reversed(self.modules):
            grad = m.backward(grad)
        return grad

    def get_parameters(self):
        """Return the collected parameters of all child modules."""
        return self.parameters

In [10]:
# Random input (9 values) and target (6 values) for the gradient check.
x = torch.randn(9, requires_grad = False)
y = torch.randn(6,requires_grad = False)

# Hand-written network: a 9 -> 6 linear layer followed by tanh, with an MSE loss.
linear = Linear(9, 6, True)
sigma = Tanh()
loss = MSE()

net = Sequential([
    linear, 
    sigma
])


In [11]:
# Print the names of the network's parameters, last-registered first.
for param in reversed(net.get_parameters()):
    print(param.name)

bias
weight


In [12]:
# Forward pass: run x through the network, then let the loss cache (output, y).
output = net.forward(x)
loss.forward(output, y)

# Backward pass: feed dLoss/dOutput into the network to fill each parameter's .grad.
# Linear.backward accumulates with +=, so re-running this cell adds to the existing gradients.
net.backward(loss.backward())

In [13]:
# Compare the hand-written gradients with those computed by PyTorch's built-in autograd.

In [14]:
# Reference layer: a torch.nn.Linear given the hand-written layer's weight and bias values.
b_linear = torch.nn.Linear(9, 6, True)
b_linear.weight.data = linear.weight.data
b_linear.bias.data = linear.bias.data
# Same computation (linear -> tanh -> MSE), differentiated by autograd.
l = torch.nn.MSELoss()(torch.tanh(b_linear(x)), y)
l.backward()

In [15]:
# Weight gradients: autograd, hand-written, and their largest absolute difference.
b_linear.weight.grad, linear.weight.grad,  abs(b_linear.weight.grad - linear.weight.grad).max() 

(tensor([[ 0.1662,  0.0132, -0.2021,  0.1636,  0.0089, -0.1216,  0.0104,  0.0379,
          -0.0240],
         [ 1.2942,  0.1027, -1.5732,  1.2739,  0.0691, -0.9469,  0.0809,  0.2954,
          -0.1872],
         [ 0.4139,  0.0328, -0.5032,  0.4074,  0.0221, -0.3029,  0.0259,  0.0945,
          -0.0599],
         [ 0.1593,  0.0126, -0.1937,  0.1568,  0.0085, -0.1166,  0.0100,  0.0364,
          -0.0230],
         [ 0.4564,  0.0362, -0.5548,  0.4493,  0.0244, -0.3340,  0.0285,  0.1042,
          -0.0660],
         [ 0.3175,  0.0252, -0.3859,  0.3125,  0.0170, -0.2323,  0.0199,  0.0725,
          -0.0459]]),
 tensor([[ 0.1662,  0.0132, -0.2021,  0.1636,  0.0089, -0.1216,  0.0104,  0.0379,
          -0.0240],
         [ 1.2942,  0.1027, -1.5732,  1.2739,  0.0691, -0.9469,  0.0809,  0.2954,
          -0.1872],
         [ 0.4139,  0.0328, -0.5032,  0.4074,  0.0221, -0.3029,  0.0259,  0.0945,
          -0.0599],
         [ 0.1593,  0.0126, -0.1937,  0.1568,  0.0085, -0.1166,  0.0100,  0.0364

In [16]:
# Bias gradients: autograd, hand-written, and their largest absolute difference.
b_linear.bias.grad, linear.bias.grad,  abs(b_linear.bias.grad - linear.bias.grad).max()

(tensor([0.0721, 0.5614, 0.1796, 0.0691, 0.1980, 0.1377]),
 tensor([0.0721, 0.5614, 0.1796, 0.0691, 0.1980, 0.1377]),
 tensor(5.9605e-08))