In [1]:
import torch
import math
#torch.set_grad_enabled(False)

In [2]:
class Module:
    """Abstract base class for the building blocks of a network.

    Concrete layers are expected to override ``forward`` and ``backward``.
    ``param`` may be inherited unchanged by layers that hold nothing
    trainable, since the default reports no parameters.
    """

    def forward(self, *input):
        """Compute the output of the module; subclasses must override this."""
        raise NotImplementedError()

    def backward(self, *gradwrtoutput):
        """Propagate the gradient w.r.t. the output; subclasses must override this."""
        raise NotImplementedError()

    def param(self):
        """Return the module's parameters; the base class has none (fresh empty list)."""
        return list()

In [3]:
class Losses(object):
    """Abstract base class for loss functions.

    Both methods are placeholders: calling either one on an instance raises
    ``NotImplementedError`` until a subclass provides an implementation.
    """

    def function(self, *input):
        """Evaluate the loss.

        Raises:
            NotImplementedError: always, in the base class.
        """
        # Raise (not return) the exception so a missing override fails loudly
        # instead of handing the exception class back to the caller.
        raise NotImplementedError

    def derivative(self, *input):
        """Evaluate the derivative of the loss.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError

In [4]:
class Optimizers(object):
    """Abstract base class for optimizers.

    ``step`` is a placeholder that a concrete optimizer must override.
    """

    def step(self):
        """Perform one optimisation step.

        Raises:
            NotImplementedError: always, in the base class.
        """
        # Raise (not return) the exception so a missing override is an error
        # rather than a silently ignored return value.
        raise NotImplementedError

In [5]:
class Parameter:
    """Plain container pairing a value (``data``) with its gradient (``grad``).

    Both attributes start as ``None`` and are filled in by whoever owns the
    parameter.
    """

    def __init__(self):
        # Nothing is allocated here; the owner assigns both slots later.
        self.data, self.grad = None, None

In [6]:
class Linear(Module):
    """Fully connected layer computing ``weight @ x (+ bias)`` for one 1-D sample.

    Attributes:
        weight: ``Parameter`` whose ``data`` has shape ``(out_dim, input_dim)``.
        bias: ``Parameter`` whose ``data`` has shape ``(out_dim,)``, or ``None``
            when the layer was built with ``bias=False``.
        with_bias: whether the bias term is used.
        x: input saved by the most recent ``forward`` call (``None`` before it).
    """

    def __init__(self, input_dim, out_dim, bias = True):
        """Create the layer and initialise its parameters.

        Values are drawn uniformly from ``[-std, std)`` with
        ``std = 1 / sqrt(input_dim)``.

        Args:
            input_dim: size of the input vector.
            out_dim: size of the output vector.
            bias: if true, add a learnable bias to the output.
        """
        super().__init__()
        std = 1/math.sqrt(input_dim)

        self.weight = Parameter()
        # torch.rand samples [0, 1); rescale it to [-std, std).
        self.weight.data = 2*std*torch.rand(out_dim, input_dim) - std

        self.with_bias = bias
        if bias:
            # Keep the bias wrapped in a Parameter (data + grad), like the weight.
            self.bias = Parameter()
            self.bias.data = 2*std*torch.rand(out_dim) - std
        else:
            self.bias = None

        self.x = None

    def forward(self, x):
        """Return ``weight @ x`` (plus the bias if enabled) and remember ``x``.

        Args:
            x: 1-D tensor of length ``input_dim`` (``mv`` requires a vector).
        """
        self.x = x
        out = self.weight.data.mv(x)
        if self.with_bias:
            out = out + self.bias.data
        return out

    def backward(self, prev_grad):
        """Accumulate parameter gradients and return the gradient w.r.t. the input.

        Args:
            prev_grad: gradient of the loss w.r.t. this layer's output; it is
                flattened, so any shape with ``out_dim`` elements works.

        Returns:
            Gradient w.r.t. the input as a column tensor of shape ``(input_dim, 1)``.

        Raises:
            RuntimeError: if ``forward`` has not been called yet.
        """
        if self.x is None:
            raise RuntimeError("Linear.backward() called before forward()")

        grad_col = prev_grad.view(-1, 1)

        # Gradients are accumulated (+=) across calls, so allocate them lazily.
        if self.weight.grad is None:
            self.weight.grad = torch.zeros_like(self.weight.data)
        # Outer product: (out_dim, 1) * (1, input_dim) -> (out_dim, input_dim).
        self.weight.grad += grad_col*self.x.view(1, -1)

        if self.with_bias:
            if self.bias.grad is None:
                self.bias.grad = torch.zeros_like(self.bias.data)
            self.bias.grad += grad_col.view(-1)

        next_grad = grad_col.view(1, -1)@self.weight.data
        return next_grad.view(-1, 1)

In [7]:
class Tanh(Module):
    """Element-wise hyperbolic tangent activation.

    Attributes:
        x: input saved by the most recent ``forward`` call (``None`` before it).
    """

    def __init__(self):
        super().__init__()
        self.x = None

    def forward(self, x):
        """Return ``tanh(x)`` and remember ``x`` for the backward pass."""
        self.x = x
        return torch.tanh(x)

    def backward(self, prev_grad):
        """Return the gradient w.r.t. the input given the gradient w.r.t. the output.

        Args:
            prev_grad: gradient of the loss w.r.t. this layer's output; it is
                multiplied element-wise with the local derivative.

        Raises:
            RuntimeError: if ``forward`` has not been called yet.
        """
        if self.x is None:
            raise RuntimeError("Tanh.backward() called before forward()")

        # d/dx tanh(x) = 1 / cosh(x)^2 = 4 / (e^x + e^-x)^2
        saved = self.x
        local_grad = 4 * (saved.exp() + saved.mul(-1).exp()).pow(-2)
        return local_grad*prev_grad
    

In [8]:
class MSE(Losses):
    """Mean squared error loss: the mean of ``(x - t)**2`` over all elements.

    Attributes:
        x: prediction saved by the most recent ``forward`` call.
        t: target saved by the most recent ``forward`` call.
    """

    def __init__(self):
        self.x = None
        self.t = None

    def forward(self, x, t):
        """Return the mean squared error between ``x`` and ``t`` and remember both."""
        self.x = x
        self.t = t
        return (x - t).pow(2).mean()

    def backward(self):
        """Return the gradient of the loss w.r.t. the prediction ``x``.

        Raises:
            RuntimeError: if ``forward`` has not been called yet.
        """
        # Identity checks: ``==`` on a tensor is an element-wise comparison,
        # not a test for "unset".
        if self.x is None or self.t is None:
            raise RuntimeError("MSE.backward() called before forward()")
        diff = self.x - self.t
        # forward() averages over every element, so divide by the element
        # count rather than the length of the first dimension.
        return 2 * diff/diff.numel()

In [18]:
# Random input (5 values) and target (6 values) for a single sample.
x = torch.randn(5, requires_grad = False)
y = torch.randn(6, requires_grad = False)

# Build the hand-written layer next to torch's reference layer and give both
# the same weight and bias tensors, so their gradients can be compared.
linear = Linear(5, 6, True)
builtin_linear = torch.nn.Linear(5, 6)
linear.weight.data = builtin_linear.weight.data
linear.bias.data = builtin_linear.bias.data
tanh = Tanh()
loss = MSE()

# Reference gradients: run the same linear -> tanh -> MSE pipeline with torch
# and let autograd fill in builtin_linear.weight.grad / builtin_linear.bias.grad.
builtin_output = torch.tanh(builtin_linear(x))
builtin_loss = torch.nn.MSELoss()(builtin_output, y)
builtin_loss.backward()

In [20]:
# Forward pass through the hand-written modules; each one stores the input it
# needs for its backward pass.
loss.forward(tanh.forward(linear.forward(x)), y)
# Backward pass in reverse order. The displayed value is what Linear.backward
# returns: the gradient w.r.t. x as a column tensor.
linear.backward(tanh.backward(loss.backward()))

tensor([[-0.1349],
        [-0.1779],
        [ 0.1092],
        [-0.1505],
        [-0.0194]])

In [27]:
# Show the hand-computed weight gradient, torch's autograd gradient, and an
# element-wise check that they differ by less than 1e-7.
linear.weight.grad, builtin_linear.weight.grad, abs(linear.weight.grad - builtin_linear.weight.grad )<1e-7

(tensor([[ 0.0953, -0.2833,  0.0859,  0.2611,  0.0333],
         [ 0.0919, -0.2732,  0.0828,  0.2517,  0.0321],
         [ 0.0860, -0.2558,  0.0775,  0.2357,  0.0301],
         [ 0.0689, -0.2047,  0.0620,  0.1886,  0.0241],
         [-0.0723,  0.2150, -0.0651, -0.1981, -0.0253],
         [-0.1031,  0.3064, -0.0928, -0.2824, -0.0361]]),
 tensor([[ 0.0953, -0.2833,  0.0859,  0.2611,  0.0333],
         [ 0.0919, -0.2732,  0.0828,  0.2517,  0.0321],
         [ 0.0860, -0.2558,  0.0775,  0.2357,  0.0301],
         [ 0.0689, -0.2047,  0.0620,  0.1886,  0.0241],
         [-0.0723,  0.2150, -0.0651, -0.1981, -0.0253],
         [-0.1031,  0.3064, -0.0928, -0.2824, -0.0361]]),
 tensor([[True, True, True, True, True],
         [True, True, True, True, True],
         [True, True, True, True, True],
         [True, True, True, True, True],
         [True, True, True, True, True],
         [True, True, True, True, True]]))

In [29]:
# Same comparison for the bias gradient: ours, torch's, and the element-wise
# check that they differ by less than 1e-7.
linear.bias.grad, builtin_linear.bias.grad, abs(linear.bias.grad - builtin_linear.bias.grad) < 1e-7

(tensor([ 0.2483,  0.2394,  0.2241,  0.1794, -0.1884, -0.2685]),
 tensor([ 0.2483,  0.2394,  0.2241,  0.1794, -0.1884, -0.2685]),
 tensor([True, True, True, True, True, True]))