In [2]:
import math
import torch

In [None]:
class Module(object):
    """ Base class """
    def __init__(self):
        self._parameters = dict()
        self._children = dict()
        self.training = True
        
    def forward(self, *input):
        raise NotImplementedError
        
    def backward(self, *gradwrtoutput):
        """ backward receives as input a pointer to a tensor or a tuple of tensors containing
        the gradient of the loss (or the function of interest) wrt the module's output, accumulates
        the gradient wrt the parameters, and returns a tensor or a tuple of tensors containing the 
        gradient of the loss wrt the module's input (Application of the chain rule)"""
        raise NotImplementedError
        
    def add_children(self, name, module):
        assert isinstance(module, Module) and is not None, "Not a Module."
        assert name not in self._children, "Module {} already exists".format(name)
        self._children[name] = module
        
    def add_parameter(self, name, param):
        assert isinstance(param, Parameter), "Not a Parameter."
        assert name not in self._parameters, "Parameter {} already exists".format(name)
        self._parameters[name] = param
        
    def param(self):
        return self._parameters

In [44]:
class Parameter(torch.Tensor):
    def __init__(self, tensor=None, grad=None, requires_grad=True):
        self.tensor = tensor
        self.grad = None
        self.requires_grad = requires_grad

In [None]:
class Linear(Module):
    """ Implements a R^C -> R^D fully-connected layer:
        Input: (N x C) tensor
        Ouput: (N x D) tensor """
    def __init__(self, in_features, out_features, bias=True):
        super(Linear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
            
    def init_parameters(self):
        
        
    def forward(self, input):
        self.save_for_backward = input
        output = torch.matmul(input, self.weight.t())
        if bias: output += bias
        return output
              
    def backward(self, grad_output):
        input = self.save_for_bachward
        grad_input = torch.matmul(grad_output, self.weight)
        grad_weight = torch.matmul(grad_output.t(), input)
        self.weight.gradient += grad_weight
        if self.bias: 
            grad_bias = grad_output.sum(0).squeeze(0)
            self.bias.grad += grad_bias
        return grad_input    

For the second project, do we first accumulate the gradient then afterwards calculate the derivate of the loss wrt 
to the input.  Or do it the other way around.
They are usually unrelated computations. Think about the following scenario. You have a batch of inputs x_0 to x_9. 
And a single parameter a. Thus the forward pass for this module is s_i = a*x_i. For the backward pass we get as 
input dl/ds_i for all i and we need to compute dl/da and dl/dx_i . It is quite obvious that 
dl/da = sum x_i * dl/ds_i for all i. And dl/dx_i = dl/ds_i * a. The order in which one computes the two is irrelevant.

In [43]:
from torch import nn
from torch import optim
import torch.autograd as autograd

x = torch.tensor([[1, 2], [2, 1], [3, 4]]).type(torch.FloatTensor).requires_grad_()
y = torch.tensor([1, 0.4, 3])
#x = torch.tensor([[1., 2.]]).requires_grad_()
#y = torch.tensor([1.])

model = nn.Sequential(nn.Linear(2, 1))
print("PRINTING PARAMETERS")
for p in model.parameters():
    print("p = ", p)
y_pred = model(x)

print("PRINTING PREDICTION")
print("y_pred = ", y_pred)

criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

loss = criterion(y_pred, y)
optimizer.zero_grad()

print("PRINTING GRADIENT")
#print("loss.grad = ", autograd.grad(loss, x))
loss.backward()
for p in model.parameters():
    print("p.grad = ", p.grad)

PRINTING PARAMETERS
p =  Parameter containing:
tensor([[-0.6022,  0.5215]], requires_grad=True)
p =  Parameter containing:
tensor([-0.7021], requires_grad=True)
PRINTING PREDICTION
y_pred =  tensor([[-0.2612],
        [-1.3850],
        [-0.4226]], grad_fn=<AddmmBackward>)
PRINTING GRADIENT
p.grad =  tensor([[-8.7326, -9.2429]])
p.grad =  tensor([-4.3125])
