# Getting started with PyTorch
Based on the examples in [jcjohnson/pytorch-examples on GitHub](https://github.com/jcjohnson/pytorch-examples).

## Warm-up: NumPy

In [3]:
import numpy as np

# Problem dimensions:
#   N     - number of samples in the batch
#   D_in  - size of each input vector
#   H     - width of the hidden layer
#   D_out - size of each output vector
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets, drawn from a standard normal.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Both weight matrices also start out as standard normal noise.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6

for t in range(500):

    # Forward pass: linear -> ReLU -> linear.
    h = x @ w1
    h_relu = np.maximum(h, 0)
    y_pred = h_relu @ w2

    # Loss is the sum of squared errors over every element of the batch.
    residual = y_pred - y
    loss = np.square(residual).sum()
    # Uncomment to watch the loss: print(t, loss)

    # Backward pass: chain rule written out by hand, output side first.
    grad_y_pred = 2.0 * residual
    grad_h_relu = grad_y_pred @ w2.T
    grad_w2 = h_relu.T @ grad_y_pred

    # ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = x.T @ grad_h

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

## PyTorch: Tensors

In [5]:
import torch


# Every tensor created in this cell is placed on the CPU.
device = torch.device('cpu')

# Problem dimensions:
#   N     - number of samples in the batch
#   D_in  - size of each input vector
#   H     - width of the hidden layer
#   D_out - size of each output vector
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets, drawn from a standard normal.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Both weight matrices also start out as standard normal noise.
w1 = torch.randn(D_in, H, device=device)
w2 = torch.randn(H, D_out, device=device)

learning_rate = 1e-6

for t in range(500):

    # Forward pass: linear -> ReLU -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum of squared errors; the result is a 0-dim tensor, so .item() is
    # what turns it into a plain Python number.
    residual = y_pred - y
    loss = residual.pow(2).sum()
    # Uncomment to watch the loss: print(t, loss.item())

    # Backward pass: chain rule written out by hand, output side first.
    grad_y_pred = 2.0 * residual
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)

    # ReLU blocks the gradient wherever its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

## PyTorch: Autograd

In [8]:

# Fresh random inputs and regression targets.
# N, D_in, H, D_out and device are not defined in this cell; they must
# already exist in the notebook namespace.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# requires_grad=True makes autograd track operations on the weights, so that
# loss.backward() fills in w1.grad and w2.grad.
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6

for t in range(500):

    # Forward pass: linear -> ReLU -> linear. The intermediate tensors are
    # not named because no manual backward pass needs them.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Sum of squared errors; the result is a 0-dim tensor, so .item() is
    # what turns it into a plain Python number.
    loss = (y_pred - y).pow(2).sum()
    # Uncomment to watch the loss: print(t, loss.item())

    # Backward pass via autograd: accumulates d(loss)/dw into w1.grad and
    # w2.grad.
    loss.backward()

    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        # Step along the gradients autograd just computed. These live on the
        # tensors themselves (w.grad); no separate grad_w1/grad_w2 variables
        # are produced in this cell.
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # backward() adds to .grad rather than overwriting it, so clear the
        # gradients before the next iteration.
        w1.grad.zero_()
        w2.grad.zero_()

## PyTorch: Defining new autograd functions

In [11]:
class MyRelu(torch.autograd.Function):
    """
    ReLU written as a custom autograd Function.

    ``forward`` clamps negative entries of the input to zero. ``backward``
    zeroes the incoming gradient wherever the saved input was negative and
    passes it through unchanged everywhere else.
    """

    @staticmethod
    def forward(ctx, x):
        # Keep the input: backward needs to know where it was negative.
        ctx.save_for_backward(x)
        return torch.clamp(x, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        # saved_tensors is a tuple holding the single tensor stored in forward.
        (x,) = ctx.saved_tensors
        # masked_fill returns a new tensor, so grad_output is left untouched.
        return grad_output.masked_fill(x < 0, 0)

# Fresh random inputs and regression targets.
# N, D_in, H, D_out and device are not defined in this cell; they must
# already exist in the notebook namespace.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# requires_grad=True makes autograd track operations on the weights, so that
# loss.backward() fills in w1.grad and w2.grad.
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6

for t in range(500):

    # Forward pass: linear -> MyRelu -> linear. Custom Functions are invoked
    # through .apply rather than by calling forward directly.
    y_pred = MyRelu.apply(x.mm(w1)).mm(w2)

    # Sum of squared errors; the result is a 0-dim tensor, so .item() is
    # what turns it into a plain Python number.
    loss = (y_pred - y).pow(2).sum()
    # Uncomment to watch the loss: print(t, loss.item())

    # Backward pass via autograd: accumulates d(loss)/dw into w1.grad and
    # w2.grad.
    loss.backward()

    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        # Step along the gradients autograd just computed. These live on the
        # tensors themselves (w.grad); no separate grad_w1/grad_w2 variables
        # are produced in this cell.
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # backward() adds to .grad rather than overwriting it, so clear the
        # gradients before the next iteration.
        w1.grad.zero_()
        w2.grad.zero_()

## PyTorch: nn

In [17]:
# Fresh random inputs and regression targets.
# N, D_in, H, D_out and device are not defined in this cell; they must
# already exist in the notebook namespace.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# nn.Sequential chains modules so that each one's output feeds the next:
# linear -> ReLU -> linear. Each Linear module holds its own weight and bias.
model = torch.nn.Sequential(
            torch.nn.Linear(D_in, H),
            torch.nn.ReLU(),
            torch.nn.Linear(H, D_out),
        ).to(device)

# Loss function: reduction='sum' adds up the squared errors;
# reduction='mean' would average them instead.
loss_fn = torch.nn.MSELoss(reduction='sum')
learning_rate = 1e-4

for t in range(500):
    # forward pass
    y_pred = model(x)

    # loss
    loss = loss_fn(y_pred, y)
    # Uncomment to watch the loss: print(t, loss.item())

    # Clear gradients left over from the previous iteration; backward()
    # accumulates into .grad rather than overwriting it.
    model.zero_grad()

    # backpropagation: fills in .grad for every parameter of the model
    loss.backward()

    # Update the weights with a plain gradient-descent step. The in-place
    # update runs under no_grad so autograd does not record it; going through
    # param.data is unnecessary inside this context.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad
        

## PyTorch: optim

In [19]:
# Let torch.optim perform the parameter update: Adam is given the model's
# parameters and the learning rate. model, learning_rate, loss_fn, x and y
# are not defined in this cell; they must already exist in the notebook
# namespace.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(500):
    # Clear gradients left over from the previous step before new ones are
    # accumulated by backward().
    optimizer.zero_grad()

    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    # Uncomment to watch the loss: print(t, loss.item())

    # Backpropagate, then let the optimizer update the parameters.
    loss.backward()
    optimizer.step()
        

## PyTorch: Custom nn modules

In [23]:
class TwoLayerNet(torch.nn.Module):
    """Two linear layers with a ReLU between them."""

    def __init__(self, D_in, H, D_out):
        """
        Build the two nn.Linear layers and store them as attributes, which
        registers their parameters with this module.

        D_in is the input size, H the hidden size and D_out the output size.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Map an input Tensor to an output Tensor.

        The first linear layer is followed by a clamp at zero (ReLU), and the
        result is fed through the second linear layer.
        """
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)

# Build the custom module and move it to the device the data lives on.
# D_in, H, D_out, device, learning_rate, loss_fn, x and y are not defined in
# this cell; they must already exist in the notebook namespace.
model = TwoLayerNet(D_in, H, D_out).to(device)

# Plain stochastic gradient descent over the module's parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
for t in range(500):
    # forward pass
    y_pred = model(x)

    # loss
    loss = loss_fn(y_pred, y)
    # Uncomment to watch the loss: print(t, loss.item())

    # Clear gradients left over from the previous step; backward()
    # accumulates into .grad rather than overwriting it.
    optimizer.zero_grad()

    # backpropagation
    loss.backward()

    # update the weights
    optimizer.step()