In [1]:
from  __future__ import print_function
import torch 

In [18]:
# torch.empty only allocates storage: the values printed for x are whatever
# happened to be in memory and will differ from run to run.
x = torch.empty(5, 3)
# torch.rand draws each entry uniformly from [0, 1).
y = torch.rand(5, 3)

# Show both operands, then an element-wise tanh of their sum.
for shown in (x, y, (x + y).tanh()):
    print(shown)

tensor([[-1.9393e+00,  4.5562e-41, -1.9393e+00],
        [ 4.5562e-41,  1.6255e-43,  1.5554e-43],
        [ 1.5975e-43,  1.3873e-43,  1.4574e-43],
        [ 6.4460e-44,  1.4153e-43,  1.5274e-43],
        [ 1.5695e-43,  1.6255e-43,  1.6956e-43]])
tensor([[ 0.3226,  0.7654,  0.9202],
        [ 0.9469,  0.8257,  0.2831],
        [ 0.5871,  0.4555,  0.7124],
        [ 0.2747,  0.6945,  0.7272],
        [ 0.1420,  0.0884,  0.1037]])
tensor([[-0.9242,  0.6442, -0.7695],
        [ 0.7384,  0.6782,  0.2758],
        [ 0.5278,  0.4264,  0.6122],
        [ 0.2680,  0.6009,  0.6213],
        [ 0.1411,  0.0882,  0.1033]])


In [26]:
# Flatten x to one dimension; -1 asks view() to infer the length from the
# number of elements (15 for the 5x3 tensor above).
x.view(-1)

tensor([-1.9393e+00,  4.5562e-41, -1.9393e+00,  4.5562e-41,  1.6255e-43,
         1.5554e-43,  1.5975e-43,  1.3873e-43,  1.4574e-43,  6.4460e-44,
         1.4153e-43,  1.5274e-43,  1.5695e-43,  1.6255e-43,  1.6956e-43])

In [33]:
# Leaf tensor whose operations autograd will record.
x = torch.ones((2, 2), requires_grad=True)
print(x)

# Element-wise x to the power x; the result remembers the op that created it.
y = torch.pow(x, x)
print(y)
print(y.grad_fn)

# Square, scale by 3, then reduce to a single scalar.
z = y * y * 3
out = torch.mean(z)

print(z, out)

tensor([[ 1.,  1.],
        [ 1.,  1.]])
tensor([[ 1.,  1.],
        [ 1.,  1.]])
<PowBackward1 object at 0x7f028d3eebe0>
tensor([[ 3.,  3.],
        [ 3.,  3.]]) tensor(3.)


#### .requires_grad_( ... ) changes an existing Tensor’s requires_grad flag in-place. The input flag defaults to True if not given.

In [34]:
# Tensors created without requires_grad are not tracked by autograd.
a = torch.randn(2, 2)
a = a.mul(3).div(a.sub(1))
print(a.requires_grad)

# Flip the flag in place (the argument defaults to True).
a.requires_grad_()
print(a.requires_grad)

# Anything computed from `a` from now on carries a grad_fn.
b = torch.sum(a * a)
print(b.grad_fn)

False
True
<SumBackward0 object at 0x7f028d3eecc0>


## Gradients
#### Let’s backprop now. Because out contains a single scalar, out.backward() is equivalent to out.backward(torch.tensor(1.)).

In [36]:
# Backpropagate from the scalar `out`; the gradient with respect to x is
# stored in x.grad.
out.backward()

In [38]:
# d(out)/dx, populated by the out.backward() call.
print(x.grad)

tensor([[ 1.5000,  1.5000],
        [ 1.5000,  1.5000]])


#### You can also stop autograd from tracking history on Tensors with requires_grad=True by wrapping the code block in with torch.no_grad():

In [39]:
# Outside no_grad, results computed from x inherit requires_grad=True.
print(x.requires_grad)
print(x.pow(2).requires_grad)

# Inside the context manager the same computation is not tracked.
with torch.no_grad():
    print(x.pow(2).requires_grad)

True
True
False


## Neural Networks
#### Neural networks can be constructed using the torch.nn package.

> #### A typical training procedure for a neural network is as follows:
> * Define the neural network that has some learnable parameters (or weights)
> * Iterate over a dataset of inputs
> * Process input through the network
> * Compute the loss (how far is the output from being correct)
> * Propagate gradients back into the network’s parameters
> * Update the weights of the network, typically using a simple update rule: weight = weight - learning_rate * gradient

## Define the network
### Let’s define this network:

In [43]:
import torch.nn as nn 
import torch.nn.functional as F 

class Net(nn.Module):
    """Small convolutional classifier: two conv layers and three linear layers.

    With a single-channel 32x32 input, each 5x5 convolution followed by 2x2
    max pooling shrinks the feature map 32 -> 28 -> 14 -> 10 -> 5, which gives
    the 16 * 5 * 5 features that ``fc1`` expects. The network emits 10 values
    per sample.
    """

    def __init__(self):
        super(Net, self).__init__()
        # Convolutional stage: 1 input channel -> 6 -> 16 feature maps,
        # each layer using a 5x5 square kernel.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)

        # Fully connected stage (affine maps y = Wx + b): 400 -> 120 -> 84 -> 10.
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Run the network on a batch and return the raw outputs of ``fc3``."""
        # conv -> ReLU -> 2x2 max pool, twice. A single int is shorthand for
        # a square pooling window.
        pooled = F.max_pool2d(F.relu(self.conv1(x)), 2)
        pooled = F.max_pool2d(F.relu(self.conv2(pooled)), 2)

        # Collapse everything except the batch dimension into one axis.
        flat = pooled.view(-1, self.num_flat_features(pooled))

        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        # No activation on the final layer.
        return self.fc3(hidden)

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` except the first (batch)."""
        count = 1
        for extent in x.shape[1:]:
            count *= extent
        return count
    
    
       
# Instantiate the network and print its layer summary.
net = Net()
print(net)
            
        
    

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


##### You just have to define the forward function, and the backward function (where gradients are computed) is automatically defined for you using autograd. You can use any of the Tensor operations in the forward function.
##### The learnable parameters of a model are returned by net.parameters()

In [44]:
# Materialise the learnable tensors (weights and biases) of the network.
params = [tensor for tensor in net.parameters()]
print(len(params))
# The first entry is conv1's weight: (out_channels, in_channels, kH, kW).
print(params[0].shape)

10
torch.Size([6, 1, 5, 5])


#### Try the network on a random 32x32 input

In [45]:
# One random sample with 1 channel and 32x32 pixels (batch dimension first).
# NOTE(review): `input` shadows the Python builtin; it is kept because later
# cells refer to it by this name.
input = torch.randn(1, 1, 32, 32)
out = net(input)
print(out)

# Zero the gradient buffers of all parameters, then backpropagate a random
# upstream gradient with the same 1x10 shape as the output.
net.zero_grad()
out.backward(torch.randn(1,10))

tensor([[-0.0079, -0.0257,  0.1050, -0.0205,  0.1779,  0.0609,  0.0243,
         -0.1292,  0.0619,  0.1053]])


### Loss Function
##### A loss function takes the (output, target) pair of inputs, and computes a value that estimates how far away the output is from the target.

In [49]:
# Forward pass on the sample created earlier.
output = net(input)

# Dummy target 1..10, reshaped to (1, 10) so it lines up with the output.
# The dtype is requested explicitly: with integer arguments torch.arange
# returns an int64 tensor on current PyTorch, and MSELoss needs the target
# to have the same floating-point dtype as the prediction.
target = torch.arange(1, 11, dtype=output.dtype)
target = target.view(1, -1)
criterion = nn.MSELoss()

# Mean squared error between prediction and target (a scalar tensor).
loss = criterion(output, target)
print(loss)

tensor(38.0730)


##### So, when we call loss.backward(), the whole graph is differentiated w.r.t. the loss, and all Tensors in the graph that have requires_grad=True will have their .grad Tensor accumulated with the gradient.

##### For illustration, let us follow a few steps backward:

In [50]:
# Walk the autograd graph backwards from the loss, one node per line.
print(loss.grad_fn)  # backward node of the MSE loss
print(loss.grad_fn.next_functions[0][0])  # first input of the loss: the addmm node of the last Linear layer (fc3)
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])  # first input of that addmm; the recorded output shows ExpandBackward here, not a ReLU node (presumably the broadcast bias -- TODO confirm)

<MseLossBackward object at 0x7f028cdc2d30>
<AddmmBackward object at 0x7f028cdc2d68>
<ExpandBackward object at 0x7f028cdc2d30>


### Backprop
#### To backpropagate the error, all we have to do is call loss.backward(). You need to clear the existing gradients first, though, or the new gradients will be accumulated into the existing ones.
##### Now we shall call loss.backward(), and have a look at conv1’s bias gradients before and after the backward.

In [51]:
# zero the gradient for all buffers or else will accumulate over previous values
# Reset the gradient buffers first: backward() adds to whatever .grad
# already holds instead of overwriting it.
net.zero_grad()
conv1_bias = net.conv1.bias

print("conv1.bias before backward")
print(conv1_bias.grad)

loss.backward()

print("conv1.bias after backward")
print(conv1_bias.grad)


conv1.bias before backward
tensor([ 0.,  0.,  0.,  0.,  0.,  0.])
conv1.bias after backward
tensor([-0.0761,  0.0818, -0.0080,  0.1399, -0.0435,  0.0020])


### Update the weights
#### The simplest update rule used in practice is the Stochastic Gradient Descent (SGD):
  ```weight = weight - learning_rate * gradient```


In [52]:
import torch.optim as optim

# Create the optimizer: plain SGD over every learnable parameter of the network.
optimizer = optim.SGD(net.parameters(), lr = 0.01)

# One training step (in a real run this body sits inside a loop over the data):
optimizer.zero_grad()   # clear gradients left over from earlier backward passes
output = net(input)
loss = criterion(output, target)
loss.backward()   # compute the gradients of the loss w.r.t. the parameters
optimizer.step()  # apply the update to the parameters
