In [1]:
import torch, torchvision
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# 1. Tensors

In [2]:
# A tensor is PyTorch's core data structure: an n-dimensional array, much like
# a NumPy array or a matrix.
# Model inputs, model outputs and model parameters are all encoded as tensors.
# Tensors can run on GPUs or other specialized hardware to accelerate computing.


# Build a tensor directly from a nested Python list (one inner list per row).
data = [
    [1, 2],
    [3, 4],
]
x_data = torch.tensor(data)
x_data

tensor([[1, 2],
        [3, 4]])

In [3]:
# Turn the nested list into a NumPy array first, then hand that array to PyTorch.
np_array = np.array(data)
x_np = torch.from_numpy(np_array)
x_np

tensor([[1, 2],
        [3, 4]])

In [4]:
# Derive new tensors from an existing tensor.

# ones_like retains the properties of x_data and fills the result with ones.
x_ones = torch.ones_like(x_data)
print(f"Ones Tensor: \n {x_ones} \n")

# rand_like is also modelled on x_data, but the dtype is explicitly overridden
# to float here.
x_rand = torch.rand_like(x_data, dtype=torch.float)
print(f"Random Tensor: \n {x_rand} \n")


Ones Tensor: 
 tensor([[1, 1],
        [1, 1]]) 

Random Tensor: 
 tensor([[0.9472, 0.0728],
        [0.6206, 0.8476]]) 



In [5]:
# Tensor operations (indexing, slicing, arithmetic, ...) closely mirror NumPy's array operations, which I am already familiar with, so they are not repeated here.

# 2. torch.autograd: forward and backward propagation
- Neural Networks are a collection of nested functions that are executed on some input data. 
- These functions are defined by parameters (weights and biases), which PyTorch stores as tensors.
## 2b. Training a neural network
1. First step is <b> FORWARD PROPAGATION </b>
    - In this step, the neural network makes its best guess about the correct output. NN runs input data through each of its functions to make this guess
    
2. Second Step is <b> BACKWARD PROPAGATION </b>
    - In this step, the NN adjusts its parameters in proportion to the error in its guess. It traverses backwards from the output, collecting the derivatives of the error with respect to the parameters of the functions (the gradients), and then optimizes the parameters using gradient descent.
    


In [6]:
# One complete training step on a pretrained model.
#   - model:  ResNet-18 from torchvision, loaded with pretrained weights
#   - data:   random tensor standing in for a single 3-channel 64x64 image
#   - labels: random tensor of shape (1, 1000) standing in for the image's label

model = torchvision.models.resnet18(pretrained=True)
data = torch.rand(1, 3, 64, 64)
labels = torch.rand(1, 1000)

# Forward pass: run the input through the network to get its prediction.
prediction = model(data)

# Error: collapse the difference between prediction and labels into one scalar.
loss = torch.sum(prediction - labels)

# Backward pass: backpropagate the error through the network.
loss.backward()

# Optimizer: SGD over all model parameters (learning rate 0.01, momentum 0.9).
optim = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Gradient descent: adjust each parameter by the gradient stored on it.
optim.step()



[W NNPACK.cpp:79] Could not initialize NNPACK! Reason: Unsupported hardware.


# 3. Neural Networks

- Neural networks can be constructed using the torch.nn package. 
- nn depends on autograd to define models and differentiate them. 
- An nn.Module contains layers, and a forward(input) method that returns the output. 
- typical training procedure for nn:
    1. Define the neural network that has some learnable parameters (weights)
    2. Iterate over a dataset of inputs
    3. Process input through the network
    4. Compute the loss (how far is the output from being correct)
    5. Propagate gradients back into the network's parameters
    6. Update the weights of the network, typically using a simple update rule: <br>
    `weight = weight - learning_rate * gradient`
    

In [7]:
# define the network: 

class Net(nn.Module):
    """Small convolutional network: two conv + max-pool stages, then three linear layers.

    With a single-channel 32x32 input the feature map shrinks
    32 -> 28 (conv1) -> 14 (pool) -> 10 (conv2) -> 5 (pool), which is where the
    16 * 5 * 5 input width of the first linear layer comes from.
    The network returns 10 raw output values per sample.
    """

    def __init__(self):
        super().__init__()

        # Convolution stages.
        # conv1: 1 input image channel -> 6 output channels, 5x5 square kernel
        # conv2: 6 input channels      -> 16 output channels, 5x5 square kernel
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)

        # Fully connected stages; each one is an affine map y = Wx + b.
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run a batch `x` through the network and return the 10 outputs per sample."""
        # Stage 1: convolution -> ReLU -> max pool over a 2x2 window.
        stage1 = F.relu(self.conv1(x))
        stage1 = F.max_pool2d(stage1, (2, 2))

        # Stage 2: same pattern; a square window can be given as a single number.
        stage2 = F.relu(self.conv2(stage1))
        stage2 = F.max_pool2d(stage2, 2)

        # Flatten every dimension except the batch dimension.
        features = torch.flatten(stage2, 1)

        # Two hidden linear layers with ReLU, then the final layer without activation.
        hidden = F.relu(self.fc1(features))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)
    
    
# Instantiate the network and print its layer summary.
net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


- we only have to define the forward function
- the backward function (which computes the gradients) is automatically defined for us by autograd. 
- we can use any of the tensor operations in the forward function. 


In [8]:
# net.parameters() yields the learnable parameters of the model; collect them in a list.
params = [*net.parameters()]
# Size of the first entry in that list.
params[0].size()


torch.Size([6, 1, 5, 5])

In [9]:
# FORWARD PROPAGATION on a random 32x32 input.
# The net expects 32x32 inputs, so images from another dataset must be resized
# to 32x32 before they can be tested on this net.

x_test = torch.randn(1, 1, 32, 32)
out = net(x_test)
print(out)

# Zero the gradient buffers of all parameters, then backprop a random gradient
# with the same (1, 10) shape as the output.
net.zero_grad()
out.backward(gradient=torch.randn(1, 10))



tensor([[ 0.0312,  0.0379,  0.0165, -0.0343, -0.0931,  0.0614, -0.0580,  0.0550,
          0.1098,  0.0340]], grad_fn=<AddmmBackward0>)


In [10]:
## COMPUTING THE LOSS

# Forward pass on the test input.
output = net(x_test)

# A dummy target, for example: 10 random values reshaped to one row so that it
# has the same shape as the output.
target = torch.randn(10).view(1, -1)

# Mean-squared-error criterion applied to output vs. target.
criterion = nn.MSELoss()
loss = criterion(output, target)
print(loss)

tensor(0.6965, grad_fn=<MseLossBackward0>)


In [11]:

# Follow the autograd graph backwards from the loss, one node at a time.
loss_node = loss.grad_fn
print(loss_node)  # the MSELoss node

# First input of the loss node: the network's final Linear layer (fc3).
linear_node = loss_node.next_functions[0][0]
print(linear_node)

# First input of that Linear node. NOTE: in the recorded output this prints an
# AccumulateGrad node (a leaf tensor, presumably one of fc3's parameters),
# not a ReLU node.
print(linear_node.next_functions[0][0])

<MseLossBackward0 object at 0x7ff438c7ee20>
<AddmmBackward0 object at 0x7ff438c7ee50>
<AccumulateGrad object at 0x7ff438c7ee20>


In [12]:
## BACKPROP

# Zero the gradient buffers of all parameters first; without this, the new
# gradients would be accumulated onto the existing ones.
net.zero_grad()

print("conv1.bias.grad before backward")
print(net.conv1.bias.grad)

# Backpropagate the loss; this fills in .grad on the parameters.
loss.backward()

print("conv1.bias.grad after backward")
print(net.conv1.bias.grad)

conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.bias.grad after backward
tensor([-0.0030,  0.0067,  0.0099, -0.0030,  0.0076,  0.0048])


In [13]:
## UPDATE WEIGHTS / Optimizer
# torch.optim provides ready-made update rules (e.g. torch.optim.SGD); the
# manual loop below spells out the simplest one, stochastic gradient descent:
#     weight = weight - learning_rate * gradient
learning_rate = 0.01

# Do the in-place update under torch.no_grad() instead of going through `.data`:
# the update itself is not recorded by autograd, but the in-place modification
# is still visible to autograd's correctness checks.
with torch.no_grad():
    for param in net.parameters():
        # A parameter that took no part in the last backward pass has no
        # gradient; skip it instead of failing on `None`.
        if param.grad is None:
            continue
        param -= param.grad * learning_rate
    