In [1]:
import torch


- NumPy: Ideal for general-purpose numerical computations and works well on CPUs. It lacks built-in support for GPU computation and automatic differentiation.
- PyTorch: Tailored for deep learning with built-in support for GPU acceleration and automatic differentiation. It is highly suitable for training neural networks.

## Football Pitch Example

In [6]:
# Define the coordinate vectors for the length and width of the pitch
length = torch.arange(1, 10)  # 9 positions along the length: 1..9
width = torch.arange(1, 6)    # 5 positions along the width: 1..5

# Generate coordinate grids.
# indexing="ij" (matrix indexing) is passed explicitly: it produces the same
# layout the call produced implicitly, but leaving it out triggers a
# deprecation warning in recent PyTorch releases.
# Result: grid_x[i, j] == length[i] and grid_y[i, j] == width[j], shape (9, 5).
grid_x, grid_y = torch.meshgrid(length, width, indexing="ij")

print(f"grid_x: \n {grid_x}")

print(f"grid_y: \n {grid_y}")

grid_x: 
 tensor([[1, 1, 1, 1, 1],
        [2, 2, 2, 2, 2],
        [3, 3, 3, 3, 3],
        [4, 4, 4, 4, 4],
        [5, 5, 5, 5, 5],
        [6, 6, 6, 6, 6],
        [7, 7, 7, 7, 7],
        [8, 8, 8, 8, 8],
        [9, 9, 9, 9, 9]])
grid_y: 
 tensor([[1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5]])


### Explanation
- grid_x contains the x-coordinates (length): row i holds length[i] repeated across every column, so grid_x[i, j] = length[i].
- grid_y contains the y-coordinates (width): every row is a copy of the width vector, so grid_y[i, j] = width[j].

Each element (grid_x[i, j], grid_y[i, j]) corresponds to a specific coordinate on the pitch.

*Example: Locating a Football Player*

Let's say we want to place a football player at the coordinate (3, 2). This means the player is standing at the 3rd position in length and the 2nd position in width.

Using the grid indices:

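i corresponds to the length index.
j corresponds to the width index.

Indices are zero-based while the pitch coordinates start at 1, so the coordinate (3, 2) is found at i = 2, j = 1.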

In [8]:
# Zero-based grid indices of the player: row 2 / column 1 hold the coordinate (3, 2)
i, j = 2, 1

# Pair of 0-dim tensors (x, y) read from the two coordinate grids
player_position = grid_x[i, j], grid_y[i, j]
print(f"The football player stands at coordinate: {player_position}")

The football player stands at coordinate: (tensor(3), tensor(2))


In [10]:
# Both grids have shape (len(length), len(width)), i.e. (9, 5) here
grid_y.shape

torch.Size([9, 5])

In [18]:
# Row i of grid_x holds one repeated length value, so picking a different
# column (3 instead of j) still returns the same x-coordinate
grid_x[i, 3]

tensor(3)

In [11]:
# Indexing with [i, j] yields a scalar tensor, which has no dimensions;
# .item() converts it to a standard Python number.
grid_x[i, j].item()

3

In [12]:
# Same conversion for the player's y-coordinate (width)
grid_y[i, j].item()

2

In [14]:
# Visualizing player position

def visualize_player_pos(grid_x, grid_y, player_position):
    """Print the pitch as rows of "(x,y)" labels and star the player's cell.

    Args:
        grid_x: 2-D tensor holding the x-coordinate of every pitch cell.
        grid_y: 2-D tensor holding the y-coordinate, indexed like ``grid_x``.
        player_position: ``(x, y)`` pair compared against every cell; the
            matching cell is printed with a trailing ``*``.
    """
    def cell_label(r, c):
        # Build the label for one cell and flag it when it is the player's spot
        cell = (grid_x[r, c], grid_y[r, c])
        marker = "*" if cell == player_position else ""  # Mark player position
        return f"({cell[0].item()},{cell[1].item()}){marker}"

    for r in range(grid_x.shape[0]):
        print(" ".join(cell_label(r, c) for c in range(grid_x.shape[1])))

In [15]:
visualize_player_pos(grid_x, grid_y, player_position)

(1,1) (1,2) (1,3) (1,4) (1,5)
(2,1) (2,2) (2,3) (2,4) (2,5)
(3,1) (3,2)* (3,3) (3,4) (3,5)
(4,1) (4,2) (4,3) (4,4) (4,5)
(5,1) (5,2) (5,3) (5,4) (5,5)
(6,1) (6,2) (6,3) (6,4) (6,5)
(7,1) (7,2) (7,3) (7,4) (7,5)
(8,1) (8,2) (8,3) (8,4) (8,5)
(9,1) (9,2) (9,3) (9,4) (9,5)


In [22]:
# Check if GPU is available
# A small example tensor is created here so the cell does not depend on a
# variable left over from another (possibly deleted) cell.
torch_tensor = torch.tensor([1.0, 2.0, 3.0])
if torch.cuda.is_available():
    torch_tensor = torch_tensor.to('cuda')
    print(torch_tensor)  # Tensor is now on the GPU


In [24]:
# Boolean: whether PyTorch can use CUDA on this machine
torch.cuda.is_available()

False

In [2]:
import torch

# Report CUDA support. Device queries are only made when CUDA is usable,
# because torch.cuda.current_device() raises on a machine without it.
cuda_available = torch.cuda.is_available()
print(f"Is CUDA supported by this system? {cuda_available}")
print(f"CUDA version: {torch.version.cuda}")

if cuda_available:
    # Storing ID of current CUDA device (queried once and reused below)
    cuda_id = torch.cuda.current_device()
    print(f"ID of current CUDA device:{cuda_id}")
    print(f"Name of current CUDA device:{torch.cuda.get_device_name(cuda_id)}")
else:
    # Nothing to describe; keep the name defined so later cells can test it
    cuda_id = None
    print("No CUDA device available; running on CPU.")

Is CUDA supported by this system? True
CUDA version: 11.8
ID of current CUDA device:0
Name of current CUDA device:NVIDIA GeForce RTX 4050 Laptop GPU


## Basic example for gradient

In [42]:
import torch

# Leaf tensor; requires_grad=True makes autograd record the operations applied to it
x = torch.tensor(2.0, requires_grad=True)

# y = x^2, so analytically dy/dx = 2x
y = x.pow(2)

# Backpropagate from y; the resulting dy/dx is stored in x.grad
y.backward()
print(f"Gradient of y with respect to x: {x.grad}")

Gradient of y with respect to x: 4.0


In [45]:
# Operations executed inside torch.no_grad() are not recorded by autograd
with torch.no_grad():
    z = torch.add(x, 1)
    print(f'Value of z (x + 1) without tracking gradients: {z}')
    print(z.requires_grad)  # z was created without gradient tracking

Value of z (x + 1) without tracking gradients: 3.0
False


In [47]:
# Reset gradient to zero
# zero_() works in place, so x.grad is cleared before the next backward() adds to it
x.grad.zero_()
x.grad

tensor(0.)

In [48]:
# With tracking active again, build w = x^3 + 3x^2 and differentiate it
w = x.pow(3) + 3 * x.pow(2)
w.backward()  # dw/dx = 3x^2 + 6x is accumulated into x.grad
print(f'Gradient of w wrt x: {x.grad}')

Gradient of w wrt x: 24.0


In [49]:
# Now demonstrating the effect of torch.no_grad()
x.grad.zero_()

with torch.no_grad():
    # q is computed without a graph, so it has no grad_fn
    q = x ** 3 + 3 * x ** 2
    # backward() on q is expected to fail; the error is caught and shown so
    # the notebook still runs top to bottom.
    try:
        q.backward()
    except RuntimeError as err:
        print(f"RuntimeError: {err}")

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [50]:
# Still zero: the gradient was reset and the failed backward() wrote nothing
x.grad

tensor(0.)

In [51]:
# Fresh leaf tensor with gradient tracking switched on
x = torch.tensor(2.0, requires_grad=True)

# Step 1: y = x^2; the backward pass fills x.grad with dy/dx = 2x
y = x.pow(2)
y.backward()
print(f'Gradient of y with respect to x after first backward: {x.grad}')  # Prints: 4.0

# Keep an independent copy of that gradient for later comparison
grad_y = x.grad.clone()

# Step 2: evaluate a polynomial under torch.no_grad(); nothing is recorded
with torch.no_grad():
    z = x.pow(3) + 3 * x.pow(2)
    print(f'Value of z (x^3 + 3x^2) without tracking gradients: {z}')  # x.grad is left untouched

# The saved copy is unchanged
print(f'Stored gradient of y with respect to x: {grad_y}')  # Should still be 4.0

# Step 3: another tracked function; x.grad was not zeroed, so the gradients add up
w = x.pow(4)
w.backward()
print(f'Gradient of w with respect to x after third backward (without zeroing): {x.grad}')  # Prints: 36.0


Gradient of y with respect to x after first backward: 4.0
Value of z (x^3 + 3x^2) without tracking gradients: 20.0
Stored gradient of y with respect to x: 4.0
Gradient of w with respect to x after third backward (without zeroing): 36.0


Here, w is defined as x^4. The gradient dw/dx = 4x^3, so dw/dx is 32.0 when x = 2.0. Since we did not reset the gradient, x.grad now accumulates the gradient from the previous backward pass (4.0 for y = x^2) and the new backward pass (32.0 for w = x^4). Therefore, x.grad is 4.0 + 32.0 = 36.0.

# PyTorch Tutorial: Basics

In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

In [2]:
# Build the FashionMNIST training split (train=True) and test split (train=False),
# in that order. Files are kept under "data" and fetched with download=True;
# ToTensor() is applied as the per-sample transform.
training_data, test_data = (
    datasets.FashionMNIST(
        root="data",
        train=is_train_split,
        download=True,
        transform=ToTensor(),
    )
    for is_train_split in (True, False)
)

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz


0.1%

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to data\FashionMNIST\raw\train-images-idx3-ubyte.gz


100.0%


Extracting data\FashionMNIST\raw\train-images-idx3-ubyte.gz to data\FashionMNIST\raw


100.0%


Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to data\FashionMNIST\raw\train-labels-idx1-ubyte.gz
Extracting data\FashionMNIST\raw\train-labels-idx1-ubyte.gz to data\FashionMNIST\raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz



10.4%

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to data\FashionMNIST\raw\t10k-images-idx3-ubyte.gz


100.0%


Extracting data\FashionMNIST\raw\t10k-images-idx3-ubyte.gz to data\FashionMNIST\raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to data\FashionMNIST\raw\t10k-labels-idx1-ubyte.gz


100.0%

Extracting data\FashionMNIST\raw\t10k-labels-idx1-ubyte.gz to data\FashionMNIST\raw






In computer vision and deep learning, N, C, H, and W are commonly used abbreviations that represent dimensions in multi-dimensional arrays or tensors. Here's what they typically stand for:

N: Stands for the batch size or the number of samples in a batch. In deep learning, it is common to process data in batches rather than individual samples. The batch size determines how many samples are processed simultaneously during training or inference.

C: Represents the number of channels. In computer vision, an image can have multiple channels, such as Red, Green, and Blue (RGB) channels. The number of channels can also represent different features or filters in a convolutional neural network (CNN).

H: Denotes the height of an image or the number of rows in a tensor. In the context of computer vision, it refers to the vertical dimension of an image or the height of a feature map in a CNN.

W: Represents the width of an image or the number of columns in a tensor. In computer vision, it refers to the horizontal dimension of an image or the width of a feature map in a CNN.

In [6]:
# A DataLoader wraps a Dataset in an iterable and supports
# automatic batching, sampling, etc.
# With a batch size of 64, each iteration yields a batch of up to 64 features and labels.
batch_size = 64

# One loader per split
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

# Inspect the first test batch only, then stop iterating
for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W] {X.shape}")
    print(f"Shape of Y {y.shape}")
    print(X)
    break

Shape of X [N, C, H, W] torch.Size([64, 1, 28, 28])
Shape of Y torch.Size([64])
tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        ...,


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
        

In [10]:
# Pick the compute backend in order of preference: CUDA, then MPS, else CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using {device}")

# Defining NN model
class NeuralNetwork(nn.Module):
    """Fully connected classifier: 28*28 inputs -> 512 -> 512 -> 10 logits."""

    def __init__(self):
        super().__init__()
        # Collapses all dimensions after the first (batch) one into a vector
        self.flatten = nn.Flatten()
        hidden_units = 512
        layers = [
            nn.Linear(28*28, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten ``x`` and return the raw class scores (logits)."""
        return self.linear_relu_stack(self.flatten(x))
    
# Instantiate the network, move it to the selected device, and show its layers
model = NeuralNetwork().to(device)
print(model)

Using cuda
NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [13]:
# Switch the model to training mode; train() returns the module itself,
# which is why the cell displays the model repr
model.train()

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)

## Optimizing Model Parameters

In [16]:
# Objects shared by the training and evaluation code
loss_fn = nn.CrossEntropyLoss() # loss function applied to the model's logits and the integer labels
optimizer = torch.optim.SGD(model.parameters(), lr = 1e-3) # stochastic gradient descent with learning rate 1e-3

In [17]:
# In a single training loop, the model makes predictions on the training dataset,
# which is fed to it in batches,
# and backpropagates the prediction error to adjust the model's parameters

def train(dataloader, model, loss_fn, optimizer, device=None):
    """Run one pass over ``dataloader``, updating ``model`` after every batch.

    Args:
        dataloader: Iterable of ``(X, y)`` batches that also exposes ``.dataset``.
        model: Module to optimise; it is switched to training mode.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar loss tensor.
        optimizer: Optimizer used to step and to clear the gradients.
        device: Device the batches are moved to. When ``None`` (default) the
            device of the model's first parameter is used, so the function
            does not depend on a notebook-level ``device`` variable.

    Prints the loss and the number of processed samples every 100 batches.
    """
    if device is None:
        # Follow the model: batches must live where the parameters live
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()  # clear gradients so the next batch starts fresh

        if batch % 100 == 0:
            # Separate name for the float so the loss tensor is not rebound
            loss_value, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss_value:>7f} [{current:>5d}/{size:>5d}]")

In [18]:
# We also check the model's performance against the test dataset to ensure it is learning
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(
        f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n"
        )

In deep learning:

- Epoch: One complete pass through the entire training dataset.
- Batch: A subset of the training data used to update model weights in one iteration.

*TL;DR: Epoch = full dataset pass, Batch = data subset for one update.*

In [22]:
%time
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)

CPU times: total: 0 ns
Wall time: 0 ns
Epoch 1
-------------------------------
loss: 2.182179 [   64/60000]
loss: 2.160292 [ 6464/60000]
loss: 2.114250 [12864/60000]
loss: 2.138345 [19264/60000]
loss: 2.080538 [25664/60000]
loss: 2.031904 [32064/60000]
loss: 2.064902 [38464/60000]
loss: 1.978102 [44864/60000]
loss: 1.993188 [51264/60000]
loss: 1.938186 [57664/60000]
Test Error: 
 Accuracy: 52.8%, Avg loss: 1.919589 

Epoch 2
-------------------------------
loss: 1.949327 [   64/60000]
loss: 1.903763 [ 6464/60000]
loss: 1.800116 [12864/60000]
loss: 1.856154 [19264/60000]
loss: 1.737408 [25664/60000]
loss: 1.687487 [32064/60000]
loss: 1.729351 [38464/60000]
loss: 1.612513 [44864/60000]
loss: 1.645349 [51264/60000]
loss: 1.556500 [57664/60000]
Test Error: 
 Accuracy: 58.9%, Avg loss: 1.556003 

Epoch 3
-------------------------------
loss: 1.619138 [   64/60000]
loss: 1.570495 [ 6464/60000]
loss: 1.429782 [12864/60000]
loss: 1.516708 [19264/60000]
loss: 1.392405 [25664/60000]
loss: 1.3788

In [None]:
# Saving model
# Only the state_dict returned by model.state_dict() is written to model.pth
torch.save(model.state_dict(), "model.pth")
print("Saved as model.pth")

In [None]:
# Loading model
# Re-create the architecture first, then restore the saved parameters into it.
model = NeuralNetwork().to(device)
# map_location=device lets a checkpoint saved on another device (e.g. a GPU)
# load on the current one; weights_only=True restricts unpickling to tensors
# and plain containers instead of arbitrary Python objects.
model.load_state_dict(torch.load("model.pth", map_location=device, weights_only=True))

In [24]:
# Label names: the entry at index k is the name shown for label k
classes = [
    "T-shirt/top", "Trouser", "Pullover", "Dress", "Coat",
    "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot",
]

# Classify the first test sample with the model in evaluation mode
model.eval()
x, y = test_data[0]
with torch.no_grad():
    x = x.to(device)
    pred = model(x)
    # Highest-scoring entry of the first output row is the predicted class
    predicted = classes[pred[0].argmax(0)]
    actual = classes[y]
    print(f'Predicted: "{predicted}", Actual: "{actual}"')

Predicted: "Ankle boot", Actual: "Ankle boot"


# Tensors