In [1]:
import torch
print(torch.cuda.is_available())

True


In [2]:
class NeuralNetwork(torch.nn.Module):
    # Coding the number of inputs and outputs as variables allows us to 
    # reuse the same code for datasets with different numbers of features and classes

    def __init__(self, num_inputs: int, num_outputs: int):
        super().__init__()
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(num_inputs, 30), # The Linear layer takes the number of input and output nodes as arguments.
            torch.nn.ReLU(), # Nonlinear activation functions are placed between the hidden layers.
            torch.nn.Linear(30, 20), # The number of output nodes of one hidden layer has to match the number of inputs of the next layer.
            torch.nn.ReLU(),
            torch.nn.Linear(20, num_outputs)
        )

    def forward(self, x):
        # logits represents the outputs of the last layer
        logits = self.layers(x)
        return logits

In [3]:
model = NeuralNetwork(50, 3)

In [4]:
print(model)

NeuralNetwork(
  (layers): Sequential(
    (0): Linear(in_features=50, out_features=30, bias=True)
    (1): ReLU()
    (2): Linear(in_features=30, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=3, bias=True)
  )
)


In [5]:
# numel stands for number of elements
# model.parameters() returns an iterator over the model's trainable parameters
# requires_grad indicates whether the parameter is trainable
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Total number of trainable model parameters:", num_params)

Total number of trainable model parameters: 2213


In [6]:
print(model.layers[0].weight)

Parameter containing:
tensor([[-0.0927,  0.0058,  0.1368,  ..., -0.0048, -0.0748,  0.0212],
        [ 0.0371, -0.0859, -0.0827,  ...,  0.0809, -0.1081,  0.1148],
        [-0.0823,  0.1336, -0.0933,  ..., -0.0914,  0.0672, -0.0741],
        ...,
        [ 0.0091,  0.1112, -0.0603,  ..., -0.0245,  0.0591, -0.0498],
        [ 0.1117,  0.0394, -0.0382,  ..., -0.0958,  0.0097,  0.0675],
        [ 0.0791, -0.0117, -0.0775,  ...,  0.0127,  0.0506,  0.0389]],
       requires_grad=True)


In [7]:
# It's a 30 x 50 matrix
print(model.layers[0].weight.shape)

torch.Size([30, 50])


In [8]:
torch.manual_seed(123)
model = NeuralNetwork(50, 3)
print(model.layers[0].weight)

Parameter containing:
tensor([[-0.0577,  0.0047, -0.0702,  ...,  0.0222,  0.1260,  0.0865],
        [ 0.0502,  0.0307,  0.0333,  ...,  0.0951,  0.1134, -0.0297],
        [ 0.1077, -0.1108,  0.0122,  ...,  0.0108, -0.1049, -0.1063],
        ...,
        [-0.0787,  0.1259,  0.0803,  ...,  0.1218,  0.1303, -0.1351],
        [ 0.1359,  0.0175, -0.0673,  ...,  0.0674,  0.0676,  0.1058],
        [ 0.0790,  0.1343, -0.0293,  ...,  0.0344, -0.0971, -0.0509]],
       requires_grad=True)


In [9]:
torch.manual_seed(123)
X = torch.rand((1, 50))
out = model(X)
print(out)

tensor([[-0.1262,  0.1080, -0.1792]], grad_fn=<AddmmBackward0>)


`AddmmBackward0` means the last-used function to compute a variable in the computation graph

`Addmm` stands for matrix multiplication('mm') followed by addition ('Add')

In [10]:
# use the network without training or backpropagation

with torch.no_grad():
    out = model(X)
print(out)

tensor([[-0.1262,  0.1080, -0.1792]])


In [11]:
with torch.no_grad():
    # dim=1 indicates the second dimension, which is the row
    out = torch.softmax(model(X), dim=1)
print(out)

tensor([[0.3113, 0.3934, 0.2952]])


implement a custom Dataset class

In [12]:
X_train = torch.tensor([
    [-1.2, 3.1],
    [-0.9, 2.9],
    [-0.5, 2.6],
    [2.3, -1.1],
    [2.7, -1.5]
])

y_train = torch.tensor([0, 0, 0, 1, 1])

X_test = torch.tensor([
    [-0.8, 2.8],
    [2.6, -1.6]
])

y_test = torch.tensor([0, 1])

In [13]:
print(X_train.shape)
print(y_train.shape)

torch.Size([5, 2])
torch.Size([5])


In [14]:
from torch.utils.data import Dataset

class ToyDataset(Dataset):
    def __init__(self, X, y) -> None:
        super().__init__()
        self.features = X
        self.labels = y

    def __getitem__(self, index: int):
        one_X = self.features[index]
        one_y = self.labels[index]
        return one_X, one_y
    
    def __len__(self):
        return self.labels.shape[0]

In [15]:
train_ds = ToyDataset(X_train, y_train)
test_ds = ToyDataset(X_test, y_test)
print(len(train_ds))
print(len(test_ds))

5
2


In [16]:
from torch.utils.data import DataLoader

torch.manual_seed(123)

train_loader = DataLoader(
    dataset=train_ds, # serves as input to the data loader
    batch_size=2,
    shuffle=True, # whether to shuffle the data or not
    num_workers=0 # the number of background processes, 0 means data loading will be done in the main process and not in separate worker processes
)

test_loader = DataLoader(
    dataset=test_ds,
    batch_size=2,
    shuffle=False, # it is not necessary to shuffle a test dataset
    num_workers=0
)

In [17]:
for idx, (x, y) in enumerate(train_loader):
    print(f"Batch {idx + 1}:", x, y)

Batch 1: tensor([[ 2.3000, -1.1000],
        [-0.9000,  2.9000]]) tensor([1, 0])
Batch 2: tensor([[-1.2000,  3.1000],
        [-0.5000,  2.6000]]) tensor([0, 0])
Batch 3: tensor([[ 2.7000, -1.5000]]) tensor([1])


## A typical training loop

In [18]:
import torch.nn.functional as F

torch.manual_seed(123)

model = NeuralNetwork(
    num_inputs=2, # two features
    num_outputs=2 # two classes
)

optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.5
)

num_epochs = 3
for epoch in range(num_epochs):
    model.train() # put the model into training mode
    for batch_idx, (features, labels) in enumerate(train_loader):
        logits = model(features)
        loss = F.cross_entropy(logits, labels) # will internally apply softmax 

        # sets the gradients from the previous round to 0
        # to prevent unintended gradients accumulation
        optimizer.zero_grad() 
        # computes the gradient of the loss given the model parameters
        loss.backward()

        # the optimizer uses the gradients to update the model parameters
        optimizer.step()

        ### LOGGING
        print(f"Epoch: {epoch+1:03d}/{num_epochs:03d} "
              f" | Batch {batch_idx:03d}/{len(train_loader):03d} "
              f" | Train Loss: {loss:.2f}")

    model.eval() # put the model into evaluating mode


Epoch: 001/003  | Batch 000/003  | Train Loss: 0.75
Epoch: 001/003  | Batch 001/003  | Train Loss: 0.65
Epoch: 001/003  | Batch 002/003  | Train Loss: 0.42
Epoch: 002/003  | Batch 000/003  | Train Loss: 0.05
Epoch: 002/003  | Batch 001/003  | Train Loss: 0.13
Epoch: 002/003  | Batch 002/003  | Train Loss: 0.00
Epoch: 003/003  | Batch 000/003  | Train Loss: 0.01
Epoch: 003/003  | Batch 001/003  | Train Loss: 0.00
Epoch: 003/003  | Batch 002/003  | Train Loss: 0.02


In [19]:
# Exercise A.3
num_params = sum(p.numel() for p in model.parameters())
print(num_params)

752


In [20]:
model.eval()
with torch.no_grad():
    outputs = model(X_train)
print(outputs)

tensor([[ 2.9320, -4.2563],
        [ 2.6045, -3.8389],
        [ 2.1484, -3.2514],
        [-2.1461,  2.1496],
        [-2.5004,  2.5210]])


In [21]:
torch.set_printoptions(sci_mode=False)
probas = torch.softmax(outputs, dim=1)
print(probas)

tensor([[    0.9992,     0.0008],
        [    0.9984,     0.0016],
        [    0.9955,     0.0045],
        [    0.0134,     0.9866],
        [    0.0066,     0.9934]])


In [22]:
predictions = torch.argmax(probas, dim=1)
print(predictions)

tensor([0, 0, 0, 1, 1])


In [23]:
predictions == y_train

tensor([True, True, True, True, True])

In [24]:
def compute_accuracy(model, dataloader):

    model = model.eval()
    correct = 00
    total_examples = 0

    for idx, (features, labels) in enumerate(dataloader):
        # use the network without training or backpropagation
        with torch.no_grad():
            logits = model(features)
        predictions = torch.argmax(logits, dim=1)
        compare = labels == predictions
        correct += torch.sum(compare)
        total_examples += len(compare)
    
    return (correct / total_examples).item()


In [25]:
print(compute_accuracy(model, train_loader))

1.0


In [26]:
print(compute_accuracy(model, test_loader))

1.0


save model

In [27]:
torch.save(model.state_dict(), "model.pth")

fetch model

In [28]:
fetched_model = NeuralNetwork(2, 2)
fetched_model.load_state_dict(torch.load("model.pth"))

  fetched_model.load_state_dict(torch.load("model.pth"))


<All keys matched successfully>

### Use GPU

In [29]:
tensor_1 = torch.tensor([1., 2., 3.])
tensor_2 = torch.tensor([4., 5., 6.])
print(tensor_1 + tensor_2)

tensor([5., 7., 9.])


In [30]:
gpu_tensor_1 = tensor_1.to("cuda")
gpu_tensor_2 = tensor_2.to("cuda")
print(gpu_tensor_1 + gpu_tensor_2)

tensor([5., 7., 9.], device='cuda:0')


Training on GPU

In [31]:
torch.manual_seed(123)

model_for_gpu = NeuralNetwork(num_inputs=2, num_outputs=2)

device = torch.device("cuda")
model = model.to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=0.5)

num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    for batch_idx, (features, labels) in enumerate(train_loader):
        features, labels = features.to("cuda"), labels.to("cuda") # transfer data onto GPU
        logits = model(features)
        loss = F.cross_entropy(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"Epoch: {epoch+1:03d}/{num_epochs:03d}"
              f" | Batch {batch_idx:03d}/{len(train_loader):03d}"
              f" | Train/Val Loss: {loss:.2f}")
        
    model.eval()

Epoch: 001/003 | Batch 000/003 | Train/Val Loss: 0.00
Epoch: 001/003 | Batch 001/003 | Train/Val Loss: 0.01
Epoch: 001/003 | Batch 002/003 | Train/Val Loss: 0.01
Epoch: 002/003 | Batch 000/003 | Train/Val Loss: 0.01
Epoch: 002/003 | Batch 001/003 | Train/Val Loss: 0.00
Epoch: 002/003 | Batch 002/003 | Train/Val Loss: 0.00
Epoch: 003/003 | Batch 000/003 | Train/Val Loss: 0.00
Epoch: 003/003 | Batch 001/003 | Train/Val Loss: 0.00
Epoch: 003/003 | Batch 002/003 | Train/Val Loss: 0.01


Exercise 3.4

In [37]:
torch.manual_seed(10086)
mtx_a_cpu = torch.rand(32768, 32768)
mtx_b_cpu = torch.rand(32768, 32768)

In [38]:
mtx_c = mtx_a_cpu @ mtx_b_cpu

In [39]:
mtx_a_gpu = mtx_a_cpu.to("cuda")
mtx_b_gpu = mtx_b_cpu.to("cuda")

In [40]:
mtx_c_gpu = mtx_a_gpu @ mtx_b_gpu

In [36]:
mtx_c_gpu

tensor([[2018.4880, 2041.4791, 2031.8149,  ..., 2018.2648, 2035.3295,
         2052.8489],
        [2026.6726, 2035.2216, 2020.0175,  ..., 2012.9485, 2037.9982,
         2056.8469],
        [2045.0924, 2060.9529, 2056.0767,  ..., 2046.9811, 2061.7751,
         2085.5342],
        ...,
        [2060.3120, 2069.1853, 2045.5758,  ..., 2050.2156, 2064.4539,
         2078.9946],
        [2034.2317, 2038.7274, 2038.5509,  ..., 2036.5804, 2029.4133,
         2063.7549],
        [2029.4775, 2060.0444, 2042.2656,  ..., 2056.8511, 2057.7588,
         2065.4546]], device='cuda:0')

In [41]:
mtx_c

tensor([[8200.2666, 8198.6777, 8181.6396,  ..., 8239.8711, 8225.9951,
         8229.5234],
        [8213.9707, 8196.9639, 8194.6924,  ..., 8226.7373, 8201.8389,
         8220.6729],
        [8189.1118, 8189.0659, 8176.8589,  ..., 8219.3652, 8198.1172,
         8233.2197],
        ...,
        [8212.8291, 8249.0547, 8207.3115,  ..., 8274.4609, 8256.0996,
         8238.7441],
        [8201.0283, 8193.9648, 8179.9829,  ..., 8217.0137, 8235.7051,
         8193.2402],
        [8239.1836, 8245.8057, 8239.6807,  ..., 8268.0664, 8264.3438,
         8227.7705]])

In [42]:
mtx_a_gpu

tensor([[0.3635, 0.8312, 0.2289,  ..., 0.0577, 0.9722, 0.2364],
        [0.2871, 0.4722, 0.9638,  ..., 0.4326, 0.9892, 0.2872],
        [0.0661, 0.0838, 0.1178,  ..., 0.8146, 0.9830, 0.7482],
        ...,
        [0.8237, 0.6000, 0.8225,  ..., 0.4243, 0.5663, 0.9754],
        [0.6306, 0.7463, 0.7954,  ..., 0.4027, 0.3638, 0.1379],
        [0.1886, 0.0503, 0.6127,  ..., 0.4108, 0.7075, 0.6877]],
       device='cuda:0')

In [43]:
mtx_a_cpu

tensor([[0.3635, 0.8312, 0.2289,  ..., 0.0577, 0.9722, 0.2364],
        [0.2871, 0.4722, 0.9638,  ..., 0.4326, 0.9892, 0.2872],
        [0.0661, 0.0838, 0.1178,  ..., 0.8146, 0.9830, 0.7482],
        ...,
        [0.8237, 0.6000, 0.8225,  ..., 0.4243, 0.5663, 0.9754],
        [0.6306, 0.7463, 0.7954,  ..., 0.4027, 0.3638, 0.1379],
        [0.1886, 0.0503, 0.6127,  ..., 0.4108, 0.7075, 0.6877]])

In [None]:
torch.manual_seed(10086)
mtx_a_cpu = torch.rand(32768, 32768)
mtx_b_cpu = torch.rand(32768, 32768)
mtx_c_cpu = mtx_a_cpu @ mtx_b_cpu

mtx_a_gpu = mtx_a_cpu.to("cuda")
mtx_b_gpu = mtx_b_cpu.to("cuda")
mtx_c_gpu = mtx_a_gpu @ mtx_b_gpu

In [44]:
torch.manual_seed(10086)
torch.cuda.manual_seed_all(10086)

mtx_a_cpu = torch.rand(3, 3, dtype=torch.float32)
mtx_b_cpu = torch.rand(3, 3, dtype=torch.float32)
mtx_c_cpu = mtx_a_cpu @ mtx_b_cpu

mtx_a_gpu = mtx_a_cpu.to("cuda")
mtx_b_gpu = mtx_b_cpu.to("cuda")
mtx_c_gpu = mtx_a_gpu @ mtx_b_gpu

print("CPU result:", mtx_c_cpu)
print("GPU result:", mtx_c_gpu.cpu())

CPU result: tensor([[0.3911, 1.1798, 0.5918],
        [0.7147, 1.5514, 0.8035],
        [0.2992, 1.3315, 0.6223]])
GPU result: tensor([[0.3911, 1.1798, 0.5918],
        [0.7147, 1.5514, 0.8035],
        [0.2992, 1.3315, 0.6223]])
