In [264]:
%pip install torch torchvision

Note: you may need to restart the kernel to use updated packages.


In [265]:
import torch 

x = torch.tensor([1,2,3]) # n-dimensional array (similar to numpy)
x

tensor([1, 2, 3])

In [266]:
random_tensor = torch.rand(2,3) # random numbers (2 rows, 3 columns)
random_tensor

tensor([[0.2438, 0.1922, 0.6897],
        [0.2417, 0.7114, 0.3537]])

In [267]:
zeros_tensor = torch.zeros(2,3)
zeros_tensor

tensor([[0., 0., 0.],
        [0., 0., 0.]])

In [268]:
ones_tensor = torch.ones(2,3)
ones_tensor

tensor([[1., 1., 1.],
        [1., 1., 1.]])

In [269]:
x + ones_tensor # addition with broadcasting 

tensor([[2., 3., 4.],
        [2., 3., 4.]])

In [270]:
# Matrix multiplication

a = torch.tensor([[1.,2],[3,4]])
b = torch.tensor([[5.,6],[7,8]])

a @ b # or torch.matmul(a,b)


tensor([[19., 22.],
        [43., 50.]])

In [271]:
a * b # elementwise 
a.sum() # sum
a.mean() # mean

tensor(2.5000)

In [272]:
# cpu, cuda, mps and possible 
device = torch.device("cpu")
# device = torch.device("mps")
# device = torch.device("cuda")

In [273]:
# Gradients 

x = torch.tensor(2., requires_grad=True) # mode: propagate the gradients over all computation steps
y = x**2 + 3*x +5
# 2*x + 3 -> 2*2.0 +3 = 7
y.backward() # compute gradients using backpropagation algorithm 
x.grad

tensor(7.)

# simple NN and one gradient step

In [274]:
import torch.nn as nn # Neural Networks
import torch.optim as optim # optimizer framework for gradient methods 

# 10 inputs, 1 output -> fully connected feed-forward neural network 
NN = nn.Linear(10,1) # W*x +b, in Tensorflow -> Dense 
# MLP -> Multi-Layer Perceptron 
# in Literature/Publication: FC, FFN, FFNN, MLP 

loss = nn.MSELoss() # mean squared error
# loss, cost, criterion, crit 
optimizer = optim.SGD(NN.parameters(), lr = 1e-2)
# lr = learning rate, eta, alpha 
# NN.parameters() ... weights and biases 

input_data = torch.rand(10) # random data, X
output = NN(input_data) # y_pred, predictions, y_hat
y = torch.ones(1) # ground truth, label, target, regr 

# initial value of the loss function 
loss_output = loss(y, output) # Difference between reality (y) and expectation (output)
print(f"Loss: {loss_output:.2f} (before update)")

# two magical lines 
loss_output.backward() # compute gradients 
optimizer.step() # Update weights and biases 

output_new = NN(input_data) # here are new weights and biases 
loss_new = loss(output_new, y) # value of the loss function after the update
print(f"Loss: {loss_new:.2f} (after update)")



Loss: 0.95 (before update)
Loss: 0.78 (after update)


In [275]:
output # initial output


tensor([0.0269], grad_fn=<ViewBackward0>)

In [276]:
output_new

tensor([0.1163], grad_fn=<ViewBackward0>)

# FashionMNIST Dataset
modern HelloWorld for deep NNs (by Zalando)

In [277]:
# Precalculation of mean and std for FashioMNIST (training set)
from torchvision import datasets, transforms
train_dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transforms.ToTensor())
loader = torch.utils.data.DataLoader(train_dataset, batch_size=1024, shuffle=False)

mean = 0.0
std = 0.0 
num_samples = 0

for images, _ in loader:
    images = images.view(images.size(0),-1) # better 
    mean += images.mean(dim=1).sum()
    std += images.std(dim=1).sum()
    num_samples += images.size(0) # number of images

mean /= num_samples
std /= num_samples
mean, std


(tensor(0.2860), tensor(0.3205))

In [278]:
from torchvision import datasets, transforms

transform = transforms.Compose([
    transforms.ToTensor(), # convert to tensor,
    transforms.Normalize((0.2860,), (0.3530,)), # mean, standard deviation  (x - mean) / std for each image x 
    transforms.RandomHorizontalFlip(p=0.5), # Data Augmentation 
    transforms.RandomAffine(
        degrees=10,
        translate=(0.05, 0.05),
        scale=(0.95, 1.05)
    ),
]
)



train_dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)



In [2]:
28*28*128 +128

100480

In [279]:
# Model 

class NN(nn.Module): 
    def __init__(self):
        super(NN,self).__init__() # init from the superclass
        self.layer1 = nn.Linear(28*28, 128 ) # 28x28 pixel pictures
        self.bn1 = nn.BatchNorm1d(128)
        self.layer2 = nn.Linear(128, 64) # hidden layer 
        self.bn2 = nn.BatchNorm1d(64) 
        self.layer3 = nn.Linear(64, 10) # output lyaer, 10 = number of product categories 
        self.drop = nn.Dropout(0.3) # 30 % of activations are set to zero 
        # 0.1 -> 0.2 ... -> 0.5 
    
    def forward(self,x):
        x = x.view(-1, 28*28) # flatten 2D -> 1D 
        x = torch.relu(self.bn1(self.layer1(x)))  # before activation BatchNorm
        x = self.drop(x)
        x = torch.relu(self.bn2(self.layer2(x)))
        x = self.drop(x) # here dropout is usually placed  
        x = self.layer3(x) # identity activation -> logit (Pytorch optimization) 
        # there is no need to compute gradients of softmax 
        return x
    # alternative 
    def forward(self,x):
        x = x.view(-1, 28*28) # flatten 2D -> 1D 
        x = torch.relu(self.layer1(x))  # after activation BatchNorm
        x = self.bn1(x)
        x = self.drop(x)
        x = torch.relu(self.layer2(x))
        x = self.bn2(x)
        x = self.drop(x) # here dropout is usually placed  
        x = self.layer3(x) # identity activation -> logit (Pytorch optimization) 
        # there is no need to compute gradients of softmax 
        return x

In [280]:
# Now training

model = NN()
lr = 1e-3 # learning rate
loss = nn.CrossEntropyLoss() # because multi-class problem
optimizer = optim.SGD(model.parameters(), lr=lr)

n_epochs = 40

for epoch in range(n_epochs):
    model.train() # training mode 
    running_loss = 0.0 # loss per epoch 
    for images, labels in train_loader:
        optimizer.zero_grad() # reset the gradient 
        # forward step 
        outputs = model(images) # propagate the images
        curr_loss = loss(outputs, labels) 
        running_loss += curr_loss
        # backward
        curr_loss.backward() # gradients
        optimizer.step() # update weights and biases
    
    print(f"Epoch[{epoch +1} / {n_epochs}], Loss: {running_loss}")

Epoch[1 / 40], Loss: 1398.2091064453125
Epoch[2 / 40], Loss: 1009.6746826171875
Epoch[3 / 40], Loss: 889.46533203125
Epoch[4 / 40], Loss: 825.4232788085938
Epoch[5 / 40], Loss: 782.2587280273438
Epoch[6 / 40], Loss: 751.6229858398438
Epoch[7 / 40], Loss: 730.1636962890625
Epoch[8 / 40], Loss: 712.046630859375
Epoch[9 / 40], Loss: 695.0664672851562
Epoch[10 / 40], Loss: 681.6971435546875
Epoch[11 / 40], Loss: 672.7581176757812
Epoch[12 / 40], Loss: 666.9666748046875
Epoch[13 / 40], Loss: 656.494873046875
Epoch[14 / 40], Loss: 648.0926513671875
Epoch[15 / 40], Loss: 640.3860473632812
Epoch[16 / 40], Loss: 637.568359375
Epoch[17 / 40], Loss: 627.7593383789062
Epoch[18 / 40], Loss: 624.2337036132812
Epoch[19 / 40], Loss: 619.9874267578125
Epoch[20 / 40], Loss: 611.7109985351562
Epoch[21 / 40], Loss: 614.1640014648438
Epoch[22 / 40], Loss: 604.67724609375
Epoch[23 / 40], Loss: 600.8709106445312
Epoch[24 / 40], Loss: 599.1452026367188
Epoch[25 / 40], Loss: 593.6200561523438
Epoch[26 / 40], L

In [281]:
len(train_dataset) # size of dataset

60000

In [282]:
len(test_dataset) # size of test dataset

10000

In [283]:
# Evaluation 


model.eval() # setting the model to evaluation mode (implementation optimization)
correct = 0
total = 0

with torch.no_grad(): # we are not interested in gradient anymore 
    for images, labels in test_loader:
        outputs = model(images) # logits
        predicted = torch.max(outputs.data, 1)[-1] # argmax-> at which class we have maximum logit 
        total += labels.size(0) 
        correct += (predicted==labels).sum().item()

accuracy = correct/total
print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: 80.60%
