In [94]:
#%pip install torch torchvision


In [95]:
import torch

x = torch.tensor([1,2,3]) # a tensor is an n-dimensional array; here a 1-D tensor built from a Python list
x

tensor([1, 2, 3])

In [96]:
random_tensor = torch.rand(2,3) # random tensor, 2 rows, 3 columns; no seed is set here, so the values differ on every run
random_tensor

tensor([[0.0152, 0.3439, 0.5670],
        [0.1282, 0.0977, 0.2425]])

In [97]:
zeros_tensor = torch.zeros(2,3) # 2 rows, 3 columns, every entry 0.
zeros_tensor

tensor([[0., 0., 0.],
        [0., 0., 0.]])

In [98]:
ones_tensor = torch.ones(2,3) # 2 rows, 3 columns, every entry 1.
ones_tensor

tensor([[1., 1., 1.],
        [1., 1., 1.]])

In [99]:
# Broadcasting: x has 3 elements and is added to each of the 2 rows of ones_tensor.
# x holds integers, ones_tensor holds floats -> the result is a float tensor.
x + ones_tensor

tensor([[2., 3., 4.],
        [2., 3., 4.]])

In [100]:
# Two 2x2 matrices; the single float literal (1. / 5.) makes each whole tensor floating point
a = torch.tensor([[1.,2], [3,4]])
b = torch.tensor([[5.,6], [7,8]])
a @ b # matrix product (rows of a times columns of b), same as torch.matmul(a,b)

tensor([[19., 22.],
        [43., 50.]])

In [101]:
# Elementwise product: entries at the same position are multiplied (unlike the matrix product a @ b)
a*b

tensor([[ 5., 12.],
        [21., 32.]])

In [102]:
# Sum over all entries of a: 1 + 2 + 3 + 4
a.sum()

tensor(10.)

In [103]:
# Mean over all entries of a: (1 + 2 + 3 + 4) / 4
a.mean()

tensor(2.5000)

In [104]:
# Compute backends: cpu, cuda (NVIDIA GPU), mps (Apple Silicon) and possibly others.
#
# Choose the best backend that is actually available on this machine instead of
# hard-coding one: CUDA first, then MPS, and the CPU as the universal fallback.
# torch.device(...) only creates a descriptor, so a hard-coded "mps" would not
# fail here but only later, when a tensor or model is moved to that device.

if torch.cuda.is_available():
    device = torch.device("cuda") # for CUDA
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    # getattr guard: older PyTorch builds have no torch.backends.mps module
    device = torch.device("mps") # for Apple Silicon
else:
    device = torch.device("cpu") # for ordinary cpu

In [105]:
# Gradients via automatic differentiation (autograd)

# requires_grad=True: record every operation on x so gradients can be propagated back to it
x = torch.tensor(2.0, requires_grad=True)
y = 5 + 3*x + x**2 # y(2) = 15
# analytic check: dy/dx = 2*x + 3, which is 2*2 + 3 = 7 at x = 2
y.backward() # backpropagation: stores dy/dx in x.grad
x.grad

tensor(7.)

- A minimal neural network: one linear layer and a single gradient-descent step

In [106]:
import torch.nn as nn # Neural Networks 
import torch.optim as optim # optimizer framework for gradient methods 

# 10 inputs, 1 output -> fully connected feed-forward neural network
# NOTE: the name NN is rebound to a class in the FashionMNIST section further down.
NN = nn.Linear(10,1) # W*x + b, in Tensorflow -> Dense
# MLP -> Multi-Layer Perceptron
# in Literature/Publications: FC, FFN, FFNN, MLP

loss = nn.MSELoss() # mean squared error
# loss, cost, criterion, crit
optimizer = optim.SGD(NN.parameters(), lr = 1e-2)
# lr = learning rate, eta, alpha
# NN.parameters() ... weights and biases

input_data = torch.rand(10) # random stuff, X
output = NN(input_data) # y_pred, predictions, y_hat
y = torch.ones(1) # ground truth, target, regr

# initial value of the loss function
# PyTorch losses are called as loss(prediction, target); keep that order everywhere
loss_output = loss(output, y) # Difference between reality and expectation
print(f"Loss: {loss_output:.2} (before update)")

# the core of every training step
optimizer.zero_grad() # clear old gradients (backward() accumulates into .grad)
loss_output.backward() # compute gradients
optimizer.step() # Update the weights and biases

output_new = NN(input_data) # prediction with the updated weights and biases
loss_new = loss(output_new, y) # value of the loss function after the update
print(f"Loss: {loss_new:.2} (after update)")


Loss: 1.7 (before update)
Loss: 1.4 (after update)


In [107]:
y # ground-truth target used in the loss above

tensor([1.])

In [108]:
output # prediction before the update step

tensor([-0.3046], grad_fn=<ViewBackward0>)

In [109]:
output_new # prediction after one optimizer step; compare with `output` and the target `y`

tensor([-0.1997], grad_fn=<ViewBackward0>)

# FashionMNIST dataset 
- the modern "Hello World" dataset for neural networks (published by Zalando)

In [110]:
from torchvision import datasets, transforms

# Preprocessing pipeline: a single step that converts each image to a tensor
transform = transforms.Compose([transforms.ToTensor()])

# Training split: downloaded into ./data if missing, served in shuffled mini-batches of 64
train_dataset = datasets.FashionMNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
)

# Test split: same folder and preprocessing, fixed order (no shuffling)
test_dataset = datasets.FashionMNIST(
    root='./data',
    train=False,
    download=True,
    transform=transform,
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=False,
)

In [117]:
# Parameter count of the three Linear layers of the model defined below
# (weights + biases for 784->128, 128->64 and 64->10); BatchNorm parameters are not included
28*28*128 + 128 + 128*64 + 64 + 64*10+10 

109386

In [118]:
# First Linear layer alone (784*128 weights + 128 biases): it holds most of the parameters
28*28*128 + 128

100480

In [112]:
# Model 

class NN(nn.Module):
    """Fully connected classifier for 28x28 images with 10 output classes.

    Architecture: 784 -> 128 -> 64 -> 10. Each hidden layer is followed by
    BatchNorm, ReLU and Dropout. The last layer returns raw logits (no
    softmax), which is what nn.CrossEntropyLoss expects.

    Args:
        preactivation: order of BatchNorm and ReLU in the hidden layers.
            False (default): Linear -> ReLU -> BatchNorm ("post-activation").
            True:            Linear -> BatchNorm -> ReLU ("pre-activation").
            The default matches the variant that was effectively in use
            before, when two `forward` methods were defined and the second
            one silently replaced the first.
        dropout: probability of zeroing an activation during training
            (typical values to try: 0.1 ... 0.5).
    """

    def __init__(self, preactivation=False, dropout=0.3):
        super().__init__() # init from the superclass
        # plain Python attribute, not a parameter: does not change the state_dict
        self.preactivation = preactivation
        self.layer1 = nn.Linear(28*28, 128) # input layer
        self.bn1 = nn.BatchNorm1d(128)
        self.layer2 = nn.Linear(128, 64) # hidden layer
        self.bn2 = nn.BatchNorm1d(64)
        self.layer3 = nn.Linear(64, 10) # output layer, 10 product categories
        self.drop = nn.Dropout(dropout) # this fraction of activations is set to zero

    def _hidden(self, x, layer, bn):
        """Apply one hidden block: Linear, then BatchNorm/ReLU in the configured order, then Dropout."""
        z = layer(x)
        if self.preactivation:
            h = torch.relu(bn(z)) # normalize first, then activate
        else:
            h = bn(torch.relu(z)) # activate first, then normalize
        return self.drop(h) # dropout is placed after activation/normalization

    def forward(self, x):
        """Propagate a batch of images through the network and return logits of shape (batch, 10)."""
        x = x.view(-1, 28*28) # flatten 2D -> 1D
        x = self._hidden(x, self.layer1, self.bn1)
        x = self._hidden(x, self.layer2, self.bn2)
        return self.layer3(x) # identity activation -> logits


In [113]:
# Now training

model = NN()
lr = 1e-2 # learning rate
loss = nn.CrossEntropyLoss() # CE because multi-class problem
optimizer = optim.SGD(model.parameters(), lr = lr )

n_epochs = 20

for epoch in range(n_epochs):
    model.train() # train mode
    running_loss = 0.0 # sum of the per-batch losses over one epoch
    for images, labels in train_loader:
        optimizer.zero_grad() # reset the gradients
        # forward
        outputs = model(images) # calculate outputs
        curr_loss = loss(outputs,labels)
        # backward
        curr_loss.backward() # gradients
        optimizer.step()     # update weights and biases
        # .item() converts the loss to a plain Python float. Adding the tensor
        # itself would make running_loss a tensor that carries autograd history
        # for every batch of the epoch.
        running_loss += curr_loss.item()
    print(f"Epoch[{epoch + 1}/{n_epochs}], Loss: {running_loss}")


Epoch[1/20], Loss: 671.2218017578125
Epoch[2/20], Loss: 482.37066650390625
Epoch[3/20], Loss: 442.262451171875
Epoch[4/20], Loss: 424.709716796875
Epoch[5/20], Loss: 401.94439697265625
Epoch[6/20], Loss: 392.5033264160156
Epoch[7/20], Loss: 383.63427734375
Epoch[8/20], Loss: 379.515869140625
Epoch[9/20], Loss: 372.1685485839844
Epoch[10/20], Loss: 370.48175048828125
Epoch[11/20], Loss: 369.0382385253906
Epoch[12/20], Loss: 368.524658203125
Epoch[13/20], Loss: 364.98260498046875
Epoch[14/20], Loss: 354.603759765625
Epoch[15/20], Loss: 354.80328369140625
Epoch[16/20], Loss: 356.435302734375
Epoch[17/20], Loss: 353.4370422363281
Epoch[18/20], Loss: 347.3802795410156
Epoch[19/20], Loss: 353.01104736328125
Epoch[20/20], Loss: 347.5052185058594


In [114]:
len(train_dataset) # number of training samples

60000

In [115]:
len(test_dataset) # number of test samples

10000

In [116]:
# Evaluation

# Evaluation mode changes the model's behaviour, it is not just an optimization:
# Dropout is switched off and BatchNorm uses its running statistics.
model.eval()
correct = 0
total = 0

with torch.no_grad(): # no gradients needed for inference
    for images, labels in test_loader:
        outputs = model(images) # logits, one row per image
        # predicted class = index of the largest logit in each row
        predicted = torch.max(outputs, dim=1).indices
        total += labels.size(0) # number of images in this batch
        correct += (predicted==labels).sum().item()

accuracy = correct/total
print(f"Accuracy: {accuracy*100:.2f}%")



Accuracy: 86.57%
