<a href="https://colab.research.google.com/github/databyhuseyn/DeepLearning/blob/main/Introduction_to_Deep_Learning_with_PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [154]:
import torch

In [155]:
A = torch.tensor([
    [1.0, 4.0, 7.0],
     [2.0, 3.0, 6.0]
])

In [156]:
A

tensor([[1., 4., 7.],
        [2., 3., 6.]])

In [157]:
X = torch.tensor([
    [1.0, 4.0, 7.0],
     [2.0, 3.0, 6.0]
], dtype=torch.bfloat16)

In [158]:
X

tensor([[1., 4., 7.],
        [2., 3., 6.]], dtype=torch.bfloat16)

In [159]:
X.shape

torch.Size([2, 3])

In [160]:
X.dtype

torch.bfloat16

In [161]:
X[0, 1]

tensor(4., dtype=torch.bfloat16)

In [162]:
X[:, 1]

tensor([4., 3.], dtype=torch.bfloat16)

In [163]:
X.abs()

tensor([[1., 4., 7.],
        [2., 3., 6.]], dtype=torch.bfloat16)

In [164]:
torch.abs(X)

tensor([[1., 4., 7.],
        [2., 3., 6.]], dtype=torch.bfloat16)

In [165]:
# torch.cos(), torch.exp(), torch.max(), torch.mean(), torch.sqrt()

In [166]:
X.sqrt()

tensor([[1.0000, 2.0000, 2.6406],
        [1.4141, 1.7344, 2.4531]], dtype=torch.bfloat16)

In [167]:
10 * (X + 1.0)

tensor([[20., 50., 80.],
        [30., 40., 70.]], dtype=torch.bfloat16)

In [168]:
X.exp()

tensor([[   2.7188,   54.5000, 1096.0000],
        [   7.3750,   20.1250,  404.0000]], dtype=torch.bfloat16)

In [169]:
X.mean()

tensor(3.8281, dtype=torch.bfloat16)

In [170]:
X.max(dim=0) #, keepdim=True

torch.return_types.max(
values=tensor([2., 4., 7.], dtype=torch.bfloat16),
indices=tensor([1, 0, 0]))

In [171]:
X

tensor([[1., 4., 7.],
        [2., 3., 6.]], dtype=torch.bfloat16)

In [172]:
X @ X.T

tensor([[66., 56.],
        [56., 49.]], dtype=torch.bfloat16)

In [173]:
import numpy as np

In [174]:
A.numpy()

array([[1., 4., 7.],
       [2., 3., 6.]], dtype=float32)

In [175]:
torch.tensor(np.array([[1., 4., 7.],
                      [2., 3., 6.]]))

tensor([[1., 4., 7.],
        [2., 3., 6.]], dtype=torch.float64)

In [176]:
# It’s generally better to use 32 bits in deep learning because this takes half the RAM and speeds up computations,
# and neural nets do not actually need the extra precision offered by 64-bit floats.

In [177]:
N = torch.FloatTensor(np.array([            # automatically converts the array to 32 bits
                            [1., 4., 7.],
                             [2., 3., 6]]))
N

tensor([[1., 4., 7.],
        [2., 3., 6.]])

In [178]:
N.dtype

torch.float32

In [179]:
X[:, 1] = -99
X

tensor([[  1., -99.,   7.],
        [  2., -99.,   6.]], dtype=torch.bfloat16)

In [180]:
X.relu()

tensor([[1., 0., 7.],
        [2., 0., 6.]], dtype=torch.bfloat16)

In [181]:
X

tensor([[  1., -99.,   7.],
        [  2., -99.,   6.]], dtype=torch.bfloat16)

In [182]:
X.relu_()

tensor([[1., 0., 7.],
        [2., 0., 6.]], dtype=torch.bfloat16)

In [183]:
X    # _ changed in place

tensor([[1., 0., 7.],
        [2., 0., 6.]], dtype=torch.bfloat16)

In [184]:
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [185]:
device

'cuda'

In [186]:
!nvidia-smi

Mon Jan 26 20:56:08 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   77C    P0             33W /   70W |     178MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [187]:
M = torch.tensor([
    [1., 2., 3.],
    [4., 5., 6.]
])
M = M.to(device)

In [188]:
M

tensor([[1., 2., 3.],
        [4., 5., 6.]], device='cuda:0')

In [189]:
M.device

device(type='cuda', index=0)

In [190]:
M = torch.tensor([[1., 2., 3.], [4., 5., 6.]], device=device)

In [191]:
R = M @ M.T

In [192]:
R

tensor([[14., 32.],
        [32., 77.]], device='cuda:0')

In [193]:
M = torch.rand((1000, 1000))  # on the CPU
%timeit M @ M.T


M = torch.rand((1000, 1000), device="cuda")  # on the GPU
%timeit M @ M.T

# now try 100 by 100 :D

16.1 ms ± 3.09 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
694 µs ± 15.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [194]:
x = torch.tensor(5.0, requires_grad=True)
f = x ** 2
f

tensor(25., grad_fn=<PowBackward0>)

In [195]:
f.backward()
x.grad

tensor(10.)

In [196]:
learning_rate = 0.1
with torch.no_grad():
    x -= learning_rate * x.grad  # gradient descent step

In [197]:
x

tensor(4., requires_grad=True)

In [198]:
x_detached = x.detach()
x_detached -= learning_rate * x.grad   # detach yaddashda eyni yerdedi

In [199]:
x_detached

tensor(3.)

In [200]:
x.grad.zero_()

tensor(0.)

In [201]:
learning_rate = 0.1
x = torch.tensor(5.0, requires_grad=True)
for iteration in range(100):
    f = x ** 2  # forward pass
    f.backward()  # backward pass
    with torch.no_grad():
        x -= learning_rate * x.grad  # gradient descent step
    print(x)
    x.grad.zero_()  # reset the gradients

tensor(4., requires_grad=True)
tensor(3.2000, requires_grad=True)
tensor(2.5600, requires_grad=True)
tensor(2.0480, requires_grad=True)
tensor(1.6384, requires_grad=True)
tensor(1.3107, requires_grad=True)
tensor(1.0486, requires_grad=True)
tensor(0.8389, requires_grad=True)
tensor(0.6711, requires_grad=True)
tensor(0.5369, requires_grad=True)
tensor(0.4295, requires_grad=True)
tensor(0.3436, requires_grad=True)
tensor(0.2749, requires_grad=True)
tensor(0.2199, requires_grad=True)
tensor(0.1759, requires_grad=True)
tensor(0.1407, requires_grad=True)
tensor(0.1126, requires_grad=True)
tensor(0.0901, requires_grad=True)
tensor(0.0721, requires_grad=True)
tensor(0.0576, requires_grad=True)
tensor(0.0461, requires_grad=True)
tensor(0.0369, requires_grad=True)
tensor(0.0295, requires_grad=True)
tensor(0.0236, requires_grad=True)
tensor(0.0189, requires_grad=True)
tensor(0.0151, requires_grad=True)
tensor(0.0121, requires_grad=True)
tensor(0.0097, requires_grad=True)
tensor(0.0077, requires_

In [202]:
t = torch.tensor(2.0, requires_grad=True)
z = t.exp()  # this is an intermediate result
z += 1  # this is an in-place operation
z.backward()  #  RuntimeError!

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor []], which is output 0 of ExpBackward0, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

In [203]:
# but

t = torch.tensor(2.0, requires_grad=True)
z = t.exp()  # this is an intermediate result
z = z + 1  # new tensor created, and previous saved for computation graph
z.backward()  #  RuntimeError!

# make sure you don’t modify in place a tensor needed for backprop

In [204]:
import sklearn

In [205]:
X, y = sklearn.datasets.fetch_california_housing(return_X_y=True)

In [206]:
X_train_full, X_test, y_train_full, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2)

In [207]:
X_train, X_valid, y_train, y_valid = sklearn.model_selection.train_test_split(X_train_full, y_train_full, test_size=0.2)

In [208]:
X_train.shape, X_valid.shape, X_test.shape

((13209, 8), (3303, 8), (4128, 8))

In [209]:
X_train = torch.FloatTensor(X_train)
X_valid = torch.FloatTensor(X_valid)
X_test = torch.FloatTensor(X_test)
means = X_train.mean(dim=0, keepdims=True)
stds = X_train.std(dim=0, keepdims=True)
X_train = (X_train - means) / stds
X_valid = (X_valid - means) / stds
X_test = (X_test - means) / stds

In [210]:
y_train = torch.FloatTensor(y_train).reshape(-1, 1)
y_valid = torch.FloatTensor(y_valid).reshape(-1, 1)
y_test = torch.FloatTensor(y_test).reshape(-1, 1)


In [211]:
torch.manual_seed(42)
n_features = X_train.shape[1]
w = torch.randn((n_features, 1), requires_grad=True)
b = torch.tensor(0., requires_grad=True)

In [212]:
learning_rate = 0.4
n_epochs = 20
for epoch in range(n_epochs):
  y_pred = X_train @ w + b
  loss = ((y_pred - y_train) ** 2).mean()
  loss.backward()
  with torch.no_grad():
    b -= learning_rate * b.grad
    w -= learning_rate * w.grad
    b.grad.zero_()
    w.grad.zero_()
  print(f'Epoch {epoch+1}/{n_epochs}, Loss: {loss.item()}')



Epoch 1/20, Loss: 16.09038543701172
Epoch 2/20, Loss: 4.722507953643799
Epoch 3/20, Loss: 2.1414694786071777
Epoch 4/20, Loss: 1.2615543603897095
Epoch 5/20, Loss: 0.9286397695541382
Epoch 6/20, Loss: 0.7917249798774719
Epoch 7/20, Loss: 0.7280034422874451
Epoch 8/20, Loss: 0.6928783059120178
Epoch 9/20, Loss: 0.6697766780853271
Epoch 10/20, Loss: 0.6523757576942444
Epoch 11/20, Loss: 0.6381605863571167
Epoch 12/20, Loss: 0.626054048538208
Epoch 13/20, Loss: 0.6155350804328918
Epoch 14/20, Loss: 0.606306791305542
Epoch 15/20, Loss: 0.5981702208518982
Epoch 16/20, Loss: 0.5909748077392578
Epoch 17/20, Loss: 0.5845981240272522
Epoch 18/20, Loss: 0.5789371132850647
Epoch 19/20, Loss: 0.5739034414291382
Epoch 20/20, Loss: 0.569420576095581


In [213]:
X_new = X_test[:3]
with torch.no_grad():
  y_pred = X_new @ w + b

y_pred

tensor([[1.8252],
        [2.0052],
        [0.9490]])

In [214]:
y_test[:3]

tensor([[1.9910],
        [1.7250],
        [0.3750]])

In [215]:
import torch.nn as nn

In [216]:
torch.manual_seed(42)
model = nn.Linear(in_features = n_features, out_features=1)

In [217]:
model.bias

Parameter containing:
tensor([0.3117], requires_grad=True)

In [218]:
model.weight

Parameter containing:
tensor([[ 0.2703,  0.2935, -0.0828,  0.3248, -0.0775,  0.0713, -0.1721,  0.2076]],
       requires_grad=True)

In [219]:
model.parameters()

<generator object Module.parameters at 0x7c02545ed460>

In [220]:
next(model.parameters())

Parameter containing:
tensor([[ 0.2703,  0.2935, -0.0828,  0.3248, -0.0775,  0.0713, -0.1721,  0.2076]],
       requires_grad=True)

In [221]:
for param in model.parameters():
  print(param)

Parameter containing:
tensor([[ 0.2703,  0.2935, -0.0828,  0.3248, -0.0775,  0.0713, -0.1721,  0.2076]],
       requires_grad=True)
Parameter containing:
tensor([0.3117], requires_grad=True)


In [222]:
for name, param in model.named_parameters():
  print(f'{name} | {param}')

weight | Parameter containing:
tensor([[ 0.2703,  0.2935, -0.0828,  0.3248, -0.0775,  0.0713, -0.1721,  0.2076]],
       requires_grad=True)
bias | Parameter containing:
tensor([0.3117], requires_grad=True)


In [223]:
model(X_train[:2])

tensor([[0.3336],
        [0.5138]], grad_fn=<AddmmBackward0>)

In [224]:
y_train[:2]

tensor([[1.4750],
        [1.0630]])

In [225]:
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)
mse = nn.MSELoss()

In [226]:
model(X_train[:2])

tensor([[0.3336],
        [0.5138]], grad_fn=<AddmmBackward0>)

In [227]:
def train_bgd(model, optimzier, criterion, X_train, y_train, n_epochs):
  for epoch in range(n_epochs):
    y_pred = model(X_train)
    loss = criterion(y_pred, y_train)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    print(f'Epoch {epoch+1}/{n_epochs}, Loss: {loss.item()}')

In [228]:
train_bgd(model, optimizer, mse, X_train, y_train, n_epochs)

Epoch 1/20, Loss: 4.297671318054199
Epoch 2/20, Loss: 0.7803131341934204
Epoch 3/20, Loss: 0.6259387731552124
Epoch 4/20, Loss: 0.6062909960746765
Epoch 5/20, Loss: 0.5956014394760132
Epoch 6/20, Loss: 0.587176501750946
Epoch 7/20, Loss: 0.5800814032554626
Epoch 8/20, Loss: 0.5739718079566956
Epoch 9/20, Loss: 0.56866055727005
Epoch 10/20, Loss: 0.564021646976471
Epoch 11/20, Loss: 0.5599592924118042
Epoch 12/20, Loss: 0.5563951134681702
Epoch 13/20, Loss: 0.5532633662223816
Epoch 14/20, Loss: 0.5505075454711914
Epoch 15/20, Loss: 0.5480791926383972
Epoch 16/20, Loss: 0.5459365844726562
Epoch 17/20, Loss: 0.5440431833267212
Epoch 18/20, Loss: 0.5423677563667297
Epoch 19/20, Loss: 0.5408830642700195
Epoch 20/20, Loss: 0.53956538438797


In [229]:
X_new = X_test[:3]

with torch.no_grad():
  y_pred = model(X_new)

y_pred

tensor([[1.8661],
        [2.0614],
        [0.9104]])

In [230]:
y_test[:3]

tensor([[1.9910],
        [1.7250],
        [0.3750]])

In [231]:
torch.manual_seed(42)
model = nn.Sequential(
    nn.Linear(n_features,50),
    nn.ReLU(),
    nn.Linear(50, 40),
    nn.ReLU(),
    nn.Linear(40, 1)
)

In [232]:
learning_rate = 0.1
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
mse = nn.MSELoss()
train_bgd(model, optimizer, mse, X_train, y_train, n_epochs)

Epoch 1/20, Loss: 5.007368087768555
Epoch 2/20, Loss: 2.095228910446167
Epoch 3/20, Loss: 1.0410361289978027
Epoch 4/20, Loss: 0.8958718776702881
Epoch 5/20, Loss: 0.8108903169631958
Epoch 6/20, Loss: 0.7575608491897583
Epoch 7/20, Loss: 0.7230842113494873
Epoch 8/20, Loss: 0.7001765966415405
Epoch 9/20, Loss: 0.6842378377914429
Epoch 10/20, Loss: 0.6724708676338196
Epoch 11/20, Loss: 0.6630788445472717
Epoch 12/20, Loss: 0.6550995707511902
Epoch 13/20, Loss: 0.6480072736740112
Epoch 14/20, Loss: 0.6414843797683716
Epoch 15/20, Loss: 0.6353656649589539
Epoch 16/20, Loss: 0.6295695900917053
Epoch 17/20, Loss: 0.6240125894546509
Epoch 18/20, Loss: 0.6186654567718506
Epoch 19/20, Loss: 0.6134958267211914
Epoch 20/20, Loss: 0.6084879040718079


# Making it MiniBatchessss

In [233]:
from torch.utils.data import TensorDataset, DataLoader

In [234]:
train_dataset = TensorDataset(X_train, y_train)

In [235]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [236]:
torch.manual_seed(42)
model = nn.Sequential(
    nn.Linear(in_features = n_features, out_features = 64),
    nn.ReLU(),
    nn.Linear(in_features = 64, out_features = 32),
    nn.ReLU(),
    nn.Linear(in_features = 32, out_features = 1)
)

model.to(device)

Sequential(
  (0): Linear(in_features=8, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=32, bias=True)
  (3): ReLU()
  (4): Linear(in_features=32, out_features=1, bias=True)
)

In [237]:
learning_rate = 0.02
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)
mse = nn.MSELoss()

In [238]:
def train(model, optimizer, criterion, train_loader, n_epochs):
  model.train()
  for epoch in range(n_epochs):
    total_loss = 0.
    for X_batch, y_batch in train_loader:
      X_batch, y_batch = X_batch.to(device), y_batch.to(device)
      y_pred = model(X_batch)
      loss = criterion(y_pred, y_batch)
      total_loss += loss.item()
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()

    mean_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{n_epochs}, Loss:{mean_loss:.4f}')

In [239]:
train(model, optimizer, mse, train_loader, n_epochs)

Epoch 1/20, Loss:0.6840
Epoch 2/20, Loss:0.4291
Epoch 3/20, Loss:0.4124
Epoch 4/20, Loss:0.3916
Epoch 5/20, Loss:0.3831
Epoch 6/20, Loss:0.3724
Epoch 7/20, Loss:0.3646
Epoch 8/20, Loss:0.3609
Epoch 9/20, Loss:0.3514
Epoch 10/20, Loss:0.3449
Epoch 11/20, Loss:0.3491
Epoch 12/20, Loss:0.3368
Epoch 13/20, Loss:0.3329
Epoch 14/20, Loss:0.3284
Epoch 15/20, Loss:0.3236
Epoch 16/20, Loss:0.3214
Epoch 17/20, Loss:0.3238
Epoch 18/20, Loss:0.3218
Epoch 19/20, Loss:0.3210
Epoch 20/20, Loss:0.3164


In [240]:
import torch
print(torch.cuda.is_available())


True


In [241]:
# LET'S EVALUATE our model

def evaluate(model, data_loader, metric_fn, aggregate_fn=torch.mean):
  model.eval()
  metrics=[]
  with torch.no_grad():
    for X_batch, y_batch in data_loader:
      X_batch, y_batch = X_batch.to(device), y_batch.to(device)
      y_pred = model(X_batch)
      metric = metric_fn(y_pred, y_batch)
      metrics.append(metric)
  return aggregate_fn(torch.stack(metrics))


In [242]:
valid_dataset = TensorDataset(X_valid, y_valid)
valid_loader = DataLoader(valid_dataset, batch_size=32)
valid_mse = evaluate(model, valid_loader, mse)
valid_mse

tensor(0.3206, device='cuda:0')

In [243]:
def rmse(y_pred, y_true):
  return ((y_pred - y_true) ** 2).mean().sqrt()

In [244]:
evaluate(model, valid_loader, rmse)


tensor(0.5518, device='cuda:0')

In [245]:
0.3269 ** 0.5 != 0.5619

True

The reason is that instead of calculating the RMSE over the whole validation set, we computed it over each batch and then computed the mean of all these batch RMSEs

In [246]:
evaluate(model, valid_loader, mse,
         aggregate_fn=lambda metrics: torch.sqrt(torch.mean(metrics)))

tensor(0.5662, device='cuda:0')

In [247]:
0.3269 ** 0.5

0.5717516943569123

In [248]:
%pip install torchmetrics



In [249]:
import torchmetrics

In [253]:
def evaluate_tm(model, data_loader, metric):
  model.eval()
  metric.reset()
  with torch.no_grad():
    for X_batch, y_batch in data_loader:
      X_batch, y_batch = X_batch.to(device), y_batch.to(device)
      y_pred = model(X_batch)
      metric.update(y_pred, y_batch)
  return metric.compute()

In [254]:
rmse = torchmetrics.MeanSquaredError(squared=False).to(device)
evaluate_tm(model, valid_loader, rmse)

tensor(0.5667, device='cuda:0')