<a href="https://colab.research.google.com/github/databyhuseyn/DeepLearning/blob/main/Introduction_to_Deep_Learning_with_PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch

In [2]:
A = torch.tensor([
    [1.0, 4.0, 7.0],
     [2.0, 3.0, 6.0]
])

In [3]:
A

tensor([[1., 4., 7.],
        [2., 3., 6.]])

In [4]:
X = torch.tensor([
    [1.0, 4.0, 7.0],
     [2.0, 3.0, 6.0]
], dtype=torch.bfloat16)

In [5]:
X

tensor([[1., 4., 7.],
        [2., 3., 6.]], dtype=torch.bfloat16)

In [6]:
X.shape

torch.Size([2, 3])

In [7]:
X.dtype

torch.bfloat16

In [8]:
X[0, 1]

tensor(4., dtype=torch.bfloat16)

In [9]:
X[:, 1]

tensor([4., 3.], dtype=torch.bfloat16)

In [10]:
X.abs()

tensor([[1., 4., 7.],
        [2., 3., 6.]], dtype=torch.bfloat16)

In [11]:
torch.abs(X)

tensor([[1., 4., 7.],
        [2., 3., 6.]], dtype=torch.bfloat16)

In [12]:
# torch.cos(), torch.exp(), torch.max(), torch.mean(), torch.sqrt()

In [13]:
X.sqrt()

tensor([[1.0000, 2.0000, 2.6406],
        [1.4141, 1.7344, 2.4531]], dtype=torch.bfloat16)

In [14]:
10 * (X + 1.0)

tensor([[20., 50., 80.],
        [30., 40., 70.]], dtype=torch.bfloat16)

In [15]:
X.exp()

tensor([[   2.7188,   54.5000, 1096.0000],
        [   7.3750,   20.1250,  404.0000]], dtype=torch.bfloat16)

In [16]:
X.mean()

tensor(3.8281, dtype=torch.bfloat16)

In [17]:
X.max(dim=0) #, keepdim=True

torch.return_types.max(
values=tensor([2., 4., 7.], dtype=torch.bfloat16),
indices=tensor([1, 0, 0]))

In [18]:
X

tensor([[1., 4., 7.],
        [2., 3., 6.]], dtype=torch.bfloat16)

In [19]:
X @ X.T

tensor([[66., 56.],
        [56., 49.]], dtype=torch.bfloat16)

In [20]:
import numpy as np

In [21]:
A.numpy()

array([[1., 4., 7.],
       [2., 3., 6.]], dtype=float32)

In [22]:
torch.tensor(np.array([[1., 4., 7.],
                      [2., 3., 6.]]))

tensor([[1., 4., 7.],
        [2., 3., 6.]], dtype=torch.float64)

In [23]:
# It’s generally better to use 32 bits in deep learning because this takes half the RAM and speeds up computations,
# and neural nets do not actually need the extra precision offered by 64-bit floats.

In [24]:
N = torch.FloatTensor(np.array([            # automatically converts the array to 32 bits
                            [1., 4., 7.],
                             [2., 3., 6]]))
N

tensor([[1., 4., 7.],
        [2., 3., 6.]])

In [25]:
N.dtype

torch.float32

In [26]:
X[:, 1] = -99
X

tensor([[  1., -99.,   7.],
        [  2., -99.,   6.]], dtype=torch.bfloat16)

In [27]:
X.relu()

tensor([[1., 0., 7.],
        [2., 0., 6.]], dtype=torch.bfloat16)

In [28]:
X

tensor([[  1., -99.,   7.],
        [  2., -99.,   6.]], dtype=torch.bfloat16)

In [29]:
X.relu_()

tensor([[1., 0., 7.],
        [2., 0., 6.]], dtype=torch.bfloat16)

In [30]:
X    # _ changed in place

tensor([[1., 0., 7.],
        [2., 0., 6.]], dtype=torch.bfloat16)

In [31]:
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [32]:
device

'cuda'

In [33]:
!nvidia-smi

Tue Jan 27 09:55:27 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   51C    P8             10W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [34]:
M = torch.tensor([
    [1., 2., 3.],
    [4., 5., 6.]
])
M = M.to(device)

In [35]:
M

tensor([[1., 2., 3.],
        [4., 5., 6.]], device='cuda:0')

In [36]:
M.device

device(type='cuda', index=0)

In [37]:
M = torch.tensor([[1., 2., 3.], [4., 5., 6.]], device=device)

In [38]:
R = M @ M.T

In [39]:
R

tensor([[14., 32.],
        [32., 77.]], device='cuda:0')

In [40]:
M = torch.rand((1000, 1000))  # on the CPU
%timeit M @ M.T


M = torch.rand((1000, 1000), device="cuda")  # on the GPU
%timeit M @ M.T

# now try 100 by 100 :D

41.1 ms ± 17.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
539 µs ± 14.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [41]:
x = torch.tensor(5.0, requires_grad=True)
f = x ** 2
f

tensor(25., grad_fn=<PowBackward0>)

In [42]:
f.backward()
x.grad

tensor(10.)

In [43]:
learning_rate = 0.1
with torch.no_grad():
    x -= learning_rate * x.grad  # gradient descent step

In [44]:
x

tensor(4., requires_grad=True)

In [45]:
x_detached = x.detach()
x_detached -= learning_rate * x.grad   # detach yaddashda eyni yerdedi

In [46]:
x_detached

tensor(3.)

In [47]:
x.grad.zero_()

tensor(0.)

In [48]:
learning_rate = 0.1
x = torch.tensor(5.0, requires_grad=True)
for iteration in range(100):
    f = x ** 2  # forward pass
    f.backward()  # backward pass
    with torch.no_grad():
        x -= learning_rate * x.grad  # gradient descent step
    print(x)
    x.grad.zero_()  # reset the gradients

tensor(4., requires_grad=True)
tensor(3.2000, requires_grad=True)
tensor(2.5600, requires_grad=True)
tensor(2.0480, requires_grad=True)
tensor(1.6384, requires_grad=True)
tensor(1.3107, requires_grad=True)
tensor(1.0486, requires_grad=True)
tensor(0.8389, requires_grad=True)
tensor(0.6711, requires_grad=True)
tensor(0.5369, requires_grad=True)
tensor(0.4295, requires_grad=True)
tensor(0.3436, requires_grad=True)
tensor(0.2749, requires_grad=True)
tensor(0.2199, requires_grad=True)
tensor(0.1759, requires_grad=True)
tensor(0.1407, requires_grad=True)
tensor(0.1126, requires_grad=True)
tensor(0.0901, requires_grad=True)
tensor(0.0721, requires_grad=True)
tensor(0.0576, requires_grad=True)
tensor(0.0461, requires_grad=True)
tensor(0.0369, requires_grad=True)
tensor(0.0295, requires_grad=True)
tensor(0.0236, requires_grad=True)
tensor(0.0189, requires_grad=True)
tensor(0.0151, requires_grad=True)
tensor(0.0121, requires_grad=True)
tensor(0.0097, requires_grad=True)
tensor(0.0077, requires_

In [49]:
t = torch.tensor(2.0, requires_grad=True)
z = t.exp()  # this is an intermediate result
z += 1  # this is an in-place operation
z.backward()  #  RuntimeError!

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor []], which is output 0 of ExpBackward0, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

In [50]:
# but

t = torch.tensor(2.0, requires_grad=True)
z = t.exp()  # this is an intermediate result
z = z + 1  # new tensor created, and previous saved for computation graph
z.backward()  #  RuntimeError!

# make sure you don’t modify in place a tensor needed for backprop

In [51]:
import sklearn

In [52]:
X, y = sklearn.datasets.fetch_california_housing(return_X_y=True)

In [53]:
X_train_full, X_test, y_train_full, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2)

In [54]:
X_train, X_valid, y_train, y_valid = sklearn.model_selection.train_test_split(X_train_full, y_train_full, test_size=0.2)

In [55]:
X_train.shape, X_valid.shape, X_test.shape

((13209, 8), (3303, 8), (4128, 8))

In [56]:
X_train = torch.FloatTensor(X_train)
X_valid = torch.FloatTensor(X_valid)
X_test = torch.FloatTensor(X_test)
means = X_train.mean(dim=0, keepdims=True)
stds = X_train.std(dim=0, keepdims=True)
X_train = (X_train - means) / stds
X_valid = (X_valid - means) / stds
X_test = (X_test - means) / stds

In [57]:
y_train = torch.FloatTensor(y_train).reshape(-1, 1)
y_valid = torch.FloatTensor(y_valid).reshape(-1, 1)
y_test = torch.FloatTensor(y_test).reshape(-1, 1)


In [58]:
torch.manual_seed(42)
n_features = X_train.shape[1]
w = torch.randn((n_features, 1), requires_grad=True)
b = torch.tensor(0., requires_grad=True)

In [59]:
learning_rate = 0.4
n_epochs = 20
for epoch in range(n_epochs):
  y_pred = X_train @ w + b
  loss = ((y_pred - y_train) ** 2).mean()
  loss.backward()
  with torch.no_grad():
    b -= learning_rate * b.grad
    w -= learning_rate * w.grad
    b.grad.zero_()
    w.grad.zero_()
  print(f'Epoch {epoch+1}/{n_epochs}, Loss: {loss.item()}')



Epoch 1/20, Loss: 16.08509063720703
Epoch 2/20, Loss: 4.923903942108154
Epoch 3/20, Loss: 2.294161796569824
Epoch 4/20, Loss: 1.347658634185791
Epoch 5/20, Loss: 0.9681516885757446
Epoch 6/20, Loss: 0.8047587871551514
Epoch 7/20, Loss: 0.7275892496109009
Epoch 8/20, Loss: 0.6861670017242432
Epoch 9/20, Loss: 0.6603448987007141
Epoch 10/20, Loss: 0.6418980956077576
Epoch 11/20, Loss: 0.6273733973503113
Epoch 12/20, Loss: 0.6152555346488953
Epoch 13/20, Loss: 0.6048279404640198
Epoch 14/20, Loss: 0.5957116484642029
Epoch 15/20, Loss: 0.5876761078834534
Epoch 16/20, Loss: 0.5805610418319702
Epoch 17/20, Loss: 0.5742433667182922
Epoch 18/20, Loss: 0.5686224102973938
Epoch 19/20, Loss: 0.5636130571365356
Epoch 20/20, Loss: 0.5591421723365784


In [60]:
X_new = X_test[:3]
with torch.no_grad():
  y_pred = X_new @ w + b

y_pred

tensor([[1.0591],
        [1.9689],
        [1.7947]])

In [61]:
y_test[:3]

tensor([[0.5740],
        [2.7500],
        [1.3750]])

In [62]:
import torch.nn as nn

In [63]:
torch.manual_seed(42)
model = nn.Linear(in_features = n_features, out_features=1)

In [64]:
model.bias

Parameter containing:
tensor([0.3117], requires_grad=True)

In [65]:
model.weight

Parameter containing:
tensor([[ 0.2703,  0.2935, -0.0828,  0.3248, -0.0775,  0.0713, -0.1721,  0.2076]],
       requires_grad=True)

In [66]:
model.parameters()

<generator object Module.parameters at 0x7a206b7260a0>

In [67]:
next(model.parameters())

Parameter containing:
tensor([[ 0.2703,  0.2935, -0.0828,  0.3248, -0.0775,  0.0713, -0.1721,  0.2076]],
       requires_grad=True)

In [68]:
for param in model.parameters():
  print(param)

Parameter containing:
tensor([[ 0.2703,  0.2935, -0.0828,  0.3248, -0.0775,  0.0713, -0.1721,  0.2076]],
       requires_grad=True)
Parameter containing:
tensor([0.3117], requires_grad=True)


In [69]:
for name, param in model.named_parameters():
  print(f'{name} | {param}')

weight | Parameter containing:
tensor([[ 0.2703,  0.2935, -0.0828,  0.3248, -0.0775,  0.0713, -0.1721,  0.2076]],
       requires_grad=True)
bias | Parameter containing:
tensor([0.3117], requires_grad=True)


In [70]:
model(X_train[:2])

tensor([[ 0.6705],
        [-0.3169]], grad_fn=<AddmmBackward0>)

In [71]:
y_train[:2]

tensor([[1.9600],
        [1.4120]])

In [72]:
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)
mse = nn.MSELoss()

In [73]:
model(X_train[:2])

tensor([[ 0.6705],
        [-0.3169]], grad_fn=<AddmmBackward0>)

In [74]:
def train_bgd(model, optimzier, criterion, X_train, y_train, n_epochs):
  for epoch in range(n_epochs):
    y_pred = model(X_train)
    loss = criterion(y_pred, y_train)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    print(f'Epoch {epoch+1}/{n_epochs}, Loss: {loss.item()}')

In [75]:
train_bgd(model, optimizer, mse, X_train, y_train, n_epochs)

Epoch 1/20, Loss: 4.224911689758301
Epoch 2/20, Loss: 0.7593734264373779
Epoch 3/20, Loss: 0.6096521019935608
Epoch 4/20, Loss: 0.59157794713974
Epoch 5/20, Loss: 0.5818150639533997
Epoch 6/20, Loss: 0.5740516185760498
Epoch 7/20, Loss: 0.5674729347229004
Epoch 8/20, Loss: 0.5617811679840088
Epoch 9/20, Loss: 0.5568109154701233
Epoch 10/20, Loss: 0.5524502396583557
Epoch 11/20, Loss: 0.5486139059066772
Epoch 12/20, Loss: 0.5452324748039246
Epoch 13/20, Loss: 0.5422475337982178
Epoch 14/20, Loss: 0.5396090149879456
Epoch 15/20, Loss: 0.5372735857963562
Epoch 16/20, Loss: 0.5352038145065308
Epoch 17/20, Loss: 0.5333670973777771
Epoch 18/20, Loss: 0.5317351222038269
Epoch 19/20, Loss: 0.5302830338478088
Epoch 20/20, Loss: 0.5289893746376038


In [76]:
X_new = X_test[:3]

with torch.no_grad():
  y_pred = model(X_new)

y_pred

tensor([[0.8942],
        [1.9890],
        [1.6891]])

In [77]:
y_test[:3]

tensor([[0.5740],
        [2.7500],
        [1.3750]])

In [78]:
torch.manual_seed(42)
model = nn.Sequential(
    nn.Linear(n_features,50),
    nn.ReLU(),
    nn.Linear(50, 40),
    nn.ReLU(),
    nn.Linear(40, 1)
)

In [79]:
learning_rate = 0.1
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
mse = nn.MSELoss()
train_bgd(model, optimizer, mse, X_train, y_train, n_epochs)

Epoch 1/20, Loss: 4.9337849617004395
Epoch 2/20, Loss: 2.0644755363464355
Epoch 3/20, Loss: 1.0229377746582031
Epoch 4/20, Loss: 0.8763200640678406
Epoch 5/20, Loss: 0.7916882038116455
Epoch 6/20, Loss: 0.7387831211090088
Epoch 7/20, Loss: 0.7046931385993958
Epoch 8/20, Loss: 0.6820633411407471
Epoch 9/20, Loss: 0.6663106679916382
Epoch 10/20, Loss: 0.6546448469161987
Epoch 11/20, Loss: 0.6453414559364319
Epoch 12/20, Loss: 0.6374305486679077
Epoch 13/20, Loss: 0.6303848624229431
Epoch 14/20, Loss: 0.6239205002784729
Epoch 15/20, Loss: 0.6178619861602783
Epoch 16/20, Loss: 0.6121279001235962
Epoch 17/20, Loss: 0.606654703617096
Epoch 18/20, Loss: 0.6014005541801453
Epoch 19/20, Loss: 0.5963351130485535
Epoch 20/20, Loss: 0.5914300680160522


# Making it MiniBatchessss

In [80]:
from torch.utils.data import TensorDataset, DataLoader

In [81]:
train_dataset = TensorDataset(X_train, y_train)

In [82]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [83]:
torch.manual_seed(42)
model = nn.Sequential(
    nn.Linear(in_features = n_features, out_features = 64),
    nn.ReLU(),
    nn.Linear(in_features = 64, out_features = 32),
    nn.ReLU(),
    nn.Linear(in_features = 32, out_features = 1)
)

model.to(device)

Sequential(
  (0): Linear(in_features=8, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=32, bias=True)
  (3): ReLU()
  (4): Linear(in_features=32, out_features=1, bias=True)
)

In [84]:
learning_rate = 0.02
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)
mse = nn.MSELoss()

In [85]:
def train(model, optimizer, criterion, train_loader, n_epochs):
  model.train()
  for epoch in range(n_epochs):
    total_loss = 0.
    for X_batch, y_batch in train_loader:
      X_batch, y_batch = X_batch.to(device), y_batch.to(device)
      y_pred = model(X_batch)
      loss = criterion(y_pred, y_batch)
      total_loss += loss.item()
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()

    mean_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{n_epochs}, Loss:{mean_loss:.4f}')

In [86]:
train(model, optimizer, mse, train_loader, n_epochs)

Epoch 1/20, Loss:0.6437
Epoch 2/20, Loss:0.4431
Epoch 3/20, Loss:0.4366
Epoch 4/20, Loss:0.4031
Epoch 5/20, Loss:0.3801
Epoch 6/20, Loss:0.3700
Epoch 7/20, Loss:0.3629
Epoch 8/20, Loss:0.3521
Epoch 9/20, Loss:0.3505
Epoch 10/20, Loss:0.3477
Epoch 11/20, Loss:0.3370
Epoch 12/20, Loss:0.3333
Epoch 13/20, Loss:0.3289
Epoch 14/20, Loss:0.3226
Epoch 15/20, Loss:0.3225
Epoch 16/20, Loss:0.3168
Epoch 17/20, Loss:0.3116
Epoch 18/20, Loss:0.3098
Epoch 19/20, Loss:0.3085
Epoch 20/20, Loss:0.3064


In [87]:
import torch
print(torch.cuda.is_available())


True


In [88]:
# LET'S EVALUATE our model

def evaluate(model, data_loader, metric_fn, aggregate_fn=torch.mean):
  model.eval()
  metrics=[]
  with torch.no_grad():
    for X_batch, y_batch in data_loader:
      X_batch, y_batch = X_batch.to(device), y_batch.to(device)
      y_pred = model(X_batch)
      metric = metric_fn(y_pred, y_batch)
      metrics.append(metric)
  return aggregate_fn(torch.stack(metrics))


In [89]:
valid_dataset = TensorDataset(X_valid, y_valid)
valid_loader = DataLoader(valid_dataset, batch_size=32)
valid_mse = evaluate(model, valid_loader, mse)
valid_mse

tensor(0.3149, device='cuda:0')

In [90]:
def rmse(y_pred, y_true):
  return ((y_pred - y_true) ** 2).mean().sqrt()

In [91]:
evaluate(model, valid_loader, rmse)


tensor(0.5497, device='cuda:0')

In [92]:
0.3269 ** 0.5 != 0.5619

True

The reason is that instead of calculating the RMSE over the whole validation set, we computed it over each batch and then computed the mean of all these batch RMSEs

In [93]:
evaluate(model, valid_loader, mse,
         aggregate_fn=lambda metrics: torch.sqrt(torch.mean(metrics)))

tensor(0.5611, device='cuda:0')

In [94]:
0.3269 ** 0.5

0.5717516943569123

In [95]:
%pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Downloading torchmetrics-1.8.2-py3-none-any.whl (983 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.15.2-py3-none-any.whl (29 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.15.2 torchmetrics-1.8.2


In [96]:
import torchmetrics

In [97]:
def evaluate_tm(model, data_loader, metric):
  model.eval()
  metric.reset()
  with torch.no_grad():
    for X_batch, y_batch in data_loader:
      X_batch, y_batch = X_batch.to(device), y_batch.to(device)
      y_pred = model(X_batch)
      metric.update(y_pred, y_batch)
  return metric.compute()

In [98]:
rmse = torchmetrics.MeanSquaredError(squared=False).to(device)
evaluate_tm(model, valid_loader, rmse)

tensor(0.5623, device='cuda:0')

# Deep and Wide Networks

In [99]:
class WideAndDeep(nn.Module):
  def __init__(self, n_features):
    super().__init__()
    self.deep_stack = nn.Sequential(
        nn.Linear(n_features, 50),
        nn.ReLU(),
        nn.Linear(50, 40),
        nn.ReLU()
    )
    self.output_layer = nn.Linear(40 + n_features, 1)

  def forward(self, X):
    deep_output = self.deep_stack(X)
    wide_and_deep = torch.concat([X, deep_output], dim=1)
    return self.output_layer(wide_and_deep)

In [100]:
[i for i in WideAndDeep(8).children()]

[Sequential(
   (0): Linear(in_features=8, out_features=50, bias=True)
   (1): ReLU()
   (2): Linear(in_features=50, out_features=40, bias=True)
   (3): ReLU()
 ),
 Linear(in_features=48, out_features=1, bias=True)]

In [101]:
[i for i in WideAndDeep(8).named_children()]

[('deep_stack',
  Sequential(
    (0): Linear(in_features=8, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=40, bias=True)
    (3): ReLU()
  )),
 ('output_layer', Linear(in_features=48, out_features=1, bias=True))]

In [102]:
torch.manual_seed(42)
model = WideAndDeep(n_features).to(device)
learning_rate = 0.002
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
mse = nn.MSELoss()

train(model, optimizer, mse, train_loader, n_epochs)

Epoch 1/20, Loss:1.3444
Epoch 2/20, Loss:0.6167
Epoch 3/20, Loss:0.5747
Epoch 4/20, Loss:0.5460
Epoch 5/20, Loss:0.5239
Epoch 6/20, Loss:0.5096
Epoch 7/20, Loss:0.4981
Epoch 8/20, Loss:0.4850
Epoch 9/20, Loss:0.4785
Epoch 10/20, Loss:0.4696
Epoch 11/20, Loss:0.4627
Epoch 12/20, Loss:0.4577
Epoch 13/20, Loss:0.4527
Epoch 14/20, Loss:0.4480
Epoch 15/20, Loss:0.4415
Epoch 16/20, Loss:0.4367
Epoch 17/20, Loss:0.4323
Epoch 18/20, Loss:0.4284
Epoch 19/20, Loss:0.4246
Epoch 20/20, Loss:0.4205


(20640, 8)

In [114]:
class WideAndDeepV2(nn.Module):
  def __init__(self, n_features):
    super().__init__()

    self.deep_stack = nn.Sequential(
        nn.Linear(n_features - 2, 50), nn.ReLU(),   # 6 features left
        nn.Linear(50, 40), nn.ReLU()
    )
    self.output_layer = nn.Linear(40 + 5, 1)   # 5 comes from wide


  def forward(self, X):
    X_wide = X[:, :5]
    X_deep = X[:, 2:]

    deep_output = self.deep_stack(X_deep)
    wide_and_deep = torch.concat([X_wide, deep_output], dim=1)

    return self.output_layer(wide_and_deep)


In [115]:
torch.manual_seed(42)
model = WideAndDeepV2(n_features).to(device)
learning_rate = 0.002
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
mse = nn.MSELoss()

train(model, optimizer, mse, train_loader, n_epochs)

Epoch 1/20, Loss:1.4002
Epoch 2/20, Loss:0.6169
Epoch 3/20, Loss:0.5604
Epoch 4/20, Loss:0.5316
Epoch 5/20, Loss:0.5145
Epoch 6/20, Loss:0.5024
Epoch 7/20, Loss:0.4939
Epoch 8/20, Loss:0.4849
Epoch 9/20, Loss:0.4806
Epoch 10/20, Loss:0.4756
Epoch 11/20, Loss:0.4686
Epoch 12/20, Loss:0.4643
Epoch 13/20, Loss:0.4605
Epoch 14/20, Loss:0.4565
Epoch 15/20, Loss:0.4535
Epoch 16/20, Loss:0.4509
Epoch 17/20, Loss:0.4483
Epoch 18/20, Loss:0.4463
Epoch 19/20, Loss:0.4437
Epoch 20/20, Loss:0.4418


In [116]:
class WideAndDeepV3(nn.Module):
  def __init__(self, n_features):
    super().__init__()

    self.deep_stack = nn.Sequential(
        nn.Linear(n_features - 2, 50), nn.ReLU(),   # 6 features left
        nn.Linear(50, 40), nn.ReLU()
    )
    self.output_layer = nn.Linear(40 + 5, 1)   # 5 comes from wide

  def forward(self, X_wide, X_deep):
    deep_output = self.deep_stack(X_deep)
    wide_and_deep = torch.concat([X_wide, deep_output], dim=1)

    return self.output_layer(wide_and_deep)

In [120]:
train_data_wd = TensorDataset(X_train[:, :5], X_train[:, 2:], y_train)
train_loader_wd = DataLoader(train_data_wd, batch_size=32, shuffle=True)
valid_data_wd = TensorDataset(X_valid[:, :5], X_valid[:, 2:], y_valid)
valid_loader_wd = DataLoader(valid_data_wd, batch_size=32, shuffle=True)
test_data_wd = TensorDataset(X_test[:, :5], X_test[:, 2:], y_test)
test_loader_wd = DataLoader(test_data_wd, batch_size=32, shuffle=True)

In [121]:
def train(model, optimizer, criterion, train_loader, n_epochs):
  for epoch in range(n_epochs):
    total_loss = 0.
    for X_batch_wide, X_batch_deep, y_batch in train_loader:
      X_batch_wide, X_batch_deep, y_batch = X_batch_wide.to(device), X_batch_deep.to(device), y_batch.to(device)
      y_pred = model(X_batch_wide, X_batch_deep)
      loss = criterion(y_pred, y_batch)
      total_loss = total_loss + loss
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()

    mean_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{n_epochs}, Loss:{mean_loss:.4f}')


In [123]:
torch.manual_seed(42)
model = WideAndDeepV3(n_features).to(device)
learning_rate = 0.002
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
mse = nn.MSELoss()

train(model, optimizer, mse, train_loader_wd, n_epochs)

Epoch 1/20, Loss:1.4002
Epoch 2/20, Loss:0.6169
Epoch 3/20, Loss:0.5604
Epoch 4/20, Loss:0.5316
Epoch 5/20, Loss:0.5145
Epoch 6/20, Loss:0.5024
Epoch 7/20, Loss:0.4939
Epoch 8/20, Loss:0.4849
Epoch 9/20, Loss:0.4806
Epoch 10/20, Loss:0.4756
Epoch 11/20, Loss:0.4686
Epoch 12/20, Loss:0.4643
Epoch 13/20, Loss:0.4605
Epoch 14/20, Loss:0.4565
Epoch 15/20, Loss:0.4535
Epoch 16/20, Loss:0.4509
Epoch 17/20, Loss:0.4483
Epoch 18/20, Loss:0.4463
Epoch 19/20, Loss:0.4437
Epoch 20/20, Loss:0.4418
