# Machine Learning with PyTorch

This notebook is used to understand how to implement SGD with PyTorch before experimenting with other algorithms.

In [1]:
# test if pytorch is working
import torch
torch.__version__

'2.7.0.dev20250128+cpu'

## Data Wranggling

PyTorch works with Tensor objects. Operations of Tensors behave like Numpy arrays and can be passed back and forth.

In [2]:
a = torch.tensor(1)
print(a)

b = torch.tensor([1, 2])
print(b)

tensor(1)
tensor([1, 2])


In [3]:
X = torch.tensor([[1,2],[3,4]])
Y = torch.tensor([[5,6],[7,8]])

print(X + Y)
print(X * Y)
print(X @ Y)

tensor([[ 6,  8],
        [10, 12]])
tensor([[ 5, 12],
        [21, 32]])
tensor([[19, 22],
        [43, 50]])


In [4]:
print(X.dim()) #dimensions
print(b.dim())
print(a.dim())

print("---")

print(X.ndim)
print(b.ndim)
print(a.ndim)

2
1
0
---
2
1
0


In [5]:
u = torch.tensor([3, 4])
v = torch.tensor([2, 3])

torch.dot(u, v) # dot product

tensor(18)

In [6]:
try:
    torch.dot(X, Y) # doesn't work because dot products only works on vectors
except Exception as e:
    print(e)

1D tensors expected, but got 2D and 2D tensors


In [7]:
print(X.T) #transpose matrix

tensor([[1, 3],
        [2, 4]])


### Dealing with Numpy

PyTorch tensors can go hand to hand with Numpy arrays

In [8]:
import numpy as np

print(X.numpy())
type(X.numpy())

[[1 2]
 [3 4]]


numpy.ndarray

In [9]:
print(torch.from_numpy(np.array([[4,5],[6,7]])))
type(torch.from_numpy(np.array([[4,5],[6,7]])))

tensor([[4, 5],
        [6, 7]])


torch.Tensor

### Random Tensors

In [10]:
print(torch.randn(1))
print(torch.randn(3,2))

tensor([-0.2763])
tensor([[-1.0392, -0.7893],
        [-0.0870,  0.0144],
        [-0.6517,  0.2873]])


In [11]:
print(torch.randn(2, requires_grad=True, dtype=torch.float16))

tensor([ 0.5776, -1.3838], dtype=torch.float16, requires_grad=True)


In [12]:
print(torch.normal(mean=0, std=1, size=(2,4)))

tensor([[-1.2942,  1.7905, -0.6427, -0.6889],
        [ 1.0933,  1.5447, -1.0101,  0.1803]])


## Working with GPUs

In [13]:
# detect whether CUDA is available

print(f"is CUDA available: {torch.cuda.is_available()}")

# detect whether silicon GPU is available

print(f"is apple silicon available: {torch.backends.mps.is_available()}")

is CUDA available: False
is apple silicon available: False


Tensors can be moved to GPUs using `TENSOR.to(device = "cuda")` or `TENSOR.to(device = "mps")`.

Note that once a tensor is in a GPU, it cannot be exported to Numpy.

## OLS with PyTorch

Below is code to create a simple OLS model with PyTorch.

In [14]:
from torch import nn


In [15]:
# mock regression data

weight = [0.77, -0.56]
bias = np.random.normal(0,12)
SEED = 9999

X = np.random.rand(100, 2) * 10
y = X @ weight + bias

print(X.size)
print(y.size)

X = torch.from_numpy(X)
print(X.dtype)
y = torch.from_numpy(y)

print(X.size())
print(y.size())


200
100
torch.float64
torch.Size([100, 2])
torch.Size([100])


All models in PyTorch must be created as a class (subclass of nn.Module) with a forward method.

In [16]:
class PyTorchOLS(nn.Module):
    """
    Specifies NN architecture here
    """
    def __init__(self):
        super().__init__()

        #initialize weights with a random vector
        self.weights = nn.Parameter(
            torch.randn(
                2,
                requires_grad=True, #PyTorch will track gradients of this param
                dtype=torch.float64
            )
        )

        #initialize bias with a random scalar
        self.bias = nn.Parameter(
            torch.randn(
                1,
                requires_grad=True, #PyTorch will track gradients of this param
                dtype=torch.float64
            )
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        This defines the forward computation of the model
        """
        return torch.matmul(x, self.weights) + self.bias

The class above defines the architecture of the model which is really a linear regression with 2 regressors. We can then create an instance of the model class and look at the parameters and its initial predictions.

In [17]:
torch.manual_seed(42) #set seed

OLS = PyTorchOLS() #initialize class

list(OLS.parameters()) #checks current state of model (prior to training)

[Parameter containing:
 tensor([0.3367, 0.1288], dtype=torch.float64, requires_grad=True),
 Parameter containing:
 tensor([0.2345], dtype=torch.float64, requires_grad=True)]

In [18]:
OLS.state_dict() #checks current state of model (prior to training)

OrderedDict([('weights', tensor([0.3367, 0.1288], dtype=torch.float64)),
             ('bias', tensor([0.2345], dtype=torch.float64))])

In [19]:
# train validation test split

X_train, X_validate, X_test = X[:80], X[80:90], X[90:]
y_train, y_validate, y_test = y[:80], y[80:90], y[90:]

print(X_train.size())
print(X_validate.size())
print(X_test.size())

print(y_train.size())
print(y_validate.size())
print(y_test.size())

torch.Size([80, 2])
torch.Size([10, 2])
torch.Size([10, 2])
torch.Size([80])
torch.Size([10])
torch.Size([10])


In [20]:
with torch.inference_mode():
    y_pred = OLS(X_test)
y_pred

tensor([3.9215, 1.7584, 4.1914, 3.1828, 2.1138, 2.0429, 2.3022, 0.4895, 3.4685,
        2.8963], dtype=torch.float64)

In [21]:
#Training the model

epochs = 100
learning_rate = 0.01

def MSE(
    actual: torch.Tensor,
    predicted: torch.Tensor
) -> torch.Tensor:
    return torch.mean((actual - predicted) ** 2)

train_losses = []
epoch_index = []

for epoch in range(epochs):
    OLS.train() #puts model into train mode
    y_pred = OLS(X_train)

    loss = MSE(y_train, y_pred) #loss function here is MSE

    loss.backward() #backwards pass

    OLS.eval() #model is now in evaluation mode

    with torch.inference_mode():
        # Update weights
        OLS.weights -= learning_rate * OLS.weights.grad #updates weight param
        OLS.bias -= learning_rate * OLS.bias.grad #updates bias param
        
        # Zero the gradients after updating weights
        OLS.weights.grad.zero_()
        OLS.bias.grad.zero_()

        y_pred_validate = OLS(X_validate)
        test_loss = MSE(y_pred_validate, y_validate.type(torch.float64))

    epoch_index.append(epoch+1)
    if epoch % 10 == 0:
        print(f"""Epoch {epoch+1}/{epochs}, 
            Training Loss: {loss.item():.4f},
            Test Loss: {test_loss.item():.4f}""")

Epoch 1/100, 
            Training Loss: 24.4103,
            Test Loss: 6.7783
Epoch 11/100, 
            Training Loss: 4.3433,
            Test Loss: 2.2605
Epoch 21/100, 
            Training Loss: 4.0065,
            Test Loss: 1.9760
Epoch 31/100, 
            Training Loss: 3.7824,
            Test Loss: 1.8518
Epoch 41/100, 
            Training Loss: 3.5727,
            Test Loss: 1.7472
Epoch 51/100, 
            Training Loss: 3.3747,
            Test Loss: 1.6501
Epoch 61/100, 
            Training Loss: 3.1876,
            Test Loss: 1.5586
Epoch 71/100, 
            Training Loss: 3.0109,
            Test Loss: 1.4722
Epoch 81/100, 
            Training Loss: 2.8440,
            Test Loss: 1.3906
Epoch 91/100, 
            Training Loss: 2.6864,
            Test Loss: 1.3135


In [22]:
y_pred = OLS(X_validate)
torch.mean((y_validate - y_pred)).backward()
print(OLS.weights)
print(OLS.weights.grad)

Parameter containing:
tensor([ 1.1114, -0.1739], dtype=torch.float64, requires_grad=True)
tensor([-4.5440, -5.3779], dtype=torch.float64)


In PyTorch "optimizers" can be used to update weights with every training iteration. It is better for larger datasets calculations involving only a subset of the training dataset maybe used per iterations (ex. SGD algorithm).

In [23]:
#Training the model - with optimizers

epochs = 100
learning_rate = 0.01

def MSE(
    actual: torch.Tensor,
    predicted: torch.Tensor
) -> torch.Tensor:
    return torch.mean((actual - predicted) ** 2)

train_losses = []
epoch_index = []

for epoch in range(epochs):
    OLS.train() #puts model into train mode
    y_pred = OLS(X_train)

    loss = MSE(y_train, y_pred) #loss function here is MSE

    loss.backward() #backwards pass

    OLS.eval() #model is now in evaluation mode

    with torch.inference_mode():
        # Update weights
        OLS.weights -= learning_rate * OLS.weights.grad #updates weight param
        OLS.bias -= learning_rate * OLS.bias.grad #updates bias param
        
        # Zero the gradients after updating weights
        OLS.weights.grad.zero_()
        OLS.bias.grad.zero_()

        y_pred_validate = OLS(X_validate)
        test_loss = MSE(y_pred_validate, y_validate.type(torch.float64))

    epoch_index.append(epoch+1)
    if epoch % 10 == 0:
        print(f"""Epoch {epoch+1}/{epochs}, 
            Training Loss: {loss.item():.4f},
            Test Loss: {test_loss.item():.4f}""")

Epoch 1/100, 
            Training Loss: 2.5375,
            Test Loss: 1.2024
Epoch 11/100, 
            Training Loss: 2.3953,
            Test Loss: 1.1721
Epoch 21/100, 
            Training Loss: 2.2625,
            Test Loss: 1.1064
Epoch 31/100, 
            Training Loss: 2.1371,
            Test Loss: 1.0450
Epoch 41/100, 
            Training Loss: 2.0187,
            Test Loss: 0.9870
Epoch 51/100, 
            Training Loss: 1.9068,
            Test Loss: 0.9323
Epoch 61/100, 
            Training Loss: 1.8011,
            Test Loss: 0.8806
Epoch 71/100, 
            Training Loss: 1.7012,
            Test Loss: 0.8318
Epoch 81/100, 
            Training Loss: 1.6069,
            Test Loss: 0.7857
Epoch 91/100, 
            Training Loss: 1.5179,
            Test Loss: 0.7422
