# Feed Forward Networks (FFN) 

- AKA Artificial Neural Networks (ANN)



In [1]:
# Built-in library
import logging
from typing import Any, Optional, Sequence, Union

# Standard imports
import numpy as np
import numpy.typing as npt
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split
from torchvision.datasets import MNIST
import torchvision.transforms as transforms
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

# Configure the backend
import matplotlib_inline.backend_inline

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 2_000

matplotlib_inline.backend_inline.set_matplotlib_formats("svg")
import seaborn as sns

# Custom import
from src.utilities import (
    set_up_logger,
    create_iris_data,
    create_qwerties_data,
    smooth,
)
from src.data_manager import (
    load_data,
    create_data_loader,
    split_into_train_n_validation,
)
from src.preprocessor import Standardizer, Normalizer


# Black code formatter (Optional)
%load_ext lab_black
# auto reload imports
%load_ext autoreload
%autoreload 2

### Load/Download MNIST Digits Data

In [2]:
fp = "../../data/mnist_digit"

# Chain multiple transformations
transform = transforms.Compose(
    [
        # Convert to PyTorch tensors
        transforms.ToTensor(),
        #  It performs per-channel normalization, where each channel
        # (e.g., red, green, blue for an RGB image) is normalized independently.
        # Since it's a single channel, we have (0.5,)
        transforms.Normalize(mean=(0.5,), std=(0.5,)),
    ]
)

train_dataset = MNIST(root=fp, train=True, transform=transform, download=True)
test_dataset = MNIST(root=fp, train=False, transform=transform, download=True)

In [3]:
test_dataset

Dataset MNIST
    Number of datapoints: 10000
    Root location: ../../data/mnist_digit
    Split: Test
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.5,), std=(0.5,))
           )

### Configurations

In [4]:
# Set random seeds for reproducibility
RANDOM_STATE = 123

torch.manual_seed(RANDOM_STATE)
torch.cuda.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

In [5]:
# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# If we're on a CUDA machine, this should print a CUDA device:
print(f"Working on device={device}")

Working on device=cpu


### Hyper-parameters


In [6]:
import torchvision.transforms as transforms


# Each MNSIT image is 1x28x28, so it is an 2D array [28,28]
# I'll flatten the image as vector dim=1*28*28
input_size = 1 * 28 * 28
hidden_size = 128
num_classes = 10
num_epochs = 5
batch_size = 64
learning_rate = 0.001

train_size = int(0.8 * len(train_dataset))  # 80% of training data
val_size = len(train_dataset) - train_size  # 20% of training data

### Prepare The Data

In [7]:
# Split the training dataset into training and validation sets
train_dataset, val_dataset = random_split(
    dataset=train_dataset,
    lengths=[train_size, val_size],
)

# Create DataLoader for each dataset
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

### Define FNN

In [8]:
class FFN(nn.Module):
    """This is used to build a Feed Forward Network architecture that
    is used for classification."""

    def __init__(self, input_size: int, hidden_size: int, num_classes: int) -> None:
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 64)
        self.fc3 = nn.Linear(64, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """This performs the forward propagation."""
        # Flatten the input images
        x = x.view(-1, (28 * 28))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [9]:
# Test the model with random data
ffn = FFN(input_size=input_size, hidden_size=hidden_size, num_classes=num_classes)
x_ = torch.rand(size=(1_000, 28, 28))
x_.shape
result = ffn.forward(x=x_)
result

tensor([[-0.0669, -0.0147, -0.0402,  ..., -0.0764,  0.0500,  0.0240],
        [-0.1014,  0.0012, -0.0063,  ..., -0.0860,  0.0868,  0.0196],
        [-0.1144, -0.0045, -0.0631,  ..., -0.0724,  0.0064, -0.0021],
        ...,
        [-0.0906, -0.0359, -0.0019,  ..., -0.0985,  0.0290,  0.0308],
        [-0.0753, -0.0220, -0.0032,  ..., -0.0829,  0.0367,  0.0334],
        [-0.0603, -0.0191, -0.0079,  ..., -0.1157,  0.0658,  0.0115]],
       grad_fn=<AddmmBackward0>)

In [10]:
result.shape

torch.Size([1000, 10])

In [11]:
# It returns the value and the index.
# We're interested in the index
values, _labels = torch.max(result, dim=1)
values[:5], _labels[:5]

(tensor([0.0914, 0.0868, 0.0698, 0.1125, 0.1337], grad_fn=<SliceBackward0>),
 tensor([5, 8, 3, 8, 3]))

In [12]:
# OR
_labels = torch.argmax(result, dim=1)
_labels[:5]

tensor([5, 8, 3, 8, 3])

### Train Model

In [13]:
# ==== Init model ====
model = FFN(
    input_size=input_size,
    hidden_size=hidden_size,
    num_classes=num_classes,
).to(device=device)

# ==== Define loss function and optimizer ====
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

# ==== Training loop ====
for epoch in tqdm(range(num_epochs)):
    model.train()
    running_loss = 0.0

    # ==== Batch training loop ====
    for images, labels in train_loader:
        # Push the data to GPU if available
        images, labels = images.to(device), labels.to(device)

        # ==== Forwardprop ====
        outputs = model(images)
        loss: nn.CrossEntropyLoss = criterion(outputs, labels)

        # ==== Backprop ====
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Update the loss
        running_loss += loss.item()

    # ==== Validation loop ====
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for images, labels in val_loader:
            # Push the data to GPU if available
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            val_loss = criterion(outputs, labels).item()
            # It returns the value and the index.
            # We're interested in the index
            _, predicted = torch.max(outputs, dim=1)
            val_total += labels.size(0)  # or labels.shape[0]
            val_correct += (predicted == labels).sum().item()
    val_accuracy = (val_correct / val_total) * 100

    print(
        f"Epoch {epoch + 1}/{num_epochs}, "
        f"Training Loss: {running_loss / len(train_loader)}, "
        f"Validation Loss: {val_loss / len(val_loader)}, "
        f"Validation Accuracy: {val_accuracy:.2f}%"
    )

# Test the model on the test dataset
model.eval()
test_correct = 0
test_total = 0
with torch.no_grad():
    for images, labels in test_loader:
        # Push the data to GPU if available
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        # It returns the value and the index.
        # We're interested in the index
        _, predicted = torch.max(outputs, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()

test_accuracy = (test_correct / test_total) * 100
print(f"Test Accuracy: { test_accuracy:.2f}%")

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/5, Training Loss: 0.4448000779549281, Validation Loss: 0.0010753319935595735, Validation Accuracy: 91.18%
Epoch 2/5, Training Loss: 0.21855246077477933, Validation Loss: 0.0002921261685959836, Validation Accuracy: 94.29%
Epoch 3/5, Training Loss: 0.15856496324390174, Validation Loss: 0.00029735376146879604, Validation Accuracy: 95.35%
Epoch 4/5, Training Loss: 0.12930977767209212, Validation Loss: 0.00021179367173859413, Validation Accuracy: 96.05%
Epoch 5/5, Training Loss: 0.11113543171621859, Validation Loss: 0.00017865598598059188, Validation Accuracy: 95.94%
Test Accuracy: 96.12%


## Putting It All Together

In [14]:
class FFN(nn.Module):
    """This is used to build a Feed Forward Network architecture that
    is used for classification."""

    def __init__(self, input_size: int, hidden_size: int, num_classes: int) -> None:
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 64)
        self.fc3 = nn.Linear(64, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """This performs the forward propagation."""
        # Flatten the input images
        x = x.view(-1, (28 * 28))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [15]:
def train(
    model: FFN,
    device: Any,
    train_loader: DataLoader,
    val_loader: DataLoader,
    criterion: Any,
    optimizer: torch.optim,
    num_epochs: int,
) -> FFN:
    """This is used for training the model."""
    for epoch in tqdm(range(num_epochs)):
        model.train()
        running_loss = 0.0

        # ==== Batch training loop ====
        for images, labels in train_loader:
            # Push the data to GPU if available
            images, labels = images.to(device), labels.to(device)

            # ==== Forwardprop ====
            outputs = model(images)
            loss: nn.CrossEntropyLoss = criterion(outputs, labels)

            # ==== Backprop ====
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Update the loss
            running_loss += loss.item()

        # ==== Validation loop ====
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for images, labels in val_loader:
                # Push the data to GPU if available
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                val_loss = criterion(outputs, labels).item()
                # It returns the value and the index.
                # We're interested in the index
                _, predicted = torch.max(outputs, dim=1)
                val_total += labels.size(0)  # or labels.shape[0]
                val_correct += (predicted == labels).sum().item()
        val_accuracy = (val_correct / val_total) * 100

        print(
            f"Epoch {epoch + 1}/{num_epochs}, "
            f"Training Loss: {running_loss / len(train_loader)}, "
            f"Validation Loss: {val_loss / len(val_loader)}, "
            f"Validation Accuracy: {val_accuracy:.2f}%"
        )
    return model


def test(model: FFN, device: Any, test_loader: DataLoader):
    """This is used to the model on the test dataset."""
    model.eval()
    test_correct = 0
    test_total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            # Push the data to GPU if available
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            predicted = torch.argmax(outputs, dim=1)
            test_total += labels.size(0)
            test_correct += (predicted == labels).sum().item()

    test_accuracy = (test_correct / test_total) * 100
    print(f"Test Accuracy: { test_accuracy:.2f}%")

In [16]:
### Hyper-parameters

# Each MNSIT image is 1x28x28, so it is an 2D array [28,28]
# I'll flatten the image as vector dim=1*28*28
input_size = 1 * 28 * 28
hidden_size = 128
num_classes = 10
num_epochs = 10
batch_size = 64
learning_rate = 0.001

train_size = int(0.8 * len(train_dataset))  # 80% of training data
val_size = len(train_dataset) - train_size  # 20% of training data

In [17]:
# Create DataLoader for each dataset
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [18]:
def main():
    """This is the main function."""
    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # If we're on a CUDA machine, this should print a CUDA device:
    print(f"Working on device={device!r}")

    # ==== Init model ====
    model = FFN(
        input_size=input_size,
        hidden_size=hidden_size,
        num_classes=num_classes,
    ).to(device=device)

    # ==== Define loss function and optimizer ====
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

    # ==== Train the model ====
    model = train(
        model,
        device=device,
        train_loader=train_loader,
        val_loader=val_loader,
        criterion=criterion,
        optimizer=optimizer,
        num_epochs=num_epochs,
    )

    # ==== Evaluate the model ====
    test(model, device=device, test_loader=test_loader)

In [19]:
# Train and evaluate
main()

Working on device=device(type='cpu')


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/10, Training Loss: 0.42105956345796586, Validation Loss: 0.0008805161143871064, Validation Accuracy: 92.10%
Epoch 2/10, Training Loss: 0.2069486830085516, Validation Loss: 0.00026071828572039906, Validation Accuracy: 94.63%
Epoch 3/10, Training Loss: 0.15061235540608564, Validation Loss: 0.0002729851316581381, Validation Accuracy: 95.68%
Epoch 4/10, Training Loss: 0.11916445186485847, Validation Loss: 0.00011550384434930821, Validation Accuracy: 95.88%
Epoch 5/10, Training Loss: 0.10217102098837495, Validation Loss: 0.00022950998329101724, Validation Accuracy: 96.29%
Epoch 6/10, Training Loss: 0.08659990383436282, Validation Loss: 0.00013341131481401465, Validation Accuracy: 96.64%
Epoch 7/10, Training Loss: 0.07828768156965574, Validation Loss: 2.9218328641133106e-05, Validation Accuracy: 96.78%
Epoch 8/10, Training Loss: 0.06601344964032371, Validation Loss: 0.00018226797197093354, Validation Accuracy: 95.30%
Epoch 9/10, Training Loss: 0.0628584173200652, Validation Loss: 3.2