In [1]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Sequence, TypeAlias, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt


# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

warnings.filterwarnings("ignore")

# Add seed
np.random.seed(0)

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
%load_ext watermark
%watermark -v -p numpy,pandas,matplotlib,torch,lightning,scikit-learn --conda

Python implementation: CPython
Python version       : 3.11.8
IPython version      : 8.22.2

numpy       : 1.26.4
pandas      : 2.2.1
matplotlib  : 3.8.3
torch       : 2.2.2
lightning   : 2.2.1
scikit-learn: 1.4.1.post1

conda environment: torch_p11



### Load Data


In [3]:
train_fp: str = "../../data/titanic/train_features.parquet"
train_fp1: str = "../../data/titanic/train_target.parquet"
test_fp: str = "../../data/titanic/test_features.parquet"
test_fp1: str = "../../data/titanic/test_target.parquet"

train_df: pl.DataFrame = pl.read_parquet(train_fp)
train_target_df: pl.DataFrame = pl.read_parquet(train_fp1)
test_df: pl.DataFrame = pl.read_parquet(test_fp)
test_target_df: pl.DataFrame = pl.read_parquet(test_fp1)

train_df.shape, train_target_df.shape, test_df.shape, test_target_df.shape

((1100, 11), (1100, 1), (328, 11), (328, 1))

In [4]:
train_df.head()

num_vars__age,num_vars__fare,num_vars__parch,num_vars__pclass,num_vars__sibsp,num_vars__ticket,cat_vars__embarked_c,cat_vars__embarked_q,cat_vars__embarked_s,cat_vars__sex_female,cat_vars__sex_male
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.524008,0.014737,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
0.19833,0.039525,0.111111,1.0,0.125,1.0,0.0,0.0,1.0,0.0,1.0
0.436325,0.039525,0.111111,1.0,0.125,1.0,0.0,0.0,1.0,1.0,0.0
0.348643,0.046845,0.0,0.5,0.125,1.0,1.0,0.0,0.0,1.0,0.0
0.373695,0.014151,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm

In [6]:
data: pl.DataFrame = pl.concat([train_df, train_target_df], how="horizontal")

X_: pd.DataFrame = data.to_pandas().drop(columns=["survived"])
y_: pd.Series = data.to_pandas()["survived"]

X_test: npt.NDArray[np.float_ | np.int_] = test_df.to_numpy()
y_test: npt.NDArray[np.float_ | np.int_] = test_target_df.to_numpy().squeeze()

X_train, X_val, y_train, y_val = train_test_split(
    X_, y_, test_size=0.15, random_state=42
)
X_train: npt.NDArray[np.float_ | np.int_] = X_train.values
X_val: npt.NDArray[np.float_ | np.int_] = X_val.values

console.print(X_train.shape, X_val.shape, X_test.shape)

In [7]:
y_train: npt.NDArray[np.float_ | np.int_] = y_train.to_numpy()  # .reshape(-1, 1)
y_val: npt.NDArray[np.float_ | np.int_] = y_val.to_numpy()  # .reshape(-1, 1)


console.print(y_train.shape, y_val.shape, y_test.shape)

In [8]:
from torch.utils.data import Dataset, DataLoader


class TitanicClassifier(nn.Module):

    def __init__(self, input_size: int, hidden_units: int, num_classes: int) -> None:
        super().__init__()
        # Hidden layer 1
        self.fc1 = nn.Linear(input_size, hidden_units)
        self.relu1 = nn.ReLU()

        # Hidden layer 2
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.relu2 = nn.ReLU()

        # Output layer
        self.fc3 = nn.Linear(hidden_units, num_classes)

    def forward(self, x) -> torch.Tensor:
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return x


class MyDataset(Dataset):
    def __init__(self, X: torch.Tensor, y: torch.Tensor) -> None:

        self.features = torch.tensor(X, dtype=torch.float32)
        self.labels = torch.tensor(y, dtype=torch.long)

    def __getitem__(self, index) -> tuple[torch.Tensor, torch.Tensor]:
        x = self.features[index]
        y = self.labels[index]
        return x, y

    def __len__(self) -> int:
        return self.labels.shape[0]

In [9]:
train_ds = MyDataset(X_train, y_train)
val_ds = MyDataset(X_val, y_val)
test_ds = MyDataset(X_test, y_test)

train_loader = DataLoader(
    dataset=train_ds,
    batch_size=32,
    shuffle=True,
)

val_loader = DataLoader(
    dataset=val_ds,
    batch_size=32,
    shuffle=False,
)

test_loader = DataLoader(
    dataset=test_ds,
    batch_size=32,
    shuffle=False,
)

### Training Loop


In [10]:
Model: TypeAlias = torch.nn.Module

In [11]:
def compute_accuracy(model: Model, dataloader: DataLoader):

    model = model.eval()

    correct: float = 0.0
    total_examples: int = 0

    for _, (features, labels) in enumerate(dataloader):

        with torch.inference_mode():  # Basically the same as torch.no_grad
            logits = model(features)

        predictions: torch.Tensor = torch.argmax(logits, dim=1)

        compare: bool = (labels == predictions).float()
        correct += torch.sum(compare)
        total_examples += len(compare)

    return correct / total_examples

In [12]:
torch.manual_seed(1)

epochs: int = 100
learning_rate: float = 0.01

torch.manual_seed(1)
model = TitanicClassifier(input_size=11, num_classes=2)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in tqdm(range(epochs)):

    model = model.train()
    for batch_idx, (features, labels) in enumerate(train_loader):

        logits = model(features)

        loss = nn.CrossEntropyLoss(reduction="mean")(logits, labels)  # Loss function

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        ### LOGGING
        print(
            f"Epoch: {epoch+1:03d}/{epochs:03d}"
            f" | Batch {batch_idx:03d}/{len(train_loader):03d}"
            f" | Train/Val Loss: {loss:.2f}"
        )

    train_acc = compute_accuracy(model, train_loader)
    val_acc = compute_accuracy(model, val_loader)
    console.print(
        f"Train Acc {train_acc*100:.2f}% | Val Acc {val_acc*100:.2f}%", style="info"
    )

TypeError: TitanicClassifier.__init__() missing 1 required positional argument: 'hidden_units'

In [None]:
train_acc: float = compute_accuracy(model, train_loader)
val_acc: float = compute_accuracy(model, val_loader)
test_acc: float = compute_accuracy(model, test_loader)

print(f"Train Acc: {train_acc*100:.2f}%")
print(f"Val Acc: {val_acc*100:.2f}%")
print(f"Test Acc: {test_acc*100:.2f}%")

In [None]:
from sklearn.metrics import roc_auc_score


def compute_auc(model: Model, dataloader: DataLoader) -> float:

    model = model.eval()

    correct: list[Any] = []
    predictions: list[Any] = []

    for _, (features, labels) in enumerate(dataloader):

        with torch.inference_mode():  # Same as torch.no_grad
            logits = model(features)
            preds = F.sigmoid(logits)[:, 1]

        correct.append(list(labels.numpy()))
        predictions.append(list(preds))

    correct = [v for list_x in correct for v in list_x]
    predictions = [v for list_x in predictions for v in list_x]

    auc_score = roc_auc_score(y_true=correct, y_score=predictions)
    return auc_score

In [None]:
train_auc: float = compute_auc(model, train_loader)
val_auc: float = compute_auc(model, val_loader)
test_auc: float = compute_auc(model, test_loader)

console.print(f"Train AUC: {train_auc*100:.2f}%")
console.print(f"Val AUC: {val_auc*100:.2f}%")
console.print(f"Test AUC: {test_auc*100:.2f}%")