# Cross Validation

In [1]:
# Built-in library
from typing import Any, Optional, Sequence, Union

# Standard imports
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import matplotlib.pyplot as plt

# Black code formatter (Optional)
%load_ext lab_black
# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
# Configure the backend
import matplotlib_inline.backend_inline

matplotlib_inline.backend_inline.set_matplotlib_formats("svg")
import seaborn as sns

In [3]:
def create_iris_data() -> tuple[np.ndarray, np.ndarray]:
    """This returns the independent and the target features."""
    # load data
    iris_data = sns.load_dataset("iris")

    # Preprocess the data
    condlist = [
        (iris_data["species"] == "setosa"),
        (iris_data["species"] == "versicolor"),
        iris_data["species"] == "virginica",
    ]
    choicelist = [0, 1, 2]
    iris_data["target"] = np.select(condlist=condlist, choicelist=choicelist)

    # Convert the data to Torch tensor
    X = torch.tensor(iris_data.loc[:, iris_data.columns[:4]].values).float()
    y = torch.tensor(iris_data["target"].values).long()

    print(f"Shape of X: {X.shape}, Shape of X: {y.shape}")
    return (X, y)

In [4]:
# Load data
X, y = create_iris_data()

Shape of X: torch.Size([150, 4]), Shape of X: torch.Size([150])


#### Separate Into Train and Test

In [5]:
rng = np.random.default_rng(seed=1)
arr = rng.random(10)
arr

array([0.51182162, 0.9504637 , 0.14415961, 0.94864945, 0.31183145,
       0.42332645, 0.82770259, 0.40919914, 0.54959369, 0.02755911])

In [6]:
training_pct = 0.8
training_size = int(arr.shape[0] * training_pct)

data_bools = np.zeros(shape=(arr.shape[0]), dtype=bool)

# Select the training indices
training_idxs = np.random.choice(
    a=range(arr.shape[0]), size=training_size, replace=False
)
training_idxs

array([6, 4, 1, 3, 7, 5, 8, 0])

In [7]:
# Add the training indices and equate them to True
data_bools[training_idxs] = True
data_bools

array([ True,  True, False,  True,  True,  True,  True,  True,  True,
       False])

In [8]:
# Select the training data
training_data = arr[data_bools]
training_data

array([0.51182162, 0.9504637 , 0.94864945, 0.31183145, 0.42332645,
       0.82770259, 0.40919914, 0.54959369])

In [9]:
# Select the inverse
test_data = arr[~data_bools]
test_data

array([0.14415961, 0.02755911])

In [10]:
# Putting it together
def split_data_into_train_test(
    input_arr: np.ndarray, training_pct: float
) -> tuple[np.ndarray, np.ndarray]:

    training_size = int(input_arr.shape[0] * training_pct)
    # Array containing the training and test indices
    data_bools = np.zeros(shape=(input_arr.shape[0]), dtype=bool)

    # Select the training indices
    training_idxs = np.random.choice(
        a=range(input_arr.shape[0]), size=training_size, replace=False
    )
    # Add the training indices and equate them to True
    data_bools[training_idxs] = True
    # Select the training data
    training_data = input_arr[data_bools]
    # Select the inverse
    test_data = input_arr[~data_bools]
    return (training_data, test_data)

In [11]:
X_train, X_test = split_data_into_train_test(input_arr=X, training_pct=0.8)

X_train.shape, X_test.shape

(torch.Size([120, 4]), torch.Size([30, 4]))

In [12]:
y_train, y_test = split_data_into_train_test(input_arr=y, training_pct=0.8)
y_train.shape, y_test.shape

(torch.Size([120]), torch.Size([30]))

In [13]:
# Build ANN
class Net(nn.Module):
    """Artificial Neural Network."""

    def __init__(self, n_units: int, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.input = nn.Linear(4, n_units)
        self.output = nn.Linear(n_units, 3)

    def forward(self, X: torch.Tensor) -> torch.Tensor:
        """This is used to perform forward propagation."""
        X = self.input(X)
        X = F.relu(X)
        X = self.output(X)
        X = F.softmax(X)
        return X

In [14]:
net = Net(n_units=32)
net

Net(
  (input): Linear(in_features=4, out_features=32, bias=True)
  (output): Linear(in_features=32, out_features=3, bias=True)
)

In [15]:
# Train the model
learning_rate, epochs = 0.01, 1_000
optimizer = torch.optim.SGD(params=net.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
losses = torch.zeros(size=(epochs,))

for epoch_idx in np.arange(epochs):
    # Reset the gradients from prev. back prop
    optimizer.zero_grad()

    # Fwd prop
    _y_pred = net(X_train)

    # Compute loss
    loss = criterion(_y_pred, y_train)
    losses[epoch_idx] = loss

    # Back prop
    loss.backward()
    optimizer.step()


# Make predictions
# Training data
y_proba_train = net(X_train)
y_pred_train = torch.argmax(y_proba_train, axis=1)
accuracy_train = torch.mean((y_pred_train == y_train).float()) * 100

# Test data
y_proba_test = net(X_test)
y_pred_test = torch.argmax(y_proba_test, axis=1)
accuracy_test = torch.mean((y_pred_test == y_test).float()) * 100


accuracy_train, accuracy_test

  X = F.softmax(X)


(tensor(92.5000), tensor(96.6667))

In [16]:
torch.unique(y_train)

from collections import Counter

Counter(list(np.array(y_test)))

Counter({0: 8, 1: 14, 2: 8})