# Cross Validation

In [1]:
# Built-in library
from typing import Any, Optional, Sequence, Union

# Standard imports
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt

# Black code formatter (Optional)
%load_ext lab_black
# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
# Configure the inline matplotlib backend to emit figures as SVG
import matplotlib_inline.backend_inline

matplotlib_inline.backend_inline.set_matplotlib_formats("svg")
# seaborn provides the iris dataset that create_iris_data() loads
import seaborn as sns

In [3]:
def create_iris_data() -> tuple[torch.Tensor, torch.Tensor]:
    """Load the iris dataset and return it as feature and label tensors.

    The species names are encoded as integer class ids:
    ``setosa -> 0``, ``versicolor -> 1``, ``virginica -> 2``.

    Returns
    -------
    tuple[torch.Tensor, torch.Tensor]
        ``X``: float tensor built from the first four columns of the dataset.
        ``y``: long tensor with the encoded class id of every row.

    Raises
    ------
    ValueError
        If a row has a species that is not one of the three names above.
    """
    # Load data
    iris_data = sns.load_dataset("iris")

    # Encode the species as integers. `default=-1` makes an unexpected
    # species detectable instead of silently labelling it as class 0.
    species = iris_data["species"]
    condlist = [
        species == "setosa",
        species == "versicolor",
        species == "virginica",
    ]
    choicelist = [0, 1, 2]
    target = np.select(condlist=condlist, choicelist=choicelist, default=-1)
    if (target == -1).any():
        raise ValueError("iris data contains a species without a class id")

    # Convert the data to Torch tensors (the first four columns are the features)
    X = torch.tensor(iris_data.loc[:, iris_data.columns[:4]].values).float()
    y = torch.tensor(target).long()

    print(f"Shape of X: {X.shape}, Shape of y: {y.shape}")
    return (X, y)

In [4]:
# Load the iris features (X) and the integer-encoded species labels (y)
X, y = create_iris_data()

Shape of X: torch.Size([150, 4]), Shape of X: torch.Size([150])


#### Separate Into Train and Test

In [5]:
# Seeded generator, so the toy array below is the same on every run
rng = np.random.default_rng(1)
# Ten uniform samples from [0, 1) used to demonstrate the split
arr = rng.random(size=10)
arr

array([0.51182162, 0.9504637 , 0.14415961, 0.94864945, 0.31183145,
       0.42332645, 0.82770259, 0.40919914, 0.54959369, 0.02755911])

In [6]:
# Fraction of the samples that goes into the training split
training_pct = 0.8
training_size = int(arr.shape[0] * training_pct)

# Boolean mask over the samples; it starts out all False
data_bools = np.zeros(shape=(arr.shape[0]), dtype=bool)

# Select the training indices without replacement. Drawing from the seeded
# generator `rng` (rather than NumPy's global random state) keeps the split
# identical after every Restart & Run All.
training_idxs = rng.choice(arr.shape[0], size=training_size, replace=False)
training_idxs

array([1, 7, 6, 9, 0, 5, 3, 4])

In [7]:
# Flip the mask entries at the sampled training indices to True; every
# entry that stays False marks a sample left for the test split
data_bools[training_idxs] = True
data_bools

array([ True,  True, False,  True,  True,  True,  True,  True, False,
        True])

In [8]:
# Boolean-mask indexing: keep the elements of `arr` whose mask entry is True.
# The result follows the order of `arr`, not the order of `training_idxs`.
training_data = arr[data_bools]
training_data

array([0.51182162, 0.9504637 , 0.94864945, 0.31183145, 0.42332645,
       0.82770259, 0.40919914, 0.02755911])

In [9]:
# Invert the mask with `~` so that every element not chosen for training
# ends up in the test data
test_data = arr[~data_bools]
test_data

array([0.14415961, 0.54959369])

In [10]:
# Putting it together
def split_data_into_train_test(
    input_arr: Union[np.ndarray, torch.Tensor],
    training_pct: float,
    seed: Optional[Union[int, np.random.Generator]] = None,
) -> tuple[Union[np.ndarray, torch.Tensor], Union[np.ndarray, torch.Tensor]]:
    """Randomly split the rows of ``input_arr`` into a training and a test part.

    Parameters
    ----------
    input_arr : np.ndarray | torch.Tensor
        Data to split along its first dimension.
    training_pct : float
        Fraction of the rows, in ``[0, 1]``, assigned to the training part.
        The training size is ``int(n_rows * training_pct)``.
    seed : int | np.random.Generator | None, default=None
        Seed (or generator) handed to ``np.random.default_rng``. Calling the
        function with the same integer seed on inputs of equal length selects
        the same rows, which keeps e.g. features and labels aligned.
        When ``None``, NumPy's global random state is used, as before.

    Returns
    -------
    tuple
        ``(training_data, test_data)``; both keep the row order of
        ``input_arr``.

    Raises
    ------
    ValueError
        If ``training_pct`` is outside ``[0, 1]``.
    """
    if not 0.0 <= training_pct <= 1.0:
        raise ValueError(
            f"training_pct must be between 0 and 1 (inclusive), got {training_pct}"
        )

    n_samples = input_arr.shape[0]
    training_size = int(n_samples * training_pct)

    # Keep the legacy global state as the default source of randomness so
    # that callers relying on `np.random.seed(...)` are not affected.
    sampler = np.random if seed is None else np.random.default_rng(seed)

    # Select the training indices (each row can be picked at most once)
    training_idxs = sampler.choice(n_samples, size=training_size, replace=False)

    # Boolean mask over the rows: True -> training, False -> test
    is_training = np.zeros(n_samples, dtype=bool)
    is_training[training_idxs] = True

    training_data = input_arr[is_training]
    test_data = input_arr[~is_training]
    return (training_data, test_data)

In [11]:
# 80/20 split of the feature tensor; the cell displays the two resulting shapes
training_data, test_data = split_data_into_train_test(X, 0.8)

(training_data.shape, test_data.shape)

(torch.Size([120, 4]), torch.Size([30, 4]))