# Import required libraries

In [1]:
import pandas as pd

import torch
from torch.utils.data import TensorDataset, random_split, Subset

# custom utility functions
from modules.utils import Utils

# Preliminary Setup

In [2]:
# Project helper object; later cells use it for seeding and for the training loop
utils = Utils()

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


# Exploring the Data

In [3]:
# load the training data; the path is relative to the notebook's working directory
df = pd.read_csv('./data/train.csv')

In [4]:
# preview the first rows to see the column layout (id, feature_*, label)
df.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_491,feature_492,feature_493,feature_494,feature_495,feature_496,feature_497,feature_498,feature_499,label
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.063888,43
1,1,0.0,0.0,0.0,0.071982,0.0,0.0,0.0,0.071982,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.071982,0.0,0.0,16
2,2,0.111111,0.0,0.111111,0.0,0.0,0.111111,0.0,0.111111,0.0,...,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21
3,3,0.0,0.087039,0.0,0.0,0.0,0.0,0.0,0.0,0.087039,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,4,0.0,0.0,0.069673,0.0,0.069673,0.0,0.069673,0.0,0.0,...,0.069673,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [5]:
# peek at the first few target values and their dtype
df['label'].head()

0    43
1    16
2    21
3     2
4     1
Name: label, dtype: int64

In [6]:
# sanity check for missing values: count nulls per column and
# show only the columns where that count is non-zero
null_counts = df.isna().sum()
print(null_counts.loc[null_counts.gt(0)])

Series([], dtype: int64)


In [7]:
# the target column, kept apart from the model inputs
y_df = df['label']

# every column except the row identifier and the target is a feature
X_df = df.drop(['id', 'label'], axis=1)

print("Number of feature columns:", len(X_df.columns))

Number of feature columns: 500


In [8]:
# number of samples per class, ordered by label value, to check class balance
class_counts = y_df.value_counts(sort=False).sort_index()
print(class_counts)

label
0     2309
1     2306
2     2307
3     2309
4     2309
5     2309
6     2310
7     2307
8     2304
9     2309
10    2307
11    2307
12    2310
13    2308
14    2307
15    2303
16    2305
17    2309
18    2309
19    2309
20    2310
21    2307
22    2310
23    2309
24    2309
25    2304
26    2308
27    2310
28    2309
29    2310
30    2310
31    2308
32    2310
33    2310
34    2309
35    2309
36    2307
37    2305
38    2309
39    2307
40    2310
41    2310
42    2308
43    2308
44    2306
45    2310
46    2307
47    2305
48    2310
49    2309
Name: count, dtype: int64


# Processing Data before K-Fold

In [9]:
# Build both tensors from the underlying NumPy arrays. Passing the pandas Series
# itself to torch.tensor leaves the conversion dependent on how the Series is
# indexed; going through .values is positional and matches how the features
# are converted.
X_tensor = torch.tensor(X_df.values, dtype=torch.float32)
y_tensor = torch.tensor(y_df.values, dtype=torch.long)

# features and labels must stay row-aligned
assert X_tensor.shape[0] == y_tensor.shape[0], "feature/label row count mismatch"

print(X_tensor.shape)
print(y_tensor.shape)

torch.Size([115406, 500])
torch.Size([115406])


In [10]:
# NORMALIZING WITH MEAN 0 AND UNIT VARIANCE
# NOTE(review): mean/std are computed over every row of X_tensor, i.e. before the
# hold-out split and before the k-fold split below, so validation rows contribute
# to the scaling statistics. Consider fitting them on training rows only.
mean = X_tensor.mean(dim=0, keepdim=True)
std = X_tensor.std(dim=0, unbiased=False, keepdim=True)

# avoid division by zero for constant columns
std[std == 0] = 1.0

X_tensor_norm = (X_tensor - mean) / std

print(X_tensor_norm.shape)
print(X_tensor_norm.mean(dim=0))  # should be ~0
# use the same (population) estimator as the normalization above, otherwise the
# check measures a slightly different quantity; constant columns will show 0 here
print(X_tensor_norm.std(dim=0, unbiased=False))   # should be ~1

torch.Size([115406, 500])
tensor([-6.8555e-08, -4.5615e-08,  1.3685e-08,  2.0196e-08, -1.8378e-08,
        -7.7480e-08, -1.3883e-08, -2.7634e-08, -8.6405e-08, -8.4620e-09,
        -2.9088e-08, -1.9965e-08, -2.1915e-08,  3.9732e-08, -3.6889e-08,
         1.4544e-08,  3.8542e-08,  5.5532e-09,  3.7352e-08,  4.4128e-08,
        -1.9998e-09, -4.4822e-08,  2.4857e-08,  2.0229e-08, -1.3883e-08,
         4.0327e-09,  2.8030e-08,  3.0542e-08,  2.0890e-08, -8.6603e-09,
         2.8890e-08, -1.3288e-08, -3.3848e-08,  4.3500e-08, -1.8048e-08,
        -1.4263e-08,  1.2561e-09,  1.9833e-08, -2.1089e-08, -3.1468e-08,
         2.1287e-08, -9.4701e-09,  1.9833e-09,  1.8709e-08, -3.8277e-08,
         1.9568e-08, -3.9732e-08,  1.9172e-09, -5.5267e-08,  1.5271e-08,
        -7.8670e-09,  2.0031e-08, -6.3465e-09, -2.1485e-09, -8.1645e-09,
         2.6642e-08,  4.7202e-08,  6.0027e-08, -3.1865e-08, -1.6924e-08,
         3.4972e-08, -4.0988e-08, -4.4954e-09, -6.6407e-08,  8.7661e-08,
        -4.4095e-08, -2.3

In [11]:
# quick look at the label tensor (torch abbreviates long tensors when printing)
print(y_tensor)

tensor([43, 16, 21,  ..., 32, 46, 33])


In [12]:
# pair every normalized feature row with its label
dataset = TensorDataset(X_tensor_norm, y_tensor)

# 80/20 hold-out sizes; the validation part takes whatever the rounding leaves over
n_samples = len(X_tensor)
n_train = int(0.8 * n_samples)
n_val = n_samples - n_train

# seed the global RNGs through the project helper, and give the split its own
# seeded generator so the partition is reproducible
utils.set_seed(433)
g = torch.Generator().manual_seed(433)
train_set, val_set = random_split(dataset, [n_train, n_val], generator=g)

print(f"Train samples: {len(train_set)}")
print(f"Validation samples: {len(val_set)}")

Train samples: 92324
Validation samples: 23082


# Performing K-Fold Cross Validation

In [13]:
def k_fold_cross_validation(X_tensor, y_tensor, device, params, k=5, grad_clip=True, gauss=True, log=True, patience=5):
    """
    Perform K-fold cross-validation using PyTorch only.

    The sample indices are shuffled once (after seeding through ``utils.set_seed(433)``)
    and cut into ``k`` contiguous validation folds of ``n_samples // k`` indices each;
    the last fold also takes the remainder. ``utils.train_and_validate`` is called once
    per fold with the remaining indices as the training subset.

    Args:
        X_tensor: training dataset features
        y_tensor: training dataset labels
        device: device to run training on ("cpu" or "cuda")
        params: dictionary of training hyperparameters
        k: number of folds (must satisfy 2 <= k <= number of samples)
        grad_clip: whether to apply gradient clipping during training
        gauss: whether to apply Gaussian noise
        log: whether to log training and validation metrics during training
        patience: number of epochs to wait for validation improvement before early stopping

    Returns:
        models: list with the model returned by ``utils.train_and_validate`` for each fold
            (presumably with the best validation weights restored -- confirm in ``Utils``)
        fold_train_accs: list of the training-accuracy results returned for each fold
        fold_val_accs: list of the validation-accuracy results returned for each fold
        fold_best_val_accs: list of the best validation accuracy for each fold
        fold_losses: list of the training-loss results returned for each fold

    Raises:
        ValueError: if ``k`` is smaller than 2 or larger than the number of samples.
    """
    utils.set_seed(433)

    dataset = TensorDataset(X_tensor, y_tensor)
    n_samples = len(dataset)

    # k < 2 leaves no training data (or divides by zero); k > n_samples yields empty folds
    if k < 2 or k > n_samples:
        raise ValueError(f"k must be between 2 and the number of samples ({n_samples}), got {k}")

    indices = torch.randperm(n_samples)  # shuffle indices reproducibly (global RNG seeded above)

    fold_size = n_samples // k

    models = []
    fold_train_accs = []
    fold_val_accs = []
    fold_best_val_accs = []
    fold_losses = []

    for fold in range(k):
        print(f"\n===== Fold {fold+1}/{k} =====")

        # determine validation indices
        start = fold * fold_size
        end = start + fold_size if fold != k - 1 else n_samples  # last fold takes the remainder
        val_idx = indices[start:end]
        train_idx = torch.cat([indices[:start], indices[end:]])

        # create subsets
        train_subset = Subset(dataset, train_idx)
        val_subset = Subset(dataset, val_idx)

        # train the model on this fold
        model, train_accs, val_accs, losses, best_val_acc = utils.train_and_validate(
            train_subset, val_subset, device, params=params, grad_clip=grad_clip, gauss=gauss, log=log, patience=patience
        )

        models.append(model)
        fold_train_accs.append(train_accs)
        fold_val_accs.append(val_accs)
        fold_best_val_accs.append(best_val_acc)
        fold_losses.append(losses)

        print(f"Fold {fold+1} Best Validation Accuracy: {best_val_acc:.2f}%")

    # Summarise with each fold's best validation accuracy (fold_best_val_accs), not the
    # val_accs results collected in fold_val_accs. An explicit float dtype keeps
    # mean()/std() working even if the accuracies come back as integers.
    best_accs = torch.tensor(fold_best_val_accs, dtype=torch.float32)
    mean_acc = float(best_accs.mean())
    std_acc = float(best_accs.std())

    print(f"\nK-Fold Validation Results ({k} folds): Mean = {mean_acc:.2f}%, Std = {std_acc:.2f}%")

    return models, fold_train_accs, fold_val_accs, fold_best_val_accs, fold_losses

In [14]:
"""

    This cell runs k-fold cross-validation, which takes a very long time.

"""

# Experiment log: `params` is assigned three times in a row, so only the LAST
# assignment (third submission) is the configuration actually passed to k-fold.

# Medium MLP first submission w/modified architecture, gets 0.827 on FULL TRAIN SET
# grad_clip=True, gauss=True, patience=10. HITS 81.05% VAL ACC
params = dict(
    hidden_size=4096,
    lr=0.001,
    weight_decay=0.1,
    batch_size=512,
    init_type='xavier',
    dropout=0.5,
    noise_std=0.65,
    num_epochs=100,
    warmup_epochs=12,
)

# Medium MLP second submission w/modified architecture, gets 0.834 on FULL TRAIN SET
# grad_clip=True, gauss=True, patience=10. HITS 81.45% VAL ACC
params = dict(
    hidden_size=4096,
    lr=0.005,
    weight_decay=0.05,
    batch_size=512,
    init_type='xavier',
    dropout=0.5,
    noise_std=0.6,
    num_epochs=100,
    warmup_epochs=12,
)

# Medium MLP third submission w/modified architecture, gets 0.839 on FULL TRAIN SET
# grad_clip=True, gauss=True, patience=10. HITS 81.93% VAL ACC
# grad_clip=True, gauss=True, patience=10, label_smoothing=0.15, max_norm=10.0. HITS 82.65% VAL ACC
params = dict(
    hidden_size=4096,
    lr=0.005,
    weight_decay=0.05,
    batch_size=512,
    init_type='xavier',
    dropout=0.5,
    noise_std=0.6,
    num_epochs=125,
    warmup_epochs=12,
)

# 5-fold run on the normalized features with the active configuration above
models, fold_train_accs, fold_val_accs, fold_best_val_accs, fold_losses = k_fold_cross_validation(
    X_tensor_norm,
    y_tensor,
    device,
    params,
    k=5,
    grad_clip=True,
    gauss=True,
    log=True,
    patience=10,
)


===== Fold 1/5 =====


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [1/125] | Train Acc: 1.96% | Val Acc: 1.99% | Loss: 4.3651023191673595
Epoch [2/125] | Train Acc: 21.35% | Val Acc: 39.58% | Loss: 3.3904668094703183
Epoch [3/125] | Train Acc: 34.54% | Val Acc: 47.58% | Loss: 2.9099782410153456
Epoch [4/125] | Train Acc: 41.69% | Val Acc: 52.44% | Loss: 2.687107433773075
Epoch [5/125] | Train Acc: 46.51% | Val Acc: 54.91% | Loss: 2.5493847274057204
Epoch [6/125] | Train Acc: 49.63% | Val Acc: 57.04% | Loss: 2.4488761279087807
Epoch [7/125] | Train Acc: 52.85% | Val Acc: 59.27% | Loss: 2.360531159312364
Epoch [8/125] | Train Acc: 55.68% | Val Acc: 61.44% | Loss: 2.2792690683334396
Epoch [9/125] | Train Acc: 58.73% | Val Acc: 62.26% | Loss: 2.1949017452567436
Epoch [10/125] | Train Acc: 61.78% | Val Acc: 63.89% | Loss: 2.111141539831133
Epoch [11/125] | Train Acc: 64.37% | Val Acc: 65.91% | Loss: 2.0403343357950763
Epoch [12/125] | Train Acc: 66.48% | Val Acc: 66.19% | Loss: 1.9844195529370652
Epoch [13/125] | Train Acc: 68.32% | Val Acc: 66.49% |

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [1/125] | Train Acc: 1.98% | Val Acc: 1.92% | Loss: 4.363386110811417
Epoch [2/125] | Train Acc: 21.14% | Val Acc: 39.83% | Loss: 3.3906664583788246
Epoch [3/125] | Train Acc: 34.32% | Val Acc: 47.57% | Loss: 2.9147979160353255
Epoch [4/125] | Train Acc: 41.69% | Val Acc: 52.27% | Loss: 2.685438067071844
Epoch [5/125] | Train Acc: 46.33% | Val Acc: 54.90% | Loss: 2.547281445405879
Epoch [6/125] | Train Acc: 49.94% | Val Acc: 56.90% | Loss: 2.443604927207602
Epoch [7/125] | Train Acc: 52.98% | Val Acc: 58.84% | Loss: 2.355111506991744
Epoch [8/125] | Train Acc: 55.88% | Val Acc: 60.87% | Loss: 2.2748509401981987
Epoch [9/125] | Train Acc: 58.71% | Val Acc: 62.28% | Loss: 2.191845334236647
Epoch [10/125] | Train Acc: 61.84% | Val Acc: 63.69% | Loss: 2.111089475617769
Epoch [11/125] | Train Acc: 64.46% | Val Acc: 64.74% | Loss: 2.04173057208047
Epoch [12/125] | Train Acc: 66.67% | Val Acc: 65.65% | Loss: 1.9813848089067845
Epoch [13/125] | Train Acc: 68.73% | Val Acc: 66.06% | Loss:

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [1/125] | Train Acc: 1.96% | Val Acc: 1.90% | Loss: 4.364548193190177
Epoch [2/125] | Train Acc: 21.20% | Val Acc: 39.88% | Loss: 3.3912318716557923
Epoch [3/125] | Train Acc: 34.26% | Val Acc: 48.06% | Loss: 2.916291303213715
Epoch [4/125] | Train Acc: 41.62% | Val Acc: 52.58% | Loss: 2.6860221153670394
Epoch [5/125] | Train Acc: 46.39% | Val Acc: 54.67% | Loss: 2.548887589002992
Epoch [6/125] | Train Acc: 49.73% | Val Acc: 56.74% | Loss: 2.445974023058716
Epoch [7/125] | Train Acc: 52.88% | Val Acc: 58.34% | Loss: 2.3589025006267135
Epoch [8/125] | Train Acc: 55.84% | Val Acc: 61.25% | Loss: 2.2741844451753486
Epoch [9/125] | Train Acc: 58.89% | Val Acc: 62.47% | Loss: 2.193562033180941
Epoch [10/125] | Train Acc: 61.73% | Val Acc: 63.69% | Loss: 2.1125043389392424
Epoch [11/125] | Train Acc: 64.36% | Val Acc: 65.53% | Loss: 2.040511078980438
Epoch [12/125] | Train Acc: 66.59% | Val Acc: 66.29% | Loss: 1.9807314712685555
Epoch [13/125] | Train Acc: 68.40% | Val Acc: 66.44% | Lo

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [1/125] | Train Acc: 1.97% | Val Acc: 1.93% | Loss: 4.365409058246256
Epoch [2/125] | Train Acc: 21.23% | Val Acc: 39.53% | Loss: 3.3889893794918398
Epoch [3/125] | Train Acc: 34.30% | Val Acc: 47.91% | Loss: 2.9141537012254646
Epoch [4/125] | Train Acc: 41.71% | Val Acc: 52.26% | Loss: 2.688505468337542
Epoch [5/125] | Train Acc: 46.19% | Val Acc: 55.18% | Loss: 2.5493432024891964
Epoch [6/125] | Train Acc: 49.81% | Val Acc: 57.74% | Loss: 2.4459380714970167
Epoch [7/125] | Train Acc: 52.60% | Val Acc: 59.50% | Loss: 2.3616297011061667
Epoch [8/125] | Train Acc: 55.65% | Val Acc: 61.59% | Loss: 2.275896930963148
Epoch [9/125] | Train Acc: 58.64% | Val Acc: 63.43% | Loss: 2.195141626813048
Epoch [10/125] | Train Acc: 61.74% | Val Acc: 64.98% | Loss: 2.1144052772589115
Epoch [11/125] | Train Acc: 64.30% | Val Acc: 65.75% | Loss: 2.0429595942075034
Epoch [12/125] | Train Acc: 66.52% | Val Acc: 66.67% | Loss: 1.9829411277647042
Epoch [13/125] | Train Acc: 68.43% | Val Acc: 67.03% | 

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [1/125] | Train Acc: 1.97% | Val Acc: 2.12% | Loss: 4.364784382176841
Epoch [2/125] | Train Acc: 21.29% | Val Acc: 39.32% | Loss: 3.388981940960028
Epoch [3/125] | Train Acc: 34.07% | Val Acc: 47.41% | Loss: 2.9163020414298124
Epoch [4/125] | Train Acc: 41.76% | Val Acc: 51.95% | Loss: 2.685524843577976
Epoch [5/125] | Train Acc: 46.34% | Val Acc: 54.93% | Loss: 2.5500038021240194
Epoch [6/125] | Train Acc: 49.74% | Val Acc: 57.60% | Loss: 2.4480302714169384
Epoch [7/125] | Train Acc: 52.87% | Val Acc: 59.51% | Loss: 2.3593630921070594
Epoch [8/125] | Train Acc: 55.84% | Val Acc: 60.96% | Loss: 2.2769150534378975
Epoch [9/125] | Train Acc: 58.53% | Val Acc: 62.79% | Loss: 2.1966774172667414
Epoch [10/125] | Train Acc: 61.67% | Val Acc: 64.30% | Loss: 2.1155038865185567
Epoch [11/125] | Train Acc: 64.37% | Val Acc: 65.71% | Loss: 2.0449028108620024
Epoch [12/125] | Train Acc: 66.83% | Val Acc: 66.92% | Loss: 1.9789001054570836
Epoch [13/125] | Train Acc: 68.32% | Val Acc: 67.41% |