# Import required libraries

In [1]:
import random
import itertools
import pandas as pd

import torch
from torch.utils.data import TensorDataset, random_split

from modules.utils import Utils

# Preliminary Setup

In [2]:
# project helper object; its set_seed and train_and_validate methods are used later in this notebook
utils = Utils()

# run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


# Exploring the Data

In [3]:
# load the training table; the path is relative to the notebook's working directory
df = pd.read_csv('./data/train.csv')

In [4]:
# preview the first five rows of the raw table
df.head(n=5)

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_491,feature_492,feature_493,feature_494,feature_495,feature_496,feature_497,feature_498,feature_499,label
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.063888,43
1,1,0.0,0.0,0.0,0.071982,0.0,0.0,0.0,0.071982,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.071982,0.0,0.0,16
2,2,0.111111,0.0,0.111111,0.0,0.0,0.111111,0.0,0.111111,0.0,...,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21
3,3,0.0,0.087039,0.0,0.0,0.0,0.0,0.0,0.0,0.087039,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,4,0.0,0.0,0.069673,0.0,0.069673,0.0,0.069673,0.0,0.0,...,0.069673,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [5]:
# preview the first five entries of the target column
df.loc[:, 'label'].head(n=5)

0    43
1    16
2    21
3     2
4     1
Name: label, dtype: int64

In [6]:
# count missing values per column and show only the columns that have at least one
null_counts = df.isna().sum()
print(null_counts.loc[null_counts.gt(0)])

Series([], dtype: int64)


In [7]:
# the target column is kept as its own Series
y_df = df['label']

# the feature table is everything except the row identifier and the target
X_df = df.drop(['id', 'label'], axis=1)

print("Number of feature columns:", len(X_df.columns))

Number of feature columns: 500


In [8]:
# number of samples per label, ordered by label value, to check class balance
class_counts = y_df.value_counts()
class_counts = class_counts.sort_index()
print(class_counts)

label
0     2309
1     2306
2     2307
3     2309
4     2309
5     2309
6     2310
7     2307
8     2304
9     2309
10    2307
11    2307
12    2310
13    2308
14    2307
15    2303
16    2305
17    2309
18    2309
19    2309
20    2310
21    2307
22    2310
23    2309
24    2309
25    2304
26    2308
27    2310
28    2309
29    2310
30    2310
31    2308
32    2310
33    2310
34    2309
35    2309
36    2307
37    2305
38    2309
39    2307
40    2310
41    2310
42    2308
43    2308
44    2306
45    2310
46    2307
47    2305
48    2310
49    2309
Name: count, dtype: int64


# Processing Data for Hyperparameter Search

In [9]:
# Convert both pandas objects through NumPy before handing them to torch.
# Passing the Series itself makes torch treat it as a generic Python sequence,
# which is slow for this many rows and depends on the Series having a default
# 0..n-1 index; going through the underlying array avoids both problems.
X_tensor = torch.tensor(X_df.to_numpy(), dtype=torch.float32)
y_tensor = torch.tensor(y_df.to_numpy(), dtype=torch.long)

print(X_tensor.shape)
print(y_tensor.shape)

torch.Size([115406, 500])
torch.Size([115406])


In [10]:
# STANDARDIZE EVERY FEATURE COLUMN TO ZERO MEAN AND UNIT VARIANCE
# NOTE(review): these statistics are computed over all rows, i.e. before the
# train/validation split performed later in the notebook — confirm this is intended.
mean = X_tensor.mean(dim=0, keepdim=True)
std = X_tensor.std(dim=0, unbiased=False, keepdim=True)

# a constant column has zero std; use 1.0 there so the division leaves it unchanged
std = std.masked_fill(std == 0, 1.0)

X_tensor_norm = X_tensor.sub(mean).div(std)

# sanity checks on the result
print(X_tensor_norm.shape)
print(X_tensor_norm.mean(dim=0))  # expected to be close to 0 per column
print(X_tensor_norm.std(dim=0))   # expected to be close to 1 per column

torch.Size([115406, 500])
tensor([-6.8555e-08, -4.5615e-08,  1.3685e-08,  2.0196e-08, -1.8378e-08,
        -7.7480e-08, -1.3883e-08, -2.7634e-08, -8.6405e-08, -8.4620e-09,
        -2.9088e-08, -1.9965e-08, -2.1915e-08,  3.9732e-08, -3.6889e-08,
         1.4544e-08,  3.8542e-08,  5.5532e-09,  3.7352e-08,  4.4128e-08,
        -1.9998e-09, -4.4822e-08,  2.4857e-08,  2.0229e-08, -1.3883e-08,
         4.0327e-09,  2.8030e-08,  3.0542e-08,  2.0890e-08, -8.6603e-09,
         2.8890e-08, -1.3288e-08, -3.3848e-08,  4.3500e-08, -1.8048e-08,
        -1.4263e-08,  1.2561e-09,  1.9833e-08, -2.1089e-08, -3.1468e-08,
         2.1287e-08, -9.4701e-09,  1.9833e-09,  1.8709e-08, -3.8277e-08,
         1.9568e-08, -3.9732e-08,  1.9172e-09, -5.5267e-08,  1.5271e-08,
        -7.8670e-09,  2.0031e-08, -6.3465e-09, -2.1485e-09, -8.1645e-09,
         2.6642e-08,  4.7202e-08,  6.0027e-08, -3.1865e-08, -1.6924e-08,
         3.4972e-08, -4.0988e-08, -4.4954e-09, -6.6407e-08,  8.7661e-08,
        -4.4095e-08, -2.3

In [11]:
# quick look at the label tensor (long tensors are printed with the middle elided)
print(y_tensor)

tensor([43, 16, 21,  ..., 32, 46, 33])


In [12]:
# 80/20 split sizes; the validation part takes whatever the integer rounding leaves over
n_samples = X_tensor.shape[0]
n_train = int(n_samples * 0.8)
n_val = n_samples - n_train

# pair the normalized features with their labels
dataset = TensorDataset(X_tensor_norm, y_tensor)

# seed through the project helper, then split with a dedicated, explicitly seeded generator
utils.set_seed(433)
g = torch.Generator()
g.manual_seed(433)
train_set, val_set = random_split(dataset, lengths=[n_train, n_val], generator=g)

print(f"Train samples: {len(train_set)}")
print(f"Validation samples: {len(val_set)}")

Train samples: 92324
Validation samples: 23082


# Performing Random Search for Hyperparameters

In [13]:
def random_search(train_set, val_set, device, search_space, num_trials=20, grad_clip=True, gauss=True, log=True, patience=5, seed=None):
    """
    Performs a random hyperparameter search over the grid spanned by a search space.

    Every combination of the candidate values is enumerated, up to ``num_trials``
    distinct combinations are drawn at random, and each one is handed to
    ``utils.train_and_validate``. Progress is printed per trial and the collected
    results are returned so the best configuration can be inspected afterwards.

    Args:
        train_set: training subset from training dataset
        val_set: validation subset from training dataset
        device: device to run training on ("cpu" or "cuda")
        search_space: dictionary mapping each hyperparameter name to a list of candidate values
        num_trials: maximum number of trials; capped at the number of distinct combinations
        grad_clip: whether to apply gradient clipping during training (forwarded to the trainer)
        gauss: whether to apply Gaussian noise (forwarded to the trainer)
        log: whether to log training and validation metrics during training (forwarded to the trainer)
        patience: number of epochs to wait for validation improvement before early stopping
            (forwarded to the trainer)
        seed: optional integer seed for the sampling of combinations; when None (default)
            the OS entropy source is used and the selection differs between runs

    Returns:
        A list with one dict per completed trial, in execution order, each holding
        'trial' (1-based index), 'params' (the sampled combination) and 'best_val_acc'.

    Raises:
        ValueError: if ``search_space`` is empty.
    """
    if not search_space:
        raise ValueError("search_space must contain at least one hyperparameter")

    # find all hyperparameter combinations
    keys = list(search_space)
    all_combos = [
        dict(zip(keys, combo))
        for combo in itertools.product(*(search_space[key] for key in keys))
    ]

    # never ask for more trials than there are distinct combinations
    total = min(max(num_trials, 0), len(all_combos))
    if total < num_trials:
        print(f"Requested {num_trials} trials but only {len(all_combos)} unique combinations exist; running {total}.")

    # random hyperparameter sampling without replacement; seedable for reproducible searches
    rng = random.SystemRandom() if seed is None else random.Random(seed)
    configs = rng.sample(all_combos, total)

    results = []
    for trial, params in enumerate(configs, start=1):
        print(f"\n=== Trial {trial}/{total} ===")
        print(params)

        # only the best validation accuracy is needed here; the other outputs are dropped
        _model, _train_accs, _val_accs, _losses, best_val_acc = utils.train_and_validate(
            train_set, val_set, device, params, grad_clip=grad_clip, gauss=gauss, log=log, patience=patience
        )

        print(f"Trial {trial} finished with Best AVG Val Acc = {best_val_acc:.2f}%")
        results.append({'trial': trial, 'params': params, 'best_val_acc': best_val_acc})

    # summarize the winner so it does not have to be dug out of the per-trial output
    if results:
        best = max(results, key=lambda entry: entry['best_val_acc'])
        print(f"\nBest trial: {best['trial']} with Best AVG Val Acc = {best['best_val_acc']:.2f}%")
        print(best['params'])

    return results

In [14]:
# Candidate values per hyperparameter. Entries with one value are held fixed;
# the varied ones span 5 * 3 * 3 * 5 * 3 = 675 combinations in total.
search_space = dict(
    hidden_size=[4096],                             # fixed (only high values kept)
    lr=[0.005],                                     # fixed
    weight_decay=[0.001, 0.005, 0.01, 0.05, 0.1],   # wide exploratory range
    batch_size=[384, 512, 768],                     # three candidate sizes
    init_type=['xavier'],                           # fixed
    dropout=[0.45, 0.5, 0.55],                      # centered around sweet spot
    noise_std=[0.50, 0.55, 0.60, 0.65, 0.70],       # centered around sweet spot
    num_epochs=[125],                               # fixed
    warmup_epochs=[8, 12, 16],                      # interacts strongly w/ noise
)


# sample 180 of the combinations; per-epoch logging is off, early-stopping patience is 10
random_search(
    train_set,
    val_set,
    device,
    search_space,
    num_trials=180,
    grad_clip=True,
    gauss=True,
    log=False,
    patience=10,
)


=== Trial 1/180 ===
{'hidden_size': 4096, 'lr': 0.005, 'weight_decay': 0.01, 'batch_size': 768, 'init_type': 'xavier', 'dropout': 0.5, 'noise_std': 0.6, 'num_epochs': 125, 'warmup_epochs': 16}


  0%|          | 0/125 [00:00<?, ?it/s]

Trial 1 finished with Best AVG Val Acc = 82.19%

=== Trial 2/180 ===
{'hidden_size': 4096, 'lr': 0.005, 'weight_decay': 0.005, 'batch_size': 768, 'init_type': 'xavier', 'dropout': 0.55, 'noise_std': 0.6, 'num_epochs': 125, 'warmup_epochs': 12}


  0%|          | 0/125 [00:00<?, ?it/s]

Early stopping at epoch 122 (no improvement for 10 epochs).
Trial 2 finished with Best AVG Val Acc = 81.94%

=== Trial 3/180 ===
{'hidden_size': 4096, 'lr': 0.005, 'weight_decay': 0.1, 'batch_size': 512, 'init_type': 'xavier', 'dropout': 0.45, 'noise_std': 0.6, 'num_epochs': 125, 'warmup_epochs': 8}


  0%|          | 0/125 [00:00<?, ?it/s]

KeyboardInterrupt: 