In [10]:
!pip install numpy pandas matplotlib tensorflow keras 



Base Neural Net

In [11]:
# Weight initializers, looked up by name.
# Every initializer is called as func(n_inputs, n_neurons) and returns an
# np.array of shape (n_inputs, n_neurons).
def _he_init(n_inputs, n_neurons):
    """Standard-normal samples scaled by sqrt(2 / n_inputs)."""
    scale = np.sqrt(2. / n_inputs)
    return np.random.randn(n_inputs, n_neurons) * scale


def _uniform_init(n_inputs, n_neurons):
    """Uniform samples in [-1/sqrt(n_inputs), 1/sqrt(n_inputs))."""
    return np.random.uniform(-1 / np.sqrt(n_inputs), 1 / np.sqrt(n_inputs),
                             (n_inputs, n_neurons))


def _small_normal_init(n_inputs, n_neurons):
    """Standard-normal samples scaled by 0.01."""
    return np.random.randn(n_inputs, n_neurons) * 0.01


INIT_STRATEGIES = {
    "he": _he_init,
    "uniform": _uniform_init,
    "normal": _small_normal_init,
}

In [12]:
import numpy as np
# --- 1. Linear Layer---
class LinearLayer:
    """Affine layer (x . W + b) that updates itself with momentum SGD during backward()."""

    def __init__(self, n_inputs, n_neurons, init_fn):
        """
        Create the layer parameters.

        n_inputs:  number of input features (rows of W).
        n_neurons: number of output units (columns of W, width of b).
        init_fn:   callable taking (n_inputs, n_neurons) and returning
                   the initial weight matrix.
        """
        # Trainable parameters: weights from the initializer, zero bias row.
        self.W = init_fn(n_inputs, n_neurons)
        self.b = np.zeros((1, n_neurons))

        # Momentum velocities, one per parameter, starting at zero.
        self.v_W, self.v_b = np.zeros_like(self.W), np.zeros_like(self.b)

        # Filled in by forward()/backward(): last input and last gradients.
        self.x = self.dW = self.db = None

    def forward(self, x):
        """Remember the input for backward() and return x . W + b."""
        self.x = x
        projected = np.dot(x, self.W)
        return projected + self.b

    def backward(self, d_out, alpha, gamma):
        """
        Backpropagate d_out and apply one momentum update.

        d_out: gradient of the loss w.r.t. this layer's output.
        alpha: learning rate.
        gamma: momentum coefficient.
        Returns the gradient w.r.t. the layer input.
        """
        # Parameter gradients from the cached input.
        self.dW = np.dot(self.x.T, d_out)
        self.db = np.sum(d_out, axis=0, keepdims=True)

        # Input gradient must use the weights from *before* the update below.
        grad_input = np.dot(d_out, self.W.T)

        # v <- gamma * v + alpha * grad ; param <- param - v
        self.v_W = (gamma * self.v_W) + (alpha * self.dW)
        self.W -= self.v_W
        self.v_b = (gamma * self.v_b) + (alpha * self.db)
        self.b -= self.v_b

        return grad_input


# --- 2. ReLU Activation ---
class ReLU:
    """Element-wise rectifier: max(0, x) going forward, gated gradient going backward."""

    def forward(self, x):
        """Cache the input (needed to gate the gradient) and return max(0, x)."""
        self.x = x
        return np.maximum(0, x)

    def backward(self, d_out,alpha,gamma):
        """
        Return a copy of d_out with entries zeroed wherever the cached input was <= 0.

        alpha and gamma are accepted only so every layer shares one backward()
        signature; they are not used here.
        """
        blocked = self.x <= 0
        grad_input = d_out.copy()  # leave the caller's array untouched
        grad_input[blocked] = 0
        return grad_input


# --- 3. The Neural Network Manager ---
class NeuralNetwork:
    """
    Fully connected ReLU classifier built from LinearLayer / ReLU blocks.

    Architecture:
        [Linear(input, hidden) -> ReLU]
        + n_hidden_blocks x [Linear(hidden, hidden) -> ReLU]
        + Linear(hidden, output)

    The final layer outputs raw logits; softmax is applied separately in
    train_step() and evaluate().
    """

    def __init__(self, input_size, hidden_size, output_size, init_strategy="he",
                 n_hidden_blocks=1):
        """
        input_size:      number of input features.
        hidden_size:     width of every hidden layer.
        output_size:     number of classes (logits produced by the last layer).
        init_strategy:   key into INIT_STRATEGIES selecting the weight initializer.
        n_hidden_blocks: number of extra hidden-to-hidden blocks after the first
                         one. The default (1) gives two hidden layers in total.

        Raises ValueError for an unknown init_strategy or a negative
        n_hidden_blocks.
        """
        # 1. Select the weight initializer
        if init_strategy not in INIT_STRATEGIES:
            raise ValueError(f"Strategy must be one of {list(INIT_STRATEGIES.keys())}")
        if n_hidden_blocks < 0:
            raise ValueError("n_hidden_blocks must be >= 0")

        weight_fn = INIT_STRATEGIES[init_strategy]

        self.layers = []

        # Helper: one block = Linear followed by ReLU
        def add_block(n_in, n_out):
            self.layers.append(LinearLayer(n_in, n_out, weight_fn))
            self.layers.append(ReLU())

        # --- Build Architecture ---

        # Input to Hidden 1
        add_block(input_size, hidden_size)

        # Hidden to Hidden (n_hidden_blocks blocks)
        for _ in range(n_hidden_blocks):
            add_block(hidden_size, hidden_size)

        # Output layer: no activation, produces logits
        self.layers.append(LinearLayer(hidden_size, output_size, weight_fn))

    def forward(self, x):
        """Run x through every layer in order and return the logits."""
        out = x
        # Every layer takes part, mirroring the backward loop in train_step().
        for layer in self.layers:
            out = layer.forward(out)
        return out

    def softmax(self, z):
        """Row-wise softmax of z (one row per sample)."""
        # Subtract the row max for numerical stability (prevents exploding exponentials)
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def train_step(self, X_batch, y_batch, alpha, gamma):
        """
        One forward/backward pass with in-place parameter updates.

        X_batch: inputs, one row per sample.
        y_batch: targets with the same shape as the softmax output
                 (one-hot rows, as evaluate() reads them).
        alpha:   learning rate.
        gamma:   momentum coefficient.
        Returns the predicted probabilities computed before the update.
        """
        # 1. Forward pass
        logits = self.forward(X_batch)
        probs = self.softmax(logits)

        # 2. Backward pass
        batch_size = X_batch.shape[0]

        # Gradient of mean softmax cross-entropy w.r.t. the logits
        d_out = (probs - y_batch) / batch_size

        # Propagate the error backwards; each layer updates itself
        for layer in reversed(self.layers):
            d_out = layer.backward(d_out, alpha, gamma)

        return probs

    def evaluate(self, X, y):
        """Return classification accuracy on (X, y); y is read with argmax over axis 1."""
        logits = self.forward(X)
        probs = self.softmax(logits)
        preds = np.argmax(probs, axis=1)
        truth = np.argmax(y, axis=1)
        return np.mean(preds == truth)

Running training on the Z normalized dataset (fully supervised)

In [None]:
from DataLoader import DataLoader
from Overfitting import OverfitDetector
from Results import Results
import numpy as np
import pandas as pd

# ---------------------------------------------------------
# A. Load and Split Data (supervised, 80% of dataset)
# ---------------------------------------------------------
def _loss_and_accuracy(net, X, y, eps=1e-12):
    """Mean cross-entropy and accuracy of `net` on (X, y) from a single forward pass.

    y is read as one-hot rows, the same way NeuralNetwork.evaluate reads it.
    eps keeps log() finite when a predicted probability underflows to 0.
    """
    probs = net.softmax(net.forward(X))
    loss = -np.mean(np.sum(y * np.log(probs + eps), axis=1))
    acc = np.mean(np.argmax(probs, axis=1) == np.argmax(y, axis=1))
    return float(loss), acc


loader = DataLoader()

print("\n1. Getting Z-Score Normalized Data...")
df_zscore = loader.get_z_score_df()

print("\n2. Splitting")
data_dict = loader.get_supervised_split(df_zscore, dataset_pct=0.8)

original_train = data_dict["train"]
print(f"   Original Labeled Size: {original_train.shape[0]}")

# ---------------------------------------------------------
# B. Augmentation
# ---------------------------------------------------------
print("\n3. Augmenting Labeled Data (Horizontal Flip)...")
flipped_train = loader.augment_horizontal_flip(original_train)
final_train_df = pd.concat([original_train, flipped_train], axis=0).reset_index(drop=True)
print(f"   Final Training Size (Original + Flipped): {final_train_df.shape[0]}")

# ---------------------------------------------------------
# C. Convert DataFrames to NumPy Arrays
# ---------------------------------------------------------
X_train, y_train = loader.to_numpy(final_train_df)
X_val, y_val     = loader.to_numpy(data_dict["validation"])
X_test, y_test   = loader.to_numpy(data_dict["test"])

# ---------------------------------------------------------
# D. Training Loop (manual numpy NeuralNetwork from above)
# ---------------------------------------------------------
SEED = 42
BATCH_SIZE = 64
EPOCHS = 200
ALPHA = 0.01
GAMMA = 0.04

# Seed NumPy before weight init and shuffling so the seed recorded in the
# results log is the seed that actually produced this run.
np.random.seed(SEED)

print("\n4. Initializing Network...")
nn = NeuralNetwork(input_size=784, hidden_size=1028, output_size=10, init_strategy="he")

results = Results("results.csv")
results.begin_run(seed=SEED, weight_init="he", normalization="z_score",
                  augmentations="horizontal_flip")

overfit_detector = OverfitDetector()
converged_epoch = EPOCHS

print(f"   Starting Training (Batch={BATCH_SIZE}, Alpha={ALPHA}, Gamma={GAMMA})...")
n_train = X_train.shape[0]
# Ceil division: a trailing partial batch is trained on instead of being dropped.
num_batches = (n_train + BATCH_SIZE - 1) // BATCH_SIZE

for epoch in range(EPOCHS):
    perm = np.random.permutation(n_train)
    X_shuffled = X_train[perm]
    y_shuffled = y_train[perm]

    for b in range(num_batches):
        start = b * BATCH_SIZE
        end = start + BATCH_SIZE  # slicing past the end just yields a shorter batch
        nn.train_step(X_shuffled[start:end], y_shuffled[start:end], ALPHA, GAMMA)

    # Validation (real cross-entropy losses are logged instead of a 0.0 placeholder)
    val_loss, val_acc = _loss_and_accuracy(nn, X_val, y_val)
    train_loss, train_acc = _loss_and_accuracy(nn, X_train[:5000], y_train[:5000])  # sample for speed
    val_error = 1.0 - val_acc

    results.log_epoch(epoch + 1, train_loss=train_loss, train_acc=train_acc,
                      val_loss=val_loss, val_acc=val_acc)

    is_overfitting, mean_ev, std_ev = overfit_detector.check(val_error)

    if (epoch + 1) % 10 == 0:
        status = " ** OVERFIT **" if is_overfitting else ""
        print(f"   Epoch {epoch+1:03d} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}{status}")

    if is_overfitting:
        converged_epoch = epoch + 1
        print(f"   >>> Overfitting at epoch {converged_epoch}.")
        break

# ---------------------------------------------------------
# E. Final Test
# ---------------------------------------------------------
test_acc = nn.evaluate(X_test, y_test)
results.end_run(test_acc=test_acc, converged_epoch=converged_epoch)

print(f"\nFinal Test Accuracy (Numpy Vanilla, Z-Score, He init): {test_acc:.4f}")

Visualization

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Fashion MNIST label names, indexed by class id (0-9)
class_names = [
    'T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
    'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot',
]

# Draw 5 distinct test-set positions at random.
# X_test / y_test come from the earlier Z-score data preparation cell.
indices = np.random.choice(len(X_test), 5, replace=False)

fig, axes = plt.subplots(1, 5, figsize=(15, 3))

print("--- Visualizing Predictions on Z-Score Normalized Data ---")

for ax, idx in zip(axes, indices):
    img = X_test[idx]  # one flattened image

    # The network expects a batch, so feed the image as a single-row batch
    # and turn the logits into probabilities.
    probs = nn.softmax(nn.forward(img.reshape(1, -1)))
    prediction = np.argmax(probs)

    # y_test rows are one-hot, so argmax recovers the class id
    true_label = np.argmax(y_test[idx])

    # Show the image in its 28x28 layout
    ax.imshow(img.reshape(28, 28), cmap='gray')

    # Green title for a correct prediction, red for a wrong one
    if prediction == true_label:
        title_color = 'green'
    else:
        title_color = 'red'

    ax.set_title(f"Pred: {class_names[prediction]}\nTrue: {class_names[true_label]}", color=title_color)
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
#Still must integrate momentum into the NN, and do the other weight initializations 

Next, I will integrate the validation metrics (such as the overfitting threshold) and implement the stopping condition, then eventually add hyperparameter tuning such as weight decay or learning-rate decay.

## Overfitting Detection

**Criterion:** $E_V^{(t)} > \bar{E}_V + \sigma_{E_V}$

At epoch $t$, if the current validation error exceeds the running mean plus one standard deviation, the model is considered to be overfitting.

| Symbol | Meaning |
|--------|---------|
| $E_V^{(t)}$ | Validation error at epoch $t$ |
| $\bar{E}_V = \frac{1}{t} \sum_{i=1}^{t} E_V^{(i)}$ | Running mean of all validation errors so far |
| $\sigma_{E_V} = \sqrt{\frac{1}{t} \sum_{i=1}^{t} (E_V^{(i)} - \bar{E}_V)^2}$ | Running standard deviation |

**Intuition:** The validation error naturally fluctuates. If it spikes above the historical mean by more than one standard deviation, the model has likely begun memorizing the training data rather than learning generalizable patterns.


**Mods:** Consider terminating on a scalar multiple of the running standard deviation instead, e.g. $2\sigma_{E_V}$.

In [5]:
import numpy as np

class OverfitDetector:
    """
    Detects overfitting using:  E_V(t) > mean(E_V) + n_std * std(E_V)

    Tracks all validation errors seen so far (including the current one)
    and checks whether the current error exceeds the running mean by more
    than n_std population standard deviations. With the default n_std=1.0
    this is the plain  E_V(t) > mean(E_V) + std(E_V)  criterion.
    """
    def __init__(self, n_std=1.0):
        """
        Args:
            n_std: non-negative multiplier on the running std used in the
                   threshold (e.g. 2.0 for a 2-sigma rule).
        Raises:
            ValueError: if n_std is negative.
        """
        if n_std < 0:
            raise ValueError("n_std must be non-negative")
        self.n_std = n_std
        self.val_errors = []

    def check(self, val_error):
        """
        Record val_error and test it against the running threshold.

        Args:
            val_error: validation error (1 - val_accuracy) at current epoch
        Returns:
            is_overfitting (bool), mean_ev, std_ev
            mean_ev / std_ev are the running mean and (unscaled) std of all
            recorded errors; on the very first call they are val_error and 0.0.
        """
        self.val_errors.append(val_error)

        # Need at least 2 points to compute a meaningful std
        if len(self.val_errors) < 2:
            return False, val_error, 0.0

        # E_V_bar = (1/t) * sum(E_V)
        mean_ev = np.mean(self.val_errors)

        # sigma_EV = sqrt( (1/t) * sum( (E_V - E_V_bar)^2 ) )
        std_ev = np.std(self.val_errors)

        threshold = mean_ev + self.n_std * std_ev
        # Cast so callers get a plain bool rather than a numpy bool
        is_overfitting = bool(val_error > threshold)

        return is_overfitting, mean_ev, std_ev

# --- Quick sanity test ---
# Feed the detector a made-up error trace and print one table row per epoch.
detector = OverfitDetector()

# Error falls steadily, then jumps back up on the last epoch
fake_errors = [0.30, 0.25, 0.20, 0.18, 0.16, 0.15, 0.14, 0.13, 0.13, 0.30]

print(
    "Epoch | E_V   | E_V_bar | sigma  | Threshold | Overfit?",
    "------+-------+---------+--------+-----------+---------",
    sep="\n",
)
for i, ev in enumerate(fake_errors):
    overfit, mean_ev, std_ev = detector.check(ev)
    # Same threshold the detector compares against
    threshold = mean_ev + std_ev
    row = f"  {i+1:02d}  | {ev:.3f} |  {mean_ev:.3f}  | {std_ev:.3f} |   {threshold:.3f}   |  {overfit}"
    print(row)

Epoch | E_V   | E_V_bar | sigma  | Threshold | Overfit?
------+-------+---------+--------+-----------+---------
  01  | 0.300 |  0.300  | 0.000 |   0.300   |  False
  02  | 0.250 |  0.275  | 0.025 |   0.300   |  False
  03  | 0.200 |  0.250  | 0.041 |   0.291   |  False
  04  | 0.180 |  0.232  | 0.047 |   0.279   |  False
  05  | 0.160 |  0.218  | 0.051 |   0.269   |  False
  06  | 0.150 |  0.207  | 0.053 |   0.259   |  False
  07  | 0.140 |  0.197  | 0.054 |   0.251   |  False
  08  | 0.130 |  0.189  | 0.055 |   0.244   |  False
  09  | 0.130 |  0.182  | 0.055 |   0.238   |  False
  10  | 0.300 |  0.194  | 0.063 |   0.257   |  True


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader as TorchDataLoader
import numpy as np
import pandas as pd
from DataLoader import DataLoader

# ---------------------------------------------------------
# Weight Initialization Strategies 
# ---------------------------------------------------------
def _he_normal_(w):
    """Kaiming-normal init (fan_in mode, ReLU gain), applied to w in place."""
    return nn.init.kaiming_normal_(w, mode='fan_in', nonlinearity='relu')


def _uniform_fan_in_(w):
    """Uniform init in [-1/sqrt(w.shape[1]), 1/sqrt(w.shape[1])], applied to w in place."""
    return nn.init.uniform_(w, -1 / np.sqrt(w.shape[1]), 1 / np.sqrt(w.shape[1]))


def _small_normal_(w):
    """Normal init with mean 0 and std 0.01, applied to w in place."""
    return nn.init.normal_(w, mean=0.0, std=0.01)


# Name -> in-place initializer taking the weight tensor as its only argument
TORCH_INIT_STRATEGIES = {
    "he": _he_normal_,
    "uniform": _uniform_fan_in_,
    "normal": _small_normal_,
}

# ---------------------------------------------------------
# A. Custom Dataset Class
# ---------------------------------------------------------
class dataset_to_torch(Dataset):
    """Torch Dataset over a feature array X and a one-hot label array y."""

    def __init__(self, X, y):
        """Store X as a float tensor and y as a tensor of class indices."""
        self.X = torch.FloatTensor(X)
        # CrossEntropyLoss takes class indices, so collapse each one-hot row
        class_indices = np.argmax(y, axis=1)
        self.y = torch.LongTensor(class_indices)

    def __len__(self):
        """Number of samples (length of the feature tensor)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the (features, class index) pair stored at idx."""
        features = self.X[idx]
        label = self.y[idx]
        return features, label

# ---------------------------------------------------------
# B. Neural Network Definition (matches manual numpy version)
#    Architecture: Linear -> ReLU -> Linear -> ReLU -> Linear
# ---------------------------------------------------------
class NeuralNetwork(nn.Module):
    """
    Torch counterpart of the manual numpy network:
    Linear -> ReLU -> Linear -> ReLU -> Linear (raw logits out).
    """

    def __init__(self, input_size=784, hidden_size=1028, output_size=10, init_strategy="he"):
        """
        input_size / hidden_size / output_size: layer widths.
        init_strategy: key into TORCH_INIT_STRATEGIES; an unknown key raises ValueError.
        """
        super().__init__()

        weight_fn = TORCH_INIT_STRATEGIES.get(init_strategy)
        if weight_fn is None:
            raise ValueError(f"Strategy must be one of {list(TORCH_INIT_STRATEGIES.keys())}")

        # Two hidden layers plus the output layer, like the manual version
        layers = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
        ]
        self.network = nn.Sequential(*layers)

        # Re-initialize every Linear layer: chosen strategy for weights, zeros for biases
        for layer in layers:
            if not isinstance(layer, nn.Linear):
                continue
            weight_fn(layer.weight)
            nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        """Return the logits for a batch x."""
        return self.network(x)

# ---------------------------------------------------------
# C. Load and Split Data
# ---------------------------------------------------------
loader = DataLoader()

print("\n1. Getting Z-Score Normalized Data...")
df_zscore = loader.get_z_score_df()

print("\n2. Splitting...")
data_dict = loader.get_supervised_split(df_zscore, dataset_pct=0.8)

original_train = data_dict["train"]
print(f"   Original Labeled Size: {original_train.shape[0]}")

# ---------------------------------------------------------
# D. Augmentation: append a horizontally flipped copy of the training frame
# ---------------------------------------------------------
print("\n3. Augmenting Labeled Data (Horizontal Flip)...")
flipped_train = loader.augment_horizontal_flip(original_train)

stacked_train = pd.concat([original_train, flipped_train], axis=0)
final_train_df = stacked_train.reset_index(drop=True)
print(f"   Final Training Size (Original + Flipped): {final_train_df.shape[0]}")

# ---------------------------------------------------------
# E. Convert to NumPy and Create PyTorch Datasets
# ---------------------------------------------------------
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    loader.to_numpy(frame)
    for frame in (final_train_df, data_dict["validation"], data_dict["test"])
)

train_dataset, val_dataset, test_dataset = (
    dataset_to_torch(features, labels)
    for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# ---------------------------------------------------------
# F. Create Data Loaders
# ---------------------------------------------------------
BATCH_SIZE = 64
import platform
# No worker processes on Windows, two everywhere else
NUM_WORKERS = 0 if platform.system() == 'Windows' else 2

# Settings shared by all three loaders; only the training loader shuffles
_loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=torch.cuda.is_available(),
)
train_loader = TorchDataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_loader = TorchDataLoader(val_dataset, shuffle=False, **_loader_kwargs)
test_loader = TorchDataLoader(test_dataset, shuffle=False, **_loader_kwargs)

# ---------------------------------------------------------
# G. Training Setup (matching manual hyperparams exactly)
# ---------------------------------------------------------
print("\n4. Initializing Network...")

# Prefer the GPU when one is visible, otherwise run on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"   Using device: {device}")

model = NeuralNetwork(
    input_size=784,
    hidden_size=1028,
    output_size=10,
    init_strategy="he",
).to(device)

# Hyper-parameters (same values as the manual numpy run)
EPOCHS = 200
ALPHA = 0.01   # Learning Rate
GAMMA = 0.04   # Momentum

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=ALPHA, momentum=GAMMA)

# Stops training once E_V > E_V_bar + sigma_EV
overfit_detector = OverfitDetector()

print(f"   Starting Training (Batch={BATCH_SIZE}, Alpha={ALPHA}, Gamma={GAMMA})...")

# ---------------------------------------------------------
# H. Training Loop
# ---------------------------------------------------------
for epoch in range(EPOCHS):
    model.train()

    # One optimisation pass over the training set
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

    # Only every 10th epoch is validated and fed to the overfit detector
    if (epoch + 1) % 10 != 0:
        continue

    model.eval()
    val_correct, val_total = 0, 0

    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            outputs = model(batch_X)
            predicted = torch.max(outputs, 1)[1]  # index of the largest logit per row
            val_correct += (predicted == batch_y).sum().item()
            val_total += batch_y.size(0)

    val_acc = val_correct / val_total
    val_error = 1.0 - val_acc

    # Overfitting criterion: E_V > E_V_bar + sigma_EV
    is_overfitting, mean_ev, std_ev = overfit_detector.check(val_error)
    threshold = mean_ev + std_ev

    status = "** OVERFITTING **" if is_overfitting else ""
    print(f"   Epoch {epoch + 1:03d} | Val Acc: {val_acc:.4f} | "
          f"E_V: {val_error:.4f} | E_V_bar: {mean_ev:.4f} | "
          f"sigma: {std_ev:.4f} | Thresh: {threshold:.4f} {status}")

    if is_overfitting:
        print(f"   Stopping: E_V({val_error:.4f}) > E_V_bar + sigma({threshold:.4f})")
        break

# ---------------------------------------------------------
# I. Final Test
# ---------------------------------------------------------
# Accuracy on the held-out test loader, with gradients disabled
model.eval()
test_correct, test_total = 0, 0

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        outputs = model(batch_X)
        predicted = torch.max(outputs, 1)[1]  # index of the largest logit per row
        test_correct += (predicted == batch_y).sum().item()
        test_total += batch_y.size(0)

test_acc = test_correct / test_total
print(f"\nFinal Test Accuracy: {test_acc:.4f}")

Data Loaded. Shape: (70000, 785)

1. Getting Z-Score Normalized Data...

2. Splitting...
   Original Labeled Size: 44800

3. Augmenting Labeled Data (Horizontal Flip)...
   Final Training Size (Original + Flipped): 89600

4. Initializing Network...
   Using device: cuda
   Starting Training (Batch=64, Alpha=0.01, Gamma=0.04)...
   Epoch 010 | Val Acc: 0.8920 | E_V: 0.1080 | E_V_bar: 0.1080 | sigma: 0.0000 | Thresh: 0.1080 
   Epoch 020 | Val Acc: 0.8945 | E_V: 0.1055 | E_V_bar: 0.1068 | sigma: 0.0012 | Thresh: 0.1080 
   Epoch 030 | Val Acc: 0.8936 | E_V: 0.1064 | E_V_bar: 0.1067 | sigma: 0.0010 | Thresh: 0.1077 
   Epoch 040 | Val Acc: 0.8962 | E_V: 0.1038 | E_V_bar: 0.1059 | sigma: 0.0015 | Thresh: 0.1075 
   Epoch 050 | Val Acc: 0.8950 | E_V: 0.1050 | E_V_bar: 0.1057 | sigma: 0.0014 | Thresh: 0.1072 
   Epoch 060 | Val Acc: 0.8955 | E_V: 0.1045 | E_V_bar: 0.1055 | sigma: 0.0014 | Thresh: 0.1069 
   Epoch 070 | Val Acc: 0.8954 | E_V: 0.1046 | E_V_bar: 0.1054 | sigma: 0.0013 | Thresh:

## Stage 2e: Baseline Training (10% Labeled Only)

Per the assignment:
- **Data split:** 10% test, 10% validation, 80% training pool (from all 70k)
- **Semi-supervised setup:** From training pool, 20% labeled / 80% unlabeled
- **Baseline:** Train using **only** the labeled portion (~11,200 samples)
- **Seeds:** {1, 123, 12345} — run 3 times, record mean ± std
- **Record:** Training/validation loss & accuracy curves, final test accuracy, epochs to convergence

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from DataLoader import DataLoader
from VanillaModel import VanillaModel
from Overfitting import OverfitDetector
from Results import Results

# ---------------------------------------------------------
# Seed-setting utility
# ---------------------------------------------------------
def set_seed(seed):
    """
    Seed every random number generator this notebook can draw from.

    Seeds Python's `random`, NumPy's global generator and torch (CPU and
    all CUDA devices), then switches cuDNN to deterministic mode with
    auto-tuning (benchmark) disabled.

    Args:
        seed: integer seed applied to every generator.
    """
    # Local import: `random` is not among the notebook's top-level imports.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN speed for run-to-run repeatability
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# ---------------------------------------------------------
# Results pipeline
# ---------------------------------------------------------
# Module-level Results object backed by "results.csv"; run_baseline() below
# logs through this name and the summary/plot code reads from it.
results = Results("results.csv")

# ---------------------------------------------------------
# Single training run
# ---------------------------------------------------------
def _evaluate(model, data_loader, criterion, device):
    """Return (mean per-sample loss, accuracy) of `model` over `data_loader`, with gradients off."""
    model.eval()
    loss_sum = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_X, batch_y in data_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            outputs = model(batch_X)
            n = batch_y.size(0)
            # criterion returns the batch mean; weight by batch size so a
            # smaller final batch does not skew the epoch average.
            loss_sum += criterion(outputs, batch_y).item() * n
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == batch_y).sum().item()
            total += n
    return loss_sum / total, correct / total


def run_baseline(seed, init_strategy="he", epochs=200, alpha=0.01, gamma=0.04,
                 batch_size=64):
    """
    Train and test one baseline model on the labeled split only.

    Logs every epoch and the final result through the module-level
    `results` object, and stops early as soon as OverfitDetector flags
    the validation error.

    Args:
        seed:          seed for set_seed() and for the data split.
        init_strategy: weight-initialization name passed to VanillaModel.
        epochs:        maximum number of training epochs.
        alpha:         SGD learning rate.
        gamma:         SGD momentum.
        batch_size:    batch size passed to loader.to_torch_loaders().

    Returns:
        dict with keys "seed", "test_acc" and "converged_epoch"
        (converged_epoch equals `epochs` when training never stopped early).
    """
    set_seed(seed)

    results.begin_run(
        seed=seed,
        weight_init=init_strategy,
        normalization="z_score",
        augmentations="horizontal_flip"
    )

    # A. Load, normalize, split
    loader = DataLoader()
    df_zscore = loader.get_z_score_df()
    data_dict = loader.get_semi_supervised_split(
        df_zscore, test_size=0.10, val_size=0.10, labeled_ratio=0.20, seed=seed
    )

    labeled_train = data_dict["labeled_train"]
    print(f"   Seed {seed} | Labeled train: {len(labeled_train)} | "
          f"Val: {len(data_dict['validation'])} | Test: {len(data_dict['test'])}")

    # B. Augmentation
    flipped = loader.augment_horizontal_flip(labeled_train)
    final_train = pd.concat([labeled_train, flipped], axis=0).reset_index(drop=True)

    # C. Build torch loaders via DataLoader helper
    loaders = loader.to_torch_loaders({
        "train": final_train,
        "validation": data_dict["validation"],
        "test": data_dict["test"]
    }, batch_size=batch_size)
    train_loader = loaders["train"]
    val_loader = loaders["validation"]
    test_loader = loaders["test"]

    # D. Model setup
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Re-seed so the weight init does not depend on how much randomness
    # the data preparation above consumed.
    set_seed(seed)
    model = VanillaModel(init_strategy=init_strategy).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=alpha, momentum=gamma)

    # E. Training loop
    overfit_detector = OverfitDetector()
    converged_epoch = epochs

    for epoch in range(epochs):
        model.train()
        train_loss_sum = 0.0
        train_correct = 0
        train_total = 0

        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            n = batch_y.size(0)
            # Weight the batch-mean loss by batch size (see _evaluate)
            train_loss_sum += loss.item() * n
            _, predicted = torch.max(outputs, 1)
            train_total += n
            train_correct += (predicted == batch_y).sum().item()

        epoch_train_loss = train_loss_sum / train_total
        epoch_train_acc = train_correct / train_total

        # --- Validate ---
        epoch_val_loss, epoch_val_acc = _evaluate(model, val_loader, criterion, device)

        results.log_epoch(epoch + 1, epoch_train_loss, epoch_train_acc,
                          epoch_val_loss, epoch_val_acc)

        val_error = 1.0 - epoch_val_acc
        is_overfitting, mean_ev, std_ev = overfit_detector.check(val_error)

        if (epoch + 1) % 10 == 0:
            status = " ** OVERFIT **" if is_overfitting else ""
            print(f"   Epoch {epoch+1:03d} | Train Loss: {epoch_train_loss:.4f} | "
                  f"Train Acc: {epoch_train_acc:.4f} | Val Acc: {epoch_val_acc:.4f}{status}")

        if is_overfitting:
            converged_epoch = epoch + 1
            print(f"   >>> Overfitting at epoch {converged_epoch}. "
                  f"E_V={val_error:.4f} > threshold={mean_ev + std_ev:.4f}")
            break

    # F. Test
    _, test_acc = _evaluate(model, test_loader, criterion, device)
    results.end_run(test_acc=test_acc, converged_epoch=converged_epoch)

    return {"seed": seed, "test_acc": test_acc, "converged_epoch": converged_epoch}

# ---------------------------------------------------------
# Run 3 seeds
# ---------------------------------------------------------
SEEDS = [1, 123, 12345]
all_results = []
separator = f"{'='*60}"  # banner line reused around every heading

# One baseline run per seed; each run returns its own summary dict
for seed in SEEDS:
    print(separator, f"Running baseline with seed={seed}", separator, sep="\n")
    result = run_baseline(seed, init_strategy="he")
    all_results.append(result)

# ---------------------------------------------------------
# Summary
# ---------------------------------------------------------
print("\n" + separator)
print(f"BASELINE RESULTS (He init, 10% labeled only)")
print(separator)
results.summary()

# ---------------------------------------------------------
# Plot from CSV
# ---------------------------------------------------------
# One column per recorded run: loss curves on the top row, accuracy on the bottom.
# The grid is sized from all_results so changing the seed list does not break
# the figure. squeeze=False keeps `axes` two-dimensional even for a single run.
n_runs = len(all_results)
fig, axes = plt.subplots(2, n_runs, figsize=(6 * n_runs, 10), squeeze=False)

for i, r in enumerate(all_results):
    # Per-epoch log of this run, fetched by (seed, weight init)
    run_df = results.get_run(r["seed"], "he")

    loss_ax = axes[0][i]
    loss_ax.plot(run_df["epoch"], run_df["train_loss"], label="Train Loss")
    loss_ax.plot(run_df["epoch"], run_df["val_loss"], label="Val Loss")
    loss_ax.set_title(f"Seed {r['seed']} - Loss")
    loss_ax.set_xlabel("Epoch")
    loss_ax.set_ylabel("Loss")
    loss_ax.legend()
    loss_ax.grid(True, alpha=0.3)

    acc_ax = axes[1][i]
    acc_ax.plot(run_df["epoch"], run_df["train_acc"], label="Train Acc")
    acc_ax.plot(run_df["epoch"], run_df["val_acc"], label="Val Acc")
    acc_ax.set_title(f"Seed {r['seed']} - Accuracy (Test: {r['test_acc']:.4f})")
    acc_ax.set_xlabel("Epoch")
    acc_ax.set_ylabel("Accuracy")
    acc_ax.legend()
    acc_ax.grid(True, alpha=0.3)

plt.suptitle("Stage 2e: Baseline Training (Labeled Data Only, He Init)", fontsize=14)
plt.tight_layout()
plt.show()