In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedShuffleSplit
import time
from tqdm import tqdm
import json

In [55]:
# Seed the PyTorch and NumPy global random number generators with one shared value.
seed = 42
for _seed_rng in (torch.manual_seed, np.random.seed):
    _seed_rng(seed)

### Loading and downsampling data

In [28]:
# Read the CSV and separate the label column 'y' from the remaining feature columns.
data = pd.read_csv('../data/xor_data.csv')
X = data.drop(columns='y')
y = data['y']

# Draw a single stratified 10,000-row subsample; only the "train" side of the
# split is used, the complementary indices are discarded.
sss = StratifiedShuffleSplit(n_splits=1, train_size=10000, random_state=42)
train_idx, _ = next(sss.split(X, y))
X_s, y_s = X.iloc[train_idx], y.iloc[train_idx]

### K-fold setup

In [29]:
def generate_folds(X, y, n_folds = 10, random_state = 42):
    """Split ``X`` and ``y`` into shuffled K-fold train/validation partitions.

    Parameters
    ----------
    X, y : objects supporting positional ``.iloc`` indexing
        Features and labels of equal length (the notebook passes a pandas
        DataFrame and Series).
    n_folds : int, default 10
        Number of folds handed to ``KFold`` as ``n_splits``.
    random_state : int or None, default 42
        Seed for the fold shuffling. The default reproduces the previously
        hard-coded value, so existing calls produce the same folds.

    Returns
    -------
    list of dict
        One dict per fold with keys ``'Xt'`` / ``'yt'`` (training part) and
        ``'Xv'`` / ``'yv'`` (validation part).
    """
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    return [
        {
            'Xt': X.iloc[train_index],
            'Xv': X.iloc[val_index],
            'yt': y.iloc[train_index],
            'yv': y.iloc[val_index],
        }
        for train_index, val_index in kf.split(X, y)
    ]

In [30]:
# Build the 10 cross-validation folds from the downsampled data.
folds = generate_folds(X_s, y_s, n_folds=10)

### Auxiliary functions

In [None]:
def extend_with_last(arr, target_length):
    """Pad a sequence to ``target_length`` by repeating its final element.

    Parameters
    ----------
    arr : sequence
        Values to pad. Any iterable with a length is accepted; it is copied
        into a new list and never modified.
    target_length : int
        Desired minimum length of the result.

    Returns
    -------
    list
        A new list. If ``arr`` already has at least ``target_length`` items it
        is returned unchanged (as a copy); otherwise the last item is appended
        until the length equals ``target_length``.

    Raises
    ------
    IndexError
        If ``arr`` is empty and padding is required, since there is no last
        element to repeat.
    """
    items = list(arr)
    missing = target_length - len(items)
    # Nothing to pad: return before touching items[-1], so an empty input
    # with target_length <= 0 is valid instead of crashing.
    if missing <= 0:
        return items
    if not items:
        raise IndexError("cannot extend an empty sequence: there is no last element to repeat")
    return items + [items[-1]] * missing

class FeedforwardNet(nn.Module):
    """Fully connected network with a single ReLU hidden layer.

    Layout: ``Linear(input_dim -> hidden_dim)`` -> ``ReLU`` ->
    ``Linear(hidden_dim -> output_dim)``. No activation is applied to the
    output layer; the raw values are returned.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Create the hidden layer, the activation and the output layer."""
        super().__init__()
        # Creation order is kept (hidden before output) so random weight
        # initialisation consumes the RNG in the same sequence.
        self.hidden = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.act = nn.ReLU()
        self.output = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return the output-layer values for the input batch ``x``."""
        hidden_activation = self.act(self.hidden(x))
        return self.output(hidden_activation)

def plot_results(results_in):
    """Plot the per-epoch training and validation loss curves of one run.

    ``results_in`` is indexed with the keys ``'train_losses_by_epochs'`` and
    ``'val_losses'``; each value is passed to ``plt.plot`` as one curve.
    The figure is drawn on the current pyplot figure and shown immediately.
    """
    curves = (
        ('train_losses_by_epochs', 'Train Loss'),
        ('val_losses', 'Validation Loss'),
    )
    for result_key, legend_name in curves:
        plt.plot(results_in[result_key], label=legend_name)

    # Anchor the y-axis at zero after plotting.
    plt.ylim(bottom=0)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.title('Loss Curve for Classification Problem 1')
    plt.show()

def save_json(data, path, indent=4):
    """Write ``data`` to ``path`` as indented JSON.

    Parameters
    ----------
    data : JSON-serialisable object
    path : str or path-like
        Destination file; it is created or overwritten.
    indent : int, default 4
        Indentation passed to the JSON encoder.

    Raises
    ------
    TypeError
        If ``data`` contains a value the JSON encoder cannot serialise. In
        that case ``path`` is left untouched.
    """
    # Serialise completely before opening the file: opening with "w" truncates
    # it, so an encoding failure midway would otherwise destroy any previous
    # content and leave a partial, invalid JSON file behind.
    payload = json.dumps(data, indent=indent)
    with open(path, "w", encoding="utf-8") as f:
        f.write(payload)

## SGD

### Model Training function

In [74]:
def train_sgd_model(fold, hidden_dim=2, input_dim = 2, output_dim = 2, batch_size=1, learning_rate = 0.1, max_epochs=10, to_print=True, patience=5):
    """Train a ``FeedforwardNet`` with plain SGD on one cross-validation fold.

    Parameters
    ----------
    fold : dict
        Must provide ``'Xt'``, ``'yt'`` (training) and ``'Xv'``, ``'yv'``
        (validation); each value's ``.values`` is converted to a tensor
        (features as float32, labels as long).
    hidden_dim, input_dim, output_dim : int
        Layer sizes forwarded to ``FeedforwardNet``.
    batch_size : int
        Mini-batch size of the shuffled training ``DataLoader``.
    learning_rate : float
        Step size of ``optim.SGD``.
    max_epochs : int
        Upper bound on the number of epochs.
    to_print : bool
        When truthy, print one summary line per epoch and the early-stopping
        message.
    patience : int
        Stop once the validation loss has not improved (by more than 1e-6)
        for this many consecutive epochs.

    Returns
    -------
    dict
        ``model`` (the trained network in its final state),
        ``train_losses_by_epochs`` (mean per-sample training loss per epoch),
        ``train_losses_by_samples`` (loss of every optimizer step; one entry
        per batch, i.e. per sample only when ``batch_size == 1``),
        ``val_losses``, ``train_accs``, ``val_accs`` (one entry per epoch),
        ``global_duration`` and ``epoch_durations`` (seconds), and
        ``total_epochs`` (number of epochs actually run).
    """
    X_val = torch.tensor(fold['Xv'].values, dtype=torch.float32)
    y_val = torch.tensor(fold['yv'].values, dtype=torch.long)
    X_train = torch.tensor(fold['Xt'].values, dtype=torch.float32)
    y_train = torch.tensor(fold['yt'].values, dtype=torch.long)

    train_dataset = TensorDataset(X_train, y_train)
    loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    model = FeedforwardNet(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    train_losses = []
    train_accs = []
    val_losses = []
    val_accs = []
    per_sample_loss = []
    times = []
    best_val_loss = float('inf')
    epochs_since_improve = 0
    global_start = time.time()
    total_epochs_ran = 0
    for epoch in range(max_epochs):
        local_start = time.time()
        model.train()
        train_loss = 0.0
        correct = 0
        total = 0
        for batch_X, batch_y in loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            n_in_batch = batch_y.size(0)
            # CrossEntropyLoss averages over the batch, so weight the batch
            # mean by the batch size; dividing the plain sum of batch means by
            # the sample count is only correct when batch_size == 1.
            train_loss += batch_loss * n_in_batch
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == batch_y).sum().item()
            total += n_in_batch
            per_sample_loss.append(batch_loss)

        avg_train_loss = train_loss / total
        train_acc = correct / total
        train_losses.append(avg_train_loss)
        train_accs.append(train_acc)

        # Validation is evaluated on the whole validation set in one pass.
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val)
            val_loss = criterion(val_outputs, y_val).item()
            _, val_pred = torch.max(val_outputs, 1)
            val_total = y_val.size(0)
            val_correct = (val_pred == y_val).sum().item()
            val_acc = val_correct / val_total
            val_losses.append(val_loss)
            val_accs.append(val_acc)

        # Improvements smaller than 1e-6 do not reset the patience counter.
        if val_loss < best_val_loss - 1e-6:
            best_val_loss = val_loss
            epochs_since_improve = 0
        else:
            epochs_since_improve += 1

        local_stop = time.time()
        local_duration = local_stop - local_start
        times.append(local_duration)
        total_epochs_ran += 1

        if to_print:
            print(f"Epoch {epoch+1}/{max_epochs} | "
                f"Train Loss: {avg_train_loss:.4f} | "
                f"Train Acc: {train_acc:.4f} | "
                f"Val Loss: {val_loss:.4f} | "
                f"Val Acc: {val_acc:.4f} | "
                f"Duration: {local_duration:.4f}")

        # Stop when patience is exhausted or validation accuracy is perfect.
        # NOTE(review): the accuracy rule inspects *up to* the last three
        # epochs, so it also fires after one or two epochs if every epoch so
        # far reached 1.0 - confirm whether three full epochs were intended.
        if epochs_since_improve >= patience or all(abs(x - 1.0) < 1e-6 for x in val_accs[-3:]):
            if to_print:
                print(f"Early stopping at epoch {epoch+1}")
            break

    global_end = time.time()
    global_duration = global_end - global_start

    results = {
        "model": model,
        "train_losses_by_epochs": train_losses,
        "train_losses_by_samples": per_sample_loss,
        "val_losses": val_losses,
        "train_accs": train_accs,
        "val_accs": val_accs,
        "global_duration": global_duration,
        "epoch_durations": times,
        "total_epochs": total_epochs_ran
    }
    return results

### Evaluating hidden dimension and learning rate

In [73]:
def find_optimal_hidden_layers(folds, dim_list = [1, 2, 3], max_epochs=20):
    """Train one SGD model per (hidden size, fold) pair and collect the results.

    For every ``n`` in ``dim_list`` each fold is trained silently via
    ``train_sgd_model(..., hidden_dim=n, max_epochs=max_epochs)`` with a
    progress bar over the folds.

    Returns a nested dict ``{'dim_<n>': {'fold_<i>': <training results>}}``;
    no selection of a best size is performed here.
    """
    return {
        f'dim_{hidden_size}': {
            f'fold_{fold_idx}': train_sgd_model(
                folds[fold_idx],
                hidden_dim=hidden_size,
                max_epochs=max_epochs,
                to_print=False,
            )
            for fold_idx in tqdm(range(len(folds)))
        }
        for hidden_size in dim_list
    }

In [68]:
# Sweep the default hidden sizes over all folds (long-running: one SGD run per size and fold).
dim_results = find_optimal_hidden_layers(folds=folds)

 10%|█         | 1/10 [00:40<06:02, 40.28s/it]

Early stopping at epoch 8


 20%|██        | 2/10 [01:25<05:45, 43.15s/it]

Early stopping at epoch 8


 30%|███       | 3/10 [02:02<04:43, 40.51s/it]

Early stopping at epoch 6


 40%|████      | 4/10 [03:44<06:28, 64.73s/it]

Early stopping at epoch 16


 50%|█████     | 5/10 [04:27<04:43, 56.76s/it]

Early stopping at epoch 7


 60%|██████    | 6/10 [05:20<03:42, 55.61s/it]

Early stopping at epoch 10


 70%|███████   | 7/10 [06:32<03:02, 60.92s/it]

Early stopping at epoch 12


 80%|████████  | 8/10 [07:08<01:46, 53.02s/it]

Early stopping at epoch 6


 90%|█████████ | 9/10 [08:00<00:52, 52.80s/it]

Early stopping at epoch 10


 90%|█████████ | 9/10 [08:12<00:54, 54.70s/it]


KeyboardInterrupt: 