In [None]:
import os,sys
sys.path.insert(0, os.path.abspath('..'))
import random
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, ConcatDataset
from pkldataset import PKLDataset, NoisyPKLDataset

# Utility to set seeds for reproducibility
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (plus all CUDA devices when CUDA is available), then switches
    cuDNN to deterministic mode with autotuning disabled.

    Args:
        seed: Integer seed passed unchanged to each generator.
    """
    # Same order as before: stdlib, NumPy, then PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Force deterministic cuDNN kernels and turn off the benchmark autotuner.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Model factory: builds a fresh 1-D CNN classifier on each call
def get_model(input_length: int = 2800, num_classes: int = 10, input_channels: int = 1):
    """Build and return a new 1-D convolutional classifier.

    The network is three identical-shaped stages
    (Conv1d k=31/pad=15 -> BatchNorm1d -> ReLU -> MaxPool1d(2)) with
    16, 32 and 64 output channels, followed by a flatten, a 128-unit hidden
    layer with dropout 0.5 and a final linear layer producing
    ``num_classes`` logits.

    Args:
        input_length: Length of the input signal. The convolutions keep the
            length unchanged and each pooling layer halves it, so the
            flattened feature size is ``64 * (input_length // 8)``.
        num_classes: Number of output logits.
        input_channels: Number of channels expected by the first convolution.

    Returns:
        A freshly initialised ``CNN`` instance (an ``nn.Module``).
    """

    def conv_stage(in_ch: int, out_ch: int) -> nn.Sequential:
        # One length-preserving convolution followed by a 2x downsample.
        return nn.Sequential(
            nn.Conv1d(in_ch, out_ch, kernel_size=31, padding=15),
            nn.BatchNorm1d(out_ch),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(2),
        )

    class CNN(nn.Module):
        def __init__(self):
            super().__init__()
            # Stages are created in order so parameter initialisation draws
            # from the RNG in the same sequence: conv1, conv2, conv3, fc.
            self.conv1 = conv_stage(input_channels, 16)
            self.conv2 = conv_stage(16, 32)
            self.conv3 = conv_stage(32, 64)

            # Three MaxPool1d(2) layers shrink the length by a factor of 8.
            flat_features = 64 * (input_length // 8)
            self.fc = nn.Sequential(
                nn.Flatten(),
                nn.Linear(flat_features, 128),
                nn.ReLU(inplace=True),
                nn.Dropout(0.5),
                nn.Linear(128, num_classes),
            )

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            # A 2-D input is given a singleton channel axis at position 1.
            if x.dim() == 2:
                x = x.unsqueeze(1)
            for stage in (self.conv1, self.conv2, self.conv3):
                x = stage(x)
            return self.fc(x)

    return CNN()

# Training function
def train_model(model, loader, criterion, optimizer, scheduler,
                num_epochs, device, max_grad_norm=1.0):
    """Train ``model`` in place for ``num_epochs`` epochs and return it.

    For every batch the targets are reduced to class indices with
    ``argmax(dim=1)`` before being passed to ``criterion``; gradients are
    clipped to ``max_grad_norm`` (global L2 norm) before each optimizer step.
    The learning-rate scheduler is stepped once per epoch, and the
    sample-weighted mean loss of the epoch is printed.

    Args:
        model: ``nn.Module`` to optimise; moved to ``device``.
        loader: Iterable yielding ``(inputs, targets)`` tensor pairs.
            Targets must have a class axis at dim 1 (e.g. one-hot rows).
        criterion: Loss callable taking ``(logits, class_indices)``.
        optimizer: Optimizer built over ``model.parameters()``.
        scheduler: Object with a ``step()`` method, or ``None`` to skip
            learning-rate scheduling.
        num_epochs: Number of passes over ``loader``.
        device: Device the model and each batch are moved to.
        max_grad_norm: Maximum gradient norm used for clipping.

    Returns:
        The same ``model`` object, left in training mode.
    """
    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        samples_seen = 0
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            y_idx = targets.argmax(dim=1)
            optimizer.zero_grad()
            out = model(inputs)
            loss = criterion(out, y_idx)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            batch_size = inputs.size(0)
            # Weight by batch size so a smaller final batch is averaged correctly.
            running_loss += loss.item() * batch_size
            samples_seen += batch_size
        if scheduler is not None:
            scheduler.step()
        # Divide by the samples actually processed rather than
        # len(loader.dataset): the two differ with drop_last=True or a
        # subsampling sampler, and plain iterables have no .dataset at all.
        avg_loss = running_loss / samples_seen if samples_seen else float("nan")
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")
    return model

# Evaluation function
def eval_model(model, loader, device):
    """Return the top-1 accuracy of ``model`` on ``loader`` as a percentage.

    The model is moved to ``device`` (matching ``train_model``) and switched
    to evaluation mode; it is left in evaluation mode on return. Targets are
    reduced to class indices with ``argmax(dim=1)`` and compared with the
    argmax of the model outputs.

    Args:
        model: ``nn.Module`` to evaluate.
        loader: Iterable yielding ``(inputs, targets)`` tensor pairs.
            Targets must have a class axis at dim 1 (e.g. one-hot rows).
        device: Device the model and each batch are moved to.

    Returns:
        Accuracy in the range [0, 100] as a Python float.

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    # Keep model and batches on the same device even when the caller passes
    # a model that has not been moved yet.
    model.to(device)
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            y_idx = targets.argmax(dim=1)
            out = model(inputs)
            preds = out.argmax(dim=1)
            correct += (preds == y_idx).sum().item()
            total += targets.size(0)
    return 100. * correct / total

if __name__ == '__main__':
    # ------------------------------------------------------------------
    # Experiment configuration
    # ------------------------------------------------------------------
    # Training directories swept over, one model per directory and seed.
    train_paths = [
        "../datasets/RPDC197/train_20",
        "../datasets/RPDC197/train_50",
        "../datasets/RPDC197/train_100",
        "../datasets/RPDC197/train_200",
        "../datasets/RPDC197/train_300",
        "../datasets/RPDC197/train_400",
        "../datasets/RPDC197/train_500",
        "../datasets/RPDC197/train_600",
    ]

    # Each trained model is scored on every one of these validation sets.
    val_paths = [
        "../datasets/RPDC185/val_1000",
        "../datasets/RPDC188/val_1000",
        "../datasets/RPDC191/val_1000",
        "../datasets/RPDC194/val_1000",
        "../datasets/RPDC197/val_1000",
    ]
    seeds = [101, 202, 303, 404, 505, 606, 707, 808, 909, 1001]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    criterion = nn.CrossEntropyLoss()

    # results[train_path][val_path] -> list of accuracies, one entry per seed
    results = {}
    for tp in train_paths:
        results[tp] = {vp: [] for vp in val_paths}

    # ------------------------------------------------------------------
    # Train and evaluate: one run per (seed, training directory)
    # ------------------------------------------------------------------
    for seed in seeds:
        print(f"\n=== Seed {seed} ===")
        # Seeded once per seed; the runs for the individual training
        # directories then share the continuing RNG stream.
        set_seed(seed)
        for tp in train_paths:
            print(f"-- Training on {tp}")

            # PKLDataset and NoisyPKLDataset built from the same directory,
            # concatenated into a single training set.
            ds_real = PKLDataset(tp)
            ds_noisy = NoisyPKLDataset(tp)
            combined = ConcatDataset([ds_real, ds_noisy])
            train_loader = DataLoader(combined, batch_size=32, shuffle=True)

            # Fresh model, Adam optimizer and a StepLR schedule that cuts
            # the learning rate by 10x after 50 of the 100 epochs.
            model = get_model()
            model = model.to(device)
            optimizer = optim.Adam(
                model.parameters(), lr=1e-3, weight_decay=1e-5
            )
            scheduler = optim.lr_scheduler.StepLR(
                optimizer, step_size=50, gamma=0.1
            )
            model = train_model(
                model, train_loader, criterion, optimizer, scheduler,
                num_epochs=100, device=device,
            )

            # Score the trained model on every validation set.
            for vp in val_paths:
                val_loader = DataLoader(
                    PKLDataset(vp), batch_size=64, shuffle=False
                )
                acc = eval_model(model, val_loader, device)
                results[tp][vp].append(acc)
                print(f"[{tp} -> {vp}] Seed {seed}: Acc = {acc:.2f}%")

    # ------------------------------------------------------------------
    # Summary: mean and sample standard deviation across seeds
    # ------------------------------------------------------------------
    print("\n=== Summary across seeds ===")
    for tp in train_paths:
        for vp in val_paths:
            arr = np.array(results[tp][vp])
            mean = arr.mean()
            std = arr.std(ddof=1)  # ddof=1 -> sample (n-1) standard deviation
            print(f"{tp} -> {vp}: Mean = {mean:.2f}%, Std = {std:.2f}%")


=== Seed 101 ===
-- Training on ../datasets/RPDC197/train_20
Epoch 1/100, Loss: 2.4692
Epoch 2/100, Loss: 2.5645
Epoch 3/100, Loss: 2.4145
Epoch 4/100, Loss: 2.0454
Epoch 5/100, Loss: 1.3329
Epoch 6/100, Loss: 1.5097
Epoch 7/100, Loss: 1.1092
Epoch 8/100, Loss: 1.3080
Epoch 9/100, Loss: 0.8562
Epoch 10/100, Loss: 0.9479
Epoch 11/100, Loss: 0.8235
Epoch 12/100, Loss: 0.8043
Epoch 13/100, Loss: 0.7993
Epoch 14/100, Loss: 0.5771
Epoch 15/100, Loss: 0.5400
Epoch 16/100, Loss: 0.9197
Epoch 17/100, Loss: 0.7630
Epoch 18/100, Loss: 0.5550
Epoch 19/100, Loss: 0.4413
Epoch 20/100, Loss: 0.3313
Epoch 21/100, Loss: 0.4275
Epoch 22/100, Loss: 0.5222
Epoch 23/100, Loss: 0.3967
Epoch 24/100, Loss: 0.3433
Epoch 25/100, Loss: 0.1866
Epoch 26/100, Loss: 0.2162
Epoch 27/100, Loss: 0.1349
Epoch 28/100, Loss: 0.3110
Epoch 29/100, Loss: 0.2350
Epoch 30/100, Loss: 0.1546
Epoch 31/100, Loss: 0.1846
Epoch 32/100, Loss: 0.1613
Epoch 33/100, Loss: 0.4824
Epoch 34/100, Loss: 0.1791
Epoch 35/100, Loss: 0.2411
Ep

In [2]:
# Build the summary dict keyed by the numeric RPDC ID.
# Layout: data[rpdc_id] = {'mean': [...], 'std': [...]}, where each list has
# one entry per training path, in the order of train_paths.
data = {}
for vp in val_paths:
    # The parent directory of the validation path names the dataset,
    # e.g. "RPDC185" in ".../RPDC185/val_1000" -> key 185.
    dataset_dir = vp.split('/')[-2]
    key = int(dataset_dir.replace('RPDC', ''))

    # Accuracies across seeds for this validation set, one array per training path.
    per_train = [np.array(results[tp][vp]) for tp in train_paths]
    means = [round(accs.mean(), 2) for accs in per_train]
    stds = [round(accs.std(ddof=1), 2) for accs in per_train]

    data[key] = {'mean': means, 'std': stds}

In [6]:
# Display the summary dict: {rpdc_id: {'mean': [...], 'std': [...]}}.
data

{185: {'mean': [54.72, 60.32, 69.42, 69.95, 71.58, 67.77, 69.24, 70.12],
  'std': [3.86, 5.51, 3.81, 3.87, 2.6, 3.4, 3.25, 2.39]},
 188: {'mean': [44.71, 56.71, 71.47, 72.43, 74.48, 72.99, 71.64, 74.39],
  'std': [3.55, 3.93, 5.24, 5.16, 2.65, 4.81, 5.15, 5.21]},
 191: {'mean': [70.43, 72.92, 85.24, 90.06, 92.14, 89.98, 91.05, 92.23],
  'std': [5.88, 6.55, 3.96, 2.84, 1.92, 3.38, 2.26, 2.2]},
 194: {'mean': [75.02, 82.06, 94.79, 97.27, 97.87, 96.33, 97.81, 97.34],
  'std': [5.58, 4.83, 3.2, 2.08, 1.09, 2.13, 0.95, 1.81]},
 197: {'mean': [74.21, 93.63, 97.04, 97.85, 99.01, 99.17, 99.32, 99.48],
  'std': [5.62, 1.53, 0.51, 0.29, 0.26, 0.34, 0.22, 0.18]}}