In [None]:
import copy
import os
import random
import sys

# Make the parent directory importable so the local modules below resolve.
sys.path.insert(0, os.path.abspath('..'))

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from pkldataset import PKLDataset
from helpers import set_seed, get_model, eval_model, train_model


# === CONFIGURATION ===
# Folder with the pretraining data (e.g. "RPHC197"); adjust to the local machine.
train_path_1 = r"C:\Users\gus07\Desktop\data hiwi\preprocessing\HC\T197\RP"

# Datasets used for the transfer (second) training phase, one run per entry.
transfer_sets = [
    "../datasets/RPDC197/train_20",
    "../datasets/RPDC197/train_50",
    "../datasets/RPDC197/train_100",
    "../datasets/RPDC197/train_200",
    "../datasets/RPDC197/train_300",
    "../datasets/RPDC197/train_400",
    "../datasets/RPDC197/train_500",
    "../datasets/RPDC197/train_600",
]

# Validation datasets every transferred model is tested on.
val_paths = [
    "../datasets/RPDC185/val_1000",
    "../datasets/RPDC188/val_1000",
    "../datasets/RPDC191/val_1000",
    "../datasets/RPDC194/val_1000",
    "../datasets/RPDC197/val_1000",
]

# The whole pipeline is repeated once per seed.
seeds = [101, 202, 303, 404, 505, 606, 707, 808, 909, 1001]

# Use the GPU when one is available, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
criterion = nn.CrossEntropyLoss()

# Results container:
#   results[transfer_set][val_path] -> list of accuracies, one appended per seed.
# Every (transfer_set, val_path) pair gets its own independent list.
results = {
    train_name: {val_name: [] for val_name in val_paths}
    for train_name in transfer_sets
}

# Validation data depends neither on the seed nor on the transfer set, so it
# is loaded once here instead of being re-read for every
# (seed, transfer set, validation path) combination.
# NOTE(review): assumes PKLDataset(path) is deterministic and does not draw
# from the global RNG; otherwise hoisting shifts the random stream -- confirm.
val_loaders = {
    vp: DataLoader(PKLDataset(vp), batch_size=64, shuffle=False)
    for vp in val_paths
}

# Run the full pretrain -> transfer -> evaluate pipeline once per seed and
# append every resulting accuracy to results[transfer_set][val_path].
for seed in seeds:
    print(f"\n>>> Full pipeline with seed {seed}")
    set_seed(seed)

    # --- FIRST PHASE on train_path_1 ---
    # split_dataset also returns a second (held-out) part; this pipeline never
    # evaluates on it, so no loader is built for it.
    train_ds, val_ds = PKLDataset.split_dataset(train_path_1)
    train_loader1 = DataLoader(train_ds, batch_size=64, shuffle=True)

    model = get_model().to(device)
    opt   = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
    # NOTE(review): if train_model steps the scheduler once per epoch,
    # step_size=50 never triggers within num_epochs=10 -- confirm intended.
    sch   = optim.lr_scheduler.StepLR(opt, step_size=50, gamma=0.1)

    model = train_model(
        model, train_loader1,
        criterion, opt, sch,
        num_epochs=10,
        device=device
    )
    # state_dict() hands back references to the live parameter tensors, not a
    # copy. Deep-copy it so the pretrained weights stay a true snapshot even
    # if `model` is modified later.
    pretrained_state = copy.deepcopy(model.state_dict())

    # --- SECOND PHASE (TRANSFER) ---
    for t in transfer_sets:
        # Start every transfer run from the same pretrained weights.
        tl_model = get_model().to(device)
        tl_model.load_state_dict(pretrained_state)

        # Train on transfer set t with a fresh optimizer and scheduler.
        loader_t = DataLoader(PKLDataset(t), batch_size=64, shuffle=True)
        opt2     = optim.Adam(tl_model.parameters(), lr=1e-3, weight_decay=1e-5)
        sch2     = optim.lr_scheduler.StepLR(opt2, step_size=25, gamma=0.1)

        tl_model = train_model(
            tl_model, loader_t,
            criterion, opt2, sch2,
            num_epochs=100,
            device=device
        )

        # Evaluate on each validation set, reusing the loaders built above.
        for vp in val_paths:
            loader_vp = val_loaders[vp]
            acc       = eval_model(tl_model, loader_vp, device)
            results[t][vp].append(acc)
            print(f"  [Seed {seed}] {t} → {vp}: {acc:.2f}%")

# --- FINAL SUMMARY ---
# For every (transfer set, validation set) pair, report the mean and the
# sample standard deviation (ddof=1) of the accuracies collected over seeds.
print("\n=== Mean ± Std Dev over seeds ===")
for t in transfer_sets:
    per_val = results[t]
    for vp in val_paths:
        scores = per_val[vp]
        mu, sigma = np.mean(scores), np.std(scores, ddof=1)
        print(f"{t} → {vp}: mean = {mu:.2f}%,  std = {sigma:.2f}%")
