# In [None]:
import os, sys
import random
sys.path.insert(0, os.path.abspath('..'))
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, ConcatDataset
from pkldataset import PKLDataset, NoisyPKLDataset
import gen
import form

# Utility to set seeds for reproducibility
def set_seed(seed):
    """Seed every random number generator used by this script.

    Seeds Python's ``random`` module, NumPy's global generator and
    PyTorch's CPU generator; when CUDA is available, all CUDA devices are
    seeded as well. Finally cuDNN is put into deterministic mode with
    benchmark autotuning switched off.

    Args:
        seed: Seed value passed unchanged to each generator.
    """
    # Same seed, same order as before: Python, NumPy, then PyTorch (CPU).
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Trade cuDNN autotuning speed for run-to-run repeatability.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Model definition
def get_model(input_length: int = 2800, num_classes: int = 10, input_channels: int = 1):
    """Build a three-stage 1-D CNN classifier.

    Each convolutional stage is Conv1d (kernel 31, padding 15, so the
    sequence length is preserved) -> BatchNorm1d -> ReLU -> MaxPool1d(2).
    After three stages the length is ``input_length // 8`` with 64
    channels; the result is flattened and passed through
    Linear(128) -> ReLU -> Dropout(0.5) -> Linear(num_classes).

    Args:
        input_length: Length of the input sequences. The first linear
            layer is sized from this value, so inputs of a different
            length will not fit the classifier head.
        num_classes: Number of output logits.
        input_channels: Number of channels of the input signal.

    Returns:
        A freshly initialised ``CNN`` module (an ``nn.Module``).
    """
    kernel_size, padding = 31, 15  # stride-1 "same" convolution

    def _stage(in_channels, out_channels):
        # One conv stage; every stage halves the sequence length.
        return nn.Sequential(
            nn.Conv1d(in_channels, out_channels,
                      kernel_size=kernel_size, padding=padding),
            nn.BatchNorm1d(out_channels),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(kernel_size=2),
        )

    class CNN(nn.Module):
        def __init__(self):
            super().__init__()
            # Attribute names and creation order are kept so state_dict
            # keys and seeded initialisation match earlier checkpoints.
            self.conv1 = _stage(input_channels, 16)
            self.conv2 = _stage(16, 32)
            self.conv3 = _stage(32, 64)
            # Three poolings of factor 2 -> length shrinks by a factor of 8.
            flat_features = 64 * (input_length // 8)
            self.fc_layers = nn.Sequential(
                nn.Flatten(),
                nn.Linear(flat_features, 128),
                nn.ReLU(inplace=True),
                nn.Dropout(0.5),
                nn.Linear(128, num_classes),
            )

        def forward(self, x):
            # Accept (batch, length) input by inserting a channel axis.
            if x.dim() == 2:
                x = x.unsqueeze(1)
            for stage in (self.conv1, self.conv2, self.conv3):
                x = stage(x)
            return self.fc_layers(x)

    return CNN()

# Training with per-epoch validation (used for Phase 1 pretraining and Phase 2 fine-tuning)

def train_model_phase1(model, train_loader, val_loader, criterion, optimizer, scheduler,
                       num_epochs=10, device=torch.device("cpu"), max_grad_norm=1.0):
    """Train ``model`` and restore the weights of its best epoch.

    Each epoch runs one pass over ``train_loader`` (with gradient-norm
    clipping), steps ``scheduler`` once, then measures accuracy on
    ``val_loader`` and prints a progress line. Class indices are obtained
    as ``y.argmax(dim=1)`` for both loaders.

    The "best" epoch is the one with the lowest mean *training* loss;
    validation accuracy is only reported, not used for selection.

    Args:
        model: Network to train; modified in place.
        train_loader: Iterable of ``(x, y)`` training batches.
        val_loader: Iterable of ``(x, y)`` validation batches.
        criterion: Callable ``criterion(logits, class_indices) -> loss``.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: Learning-rate scheduler, stepped once per epoch.
        num_epochs: Number of epochs to run.
        device: Device the batches are moved to.
        max_grad_norm: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        ``model``, loaded with the parameters from the best epoch. If no
        epoch produced a finite improvement (for example ``num_epochs`` is
        0), the model is returned as it stands.
    """
    best_loss = float('inf')
    best_state = None
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        seen = 0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            targets = y.argmax(dim=1)
            optimizer.zero_grad()
            out = model(x)
            loss = criterion(out, targets)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            running_loss += loss.item() * x.size(0)
            seen += x.size(0)
        scheduler.step()
        # Average over the samples actually processed, which stays correct
        # with drop_last / custom samplers and for an empty loader.
        epoch_loss = running_loss / seen if seen else float('nan')
        # Validation
        model.eval()
        correct = total = 0
        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(device), y.to(device)
                targets = y.argmax(dim=1)
                out = model(x)
                pred = out.argmax(dim=1)
                correct += (pred == targets).sum().item()
                total += y.size(0)
        acc = 100. * correct / total if total else float('nan')
        print(f"Phase1 Epoch {epoch+1}/{num_epochs} - Loss: {epoch_loss:.4f}, Val Acc: {acc:.2f}%")
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            # state_dict() hands back references to the live tensors, so
            # they must be cloned; otherwise later epochs overwrite the
            # snapshot and "best" silently becomes "last".
            best_state = {name: tensor.detach().clone()
                          for name, tensor in model.state_dict().items()}
    # load best state (skipped when no epoch ever qualified)
    if best_state is not None:
        model.load_state_dict(best_state)
    return model

# Evaluation on arbitrary loader
def eval_model(model, loader, device=torch.device("cpu")):
    """Return the classification accuracy of ``model`` on ``loader``, in percent.

    The model is switched to eval mode and run without gradient tracking.
    For every ``(x, y)`` batch the predicted class is the argmax of the
    model output over dim 1, and the reference class is the argmax of
    ``y`` over dim 1.

    Args:
        model: Network to evaluate.
        loader: Iterable of ``(x, y)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``100 * correct / total`` as a float.

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    model.eval()
    n_hits, n_seen = 0, 0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            expected = labels.argmax(dim=1)
            predicted = model(inputs).argmax(dim=1)
            n_hits += (predicted == expected).sum().item()
            n_seen += labels.size(0)
    return 100. * n_hits / n_seen

if __name__ == '__main__':
    # Experiment driver:
    #   Phase 1 - pretrain the CNN on a train/val split of `train_path_1`
    #             and save the weights to `pretrained_model_path`.
    #   Phase 2 - for every seed and every training subset, build a
    #             combined (real + synthetic + noisy) dataset, fine-tune a
    #             copy of the pretrained network on it and evaluate on all
    #             validation sets.
    #   Summary - mean / sample standard deviation of accuracy over seeds.

    # Configuration
    train_path_1 = r"C:\Users\gus07\Desktop\data hiwi\preprocessing\HC\T197\RP"
    pretrained_model_path = "cnn_model.pth"
    train_sizes = [
        "../datasets/RPDC197/train_20",
        "../datasets/RPDC197/train_50",
        "../datasets/RPDC197/train_100",
        "../datasets/RPDC197/train_200",
        "../datasets/RPDC197/train_300",
        "../datasets/RPDC197/train_400",
        "../datasets/RPDC197/train_500",
        "../datasets/RPDC197/train_600",
    ]

    # Validation datasets to test each model on
    val_paths = [
        "../datasets/RPDC185/val_1000",
        "../datasets/RPDC188/val_1000",
        "../datasets/RPDC191/val_1000",
        "../datasets/RPDC194/val_1000",
        "../datasets/RPDC197/val_1000",
    ]
    seeds = [101,202,303,404,505,606,707,808,909,1001]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    criterion = nn.CrossEntropyLoss()

    # Phase 1: Pretrain on original split
    # NOTE(review): this phase runs before any set_seed() call, so the
    # pretraining itself is not seeded - confirm whether that is intended.
    ds_train1, ds_val1 = PKLDataset.split_dataset(train_path_1)
    loader_train1 = DataLoader(ds_train1, batch_size=64, shuffle=True)
    # Accuracy does not depend on batch order, so the validation loader is
    # not shuffled; this also keeps it from drawing on the global RNG.
    loader_val1 = DataLoader(ds_val1, batch_size=64, shuffle=False)
    model_phase1 = get_model().to(device)
    opt1 = optim.Adam(model_phase1.parameters(), lr=1e-3, weight_decay=1e-5)
    sch1 = optim.lr_scheduler.StepLR(opt1, step_size=50, gamma=0.1)
    print("=== Phase 1: Pretraining ===")
    model_phase1 = train_model_phase1(model_phase1, loader_train1, loader_val1,
                                      criterion, opt1, sch1,
                                      num_epochs=10, device=device)
    torch.save(model_phase1.state_dict(), pretrained_model_path)
    # `gen` is a project module not visible here; presumably this trains a
    # generator on the Phase 1 data and saves it for reuse in Phase 2 -
    # TODO confirm against gen.generate.
    gen.generate(loader_train1, num_epochs=150, num_samples=10,
             save_new_generator_path="generator_model.pth",)

    # Container for results: results[train_subset][val_path] -> list of
    # accuracies, one entry per seed.
    results = {t: {vp: [] for vp in val_paths} for t in train_sizes}

    for seed in seeds:
        print(f"\n>>> Seed {seed}")
        set_seed(seed)
        # Phase 2: Transfer + eval
        for t in train_sizes:
            print(f"-- Transfer on {t}")
            # Synthetic generation
            ds_t = PKLDataset(t)
            loader_t = DataLoader(ds_t, batch_size=64, shuffle=True)
            gen.generate(loader_t,
                         num_epochs=150,
                         num_samples=20,
                         pretrained_generator_path="generator_model.pth")
            # presumably converts the generator output into the files read
            # from "synth_data/individual_samples" below - verify in `form`.
            form.format()
            # Load synthetic and noisy sets
            ds_synth = PKLDataset("synth_data/individual_samples")
            ds_noisy = NoisyPKLDataset(t)
            combined = ConcatDataset([ds_t, ds_synth, ds_noisy])
            loader_comb = DataLoader(combined, batch_size=32, shuffle=True)

            # Initialize and load pretrained
            net = get_model().to(device)
            # map_location keeps the load working when the checkpoint holds
            # CUDA tensors but the current device is the CPU (or vice versa).
            net.load_state_dict(torch.load(pretrained_model_path, map_location=device))
            opt2 = optim.Adam(net.parameters(), lr=1e-3, weight_decay=1e-5)
            sch2 = optim.lr_scheduler.StepLR(opt2, step_size=50, gamma=0.1)

            # Train on combined (the per-epoch validation printout still
            # uses the Phase 1 validation split)
            net = train_model_phase1(net, loader_comb, loader_val1,
                                     criterion, opt2, sch2,
                                     num_epochs=100, device=device)

            # Evaluate on all validation splits
            for vp in val_paths:
                ds_vp = PKLDataset(vp)
                loader_vp = DataLoader(ds_vp, batch_size=64, shuffle=False)
                acc = eval_model(net, loader_vp, device)
                results[t][vp].append(acc)
                print(f"Seed {seed}, {t} -> {vp}: {acc:.2f}%")

    # Summary
    print("\n=== Summary over seeds ===")
    for t in train_sizes:
        for vp in val_paths:
            arr = np.array(results[t][vp])
            mean = arr.mean()
            # ddof=1: sample standard deviation across seeds
            std = arr.std(ddof=1)
            print(f"{t} -> {vp}: Mean={mean:.2f}%, Std={std:.2f}%")