# Hybrid Generative & Noisy-Augmented Transfer Learning Pipeline

This notebook:
1. Pretrains a CNN on a primary dataset split with supervised training (Phase 1).
2. Trains a data generator and saves its weights.
3. For each seed and training subset:
   - Generates synthetic samples
   - Formats synthetic data
   - Combines real, synthetic, and noisy data
   - Fine-tunes the pretrained CNN (Phase 2)
   - Evaluates on multiple validation sets

Results are summarized as mean ± standard deviation over seeds for each train → validation pair.

## 1. Imports, Utilities & Model Definition

Import the libraries and project helpers (`set_seed`, `get_model`, `eval_model`), and define the supervised training loop with per-epoch validation that both phases use.

In [None]:
import os, sys, random
sys.path.insert(0, os.path.abspath('..'))
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, ConcatDataset
from pkldataset import PKLDataset, NoisyPKLDataset
import gen, form
from helpers import set_seed, get_model, eval_model

# Training with validation for Phase 1
def train_model_phase1(model, train_loader, val_loader, criterion, optimizer, scheduler,
                       num_epochs=10, device=torch.device('cpu'), max_grad_norm=1.0):
    """Train ``model`` with supervision and restore the best-epoch weights.

    Every epoch performs one optimisation pass over ``train_loader`` with
    gradient-norm clipping, calls ``scheduler.step()`` once, and then prints the
    epoch's mean training loss together with the accuracy on ``val_loader``.

    The "best" epoch is the one with the lowest mean *training* loss; the
    validation accuracy is reported but does not influence the selection.

    Args:
        model: Network to train; updated in place.
        train_loader: Loader yielding ``(x, y)`` batches. ``len(train_loader.dataset)``
            is used to average the loss.
        val_loader: Loader yielding ``(x, y)`` batches used for the accuracy report.
        criterion: Callable ``criterion(outputs, class_indices)`` returning a scalar loss.
        optimizer: Optimizer bound to ``model``'s parameters.
        scheduler: Object with a ``step()`` method, called once per epoch.
        num_epochs: Number of epochs to run.
        device: Device the batches are moved to.
        max_grad_norm: Upper bound passed to ``clip_grad_norm_``.

    Returns:
        The same ``model`` object, loaded with the weights of the best epoch
        (left as-is when no epoch produced a best state, e.g. ``num_epochs=0``).
    """
    best_loss = float('inf')
    best_state = None
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            # Labels are reduced to class indices; presumably y is one-hot
            # encoded along dim 1 -- TODO confirm against the dataset class.
            targets = y.argmax(dim=1)
            optimizer.zero_grad()
            out = model(x)
            loss = criterion(out, targets)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            # Weight by batch size so the epoch value is a per-sample mean.
            running_loss += loss.item() * x.size(0)
        scheduler.step()
        epoch_loss = running_loss / len(train_loader.dataset)

        model.eval()
        correct = total = 0
        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(device), y.to(device)
                targets = y.argmax(dim=1)
                pred = model(x).argmax(dim=1)
                correct += (pred == targets).sum().item()
                total += y.size(0)
        # An empty validation loader yields NaN instead of a ZeroDivisionError.
        acc = 100. * correct / total if total else float('nan')
        print(f"Phase1 Epoch {epoch+1}/{num_epochs} - Loss: {epoch_loss:.4f}, Val Acc: {acc:.2f}%")

        if epoch_loss < best_loss:
            best_loss = epoch_loss
            # state_dict() returns references to the live tensors, so they must
            # be cloned; otherwise later optimizer steps overwrite the snapshot.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }

    # best_state stays None when no epoch ran or the loss was never finite.
    if best_state is not None:
        model.load_state_dict(best_state)
    return model


## 2. Configuration

- **Primary dataset**: path to pretraining folder  
- **Generator & Model checkpoints**  
- **Training subsets**: for Phase 2  
- **Validation sets**: for evaluation  
- **Seeds**: for reproducibility  
- **Results container**

In [None]:
# Paths and settings
# Primary (Phase 1) dataset folder and the file the pretrained CNN weights go to.
train_path_1 = r"C:\Users\gus07\Desktop\data hiwi\preprocessing\HC\T197\RP"
pretrained_model_path = "cnn_model.pth"

# Phase 2 training folders: ../datasets/RPDC197/train_<N>.
train_sizes = [
    f"../datasets/RPDC197/train_{count}"
    for count in (20, 50, 100, 200, 300, 400, 500, 600)
]
# Evaluation folders: ../datasets/RPDC<tag>/val_1000.
val_paths = [
    f"../datasets/RPDC{tag}/val_1000"
    for tag in (185, 188, 191, 194, 197)
]
seeds = [101, 202, 303, 404, 505, 606, 707, 808, 909, 1001]

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
criterion = nn.CrossEntropyLoss()

# results[train_folder][val_folder] -> list of accuracies, one entry per seed.
results = dict(
    (train_dir, dict((val_dir, []) for val_dir in val_paths))
    for train_dir in train_sizes
)


## 3. Phase 1: Supervised Pretraining & Generator Training

Split the primary dataset into training and validation parts, train the CNN and save its weights, then train the generator on the training part.

In [None]:
from torch.utils.data import DataLoader
from pkldataset import PKLDataset

# Split the primary dataset and wrap both parts in shuffled loaders (batch size 64).
ds_train1, ds_val1 = PKLDataset.split_dataset(train_path_1)
loader_train1, loader_val1 = (
    DataLoader(part, batch_size=64, shuffle=True)
    for part in (ds_train1, ds_val1)
)

print("=== Phase 1: Supervised Pretraining ===")
model_phase1 = get_model().to(device)
opt1 = optim.Adam(model_phase1.parameters(), lr=1e-3, weight_decay=1e-5)
# With step_size=50 and only 10 epochs the learning rate never decays in this phase.
sch1 = optim.lr_scheduler.StepLR(opt1, step_size=50, gamma=0.1)
model_phase1 = train_model_phase1(
    model=model_phase1,
    train_loader=loader_train1,
    val_loader=loader_val1,
    criterion=criterion,
    optimizer=opt1,
    scheduler=sch1,
    num_epochs=10,
    device=device,
)
# Persist the pretrained weights; Phase 2 reloads them from this file.
torch.save(model_phase1.state_dict(), pretrained_model_path)

print("=== Phase 1: Generator Training ===")
gen.generate(
    loader_train1,
    num_epochs=150,
    num_samples=10,
    save_new_generator_path="generator_model.pth",
)

## 4. Phase 2: Transfer with Synthetic & Noisy Augmentation

For each seed and training subset:
1. Generate synthetic samples
2. Format synthetic data
3. Combine real, synthetic, and noisy datasets
4. Fine-tune pretrained CNN with supervised training
5. Evaluate on validation sets

In [None]:
# Phase 2 driver: for every seed and every training folder, build an augmented
# training set (real + synthetic + noisy), fine-tune a fresh copy of the
# pretrained CNN on it, and record its accuracy on each validation folder.
for seed in seeds:
    print(f"\n>>> Seed {seed}")
    # Seeded once per seed, so the runs for later training folders continue
    # from the RNG state left behind by the earlier ones.
    set_seed(seed)
    for t in train_sizes:
        print(f"-- Transfer on {t}")
        ds_t = PKLDataset(t)
        loader_t = DataLoader(ds_t, batch_size=64, shuffle=True)

        # Continue generator training on this subset, starting from the Phase 1 generator.
        gen.generate(loader_t, num_epochs=150, num_samples=20, pretrained_generator_path="generator_model.pth")
        # NOTE(review): form.format() is presumed to write the formatted samples
        # into "synth_data/individual_samples" and to replace whatever an earlier
        # iteration left there -- confirm, otherwise stale samples leak across runs.
        form.format()
        ds_synth = PKLDataset("synth_data/individual_samples")
        ds_noisy = NoisyPKLDataset(t)
        combined = ConcatDataset([ds_t, ds_synth, ds_noisy])
        loader_comb = DataLoader(combined, batch_size=32, shuffle=True)

        # Fresh network initialised from the Phase 1 checkpoint. map_location
        # remaps the stored tensors onto the current device, so a checkpoint
        # written on a CUDA machine also loads on a CPU-only one.
        net = get_model().to(device)
        net.load_state_dict(torch.load(pretrained_model_path, map_location=device))
        opt2 = optim.Adam(net.parameters(), lr=1e-3, weight_decay=1e-5)
        # Stepped once per epoch: the learning rate drops by 10x after epoch 50 of 100.
        sch2 = optim.lr_scheduler.StepLR(opt2, step_size=50, gamma=0.1)
        # The per-epoch accuracy printed during fine-tuning is measured on the
        # Phase 1 validation split (loader_val1), not on the folders in val_paths.
        net = train_model_phase1(net, loader_comb, loader_val1, criterion, opt2, sch2, num_epochs=100, device=device)

        for vp in val_paths:
            ds_vp = PKLDataset(vp)
            loader_vp = DataLoader(ds_vp, batch_size=64, shuffle=False)
            acc = eval_model(net, loader_vp, device)
            results[t][vp].append(acc)
            print(f"Seed {seed}, {t} -> {vp}: {acc:.2f}%")

## 5. Summary of Results

Compute the mean and the sample standard deviation (`ddof=1`) over seeds for each train → validation pair.

In [None]:
# Report mean and sample standard deviation (ddof=1) of the per-seed accuracies.
print("\n=== Summary over seeds ===")
for train_dir in train_sizes:
    per_val = results[train_dir]
    for val_dir in val_paths:
        scores = np.asarray(per_val[val_dir])
        mean_acc = scores.mean()
        std_acc = scores.std(ddof=1)
        print(f"{train_dir} -> {val_dir}: Mean={mean_acc:.2f}%, Std={std_acc:.2f}%")