In [None]:
# -*- coding: utf-8 -*-
"""
Simplified Model Comparison: GAP, TPA, Gated-TPA
- Top-k masking applied to the TPA attention weights
- Prototype diversity penalty
- Logit-level MoE fusion (separate classifiers per branch)
"""

from google.colab import drive
drive.mount('/content/drive')

import os, random, time, copy, json
import numpy as np
from typing import Tuple, Dict, List
from dataclasses import dataclass

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# ========================
# Config & Reproducibility
# ========================
SEED = 42

# Seed every random number generator the pipeline touches:
# Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices).
for _seed_fn in (random.seed, np.random.seed,
                 torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)
del _seed_fn

# Disable cuDNN autotuning and request deterministic cuDNN kernels so that
# repeated runs select the same algorithms.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

@dataclass
class Config:
    """Paths and hyperparameters shared by every experiment in this script."""
    # Input datasets are read from sub-directories of `data_dir`;
    # the final results JSON is written into `save_dir`.
    data_dir: str = "/content/drive/MyDrive/AI_data/TPA2/pamap2_transition_datasets"
    save_dir: str = "/content/drive/MyDrive/AI_data/TPA2"

    # Optimisation settings (AdamW, gradient-norm clipping, label-smoothed CE).
    epochs: int = 100
    batch_size: int = 128
    lr: float = 1e-4
    weight_decay: float = 1e-4
    grad_clip: float = 1.0
    label_smoothing: float = 0.05

    # Early stopping: stop after `patience` consecutive epochs in which the
    # validation accuracy did not improve by more than `min_delta`.
    patience: int = 20
    min_delta: float = 0.0001
    # Fraction of the training set held out for validation.
    val_split: float = 0.2

    d_model: int = 128

    # Transformer hyperparameters
    # NOTE(review): create_model() only forwards `d_model` to the models, so
    # these four values appear to be informational here — the model classes
    # use their own (identical) defaults. Confirm before tuning them.
    num_layers: int = 2
    n_heads: int = 4
    ff_dim: int = 256
    dropout: float = 0.1

    # TPA hyperparameters
    tpa_num_prototypes: int = 16
    tpa_heads: int = 4
    tpa_dropout: float = 0.1
    # Stored by ImprovedTPA but not applied in its forward pass.
    tpa_temperature: float = 0.07
    # Fraction of time steps each prototype keeps after top-k masking.
    tpa_topk_ratio: float = 0.25

    # Newly added hyperparameters
    diversity_weight: float = 5e-3  # weight of the prototype diversity penalty
    use_logit_fusion: bool = True    # whether Gated-TPA fuses at the logit level

    # Evaluated once, when the class body is executed.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    num_workers: int = 2

# Single shared configuration instance used by the script below.
cfg = Config()

# ========================
# Dataset Class
# ========================
class PreloadedDataset(Dataset):
    """In-memory dataset wrapping a pair of NumPy arrays.

    Samples are stored as float32 tensors and labels as int64 tensors.
    When the smallest label is >= 1 the labels are treated as 1-based and
    shifted down by one so that they start at 0.
    """

    def __init__(self, X: np.ndarray, y: np.ndarray):
        super().__init__()
        self.X = torch.from_numpy(X).float()

        # Convert 1-based labels to 0-based; labels already containing 0
        # are left untouched.
        labels = y - 1 if y.min() >= 1 else y
        self.y = torch.from_numpy(labels).long()

    def __len__(self):
        """Number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair at position ``idx``."""
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

# ========================
# Data Loading Functions
# ========================
def load_dataset(base_dir: str, dataset_name: str):
    """Load one pre-augmented dataset from disk.

    The directory ``<base_dir>/<dataset_name>`` must contain the files
    ``X_train.npy``, ``y_train.npy``, ``X_test.npy`` and ``y_test.npy``.

    Args:
        base_dir: base directory containing all datasets
        dataset_name: name of the dataset sub-directory,
            e.g. "ORIGINAL" or "STANDING_TO_SITTING_10pct"
    Returns:
        (train_dataset, test_dataset) as PreloadedDataset instances
    """
    dataset_dir = os.path.join(base_dir, dataset_name)

    print(f"\nLoading {dataset_name}...")
    print(f"  Path: {dataset_dir}")

    # Read the four arrays in a fixed order: train first, then test.
    arrays = {
        stem: np.load(os.path.join(dataset_dir, f"{stem}.npy"))
        for stem in ("X_train", "y_train", "X_test", "y_test")
    }

    print(f"  Train: {arrays['X_train'].shape}, Test: {arrays['X_test'].shape}")

    train_dataset = PreloadedDataset(arrays["X_train"], arrays["y_train"])
    test_dataset = PreloadedDataset(arrays["X_test"], arrays["y_test"])
    return train_dataset, test_dataset

# ========================
# Transformer Backbone Components
# ========================
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding followed by dropout.

    A fixed table of shape ``[1, max_len, d_model]`` is precomputed and stored
    as a (non-trainable) buffer. Even feature indices hold sines and odd
    feature indices hold cosines of the position scaled by a geometric
    frequency progression.

    Args:
        d_model: feature dimension of the sequence (even or odd)
        max_len: longest sequence length the table supports
        dropout: dropout probability applied after adding the encoding
    """
    def __init__(self, d_model: int, max_len: int = 5000, dropout: float = 0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Create positional encoding matrix
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]

        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine columns,
        # so trim the angles to match; for even d_model this is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        pe = pe.unsqueeze(0)  # [1, max_len, d_model]
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: [B, T, D] with T <= max_len
        Returns:
            [B, T, D]
        """
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

class TransformerBackbone(nn.Module):
    """
    Lightweight Transformer encoder used as the shared feature extractor.

    Pipeline: per-time-step linear projection -> sinusoidal positional
    encoding -> stack of pre-LayerNorm Transformer encoder layers (GELU)
    -> final LayerNorm.

    Defaults: 2 layers, d_model=128, n_heads=4, ff_dim=256, dropout=0.1.

    Args:
        in_channels: number of input features per time step
        d_model: width of the encoder
        num_layers: number of encoder layers
        n_heads: attention heads per layer
        ff_dim: hidden width of the feed-forward sub-layer
        dropout: dropout used by the positional encoding and encoder layers
        max_seq_len: longest sequence the positional encoding table covers
    """
    def __init__(self,
                 in_channels: int = 27,
                 d_model: int = 128,
                 num_layers: int = 2,
                 n_heads: int = 4,
                 ff_dim: int = 256,
                 dropout: float = 0.1,
                 max_seq_len: int = 200):
        super().__init__()

        self.d_model = d_model

        # Maps each time step's features to the model width:
        # [B, T, C] -> [B, T, D]
        self.input_projection = nn.Linear(in_channels, d_model)

        self.pos_encoder = PositionalEncoding(d_model, max_seq_len, dropout)

        # norm_first=True selects the pre-LN layer variant.
        layer_kwargs = dict(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=ff_dim,
            dropout=dropout,
            activation='gelu',
            batch_first=True,
            norm_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(**layer_kwargs),
            num_layers=num_layers,
        )

        # Normalises the encoder output.
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x):
        """
        Args:
            x: [B, T, C] - input sensor data, features in the last dimension
               (the linear projection acts on the last axis; no transpose is
               performed here)
        Returns:
            [B, T, D] - encoded sequence
        """
        tokens = self.input_projection(x)           # [B, T, D]
        tokens = self.pos_encoder(tokens)           # [B, T, D]
        encoded = self.transformer_encoder(tokens)  # [B, T, D]
        return self.norm(encoded)

# ========================
# GAP Model
# ========================
class GAPModel(nn.Module):
    """Baseline classifier: Transformer backbone + global average pooling.

    The backbone output is averaged over the time axis and fed to a single
    linear classification layer.
    """
    def __init__(self,
                 in_channels: int = 27,
                 d_model: int = 128,
                 num_layers: int = 2,
                 n_heads: int = 4,
                 ff_dim: int = 256,
                 dropout: float = 0.1,
                 num_classes: int = 12):
        super().__init__()
        backbone_kwargs = dict(
            in_channels=in_channels,
            d_model=d_model,
            num_layers=num_layers,
            n_heads=n_heads,
            ff_dim=ff_dim,
            dropout=dropout,
        )
        self.backbone = TransformerBackbone(**backbone_kwargs)
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x):
        """Return class logits ``[B, num_classes]`` for a batch ``x``."""
        # Backbone yields [B, T, D]; averaging over dim 1 gives [B, D].
        return self.fc(self.backbone(x).mean(dim=1))

# ========================
# Improved TPA with Top-k
# ========================
class ImprovedTPA(nn.Module):
    """Prototype attention pooling with top-k masking and a diversity penalty.

    A set of learnable prototype vectors act as queries over the input
    sequence. For each prototype (and head) only the ``k`` most attended
    time steps are kept, the weights are renormalised, and the resulting
    prototype tokens are averaged and passed through a small MLP.

    Args:
        dim: feature dimension of the input sequence (must be divisible by
            ``heads``)
        num_prototypes: number of learnable prototype queries
        heads: number of attention heads
        dropout: dropout applied to the attention weights and inside the MLP
        temperature: stored on the module but NOT applied in ``forward``;
            scores are scaled by ``sqrt(head_dim)`` instead
        topk_ratio: fraction of time steps each prototype keeps; the
            resulting ``k`` is clamped to the range ``[1, T]``
    """
    def __init__(self, dim, num_prototypes=16, heads=4, dropout=0.1,
                 temperature=0.07, topk_ratio=0.25):
        super().__init__()
        assert dim % heads == 0, "dim must be divisible by heads"

        self.dim = dim
        self.heads = heads
        self.head_dim = dim // heads
        self.num_prototypes = num_prototypes
        self.temperature = temperature
        self.topk_ratio = topk_ratio

        # Learnable prototype queries, small random initialisation.
        self.proto = nn.Parameter(torch.randn(num_prototypes, dim) * 0.02)

        self.pre_norm = nn.LayerNorm(dim)

        self.q_proj = nn.Linear(dim, dim, bias=False)
        self.k_proj = nn.Linear(dim, dim, bias=False)
        self.v_proj = nn.Linear(dim, dim, bias=False)

        self.fuse = nn.Sequential(
            nn.Linear(dim, dim),
            nn.SiLU(),
            nn.Dropout(dropout),
            nn.Linear(dim, dim)
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        Args:
            x: [B, T, D]
        Returns:
            z: [B, D] pooled representation
        """
        B, T, D = x.shape
        P = self.num_prototypes

        x_norm = self.pre_norm(x)

        K = self.k_proj(x_norm)
        V = self.v_proj(x_norm)
        # The same projected prototypes are used as queries for every sample.
        Qp = self.q_proj(self.proto).unsqueeze(0).expand(B, -1, -1)

        def split_heads(t, length):
            return t.view(B, length, self.heads, self.head_dim).transpose(1, 2)

        Qh = split_heads(Qp, P)  # [B, H, P, d]
        Kh = split_heads(K, T)    # [B, H, T, d]
        Vh = split_heads(V, T)    # [B, H, T, d]

        scores = torch.matmul(Qh, Kh.transpose(-2, -1)) / (self.head_dim ** 0.5)
        attn = F.softmax(scores, dim=-1)  # [B, H, P, T]
        attn = torch.nan_to_num(attn, nan=0.0)

        # ==================
        # Top-k masking
        # ==================
        # Clamp k to the sequence length so that topk_ratio > 1 keeps every
        # time step instead of making topk() fail.
        k = min(T, max(1, int(self.topk_ratio * T)))
        _, idx = attn.topk(k, dim=-1)  # [B, H, P, k]
        mask = torch.zeros_like(attn).scatter_(-1, idx, 1.0)
        attn = attn * mask
        # Renormalise the surviving weights so they sum to (almost) one.
        attn = attn / (attn.sum(dim=-1, keepdim=True) + 1e-8)

        attn = self.dropout(attn)

        proto_tokens = torch.matmul(attn, Vh)  # [B, H, P, d]
        proto_tokens = proto_tokens.transpose(1, 2).contiguous().view(B, P, D)

        # Average over prototypes, then mix with the MLP.
        z_tpa = proto_tokens.mean(dim=1)  # [B, D]

        z = self.fuse(z_tpa)

        return z

    def compute_diversity_loss(self):
        """
        Prototype diversity penalty.

        Computes the mean squared difference between the cosine-similarity
        matrix of the prototypes and the identity matrix. The diagonal of
        the similarity matrix is 1, so only off-diagonal similarities
        contribute; the mean is taken over all P*P entries.

        Returns:
            diversity_loss: scalar tensor
        """
        proto_norm = F.normalize(self.proto, dim=-1)  # [P, D]
        sim = proto_norm @ proto_norm.t()  # [P, P]
        div_loss = (sim - torch.eye(sim.size(0), device=sim.device)).pow(2).mean()
        return div_loss

class TPAModel(nn.Module):
    """Classifier that pools the Transformer backbone output with ImprovedTPA.

    Args:
        in_channels, d_model, num_layers, n_heads, ff_dim, dropout:
            forwarded to TransformerBackbone
        num_classes: number of output classes
        tpa_config: optional dict with any of the keys ``num_prototypes``,
            ``heads``, ``dropout``, ``temperature``, ``topk_ratio``.
            Missing keys (or ``None``) fall back to ImprovedTPA's defaults;
            other keys are ignored.
    """
    def __init__(self,
                 in_channels: int = 27,
                 d_model: int = 128,
                 num_layers: int = 2,
                 n_heads: int = 4,
                 ff_dim: int = 256,
                 dropout: float = 0.1,
                 num_classes: int = 12,
                 tpa_config=None):
        super().__init__()
        self.backbone = TransformerBackbone(
            in_channels=in_channels,
            d_model=d_model,
            num_layers=num_layers,
            n_heads=n_heads,
            ff_dim=ff_dim,
            dropout=dropout
        )

        # The signature advertises tpa_config=None, so it must not be
        # subscripted unconditionally.
        tpa_config = {} if tpa_config is None else tpa_config
        tpa_kwargs = {
            key: tpa_config[key]
            for key in ('num_prototypes', 'heads', 'dropout',
                        'temperature', 'topk_ratio')
            if key in tpa_config
        }
        self.tpa = ImprovedTPA(dim=d_model, **tpa_kwargs)

        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, x):
        """Return class logits ``[B, num_classes]`` for a batch ``x``."""
        features = self.backbone(x)  # [B, T, D]
        z = self.tpa(features)  # [B, D]
        logits = self.classifier(z)
        return logits

# ========================
# Improved Gated-TPA with Logit-level Fusion
# ========================
class ImprovedGatedTPAModel(nn.Module):
    """Gated combination of a GAP branch and a TPA branch.

    Both branches share one Transformer backbone. A sigmoid gate computed
    from the concatenated branch features mixes them either

    * at the logit level (``use_logit_fusion=True``): each branch has its own
      linear classifier and the gate has one value per class, or
    * at the feature level (``use_logit_fusion=False``): the gate has one
      value per feature and a single classifier follows.

    Args:
        in_channels, d_model, num_layers, n_heads, ff_dim, dropout:
            forwarded to TransformerBackbone
        num_classes: number of output classes
        tpa_config: optional dict with any of the keys ``num_prototypes``,
            ``heads``, ``dropout``, ``temperature``, ``topk_ratio``.
            Missing keys (or ``None``) fall back to ImprovedTPA's defaults;
            other keys are ignored.
        use_logit_fusion: selects logit-level (True) or feature-level fusion
    """
    def __init__(self,
                 in_channels: int = 27,
                 d_model: int = 128,
                 num_layers: int = 2,
                 n_heads: int = 4,
                 ff_dim: int = 256,
                 dropout: float = 0.1,
                 num_classes: int = 12,
                 tpa_config=None,
                 use_logit_fusion=True):
        super().__init__()
        self.use_logit_fusion = use_logit_fusion
        self.backbone = TransformerBackbone(
            in_channels=in_channels,
            d_model=d_model,
            num_layers=num_layers,
            n_heads=n_heads,
            ff_dim=ff_dim,
            dropout=dropout
        )

        # The signature advertises tpa_config=None, so it must not be
        # subscripted unconditionally.
        tpa_config = {} if tpa_config is None else tpa_config
        tpa_kwargs = {
            key: tpa_config[key]
            for key in ('num_prototypes', 'heads', 'dropout',
                        'temperature', 'topk_ratio')
            if key in tpa_config
        }
        self.tpa = ImprovedTPA(dim=d_model, **tpa_kwargs)

        if use_logit_fusion:
            # Logit-level fusion: one classifier per branch
            self.cls_gap = nn.Linear(d_model, num_classes)
            self.cls_tpa = nn.Linear(d_model, num_classes)

            # Per-class gate
            self.gate = nn.Sequential(
                nn.Linear(d_model * 2, num_classes),
                nn.Sigmoid()
            )
        else:
            # Feature-level fusion: per-feature gate, single classifier
            self.gate = nn.Sequential(
                nn.Linear(d_model * 2, d_model),
                nn.Sigmoid()
            )
            self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, x):
        """Return class logits ``[B, num_classes]`` for a batch ``x``."""
        features = self.backbone(x)  # [B, T, D]

        # GAP branch
        z_gap = features.mean(dim=1)  # [B, D]

        # TPA branch
        z_tpa = self.tpa(features)  # [B, D]

        # The gate always sees both branch representations; its output width
        # is num_classes (logit fusion) or d_model (feature fusion).
        g = self.gate(torch.cat([z_gap, z_tpa], dim=-1))

        if self.use_logit_fusion:
            logits_gap = self.cls_gap(z_gap)  # [B, C]
            logits_tpa = self.cls_tpa(z_tpa)  # [B, C]
            # Convex combination of the two branches' logits, per class
            return g * logits_gap + (1 - g) * logits_tpa

        # Convex combination of the two branches' features, per dimension
        z = g * z_gap + (1 - g) * z_tpa
        return self.classifier(z)

# ========================
# Training & Evaluation
# ========================
def train_one_epoch(model, loader, opt, cfg: Config, compute_diversity=True):
    """
    Run one training epoch and return averaged statistics.

    The loss is label-smoothed cross-entropy plus, when ``compute_diversity``
    is set and the model exposes a ``tpa`` attribute, the prototype
    diversity penalty weighted by ``cfg.diversity_weight``. Batches whose
    loss is NaN are skipped entirely (no update, not counted in the stats).

    Returns:
        dict with sample-weighted means "loss", "ce_loss", "div_loss" and
        the training accuracy "acc" (all 0 when no batch was counted)
    """
    model.train()
    use_diversity = compute_diversity and hasattr(model, 'tpa')

    n_seen, n_correct = 0, 0
    sums = {"loss": 0.0, "ce_loss": 0.0, "div_loss": 0.0}

    for x, y in loader:
        x = x.to(cfg.device).float()
        y = y.to(cfg.device)

        opt.zero_grad(set_to_none=True)
        logits = model(x)

        ce_loss = F.cross_entropy(logits, y, label_smoothing=cfg.label_smoothing)

        # Diversity term applies to TPA-based models only; otherwise zero.
        if use_diversity:
            div_loss = model.tpa.compute_diversity_loss()
        else:
            div_loss = torch.tensor(0.0, device=cfg.device)

        loss = ce_loss + cfg.diversity_weight * div_loss
        if torch.isnan(loss):
            continue

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), cfg.grad_clip)
        opt.step()

        with torch.no_grad():
            batch = y.size(0)
            n_correct += (logits.argmax(dim=-1) == y).sum().item()
            n_seen += batch
            sums["loss"] += loss.item() * batch
            sums["ce_loss"] += ce_loss.item() * batch
            sums["div_loss"] += div_loss.item() * batch

    if n_seen <= 0:
        return {"loss": 0, "ce_loss": 0, "div_loss": 0, "acc": 0}

    return {
        "loss": sums["loss"] / n_seen,
        "ce_loss": sums["ce_loss"] / n_seen,
        "div_loss": sums["div_loss"] / n_seen,
        "acc": n_correct / n_seen,
    }

@torch.no_grad()
def evaluate(model, loader, cfg: Config):
    """Evaluate ``model`` on ``loader``.

    Puts the model in eval mode, collects arg-max predictions for every
    batch and returns ``(accuracy, macro_f1)``.
    """
    model.eval()
    targets, predictions = [], []

    for x, y in loader:
        x = x.to(cfg.device)
        y = y.to(cfg.device)
        batch_pred = model(x).argmax(dim=-1)
        predictions.append(batch_pred.cpu().numpy())
        targets.append(y.cpu().numpy())

    y_true = np.concatenate(targets)
    y_pred = np.concatenate(predictions)

    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='macro')
    return acc, f1

# ========================
# Model Complexity Analysis
# ========================
def count_parameters(model):
    """Return ``(total, trainable)`` parameter element counts of ``model``.

    A parameter counts as trainable when its ``requires_grad`` flag is set.
    """
    total_params = 0
    trainable_params = 0
    for param in model.parameters():
        size = param.numel()
        total_params += size
        if param.requires_grad:
            trainable_params += size
    return total_params, trainable_params

def estimate_flops(model, input_shape=(1, 100, 27), device='cuda'):
    """
    Estimate FLOPs of one forward pass by hooking Conv1d and Linear modules.

    For Conv1d: FLOPs = 2 * (C_in / groups) * K * C_out * L_out * batch
    For Linear: FLOPs = 2 * in_features * out_features * rows, where
    ``rows`` is the number of vectors the layer is applied to (the product
    of all leading dimensions, e.g. B*T for a [B, T, C] input).

    Only work performed through a call to an ``nn.Conv1d`` or ``nn.Linear``
    module is counted; anything else (attention matrix products,
    normalisation, activations, layers executed without invoking the module)
    contributes nothing, so the result is a lower bound.

    Side effect: the model is switched to eval mode.

    Args:
        model: module to profile
        input_shape: shape of the random input fed to the model
        device: device on which the random input is created
    Returns:
        Estimated FLOPs as an int.
    """
    model.eval()
    total_flops = 0

    def conv1d_flops(module, inputs, output):
        nonlocal total_flops
        batch_size, out_channels, out_length = output.shape
        flops_per_element = 2 * (module.in_channels // module.groups) * module.kernel_size[0]
        total_flops += flops_per_element * out_channels * out_length * batch_size

    def linear_flops(module, inputs, output):
        nonlocal total_flops
        # A Linear layer is applied to every vector along the last axis, so
        # count all leading dimensions rather than just the batch size.
        rows = output.numel() // module.out_features
        total_flops += 2 * module.in_features * module.out_features * rows

    # Register hooks
    hooks = []
    for module in model.modules():
        if isinstance(module, nn.Conv1d):
            hooks.append(module.register_forward_hook(conv1d_flops))
        elif isinstance(module, nn.Linear):
            hooks.append(module.register_forward_hook(linear_flops))

    try:
        with torch.no_grad():
            model(torch.randn(input_shape, device=device))
    finally:
        # Always detach the hooks, even if the forward pass fails, so the
        # model is not left instrumented.
        for hook in hooks:
            hook.remove()

    return total_flops

def measure_inference_time(model, input_shape=(1, 100, 27), device='cuda', n_runs=100):
    """
    Measure the average forward-pass latency over multiple runs.

    The model is switched to eval mode, warmed up with 10 untimed passes,
    then timed over ``n_runs`` passes on one fixed random input. On CUDA
    devices the GPU is synchronised after every timed pass so that the
    asynchronous kernel launches are included in the measurement.

    Args:
        model: module to time
        input_shape: shape of the random input
        device: device string or ``torch.device`` (e.g. 'cpu', 'cuda',
            'cuda:0')
        n_runs: number of timed forward passes (must be positive)
    Returns:
        Average time per forward pass in milliseconds.
    Raises:
        ValueError: if ``n_runs`` is not positive.
    """
    if n_runs <= 0:
        raise ValueError("n_runs must be a positive integer")

    model.eval()
    # Compare the device *type* so 'cuda:0' and torch.device objects are
    # synchronised too, not only the literal string 'cuda'.
    on_cuda = torch.device(device).type == 'cuda'
    x = torch.randn(input_shape).to(device)

    with torch.no_grad():
        # Warmup
        for _ in range(10):
            model(x)
        if on_cuda:
            torch.cuda.synchronize()

        # perf_counter is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()
        for _ in range(n_runs):
            model(x)
            if on_cuda:
                torch.cuda.synchronize()
        elapsed = time.perf_counter() - start

    return elapsed / n_runs * 1000  # Convert to ms

def analyze_model_complexity(model, model_name, cfg: Config, input_shape=(1, 100, 27)):
    """
    Print and return a complexity report for ``model``.

    Reports parameter counts, the float32 model size (4 bytes per
    parameter), the FLOPs estimate and the mean inference time over 100
    runs on ``cfg.device``.

    Returns:
        dict with keys 'model', 'total_params', 'trainable_params',
        'model_size_mb', 'flops', 'mflops', 'inference_time_ms'
    """
    banner = '=' * 80
    print(f"\n{banner}")
    print(f"MODEL COMPLEXITY ANALYSIS: {model_name}")
    print(banner)

    # Parameters
    total_params, trainable_params = count_parameters(model)
    model_size_mb = total_params * 4 / 1024 / 1024
    print(f"Total Parameters: {total_params:,}")
    print(f"Trainable Parameters: {trainable_params:,}")
    print(f"Model Size: {model_size_mb:.2f} MB (float32)")

    # FLOPs
    flops = estimate_flops(model, input_shape, cfg.device)
    mflops = flops / 1e6
    print(f"FLOPs: {flops:,} ({mflops:.2f} MFLOPs)")

    # Inference time
    inference_time = measure_inference_time(model, input_shape, cfg.device, n_runs=100)
    print(f"Inference Time: {inference_time:.3f} ms (avg over 100 runs)")

    report = {
        'model': model_name,
        'total_params': total_params,
        'trainable_params': trainable_params,
        'model_size_mb': model_size_mb,
        'flops': flops,
        'mflops': mflops,
        'inference_time_ms': inference_time,
    }
    return report

def train_model(model, train_loader, val_loader, cfg: Config, model_name: str):
    """Train ``model`` with AdamW and early stopping on validation accuracy.

    The diversity penalty is enabled when ``model_name`` contains "TPA".
    The weights of the epoch with the best validation accuracy are restored
    before returning. Progress is printed every 10 epochs.

    Returns:
        The best validation accuracy observed.
    """
    print(f"\n[Training {model_name}]")

    # Only TPA-based models carry prototypes to regularise.
    compute_diversity = 'TPA' in model_name

    opt = torch.optim.AdamW(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)

    best_acc = 0.0
    best_wts = None
    stale_epochs = 0  # epochs since the last validation improvement

    for epoch in range(1, cfg.epochs + 1):
        stats = train_one_epoch(model, train_loader, opt, cfg, compute_diversity)
        val_acc, val_f1 = evaluate(model, val_loader, cfg)

        improved = val_acc > best_acc + cfg.min_delta
        if improved:
            best_acc = val_acc
            best_wts = copy.deepcopy(model.state_dict())
            stale_epochs = 0
        else:
            stale_epochs += 1

        if epoch % 10 == 0:
            if compute_diversity:
                detail = f"CE={stats['ce_loss']:.4f}, Div={stats['div_loss']:.4f}, "
            else:
                detail = f"Loss={stats['loss']:.4f}, "
            print(f"  Epoch {epoch:3d}: Train Acc={stats['acc']:.4f}, "
                  + detail
                  + f"Val Acc={val_acc:.4f}, F1={val_f1:.4f}")

        if stale_epochs >= cfg.patience:
            print(f"  Early stopping at epoch {epoch}")
            break

    # Roll back to the best checkpoint, if any epoch improved on 0.0.
    if best_wts:
        model.load_state_dict(best_wts)

    print(f"  Best Val Acc: {best_acc:.4f}")
    return best_acc

def create_model(model_name: str, cfg: Config):
    """Build the model called ``model_name`` on ``cfg.device`` in float32.

    Supported names: "GAP", "TPA", "Gated-TPA".

    Raises:
        ValueError: for any other name.
    """
    tpa_config = {
        'num_prototypes': cfg.tpa_num_prototypes,
        'heads': cfg.tpa_heads,
        'dropout': cfg.tpa_dropout,
        'temperature': cfg.tpa_temperature,
        'topk_ratio': cfg.tpa_topk_ratio
    }

    # Name -> zero-argument factory; only the requested model is built.
    builders = {
        "GAP": lambda: GAPModel(d_model=cfg.d_model),
        "TPA": lambda: TPAModel(
            d_model=cfg.d_model,
            tpa_config=tpa_config,
        ),
        "Gated-TPA": lambda: ImprovedGatedTPAModel(
            d_model=cfg.d_model,
            tpa_config=tpa_config,
            use_logit_fusion=cfg.use_logit_fusion,
        ),
    }

    build = builders.get(model_name)
    if build is None:
        raise ValueError(f"Unknown model: {model_name}")

    return build().to(cfg.device).float()

# ========================
# Main Experiment
# ========================
def _reset_seeds(loader_generator=None):
    """Re-seed Python, NumPy and PyTorch; optionally also a DataLoader generator."""
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    if loader_generator is not None:
        loader_generator.manual_seed(SEED)


def run_experiment(dataset_name: str, cfg: Config):
    """Run the complete GAP / TPA / Gated-TPA comparison on one dataset.

    The training set is split (stratified by label) into train/validation
    parts; each model is trained with early stopping and then evaluated on
    the test set. All RNGs, including the one that drives training-set
    shuffling, are reset before each model so that every model starts from
    the same random state and sees the same batch order.

    Args:
        dataset_name: sub-directory of ``cfg.data_dir`` to load
        cfg: experiment configuration
    Returns:
        (results, complexity_results):
            results: one dict per model with keys 'Model', 'Dataset',
                'Val_Accuracy', 'Test_Accuracy', 'Test_F1_Score'
            complexity_results: complexity reports, filled only when
                ``dataset_name == "ORIGINAL"``; otherwise an empty list
    """
    print(f"\n{'='*80}")
    print(f"EXPERIMENT: {dataset_name}")
    print(f"{'='*80}")

    # Load data
    train_dataset, test_dataset = load_dataset(cfg.data_dir, dataset_name)

    # Split train into train/val using indices, stratified by label
    indices = np.arange(len(train_dataset))
    y_labels = train_dataset.y.numpy()

    train_indices, val_indices = train_test_split(
        indices,
        test_size=cfg.val_split,
        random_state=SEED,
        stratify=y_labels
    )

    from torch.utils.data import Subset
    train_subset = Subset(train_dataset, train_indices)
    val_subset = Subset(train_dataset, val_indices)

    # Create data loaders; `g` controls the shuffling of the training set.
    g = torch.Generator(device='cpu').manual_seed(SEED)
    train_loader = DataLoader(train_subset, cfg.batch_size, shuffle=True,
                              num_workers=cfg.num_workers, generator=g)
    val_loader = DataLoader(val_subset, cfg.batch_size, num_workers=cfg.num_workers)
    test_loader = DataLoader(test_dataset, cfg.batch_size, num_workers=cfg.num_workers)

    print(f"\nDataset splits:")
    print(f"  Train: {len(train_subset)}, Val: {len(val_subset)}, Test: {len(test_dataset)}")

    results = []
    complexity_results = []
    model_names = ["GAP", "TPA", "Gated-TPA"]

    # Model complexity is analysed only for the dataset named "ORIGINAL".
    if dataset_name == "ORIGINAL":
        print(f"\n{'='*80}")
        print("MODEL COMPLEXITY COMPARISON")
        print(f"{'='*80}")

        for model_name in model_names:
            _reset_seeds()
            model = create_model(model_name, cfg)
            complexity_results.append(analyze_model_complexity(model, model_name, cfg))

    for model_name in model_names:
        # Reset every seed, including the shuffle generator: without this the
        # generator keeps advancing and each model would be trained on a
        # different sequence of batches.
        _reset_seeds(g)

        # Create and train model
        model = create_model(model_name, cfg)
        best_val_acc = train_model(model, train_loader, val_loader, cfg, model_name)

        # Evaluate on test set
        test_acc, test_f1 = evaluate(model, test_loader, cfg)

        print(f"\n[{model_name} Results]")
        print(f"  Val Acc: {best_val_acc:.4f}")
        print(f"  Test Acc: {test_acc:.4f}, F1: {test_f1:.4f}")

        results.append({
            'Model': model_name,
            'Dataset': dataset_name,
            'Val_Accuracy': float(best_val_acc),
            'Test_Accuracy': float(test_acc),
            'Test_F1_Score': float(test_f1)
        })

    return results, complexity_results

# ========================
# Run All Experiments
# ========================
if __name__ == "__main__":
    # Script entry point: run every dataset, checkpoint the results to JSON
    # after each one, then print a summary.
    print("\n" + "="*80)
    print("SIMPLIFIED MODEL COMPARISON: GAP vs TPA vs Gated-TPA")
    print("="*80)
    print("\n개선사항:")
    print("  1. TPA Top-k 마스킹 적용")
    print("  2. 프로토타입 다양성 페널티")
    print("  3. 로짓 수준 MoE 융합 (별도 분류기)")
    print("="*80)

    model_names = ['GAP', 'TPA', 'Gated-TPA']

    base_datasets = ["Sitting_TO_Lying_20PCT", "Sitting_TO_Lying_30PCT", "Sitting_TO_Lying_40PCT"]

    transitions = [
        'Lying_TO_Sitting',
        'Standing_TO_Lying',
        'Lying_TO_Standing',
        'Standing_TO_Walking',
        'Walking_TO_Standing',
        'Walking_TO_Running',
        'Running_TO_Walking',
        'Walking_TO_Ascending_stairs',
        'Walking_TO_Descending_stairs',
        'Ascending_stairs_TO_Walking',
        'Descending_stairs_TO_Walking'
    ]

    # Add the 10%, 20%, 30% and 40% mixes of every transition
    mix_pcts = [10, 20, 30, 40]

    datasets = base_datasets + [
        f"{transition}_{pct}PCT"
        for transition in transitions
        for pct in mix_pcts
    ]

    print(f"\nTotal datasets to test: {len(datasets)}")
    # Derive the count from the lists so it stays consistent with the total.
    print(f"  - transitions: {len(transitions) * len(mix_pcts) + len(base_datasets)}")

    all_results = []
    all_complexity = []

    os.makedirs(cfg.save_dir, exist_ok=True)
    json_path = os.path.join(cfg.save_dir, "pamap2_tpa_transition_cnn_simplified.json")

    def save_results():
        """Write everything collected so far to ``json_path``."""
        results_dict = {
            'experiment_info': {
                'date': time.strftime('%Y-%m-%d %H:%M:%S'),
                'version': 'simplified_v1',
                'improvements': [
                    'TPA Top-k masking',
                    'Prototype diversity penalty',
                    'Logit-level MoE fusion'
                ],
                'models': model_names,
                'total_datasets': len(datasets),
                'datasets': datasets,
                'config': {
                    'epochs': cfg.epochs,
                    'batch_size': cfg.batch_size,
                    'lr': cfg.lr,
                    'd_model': cfg.d_model,
                    'tpa_num_prototypes': cfg.tpa_num_prototypes,
                    'tpa_heads': cfg.tpa_heads,
                    'tpa_temperature': cfg.tpa_temperature,
                    'tpa_topk_ratio': cfg.tpa_topk_ratio,
                    'diversity_weight': cfg.diversity_weight,
                    'use_logit_fusion': cfg.use_logit_fusion
                }
            },
            'model_complexity': all_complexity,
            'results': all_results
        }
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(results_dict, f, indent=2)

    # Run experiments
    for i, dataset_name in enumerate(datasets, 1):
        print(f"\n[Progress: {i}/{len(datasets)}]")

        results, complexity = run_experiment(dataset_name, cfg)
        all_results.extend(results)
        if complexity:  # returned only by a dataset named "ORIGINAL"
            all_complexity = complexity

        # Checkpoint after every dataset so that an interrupted session
        # does not lose the experiments that already finished.
        save_results()

    # run_experiment() profiles the models only for a dataset named
    # "ORIGINAL", which is not part of the list above. Profile them here so
    # the complexity section of the report is not left empty.
    if not all_complexity:
        all_complexity = [
            analyze_model_complexity(create_model(model_name, cfg), model_name, cfg)
            for model_name in model_names
        ]

    # Save all results
    print(f"\n{'='*80}")
    print("SAVING RESULTS")
    print(f"{'='*80}")

    save_results()

    print(f"\nResults saved to: {json_path}")

    # Print summary
    print(f"\n{'='*80}")
    print("SUMMARY")
    print(f"{'='*80}")
    print(f"Total experiments: {len(all_results)}")
    print(f"Total datasets tested: {len(datasets)}")
    print(f"Models compared: 3 (GAP, TPA, Gated-TPA)")

    # Calculate average performance per model
    print(f"\n{'='*80}")
    print("AVERAGE PERFORMANCE (All Datasets)")
    print(f"{'='*80}")

    for model_name in model_names:
        model_results = [r for r in all_results if r['Model'] == model_name]
        if not model_results:
            # Nothing to average (avoids a NaN mean over an empty list).
            continue
        avg_acc = np.mean([r['Test_Accuracy'] for r in model_results])
        avg_f1 = np.mean([r['Test_F1_Score'] for r in model_results])
        print(f"{model_name:12s}: Acc={avg_acc:.4f}, F1={avg_f1:.4f}")

    # Print model complexity table
    if all_complexity:
        print(f"\n{'='*80}")
        print("MODEL COMPLEXITY COMPARISON")
        print(f"{'='*80}")
        print(f"{'Model':<12} {'Params':<12} {'Size(MB)':<10} {'MFLOPs':<10} {'Time(ms)':<10}")
        print("-" * 80)
        for comp in all_complexity:
            print(f"{comp['model']:<12} {comp['total_params']:<12,} "
                  f"{comp['model_size_mb']:<10.2f} {comp['mflops']:<10.2f} "
                  f"{comp['inference_time_ms']:<10.3f}")

    print(f"\n{'='*80}")
    print("EXPERIMENT COMPLETE")
    print(f"{'='*80}")


Mounted at /content/drive

SIMPLIFIED MODEL COMPARISON: GAP vs TPA vs Gated-TPA

개선사항:
  1. TPA Top-k 마스킹 적용
  2. 프로토타입 다양성 페널티
  3. 로짓 수준 MoE 융합 (별도 분류기)

Total datasets to test: 47
  - transitions: 46

[Progress: 1/47]

EXPERIMENT: Sitting_TO_Lying_20PCT

Loading Sitting_TO_Lying_20PCT...
  Path: /content/drive/MyDrive/AI_data/TPA2/pamap2_transition_datasets/Sitting_TO_Lying_20PCT
  Train: (34192, 100, 27), Test: (8549, 100, 27)

Dataset splits:
  Train: 27353, Val: 6839, Test: 8549





[Training GAP]
  Epoch  10: Train Acc=0.9263, Loss=0.5606, Val Acc=0.9286, F1=0.9234
  Epoch  20: Train Acc=0.9588, Loss=0.4478, Val Acc=0.9517, F1=0.9486
  Epoch  30: Train Acc=0.9758, Loss=0.3992, Val Acc=0.9633, F1=0.9605
  Epoch  40: Train Acc=0.9857, Loss=0.3693, Val Acc=0.9684, F1=0.9659
  Epoch  50: Train Acc=0.9903, Loss=0.3523, Val Acc=0.9731, F1=0.9705
  Epoch  60: Train Acc=0.9929, Loss=0.3399, Val Acc=0.9757, F1=0.9729
  Epoch  70: Train Acc=0.9948, Loss=0.3319, Val Acc=0.9787, F1=0.9770
  Epoch  80: Train Acc=0.9955, Loss=0.3263, Val Acc=0.9794, F1=0.9780
  Epoch  90: Train Acc=0.9964, Loss=0.3214, Val Acc=0.9807, F1=0.9795
  Epoch 100: Train Acc=0.9967, Loss=0.3180, Val Acc=0.9819, F1=0.9809
  Best Val Acc: 0.9825

[GAP Results]
  Val Acc: 0.9825
  Test Acc: 0.9830, F1: 0.9831

[Training TPA]




  Epoch  10: Train Acc=0.9260, CE=0.5459, Div=0.0581, Val Acc=0.9286, F1=0.9231
  Epoch  20: Train Acc=0.9596, CE=0.4456, Div=0.0189, Val Acc=0.9550, F1=0.9498
  Epoch  30: Train Acc=0.9751, CE=0.3975, Div=0.0086, Val Acc=0.9611, F1=0.9582
  Epoch  40: Train Acc=0.9848, CE=0.3718, Div=0.0038, Val Acc=0.9729, F1=0.9697
  Epoch  50: Train Acc=0.9900, CE=0.3551, Div=0.0022, Val Acc=0.9750, F1=0.9736
  Epoch  60: Train Acc=0.9929, CE=0.3439, Div=0.0013, Val Acc=0.9785, F1=0.9772
  Epoch  70: Train Acc=0.9941, CE=0.3370, Div=0.0005, Val Acc=0.9782, F1=0.9768
  Epoch  80: Train Acc=0.9956, CE=0.3314, Div=0.0005, Val Acc=0.9814, F1=0.9808
  Epoch  90: Train Acc=0.9962, CE=0.3277, Div=0.0005, Val Acc=0.9826, F1=0.9817
  Epoch 100: Train Acc=0.9964, CE=0.3245, Div=0.0002, Val Acc=0.9838, F1=0.9831
  Best Val Acc: 0.9842

[TPA Results]
  Val Acc: 0.9842
  Test Acc: 0.9796, F1: 0.9796

[Training Gated-TPA]




  Epoch  10: Train Acc=0.9312, CE=0.5223, Div=0.0258, Val Acc=0.9292, F1=0.9246
  Epoch  20: Train Acc=0.9624, CE=0.4273, Div=0.0054, Val Acc=0.9541, F1=0.9496
  Epoch  30: Train Acc=0.9774, CE=0.3839, Div=0.0025, Val Acc=0.9642, F1=0.9602
  Epoch  40: Train Acc=0.9860, CE=0.3583, Div=0.0019, Val Acc=0.9659, F1=0.9619
  Epoch  50: Train Acc=0.9909, CE=0.3411, Div=0.0015, Val Acc=0.9731, F1=0.9700
  Epoch  60: Train Acc=0.9936, CE=0.3301, Div=0.0007, Val Acc=0.9763, F1=0.9748
  Epoch  70: Train Acc=0.9943, CE=0.3240, Div=0.0005, Val Acc=0.9768, F1=0.9758
  Epoch  80: Train Acc=0.9956, CE=0.3186, Div=0.0000, Val Acc=0.9766, F1=0.9753
  Epoch  90: Train Acc=0.9960, CE=0.3150, Div=0.0001, Val Acc=0.9794, F1=0.9782
  Epoch 100: Train Acc=0.9968, CE=0.3120, Div=0.0001, Val Acc=0.9784, F1=0.9770
  Best Val Acc: 0.9810

[Gated-TPA Results]
  Val Acc: 0.9810
  Test Acc: 0.9798, F1: 0.9789

[Progress: 2/47]

EXPERIMENT: Sitting_TO_Lying_30PCT

Loading Sitting_TO_Lying_30PCT...
  Path: /content/d



  Epoch  10: Train Acc=0.9309, Loss=0.5496, Val Acc=0.9300, F1=0.9197
  Epoch  20: Train Acc=0.9608, Loss=0.4432, Val Acc=0.9526, F1=0.9463
  Epoch  30: Train Acc=0.9762, Loss=0.3963, Val Acc=0.9636, F1=0.9590
  Epoch  40: Train Acc=0.9861, Loss=0.3672, Val Acc=0.9718, F1=0.9692
  Epoch  50: Train Acc=0.9908, Loss=0.3494, Val Acc=0.9778, F1=0.9762
  Epoch  60: Train Acc=0.9939, Loss=0.3374, Val Acc=0.9781, F1=0.9762
  Epoch  70: Train Acc=0.9954, Loss=0.3295, Val Acc=0.9787, F1=0.9774
  Epoch  80: Train Acc=0.9961, Loss=0.3240, Val Acc=0.9808, F1=0.9790
  Epoch  90: Train Acc=0.9970, Loss=0.3196, Val Acc=0.9801, F1=0.9786
  Epoch 100: Train Acc=0.9972, Loss=0.3168, Val Acc=0.9817, F1=0.9801
  Best Val Acc: 0.9833

[GAP Results]
  Val Acc: 0.9833
  Test Acc: 0.9782, F1: 0.9774

[Training TPA]




  Epoch  10: Train Acc=0.9294, CE=0.5379, Div=0.0489, Val Acc=0.9244, F1=0.9164
  Epoch  20: Train Acc=0.9603, CE=0.4427, Div=0.0147, Val Acc=0.9534, F1=0.9460
  Epoch  30: Train Acc=0.9768, CE=0.3955, Div=0.0089, Val Acc=0.9653, F1=0.9600
  Epoch  40: Train Acc=0.9848, CE=0.3692, Div=0.0047, Val Acc=0.9713, F1=0.9672
  Epoch  50: Train Acc=0.9906, CE=0.3529, Div=0.0032, Val Acc=0.9760, F1=0.9731
  Epoch  60: Train Acc=0.9939, CE=0.3420, Div=0.0019, Val Acc=0.9795, F1=0.9767
  Epoch  70: Train Acc=0.9954, CE=0.3344, Div=0.0013, Val Acc=0.9807, F1=0.9778
  Epoch  80: Train Acc=0.9961, CE=0.3294, Div=0.0004, Val Acc=0.9808, F1=0.9784
  Epoch  90: Train Acc=0.9970, CE=0.3247, Div=0.0005, Val Acc=0.9813, F1=0.9792
  Early stopping at epoch 98
  Best Val Acc: 0.9823

[TPA Results]
  Val Acc: 0.9823
  Test Acc: 0.9786, F1: 0.9773

[Training Gated-TPA]




  Epoch  10: Train Acc=0.9342, CE=0.5130, Div=0.0266, Val Acc=0.9365, F1=0.9281
  Epoch  20: Train Acc=0.9639, CE=0.4201, Div=0.0059, Val Acc=0.9520, F1=0.9460
  Epoch  30: Train Acc=0.9787, CE=0.3788, Div=0.0027, Val Acc=0.9601, F1=0.9549
  Epoch  40: Train Acc=0.9869, CE=0.3553, Div=0.0018, Val Acc=0.9683, F1=0.9651
  Epoch  50: Train Acc=0.9910, CE=0.3395, Div=0.0018, Val Acc=0.9691, F1=0.9660
  Epoch  60: Train Acc=0.9936, CE=0.3293, Div=0.0009, Val Acc=0.9737, F1=0.9703
  Epoch  70: Train Acc=0.9951, CE=0.3221, Div=0.0004, Val Acc=0.9754, F1=0.9731
  Epoch  80: Train Acc=0.9956, CE=0.3178, Div=0.0002, Val Acc=0.9773, F1=0.9748
  Epoch  90: Train Acc=0.9967, CE=0.3137, Div=0.0001, Val Acc=0.9794, F1=0.9769
  Epoch 100: Train Acc=0.9969, CE=0.3115, Div=0.0001, Val Acc=0.9789, F1=0.9771
  Best Val Acc: 0.9801

[Gated-TPA Results]
  Val Acc: 0.9801
  Test Acc: 0.9773, F1: 0.9757

[Progress: 3/47]

EXPERIMENT: Sitting_TO_Lying_40PCT

Loading Sitting_TO_Lying_40PCT...
  Path: /content/d



  Epoch  10: Train Acc=0.9301, Loss=0.5547, Val Acc=0.9209, F1=0.9144
  Epoch  20: Train Acc=0.9598, Loss=0.4438, Val Acc=0.9481, F1=0.9449
  Epoch  30: Train Acc=0.9766, Loss=0.3954, Val Acc=0.9614, F1=0.9585
  Epoch  40: Train Acc=0.9859, Loss=0.3668, Val Acc=0.9683, F1=0.9670
  Epoch  50: Train Acc=0.9911, Loss=0.3492, Val Acc=0.9735, F1=0.9727
  Epoch  60: Train Acc=0.9933, Loss=0.3379, Val Acc=0.9760, F1=0.9752
  Epoch  70: Train Acc=0.9952, Loss=0.3303, Val Acc=0.9788, F1=0.9786
  Epoch  80: Train Acc=0.9961, Loss=0.3245, Val Acc=0.9792, F1=0.9788
  Epoch  90: Train Acc=0.9967, Loss=0.3197, Val Acc=0.9789, F1=0.9789
  Epoch 100: Train Acc=0.9970, Loss=0.3165, Val Acc=0.9808, F1=0.9809
  Best Val Acc: 0.9808

[GAP Results]
  Val Acc: 0.9808
  Test Acc: 0.9807, F1: 0.9801

[Training TPA]




  Epoch  10: Train Acc=0.9278, CE=0.5423, Div=0.0629, Val Acc=0.9215, F1=0.9171
  Epoch  20: Train Acc=0.9586, CE=0.4458, Div=0.0163, Val Acc=0.9460, F1=0.9430
  Epoch  30: Train Acc=0.9742, CE=0.4005, Div=0.0076, Val Acc=0.9573, F1=0.9546
  Epoch  40: Train Acc=0.9848, CE=0.3728, Div=0.0044, Val Acc=0.9624, F1=0.9608
  Epoch  50: Train Acc=0.9907, CE=0.3545, Div=0.0021, Val Acc=0.9728, F1=0.9718
  Epoch  60: Train Acc=0.9929, CE=0.3442, Div=0.0015, Val Acc=0.9766, F1=0.9764
  Epoch  70: Train Acc=0.9944, CE=0.3367, Div=0.0009, Val Acc=0.9773, F1=0.9774
  Epoch  80: Train Acc=0.9954, CE=0.3314, Div=0.0007, Val Acc=0.9791, F1=0.9794
  Epoch  90: Train Acc=0.9961, CE=0.3270, Div=0.0004, Val Acc=0.9773, F1=0.9779
  Epoch 100: Train Acc=0.9967, CE=0.3233, Div=0.0003, Val Acc=0.9810, F1=0.9810
  Best Val Acc: 0.9814

[TPA Results]
  Val Acc: 0.9814
  Test Acc: 0.9805, F1: 0.9803

[Training Gated-TPA]




  Epoch  10: Train Acc=0.9335, CE=0.5158, Div=0.0129, Val Acc=0.9286, F1=0.9233
  Epoch  20: Train Acc=0.9623, CE=0.4238, Div=0.0051, Val Acc=0.9453, F1=0.9404
  Epoch  30: Train Acc=0.9774, CE=0.3825, Div=0.0024, Val Acc=0.9598, F1=0.9571
  Epoch  40: Train Acc=0.9857, CE=0.3575, Div=0.0013, Val Acc=0.9690, F1=0.9679
  Epoch  50: Train Acc=0.9905, CE=0.3415, Div=0.0009, Val Acc=0.9719, F1=0.9709
  Epoch  60: Train Acc=0.9931, CE=0.3310, Div=0.0012, Val Acc=0.9759, F1=0.9747
  Epoch  70: Train Acc=0.9943, CE=0.3241, Div=0.0004, Val Acc=0.9716, F1=0.9708
  Epoch  80: Train Acc=0.9956, CE=0.3194, Div=0.0004, Val Acc=0.9787, F1=0.9786
  Epoch  90: Train Acc=0.9959, CE=0.3157, Div=0.0002, Val Acc=0.9766, F1=0.9762
  Epoch 100: Train Acc=0.9968, CE=0.3124, Div=0.0002, Val Acc=0.9795, F1=0.9796
  Best Val Acc: 0.9795

[Gated-TPA Results]
  Val Acc: 0.9795
  Test Acc: 0.9761, F1: 0.9751

[Progress: 4/47]

EXPERIMENT: Lying_TO_Sitting_10PCT

Loading Lying_TO_Sitting_10PCT...
  Path: /content/d



  Epoch  10: Train Acc=0.9318, Loss=0.5429, Val Acc=0.9285, F1=0.9180
  Epoch  20: Train Acc=0.9619, Loss=0.4333, Val Acc=0.9488, F1=0.9413
  Epoch  30: Train Acc=0.9774, Loss=0.3901, Val Acc=0.9640, F1=0.9588
  Epoch  40: Train Acc=0.9868, Loss=0.3647, Val Acc=0.9716, F1=0.9678
  Epoch  50: Train Acc=0.9913, Loss=0.3484, Val Acc=0.9750, F1=0.9717
  Epoch  60: Train Acc=0.9942, Loss=0.3372, Val Acc=0.9770, F1=0.9738
  Epoch  70: Train Acc=0.9952, Loss=0.3293, Val Acc=0.9787, F1=0.9756
  Epoch  80: Train Acc=0.9966, Loss=0.3231, Val Acc=0.9798, F1=0.9773
  Epoch  90: Train Acc=0.9969, Loss=0.3193, Val Acc=0.9785, F1=0.9748
  Epoch 100: Train Acc=0.9974, Loss=0.3161, Val Acc=0.9826, F1=0.9799
  Best Val Acc: 0.9829

[GAP Results]
  Val Acc: 0.9829
  Test Acc: 0.9822, F1: 0.9795

[Training TPA]




  Epoch  10: Train Acc=0.9296, CE=0.5369, Div=0.0271, Val Acc=0.9226, F1=0.9107
  Epoch  20: Train Acc=0.9616, CE=0.4383, Div=0.0097, Val Acc=0.9513, F1=0.9442
  Epoch  30: Train Acc=0.9767, CE=0.3948, Div=0.0030, Val Acc=0.9636, F1=0.9583
  Epoch  40: Train Acc=0.9856, CE=0.3699, Div=0.0011, Val Acc=0.9691, F1=0.9645
  Epoch  50: Train Acc=0.9908, CE=0.3529, Div=0.0006, Val Acc=0.9751, F1=0.9718
  Epoch  60: Train Acc=0.9938, CE=0.3426, Div=0.0004, Val Acc=0.9788, F1=0.9751
  Epoch  70: Train Acc=0.9951, CE=0.3361, Div=0.0002, Val Acc=0.9801, F1=0.9767
  Epoch  80: Train Acc=0.9959, CE=0.3307, Div=0.0002, Val Acc=0.9803, F1=0.9769
  Epoch  90: Train Acc=0.9968, CE=0.3261, Div=0.0002, Val Acc=0.9823, F1=0.9788
  Epoch 100: Train Acc=0.9970, CE=0.3226, Div=0.0000, Val Acc=0.9845, F1=0.9819
  Best Val Acc: 0.9845

[TPA Results]
  Val Acc: 0.9845
  Test Acc: 0.9825, F1: 0.9799

[Training Gated-TPA]




  Epoch  10: Train Acc=0.9323, CE=0.5165, Div=0.0341, Val Acc=0.9254, F1=0.9144
  Epoch  20: Train Acc=0.9625, CE=0.4242, Div=0.0087, Val Acc=0.9522, F1=0.9444
  Epoch  30: Train Acc=0.9786, CE=0.3811, Div=0.0044, Val Acc=0.9671, F1=0.9616
  Epoch  40: Train Acc=0.9883, CE=0.3546, Div=0.0025, Val Acc=0.9727, F1=0.9681
  Epoch  50: Train Acc=0.9921, CE=0.3393, Div=0.0013, Val Acc=0.9759, F1=0.9719
  Epoch  60: Train Acc=0.9947, CE=0.3280, Div=0.0006, Val Acc=0.9803, F1=0.9762
  Epoch  70: Train Acc=0.9963, CE=0.3210, Div=0.0005, Val Acc=0.9789, F1=0.9749
  Epoch  80: Train Acc=0.9968, CE=0.3160, Div=0.0001, Val Acc=0.9782, F1=0.9745
  Epoch  90: Train Acc=0.9971, CE=0.3126, Div=0.0002, Val Acc=0.9803, F1=0.9769
  Epoch 100: Train Acc=0.9972, CE=0.3105, Div=0.0000, Val Acc=0.9803, F1=0.9767
  Best Val Acc: 0.9822

[Gated-TPA Results]
  Val Acc: 0.9822
  Test Acc: 0.9811, F1: 0.9774

[Progress: 5/47]

EXPERIMENT: Lying_TO_Sitting_20PCT

Loading Lying_TO_Sitting_20PCT...
  Path: /content/d



  Epoch  10: Train Acc=0.9279, Loss=0.5477, Val Acc=0.9205, F1=0.9086
  Epoch  20: Train Acc=0.9602, Loss=0.4388, Val Acc=0.9493, F1=0.9410
  Epoch  30: Train Acc=0.9763, Loss=0.3936, Val Acc=0.9649, F1=0.9580
  Epoch  40: Train Acc=0.9853, Loss=0.3670, Val Acc=0.9712, F1=0.9663
  Epoch  50: Train Acc=0.9912, Loss=0.3497, Val Acc=0.9747, F1=0.9710
  Epoch  60: Train Acc=0.9939, Loss=0.3382, Val Acc=0.9760, F1=0.9729
  Epoch  70: Train Acc=0.9951, Loss=0.3309, Val Acc=0.9787, F1=0.9754
  Epoch  80: Train Acc=0.9963, Loss=0.3257, Val Acc=0.9795, F1=0.9765
  Epoch  90: Train Acc=0.9969, Loss=0.3209, Val Acc=0.9814, F1=0.9790
  Epoch 100: Train Acc=0.9969, Loss=0.3173, Val Acc=0.9801, F1=0.9773
  Best Val Acc: 0.9823

[GAP Results]
  Val Acc: 0.9823
  Test Acc: 0.9803, F1: 0.9782

[Training TPA]




  Epoch  10: Train Acc=0.9277, CE=0.5377, Div=0.0267, Val Acc=0.9246, F1=0.9108
  Epoch  20: Train Acc=0.9608, CE=0.4377, Div=0.0073, Val Acc=0.9529, F1=0.9455
  Epoch  30: Train Acc=0.9747, CE=0.3975, Div=0.0036, Val Acc=0.9591, F1=0.9526
  Epoch  40: Train Acc=0.9839, CE=0.3724, Div=0.0025, Val Acc=0.9680, F1=0.9617
  Epoch  50: Train Acc=0.9895, CE=0.3567, Div=0.0013, Val Acc=0.9702, F1=0.9657
  Epoch  60: Train Acc=0.9925, CE=0.3439, Div=0.0011, Val Acc=0.9754, F1=0.9711
  Epoch  70: Train Acc=0.9945, CE=0.3365, Div=0.0010, Val Acc=0.9768, F1=0.9723
  Epoch  80: Train Acc=0.9957, CE=0.3304, Div=0.0010, Val Acc=0.9773, F1=0.9733
  Epoch  90: Train Acc=0.9965, CE=0.3266, Div=0.0005, Val Acc=0.9808, F1=0.9768
  Epoch 100: Train Acc=0.9968, CE=0.3233, Div=0.0005, Val Acc=0.9823, F1=0.9790
  Best Val Acc: 0.9827

[TPA Results]
  Val Acc: 0.9827
  Test Acc: 0.9802, F1: 0.9773

[Training Gated-TPA]




  Epoch  10: Train Acc=0.9302, CE=0.5199, Div=0.0339, Val Acc=0.9241, F1=0.9119
  Epoch  20: Train Acc=0.9619, CE=0.4262, Div=0.0114, Val Acc=0.9496, F1=0.9413
  Epoch  30: Train Acc=0.9775, CE=0.3820, Div=0.0044, Val Acc=0.9651, F1=0.9595
  Epoch  40: Train Acc=0.9874, CE=0.3557, Div=0.0027, Val Acc=0.9715, F1=0.9679
  Epoch  50: Train Acc=0.9916, CE=0.3397, Div=0.0016, Val Acc=0.9741, F1=0.9703
  Epoch  60: Train Acc=0.9940, CE=0.3291, Div=0.0006, Val Acc=0.9776, F1=0.9733
  Epoch  70: Train Acc=0.9956, CE=0.3216, Div=0.0005, Val Acc=0.9784, F1=0.9750
  Epoch  80: Train Acc=0.9964, CE=0.3165, Div=0.0004, Val Acc=0.9795, F1=0.9760
  Epoch  90: Train Acc=0.9966, CE=0.3136, Div=0.0002, Val Acc=0.9801, F1=0.9770
  Epoch 100: Train Acc=0.9967, CE=0.3112, Div=0.0001, Val Acc=0.9807, F1=0.9776
  Best Val Acc: 0.9811

[Gated-TPA Results]
  Val Acc: 0.9811
  Test Acc: 0.9800, F1: 0.9770

[Progress: 6/47]

EXPERIMENT: Lying_TO_Sitting_30PCT

Loading Lying_TO_Sitting_30PCT...
  Path: /content/d



  Epoch  10: Train Acc=0.9279, Loss=0.5528, Val Acc=0.9229, F1=0.9108
  Epoch  20: Train Acc=0.9603, Loss=0.4409, Val Acc=0.9507, F1=0.9433
  Epoch  30: Train Acc=0.9749, Loss=0.3956, Val Acc=0.9649, F1=0.9603
  Epoch  40: Train Acc=0.9852, Loss=0.3675, Val Acc=0.9725, F1=0.9700
  Epoch  50: Train Acc=0.9910, Loss=0.3498, Val Acc=0.9757, F1=0.9736
  Epoch  60: Train Acc=0.9937, Loss=0.3386, Val Acc=0.9801, F1=0.9781
  Epoch  70: Train Acc=0.9954, Loss=0.3306, Val Acc=0.9794, F1=0.9774
  Epoch  80: Train Acc=0.9962, Loss=0.3250, Val Acc=0.9822, F1=0.9805
  Epoch  90: Train Acc=0.9969, Loss=0.3209, Val Acc=0.9808, F1=0.9783
  Epoch 100: Train Acc=0.9972, Loss=0.3171, Val Acc=0.9842, F1=0.9823
  Best Val Acc: 0.9844

[GAP Results]
  Val Acc: 0.9844
  Test Acc: 0.9818, F1: 0.9791

[Training TPA]




  Epoch  10: Train Acc=0.9264, CE=0.5422, Div=0.0472, Val Acc=0.9250, F1=0.9131
  Epoch  20: Train Acc=0.9603, CE=0.4403, Div=0.0128, Val Acc=0.9535, F1=0.9458
  Epoch  30: Train Acc=0.9761, CE=0.3972, Div=0.0054, Val Acc=0.9626, F1=0.9559
  Epoch  40: Train Acc=0.9854, CE=0.3709, Div=0.0023, Val Acc=0.9681, F1=0.9639
  Epoch  50: Train Acc=0.9905, CE=0.3544, Div=0.0018, Val Acc=0.9741, F1=0.9708
  Epoch  60: Train Acc=0.9932, CE=0.3438, Div=0.0008, Val Acc=0.9775, F1=0.9743
  Epoch  70: Train Acc=0.9947, CE=0.3360, Div=0.0008, Val Acc=0.9768, F1=0.9740
  Epoch  80: Train Acc=0.9958, CE=0.3313, Div=0.0005, Val Acc=0.9784, F1=0.9764
  Epoch  90: Train Acc=0.9966, CE=0.3264, Div=0.0004, Val Acc=0.9827, F1=0.9809
  Epoch 100: Train Acc=0.9969, CE=0.3231, Div=0.0003, Val Acc=0.9822, F1=0.9799
  Best Val Acc: 0.9832

[TPA Results]
  Val Acc: 0.9832
  Test Acc: 0.9823, F1: 0.9801

[Training Gated-TPA]




  Epoch  10: Train Acc=0.9296, CE=0.5223, Div=0.0246, Val Acc=0.9259, F1=0.9118
  Epoch  20: Train Acc=0.9620, CE=0.4265, Div=0.0085, Val Acc=0.9522, F1=0.9422
  Epoch  30: Train Acc=0.9778, CE=0.3828, Div=0.0061, Val Acc=0.9648, F1=0.9593
  Epoch  40: Train Acc=0.9863, CE=0.3567, Div=0.0041, Val Acc=0.9719, F1=0.9675
  Epoch  50: Train Acc=0.9913, CE=0.3400, Div=0.0029, Val Acc=0.9721, F1=0.9696
  Epoch  60: Train Acc=0.9936, CE=0.3293, Div=0.0013, Val Acc=0.9787, F1=0.9759
  Epoch  70: Train Acc=0.9948, CE=0.3226, Div=0.0009, Val Acc=0.9781, F1=0.9759
  Epoch  80: Train Acc=0.9963, CE=0.3166, Div=0.0004, Val Acc=0.9797, F1=0.9782
  Epoch  90: Train Acc=0.9967, CE=0.3139, Div=0.0002, Val Acc=0.9806, F1=0.9786
  Epoch 100: Train Acc=0.9970, CE=0.3115, Div=0.0001, Val Acc=0.9779, F1=0.9755
  Best Val Acc: 0.9825

[Gated-TPA Results]
  Val Acc: 0.9825
  Test Acc: 0.9802, F1: 0.9775

[Progress: 7/47]

EXPERIMENT: Lying_TO_Sitting_40PCT

Loading Lying_TO_Sitting_40PCT...
  Path: /content/d



  Epoch  10: Train Acc=0.9263, Loss=0.5526, Val Acc=0.9251, F1=0.9123
  Epoch  20: Train Acc=0.9601, Loss=0.4443, Val Acc=0.9554, F1=0.9469
  Epoch  30: Train Acc=0.9763, Loss=0.3974, Val Acc=0.9658, F1=0.9589
  Epoch  40: Train Acc=0.9843, Loss=0.3698, Val Acc=0.9705, F1=0.9653
  Epoch  50: Train Acc=0.9901, Loss=0.3522, Val Acc=0.9772, F1=0.9734
  Epoch  60: Train Acc=0.9934, Loss=0.3405, Val Acc=0.9800, F1=0.9761
  Epoch  70: Train Acc=0.9946, Loss=0.3322, Val Acc=0.9797, F1=0.9748
  Epoch  80: Train Acc=0.9960, Loss=0.3262, Val Acc=0.9833, F1=0.9796
  Epoch  90: Train Acc=0.9963, Loss=0.3215, Val Acc=0.9836, F1=0.9796
  Epoch 100: Train Acc=0.9969, Loss=0.3182, Val Acc=0.9835, F1=0.9798
  Best Val Acc: 0.9854

[GAP Results]
  Val Acc: 0.9854
  Test Acc: 0.9818, F1: 0.9790

[Training TPA]




  Epoch  10: Train Acc=0.9255, CE=0.5451, Div=0.0426, Val Acc=0.9241, F1=0.9107
  Epoch  20: Train Acc=0.9594, CE=0.4415, Div=0.0115, Val Acc=0.9536, F1=0.9455
  Epoch  30: Train Acc=0.9757, CE=0.3983, Div=0.0043, Val Acc=0.9592, F1=0.9535
  Epoch  40: Train Acc=0.9845, CE=0.3732, Div=0.0031, Val Acc=0.9702, F1=0.9651
  Epoch  50: Train Acc=0.9900, CE=0.3553, Div=0.0015, Val Acc=0.9729, F1=0.9688
  Epoch  60: Train Acc=0.9931, CE=0.3437, Div=0.0011, Val Acc=0.9782, F1=0.9760
  Epoch  70: Train Acc=0.9948, CE=0.3367, Div=0.0006, Val Acc=0.9814, F1=0.9795
  Epoch  80: Train Acc=0.9960, CE=0.3308, Div=0.0006, Val Acc=0.9813, F1=0.9792
  Epoch  90: Train Acc=0.9965, CE=0.3264, Div=0.0002, Val Acc=0.9844, F1=0.9827
  Epoch 100: Train Acc=0.9968, CE=0.3232, Div=0.0002, Val Acc=0.9858, F1=0.9846
  Best Val Acc: 0.9861

[TPA Results]
  Val Acc: 0.9861
  Test Acc: 0.9816, F1: 0.9803

[Training Gated-TPA]




  Epoch  10: Train Acc=0.9279, CE=0.5301, Div=0.0356, Val Acc=0.9281, F1=0.9166
  Epoch  20: Train Acc=0.9602, CE=0.4313, Div=0.0117, Val Acc=0.9515, F1=0.9429
  Epoch  30: Train Acc=0.9780, CE=0.3846, Div=0.0063, Val Acc=0.9670, F1=0.9615
  Epoch  40: Train Acc=0.9868, CE=0.3575, Div=0.0032, Val Acc=0.9754, F1=0.9718
  Epoch  50: Train Acc=0.9914, CE=0.3412, Div=0.0017, Val Acc=0.9792, F1=0.9758
  Epoch  60: Train Acc=0.9939, CE=0.3302, Div=0.0010, Val Acc=0.9795, F1=0.9769
  Epoch  70: Train Acc=0.9952, CE=0.3233, Div=0.0007, Val Acc=0.9768, F1=0.9735
  Epoch  80: Train Acc=0.9964, CE=0.3174, Div=0.0005, Val Acc=0.9816, F1=0.9786
  Epoch  90: Train Acc=0.9967, CE=0.3136, Div=0.0001, Val Acc=0.9820, F1=0.9800
