## __Check first before starting__

In [1]:
import os

Working_directory = os.path.normpath("/mnt/mydisk/Continual_Learning_JL/Continual_Learning/")
os.chdir(Working_directory)
print(f"Working directory: {os.getcwd()}")

Working directory: /mnt/mydisk/Continual_Learning_JL/Continual_Learning


## __All imports__

In [2]:
# Operating system and file management
import os
import shutil
import contextlib
import traceback
import gc
import copy
from collections import defaultdict
import subprocess
import time
import re, pickle
import scipy.io
from scipy.io import loadmat
from glob import glob
from math import ceil

# Jupyter notebook widgets and display
import ipywidgets as widgets
from IPython.display import display

# Data manipulation and analysis
import pandas as pd
import numpy as np

# Plotting and visualization
import matplotlib.pyplot as plt
from mpl_interactions import zoom_factory, panhandler

# Machine learning and preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pickle
from ta import trend, momentum, volatility, volume

# Mathematical and scientific computing
import math
from scipy.ndimage import gaussian_filter1d

# Type hinting
from typing import Callable, Tuple

# Deep learning with PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torchvision.models import resnet18
from sklearn.utils.class_weight import compute_class_weight

## __📁 Path Settings and Constants__
This cell defines essential paths and constants for the CPSC2018 ECG dataset processing:
- `BASE_DIR`: Root directory of the project.
- `save_dir`: Path to the preprocessed `.npy` files (one for each continual learning period).
- `ECG_PATH`: Directory containing original `.mat` and `.hea` files.
- `MAX_LEN`: Length of each ECG sample, fixed to 5000 time steps (i.e., 10 seconds at 500Hz).

In [3]:
BASE_DIR = "/mnt/mydisk/Continual_Learning_JL/Continual_Learning/Class_Incremental_CL/CPSC_CIL"
save_dir = os.path.join(BASE_DIR, "processed")
ECG_PATH = os.path.join(BASE_DIR, "datas")
MAX_LEN = 5000

## __🏷️ Label Mapping and Period Configuration__

This section defines:
- `snomed_map`: Mapping from SNOMED CT codes to readable class names for 9 major ECG conditions.
- `period_label_map`: Incremental learning task structure across four periods.  
  Class `1` is reserved for "OTHER" abnormalities until Period 4 when all 9 classes are explicitly categorized.
- `print_class_distribution()`: Helper function to show class-wise data distribution.


In [4]:
# SNOMED CT to readable names
snomed_map = {
    "426783006": "NSR",    # 正常竇性心律
    "270492004": "I-AVB",  # 一度房室傳導阻滯
    "164889003": "AF",     # 心房纖維顫動
    "164909002": "LBBB",   # 左束支傳導阻滯
    "59118001":  "RBBB",   # 右束支傳導阻滯
    "284470004": "PAC",    # 心房早期搏動
    "164884008": "PVC",    # 室性早期搏動
    "429622005": "STD",    # ST 段壓低
    "164931005": "STE"     # ST 段抬高
}

# Period class mapping (固定 class 1 是「其他異常」直到 P4 移除)
period_label_map = {
    1: {"NSR": 0, "OTHER": 1},
    2: {"NSR": 0, "I-AVB": 2, "AF": 3, "OTHER": 1},
    3: {"NSR": 0, "I-AVB": 2, "AF": 3, "LBBB": 4, "RBBB": 5, "OTHER": 1},
    4: {"NSR": 0, "I-AVB": 2, "AF": 3, "LBBB": 4, "RBBB": 5, "PAC": 6, "PVC": 7, "STD": 8, "STE": 9}
}

def print_class_distribution(y, label_map):
    y = np.array(y).flatten()
    total = len(y)
    all_labels = sorted(label_map.values())
    print("\n📊 Class Distribution")
    for lbl in all_labels:
        count = np.sum(y == lbl)
        label = [k for k, v in label_map.items() if v == lbl]
        name = label[0] if label else str(lbl)
        print(f"  ├─ Label {lbl:<2} ({name:<10}) → {count:>5} samples ({(count/total)*100:5.2f}%)")

def ensure_folder(folder_path: str) -> None:
    """Ensure the given folder exists, create it if not."""
    os.makedirs(folder_path, exist_ok=True)


## 📦 EX. Load Example (Period 4) Data and View Format

This example demonstrates how to load preprocessed `.npy` data for **Period 4**, and inspect the dataset shapes and label distribution.  
Use this format as a reference when loading data in other methods (e.g., EWC, PNN, DynEx-CLoRA).

Each ECG sample:
- Has shape `(5000, 12)` → represents 10 seconds (at 500Hz) across 12-lead channels.
- Corresponding label is an integer ID (e.g., 0–9) defined by `period_label_map[4]`.

In [5]:
# # 範例:載入 period 4
# save_dir = os.path.join(BASE_DIR, "processed")
# X_train = np.load(os.path.join(save_dir, "X_train_p4.npy"))
# y_train = np.load(os.path.join(save_dir, "y_train_p4.npy"))
# X_test = np.load(os.path.join(save_dir, "X_test_p4.npy"))
# y_test = np.load(os.path.join(save_dir, "y_test_p4.npy"))

# print("✅ Loaded")
# print("X_train shape:", X_train.shape)
# print("y_train shape:", y_train.shape)
# print("X_test shape:", X_test.shape)
# print("y_test shape:", y_test.shape)
# print_class_distribution(y_train, period_label_map[4])
# print_class_distribution(y_test, period_label_map[4])

# del X_train, y_train, X_test, y_test


## __Check GPU, CUDA, Pytorch__

In [6]:
!nvidia-smi

Sat May 10 15:30:05 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.133.07             Driver Version: 570.133.07     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A6000               Off |   00000000:2A:00.0 Off |                  Off |
| 30%   45C    P2             82W /  300W |    2096MiB /  49140MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA RTX A6000               Off |   00

### CUDA Details

In [7]:
def check_gpu_config():
    """
    Check GPU availability and display detailed configuration information.
    """
    # Check if GPU is available
    gpu_available = torch.cuda.is_available()
    
    # Print header
    print("=" * 50)
    print("GPU Configuration Check".center(50))
    print("=" * 50)
    
    # Basic GPU availability
    print(f"{'PyTorch Version':<25}: {torch.__version__}")
    print(f"{'GPU Available':<25}: {'Yes' if gpu_available else 'No'}")
    
    # If GPU is available, print detailed info
    if gpu_available:
        print("-" * 50)
        print("GPU Details".center(50))
        print("-" * 50)
        
        # Device info
        print(f"{'Device Name':<25}: {torch.cuda.get_device_name(0)}")
        print(f"{'Number of GPUs':<25}: {torch.cuda.device_count()}")
        print(f"{'Current Device Index':<25}: {torch.cuda.current_device()}")
        
        # Compute capability and CUDA cores
        props = torch.cuda.get_device_properties(0)
        print(f"{'Compute Capability':<25}: {props.major}.{props.minor}")
        print(f"{'Total CUDA Cores':<25}: {props.multi_processor_count * 128}")  # Approx. 128 cores per SM
        
        # Memory info
        total_memory = props.total_memory / (1024 ** 3)  # Convert to GB
        memory_allocated = torch.cuda.memory_allocated(0) / (1024 ** 3)
        memory_reserved = torch.cuda.memory_reserved(0) / (1024 ** 3)
        print(f"{'Total Memory (GB)':<25}: {total_memory:.2f}")
        print(f"{'Allocated Memory (GB)':<25}: {memory_allocated:.2f}")
        print(f"{'Reserved Memory (GB)':<25}: {memory_reserved:.2f}")
    else:
        print("-" * 50)
        print("No GPU detected. Running on CPU.".center(50))
        print("-" * 50)
    
    print("=" * 50)

if __name__ == "__main__":
    check_gpu_config()

             GPU Configuration Check              
PyTorch Version          : 2.5.1
GPU Available            : Yes
--------------------------------------------------
                   GPU Details                    
--------------------------------------------------
Device Name              : NVIDIA RTX A6000
Number of GPUs           : 3
Current Device Index     : 0
Compute Capability       : 8.6
Total CUDA Cores         : 10752
Total Memory (GB)        : 47.41
Allocated Memory (GB)    : 0.00
Reserved Memory (GB)     : 0.00


### PyTorch Details

In [8]:
def print_torch_config():
    """Print PyTorch and CUDA configuration in a formatted manner."""
    print("=" * 50)
    print("PyTorch Configuration".center(50))
    print("=" * 50)
    
    # Basic PyTorch and CUDA info
    print(f"{'PyTorch Version':<25}: {torch.__version__}")
    print(f"{'CUDA Compiled Version':<25}: {torch.version.cuda}")
    print(f"{'CUDA Available':<25}: {'Yes' if torch.cuda.is_available() else 'No'}")
    print(f"{'Number of GPUs':<25}: {torch.cuda.device_count()}")

    # GPU details if available
    if torch.cuda.is_available():
        print(f"{'GPU Name':<25}: {torch.cuda.get_device_name(0)}")

    print("-" * 50)
    
    # Seed setting
    torch.manual_seed(42)
    print(f"{'Random Seed':<25}: 42 (Seeding successful!)")
    
    print("=" * 50)

if __name__ == "__main__":
    print_torch_config()

              PyTorch Configuration               
PyTorch Version          : 2.5.1
CUDA Compiled Version    : 12.1
CUDA Available           : Yes
Number of GPUs           : 3
GPU Name                 : NVIDIA RTX A6000
--------------------------------------------------
Random Seed              : 42 (Seeding successful!)


## __⚙️ GPU Selection — Auto-select the least loaded GPU__
This code automatically scans available GPUs and selects the one with the lowest current memory usage.


In [9]:
def auto_select_cuda_device(verbose=True):
    """
    Automatically selects the CUDA GPU with the least memory usage.
    Falls back to CPU if no GPU is available.
    """
    if not torch.cuda.is_available():
        print("🚫 No CUDA GPU available. Using CPU.")
        return torch.device("cpu")

    try:
        # Run nvidia-smi to get memory usage of each GPU
        smi_output = subprocess.check_output(
            ['nvidia-smi', '--query-gpu=memory.used', '--format=csv,nounits,noheader'],
            encoding='utf-8'
        )
        memory_used = [int(x) for x in smi_output.strip().split('\n')]
        best_gpu = int(np.argmin(memory_used))

        if verbose:
            print("🎯 Automatically selected GPU:")
            print(f"    - CUDA Device ID : {best_gpu}")
            print(f"    - Memory Used    : {memory_used[best_gpu]} MiB")
            print(f"    - Device Name    : {torch.cuda.get_device_name(best_gpu)}")
        return torch.device(f"cuda:{best_gpu}")
    except Exception as e:
        print(f"⚠️ Failed to auto-detect GPU. Falling back to cuda:0. ({e})")
        return torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Execute and assign
device = auto_select_cuda_device()

🎯 Automatically selected GPU:
    - CUDA Device ID : 0
    - Memory Used    : 2096 MiB
    - Device Name    : NVIDIA RTX A6000


## __Model__

### ResNet 18 - 1D (ResNet18_1D_big_inplane)

In [10]:
class BasicBlock1d(nn.Module):
    expansion = 1
    
    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock1d, self).__init__()
        self.conv1 = nn.Conv1d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm1d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv1d(planes, planes, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm1d(planes)
        self.downsample = downsample
        self.stride = stride
        
    def forward(self, x):
        identity = x
        
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        
        out = self.conv2(out)
        out = self.bn2(out)
        
        if self.downsample is not None:
            identity = self.downsample(x)
            
        out += identity
        out = self.relu(out)
        
        return out

class ResNet18_1D(nn.Module):
    def __init__(self, input_channels=12, output_size=9, inplanes=64):
        super(ResNet18_1D, self).__init__()
        self.inplanes = inplanes
        
        # Initial conv layer
        self.conv1 = nn.Conv1d(input_channels, self.inplanes, kernel_size=15, stride=2, padding=7, bias=False)
        self.bn1 = nn.BatchNorm1d(inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
        
        # Residual layers
        self.layer1 = self._make_layer(BasicBlock1d, 64, 2)
        self.layer2 = self._make_layer(BasicBlock1d, 128, 2, stride=2)
        self.layer3 = self._make_layer(BasicBlock1d, 256, 2, stride=2)
        self.layer4 = self._make_layer(BasicBlock1d, 512, 2, stride=2)
        
        # Adaptive pooling (both avg and max)
        self.adaptiveavgpool = nn.AdaptiveAvgPool1d(1)
        self.adaptivemaxpool = nn.AdaptiveMaxPool1d(1)
        
        # Fully connected layer with dropout
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(512 * BasicBlock1d.expansion * 2, output_size)
    
    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv1d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm1d(planes * block.expansion),
            )
        
        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))
            
        return nn.Sequential(*layers)
    
    def forward(self, x):
        # Expect input shape: (batch_size, time_steps, channels)
        x = x.permute(0, 2, 1)  # → (batch_size, channels, time_steps)
        
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        
        # Apply both avg and max pooling
        x1 = self.adaptiveavgpool(x)
        x2 = self.adaptivemaxpool(x)
        
        # Concatenate pooling results
        x = torch.cat((x1, x2), dim=1)
        
        # Flatten
        x = x.view(x.size(0), -1)
        
        # Apply dropout
        x = self.dropout(x)
        
        # Final classification
        x = self.fc(x)
        
        return x

## __Training and validation function__

### Extra Function

In [11]:
def compute_classwise_accuracy(student_logits_flat, y_batch, class_correct, class_total):
    """
    Computes per-class accuracy by accumulating correct and total samples for each class using vectorized operations.
    
    Args:
        student_logits_flat (torch.Tensor): Model predictions (logits) in shape [batch_size * seq_len, output_size]
        y_batch (torch.Tensor): True labels in shape [batch_size * seq_len]
        class_correct (dict): Dictionary to store correct predictions per class
        class_total (dict): Dictionary to store total samples per class
    """
    # Ensure inputs are on the same device
    if student_logits_flat.device != y_batch.device:
        raise ValueError("student_logits_flat and y_batch must be on the same device")

    # Convert logits to predicted class indices
    predictions = torch.argmax(student_logits_flat, dim=-1)  # Shape: [batch_size * seq_len]

    # Compute correct predictions mask
    correct_mask = (predictions == y_batch)  # Shape: [batch_size * seq_len], boolean

    # Get unique labels in this batch
    unique_labels = torch.unique(y_batch)

    # Update class_total and class_correct using vectorized operations
    for label in unique_labels:
        label = label.item()  # Convert tensor to scalar
        if label not in class_total:
            class_total[label] = 0
            class_correct[label] = 0
        
        # Count total samples for this label
        label_mask = (y_batch == label)
        class_total[label] += label_mask.sum().item()
        
        # Count correct predictions for this label
        class_correct[label] += (label_mask & correct_mask).sum().item()

In [12]:
def get_model_parameter_info(model):
    total_params = sum(p.numel() for p in model.parameters())
    param_size_bytes = total_params * 4
    param_size_MB = param_size_bytes / (1024**2)
    return total_params, param_size_MB

In [13]:
def compute_class_weights(y: np.ndarray, num_classes: int, exclude_classes: list = None) -> torch.Tensor:
    """
    計算 class weights（inverse frequency）避免 class imbalance。
    可排除某些類別（如不存在的類別），這些類別的權重將設為 0。
    """
    exclude_classes = set(exclude_classes or [])
    class_sample_counts = np.bincount(y, minlength=num_classes)
    total_samples = len(y)

    weights = np.zeros(num_classes, dtype=np.float32)

    for cls in range(num_classes):
        if cls in exclude_classes:
            weights[cls] = 0.0
        else:
            count = class_sample_counts[cls]
            weights[cls] = total_samples / (count + 1e-6)

    # Normalize only non-excluded weights
    valid_mask = np.array([cls not in exclude_classes for cls in range(num_classes)])
    norm_sum = weights[valid_mask].sum()
    if norm_sum > 0:
        weights[valid_mask] /= norm_sum

    print("\n📊 Class Weights (normalized):")
    for i, w in enumerate(weights):
        status = " (excluded)" if i in exclude_classes else ""
        print(f"  - Class {i}: {w:.4f}{status}")
    
    return torch.tensor(weights, dtype=torch.float32)


### Training Function

In [14]:
def train_with_mse_ecg(student_model, output_size, criterion, optimizer,
                       X_train, y_train, X_val, y_val,
                       stable_classes=None, teacher_model=None, alpha=0.1,
                       scheduler=None, num_epochs=10, batch_size=64,
                       model_saving_folder=None, model_name=None,
                       stop_signal_file=None, device=None):
    
    print("\n🚀 'train_with_mse_ecg' started.")
    start_time = time.time()

    device = device or auto_select_cuda_device()
    model_name = model_name or 'model'
    model_saving_folder = model_saving_folder or './saved_models'

    if os.path.exists(model_saving_folder):
        shutil.rmtree(model_saving_folder)
        print(f"✅ Removed existing folder: {model_saving_folder}")
    os.makedirs(model_saving_folder, exist_ok=True)

    student_model.to(device)
    if teacher_model:
        teacher_model.to(device)
        teacher_model.eval()

    X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
    y_train = torch.tensor(y_train, dtype=torch.long).to(device)
    X_val = torch.tensor(X_val, dtype=torch.float32).to(device)
    y_val = torch.tensor(y_val, dtype=torch.long).to(device)

    train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=batch_size, shuffle=False)

    print("\n✅ Data Overview:")
    print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
    print(f"X_val: {X_val.shape}, y_val: {y_val.shape}")

    best_results = []

    for epoch in range(num_epochs):
        if stop_signal_file and os.path.exists(stop_signal_file):
            print("\n🛑 Stop signal detected. Exiting training loop.")
            break

        student_model.train()
        epoch_loss = 0.0
        class_correct, class_total = {}, {}

        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            student_logits = student_model(X_batch)  # (B, output_size)

            ce_loss = criterion(student_logits, y_batch)
            compute_classwise_accuracy(student_logits, y_batch, class_correct, class_total)

            if teacher_model and stable_classes:
                with torch.no_grad():
                    teacher_logits = teacher_model(X_batch)

                stable_indices = torch.tensor(stable_classes, device=student_logits.device)
                teacher_stable = teacher_logits.index_select(dim=1, index=stable_indices)
                student_stable = student_logits.index_select(dim=1, index=stable_indices)
                distill_loss = F.mse_loss(student_stable, teacher_stable)
                total_loss = alpha * distill_loss + (1 - alpha) * ce_loss
            else:
                total_loss = ce_loss

            total_loss.backward()
            optimizer.step()
            epoch_loss += total_loss.item() * X_batch.size(0)

        train_loss = epoch_loss / len(train_loader.dataset)
        train_acc = {int(c): f"{(class_correct[c] / class_total[c]) * 100:.2f}%" if class_total[c] > 0 else "0.00%"
                     for c in sorted(class_total.keys())}

        # Validation
        student_model.eval()
        val_loss, val_correct, val_total = 0.0, 0, 0
        val_class_correct, val_class_total = {}, {}
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                outputs = student_model(X_batch)
                val_loss += criterion(outputs, y_batch).item() * X_batch.size(0)
                predictions = torch.argmax(outputs, dim=-1)
                val_correct += (predictions == y_batch).sum().item()
                val_total += y_batch.size(0)
                compute_classwise_accuracy(outputs, y_batch, val_class_correct, val_class_total)

        val_loss /= len(val_loader.dataset)
        val_acc = val_correct / val_total
        val_acc_cls = {int(c): f"{(val_class_correct[c] / val_class_total[c]) * 100:.2f}%" if val_class_total[c] > 0 else "0.00%"
                       for c in sorted(val_class_total.keys())}

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.6f}, Train-Class-Acc: {train_acc}")
        print(f"Val Loss: {val_loss:.6f}, Val Acc: {val_acc * 100:.2f}%, Val-Class-Acc: {val_acc_cls}, LR: {optimizer.param_groups[0]['lr']:.6f}")

        model_path = os.path.join(model_saving_folder, f"{model_name}_epoch_{epoch+1}.pth")
        current = {
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'val_loss': val_loss,
            'val_accuracy': val_acc,
            'train_classwise_accuracy': train_acc,
            'val_classwise_accuracy': val_acc_cls,
            'model_state_dict': student_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'learning_rate': optimizer.param_groups[0]['lr'],
            'model_path': model_path
        }

        if len(best_results) < 5 or val_acc > best_results[-1]['val_accuracy']:
            if len(best_results) == 5:
                to_remove = best_results.pop()
                if os.path.exists(to_remove['model_path']):
                    os.remove(to_remove['model_path'])
                    print(f"🗑 Removed: {to_remove['model_path']}")
            best_results.append(current)
            best_results.sort(key=lambda x: (x['val_accuracy'], x['epoch']), reverse=True)
            torch.save(current, model_path)
            print(f"✅ Saved model: {model_path}")

        if scheduler:
            scheduler.step(val_loss)

    # 結束訓練
    training_time = time.time() - start_time
    total_params, param_size_MB = get_model_parameter_info(student_model)

    if best_results:
        best = best_results[0]
        best_model_path = os.path.join(model_saving_folder, f"{model_name}_best.pth")
        torch.save(best, best_model_path)
        print(f"\n🏆 Best model saved as: {best_model_path} (Val Accuracy: {best['val_accuracy'] * 100:.2f}%)")

    final_model_path = os.path.join(model_saving_folder, f"{model_name}_final.pth")
    torch.save(current, final_model_path)
    print(f"\n📌 Final model saved as: {final_model_path}")

    print("\n🎯 Top 5 Best Models:")
    for res in best_results:
        print(f"Epoch {res['epoch']}, Train Loss: {res['train_loss']:.6f}, Train-Acc: {res['train_classwise_accuracy']},\n"
              f"Val Loss: {res['val_loss']:.6f}, Val Acc: {res['val_accuracy']*100:.2f}%, Val-Acc: {res['val_classwise_accuracy']},"
              f" Model Path: {res['model_path']}")

    print(f"\n🧠 Model Summary:")
    print(f"Total Parameters: {total_params:,}")
    print(f"Model Size (float32): {param_size_MB:.2f} MB")
    print(f"Total Training Time: {training_time:.2f} seconds")

    # Markdown summary block
    match = re.search(r'Period_(\d+)', model_saving_folder)
    period_label = match.group(1) if match else "?"
    model_name_str = student_model.__class__.__name__

    print(f"""
---
### Period {period_label} (alpha = {alpha})
+ ##### Total training time: {training_time:.2f} seconds
+ ##### Model: {model_name_str}
+ ##### Training and saving in *'{model_saving_folder}'*
+ ##### Best Epoch: {best['epoch']}
#### __Val Accuracy: {best['val_accuracy'] * 100:.2f}%__
#### __Val-Class-Acc: {best['val_classwise_accuracy']}__
#### __Total Parameters: {total_params:,}__
#### __Model Size (float32): {param_size_MB:.2f} MB__
""".strip())

    del X_train, y_train, X_val, y_val, train_loader, val_loader, current, outputs, predictions
    torch.cuda.empty_cache()
    gc.collect()


## __Training__

### Period 1 Summary

In [14]:
def display_model_summary_with_params(model_folder, model_filename="ResNet18_big_inplane_1D_best.pth", input_channels=12, output_size=10):
    model_path = os.path.join(model_folder, model_filename)

    if not os.path.exists(model_path):
        print(f"❌ File not found: {model_path}")
        return

    checkpoint = torch.load(model_path, map_location='cpu')

    # === 還原模型並載入參數 ===
    model = ResNet18_1D(input_channels=input_channels, output_size=output_size)
    model.load_state_dict(checkpoint["model_state_dict"])
    total_params, param_size_MB = get_model_parameter_info(model)

    # === 顯示摘要 ===
    epoch = checkpoint.get("epoch", "?")
    train_loss = checkpoint.get("train_loss", "?")
    val_loss = checkpoint.get("val_loss", "?")
    val_acc = checkpoint.get("val_accuracy", "?")
    train_acc_dict = checkpoint.get("train_classwise_accuracy", {})
    val_acc_dict = checkpoint.get("val_classwise_accuracy", {})
    lr = checkpoint.get("learning_rate", "?")
    stored_path = checkpoint.get("model_path", "N/A")

    print(f"\n📦 Model Summary from: {model_path}")
    print(f"📌 Epoch: {epoch}")
    print(f"🧮 Train Loss: {train_loss:.6f}" if isinstance(train_loss, float) else f"🧮 Train Loss: {train_loss}")
    print(f"🎯 Val Loss: {val_loss:.6f}" if isinstance(val_loss, float) else f"🎯 Val Loss: {val_loss}")
    print(f"✅ Val Accuracy: {val_acc*100:.2f}%" if isinstance(val_acc, float) else f"✅ Val Accuracy: {val_acc}")
    print(f"📎 Learning Rate: {lr}")
    print(f"📁 Stored Model Path: {stored_path}")
    print(f"🧠 Total Parameters: {total_params:,}")
    print(f"📏 Model Size (float32): {param_size_MB:.2f} MB")

    print("\n📊 Train Class-wise Accuracy:")
    for c, acc in train_acc_dict.items():
        print(f"  └─ Class {c:<2}: {acc}")

    print("\n📊 Val Class-wise Accuracy:")
    for c, acc in val_acc_dict.items():
        print(f"  └─ Class {c:<2}: {acc}")

    print("\n---\n### Period 1 Summary (Markdown Format)")
    print(f"+ **Epoch:** {epoch}")
    print(f"+ **Train Loss:** {train_loss}")
    print(f"+ **Val Loss:** {val_loss}")
    print(f"+ **Val Accuracy:** {val_acc*100:.2f}%" if isinstance(val_acc, float) else f"+ **Val Accuracy:** {val_acc}")
    print(f"+ **Learning Rate:** {lr}")
    print(f"+ **Stored Model Path:** `{stored_path}`")
    print(f"+ **Total Parameters:** {total_params:,}")
    print(f"+ **Model Size (float32):** {param_size_MB:.2f} MB")
    print(f"+ **Train-Class-Acc:** {train_acc_dict}")
    print(f"+ **Val-Class-Acc:** {val_acc_dict}")
    print("---")

# Example call:
display_model_summary_with_params(
    model_folder=os.path.join("Class_Incremental_CL", "CPSC_CIL", "ResNet18_Selection", "ResNet18_big_inplane_v1"),
    input_channels=12,
    output_size=2
)


📦 Model Summary from: Class_Incremental_CL/CPSC_CIL/ResNet18_Selection/ResNet18_big_inplane_v1/ResNet18_big_inplane_1D_best.pth
📌 Epoch: 63
🧮 Train Loss: 0.007664
🎯 Val Loss: 0.800983
✅ Val Accuracy: 88.86%
📎 Learning Rate: 0.0006561000000000001
📁 Stored Model Path: Class_Incremental_CL/CPSC_CIL/ResNet18_Selection/ResNet18_big_inplane_v1/ResNet18_big_inplane_1D_epoch_63.pth
🧠 Total Parameters: 3,857,026
📏 Model Size (float32): 14.71 MB

📊 Train Class-wise Accuracy:
  └─ Class 0 : 99.86%
  └─ Class 1 : 99.59%

📊 Val Class-wise Accuracy:
  └─ Class 0 : 91.85%
  └─ Class 1 : 85.87%

---
### Period 1 Summary (Markdown Format)
+ **Epoch:** 63
+ **Train Loss:** 0.0076635455248283595
+ **Val Loss:** 0.800983331773592
+ **Val Accuracy:** 88.86%
+ **Learning Rate:** 0.0006561000000000001
+ **Stored Model Path:** `Class_Incremental_CL/CPSC_CIL/ResNet18_Selection/ResNet18_big_inplane_v1/ResNet18_big_inplane_1D_epoch_63.pth`
+ **Total Parameters:** 3,857,026
+ **Model Size (float32):** 14.71 MB
+

  checkpoint = torch.load(model_path, map_location='cpu')


### Period 2

In [15]:
# ================================
# 📌 Period 2: MSE Distillation Training
# ================================
period = 2
prev_period = 1

# ==== Paths ====
stop_signal_file = os.path.join(BASE_DIR, "stop_training.txt")
model_saving_folder = os.path.join(BASE_DIR, "Trained_models", "MSE_CIL_v1", f"Period_{period}")
ensure_folder(model_saving_folder)

# ==== Load Period 2 Data ====
X_train = np.load(os.path.join(save_dir, f"X_train_p{period}.npy"))
y_train = np.load(os.path.join(save_dir, f"y_train_p{period}.npy"))
X_val   = np.load(os.path.join(save_dir, f"X_test_p{period}.npy"))
y_val   = np.load(os.path.join(save_dir, f"y_test_p{period}.npy"))

# ==== Device ====
device = auto_select_cuda_device()

# ==== Model Configuration ====
input_channels = X_train.shape[2]  # 12 leads
output_size = len(np.unique(y_train))  # Period 2: number of classes
model_name = "ResNet18_1D"

student_model = ResNet18_1D(input_channels=input_channels, output_size=output_size).to(device)

# ==== Load Period 1 Best Teacher Model ====
teacher_model_path = os.path.join(BASE_DIR, "ResNet18_Selection", "ResNet18_big_inplane_v1", "ResNet18_big_inplane_1D_best.pth")
teacher_checkpoint = torch.load(teacher_model_path, map_location=device)
teacher_output_size = len(np.unique(np.load(os.path.join(save_dir, f"y_train_p{prev_period}.npy"))))
teacher_model = ResNet18_1D(input_channels=input_channels, output_size=teacher_output_size).to(device)

# Selectively load matching parameters (ignore FC mismatch)
teacher_state = teacher_checkpoint["model_state_dict"]
model_state = teacher_model.state_dict()
filtered_teacher = {k: v for k, v in teacher_state.items() if k in model_state and model_state[k].shape == v.shape}
teacher_model.load_state_dict(filtered_teacher, strict=False)
teacher_model.eval()
print("✅ Loaded Period 1 teacher weights (FC excluded)")

# ==== Define Stable Classes ====
stable_classes = [0]  # class 1 是 OTHER（特殊處理類別）

# ==== Training Hyperparameters ====
learning_rate = 1e-3
weight_decay = 1e-5
alpha = 0.1
num_epochs = 200
batch_size = 64

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(student_model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.9, patience=10)

# ==== Train ====
train_with_mse_ecg(
    student_model=student_model,
    output_size=output_size,
    criterion=criterion,
    optimizer=optimizer,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    stable_classes=stable_classes,
    teacher_model=teacher_model,
    alpha=alpha,
    scheduler=scheduler,
    num_epochs=num_epochs,
    batch_size=batch_size,
    model_saving_folder=model_saving_folder,
    model_name=model_name,
    stop_signal_file=stop_signal_file,
    device=device
)

# ================================
# ✅ Cleanup
# ================================
del X_train, y_train, X_val, y_val
del teacher_model_path, teacher_checkpoint, teacher_state, model_state, filtered_teacher
del student_model, teacher_model
gc.collect()
torch.cuda.empty_cache()


🎯 Automatically selected GPU:
    - CUDA Device ID : 2
    - Memory Used    : 991 MiB
    - Device Name    : NVIDIA RTX A6000


  teacher_checkpoint = torch.load(teacher_model_path, map_location=device)


✅ Loaded Period 1 teacher weights (FC excluded)

🚀 'train_with_mse_ecg' started.
✅ Removed existing folder: /mnt/mydisk/Continual_Learning_JL/Continual_Learning/Class_Incremental_CL/CPSC_CIL/Trained_models/MSE_CIL_v1/Period_2

✅ Data Overview:
X_train: torch.Size([3263, 5000, 12]), y_train: torch.Size([3263])
X_val: torch.Size([816, 5000, 12]), y_val: torch.Size([816])
Epoch 1/200, Train Loss: 7.340005, Train-Class-Acc: {0: '63.22%', 1: '42.83%', 2: '28.60%', 3: '52.66%'}
Val Loss: 1.103862, Val Acc: 54.53%, Val-Class-Acc: {0: '59.24%', 1: '18.03%', 2: '84.03%', 3: '70.08%'}, LR: 0.001000
✅ Saved model: /mnt/mydisk/Continual_Learning_JL/Continual_Learning/Class_Incremental_CL/CPSC_CIL/Trained_models/MSE_CIL_v1/Period_2/ResNet18_1D_epoch_1.pth
Epoch 2/200, Train Loss: 2.942453, Train-Class-Acc: {0: '77.66%', 1: '61.68%', 2: '66.90%', 3: '74.28%'}
Val Loss: 0.682258, Val Acc: 75.98%, Val-Class-Acc: {0: '70.65%', 1: '69.26%', 2: '92.36%', 3: '77.05%'}, LR: 0.001000
✅ Saved model: /mnt/myd

### Period 3

In [16]:
# ================================
# 📌 Period 3: MSE Distillation Training
# ================================
period = 3
prev_period = 2

# ==== Paths ====
stop_signal_file = os.path.join(BASE_DIR, "stop_training.txt")
model_saving_folder = os.path.join(BASE_DIR, "Trained_models", "MSE_CIL_v1", f"Period_{period}")
ensure_folder(model_saving_folder)

# ==== Load Period 3 Data ====
X_train = np.load(os.path.join(save_dir, f"X_train_p{period}.npy"))
y_train = np.load(os.path.join(save_dir, f"y_train_p{period}.npy"))
X_val   = np.load(os.path.join(save_dir, f"X_test_p{period}.npy"))
y_val   = np.load(os.path.join(save_dir, f"y_test_p{period}.npy"))

# ==== Device ====
device = auto_select_cuda_device()

# ==== Model Configuration ====
input_channels = X_train.shape[2]  # 12 leads
output_size = len(np.unique(y_train))  # Period 3: number of classes
model_name = "ResNet18_1D"

student_model = ResNet18_1D(input_channels=input_channels, output_size=output_size).to(device)

# ==== Load Period 2 Best Teacher Model ====
teacher_model_path = os.path.join(BASE_DIR, "Trained_models", "MSE_CIL_v1", "Period_2", f"{model_name}_best.pth")
teacher_checkpoint = torch.load(teacher_model_path, map_location=device)
teacher_output_size = len(np.unique(np.load(os.path.join(save_dir, f"y_train_p{prev_period}.npy"))))
teacher_model = ResNet18_1D(input_channels=input_channels, output_size=teacher_output_size).to(device)

# Selectively load matching parameters (ignore FC mismatch)
teacher_state = teacher_checkpoint["model_state_dict"]
model_state = teacher_model.state_dict()
filtered_teacher = {k: v for k, v in teacher_state.items() if k in model_state and model_state[k].shape == v.shape}
teacher_model.load_state_dict(filtered_teacher, strict=False)
teacher_model.eval()
print("✅ Loaded Period 2 teacher weights (FC excluded)")

# ==== Define Stable Classes ====
stable_classes = [0, 2, 3]  # ONLY NSR is retained as stable class

# ==== Training Hyperparameters ====
learning_rate = 1e-3
weight_decay = 1e-5
alpha = 0.1
num_epochs = 200
batch_size = 64

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(student_model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.9, patience=10)

# ==== Train ====
train_with_mse_ecg(
    student_model=student_model,
    output_size=output_size,
    criterion=criterion,
    optimizer=optimizer,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    stable_classes=stable_classes,
    teacher_model=teacher_model,
    alpha=alpha,
    scheduler=scheduler,
    num_epochs=num_epochs,
    batch_size=batch_size,
    model_saving_folder=model_saving_folder,
    model_name=model_name,
    stop_signal_file=stop_signal_file,
    device=device
)

# ================================
# ✅ Cleanup
# ================================
del X_train, y_train, X_val, y_val
del teacher_model_path, teacher_checkpoint, teacher_state, model_state, filtered_teacher
del student_model, teacher_model
gc.collect()
torch.cuda.empty_cache()


🎯 Automatically selected GPU:
    - CUDA Device ID : 1
    - Memory Used    : 58 MiB
    - Device Name    : NVIDIA RTX A6000
✅ Loaded Period 2 teacher weights (FC excluded)

🚀 'train_with_mse_ecg' started.
✅ Removed existing folder: /mnt/mydisk/Continual_Learning_JL/Continual_Learning/Class_Incremental_CL/CPSC_CIL/Trained_models/MSE_CIL_v1/Period_3


  teacher_checkpoint = torch.load(teacher_model_path, map_location=device)



✅ Data Overview:
X_train: torch.Size([5120, 5000, 12]), y_train: torch.Size([5120])
X_val: torch.Size([1281, 5000, 12]), y_val: torch.Size([1281])
Epoch 1/200, Train Loss: 5.950207, Train-Class-Acc: {0: '62.13%', 1: '55.20%', 2: '53.73%', 3: '60.04%', 4: '34.18%', 5: '76.01%'}
Val Loss: 0.813030, Val Acc: 79.16%, Val-Class-Acc: {0: '75.54%', 1: '67.16%', 2: '86.11%', 3: '80.74%', 4: '72.50%', 5: '89.82%'}, LR: 0.001000
✅ Saved model: /mnt/mydisk/Continual_Learning_JL/Continual_Learning/Class_Incremental_CL/CPSC_CIL/Trained_models/MSE_CIL_v1/Period_3/ResNet18_1D_epoch_1.pth
Epoch 2/200, Train Loss: 2.681105, Train-Class-Acc: {0: '69.62%', 1: '71.43%', 2: '84.06%', 3: '88.32%', 4: '61.39%', 5: '84.83%'}
Val Loss: 0.693197, Val Acc: 81.81%, Val-Class-Acc: {0: '40.76%', 1: '88.36%', 2: '86.11%', 3: '89.75%', 4: '72.50%', 5: '91.32%'}, LR: 0.001000
✅ Saved model: /mnt/mydisk/Continual_Learning_JL/Continual_Learning/Class_Incremental_CL/CPSC_CIL/Trained_models/MSE_CIL_v1/Period_3/ResNet18_1

In [18]:
# ================================
# 📌 Period 3: MSE Distillation Training
# weight = class_weights_tensor
# ================================
period = 3
prev_period = 2

# ==== Paths ====
stop_signal_file = os.path.join(BASE_DIR, "stop_training.txt")
model_saving_folder = os.path.join(BASE_DIR, "Trained_models", "MSE_CIL_v2", f"Period_{period}")
ensure_folder(model_saving_folder)

# ==== Load Period 3 Data ====
X_train = np.load(os.path.join(save_dir, f"X_train_p{period}.npy"))
y_train = np.load(os.path.join(save_dir, f"y_train_p{period}.npy"))
X_val   = np.load(os.path.join(save_dir, f"X_test_p{period}.npy"))
y_val   = np.load(os.path.join(save_dir, f"y_test_p{period}.npy"))

# ==== Device ====
device = auto_select_cuda_device()

# ==== Model Configuration ====
input_channels = X_train.shape[2]  # 12 leads
output_size = len(np.unique(y_train))  # Period 3: number of classes
model_name = "ResNet18_1D"

student_model = ResNet18_1D(input_channels=input_channels, output_size=output_size).to(device)

# ==== Load Period 2 Best Teacher Model ====
teacher_model_path = os.path.join(BASE_DIR, "Trained_models", "MSE_CIL_v1", "Period_2", f"{model_name}_best.pth")
teacher_checkpoint = torch.load(teacher_model_path, map_location=device)
teacher_output_size = len(np.unique(np.load(os.path.join(save_dir, f"y_train_p{prev_period}.npy"))))
teacher_model = ResNet18_1D(input_channels=input_channels, output_size=teacher_output_size).to(device)

# Selectively load matching parameters (ignore FC mismatch)
teacher_state = teacher_checkpoint["model_state_dict"]
model_state = teacher_model.state_dict()
filtered_teacher = {k: v for k, v in teacher_state.items() if k in model_state and model_state[k].shape == v.shape}
teacher_model.load_state_dict(filtered_teacher, strict=False)
teacher_model.eval()
print("✅ Loaded Period 2 teacher weights (FC excluded)")

# ==== Define Stable Classes ====
stable_classes = [0, 2, 3]  # ONLY NSR is retained as stable class

# ==== Training Hyperparameters ====
learning_rate = 1e-3
weight_decay = 1e-5
alpha = 0.1
num_epochs = 200
batch_size = 64

class_weights_tensor = compute_class_weights(y_train, output_size)
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor.to(device))
optimizer = torch.optim.Adam(student_model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.9, patience=10)

# ==== Train ====
train_with_mse_ecg(
    student_model=student_model,
    output_size=output_size,
    criterion=criterion,
    optimizer=optimizer,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    stable_classes=stable_classes,
    teacher_model=teacher_model,
    alpha=alpha,
    scheduler=scheduler,
    num_epochs=num_epochs,
    batch_size=batch_size,
    model_saving_folder=model_saving_folder,
    model_name=model_name,
    stop_signal_file=stop_signal_file,
    device=device
)

# ================================
# ✅ Cleanup
# ================================
del X_train, y_train, X_val, y_val
del teacher_model_path, teacher_checkpoint, teacher_state, model_state, filtered_teacher
del student_model, teacher_model
del class_weights_tensor
gc.collect()
torch.cuda.empty_cache()


🎯 Automatically selected GPU:
    - CUDA Device ID : 0
    - Memory Used    : 431 MiB
    - Device Name    : NVIDIA RTX A6000


  teacher_checkpoint = torch.load(teacher_model_path, map_location=device)


✅ Loaded Period 2 teacher weights (FC excluded)

📊 Class Weights (normalized):
  - Class 0: 0.1141
  - Class 1: 0.0626
  - Class 2: 0.1451
  - Class 3: 0.0858
  - Class 4: 0.5299
  - Class 5: 0.0626

🚀 'train_with_mse_ecg' started.
✅ Removed existing folder: /mnt/mydisk/Continual_Learning_JL/Continual_Learning/Class_Incremental_CL/CPSC_CIL/Trained_models/MSE_CIL_v2/Period_3

✅ Data Overview:
X_train: torch.Size([5120, 5000, 12]), y_train: torch.Size([5120])
X_val: torch.Size([1281, 5000, 12]), y_val: torch.Size([1281])
Epoch 1/200, Train Loss: 6.687473, Train-Class-Acc: {0: '67.03%', 1: '38.67%', 2: '48.35%', 3: '52.05%', 4: '74.68%', 5: '70.10%'}
Val Loss: 1.096466, Val Acc: 68.46%, Val-Class-Acc: {0: '92.93%', 1: '26.27%', 2: '88.19%', 3: '70.90%', 4: '85.00%', 5: '85.03%'}, LR: 0.001000
✅ Saved model: /mnt/mydisk/Continual_Learning_JL/Continual_Learning/Class_Incremental_CL/CPSC_CIL/Trained_models/MSE_CIL_v2/Period_3/ResNet18_1D_epoch_1.pth
Epoch 2/200, Train Loss: 2.923364, Train-C

### Period 4

In [20]:
# ================================
# 📌 Period 4: MSE Distillation Training
# ================================
period = 4
prev_period = 3

# ==== Paths ====
stop_signal_file = os.path.join(BASE_DIR, "stop_training.txt")
model_saving_folder = os.path.join(BASE_DIR, "Trained_models", "MSE_CIL_v1", f"Period_{period}")
ensure_folder(model_saving_folder)

# ==== Load Period 4 Data ====
X_train = np.load(os.path.join(save_dir, f"X_train_p{period}.npy"))
y_train = np.load(os.path.join(save_dir, f"y_train_p{period}.npy"))
X_val   = np.load(os.path.join(save_dir, f"X_test_p{period}.npy"))
y_val   = np.load(os.path.join(save_dir, f"y_test_p{period}.npy"))

# ==== Device ====
device = auto_select_cuda_device()

# ==== Model Configuration ====
input_channels = X_train.shape[2]  # 12 leads
output_size = int(np.max(y_train)) + 1  # avoid indexing problems due to missing class 1
model_name = "ResNet18_1D"

student_model = ResNet18_1D(input_channels=input_channels, output_size=output_size).to(device)

# ==== Load Period 3 Best Teacher Model ====
teacher_model_path = os.path.join(BASE_DIR, "Trained_models", "MSE_CIL_v1", "Period_3", f"{model_name}_best.pth")
teacher_checkpoint = torch.load(teacher_model_path, map_location=device)
teacher_output_size = len(np.unique(np.load(os.path.join(save_dir, f"y_train_p{prev_period}.npy"))))
teacher_model = ResNet18_1D(input_channels=input_channels, output_size=teacher_output_size).to(device)

# Selectively load matching parameters (ignore FC mismatch)
teacher_state = teacher_checkpoint["model_state_dict"]
model_state = teacher_model.state_dict()
filtered_teacher = {k: v for k, v in teacher_state.items() if k in model_state and model_state[k].shape == v.shape}
teacher_model.load_state_dict(filtered_teacher, strict=False)
teacher_model.eval()
print("✅ Loaded Period 3 teacher weights (FC excluded)")

# ==== Define Stable Classes ====
stable_classes = [0, 2, 3, 4, 5]  # ONLY NSR is retained as stable class

# ==== Training Hyperparameters ====
learning_rate = 1e-3
weight_decay = 1e-5
alpha = 0.1
num_epochs = 200
batch_size = 64

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(student_model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.9, patience=10)

# ==== Train ====
train_with_mse_ecg(
    student_model=student_model,
    output_size=output_size,
    criterion=criterion,
    optimizer=optimizer,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    stable_classes=stable_classes,
    teacher_model=teacher_model,
    alpha=alpha,
    scheduler=scheduler,
    num_epochs=num_epochs,
    batch_size=batch_size,
    model_saving_folder=model_saving_folder,
    model_name=model_name,
    stop_signal_file=stop_signal_file,
    device=device
)

# ================================
# ✅ Cleanup
# ================================
del X_train, y_train, X_val, y_val
del teacher_model_path, teacher_checkpoint, teacher_state, model_state, filtered_teacher
del student_model, teacher_model
gc.collect()
torch.cuda.empty_cache()


🎯 Automatically selected GPU:
    - CUDA Device ID : 1
    - Memory Used    : 951 MiB
    - Device Name    : NVIDIA RTX A6000
✅ Loaded Period 3 teacher weights (FC excluded)

🚀 'train_with_mse_ecg' started.
✅ Removed existing folder: /mnt/mydisk/Continual_Learning_JL/Continual_Learning/Class_Incremental_CL/CPSC_CIL/Trained_models/MSE_CIL_v1/Period_4


  teacher_checkpoint = torch.load(teacher_model_path, map_location=device)



✅ Data Overview:
X_train: torch.Size([5493, 5000, 12]), y_train: torch.Size([5493])
X_val: torch.Size([1374, 5000, 12]), y_val: torch.Size([1374])
Epoch 1/200, Train Loss: 5.773285, Train-Class-Acc: {0: '62.67%', 2: '50.43%', 3: '58.30%', 4: '49.37%', 5: '84.37%', 6: '13.13%', 7: '43.31%', 8: '42.83%', 9: '6.76%'}
Val Loss: 1.125484, Val Acc: 73.00%, Val-Class-Acc: {0: '63.59%', 2: '94.44%', 3: '72.95%', 4: '87.50%', 5: '94.63%', 6: '33.33%', 7: '59.20%', 8: '68.79%', 9: '5.41%'}, LR: 0.001000
✅ Saved model: /mnt/mydisk/Continual_Learning_JL/Continual_Learning/Class_Incremental_CL/CPSC_CIL/Trained_models/MSE_CIL_v1/Period_4/ResNet18_1D_epoch_1.pth
Epoch 2/200, Train Loss: 2.958663, Train-Class-Acc: {0: '68.26%', 2: '83.02%', 3: '87.30%', 4: '73.42%', 5: '90.50%', 6: '26.96%', 7: '61.68%', 8: '66.72%', 9: '20.27%'}
Val Loss: 0.939714, Val Acc: 76.78%, Val-Class-Acc: {0: '80.98%', 2: '92.36%', 3: '91.80%', 4: '60.00%', 5: '81.19%', 6: '58.33%', 7: '68.80%', 8: '61.78%', 9: '18.92%'}, LR

In [23]:
# ================================
# 📌 Period 4: MSE Distillation Training
# weight = class_weights_tensor
# ================================
period = 4
prev_period = 3

# ==== Paths ====
stop_signal_file = os.path.join(BASE_DIR, "stop_training.txt")
model_saving_folder = os.path.join(BASE_DIR, "Trained_models", "MSE_CIL_v2", f"Period_{period}")
ensure_folder(model_saving_folder)

# ==== Load Period 4 Data ====
X_train = np.load(os.path.join(save_dir, f"X_train_p{period}.npy"))
y_train = np.load(os.path.join(save_dir, f"y_train_p{period}.npy"))
X_val   = np.load(os.path.join(save_dir, f"X_test_p{period}.npy"))
y_val   = np.load(os.path.join(save_dir, f"y_test_p{period}.npy"))

# ==== Device ====
device = auto_select_cuda_device()

# ==== Model Configuration ====
input_channels = X_train.shape[2]  # 12 leads
output_size = int(np.max(y_train)) + 1  # avoid indexing problems due to missing class 1
model_name = "ResNet18_1D"

student_model = ResNet18_1D(input_channels=input_channels, output_size=output_size).to(device)

# ==== Load Period 3 Best Teacher Model ====
teacher_model_path = os.path.join(BASE_DIR, "Trained_models", "MSE_CIL_v2", "Period_3", f"{model_name}_best.pth")
teacher_checkpoint = torch.load(teacher_model_path, map_location=device)
teacher_output_size = len(np.unique(np.load(os.path.join(save_dir, f"y_train_p{prev_period}.npy"))))
teacher_model = ResNet18_1D(input_channels=input_channels, output_size=teacher_output_size).to(device)

# Selectively load matching parameters (ignore FC mismatch)
teacher_state = teacher_checkpoint["model_state_dict"]
model_state = teacher_model.state_dict()
filtered_teacher = {k: v for k, v in teacher_state.items() if k in model_state and model_state[k].shape == v.shape}
teacher_model.load_state_dict(filtered_teacher, strict=False)
teacher_model.eval()
print("✅ Loaded Period 3 teacher weights (FC excluded)")

# ==== Define Stable Classes ====
stable_classes = [0, 2, 3, 4, 5]  # ONLY NSR is retained as stable class

# ==== Training Hyperparameters ====
learning_rate = 1e-3
weight_decay = 1e-5
alpha = 0.1
num_epochs = 200
batch_size = 64

exclude_labels = [1]  # 假設 y_train 沒有 class 1，但 output_size 還是包含它
class_weights_tensor = compute_class_weights(y_train, output_size, exclude_classes=exclude_labels)
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor.to(device))
optimizer = torch.optim.Adam(student_model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.9, patience=10)

# ==== Train ====
train_with_mse_ecg(
    student_model=student_model,
    output_size=output_size,
    criterion=criterion,
    optimizer=optimizer,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    stable_classes=stable_classes,
    teacher_model=teacher_model,
    alpha=alpha,
    scheduler=scheduler,
    num_epochs=num_epochs,
    batch_size=batch_size,
    model_saving_folder=model_saving_folder,
    model_name=model_name,
    stop_signal_file=stop_signal_file,
    device=device
)

# ================================
# ✅ Cleanup
# ================================
del X_train, y_train, X_val, y_val
del teacher_model_path, teacher_checkpoint, teacher_state, model_state, filtered_teacher
del student_model, teacher_model
del class_weights_tensor
gc.collect()
torch.cuda.empty_cache()


🎯 Automatically selected GPU:
    - CUDA Device ID : 1
    - Memory Used    : 911 MiB
    - Device Name    : NVIDIA RTX A6000


  teacher_checkpoint = torch.load(teacher_model_path, map_location=device)


✅ Loaded Period 3 teacher weights (FC excluded)

📊 Class Weights (normalized):
  - Class 0: 0.0571
  - Class 1: 0.0000 (excluded)
  - Class 2: 0.0727
  - Class 3: 0.0430
  - Class 4: 0.2654
  - Class 5: 0.0314
  - Class 6: 0.0966
  - Class 7: 0.0837
  - Class 8: 0.0668
  - Class 9: 0.2833

🚀 'train_with_mse_ecg' started.
✅ Removed existing folder: /mnt/mydisk/Continual_Learning_JL/Continual_Learning/Class_Incremental_CL/CPSC_CIL/Trained_models/MSE_CIL_v2/Period_4

✅ Data Overview:
X_train: torch.Size([5493, 5000, 12]), y_train: torch.Size([5493])
X_val: torch.Size([1374, 5000, 12]), y_val: torch.Size([1374])
Epoch 1/200, Train Loss: 7.124712, Train-Class-Acc: {0: '43.19%', 2: '28.77%', 3: '28.07%', 4: '67.09%', 5: '83.47%', 6: '19.59%', 7: '47.11%', 8: '32.64%', 9: '27.03%'}
Val Loss: 1.545699, Val Acc: 64.99%, Val-Class-Acc: {0: '79.89%', 2: '31.25%', 3: '48.77%', 4: '77.50%', 5: '91.64%', 6: '34.26%', 7: '71.20%', 8: '70.70%', 9: '18.92%'}, LR: 0.001000
✅ Saved model: /mnt/mydisk/Con

##  Compute FWT

In [15]:
def compute_fwt_ecg(previous_model, init_model, X_val, y_val, known_classes, batch_size=64):
    """
    FWT computation for ECG-style inputs with 1D CNN (e.g., ResNet18_1D).
    X_val: shape [B, T, C]  (e.g., [N, 5000, 12])
    y_val: shape [B]        (e.g., [N])
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    previous_model.to(device).eval()
    init_model.to(device).eval()

    # 只選取 known classes
    mask = np.isin(y_val, known_classes)
    X_known = X_val[mask]
    y_known = y_val[mask]

    if len(y_known) == 0:
        print(f"⚠️ No validation samples for known classes {known_classes}.")
        return None, None, None

    print(f"📋 Total samples for known classes {known_classes}: {len(y_known)}")

    dataset = TensorDataset(
        torch.tensor(X_known, dtype=torch.float32),
        torch.tensor(y_known, dtype=torch.long)
    )
    loader = DataLoader(dataset, batch_size=batch_size)

    correct_prev, correct_init, total = 0, 0, 0

    with torch.no_grad():
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)

            out_prev = previous_model(xb)  # [B, C]
            out_init = init_model(xb)

            preds_prev = torch.argmax(out_prev, dim=-1)
            preds_init = torch.argmax(out_init, dim=-1)

            correct_prev += (preds_prev == yb).sum().item()
            correct_init += (preds_init == yb).sum().item()
            total += yb.size(0)

    acc_prev = 100 * correct_prev / total
    acc_init = 100 * correct_init / total
    fwt_value = acc_prev - acc_init

    print(f"\n### 🔍 FWT Debug Info:")
    print(f"- Total evaluated samples: {total}")
    print(f"- Accuracy by previous model: {acc_prev:.2f}%")
    print(f"- Accuracy by init model:     {acc_init:.2f}%")
    print(f"- FWT = Acc_prev - Acc_init = {fwt_value:.2f}%")

    return fwt_value, acc_prev, acc_init


### Period 2

In [17]:
X_val   = np.load(os.path.join(save_dir, f"X_test_p{2}.npy"))
y_val   = np.load(os.path.join(save_dir, f"y_test_p{2}.npy"))
known_classes = [0,1]

device = auto_select_cuda_device()
input_channels = X_val.shape[2]
output_size_prev = 2

# === Previous Model ===
prev_model_path = os.path.join(BASE_DIR, "ResNet18_Selection", "ResNet18_big_inplane_v1", "ResNet18_big_inplane_1D_best.pth")
previous_model = ResNet18_1D(input_channels=input_channels, output_size=output_size_prev).to(device)
checkpoint = torch.load(prev_model_path, map_location=device)
previous_model.load_state_dict(checkpoint["model_state_dict"])

# === Init Model ===
init_model = ResNet18_1D(input_channels=input_channels, output_size=output_size_prev).to(device)

# === Tensor Conversion ===
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

# === FWT Calculation ===
fwt, acc_prev, acc_init = compute_fwt_ecg(previous_model, init_model, X_val_tensor, y_val_tensor, known_classes)


🎯 Automatically selected GPU:
    - CUDA Device ID : 0
    - Memory Used    : 7703 MiB
    - Device Name    : NVIDIA RTX A6000


  checkpoint = torch.load(prev_model_path, map_location=device)


📋 Total samples for known classes [0, 1]: 428


  torch.tensor(X_known, dtype=torch.float32),
  torch.tensor(y_known, dtype=torch.long)



### 🔍 FWT Debug Info:
- Total evaluated samples: 428
- Accuracy by previous model: 89.25%
- Accuracy by init model:     42.99%
- FWT = Acc_prev - Acc_init = 46.26%


### Period 3

In [24]:
X_val   = np.load(os.path.join(save_dir, f"X_test_p{3}.npy"))
y_val   = np.load(os.path.join(save_dir, f"y_test_p{3}.npy"))
known_classes = [0,1,2,3]

device = auto_select_cuda_device()
input_channels = X_val.shape[2]
output_size_prev = 4

# === Previous Model ===
prev_model_path = os.path.join(BASE_DIR, "Trained_models", "MSE_CIL_v1", "Period_2", "ResNet18_1D_best.pth")
previous_model = ResNet18_1D(input_channels=input_channels, output_size=output_size_prev).to(device)
checkpoint = torch.load(prev_model_path, map_location=device)
previous_model.load_state_dict(checkpoint["model_state_dict"])

# === Init Model ===
init_model = ResNet18_1D(input_channels=input_channels, output_size=output_size_prev).to(device)

# === Tensor Conversion ===
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

# === FWT Calculation ===
fwt, acc_prev, acc_init = compute_fwt_ecg(previous_model, init_model, X_val_tensor, y_val_tensor, known_classes)


🎯 Automatically selected GPU:
    - CUDA Device ID : 0
    - Memory Used    : 7745 MiB
    - Device Name    : NVIDIA RTX A6000


  checkpoint = torch.load(prev_model_path, map_location=device)


📋 Total samples for known classes [0, 1, 2, 3]: 907


  torch.tensor(X_known, dtype=torch.float32),
  torch.tensor(y_known, dtype=torch.long)



### 🔍 FWT Debug Info:
- Total evaluated samples: 907
- Accuracy by previous model: 89.75%
- Accuracy by init model:     25.91%
- FWT = Acc_prev - Acc_init = 63.84%


### Period 4

In [30]:
X_val   = np.load(os.path.join(save_dir, f"X_test_p{4}.npy"))
y_val   = np.load(os.path.join(save_dir, f"y_test_p{4}.npy"))
known_classes = [0,1,2,3,4,5]

device = auto_select_cuda_device()
input_channels = X_val.shape[2]
output_size_prev = 6

# === Previous Model ===
prev_model_path = os.path.join(BASE_DIR, "Trained_models", "MSE_CIL_v1", "Period_3", "ResNet18_1D_best.pth")
previous_model = ResNet18_1D(input_channels=input_channels, output_size=output_size_prev).to(device)
checkpoint = torch.load(prev_model_path, map_location=device)
previous_model.load_state_dict(checkpoint["model_state_dict"])

# === Init Model ===
init_model = ResNet18_1D(input_channels=input_channels, output_size=output_size_prev).to(device)

# === Tensor Conversion ===
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

# === FWT Calculation ===
fwt, acc_prev, acc_init = compute_fwt_ecg(previous_model, init_model, X_val_tensor, y_val_tensor, known_classes)


🎯 Automatically selected GPU:
    - CUDA Device ID : 0
    - Memory Used    : 7775 MiB
    - Device Name    : NVIDIA RTX A6000


  checkpoint = torch.load(prev_model_path, map_location=device)


📋 Total samples for known classes [0, 1, 2, 3, 4, 5]: 947


  torch.tensor(X_known, dtype=torch.float32),
  torch.tensor(y_known, dtype=torch.long)



### 🔍 FWT Debug Info:
- Total evaluated samples: 947
- Accuracy by previous model: 96.30%
- Accuracy by init model:     19.85%
- FWT = Acc_prev - Acc_init = 76.45%


## 📊 Summary: 

### ✔️ CPSC - MSE Distillation: Validation Summary

| Period | Training Time (s) | Validation Accuracy | Class-wise Accuracy                                                                 |
|--------|-------------------|---------------------|--------------------------------------------------------------------------------------|
| 1      | 134.66            | **88.86%**          | {0: 91.85%, 1: 85.87%}                                                              |
| 2      | 386.38            | **89.09%**          | {0: 92.93%, 1: 79.92%, 2: 92.36%, 3: 93.44%}                                        |
| 3      | 615.24            | **86.89%**          | {0: 76.63%, 1: 83.88%, 2: 84.72%, 3: 92.62%, 4: 82.50%, 5: 92.81%}                  |
| 4      | 662.07            | **86.24%**          | {0: 79.35%, 2: 91.67%, 3: 97.54%, 4: 85.00%, 5: 96.12%, 6: 50.00%, 7: 81.60%, 8: 86.62%, 9: 56.76%} |


### 🧠 Continual Learning Metrics

| Period | AA_old (%) | AA_new (%) | BWT (%) | FWT (%) | FWT Classes        | Prev. Model Acc | Init Model Acc |
|--------|------------|------------|---------|---------|---------------------|------------------|-----------------|
| 2      | 86.42%     | 92.90%     | -2.44%  | 46.26%  | [0, 1]              | 89.25%           | 42.99%          |
| 3      | 84.46%     | 87.66%     | -5.20%  | 63.84%  | [0, 1, 2, 3]        | 89.75%           | 25.91%          |
| 4      | 89.94%     | 68.75%     | +4.08%  | 76.45%  | [0, 1, 2, 3, 4, 5]  | 96.30%           | 19.85%          |


### 📦 Model Size per Period

| Period | Output Size | Total Params | Δ Params vs Prev | Δ % vs Prev | Model Size (float32) |
|--------|-------------|--------------|------------------|-------------|-----------------------|
| 1      | 2           | 3,857,026    | —                | —           | 14.71 MB              |
| 2      | 4           | 3,859,076    | +2,050           | +0.05%      | 14.72 MB              |
| 3      | 6           | 3,861,126    | +2,050           | +0.05%      | 14.73 MB              |
| 4      | 10          | 3,865,226    | +4,100           | +0.11%      | 14.74 MB              |

**📈 Model Growth Rate (MGR) = (3,865,226 - 3,857,026) / (3,857,026 × 3) ≈ +0.07%**

**📈 Max trainable ratio = 100.00%**
