# Knowledge Distillation Experiments - Unified Framework

## Master's Thesis: Robust Knowledge Distillation for Compact Vision Models

**Author:** Gheith Alrawahi  
**Institution:** Nankai University  
**Supervisor:** Prof. Jing Wang

---

### Experiments Overview:

| ID   | Name       | Method       | Key Feature                     |
| :--- | :--------- | :----------- | :------------------------------ |
| v1   | Baseline   | Standard KD  | Mixup + CutMix only             |
| v2   | Enhanced   | Standard KD  | + AutoAugment + Label Smoothing |
| v3   | DKD β=8.0  | Decoupled KD | Default DKD parameters          |
| v3.1 | DKD β=2.0  | Decoupled KD | Tuned beta parameter            |
| v4   | Saturation | Standard KD  | Strong teacher + Standard KD    |

---


## 1. Setup and Configuration


In [1]:
# Cell 1: Imports and Setup
import os
import sys
import time
import copy
from pathlib import Path
from datetime import datetime

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from tqdm import tqdm

# Local imports
from config import (
    RESULTS_DIR, MODELS_DIR, CHECKPOINTS_DIR,
    get_experiment, ALL_EXPERIMENTS,
    ExperimentConfig
)
from utils import (
    set_seed, mixup_data, cutmix_data,
    kd_loss_with_mixup, evaluate_model,
    save_checkpoint, load_checkpoint, cleanup_checkpoints,
    TrainingLogger  # Use unified logger
)
from data import get_dataloaders
from models import create_teacher_model, create_student_model, load_teacher_model

# Check GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

Device: cuda
GPU: NVIDIA GeForce RTX 5070 Laptop GPU
Memory: 8.5 GB


In [2]:
# Cell 2: Select Experiment
# ============================================================
# CHANGE THIS TO RUN DIFFERENT EXPERIMENTS
# Options: "v1", "v2", "v3", "v3.1", "v4"
# ============================================================

EXPERIMENT_NAME = "v4"  # <-- CHANGE THIS

# ============================================================

# Load configuration
config = get_experiment(EXPERIMENT_NAME)

print("=" * 60)
print(f"EXPERIMENT: {config.experiment_id}")
print(f"Name: {config.experiment_name}")
print(f"Description: {config.description}")
print("=" * 60)
print(f"\nMethod: {config.distillation.method}")
print(f"Temperature: {config.distillation.temperature}")
if config.distillation.method == "dkd":
    print(f"DKD Alpha: {config.distillation.dkd_alpha}")
    print(f"DKD Beta: {config.distillation.dkd_beta}")
else:
    print(f"KD Alpha: {config.distillation.alpha}")
print(f"\nAugmentation:")
print(f"  AutoAugment: {config.augmentation.auto_augment}")
print(f"  RandomErasing: {config.augmentation.random_erasing}")
print(f"  Mixup: {config.augmentation.mixup}")
print(f"  CutMix: {config.augmentation.cutmix}")
print(f"\nTraining:")
print(f"  Epochs: {config.base.num_epochs}")
print(f"  Batch Size: {config.base.batch_size}")
print(f"  Learning Rate: {config.base.learning_rate}")
print(f"  Early Stopping Patience: {config.base.patience}")

EXPERIMENT: v4_saturation
Name: Saturation Test
Description: Strong Teacher (v3) + Standard KD to test capacity saturation

Method: standard_kd
Temperature: 4.0
KD Alpha: 0.7

Augmentation:
  AutoAugment: True
  RandomErasing: True
  Mixup: True
  CutMix: True

Training:
  Epochs: 200
  Batch Size: 128
  Learning Rate: 0.001
  Early Stopping Patience: 30


In [3]:
# Cell 3: Set Seed and Initialize Logger
set_seed(config.base.seed)

# Get results directory with timestamp (prevents overwriting)
# Set use_timestamp=False if you want to overwrite previous results
USE_TIMESTAMP = True  # <-- Set to False to overwrite previous runs

results_dir = config.get_results_dir(use_timestamp=USE_TIMESTAMP)
print(f"Results will be saved to: {results_dir}")

# Initialize Student Logger (same structure as Teacher)
logger = TrainingLogger(config._run_id, RESULTS_DIR, model_type="student")

# Save configuration
config.save()

Random seed set to: 42
Results will be saved to: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910
Logger initialized: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910
Config saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910\config.json


## 2. Data Loading


In [4]:
# Cell 4: Load Data
train_loader, test_loader = get_dataloaders(
    aug_config=config.augmentation,
    batch_size=config.base.batch_size,
    num_workers=config.base.num_workers,
    pin_memory=config.base.pin_memory
)

Data loaded:
  Training samples: 50000
  Test samples: 10000
  Batch size: 128
  Resolution: 32x32
  Augmentation: AutoAugment=True, RandomErasing=True, Mixup=True, CutMix=True


## 3. Model Initialization


In [5]:
# Cell 5: Create/Load Teacher Model
print("\n" + "=" * 60)
print("TEACHER MODEL")
print("=" * 60)

# V4 uses a stronger teacher trained at 64x64 resolution
IS_V4 = EXPERIMENT_NAME == "v4"

if IS_V4:
    from config import TEACHER_RES_V4, BATCH_SIZE_V4
    teacher_path = MODELS_DIR / "teacher_v4_64x64.pth"
    teacher_resolution = TEACHER_RES_V4
    print(f"V4 Experiment: Using stronger teacher at {teacher_resolution}x{teacher_resolution} resolution")
else:
    teacher_path = MODELS_DIR / "teacher_trained.pth"
    teacher_resolution = 32  # Standard 32x32

if teacher_path.exists():
    print(f"Loading existing teacher from: {teacher_path}")
    teacher_model = load_teacher_model(str(teacher_path), device=device)
    
    # Need to evaluate with correct resolution
    if IS_V4:
        _, teacher_test_loader = get_dataloaders(
            aug_config=config.augmentation,
            batch_size=BATCH_SIZE_V4,
            num_workers=config.base.num_workers,
            pin_memory=config.base.pin_memory,
            resize_to=teacher_resolution
        )
        teacher_results = evaluate_model(teacher_model, teacher_test_loader, device)
    else:
        teacher_results = evaluate_model(teacher_model, test_loader, device)
    
    teacher_accuracy = teacher_results['accuracy']
    print(f"Teacher Accuracy: {teacher_accuracy:.2f}%")
    
    TRAIN_TEACHER = False
else:
    print(f"No trained teacher found at: {teacher_path}")
    print("Will train from scratch.")
    teacher_model = create_teacher_model(device=device)
    teacher_accuracy = 0.0
    TRAIN_TEACHER = True


TEACHER MODEL
V4 Experiment: Using stronger teacher at 64x64 resolution
No trained teacher found at: d:\Projects\KnowledgeDistillation\code_v2_32\models\teacher_v4_64x64.pth
Will train from scratch.
Teacher Model: EfficientNetV2-L
  Parameters: 117,362,372 (117.36M)
  Size: 449.66 MB
  Pretrained: True


In [None]:
# Cell 6: Train Teacher (if needed)
if TRAIN_TEACHER:
    print("\n" + "=" * 60)
    print("TRAINING TEACHER MODEL")
    print("=" * 60)
    
    # Teacher training configuration
    TEACHER_EPOCHS = config.base.num_epochs
    
    # V4 uses higher resolution and smaller batch size
    if IS_V4:
        print(f"V4 Mode: Training at {teacher_resolution}x{teacher_resolution} resolution")
        teacher_batch_size = BATCH_SIZE_V4
        
        # Load data at higher resolution for teacher training
        teacher_train_loader, teacher_test_loader = get_dataloaders(
            aug_config=config.augmentation,
            batch_size=teacher_batch_size,
            num_workers=config.base.num_workers,
            pin_memory=config.base.pin_memory,
            resize_to=teacher_resolution
        )
    else:
        teacher_batch_size = config.base.batch_size
        teacher_train_loader = train_loader
        teacher_test_loader = test_loader
    
    # Create checkpoint directory for teacher
    teacher_checkpoint_dir = CHECKPOINTS_DIR / ("teacher_v4" if IS_V4 else "teacher")
    teacher_checkpoint_dir.mkdir(parents=True, exist_ok=True)
    print(f"Checkpoints: {teacher_checkpoint_dir}")
    
    # Initialize Teacher Logger (same structure as Student)
    teacher_logger = TrainingLogger("teacher_v4" if IS_V4 else "teacher", RESULTS_DIR, model_type="teacher")
    teacher_logger.start_training()
    
    # Optimizer and scheduler
    teacher_optimizer = optim.AdamW(
        teacher_model.parameters(),
        lr=config.base.learning_rate,
        weight_decay=config.base.weight_decay
    )
    teacher_scheduler = optim.lr_scheduler.CosineAnnealingLR(
        teacher_optimizer,
        T_max=TEACHER_EPOCHS - config.base.warmup_epochs
    )
    
    # Training setup
    scaler = torch.amp.GradScaler('cuda')
    best_teacher_acc = 0.0
    best_teacher_weights = None
    epochs_no_improve = 0
    
    for epoch in range(TEACHER_EPOCHS):
        teacher_model.train()
        running_loss = 0.0
        
        # Learning rate warmup
        if epoch < config.base.warmup_epochs:
            warmup_lr = config.base.learning_rate * (epoch + 1) / config.base.warmup_epochs
            for param_group in teacher_optimizer.param_groups:
                param_group['lr'] = warmup_lr
        
        loop = tqdm(teacher_train_loader, desc=f"Teacher Epoch {epoch+1}/{TEACHER_EPOCHS}")
        
        for inputs, labels in loop:
            inputs, labels = inputs.to(device), labels.to(device)
            
            # Apply Mixup or CutMix
            if config.augmentation.mixup or config.augmentation.cutmix:
                if np.random.rand() > 0.5 and config.augmentation.cutmix:
                    inputs, labels_a, labels_b, lam = cutmix_data(
                        inputs.clone(), labels, config.augmentation.cutmix_alpha, device
                    )
                elif config.augmentation.mixup:
                    inputs, labels_a, labels_b, lam = mixup_data(
                        inputs, labels, config.augmentation.mixup_alpha, device
                    )
                else:
                    labels_a, labels_b, lam = labels, labels, 1.0
            else:
                labels_a, labels_b, lam = labels, labels, 1.0
            
            teacher_optimizer.zero_grad()
            
            with torch.amp.autocast('cuda'):
                outputs = teacher_model(inputs)
                loss = lam * nn.functional.cross_entropy(outputs, labels_a, label_smoothing=config.distillation.label_smoothing) + \
                       (1 - lam) * nn.functional.cross_entropy(outputs, labels_b, label_smoothing=config.distillation.label_smoothing)
            
            scaler.scale(loss).backward()
            scaler.unscale_(teacher_optimizer)
            nn.utils.clip_grad_norm_(teacher_model.parameters(), config.base.grad_clip)
            scaler.step(teacher_optimizer)
            scaler.update()
            
            running_loss += loss.item()
            loop.set_postfix(loss=f"{loss.item():.4f}")
        
        # Step scheduler after warmup
        if epoch >= config.base.warmup_epochs:
            teacher_scheduler.step()
        
        # Validation
        train_loss = running_loss / len(teacher_train_loader)
        val_results = evaluate_model(teacher_model, teacher_test_loader, device)
        val_acc = val_results['accuracy']
        val_loss = val_results['loss']
        current_lr = teacher_optimizer.param_groups[0]['lr']
        
        # Log epoch (same as student)
        is_best = teacher_logger.log_epoch(epoch + 1, train_loss, val_loss, val_acc, current_lr)
        
        print(f"Epoch {epoch+1}/{TEACHER_EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}% | LR: {current_lr:.6f}")
        
        # Save best model
        if is_best:
            best_teacher_acc = val_acc
            best_teacher_weights = copy.deepcopy(teacher_model.state_dict())
            epochs_no_improve = 0
            print(f"  * New best teacher! Accuracy: {best_teacher_acc:.2f}%")
            
            # Save best teacher immediately
            torch.save({
                'model_state_dict': best_teacher_weights,
                'accuracy': best_teacher_acc,
                'epoch': epoch + 1,
                'resolution': teacher_resolution
            }, teacher_path)
        else:
            epochs_no_improve += 1
        
        # Save checkpoint and history every N epochs
        if (epoch + 1) % config.base.checkpoint_frequency == 0:
            # Save checkpoint
            checkpoint_path = teacher_checkpoint_dir / f"teacher_epoch_{epoch+1}.pth"
            torch.save({
                'model_state_dict': teacher_model.state_dict(),
                'optimizer_state_dict': teacher_optimizer.state_dict(),
                'scheduler_state_dict': teacher_scheduler.state_dict(),
                'epoch': epoch + 1,
                'best_accuracy': best_teacher_acc,
                'resolution': teacher_resolution
            }, checkpoint_path)
            print(f"  Checkpoint saved: {checkpoint_path.name}")
            
            # Save history (same as student)
            teacher_logger.save_checkpoint_history()
            
            # Cleanup old checkpoints
            cleanup_checkpoints(teacher_checkpoint_dir, keep=config.base.keep_checkpoints)
        
        # Early stopping
        if epochs_no_improve >= config.base.patience:
            print(f"Early stopping at epoch {epoch+1}")
            break
    
    # Training complete - save final results
    teacher_model.load_state_dict(best_teacher_weights)
    teacher_accuracy = best_teacher_acc
    
    # Save final model
    torch.save({
        'model_state_dict': best_teacher_weights,
        'accuracy': best_teacher_acc,
        'resolution': teacher_resolution
    }, teacher_path)
    
    # Save final results (without resolution - not supported by TrainingLogger)
    teacher_logger.save_final_results(
        model_name="EfficientNetV2-L",
        total_epochs=epoch + 1,
        early_stopped=(epochs_no_improve >= config.base.patience)
    )
    
    print(f"\nV4 Teacher trained at {teacher_resolution}x{teacher_resolution}: {teacher_accuracy:.2f}%")


TRAINING TEACHER MODEL
V4 Mode: Training at 64x64 resolution
Data loaded:
  Training samples: 50000
  Test samples: 10000
  Batch size: 64
  Resolution: 64x64
  Augmentation: AutoAugment=True, RandomErasing=True, Mixup=True, CutMix=True
Checkpoints: d:\Projects\KnowledgeDistillation\code_v2_32\checkpoints\teacher_v4
Logger initialized: d:\Projects\KnowledgeDistillation\code_v2_32\results\teacher_v4


Teacher Epoch 1/200: 100%|██████████| 781/781 [04:11<00:00,  3.11it/s, loss=3.9575]


Epoch 1/200 | Train Loss: 3.9317 | Val Loss: 1.8641 | Val Acc: 54.54% | LR: 0.000200
  * New best teacher! Accuracy: 54.54%


Teacher Epoch 2/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=3.8774]


Epoch 2/200 | Train Loss: 3.5671 | Val Loss: 1.6910 | Val Acc: 58.92% | LR: 0.000400
  * New best teacher! Accuracy: 58.92%


Teacher Epoch 3/200: 100%|██████████| 781/781 [04:09<00:00,  3.13it/s, loss=3.5266]


Epoch 3/200 | Train Loss: 3.4446 | Val Loss: 1.6906 | Val Acc: 60.07% | LR: 0.000600
  * New best teacher! Accuracy: 60.07%


Teacher Epoch 4/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=3.4915]


Epoch 4/200 | Train Loss: 3.4334 | Val Loss: 1.6964 | Val Acc: 58.95% | LR: 0.000800


Teacher Epoch 5/200: 100%|██████████| 781/781 [04:19<00:00,  3.01it/s, loss=3.5173]


Epoch 5/200 | Train Loss: 3.4201 | Val Loss: 1.7714 | Val Acc: 57.67% | LR: 0.001000


Teacher Epoch 6/200: 100%|██████████| 781/781 [04:05<00:00,  3.18it/s, loss=3.2293]


Epoch 6/200 | Train Loss: 3.3455 | Val Loss: 1.6814 | Val Acc: 60.38% | LR: 0.001000
  * New best teacher! Accuracy: 60.38%


Teacher Epoch 7/200: 100%|██████████| 781/781 [04:15<00:00,  3.06it/s, loss=2.6242]


Epoch 7/200 | Train Loss: 3.2778 | Val Loss: 1.5807 | Val Acc: 63.07% | LR: 0.001000
  * New best teacher! Accuracy: 63.07%


Teacher Epoch 8/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=3.6192]


Epoch 8/200 | Train Loss: 3.2306 | Val Loss: 1.5524 | Val Acc: 63.84% | LR: 0.000999
  * New best teacher! Accuracy: 63.84%


Teacher Epoch 9/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=3.8496]


Epoch 9/200 | Train Loss: 3.1910 | Val Loss: 1.5747 | Val Acc: 63.60% | LR: 0.000999


Teacher Epoch 10/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=2.3353]


Epoch 10/200 | Train Loss: 3.1477 | Val Loss: 1.5681 | Val Acc: 64.36% | LR: 0.000998
  * New best teacher! Accuracy: 64.36%
  Checkpoint saved: teacher_epoch_10.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\teacher_v4


Teacher Epoch 11/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=3.5355]


Epoch 11/200 | Train Loss: 3.1293 | Val Loss: 1.5190 | Val Acc: 66.76% | LR: 0.000998
  * New best teacher! Accuracy: 66.76%


Teacher Epoch 12/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=3.5821]


Epoch 12/200 | Train Loss: 3.1059 | Val Loss: 1.4881 | Val Acc: 66.45% | LR: 0.000997


Teacher Epoch 13/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=3.0841]


Epoch 13/200 | Train Loss: 3.1090 | Val Loss: 1.4479 | Val Acc: 67.45% | LR: 0.000996
  * New best teacher! Accuracy: 67.45%


Teacher Epoch 14/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=3.4406]


Epoch 14/200 | Train Loss: 3.0710 | Val Loss: 1.3351 | Val Acc: 69.06% | LR: 0.000995
  * New best teacher! Accuracy: 69.06%


Teacher Epoch 15/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=3.6529]


Epoch 15/200 | Train Loss: 3.0116 | Val Loss: 1.3999 | Val Acc: 68.56% | LR: 0.000994


Teacher Epoch 16/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=3.4168]


Epoch 16/200 | Train Loss: 3.0209 | Val Loss: 1.4102 | Val Acc: 69.07% | LR: 0.000992
  * New best teacher! Accuracy: 69.07%


Teacher Epoch 17/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=2.1795]


Epoch 17/200 | Train Loss: 2.9908 | Val Loss: 1.3477 | Val Acc: 69.32% | LR: 0.000991
  * New best teacher! Accuracy: 69.32%


Teacher Epoch 18/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=3.3965]


Epoch 18/200 | Train Loss: 3.0194 | Val Loss: 1.3630 | Val Acc: 68.54% | LR: 0.000989


Teacher Epoch 19/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=3.6229]


Epoch 19/200 | Train Loss: 2.9981 | Val Loss: 1.3761 | Val Acc: 69.70% | LR: 0.000987
  * New best teacher! Accuracy: 69.70%


Teacher Epoch 20/200: 100%|██████████| 781/781 [04:13<00:00,  3.09it/s, loss=3.4706]


Epoch 20/200 | Train Loss: 2.9955 | Val Loss: 1.4053 | Val Acc: 68.84% | LR: 0.000985
  Checkpoint saved: teacher_epoch_20.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\teacher_v4


Teacher Epoch 21/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=3.1091]


Epoch 21/200 | Train Loss: 2.9767 | Val Loss: 1.3723 | Val Acc: 69.80% | LR: 0.000983
  * New best teacher! Accuracy: 69.80%


Teacher Epoch 22/200: 100%|██████████| 781/781 [04:13<00:00,  3.09it/s, loss=2.0886]


Epoch 22/200 | Train Loss: 2.9493 | Val Loss: 1.4181 | Val Acc: 68.24% | LR: 0.000981


Teacher Epoch 23/200: 100%|██████████| 781/781 [04:13<00:00,  3.09it/s, loss=3.3345]


Epoch 23/200 | Train Loss: 2.9324 | Val Loss: 1.3765 | Val Acc: 69.41% | LR: 0.000979


Teacher Epoch 24/200: 100%|██████████| 781/781 [04:13<00:00,  3.09it/s, loss=2.8863]


Epoch 24/200 | Train Loss: 2.9400 | Val Loss: 1.2885 | Val Acc: 70.32% | LR: 0.000977
  * New best teacher! Accuracy: 70.32%


Teacher Epoch 25/200: 100%|██████████| 781/781 [04:13<00:00,  3.09it/s, loss=3.5161]


Epoch 25/200 | Train Loss: 2.9782 | Val Loss: 1.3597 | Val Acc: 69.57% | LR: 0.000974


Teacher Epoch 26/200: 100%|██████████| 781/781 [04:13<00:00,  3.09it/s, loss=1.9747]


Epoch 26/200 | Train Loss: 2.9169 | Val Loss: 1.2609 | Val Acc: 70.77% | LR: 0.000972
  * New best teacher! Accuracy: 70.77%


Teacher Epoch 27/200: 100%|██████████| 781/781 [04:12<00:00,  3.09it/s, loss=3.2397]


Epoch 27/200 | Train Loss: 2.9550 | Val Loss: 1.4396 | Val Acc: 68.49% | LR: 0.000969


Teacher Epoch 28/200: 100%|██████████| 781/781 [04:12<00:00,  3.09it/s, loss=3.4821]


Epoch 28/200 | Train Loss: 2.9336 | Val Loss: 1.4844 | Val Acc: 67.18% | LR: 0.000966


Teacher Epoch 29/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=3.0394]


Epoch 29/200 | Train Loss: 2.9046 | Val Loss: 1.4281 | Val Acc: 68.32% | LR: 0.000963


Teacher Epoch 30/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=3.5405]


Epoch 30/200 | Train Loss: 2.9320 | Val Loss: 1.3840 | Val Acc: 70.88% | LR: 0.000960
  * New best teacher! Accuracy: 70.88%
  Checkpoint saved: teacher_epoch_30.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\teacher_v4


Teacher Epoch 31/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=3.1808]


Epoch 31/200 | Train Loss: 2.9117 | Val Loss: 1.3466 | Val Acc: 69.52% | LR: 0.000957


Teacher Epoch 32/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=3.6444]


Epoch 32/200 | Train Loss: 2.8886 | Val Loss: 1.4082 | Val Acc: 68.73% | LR: 0.000953


Teacher Epoch 33/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=2.5264]


Epoch 33/200 | Train Loss: 2.9212 | Val Loss: 1.3230 | Val Acc: 70.42% | LR: 0.000950


Teacher Epoch 34/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=3.5168]


Epoch 34/200 | Train Loss: 2.9212 | Val Loss: 1.3775 | Val Acc: 70.66% | LR: 0.000946


Teacher Epoch 35/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=3.5942]


Epoch 35/200 | Train Loss: 2.9191 | Val Loss: 1.3321 | Val Acc: 70.88% | LR: 0.000943


Teacher Epoch 36/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=1.8382]


Epoch 36/200 | Train Loss: 2.8717 | Val Loss: 1.3078 | Val Acc: 71.34% | LR: 0.000939
  * New best teacher! Accuracy: 71.34%


Teacher Epoch 37/200: 100%|██████████| 781/781 [04:13<00:00,  3.09it/s, loss=2.6132]


Epoch 37/200 | Train Loss: 2.8995 | Val Loss: 1.3928 | Val Acc: 70.21% | LR: 0.000935


Teacher Epoch 38/200: 100%|██████████| 781/781 [04:12<00:00,  3.09it/s, loss=3.4706]


Epoch 38/200 | Train Loss: 2.9189 | Val Loss: 1.4026 | Val Acc: 70.48% | LR: 0.000931


Teacher Epoch 39/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=3.4772]


Epoch 39/200 | Train Loss: 2.8974 | Val Loss: 1.3498 | Val Acc: 71.26% | LR: 0.000927


Teacher Epoch 40/200: 100%|██████████| 781/781 [04:13<00:00,  3.09it/s, loss=3.0540]


Epoch 40/200 | Train Loss: 2.9021 | Val Loss: 1.3859 | Val Acc: 69.63% | LR: 0.000923
  Checkpoint saved: teacher_epoch_40.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\teacher_v4


Teacher Epoch 41/200: 100%|██████████| 781/781 [04:12<00:00,  3.09it/s, loss=3.0213]


Epoch 41/200 | Train Loss: 2.9009 | Val Loss: 1.4172 | Val Acc: 70.26% | LR: 0.000918


Teacher Epoch 42/200: 100%|██████████| 781/781 [04:12<00:00,  3.09it/s, loss=3.3006]


Epoch 42/200 | Train Loss: 2.9100 | Val Loss: 1.2714 | Val Acc: 72.22% | LR: 0.000914
  * New best teacher! Accuracy: 72.22%


Teacher Epoch 43/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=2.6605]


Epoch 43/200 | Train Loss: 2.9069 | Val Loss: 1.4134 | Val Acc: 69.04% | LR: 0.000909


Teacher Epoch 44/200: 100%|██████████| 781/781 [04:12<00:00,  3.09it/s, loss=2.5289]


Epoch 44/200 | Train Loss: 2.8585 | Val Loss: 1.4417 | Val Acc: 68.12% | LR: 0.000905


Teacher Epoch 45/200: 100%|██████████| 781/781 [04:11<00:00,  3.11it/s, loss=3.1614]


Epoch 45/200 | Train Loss: 2.9094 | Val Loss: 1.3204 | Val Acc: 71.27% | LR: 0.000900


Teacher Epoch 46/200: 100%|██████████| 781/781 [03:51<00:00,  3.37it/s, loss=1.7317]


Epoch 46/200 | Train Loss: 2.8978 | Val Loss: 1.3258 | Val Acc: 70.98% | LR: 0.000895


Teacher Epoch 47/200: 100%|██████████| 781/781 [04:20<00:00,  3.00it/s, loss=2.4761]


Epoch 47/200 | Train Loss: 2.8863 | Val Loss: 1.3043 | Val Acc: 69.95% | LR: 0.000890


Teacher Epoch 48/200: 100%|██████████| 781/781 [04:24<00:00,  2.96it/s, loss=3.2897]


Epoch 48/200 | Train Loss: 2.8527 | Val Loss: 1.2962 | Val Acc: 71.78% | LR: 0.000885


Teacher Epoch 49/200: 100%|██████████| 781/781 [04:23<00:00,  2.97it/s, loss=3.4877]


Epoch 49/200 | Train Loss: 2.8802 | Val Loss: 1.3296 | Val Acc: 71.35% | LR: 0.000880


Teacher Epoch 50/200: 100%|██████████| 781/781 [04:23<00:00,  2.96it/s, loss=2.5797]


Epoch 50/200 | Train Loss: 2.8467 | Val Loss: 1.3514 | Val Acc: 68.09% | LR: 0.000874
  Checkpoint saved: teacher_epoch_50.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\teacher_v4


Teacher Epoch 51/200: 100%|██████████| 781/781 [04:23<00:00,  2.96it/s, loss=1.5035]


Epoch 51/200 | Train Loss: 2.8566 | Val Loss: 1.3258 | Val Acc: 71.00% | LR: 0.000869


Teacher Epoch 52/200: 100%|██████████| 781/781 [04:22<00:00,  2.97it/s, loss=3.0841]


Epoch 52/200 | Train Loss: 2.8191 | Val Loss: 1.3261 | Val Acc: 70.39% | LR: 0.000863


Teacher Epoch 53/200: 100%|██████████| 781/781 [04:16<00:00,  3.04it/s, loss=3.3903]


Epoch 53/200 | Train Loss: 2.8538 | Val Loss: 1.1859 | Val Acc: 74.01% | LR: 0.000858
  * New best teacher! Accuracy: 74.01%


Teacher Epoch 54/200: 100%|██████████| 781/781 [04:16<00:00,  3.05it/s, loss=3.7649]


Epoch 54/200 | Train Loss: 2.8460 | Val Loss: 1.3416 | Val Acc: 69.93% | LR: 0.000852


Teacher Epoch 55/200: 100%|██████████| 781/781 [04:16<00:00,  3.04it/s, loss=2.0013]


Epoch 55/200 | Train Loss: 2.8341 | Val Loss: 1.2688 | Val Acc: 72.99% | LR: 0.000846


Teacher Epoch 56/200: 100%|██████████| 781/781 [04:16<00:00,  3.04it/s, loss=3.3511]


Epoch 56/200 | Train Loss: 2.8798 | Val Loss: 1.3553 | Val Acc: 71.07% | LR: 0.000841


Teacher Epoch 57/200: 100%|██████████| 781/781 [04:16<00:00,  3.04it/s, loss=3.2556]


Epoch 57/200 | Train Loss: 2.8491 | Val Loss: 1.3614 | Val Acc: 69.93% | LR: 0.000835


Teacher Epoch 58/200: 100%|██████████| 781/781 [04:16<00:00,  3.04it/s, loss=2.5487]


Epoch 58/200 | Train Loss: 2.8367 | Val Loss: 1.2771 | Val Acc: 72.37% | LR: 0.000829


Teacher Epoch 59/200: 100%|██████████| 781/781 [04:16<00:00,  3.04it/s, loss=2.9245]


Epoch 59/200 | Train Loss: 2.8399 | Val Loss: 1.2519 | Val Acc: 72.19% | LR: 0.000822


Teacher Epoch 60/200: 100%|██████████| 781/781 [04:16<00:00,  3.05it/s, loss=3.4287]


Epoch 60/200 | Train Loss: 2.8026 | Val Loss: 1.3348 | Val Acc: 70.51% | LR: 0.000816
  Checkpoint saved: teacher_epoch_60.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\teacher_v4


Teacher Epoch 61/200: 100%|██████████| 781/781 [04:16<00:00,  3.04it/s, loss=3.3374]


Epoch 61/200 | Train Loss: 2.7940 | Val Loss: 1.3283 | Val Acc: 72.08% | LR: 0.000810


Teacher Epoch 62/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.8259]


Epoch 62/200 | Train Loss: 2.8483 | Val Loss: 1.2782 | Val Acc: 71.37% | LR: 0.000804


Teacher Epoch 63/200: 100%|██████████| 781/781 [04:14<00:00,  3.06it/s, loss=2.1299]


Epoch 63/200 | Train Loss: 2.7959 | Val Loss: 1.1544 | Val Acc: 73.82% | LR: 0.000797


Teacher Epoch 64/200: 100%|██████████| 781/781 [04:15<00:00,  3.06it/s, loss=3.2367]


Epoch 64/200 | Train Loss: 2.8414 | Val Loss: 1.3676 | Val Acc: 70.14% | LR: 0.000791


Teacher Epoch 65/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=3.3738]


Epoch 65/200 | Train Loss: 2.8237 | Val Loss: 1.3280 | Val Acc: 70.49% | LR: 0.000784


Teacher Epoch 66/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.9541]


Epoch 66/200 | Train Loss: 2.8107 | Val Loss: 1.2713 | Val Acc: 72.67% | LR: 0.000777


Teacher Epoch 67/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=1.7757]


Epoch 67/200 | Train Loss: 2.7822 | Val Loss: 1.1386 | Val Acc: 74.27% | LR: 0.000771
  * New best teacher! Accuracy: 74.27%


Teacher Epoch 68/200: 100%|██████████| 781/781 [04:31<00:00,  2.88it/s, loss=3.4244]


Epoch 68/200 | Train Loss: 2.8267 | Val Loss: 1.1212 | Val Acc: 74.68% | LR: 0.000764
  * New best teacher! Accuracy: 74.68%


Teacher Epoch 69/200: 100%|██████████| 781/781 [04:25<00:00,  2.94it/s, loss=2.7152]


Epoch 69/200 | Train Loss: 2.8048 | Val Loss: 1.1167 | Val Acc: 75.19% | LR: 0.000757
  * New best teacher! Accuracy: 75.19%


Teacher Epoch 70/200: 100%|██████████| 781/781 [04:16<00:00,  3.04it/s, loss=3.3933]


Epoch 70/200 | Train Loss: 2.8043 | Val Loss: 1.3660 | Val Acc: 70.69% | LR: 0.000750
  Checkpoint saved: teacher_epoch_70.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\teacher_v4


Teacher Epoch 71/200: 100%|██████████| 781/781 [04:15<00:00,  3.05it/s, loss=3.4722]


Epoch 71/200 | Train Loss: 2.8154 | Val Loss: 1.2440 | Val Acc: 73.53% | LR: 0.000743


Teacher Epoch 72/200: 100%|██████████| 781/781 [04:15<00:00,  3.05it/s, loss=3.1665]


Epoch 72/200 | Train Loss: 2.7787 | Val Loss: 1.1382 | Val Acc: 75.14% | LR: 0.000736


Teacher Epoch 73/200: 100%|██████████| 781/781 [04:15<00:00,  3.06it/s, loss=1.9191]


Epoch 73/200 | Train Loss: 2.8034 | Val Loss: 1.2467 | Val Acc: 73.51% | LR: 0.000729


Teacher Epoch 74/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=3.2206]


Epoch 74/200 | Train Loss: 2.7656 | Val Loss: 1.2146 | Val Acc: 73.16% | LR: 0.000722


Teacher Epoch 75/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.4724]


Epoch 75/200 | Train Loss: 2.7876 | Val Loss: 1.1640 | Val Acc: 74.10% | LR: 0.000714


Teacher Epoch 76/200: 100%|██████████| 781/781 [04:14<00:00,  3.06it/s, loss=3.3526]


Epoch 76/200 | Train Loss: 2.7619 | Val Loss: 1.2041 | Val Acc: 73.90% | LR: 0.000707


Teacher Epoch 77/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=3.1306]


Epoch 77/200 | Train Loss: 2.8252 | Val Loss: 1.2156 | Val Acc: 73.96% | LR: 0.000700


Teacher Epoch 78/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.9733]


Epoch 78/200 | Train Loss: 2.7956 | Val Loss: 1.2045 | Val Acc: 74.74% | LR: 0.000692


Teacher Epoch 79/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.3244]


Epoch 79/200 | Train Loss: 2.7799 | Val Loss: 1.2568 | Val Acc: 73.49% | LR: 0.000685


Teacher Epoch 80/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=3.0782]


Epoch 80/200 | Train Loss: 2.7785 | Val Loss: 1.1978 | Val Acc: 75.36% | LR: 0.000677
  * New best teacher! Accuracy: 75.36%
  Checkpoint saved: teacher_epoch_80.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\teacher_v4


Teacher Epoch 81/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.7686]


Epoch 81/200 | Train Loss: 2.7824 | Val Loss: 1.2149 | Val Acc: 73.78% | LR: 0.000670


Teacher Epoch 82/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=3.1098]


Epoch 82/200 | Train Loss: 2.7910 | Val Loss: 1.2839 | Val Acc: 72.36% | LR: 0.000662


Teacher Epoch 83/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.8287]


Epoch 83/200 | Train Loss: 2.7297 | Val Loss: 1.1503 | Val Acc: 75.10% | LR: 0.000655


Teacher Epoch 84/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=3.4431]


Epoch 84/200 | Train Loss: 2.7683 | Val Loss: 1.1880 | Val Acc: 74.19% | LR: 0.000647


Teacher Epoch 85/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.9775]


Epoch 85/200 | Train Loss: 2.7281 | Val Loss: 1.1708 | Val Acc: 75.10% | LR: 0.000639


Teacher Epoch 86/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.8984]


Epoch 86/200 | Train Loss: 2.7463 | Val Loss: 1.1853 | Val Acc: 74.39% | LR: 0.000631


Teacher Epoch 87/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.9005]


Epoch 87/200 | Train Loss: 2.7050 | Val Loss: 1.1762 | Val Acc: 74.96% | LR: 0.000624


Teacher Epoch 88/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.9942]


Epoch 88/200 | Train Loss: 2.7455 | Val Loss: 1.1855 | Val Acc: 75.44% | LR: 0.000616
  * New best teacher! Accuracy: 75.44%


Teacher Epoch 89/200: 100%|██████████| 781/781 [04:14<00:00,  3.06it/s, loss=2.3799]


Epoch 89/200 | Train Loss: 2.7466 | Val Loss: 1.1704 | Val Acc: 74.90% | LR: 0.000608


Teacher Epoch 90/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=1.9492]


Epoch 90/200 | Train Loss: 2.7086 | Val Loss: 1.0490 | Val Acc: 75.71% | LR: 0.000600
  * New best teacher! Accuracy: 75.71%
  Checkpoint saved: teacher_epoch_90.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\teacher_v4


Teacher Epoch 91/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=3.6513]


Epoch 91/200 | Train Loss: 2.7557 | Val Loss: 1.1251 | Val Acc: 75.60% | LR: 0.000592


Teacher Epoch 92/200: 100%|██████████| 781/781 [04:15<00:00,  3.06it/s, loss=3.3089]


Epoch 92/200 | Train Loss: 2.7160 | Val Loss: 1.1494 | Val Acc: 76.49% | LR: 0.000584
  * New best teacher! Accuracy: 76.49%


Teacher Epoch 93/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.5096]


Epoch 93/200 | Train Loss: 2.6933 | Val Loss: 1.1351 | Val Acc: 75.64% | LR: 0.000576


Teacher Epoch 94/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=1.6149]


Epoch 94/200 | Train Loss: 2.6757 | Val Loss: 1.1191 | Val Acc: 75.52% | LR: 0.000568


Teacher Epoch 95/200: 100%|██████████| 781/781 [04:10<00:00,  3.12it/s, loss=2.4367]


Epoch 95/200 | Train Loss: 2.6810 | Val Loss: 1.0351 | Val Acc: 76.75% | LR: 0.000560
  * New best teacher! Accuracy: 76.75%


Teacher Epoch 96/200: 100%|██████████| 781/781 [04:05<00:00,  3.18it/s, loss=3.0997]


Epoch 96/200 | Train Loss: 2.6751 | Val Loss: 1.1196 | Val Acc: 76.44% | LR: 0.000552


Teacher Epoch 97/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=3.3205]


Epoch 97/200 | Train Loss: 2.7025 | Val Loss: 1.1353 | Val Acc: 75.75% | LR: 0.000544


Teacher Epoch 98/200: 100%|██████████| 781/781 [04:16<00:00,  3.05it/s, loss=3.1911]


Epoch 98/200 | Train Loss: 2.6762 | Val Loss: 1.0818 | Val Acc: 77.18% | LR: 0.000536
  * New best teacher! Accuracy: 77.18%


Teacher Epoch 99/200: 100%|██████████| 781/781 [04:17<00:00,  3.03it/s, loss=2.5083]


Epoch 99/200 | Train Loss: 2.6617 | Val Loss: 1.1090 | Val Acc: 75.82% | LR: 0.000528


Teacher Epoch 100/200: 100%|██████████| 781/781 [04:16<00:00,  3.04it/s, loss=1.8430]


Epoch 100/200 | Train Loss: 2.6420 | Val Loss: 1.0425 | Val Acc: 77.70% | LR: 0.000520
  * New best teacher! Accuracy: 77.70%
  Checkpoint saved: teacher_epoch_100.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\teacher_v4


Teacher Epoch 101/200: 100%|██████████| 781/781 [04:17<00:00,  3.03it/s, loss=1.6091]


Epoch 101/200 | Train Loss: 2.6602 | Val Loss: 1.1078 | Val Acc: 77.01% | LR: 0.000512


Teacher Epoch 102/200: 100%|██████████| 781/781 [04:17<00:00,  3.04it/s, loss=3.0961]


Epoch 102/200 | Train Loss: 2.6335 | Val Loss: 1.1051 | Val Acc: 76.65% | LR: 0.000504


Teacher Epoch 103/200: 100%|██████████| 781/781 [04:17<00:00,  3.04it/s, loss=3.2464]


Epoch 103/200 | Train Loss: 2.6557 | Val Loss: 1.1784 | Val Acc: 75.91% | LR: 0.000496


Teacher Epoch 104/200: 100%|██████████| 781/781 [04:16<00:00,  3.04it/s, loss=1.5989]


Epoch 104/200 | Train Loss: 2.6306 | Val Loss: 1.0210 | Val Acc: 77.42% | LR: 0.000488


Teacher Epoch 105/200: 100%|██████████| 781/781 [04:22<00:00,  2.97it/s, loss=2.5710]


Epoch 105/200 | Train Loss: 2.6604 | Val Loss: 1.0298 | Val Acc: 77.27% | LR: 0.000480


Teacher Epoch 106/200: 100%|██████████| 781/781 [04:13<00:00,  3.09it/s, loss=1.6166]


Epoch 106/200 | Train Loss: 2.6208 | Val Loss: 1.0023 | Val Acc: 76.91% | LR: 0.000472


Teacher Epoch 107/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=2.7918]


Epoch 107/200 | Train Loss: 2.6007 | Val Loss: 1.0416 | Val Acc: 77.71% | LR: 0.000464
  * New best teacher! Accuracy: 77.71%


Teacher Epoch 108/200: 100%|██████████| 781/781 [04:12<00:00,  3.09it/s, loss=3.1632]


Epoch 108/200 | Train Loss: 2.6228 | Val Loss: 1.0524 | Val Acc: 77.47% | LR: 0.000456


Teacher Epoch 109/200: 100%|██████████| 781/781 [04:12<00:00,  3.09it/s, loss=2.9129]


Epoch 109/200 | Train Loss: 2.6242 | Val Loss: 1.0390 | Val Acc: 77.84% | LR: 0.000448
  * New best teacher! Accuracy: 77.84%


Teacher Epoch 110/200: 100%|██████████| 781/781 [04:12<00:00,  3.09it/s, loss=2.9436]


Epoch 110/200 | Train Loss: 2.6296 | Val Loss: 1.0722 | Val Acc: 77.67% | LR: 0.000440
  Checkpoint saved: teacher_epoch_110.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\teacher_v4


Teacher Epoch 111/200: 100%|██████████| 781/781 [04:12<00:00,  3.10it/s, loss=1.6809]


Epoch 111/200 | Train Loss: 2.6118 | Val Loss: 0.9477 | Val Acc: 78.78% | LR: 0.000432
  * New best teacher! Accuracy: 78.78%


Teacher Epoch 112/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=1.7661]


Epoch 112/200 | Train Loss: 2.6327 | Val Loss: 0.9904 | Val Acc: 78.54% | LR: 0.000424


Teacher Epoch 113/200: 100%|██████████| 781/781 [04:12<00:00,  3.09it/s, loss=2.2585]


Epoch 113/200 | Train Loss: 2.6241 | Val Loss: 1.0713 | Val Acc: 78.21% | LR: 0.000416


Teacher Epoch 114/200: 100%|██████████| 781/781 [04:11<00:00,  3.10it/s, loss=2.0127]


Epoch 114/200 | Train Loss: 2.5811 | Val Loss: 0.9980 | Val Acc: 78.56% | LR: 0.000408


Teacher Epoch 115/200: 100%|██████████| 781/781 [04:12<00:00,  3.09it/s, loss=3.2471]


Epoch 115/200 | Train Loss: 2.6076 | Val Loss: 1.0233 | Val Acc: 78.34% | LR: 0.000400


Teacher Epoch 116/200: 100%|██████████| 781/781 [04:12<00:00,  3.09it/s, loss=2.1706]


Epoch 116/200 | Train Loss: 2.5551 | Val Loss: 1.0304 | Val Acc: 78.66% | LR: 0.000392


Teacher Epoch 117/200: 100%|██████████| 781/781 [04:12<00:00,  3.10it/s, loss=3.2994]


Epoch 117/200 | Train Loss: 2.5469 | Val Loss: 0.9775 | Val Acc: 78.90% | LR: 0.000384
  * New best teacher! Accuracy: 78.90%


Teacher Epoch 118/200: 100%|██████████| 781/781 [04:12<00:00,  3.09it/s, loss=3.0545]


Epoch 118/200 | Train Loss: 2.5511 | Val Loss: 1.0175 | Val Acc: 78.50% | LR: 0.000376


Teacher Epoch 119/200: 100%|██████████| 781/781 [04:12<00:00,  3.09it/s, loss=3.3296]


Epoch 119/200 | Train Loss: 2.5557 | Val Loss: 1.0810 | Val Acc: 77.93% | LR: 0.000369


Teacher Epoch 120/200: 100%|██████████| 781/781 [04:11<00:00,  3.10it/s, loss=3.0685]


Epoch 120/200 | Train Loss: 2.5153 | Val Loss: 0.9881 | Val Acc: 78.10% | LR: 0.000361
  Checkpoint saved: teacher_epoch_120.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\teacher_v4


Teacher Epoch 121/200: 100%|██████████| 781/781 [04:12<00:00,  3.10it/s, loss=2.5954]


Epoch 121/200 | Train Loss: 2.5294 | Val Loss: 0.9935 | Val Acc: 78.92% | LR: 0.000353
  * New best teacher! Accuracy: 78.92%


Teacher Epoch 122/200: 100%|██████████| 781/781 [04:59<00:00,  2.61it/s, loss=2.7111]


Epoch 122/200 | Train Loss: 2.5363 | Val Loss: 0.9754 | Val Acc: 79.19% | LR: 0.000345
  * New best teacher! Accuracy: 79.19%


Teacher Epoch 123/200: 100%|██████████| 781/781 [04:58<00:00,  2.62it/s, loss=1.2664]


Epoch 123/200 | Train Loss: 2.5247 | Val Loss: 0.9953 | Val Acc: 79.03% | LR: 0.000338


Teacher Epoch 124/200: 100%|██████████| 781/781 [04:34<00:00,  2.85it/s, loss=3.2750]


Epoch 124/200 | Train Loss: 2.5242 | Val Loss: 1.0955 | Val Acc: 77.97% | LR: 0.000330


Teacher Epoch 125/200: 100%|██████████| 781/781 [04:14<00:00,  3.06it/s, loss=2.8857]


Epoch 125/200 | Train Loss: 2.5149 | Val Loss: 0.9580 | Val Acc: 79.66% | LR: 0.000323
  * New best teacher! Accuracy: 79.66%


Teacher Epoch 126/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=1.5945]


Epoch 126/200 | Train Loss: 2.4606 | Val Loss: 0.9526 | Val Acc: 79.58% | LR: 0.000315


Teacher Epoch 127/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.0354]


Epoch 127/200 | Train Loss: 2.4856 | Val Loss: 0.9124 | Val Acc: 79.65% | LR: 0.000308


Teacher Epoch 128/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=1.3691]


Epoch 128/200 | Train Loss: 2.4689 | Val Loss: 0.9316 | Val Acc: 78.93% | LR: 0.000300


Teacher Epoch 129/200: 100%|██████████| 781/781 [04:14<00:00,  3.06it/s, loss=1.6373]


Epoch 129/200 | Train Loss: 2.4959 | Val Loss: 0.9294 | Val Acc: 79.59% | LR: 0.000293


Teacher Epoch 130/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.5041]


Epoch 130/200 | Train Loss: 2.4650 | Val Loss: 0.9296 | Val Acc: 80.31% | LR: 0.000286
  * New best teacher! Accuracy: 80.31%
  Checkpoint saved: teacher_epoch_130.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\teacher_v4


Teacher Epoch 131/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=1.2963]


Epoch 131/200 | Train Loss: 2.4671 | Val Loss: 0.9214 | Val Acc: 79.81% | LR: 0.000278


Teacher Epoch 132/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=1.1376]


Epoch 132/200 | Train Loss: 2.5108 | Val Loss: 0.9644 | Val Acc: 80.27% | LR: 0.000271


Teacher Epoch 133/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.3351]


Epoch 133/200 | Train Loss: 2.4571 | Val Loss: 0.8920 | Val Acc: 80.42% | LR: 0.000264
  * New best teacher! Accuracy: 80.42%


Teacher Epoch 134/200: 100%|██████████| 781/781 [04:14<00:00,  3.06it/s, loss=2.6557]


Epoch 134/200 | Train Loss: 2.4177 | Val Loss: 0.9473 | Val Acc: 80.75% | LR: 0.000257
  * New best teacher! Accuracy: 80.75%


Teacher Epoch 135/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.2244]


Epoch 135/200 | Train Loss: 2.4330 | Val Loss: 0.9108 | Val Acc: 80.50% | LR: 0.000250


Teacher Epoch 136/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=3.0442]


Epoch 136/200 | Train Loss: 2.4737 | Val Loss: 0.9745 | Val Acc: 80.50% | LR: 0.000243


Teacher Epoch 137/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.8432]


Epoch 137/200 | Train Loss: 2.4403 | Val Loss: 0.8993 | Val Acc: 80.92% | LR: 0.000236
  * New best teacher! Accuracy: 80.92%


Teacher Epoch 138/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=3.1225]


Epoch 138/200 | Train Loss: 2.4162 | Val Loss: 0.8981 | Val Acc: 81.08% | LR: 0.000229
  * New best teacher! Accuracy: 81.08%


Teacher Epoch 139/200: 100%|██████████| 781/781 [04:11<00:00,  3.10it/s, loss=2.8452]


Epoch 139/200 | Train Loss: 2.3698 | Val Loss: 0.8590 | Val Acc: 81.26% | LR: 0.000223
  * New best teacher! Accuracy: 81.26%


Teacher Epoch 140/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.5735]


Epoch 140/200 | Train Loss: 2.4004 | Val Loss: 0.9132 | Val Acc: 80.79% | LR: 0.000216
  Checkpoint saved: teacher_epoch_140.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\teacher_v4


Teacher Epoch 141/200: 100%|██████████| 781/781 [04:15<00:00,  3.06it/s, loss=2.9201]


Epoch 141/200 | Train Loss: 2.4094 | Val Loss: 0.9313 | Val Acc: 81.06% | LR: 0.000209


Teacher Epoch 142/200: 100%|██████████| 781/781 [04:14<00:00,  3.06it/s, loss=2.3657]


Epoch 142/200 | Train Loss: 2.3653 | Val Loss: 0.8873 | Val Acc: 81.46% | LR: 0.000203
  * New best teacher! Accuracy: 81.46%


Teacher Epoch 143/200: 100%|██████████| 781/781 [04:14<00:00,  3.06it/s, loss=2.1170]


Epoch 143/200 | Train Loss: 2.3704 | Val Loss: 0.8987 | Val Acc: 81.02% | LR: 0.000196


Teacher Epoch 144/200: 100%|██████████| 781/781 [04:15<00:00,  3.06it/s, loss=3.1913]


Epoch 144/200 | Train Loss: 2.3385 | Val Loss: 0.8954 | Val Acc: 81.22% | LR: 0.000190


Teacher Epoch 145/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.8343]


Epoch 145/200 | Train Loss: 2.3317 | Val Loss: 0.9142 | Val Acc: 80.51% | LR: 0.000184


Teacher Epoch 146/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.3416]


Epoch 146/200 | Train Loss: 2.3613 | Val Loss: 0.8853 | Val Acc: 81.48% | LR: 0.000178
  * New best teacher! Accuracy: 81.48%


Teacher Epoch 147/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.3916]


Epoch 147/200 | Train Loss: 2.3552 | Val Loss: 0.8512 | Val Acc: 81.20% | LR: 0.000171


Teacher Epoch 148/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.6926]


Epoch 148/200 | Train Loss: 2.3511 | Val Loss: 0.8516 | Val Acc: 81.43% | LR: 0.000165


Teacher Epoch 149/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=2.2453]


Epoch 149/200 | Train Loss: 2.3531 | Val Loss: 0.8903 | Val Acc: 81.24% | LR: 0.000159


Teacher Epoch 150/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=1.4336]


Epoch 150/200 | Train Loss: 2.3135 | Val Loss: 0.8803 | Val Acc: 81.87% | LR: 0.000154
  * New best teacher! Accuracy: 81.87%
  Checkpoint saved: teacher_epoch_150.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\teacher_v4


Teacher Epoch 151/200: 100%|██████████| 781/781 [04:15<00:00,  3.06it/s, loss=2.4647]


Epoch 151/200 | Train Loss: 2.3253 | Val Loss: 0.8743 | Val Acc: 81.94% | LR: 0.000148
  * New best teacher! Accuracy: 81.94%


Teacher Epoch 152/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=1.9175]


Epoch 152/200 | Train Loss: 2.3255 | Val Loss: 0.8278 | Val Acc: 82.12% | LR: 0.000142
  * New best teacher! Accuracy: 82.12%


Teacher Epoch 153/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.6655]


Epoch 153/200 | Train Loss: 2.3181 | Val Loss: 0.8559 | Val Acc: 82.00% | LR: 0.000137


Teacher Epoch 154/200: 100%|██████████| 781/781 [04:15<00:00,  3.06it/s, loss=1.7325]


Epoch 154/200 | Train Loss: 2.2746 | Val Loss: 0.9194 | Val Acc: 81.75% | LR: 0.000131


Teacher Epoch 155/200: 100%|██████████| 781/781 [04:14<00:00,  3.06it/s, loss=2.9375]


Epoch 155/200 | Train Loss: 2.3115 | Val Loss: 0.8420 | Val Acc: 82.47% | LR: 0.000126
  * New best teacher! Accuracy: 82.47%


Teacher Epoch 156/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.0336]


Epoch 156/200 | Train Loss: 2.2692 | Val Loss: 0.7835 | Val Acc: 82.31% | LR: 0.000120


Teacher Epoch 157/200: 100%|██████████| 781/781 [04:14<00:00,  3.06it/s, loss=3.0136]


Epoch 157/200 | Train Loss: 2.3013 | Val Loss: 0.8828 | Val Acc: 82.27% | LR: 0.000115


Teacher Epoch 158/200: 100%|██████████| 781/781 [04:14<00:00,  3.06it/s, loss=2.4504]


Epoch 158/200 | Train Loss: 2.2744 | Val Loss: 0.8299 | Val Acc: 82.18% | LR: 0.000110


Teacher Epoch 159/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=1.1702]


Epoch 159/200 | Train Loss: 2.2423 | Val Loss: 0.7808 | Val Acc: 82.56% | LR: 0.000105
  * New best teacher! Accuracy: 82.56%


Teacher Epoch 160/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=1.8814]


Epoch 160/200 | Train Loss: 2.2477 | Val Loss: 0.8345 | Val Acc: 82.61% | LR: 0.000100
  * New best teacher! Accuracy: 82.61%
  Checkpoint saved: teacher_epoch_160.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\teacher_v4


Teacher Epoch 161/200: 100%|██████████| 781/781 [04:13<00:00,  3.08it/s, loss=1.6517]


Epoch 161/200 | Train Loss: 2.2941 | Val Loss: 0.8043 | Val Acc: 82.80% | LR: 0.000095
  * New best teacher! Accuracy: 82.80%


Teacher Epoch 162/200: 100%|██████████| 781/781 [04:15<00:00,  3.06it/s, loss=1.4603]


Epoch 162/200 | Train Loss: 2.2417 | Val Loss: 0.8137 | Val Acc: 82.41% | LR: 0.000091


Teacher Epoch 163/200: 100%|██████████| 781/781 [04:15<00:00,  3.05it/s, loss=2.4702]


Epoch 163/200 | Train Loss: 2.2218 | Val Loss: 0.7856 | Val Acc: 82.87% | LR: 0.000086
  * New best teacher! Accuracy: 82.87%


Teacher Epoch 164/200: 100%|██████████| 781/781 [04:14<00:00,  3.06it/s, loss=1.9659]


Epoch 164/200 | Train Loss: 2.2030 | Val Loss: 0.7845 | Val Acc: 83.16% | LR: 0.000082
  * New best teacher! Accuracy: 83.16%


Teacher Epoch 165/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.6498]


Epoch 165/200 | Train Loss: 2.1794 | Val Loss: 0.8394 | Val Acc: 82.56% | LR: 0.000077


Teacher Epoch 166/200: 100%|██████████| 781/781 [04:14<00:00,  3.06it/s, loss=2.2607]


Epoch 166/200 | Train Loss: 2.2097 | Val Loss: 0.8012 | Val Acc: 83.12% | LR: 0.000073


Teacher Epoch 167/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=2.2970]


Epoch 167/200 | Train Loss: 2.2126 | Val Loss: 0.8350 | Val Acc: 82.86% | LR: 0.000069


Teacher Epoch 168/200: 100%|██████████| 781/781 [04:14<00:00,  3.07it/s, loss=1.4127]


Epoch 168/200 | Train Loss: 2.2287 | Val Loss: 0.7930 | Val Acc: 83.38% | LR: 0.000065
  * New best teacher! Accuracy: 83.38%


Teacher Epoch 169/200: 100%|██████████| 781/781 [04:19<00:00,  3.01it/s, loss=2.2710]


Epoch 169/200 | Train Loss: 2.1844 | Val Loss: 0.8062 | Val Acc: 83.49% | LR: 0.000061
  * New best teacher! Accuracy: 83.49%


Teacher Epoch 170/200: 100%|██████████| 781/781 [04:19<00:00,  3.01it/s, loss=2.0683]


Epoch 170/200 | Train Loss: 2.2042 | Val Loss: 0.8133 | Val Acc: 83.29% | LR: 0.000057
  Checkpoint saved: teacher_epoch_170.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\teacher_v4


Teacher Epoch 171/200: 100%|██████████| 781/781 [04:19<00:00,  3.01it/s, loss=2.9137]


Epoch 171/200 | Train Loss: 2.2124 | Val Loss: 0.8197 | Val Acc: 83.17% | LR: 0.000054


Teacher Epoch 172/200: 100%|██████████| 781/781 [04:20<00:00,  3.00it/s, loss=2.6566]


Epoch 172/200 | Train Loss: 2.2395 | Val Loss: 0.7880 | Val Acc: 83.60% | LR: 0.000050
  * New best teacher! Accuracy: 83.60%


Teacher Epoch 173/200: 100%|██████████| 781/781 [04:23<00:00,  2.96it/s, loss=1.2834]


Epoch 173/200 | Train Loss: 2.1946 | Val Loss: 0.7779 | Val Acc: 83.83% | LR: 0.000047
  * New best teacher! Accuracy: 83.83%


Teacher Epoch 174/200: 100%|██████████| 781/781 [04:22<00:00,  2.98it/s, loss=2.4711]


Epoch 174/200 | Train Loss: 2.1432 | Val Loss: 0.7991 | Val Acc: 83.80% | LR: 0.000043


Teacher Epoch 175/200: 100%|██████████| 781/781 [04:22<00:00,  2.97it/s, loss=1.5406]


Epoch 175/200 | Train Loss: 2.1961 | Val Loss: 0.7685 | Val Acc: 83.84% | LR: 0.000040
  * New best teacher! Accuracy: 83.84%


Teacher Epoch 176/200: 100%|██████████| 781/781 [04:22<00:00,  2.97it/s, loss=2.7655]


Epoch 176/200 | Train Loss: 2.1767 | Val Loss: 0.8111 | Val Acc: 83.25% | LR: 0.000037


Teacher Epoch 177/200: 100%|██████████| 781/781 [04:26<00:00,  2.93it/s, loss=2.1275]


Epoch 177/200 | Train Loss: 2.1637 | Val Loss: 0.7657 | Val Acc: 83.92% | LR: 0.000034
  * New best teacher! Accuracy: 83.92%


Teacher Epoch 178/200: 100%|██████████| 781/781 [04:23<00:00,  2.97it/s, loss=2.4422]


Epoch 178/200 | Train Loss: 2.1253 | Val Loss: 0.8077 | Val Acc: 83.84% | LR: 0.000031


Teacher Epoch 179/200: 100%|██████████| 781/781 [04:19<00:00,  3.01it/s, loss=1.9659]


Epoch 179/200 | Train Loss: 2.1682 | Val Loss: 0.7931 | Val Acc: 83.86% | LR: 0.000028


Teacher Epoch 180/200: 100%|██████████| 781/781 [04:15<00:00,  3.06it/s, loss=1.8177]


Epoch 180/200 | Train Loss: 2.1677 | Val Loss: 0.7554 | Val Acc: 83.86% | LR: 0.000026
  Checkpoint saved: teacher_epoch_180.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\teacher_v4


Teacher Epoch 181/200: 100%|██████████| 781/781 [04:15<00:00,  3.05it/s, loss=1.5069]


Epoch 181/200 | Train Loss: 2.1225 | Val Loss: 0.7516 | Val Acc: 83.87% | LR: 0.000023


Teacher Epoch 182/200: 100%|██████████| 781/781 [04:15<00:00,  3.06it/s, loss=2.0275]


Epoch 182/200 | Train Loss: 2.1209 | Val Loss: 0.7709 | Val Acc: 83.86% | LR: 0.000021


Teacher Epoch 183/200: 100%|██████████| 781/781 [04:15<00:00,  3.06it/s, loss=2.1177]


Epoch 183/200 | Train Loss: 2.1228 | Val Loss: 0.7650 | Val Acc: 84.00% | LR: 0.000019
  * New best teacher! Accuracy: 84.00%


Teacher Epoch 184/200: 100%|██████████| 781/781 [04:15<00:00,  3.06it/s, loss=2.4029]


Epoch 184/200 | Train Loss: 2.1306 | Val Loss: 0.7455 | Val Acc: 84.18% | LR: 0.000017
  * New best teacher! Accuracy: 84.18%


Teacher Epoch 185/200: 100%|██████████| 781/781 [04:26<00:00,  2.93it/s, loss=1.9562]


Epoch 185/200 | Train Loss: 2.1413 | Val Loss: 0.7479 | Val Acc: 84.27% | LR: 0.000015
  * New best teacher! Accuracy: 84.27%


Teacher Epoch 186/200: 100%|██████████| 781/781 [04:59<00:00,  2.61it/s, loss=1.5428]


Epoch 186/200 | Train Loss: 2.1275 | Val Loss: 0.7179 | Val Acc: 84.22% | LR: 0.000013


Teacher Epoch 187/200: 100%|██████████| 781/781 [04:59<00:00,  2.61it/s, loss=2.5082]


Epoch 187/200 | Train Loss: 2.1491 | Val Loss: 0.7660 | Val Acc: 83.96% | LR: 0.000011


Teacher Epoch 188/200: 100%|██████████| 781/781 [05:00<00:00,  2.60it/s, loss=2.2000]


Epoch 188/200 | Train Loss: 2.1270 | Val Loss: 0.7813 | Val Acc: 84.28% | LR: 0.000009
  * New best teacher! Accuracy: 84.28%


Teacher Epoch 189/200: 100%|██████████| 781/781 [04:58<00:00,  2.62it/s, loss=2.5449]


Epoch 189/200 | Train Loss: 2.1686 | Val Loss: 0.7603 | Val Acc: 84.34% | LR: 0.000008
  * New best teacher! Accuracy: 84.34%


Teacher Epoch 190/200: 100%|██████████| 781/781 [05:00<00:00,  2.60it/s, loss=1.1077]


Epoch 190/200 | Train Loss: 2.1678 | Val Loss: 0.7476 | Val Acc: 84.37% | LR: 0.000006
  * New best teacher! Accuracy: 84.37%
  Checkpoint saved: teacher_epoch_190.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\teacher_v4


Teacher Epoch 191/200: 100%|██████████| 781/781 [04:58<00:00,  2.62it/s, loss=1.0222]


Epoch 191/200 | Train Loss: 2.1339 | Val Loss: 0.7460 | Val Acc: 84.13% | LR: 0.000005


Teacher Epoch 192/200: 100%|██████████| 781/781 [04:58<00:00,  2.62it/s, loss=1.0342]


Epoch 192/200 | Train Loss: 2.1013 | Val Loss: 0.7277 | Val Acc: 84.34% | LR: 0.000004


Teacher Epoch 193/200: 100%|██████████| 781/781 [04:58<00:00,  2.62it/s, loss=2.0278]


Epoch 193/200 | Train Loss: 2.1317 | Val Loss: 0.7384 | Val Acc: 84.00% | LR: 0.000003


Teacher Epoch 194/200: 100%|██████████| 781/781 [04:59<00:00,  2.61it/s, loss=2.3710]


Epoch 194/200 | Train Loss: 2.1433 | Val Loss: 0.7302 | Val Acc: 84.17% | LR: 0.000002


Teacher Epoch 195/200: 100%|██████████| 781/781 [05:00<00:00,  2.60it/s, loss=1.8730]


Epoch 195/200 | Train Loss: 2.0919 | Val Loss: 0.7500 | Val Acc: 84.16% | LR: 0.000002


Teacher Epoch 196/200: 100%|██████████| 781/781 [04:59<00:00,  2.61it/s, loss=1.6743]


Epoch 196/200 | Train Loss: 2.0994 | Val Loss: 0.7417 | Val Acc: 84.39% | LR: 0.000001
  * New best teacher! Accuracy: 84.39%


Teacher Epoch 197/200: 100%|██████████| 781/781 [04:59<00:00,  2.61it/s, loss=2.6905]


Epoch 197/200 | Train Loss: 2.0952 | Val Loss: 0.7848 | Val Acc: 84.00% | LR: 0.000001


Teacher Epoch 198/200: 100%|██████████| 781/781 [04:58<00:00,  2.62it/s, loss=2.7167]


Epoch 198/200 | Train Loss: 2.1381 | Val Loss: 0.7507 | Val Acc: 84.21% | LR: 0.000000


Teacher Epoch 199/200: 100%|██████████| 781/781 [05:00<00:00,  2.60it/s, loss=1.3484]


Epoch 199/200 | Train Loss: 2.1265 | Val Loss: 0.7486 | Val Acc: 84.18% | LR: 0.000000


Teacher Epoch 200/200: 100%|██████████| 781/781 [04:59<00:00,  2.61it/s, loss=1.0337]


Epoch 200/200 | Train Loss: 2.1276 | Val Loss: 0.7349 | Val Acc: 84.31% | LR: 0.000000
  Checkpoint saved: teacher_epoch_200.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\teacher_v4


TypeError: TrainingLogger.save_final_results() got an unexpected keyword argument 'resolution'

In [8]:
# Cell 7: Create Student Model
print("\n" + "=" * 60)
print("STUDENT MODEL")
print("=" * 60)

student_model = create_student_model(device=device)

# Freeze teacher
teacher_model.eval()
for param in teacher_model.parameters():
    param.requires_grad = False


STUDENT MODEL
Student Model: EfficientNetV2-S
  Parameters: 20,305,588 (20.31M)
  Size: 78.05 MB
  Pretrained: True


## 4. Knowledge Distillation Training


In [9]:
# Cell 8: Setup Training
print("\n" + "=" * 60)
print(f"KNOWLEDGE DISTILLATION - {config.experiment_name}")
print("=" * 60)

# Start timing
logger.start_training()

# For V4: Teacher uses 64x64, student uses 32x32
# We need to resize inputs for teacher during distillation
if IS_V4:
    import torch.nn.functional as F
    print(f"V4 Mode: Teacher at {teacher_resolution}x{teacher_resolution}, Student at 32x32")

# Optimizer and scheduler
optimizer = optim.AdamW(
    student_model.parameters(),
    lr=config.base.learning_rate,
    weight_decay=config.base.weight_decay
)
scheduler = optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=config.base.num_epochs - config.base.warmup_epochs
)

# Training setup
scaler = torch.amp.GradScaler('cuda')
best_acc = 0.0
best_weights = None
epochs_no_improve = 0
early_stopped = False

# Checkpoint directory for this experiment
exp_checkpoint_dir = CHECKPOINTS_DIR / config.experiment_id
exp_checkpoint_dir.mkdir(parents=True, exist_ok=True)

print(f"\nDistillation Method: {config.distillation.method}")
print(f"Teacher Accuracy: {teacher_accuracy:.2f}%")
print(f"Checkpoints: {exp_checkpoint_dir}")


KNOWLEDGE DISTILLATION - Saturation Test
V4 Mode: Teacher at 64x64, Student at 32x32

Distillation Method: standard_kd
Teacher Accuracy: 84.39%
Checkpoints: d:\Projects\KnowledgeDistillation\code_v2_32\checkpoints\v4_saturation


In [10]:
# Cell 9: Training Loop
for epoch in range(config.base.num_epochs):
    student_model.train()
    running_loss = 0.0
    valid_batches = 0
    
    # Learning rate warmup
    if epoch < config.base.warmup_epochs:
        warmup_lr = config.base.learning_rate * (epoch + 1) / config.base.warmup_epochs
        for param_group in optimizer.param_groups:
            param_group['lr'] = warmup_lr
    
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{config.base.num_epochs}")
    
    for inputs, labels in loop:
        inputs, labels = inputs.to(device), labels.to(device)
        
        # Apply Mixup or CutMix
        if config.augmentation.mixup or config.augmentation.cutmix:
            if np.random.rand() > 0.5 and config.augmentation.cutmix:
                inputs, labels_a, labels_b, lam = cutmix_data(
                    inputs.clone(), labels, config.augmentation.cutmix_alpha, device
                )
            elif config.augmentation.mixup:
                inputs, labels_a, labels_b, lam = mixup_data(
                    inputs, labels, config.augmentation.mixup_alpha, device
                )
            else:
                labels_a, labels_b, lam = labels, labels, 1.0
        else:
            labels_a, labels_b, lam = labels, labels, 1.0
        
        optimizer.zero_grad()
        
        with torch.amp.autocast('cuda'):
            # Forward pass - student uses 32x32 inputs
            student_outputs = student_model(inputs)
            
            with torch.no_grad():
                # V4: Resize inputs to 64x64 for teacher
                if IS_V4:
                    teacher_inputs = F.interpolate(inputs, size=(teacher_resolution, teacher_resolution), mode='bilinear', align_corners=False)
                    teacher_outputs = teacher_model(teacher_inputs)
                else:
                    teacher_outputs = teacher_model(inputs)
            
            # Calculate KD loss
            loss = kd_loss_with_mixup(
                student_outputs, teacher_outputs,
                labels_a, labels_b, lam,
                method=config.distillation.method,
                temperature=config.distillation.temperature,
                alpha=config.distillation.alpha,
                dkd_alpha=config.distillation.dkd_alpha,
                dkd_beta=config.distillation.dkd_beta,
                label_smoothing=config.distillation.label_smoothing
            )
        
        # Skip NaN
        if torch.isnan(loss):
            continue
        
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        nn.utils.clip_grad_norm_(student_model.parameters(), config.base.grad_clip)
        scaler.step(optimizer)
        scaler.update()
        
        running_loss += loss.item()
        valid_batches += 1
        loop.set_postfix(loss=f"{loss.item():.4f}")
    
    # Step scheduler after warmup
    if epoch >= config.base.warmup_epochs:
        scheduler.step()
    
    # Validation
    train_loss = running_loss / max(valid_batches, 1)
    val_results = evaluate_model(student_model, test_loader, device)
    val_acc = val_results['accuracy']
    val_loss = val_results['loss']
    current_lr = optimizer.param_groups[0]['lr']
    
    # Log epoch (same structure as teacher)
    is_best = logger.log_epoch(epoch + 1, train_loss, val_loss, val_acc, current_lr)
    
    print(f"Epoch {epoch+1}/{config.base.num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}% | LR: {current_lr:.6f}")
    
    # Save best model
    if is_best:
        best_acc = val_acc
        best_weights = copy.deepcopy(student_model.state_dict())
        epochs_no_improve = 0
        print(f"  * New best model! Accuracy: {best_acc:.2f}%")
        
        # Save best model
        torch.save(
            {'model_state_dict': best_weights, 'accuracy': best_acc, 'epoch': epoch + 1},
            MODELS_DIR / f"{config.experiment_id}_best.pth"
        )
    else:
        epochs_no_improve += 1
    
    # Checkpointing and save history
    if (epoch + 1) % config.base.checkpoint_frequency == 0:
        checkpoint_path = exp_checkpoint_dir / f"checkpoint_epoch_{epoch+1}.pth"
        save_checkpoint(
            student_model, optimizer, scheduler,
            epoch + 1, best_acc, logger.history,
            checkpoint_path, is_best=(val_acc == best_acc)
        )
        print(f"  Checkpoint saved: {checkpoint_path.name}")
        
        # Save history (same as teacher)
        logger.save_checkpoint_history()
        
        cleanup_checkpoints(exp_checkpoint_dir, keep=config.base.keep_checkpoints)
    
    # Early stopping
    if epochs_no_improve >= config.base.patience:
        print(f"\nEarly stopping triggered at epoch {epoch+1}")
        early_stopped = True
        break
    
    # Clear cache
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Training complete
total_epochs = epoch + 1

# Load best weights
student_model.load_state_dict(best_weights)

Epoch 1/200: 100%|██████████| 390/390 [01:26<00:00,  4.51it/s, loss=1.4955]


Epoch 1/200 | Train Loss: 1.7918 | Val Loss: 3.1929 | Val Acc: 28.01% | LR: 0.000200
  * New best model! Accuracy: 28.01%


Epoch 2/200: 100%|██████████| 390/390 [01:21<00:00,  4.77it/s, loss=1.5575]


Epoch 2/200 | Train Loss: 1.6144 | Val Loss: 2.5888 | Val Acc: 41.33% | LR: 0.000400
  * New best model! Accuracy: 41.33%


Epoch 3/200: 100%|██████████| 390/390 [01:21<00:00,  4.77it/s, loss=1.3710]


Epoch 3/200 | Train Loss: 1.5430 | Val Loss: 2.2997 | Val Acc: 47.65% | LR: 0.000600
  * New best model! Accuracy: 47.65%


Epoch 4/200: 100%|██████████| 390/390 [01:29<00:00,  4.34it/s, loss=1.3265]


Epoch 4/200 | Train Loss: 1.5027 | Val Loss: 2.2195 | Val Acc: 50.55% | LR: 0.000800
  * New best model! Accuracy: 50.55%


Epoch 5/200: 100%|██████████| 390/390 [01:28<00:00,  4.40it/s, loss=1.6501]


Epoch 5/200 | Train Loss: 1.4891 | Val Loss: 2.2815 | Val Acc: 49.27% | LR: 0.001000


Epoch 6/200: 100%|██████████| 390/390 [01:28<00:00,  4.40it/s, loss=1.5982]


Epoch 6/200 | Train Loss: 1.4591 | Val Loss: 2.0776 | Val Acc: 52.16% | LR: 0.001000
  * New best model! Accuracy: 52.16%


Epoch 7/200: 100%|██████████| 390/390 [01:29<00:00,  4.38it/s, loss=1.4471]


Epoch 7/200 | Train Loss: 1.4157 | Val Loss: 2.0274 | Val Acc: 55.38% | LR: 0.001000
  * New best model! Accuracy: 55.38%


Epoch 8/200: 100%|██████████| 390/390 [01:28<00:00,  4.40it/s, loss=1.2792]


Epoch 8/200 | Train Loss: 1.3985 | Val Loss: 1.9471 | Val Acc: 56.31% | LR: 0.000999
  * New best model! Accuracy: 56.31%


Epoch 9/200: 100%|██████████| 390/390 [01:29<00:00,  4.37it/s, loss=1.5523]


Epoch 9/200 | Train Loss: 1.3893 | Val Loss: 1.9765 | Val Acc: 55.82% | LR: 0.000999


Epoch 10/200: 100%|██████████| 390/390 [01:28<00:00,  4.39it/s, loss=1.4631]


Epoch 10/200 | Train Loss: 1.3813 | Val Loss: 1.9222 | Val Acc: 58.32% | LR: 0.000998
  * New best model! Accuracy: 58.32%
  Checkpoint saved: checkpoint_epoch_10.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910


Epoch 11/200: 100%|██████████| 390/390 [01:28<00:00,  4.39it/s, loss=1.2609]


Epoch 11/200 | Train Loss: 1.3749 | Val Loss: 1.9519 | Val Acc: 55.63% | LR: 0.000998


Epoch 12/200: 100%|██████████| 390/390 [01:29<00:00,  4.37it/s, loss=1.3823]


Epoch 12/200 | Train Loss: 1.3689 | Val Loss: 1.8340 | Val Acc: 58.89% | LR: 0.000997
  * New best model! Accuracy: 58.89%


Epoch 13/200: 100%|██████████| 390/390 [01:28<00:00,  4.39it/s, loss=0.9962]


Epoch 13/200 | Train Loss: 1.3577 | Val Loss: 1.8226 | Val Acc: 59.20% | LR: 0.000996
  * New best model! Accuracy: 59.20%


Epoch 14/200: 100%|██████████| 390/390 [01:28<00:00,  4.39it/s, loss=1.4079]


Epoch 14/200 | Train Loss: 1.3433 | Val Loss: 1.8045 | Val Acc: 59.59% | LR: 0.000995
  * New best model! Accuracy: 59.59%


Epoch 15/200: 100%|██████████| 390/390 [01:26<00:00,  4.51it/s, loss=1.3179]


Epoch 15/200 | Train Loss: 1.3574 | Val Loss: 1.8226 | Val Acc: 59.67% | LR: 0.000994
  * New best model! Accuracy: 59.67%


Epoch 16/200: 100%|██████████| 390/390 [01:24<00:00,  4.61it/s, loss=1.1172]


Epoch 16/200 | Train Loss: 1.3351 | Val Loss: 1.8275 | Val Acc: 59.30% | LR: 0.000992


Epoch 17/200: 100%|██████████| 390/390 [01:24<00:00,  4.61it/s, loss=1.2427]


Epoch 17/200 | Train Loss: 1.3489 | Val Loss: 1.7666 | Val Acc: 60.99% | LR: 0.000991
  * New best model! Accuracy: 60.99%


Epoch 18/200: 100%|██████████| 390/390 [01:23<00:00,  4.66it/s, loss=1.0305]


Epoch 18/200 | Train Loss: 1.3282 | Val Loss: 1.7273 | Val Acc: 62.15% | LR: 0.000989
  * New best model! Accuracy: 62.15%


Epoch 19/200: 100%|██████████| 390/390 [01:23<00:00,  4.69it/s, loss=1.5957]


Epoch 19/200 | Train Loss: 1.3106 | Val Loss: 1.7084 | Val Acc: 61.48% | LR: 0.000987


Epoch 20/200: 100%|██████████| 390/390 [01:24<00:00,  4.64it/s, loss=1.5281]


Epoch 20/200 | Train Loss: 1.3172 | Val Loss: 1.7018 | Val Acc: 62.07% | LR: 0.000985
  Checkpoint saved: checkpoint_epoch_20.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910


Epoch 21/200: 100%|██████████| 390/390 [01:23<00:00,  4.64it/s, loss=1.1318]


Epoch 21/200 | Train Loss: 1.3093 | Val Loss: 1.7580 | Val Acc: 61.38% | LR: 0.000983


Epoch 22/200: 100%|██████████| 390/390 [01:23<00:00,  4.67it/s, loss=1.4594]


Epoch 22/200 | Train Loss: 1.3175 | Val Loss: 1.6105 | Val Acc: 62.91% | LR: 0.000981
  * New best model! Accuracy: 62.91%


Epoch 23/200: 100%|██████████| 390/390 [01:23<00:00,  4.65it/s, loss=1.3972]


Epoch 23/200 | Train Loss: 1.3275 | Val Loss: 1.8059 | Val Acc: 61.50% | LR: 0.000979


Epoch 24/200: 100%|██████████| 390/390 [01:23<00:00,  4.67it/s, loss=1.4519]


Epoch 24/200 | Train Loss: 1.2915 | Val Loss: 1.7093 | Val Acc: 62.61% | LR: 0.000977


Epoch 25/200: 100%|██████████| 390/390 [01:23<00:00,  4.68it/s, loss=1.5072]


Epoch 25/200 | Train Loss: 1.2940 | Val Loss: 1.7592 | Val Acc: 62.37% | LR: 0.000974


Epoch 26/200: 100%|██████████| 390/390 [01:23<00:00,  4.67it/s, loss=0.9721]


Epoch 26/200 | Train Loss: 1.2887 | Val Loss: 1.7218 | Val Acc: 63.11% | LR: 0.000972
  * New best model! Accuracy: 63.11%


Epoch 27/200: 100%|██████████| 390/390 [01:23<00:00,  4.66it/s, loss=1.0971]


Epoch 27/200 | Train Loss: 1.2808 | Val Loss: 1.6239 | Val Acc: 64.01% | LR: 0.000969
  * New best model! Accuracy: 64.01%


Epoch 28/200: 100%|██████████| 390/390 [01:22<00:00,  4.70it/s, loss=1.2491]


Epoch 28/200 | Train Loss: 1.2844 | Val Loss: 1.6767 | Val Acc: 63.35% | LR: 0.000966


Epoch 29/200: 100%|██████████| 390/390 [01:23<00:00,  4.69it/s, loss=1.0666]


Epoch 29/200 | Train Loss: 1.2800 | Val Loss: 1.6030 | Val Acc: 64.53% | LR: 0.000963
  * New best model! Accuracy: 64.53%


Epoch 30/200: 100%|██████████| 390/390 [01:23<00:00,  4.67it/s, loss=0.9977]


Epoch 30/200 | Train Loss: 1.2753 | Val Loss: 1.6297 | Val Acc: 64.16% | LR: 0.000960
  Checkpoint saved: checkpoint_epoch_30.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910


Epoch 31/200: 100%|██████████| 390/390 [01:23<00:00,  4.69it/s, loss=1.2964]


Epoch 31/200 | Train Loss: 1.2581 | Val Loss: 1.7119 | Val Acc: 64.15% | LR: 0.000957


Epoch 32/200: 100%|██████████| 390/390 [01:23<00:00,  4.67it/s, loss=1.0320]


Epoch 32/200 | Train Loss: 1.2691 | Val Loss: 1.5796 | Val Acc: 65.71% | LR: 0.000953
  * New best model! Accuracy: 65.71%


Epoch 33/200: 100%|██████████| 390/390 [01:23<00:00,  4.65it/s, loss=1.3827]


Epoch 33/200 | Train Loss: 1.2443 | Val Loss: 1.6084 | Val Acc: 64.28% | LR: 0.000950


Epoch 34/200: 100%|██████████| 390/390 [01:24<00:00,  4.64it/s, loss=1.2706]


Epoch 34/200 | Train Loss: 1.2462 | Val Loss: 1.6366 | Val Acc: 64.50% | LR: 0.000946


Epoch 35/200: 100%|██████████| 390/390 [01:23<00:00,  4.66it/s, loss=1.2730]


Epoch 35/200 | Train Loss: 1.2627 | Val Loss: 1.6245 | Val Acc: 64.90% | LR: 0.000943


Epoch 36/200: 100%|██████████| 390/390 [01:23<00:00,  4.66it/s, loss=1.3452]


Epoch 36/200 | Train Loss: 1.2584 | Val Loss: 1.6240 | Val Acc: 64.44% | LR: 0.000939


Epoch 37/200: 100%|██████████| 390/390 [01:23<00:00,  4.68it/s, loss=1.3075]


Epoch 37/200 | Train Loss: 1.2556 | Val Loss: 1.6139 | Val Acc: 64.21% | LR: 0.000935


Epoch 38/200: 100%|██████████| 390/390 [01:23<00:00,  4.66it/s, loss=1.1369]


Epoch 38/200 | Train Loss: 1.2540 | Val Loss: 1.6034 | Val Acc: 65.61% | LR: 0.000931


Epoch 39/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=1.4612]


Epoch 39/200 | Train Loss: 1.2488 | Val Loss: 1.6773 | Val Acc: 65.24% | LR: 0.000927


Epoch 40/200: 100%|██████████| 390/390 [01:24<00:00,  4.64it/s, loss=1.3669]


Epoch 40/200 | Train Loss: 1.2589 | Val Loss: 1.6303 | Val Acc: 66.05% | LR: 0.000923
  * New best model! Accuracy: 66.05%
  Checkpoint saved: checkpoint_epoch_40.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910


Epoch 41/200: 100%|██████████| 390/390 [01:23<00:00,  4.65it/s, loss=1.4454]


Epoch 41/200 | Train Loss: 1.2619 | Val Loss: 1.5169 | Val Acc: 66.86% | LR: 0.000918
  * New best model! Accuracy: 66.86%


Epoch 42/200: 100%|██████████| 390/390 [01:23<00:00,  4.64it/s, loss=1.0291]


Epoch 42/200 | Train Loss: 1.2330 | Val Loss: 1.6708 | Val Acc: 65.50% | LR: 0.000914


Epoch 43/200: 100%|██████████| 390/390 [01:24<00:00,  4.64it/s, loss=1.3008]


Epoch 43/200 | Train Loss: 1.2284 | Val Loss: 1.5334 | Val Acc: 65.26% | LR: 0.000909


Epoch 44/200: 100%|██████████| 390/390 [01:24<00:00,  4.64it/s, loss=1.2190]


Epoch 44/200 | Train Loss: 1.2292 | Val Loss: 1.5558 | Val Acc: 66.95% | LR: 0.000905
  * New best model! Accuracy: 66.95%


Epoch 45/200: 100%|██████████| 390/390 [01:23<00:00,  4.67it/s, loss=1.5086]


Epoch 45/200 | Train Loss: 1.2334 | Val Loss: 1.5539 | Val Acc: 66.32% | LR: 0.000900


Epoch 46/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=1.1046]


Epoch 46/200 | Train Loss: 1.2117 | Val Loss: 1.5161 | Val Acc: 66.96% | LR: 0.000895
  * New best model! Accuracy: 66.96%


Epoch 47/200: 100%|██████████| 390/390 [01:23<00:00,  4.65it/s, loss=0.9730]


Epoch 47/200 | Train Loss: 1.2159 | Val Loss: 1.5386 | Val Acc: 65.64% | LR: 0.000890


Epoch 48/200: 100%|██████████| 390/390 [01:23<00:00,  4.65it/s, loss=1.4863]


Epoch 48/200 | Train Loss: 1.2218 | Val Loss: 1.4886 | Val Acc: 66.74% | LR: 0.000885


Epoch 49/200: 100%|██████████| 390/390 [01:23<00:00,  4.67it/s, loss=1.1626]


Epoch 49/200 | Train Loss: 1.2207 | Val Loss: 1.5310 | Val Acc: 67.70% | LR: 0.000880
  * New best model! Accuracy: 67.70%


Epoch 50/200: 100%|██████████| 390/390 [01:23<00:00,  4.64it/s, loss=1.3418]


Epoch 50/200 | Train Loss: 1.2117 | Val Loss: 1.4984 | Val Acc: 66.85% | LR: 0.000874
  Checkpoint saved: checkpoint_epoch_50.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910


Epoch 51/200: 100%|██████████| 390/390 [01:23<00:00,  4.66it/s, loss=1.2657]


Epoch 51/200 | Train Loss: 1.2283 | Val Loss: 1.5123 | Val Acc: 67.45% | LR: 0.000869


Epoch 52/200: 100%|██████████| 390/390 [01:24<00:00,  4.62it/s, loss=1.1725]


Epoch 52/200 | Train Loss: 1.2171 | Val Loss: 1.5712 | Val Acc: 66.56% | LR: 0.000863


Epoch 53/200: 100%|██████████| 390/390 [01:23<00:00,  4.68it/s, loss=0.9799]


Epoch 53/200 | Train Loss: 1.2053 | Val Loss: 1.4749 | Val Acc: 67.45% | LR: 0.000858


Epoch 54/200: 100%|██████████| 390/390 [01:23<00:00,  4.67it/s, loss=1.1423]


Epoch 54/200 | Train Loss: 1.2093 | Val Loss: 1.4838 | Val Acc: 67.95% | LR: 0.000852
  * New best model! Accuracy: 67.95%


Epoch 55/200: 100%|██████████| 390/390 [01:24<00:00,  4.60it/s, loss=1.3463]


Epoch 55/200 | Train Loss: 1.2115 | Val Loss: 1.5309 | Val Acc: 66.89% | LR: 0.000846


Epoch 56/200: 100%|██████████| 390/390 [01:23<00:00,  4.68it/s, loss=1.0412]


Epoch 56/200 | Train Loss: 1.2093 | Val Loss: 1.5170 | Val Acc: 67.65% | LR: 0.000841


Epoch 57/200: 100%|██████████| 390/390 [01:23<00:00,  4.67it/s, loss=1.1071]


Epoch 57/200 | Train Loss: 1.2033 | Val Loss: 1.4403 | Val Acc: 68.59% | LR: 0.000835
  * New best model! Accuracy: 68.59%


Epoch 58/200: 100%|██████████| 390/390 [01:23<00:00,  4.67it/s, loss=0.8726]


Epoch 58/200 | Train Loss: 1.1868 | Val Loss: 1.4430 | Val Acc: 67.18% | LR: 0.000829


Epoch 59/200: 100%|██████████| 390/390 [01:23<00:00,  4.64it/s, loss=1.1154]


Epoch 59/200 | Train Loss: 1.2112 | Val Loss: 1.5339 | Val Acc: 67.74% | LR: 0.000822


Epoch 60/200: 100%|██████████| 390/390 [01:25<00:00,  4.55it/s, loss=0.9132]


Epoch 60/200 | Train Loss: 1.1844 | Val Loss: 1.4974 | Val Acc: 65.99% | LR: 0.000816
  Checkpoint saved: checkpoint_epoch_60.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910
  Removed old checkpoint: checkpoint_epoch_10.pth


Epoch 61/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=1.1552]


Epoch 61/200 | Train Loss: 1.2008 | Val Loss: 1.4846 | Val Acc: 67.47% | LR: 0.000810


Epoch 62/200: 100%|██████████| 390/390 [01:24<00:00,  4.61it/s, loss=1.1423]


Epoch 62/200 | Train Loss: 1.1970 | Val Loss: 1.3996 | Val Acc: 68.58% | LR: 0.000804


Epoch 63/200: 100%|██████████| 390/390 [01:24<00:00,  4.60it/s, loss=0.8898]


Epoch 63/200 | Train Loss: 1.1927 | Val Loss: 1.4124 | Val Acc: 68.72% | LR: 0.000797
  * New best model! Accuracy: 68.72%


Epoch 64/200: 100%|██████████| 390/390 [01:24<00:00,  4.60it/s, loss=1.2509]


Epoch 64/200 | Train Loss: 1.1906 | Val Loss: 1.4716 | Val Acc: 67.59% | LR: 0.000791


Epoch 65/200: 100%|██████████| 390/390 [01:24<00:00,  4.61it/s, loss=0.8010]


Epoch 65/200 | Train Loss: 1.1841 | Val Loss: 1.4929 | Val Acc: 67.00% | LR: 0.000784


Epoch 66/200: 100%|██████████| 390/390 [01:24<00:00,  4.62it/s, loss=1.2641]


Epoch 66/200 | Train Loss: 1.1974 | Val Loss: 1.4652 | Val Acc: 68.28% | LR: 0.000777


Epoch 67/200: 100%|██████████| 390/390 [01:24<00:00,  4.62it/s, loss=1.3814]


Epoch 67/200 | Train Loss: 1.1771 | Val Loss: 1.4650 | Val Acc: 68.23% | LR: 0.000771


Epoch 68/200: 100%|██████████| 390/390 [01:24<00:00,  4.62it/s, loss=1.3416]


Epoch 68/200 | Train Loss: 1.1934 | Val Loss: 1.4671 | Val Acc: 68.53% | LR: 0.000764


Epoch 69/200: 100%|██████████| 390/390 [01:25<00:00,  4.58it/s, loss=1.3434]


Epoch 69/200 | Train Loss: 1.1810 | Val Loss: 1.4610 | Val Acc: 68.87% | LR: 0.000757
  * New best model! Accuracy: 68.87%


Epoch 70/200: 100%|██████████| 390/390 [01:24<00:00,  4.62it/s, loss=1.2129]


Epoch 70/200 | Train Loss: 1.1798 | Val Loss: 1.4938 | Val Acc: 68.39% | LR: 0.000750
  Checkpoint saved: checkpoint_epoch_70.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910
  Removed old checkpoint: checkpoint_epoch_20.pth


Epoch 71/200: 100%|██████████| 390/390 [01:24<00:00,  4.60it/s, loss=1.3923]


Epoch 71/200 | Train Loss: 1.1689 | Val Loss: 1.4777 | Val Acc: 67.88% | LR: 0.000743


Epoch 72/200: 100%|██████████| 390/390 [01:24<00:00,  4.62it/s, loss=0.7190]


Epoch 72/200 | Train Loss: 1.1888 | Val Loss: 1.4952 | Val Acc: 68.51% | LR: 0.000736


Epoch 73/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=0.7362]


Epoch 73/200 | Train Loss: 1.1713 | Val Loss: 1.4457 | Val Acc: 69.14% | LR: 0.000729
  * New best model! Accuracy: 69.14%


Epoch 74/200: 100%|██████████| 390/390 [01:24<00:00,  4.61it/s, loss=1.1620]


Epoch 74/200 | Train Loss: 1.1735 | Val Loss: 1.5533 | Val Acc: 67.49% | LR: 0.000722


Epoch 75/200: 100%|██████████| 390/390 [01:24<00:00,  4.60it/s, loss=1.3543]


Epoch 75/200 | Train Loss: 1.1394 | Val Loss: 1.4790 | Val Acc: 69.59% | LR: 0.000714
  * New best model! Accuracy: 69.59%


Epoch 76/200: 100%|██████████| 390/390 [01:24<00:00,  4.62it/s, loss=0.9486]


Epoch 76/200 | Train Loss: 1.1704 | Val Loss: 1.4245 | Val Acc: 69.07% | LR: 0.000707


Epoch 77/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=1.3582]


Epoch 77/200 | Train Loss: 1.1559 | Val Loss: 1.3836 | Val Acc: 69.11% | LR: 0.000700


Epoch 78/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=0.8921]


Epoch 78/200 | Train Loss: 1.1683 | Val Loss: 1.4037 | Val Acc: 69.45% | LR: 0.000692


Epoch 79/200: 100%|██████████| 390/390 [01:24<00:00,  4.61it/s, loss=1.1697]


Epoch 79/200 | Train Loss: 1.1614 | Val Loss: 1.3414 | Val Acc: 70.42% | LR: 0.000685
  * New best model! Accuracy: 70.42%


Epoch 80/200: 100%|██████████| 390/390 [01:25<00:00,  4.58it/s, loss=1.3645]


Epoch 80/200 | Train Loss: 1.1648 | Val Loss: 1.4453 | Val Acc: 68.78% | LR: 0.000677
  Checkpoint saved: checkpoint_epoch_80.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910
  Removed old checkpoint: checkpoint_epoch_30.pth


Epoch 81/200: 100%|██████████| 390/390 [01:24<00:00,  4.64it/s, loss=1.3585]


Epoch 81/200 | Train Loss: 1.1767 | Val Loss: 1.4749 | Val Acc: 69.07% | LR: 0.000670


Epoch 82/200: 100%|██████████| 390/390 [01:24<00:00,  4.59it/s, loss=0.7672]


Epoch 82/200 | Train Loss: 1.1444 | Val Loss: 1.3910 | Val Acc: 69.09% | LR: 0.000662


Epoch 83/200: 100%|██████████| 390/390 [01:24<00:00,  4.61it/s, loss=1.2499]


Epoch 83/200 | Train Loss: 1.1714 | Val Loss: 1.5524 | Val Acc: 67.62% | LR: 0.000655


Epoch 84/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=1.1135]


Epoch 84/200 | Train Loss: 1.1517 | Val Loss: 1.3960 | Val Acc: 69.49% | LR: 0.000647


Epoch 85/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=0.7893]


Epoch 85/200 | Train Loss: 1.1514 | Val Loss: 1.3333 | Val Acc: 70.46% | LR: 0.000639
  * New best model! Accuracy: 70.46%


Epoch 86/200: 100%|██████████| 390/390 [01:24<00:00,  4.64it/s, loss=1.2342]


Epoch 86/200 | Train Loss: 1.1596 | Val Loss: 1.3703 | Val Acc: 70.32% | LR: 0.000631


Epoch 87/200: 100%|██████████| 390/390 [01:24<00:00,  4.62it/s, loss=0.8286]


Epoch 87/200 | Train Loss: 1.1397 | Val Loss: 1.3418 | Val Acc: 69.89% | LR: 0.000624


Epoch 88/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=1.0209]


Epoch 88/200 | Train Loss: 1.1336 | Val Loss: 1.3412 | Val Acc: 70.21% | LR: 0.000616


Epoch 89/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=1.3650]


Epoch 89/200 | Train Loss: 1.1401 | Val Loss: 1.4602 | Val Acc: 70.16% | LR: 0.000608


Epoch 90/200: 100%|██████████| 390/390 [01:23<00:00,  4.66it/s, loss=1.3257]


Epoch 90/200 | Train Loss: 1.1575 | Val Loss: 1.3761 | Val Acc: 70.44% | LR: 0.000600
  Checkpoint saved: checkpoint_epoch_90.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910
  Removed old checkpoint: checkpoint_epoch_40.pth


Epoch 91/200: 100%|██████████| 390/390 [01:23<00:00,  4.66it/s, loss=1.3272]


Epoch 91/200 | Train Loss: 1.1394 | Val Loss: 1.3891 | Val Acc: 70.32% | LR: 0.000592


Epoch 92/200: 100%|██████████| 390/390 [01:23<00:00,  4.65it/s, loss=1.2092]


Epoch 92/200 | Train Loss: 1.1362 | Val Loss: 1.3559 | Val Acc: 70.53% | LR: 0.000584
  * New best model! Accuracy: 70.53%


Epoch 93/200: 100%|██████████| 390/390 [01:24<00:00,  4.62it/s, loss=1.4255]


Epoch 93/200 | Train Loss: 1.1418 | Val Loss: 1.3817 | Val Acc: 70.27% | LR: 0.000576


Epoch 94/200: 100%|██████████| 390/390 [01:24<00:00,  4.59it/s, loss=1.0491]


Epoch 94/200 | Train Loss: 1.1556 | Val Loss: 1.3427 | Val Acc: 70.90% | LR: 0.000568
  * New best model! Accuracy: 70.90%


Epoch 95/200: 100%|██████████| 390/390 [01:24<00:00,  4.64it/s, loss=1.3141]


Epoch 95/200 | Train Loss: 1.1176 | Val Loss: 1.3692 | Val Acc: 69.75% | LR: 0.000560


Epoch 96/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=0.8747]


Epoch 96/200 | Train Loss: 1.1209 | Val Loss: 1.3097 | Val Acc: 71.68% | LR: 0.000552
  * New best model! Accuracy: 71.68%


Epoch 97/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=1.1227]


Epoch 97/200 | Train Loss: 1.1295 | Val Loss: 1.3530 | Val Acc: 70.99% | LR: 0.000544


Epoch 98/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=1.3367]


Epoch 98/200 | Train Loss: 1.1249 | Val Loss: 1.3724 | Val Acc: 70.29% | LR: 0.000536


Epoch 99/200: 100%|██████████| 390/390 [01:23<00:00,  4.68it/s, loss=1.0739]


Epoch 99/200 | Train Loss: 1.1077 | Val Loss: 1.3525 | Val Acc: 71.17% | LR: 0.000528


Epoch 100/200: 100%|██████████| 390/390 [01:23<00:00,  4.65it/s, loss=0.8172]


Epoch 100/200 | Train Loss: 1.1267 | Val Loss: 1.3282 | Val Acc: 72.02% | LR: 0.000520
  * New best model! Accuracy: 72.02%
  Checkpoint saved: checkpoint_epoch_100.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910
  Removed old checkpoint: checkpoint_epoch_100.pth


Epoch 101/200: 100%|██████████| 390/390 [01:24<00:00,  4.62it/s, loss=1.2163]


Epoch 101/200 | Train Loss: 1.1249 | Val Loss: 1.3244 | Val Acc: 71.55% | LR: 0.000512


Epoch 102/200: 100%|██████████| 390/390 [01:23<00:00,  4.65it/s, loss=0.9652]


Epoch 102/200 | Train Loss: 1.1278 | Val Loss: 1.3092 | Val Acc: 71.65% | LR: 0.000504


Epoch 103/200: 100%|██████████| 390/390 [01:23<00:00,  4.65it/s, loss=1.4078]


Epoch 103/200 | Train Loss: 1.1062 | Val Loss: 1.3400 | Val Acc: 71.83% | LR: 0.000496


Epoch 104/200: 100%|██████████| 390/390 [01:23<00:00,  4.65it/s, loss=0.7615]


Epoch 104/200 | Train Loss: 1.1171 | Val Loss: 1.2504 | Val Acc: 72.47% | LR: 0.000488
  * New best model! Accuracy: 72.47%


Epoch 105/200: 100%|██████████| 390/390 [01:23<00:00,  4.64it/s, loss=1.2533]


Epoch 105/200 | Train Loss: 1.1077 | Val Loss: 1.3367 | Val Acc: 70.87% | LR: 0.000480


Epoch 106/200: 100%|██████████| 390/390 [01:24<00:00,  4.61it/s, loss=0.6975]


Epoch 106/200 | Train Loss: 1.1053 | Val Loss: 1.2747 | Val Acc: 70.86% | LR: 0.000472


Epoch 107/200: 100%|██████████| 390/390 [01:24<00:00,  4.64it/s, loss=1.3947]


Epoch 107/200 | Train Loss: 1.0940 | Val Loss: 1.3038 | Val Acc: 72.41% | LR: 0.000464


Epoch 108/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=1.0779]


Epoch 108/200 | Train Loss: 1.1101 | Val Loss: 1.3076 | Val Acc: 72.44% | LR: 0.000456


Epoch 109/200: 100%|██████████| 390/390 [01:24<00:00,  4.61it/s, loss=1.2837]


Epoch 109/200 | Train Loss: 1.0854 | Val Loss: 1.2897 | Val Acc: 72.14% | LR: 0.000448


Epoch 110/200: 100%|██████████| 390/390 [01:24<00:00,  4.60it/s, loss=1.3429]


Epoch 110/200 | Train Loss: 1.1098 | Val Loss: 1.2683 | Val Acc: 72.76% | LR: 0.000440
  * New best model! Accuracy: 72.76%
  Checkpoint saved: checkpoint_epoch_110.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910
  Removed old checkpoint: checkpoint_epoch_110.pth


Epoch 111/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=0.8404]


Epoch 111/200 | Train Loss: 1.1150 | Val Loss: 1.2949 | Val Acc: 72.52% | LR: 0.000432


Epoch 112/200: 100%|██████████| 390/390 [01:11<00:00,  5.47it/s, loss=0.9005]


Epoch 112/200 | Train Loss: 1.0924 | Val Loss: 1.2810 | Val Acc: 72.03% | LR: 0.000424


Epoch 113/200: 100%|██████████| 390/390 [01:10<00:00,  5.50it/s, loss=1.1897]


Epoch 113/200 | Train Loss: 1.0957 | Val Loss: 1.2717 | Val Acc: 71.35% | LR: 0.000416


Epoch 114/200: 100%|██████████| 390/390 [01:11<00:00,  5.49it/s, loss=1.2362]


Epoch 114/200 | Train Loss: 1.0685 | Val Loss: 1.2692 | Val Acc: 72.87% | LR: 0.000408
  * New best model! Accuracy: 72.87%


Epoch 115/200: 100%|██████████| 390/390 [01:10<00:00,  5.52it/s, loss=0.8780]


Epoch 115/200 | Train Loss: 1.0780 | Val Loss: 1.2245 | Val Acc: 73.45% | LR: 0.000400
  * New best model! Accuracy: 73.45%


Epoch 116/200: 100%|██████████| 390/390 [01:10<00:00,  5.54it/s, loss=1.1663]


Epoch 116/200 | Train Loss: 1.0987 | Val Loss: 1.3021 | Val Acc: 72.29% | LR: 0.000392


Epoch 117/200: 100%|██████████| 390/390 [01:10<00:00,  5.54it/s, loss=1.3845]


Epoch 117/200 | Train Loss: 1.0931 | Val Loss: 1.2896 | Val Acc: 72.92% | LR: 0.000384


Epoch 118/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.7777]


Epoch 118/200 | Train Loss: 1.0689 | Val Loss: 1.2783 | Val Acc: 72.50% | LR: 0.000376


Epoch 119/200: 100%|██████████| 390/390 [01:10<00:00,  5.51it/s, loss=1.2137]


Epoch 119/200 | Train Loss: 1.0652 | Val Loss: 1.2363 | Val Acc: 73.18% | LR: 0.000369


Epoch 120/200: 100%|██████████| 390/390 [01:10<00:00,  5.53it/s, loss=1.2832]


Epoch 120/200 | Train Loss: 1.1006 | Val Loss: 1.2306 | Val Acc: 73.62% | LR: 0.000361
  * New best model! Accuracy: 73.62%
  Checkpoint saved: checkpoint_epoch_120.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910
  Removed old checkpoint: checkpoint_epoch_120.pth


Epoch 121/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.8316]


Epoch 121/200 | Train Loss: 1.0833 | Val Loss: 1.2590 | Val Acc: 73.22% | LR: 0.000353


Epoch 122/200: 100%|██████████| 390/390 [01:13<00:00,  5.32it/s, loss=0.8057]


Epoch 122/200 | Train Loss: 1.0733 | Val Loss: 1.1849 | Val Acc: 74.47% | LR: 0.000345
  * New best model! Accuracy: 74.47%


Epoch 123/200: 100%|██████████| 390/390 [01:26<00:00,  4.52it/s, loss=1.2661]


Epoch 123/200 | Train Loss: 1.0886 | Val Loss: 1.2588 | Val Acc: 73.22% | LR: 0.000338


Epoch 124/200: 100%|██████████| 390/390 [01:26<00:00,  4.52it/s, loss=1.2007]


Epoch 124/200 | Train Loss: 1.0705 | Val Loss: 1.2190 | Val Acc: 73.60% | LR: 0.000330


Epoch 125/200: 100%|██████████| 390/390 [01:26<00:00,  4.53it/s, loss=0.7618]


Epoch 125/200 | Train Loss: 1.0589 | Val Loss: 1.1910 | Val Acc: 73.76% | LR: 0.000323


Epoch 126/200: 100%|██████████| 390/390 [01:26<00:00,  4.50it/s, loss=0.7976]


Epoch 126/200 | Train Loss: 1.0531 | Val Loss: 1.2043 | Val Acc: 73.72% | LR: 0.000315


Epoch 127/200: 100%|██████████| 390/390 [01:25<00:00,  4.54it/s, loss=1.2054]


Epoch 127/200 | Train Loss: 1.0639 | Val Loss: 1.2034 | Val Acc: 74.18% | LR: 0.000308


Epoch 128/200: 100%|██████████| 390/390 [01:26<00:00,  4.50it/s, loss=0.6323]


Epoch 128/200 | Train Loss: 1.0431 | Val Loss: 1.1435 | Val Acc: 74.56% | LR: 0.000300
  * New best model! Accuracy: 74.56%


Epoch 129/200: 100%|██████████| 390/390 [01:26<00:00,  4.51it/s, loss=0.9547]


Epoch 129/200 | Train Loss: 1.0610 | Val Loss: 1.1992 | Val Acc: 74.16% | LR: 0.000293


Epoch 130/200: 100%|██████████| 390/390 [01:26<00:00,  4.52it/s, loss=1.2429]


Epoch 130/200 | Train Loss: 1.0503 | Val Loss: 1.1942 | Val Acc: 74.19% | LR: 0.000286
  Checkpoint saved: checkpoint_epoch_130.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910
  Removed old checkpoint: checkpoint_epoch_130.pth


Epoch 131/200: 100%|██████████| 390/390 [01:26<00:00,  4.53it/s, loss=1.2050]


Epoch 131/200 | Train Loss: 1.0657 | Val Loss: 1.2296 | Val Acc: 74.37% | LR: 0.000278


Epoch 132/200: 100%|██████████| 390/390 [01:26<00:00,  4.53it/s, loss=1.1473]


Epoch 132/200 | Train Loss: 1.0452 | Val Loss: 1.2016 | Val Acc: 74.18% | LR: 0.000271


Epoch 133/200: 100%|██████████| 390/390 [01:26<00:00,  4.52it/s, loss=1.1944]


Epoch 133/200 | Train Loss: 1.0531 | Val Loss: 1.1902 | Val Acc: 74.36% | LR: 0.000264


Epoch 134/200: 100%|██████████| 390/390 [01:26<00:00,  4.50it/s, loss=1.2102]


Epoch 134/200 | Train Loss: 1.0401 | Val Loss: 1.2047 | Val Acc: 74.54% | LR: 0.000257


Epoch 135/200: 100%|██████████| 390/390 [01:26<00:00,  4.48it/s, loss=1.2074]


Epoch 135/200 | Train Loss: 1.0335 | Val Loss: 1.1476 | Val Acc: 74.74% | LR: 0.000250
  * New best model! Accuracy: 74.74%


Epoch 136/200: 100%|██████████| 390/390 [01:25<00:00,  4.54it/s, loss=1.1559]


Epoch 136/200 | Train Loss: 1.0450 | Val Loss: 1.2011 | Val Acc: 74.51% | LR: 0.000243


Epoch 137/200: 100%|██████████| 390/390 [01:25<00:00,  4.54it/s, loss=1.1390]


Epoch 137/200 | Train Loss: 1.0327 | Val Loss: 1.1982 | Val Acc: 74.29% | LR: 0.000236


Epoch 138/200: 100%|██████████| 390/390 [01:26<00:00,  4.53it/s, loss=0.9657]


Epoch 138/200 | Train Loss: 1.0666 | Val Loss: 1.1539 | Val Acc: 75.52% | LR: 0.000229
  * New best model! Accuracy: 75.52%


Epoch 139/200: 100%|██████████| 390/390 [01:25<00:00,  4.55it/s, loss=0.8013]


Epoch 139/200 | Train Loss: 1.0458 | Val Loss: 1.1706 | Val Acc: 74.78% | LR: 0.000223


Epoch 140/200: 100%|██████████| 390/390 [01:25<00:00,  4.54it/s, loss=1.0407]


Epoch 140/200 | Train Loss: 1.0200 | Val Loss: 1.1022 | Val Acc: 75.71% | LR: 0.000216
  * New best model! Accuracy: 75.71%
  Checkpoint saved: checkpoint_epoch_140.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910
  Removed old checkpoint: checkpoint_epoch_140.pth


Epoch 141/200: 100%|██████████| 390/390 [01:25<00:00,  4.54it/s, loss=1.2301]


Epoch 141/200 | Train Loss: 1.0361 | Val Loss: 1.1609 | Val Acc: 75.61% | LR: 0.000209


Epoch 142/200: 100%|██████████| 390/390 [01:25<00:00,  4.54it/s, loss=0.7892]


Epoch 142/200 | Train Loss: 1.0288 | Val Loss: 1.1545 | Val Acc: 75.70% | LR: 0.000203


Epoch 143/200: 100%|██████████| 390/390 [01:26<00:00,  4.52it/s, loss=1.2207]


Epoch 143/200 | Train Loss: 1.0388 | Val Loss: 1.1470 | Val Acc: 75.56% | LR: 0.000196


Epoch 144/200: 100%|██████████| 390/390 [01:26<00:00,  4.53it/s, loss=1.1308]


Epoch 144/200 | Train Loss: 1.0267 | Val Loss: 1.1584 | Val Acc: 75.33% | LR: 0.000190


Epoch 145/200: 100%|██████████| 390/390 [01:26<00:00,  4.52it/s, loss=1.2685]


Epoch 145/200 | Train Loss: 1.0340 | Val Loss: 1.1399 | Val Acc: 75.43% | LR: 0.000184


Epoch 146/200: 100%|██████████| 390/390 [01:25<00:00,  4.57it/s, loss=1.1469]


Epoch 146/200 | Train Loss: 1.0073 | Val Loss: 1.1479 | Val Acc: 75.19% | LR: 0.000178


Epoch 147/200: 100%|██████████| 390/390 [01:25<00:00,  4.54it/s, loss=0.9327]


Epoch 147/200 | Train Loss: 1.0158 | Val Loss: 1.1038 | Val Acc: 75.77% | LR: 0.000171
  * New best model! Accuracy: 75.77%


Epoch 148/200: 100%|██████████| 390/390 [01:25<00:00,  4.54it/s, loss=0.9351]


Epoch 148/200 | Train Loss: 1.0286 | Val Loss: 1.1704 | Val Acc: 75.67% | LR: 0.000165


Epoch 149/200: 100%|██████████| 390/390 [01:25<00:00,  4.55it/s, loss=1.2857]


Epoch 149/200 | Train Loss: 1.0244 | Val Loss: 1.1530 | Val Acc: 75.60% | LR: 0.000159


Epoch 150/200: 100%|██████████| 390/390 [01:25<00:00,  4.54it/s, loss=0.6687]


Epoch 150/200 | Train Loss: 0.9888 | Val Loss: 1.0892 | Val Acc: 75.90% | LR: 0.000154
  * New best model! Accuracy: 75.90%
  Checkpoint saved: checkpoint_epoch_150.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910
  Removed old checkpoint: checkpoint_epoch_150.pth


Epoch 151/200: 100%|██████████| 390/390 [01:19<00:00,  4.88it/s, loss=1.3015]


Epoch 151/200 | Train Loss: 1.0234 | Val Loss: 1.1553 | Val Acc: 76.13% | LR: 0.000148
  * New best model! Accuracy: 76.13%


Epoch 152/200: 100%|██████████| 390/390 [01:12<00:00,  5.41it/s, loss=1.0131]


Epoch 152/200 | Train Loss: 1.0312 | Val Loss: 1.1354 | Val Acc: 76.07% | LR: 0.000142


Epoch 153/200: 100%|██████████| 390/390 [01:11<00:00,  5.45it/s, loss=0.9762]


Epoch 153/200 | Train Loss: 1.0158 | Val Loss: 1.1030 | Val Acc: 76.35% | LR: 0.000137
  * New best model! Accuracy: 76.35%


Epoch 154/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=1.0201]


Epoch 154/200 | Train Loss: 0.9919 | Val Loss: 1.1145 | Val Acc: 76.09% | LR: 0.000131


Epoch 155/200: 100%|██████████| 390/390 [01:10<00:00,  5.50it/s, loss=1.1463]


Epoch 155/200 | Train Loss: 0.9985 | Val Loss: 1.1513 | Val Acc: 75.56% | LR: 0.000126


Epoch 156/200: 100%|██████████| 390/390 [01:10<00:00,  5.53it/s, loss=1.2519]


Epoch 156/200 | Train Loss: 0.9918 | Val Loss: 1.1368 | Val Acc: 76.22% | LR: 0.000120


Epoch 157/200: 100%|██████████| 390/390 [01:15<00:00,  5.18it/s, loss=1.2102]


Epoch 157/200 | Train Loss: 0.9959 | Val Loss: 1.1121 | Val Acc: 76.26% | LR: 0.000115


Epoch 158/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=1.1001]


Epoch 158/200 | Train Loss: 0.9795 | Val Loss: 1.0749 | Val Acc: 76.74% | LR: 0.000110
  * New best model! Accuracy: 76.74%


Epoch 159/200: 100%|██████████| 390/390 [01:24<00:00,  4.62it/s, loss=0.6659]


Epoch 159/200 | Train Loss: 1.0037 | Val Loss: 1.1122 | Val Acc: 76.38% | LR: 0.000105


Epoch 160/200: 100%|██████████| 390/390 [01:24<00:00,  4.61it/s, loss=1.2831]


Epoch 160/200 | Train Loss: 0.9842 | Val Loss: 1.0800 | Val Acc: 76.88% | LR: 0.000100
  * New best model! Accuracy: 76.88%
  Checkpoint saved: checkpoint_epoch_160.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910
  Removed old checkpoint: checkpoint_epoch_160.pth


Epoch 161/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=1.2200]


Epoch 161/200 | Train Loss: 0.9936 | Val Loss: 1.1069 | Val Acc: 76.72% | LR: 0.000095


Epoch 162/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=0.6424]


Epoch 162/200 | Train Loss: 0.9820 | Val Loss: 1.0962 | Val Acc: 76.76% | LR: 0.000091


Epoch 163/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=0.7108]


Epoch 163/200 | Train Loss: 1.0046 | Val Loss: 1.0669 | Val Acc: 76.71% | LR: 0.000086


Epoch 164/200: 100%|██████████| 390/390 [01:23<00:00,  4.65it/s, loss=0.7799]


Epoch 164/200 | Train Loss: 0.9950 | Val Loss: 1.1031 | Val Acc: 76.69% | LR: 0.000082


Epoch 165/200: 100%|██████████| 390/390 [01:23<00:00,  4.65it/s, loss=1.2661]


Epoch 165/200 | Train Loss: 0.9833 | Val Loss: 1.1041 | Val Acc: 76.45% | LR: 0.000077


Epoch 166/200: 100%|██████████| 390/390 [01:24<00:00,  4.64it/s, loss=1.1435]


Epoch 166/200 | Train Loss: 0.9872 | Val Loss: 1.0655 | Val Acc: 77.13% | LR: 0.000073
  * New best model! Accuracy: 77.13%


Epoch 167/200: 100%|██████████| 390/390 [01:24<00:00,  4.60it/s, loss=1.2375]


Epoch 167/200 | Train Loss: 0.9923 | Val Loss: 1.0877 | Val Acc: 77.41% | LR: 0.000069
  * New best model! Accuracy: 77.41%


Epoch 168/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=0.7618]


Epoch 168/200 | Train Loss: 0.9979 | Val Loss: 1.0819 | Val Acc: 76.88% | LR: 0.000065


Epoch 169/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=0.7459]


Epoch 169/200 | Train Loss: 0.9798 | Val Loss: 1.0509 | Val Acc: 77.12% | LR: 0.000061


Epoch 170/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=1.2887]


Epoch 170/200 | Train Loss: 0.9839 | Val Loss: 1.0697 | Val Acc: 77.17% | LR: 0.000057
  Checkpoint saved: checkpoint_epoch_170.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910
  Removed old checkpoint: checkpoint_epoch_170.pth


Epoch 171/200: 100%|██████████| 390/390 [01:24<00:00,  4.62it/s, loss=0.5400]


Epoch 171/200 | Train Loss: 0.9765 | Val Loss: 1.0828 | Val Acc: 77.26% | LR: 0.000054


Epoch 172/200: 100%|██████████| 390/390 [01:24<00:00,  4.64it/s, loss=1.2355]


Epoch 172/200 | Train Loss: 0.9870 | Val Loss: 1.1203 | Val Acc: 76.75% | LR: 0.000050


Epoch 173/200: 100%|██████████| 390/390 [01:23<00:00,  4.66it/s, loss=0.6794]


Epoch 173/200 | Train Loss: 0.9603 | Val Loss: 1.0402 | Val Acc: 77.53% | LR: 0.000047
  * New best model! Accuracy: 77.53%


Epoch 174/200: 100%|██████████| 390/390 [01:24<00:00,  4.62it/s, loss=0.6364]


Epoch 174/200 | Train Loss: 0.9823 | Val Loss: 1.0562 | Val Acc: 77.41% | LR: 0.000043


Epoch 175/200: 100%|██████████| 390/390 [01:23<00:00,  4.65it/s, loss=1.0454]


Epoch 175/200 | Train Loss: 0.9624 | Val Loss: 1.0733 | Val Acc: 77.33% | LR: 0.000040


Epoch 176/200: 100%|██████████| 390/390 [01:23<00:00,  4.64it/s, loss=0.6899]


Epoch 176/200 | Train Loss: 0.9755 | Val Loss: 1.0657 | Val Acc: 77.30% | LR: 0.000037


Epoch 177/200: 100%|██████████| 390/390 [01:24<00:00,  4.62it/s, loss=1.1082]


Epoch 177/200 | Train Loss: 0.9859 | Val Loss: 1.0720 | Val Acc: 77.29% | LR: 0.000034


Epoch 178/200: 100%|██████████| 390/390 [01:23<00:00,  4.65it/s, loss=1.2648]


Epoch 178/200 | Train Loss: 0.9657 | Val Loss: 1.0502 | Val Acc: 77.59% | LR: 0.000031
  * New best model! Accuracy: 77.59%


Epoch 179/200: 100%|██████████| 390/390 [01:23<00:00,  4.65it/s, loss=1.0595]


Epoch 179/200 | Train Loss: 0.9851 | Val Loss: 1.0649 | Val Acc: 77.59% | LR: 0.000028


Epoch 180/200: 100%|██████████| 390/390 [01:24<00:00,  4.64it/s, loss=1.1567]


Epoch 180/200 | Train Loss: 0.9709 | Val Loss: 1.0215 | Val Acc: 77.93% | LR: 0.000026
  * New best model! Accuracy: 77.93%
  Checkpoint saved: checkpoint_epoch_180.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910
  Removed old checkpoint: checkpoint_epoch_180.pth


Epoch 181/200: 100%|██████████| 390/390 [01:24<00:00,  4.62it/s, loss=1.0300]


Epoch 181/200 | Train Loss: 0.9746 | Val Loss: 1.0686 | Val Acc: 77.53% | LR: 0.000023


Epoch 182/200: 100%|██████████| 390/390 [01:23<00:00,  4.64it/s, loss=1.1910]


Epoch 182/200 | Train Loss: 0.9579 | Val Loss: 1.0924 | Val Acc: 77.60% | LR: 0.000021


Epoch 183/200: 100%|██████████| 390/390 [01:24<00:00,  4.64it/s, loss=1.1645]


Epoch 183/200 | Train Loss: 0.9892 | Val Loss: 1.0570 | Val Acc: 77.60% | LR: 0.000019


Epoch 184/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=0.9647]


Epoch 184/200 | Train Loss: 0.9793 | Val Loss: 1.0552 | Val Acc: 77.61% | LR: 0.000017


Epoch 185/200: 100%|██████████| 390/390 [01:24<00:00,  4.64it/s, loss=0.6776]


Epoch 185/200 | Train Loss: 0.9411 | Val Loss: 1.0324 | Val Acc: 77.81% | LR: 0.000015


Epoch 186/200: 100%|██████████| 390/390 [01:23<00:00,  4.65it/s, loss=0.5963]


Epoch 186/200 | Train Loss: 0.9598 | Val Loss: 1.0457 | Val Acc: 77.66% | LR: 0.000013


Epoch 187/200: 100%|██████████| 390/390 [01:23<00:00,  4.64it/s, loss=1.1580]


Epoch 187/200 | Train Loss: 0.9734 | Val Loss: 1.0573 | Val Acc: 77.42% | LR: 0.000011


Epoch 188/200: 100%|██████████| 390/390 [01:24<00:00,  4.64it/s, loss=1.1970]


Epoch 188/200 | Train Loss: 0.9585 | Val Loss: 1.0584 | Val Acc: 77.75% | LR: 0.000009


Epoch 189/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=0.7517]


Epoch 189/200 | Train Loss: 0.9580 | Val Loss: 1.0484 | Val Acc: 77.60% | LR: 0.000008


Epoch 190/200: 100%|██████████| 390/390 [01:24<00:00,  4.63it/s, loss=0.8255]


Epoch 190/200 | Train Loss: 0.9464 | Val Loss: 1.0334 | Val Acc: 77.79% | LR: 0.000006
  Checkpoint saved: checkpoint_epoch_190.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910
  Removed old checkpoint: checkpoint_epoch_190.pth


Epoch 191/200: 100%|██████████| 390/390 [01:23<00:00,  4.66it/s, loss=0.8895]


Epoch 191/200 | Train Loss: 0.9485 | Val Loss: 1.0500 | Val Acc: 77.90% | LR: 0.000005


Epoch 192/200: 100%|██████████| 390/390 [01:24<00:00,  4.64it/s, loss=0.7977]


Epoch 192/200 | Train Loss: 0.9392 | Val Loss: 1.0376 | Val Acc: 77.85% | LR: 0.000004


Epoch 193/200: 100%|██████████| 390/390 [01:24<00:00,  4.64it/s, loss=1.0526]


Epoch 193/200 | Train Loss: 0.9655 | Val Loss: 1.0611 | Val Acc: 77.61% | LR: 0.000003


Epoch 194/200: 100%|██████████| 390/390 [01:23<00:00,  4.65it/s, loss=1.2405]


Epoch 194/200 | Train Loss: 0.9751 | Val Loss: 1.0941 | Val Acc: 77.60% | LR: 0.000002


Epoch 195/200: 100%|██████████| 390/390 [01:23<00:00,  4.65it/s, loss=1.1346]


Epoch 195/200 | Train Loss: 0.9712 | Val Loss: 1.0193 | Val Acc: 77.90% | LR: 0.000002


Epoch 196/200: 100%|██████████| 390/390 [01:23<00:00,  4.65it/s, loss=0.5931]


Epoch 196/200 | Train Loss: 0.9654 | Val Loss: 1.0629 | Val Acc: 77.70% | LR: 0.000001


Epoch 197/200: 100%|██████████| 390/390 [01:23<00:00,  4.67it/s, loss=0.9495]


Epoch 197/200 | Train Loss: 0.9446 | Val Loss: 1.0163 | Val Acc: 77.86% | LR: 0.000001


Epoch 198/200: 100%|██████████| 390/390 [01:23<00:00,  4.67it/s, loss=0.7216]


Epoch 198/200 | Train Loss: 0.9674 | Val Loss: 1.0373 | Val Acc: 77.71% | LR: 0.000000


Epoch 199/200: 100%|██████████| 390/390 [01:24<00:00,  4.62it/s, loss=1.1002]


Epoch 199/200 | Train Loss: 0.9596 | Val Loss: 1.0500 | Val Acc: 77.54% | LR: 0.000000


Epoch 200/200: 100%|██████████| 390/390 [01:26<00:00,  4.51it/s, loss=1.2580]


Epoch 200/200 | Train Loss: 0.9420 | Val Loss: 1.0756 | Val Acc: 77.64% | LR: 0.000000
  Checkpoint saved: checkpoint_epoch_200.pth
History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910
  Removed old checkpoint: checkpoint_epoch_200.pth


<All keys matched successfully>

In [11]:
# Cell 10: Save Final Results
# Save training history
logger.save_history()

# Save final results (same structure as teacher)
final_results = logger.save_final_results(
    model_name="EfficientNetV2-S",
    total_epochs=total_epochs,
    early_stopped=early_stopped,
    config=config.to_dict(),
    teacher_accuracy=teacher_accuracy
)

# Save final model
final_model_path = MODELS_DIR / f"{config.experiment_id}_final.pth"
torch.save({
    'model_state_dict': best_weights,
    'config': config.to_dict(),
    'results': final_results,
    'history': logger.history
}, final_model_path)

print(f"\nFinal model saved: {final_model_path}")

History saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910

TRAINING COMPLETE - v4_saturation_20251209_170910
Model: EfficientNetV2-S
Best Accuracy: 77.93%
Best Epoch: 180
Total Epochs: 200
Early Stopped: False
Training Time: 336.5 minutes
Teacher Accuracy: 84.39%
Retention Rate: 92.35%
Results saved: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910

Final model saved: d:\Projects\KnowledgeDistillation\code_v2_32\models\v4_saturation_final.pth


In [12]:
# Cell 11: Summary
print("\n" + "=" * 60)
print("EXPERIMENT COMPLETE")
print("=" * 60)
print(f"\nExperiment: {config.experiment_id}")
print(f"Method: {config.distillation.method}")
print(f"\nResults:")
print(f"  Teacher Accuracy: {teacher_accuracy:.2f}%")
print(f"  Student Accuracy: {logger.best_accuracy:.2f}%")
print(f"  Retention Rate: {(logger.best_accuracy/teacher_accuracy)*100:.2f}%")
print(f"\nTraining:")
print(f"  Best Epoch: {logger.best_epoch}")
print(f"  Total Epochs: {total_epochs}")
print(f"  Early Stopped: {early_stopped}")
print(f"  Training Time: {logger.get_training_time():.1f} minutes")
print(f"\nSaved Files:")
print(f"  Results: {logger.results_dir}")
print(f"    - config.json")
print(f"    - training_history.csv")
print(f"    - training_history.json")
print(f"    - final_results.json")
print(f"  Model: {final_model_path}")


EXPERIMENT COMPLETE

Experiment: v4_saturation
Method: standard_kd

Results:
  Teacher Accuracy: 84.39%
  Student Accuracy: 77.93%
  Retention Rate: 92.35%

Training:
  Best Epoch: 180
  Total Epochs: 200
  Early Stopped: False
  Training Time: 336.7 minutes

Saved Files:
  Results: d:\Projects\KnowledgeDistillation\code_v2_32\results\v4_saturation_20251209_170910
    - config.json
    - training_history.csv
    - training_history.json
    - final_results.json
  Model: d:\Projects\KnowledgeDistillation\code_v2_32\models\v4_saturation_final.pth
