# Knowledge Distillation for EfficientNetV2 on CIFAR-100 (SOTA) - Local Version

This notebook implements **State-of-the-Art** knowledge distillation with:

- **Decoupled Knowledge Distillation (DKD)** - Separates target and non-target class knowledge
- **CutMix + Mixup** - Advanced data augmentation

---

## Requirements

```bash
pip install torch torchvision tqdm thop matplotlib numpy
```

### Expected Results:

- **Teacher (EfficientNetV2-L):** ~68-70% accuracy
- **Distilled Student (DKD):** ~69-72% accuracy

---


In [1]:
# Cell 1: Setup and Imports
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torchvision.models import efficientnet_v2_s, efficientnet_v2_l, EfficientNet_V2_S_Weights, EfficientNet_V2_L_Weights
from tqdm import tqdm
import copy
import numpy as np
import matplotlib.pyplot as plt
import glob
import random
from pathlib import Path

# Setup Directories (Local paths)
PROJECT_ROOT = Path('./outputs')
MODEL_DIR = PROJECT_ROOT / 'models'
DATA_DIR = PROJECT_ROOT / 'data'
CHECKPOINT_DIR = PROJECT_ROOT / 'checkpoints'

MODEL_DIR.mkdir(parents=True, exist_ok=True)
DATA_DIR.mkdir(parents=True, exist_ok=True)
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Directories ready at: {PROJECT_ROOT.absolute()}")

# Reproducibility
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.benchmark = True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Directories ready at: d:\Projects\MasterProject\code\outputs
Using device: cuda
GPU: NVIDIA GeForce RTX 5070 Laptop GPU


In [2]:
# Cell 2: Experiment Configuration (SOTA)
# ==========================================
# HYPERPARAMETERS
# ==========================================
NUM_EPOCHS = 200
BATCH_SIZE = 128
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.05
PATIENCE = 30

# Distillation Params (DKD)
DKD_ALPHA = 1.0             # Weight for Target Knowledge
DKD_BETA = 8.0              # Weight for Non-Target Knowledge (Crucial)
TEMPERATURE = 4.0           # Softmax Temperature

# Augmentation Params
MIXUP_ALPHA = 0.8
CUTMIX_ALPHA = 1.0
CHECKPOINT_FREQUENCY = 20
NUM_CLASSES = 100

print(f"{'='*50}")
print(f"CONFIG: Epochs={NUM_EPOCHS} | Batch={BATCH_SIZE} | Temp={TEMPERATURE}")
print(f"DKD: Alpha={DKD_ALPHA}, Beta={DKD_BETA}")
print(f"{'='*50}")

CONFIG: Epochs=200 | Batch=128 | Temp=4.0
DKD: Alpha=1.0, Beta=8.0


In [3]:
# Cell 3: Data Loading
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761)),
])

trainset = torchvision.datasets.CIFAR100(root=str(DATA_DIR), train=True, download=True, transform=transform_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True, 
                                          num_workers=4, pin_memory=True, drop_last=True)

testset = torchvision.datasets.CIFAR100(root=str(DATA_DIR), train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False, 
                                         num_workers=4, pin_memory=True)

print(f"Data loaded: {len(trainset)} Training, {len(testset)} Test images")

Data loaded: 50000 Training, 10000 Test images


In [4]:
# Cell 4: Helper Functions (KD Loss, CutMix, Utils)

# ==========================================
# 1. Standard Knowledge Distillation Loss (Stable)
# Combines soft target loss with hard target loss
# ==========================================
def kd_loss(student_logits, teacher_logits, labels, temp=4.0, alpha=0.9):
    """
    Standard Knowledge Distillation Loss (Hinton et al.)
    - Soft loss: KL divergence between student and teacher soft targets
    - Hard loss: Cross-entropy with true labels
    - alpha: weight for soft loss (1-alpha for hard loss)
    """
    # Soft targets (KL Divergence)
    soft_student = F.log_softmax(student_logits / temp, dim=1)
    soft_teacher = F.softmax(teacher_logits / temp, dim=1)
    soft_loss = F.kl_div(soft_student, soft_teacher, reduction='batchmean') * (temp ** 2)
    
    # Hard targets (Cross Entropy)
    hard_loss = F.cross_entropy(student_logits, labels)
    
    # Combined loss
    loss = alpha * soft_loss + (1 - alpha) * hard_loss
    
    return loss

def kd_loss_mixup(student_logits, teacher_logits, labels_a, labels_b, lam, temp=4.0, alpha=0.9):
    """
    KD Loss for mixed samples (CutMix/Mixup compatible)
    """
    # Soft targets (KL Divergence) - same for both labels since teacher sees mixed input
    soft_student = F.log_softmax(student_logits / temp, dim=1)
    soft_teacher = F.softmax(teacher_logits / temp, dim=1)
    soft_loss = F.kl_div(soft_student, soft_teacher, reduction='batchmean') * (temp ** 2)
    
    # Hard targets (Mixed Cross Entropy)
    hard_loss = lam * F.cross_entropy(student_logits, labels_a) + \
                (1 - lam) * F.cross_entropy(student_logits, labels_b)
    
    # Combined loss
    loss = alpha * soft_loss + (1 - alpha) * hard_loss
    
    # NaN safety check
    if torch.isnan(loss):
        return hard_loss  # Fallback to CE only
    
    return loss

# ==========================================
# 2. Augmentations: Mixup & CutMix
# ==========================================
def mixup_data(x, y, alpha=1.0):
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1
    batch_size = x.size()[0]
    index = torch.randperm(batch_size).to(device)
    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

def rand_bbox(size, lam):
    W = size[2]
    H = size[3]
    cut_rat = np.sqrt(1. - lam)
    cut_w = int(W * cut_rat)
    cut_h = int(H * cut_rat)
    cx = np.random.randint(W)
    cy = np.random.randint(H)
    bbx1 = np.clip(cx - cut_w // 2, 0, W)
    bby1 = np.clip(cy - cut_h // 2, 0, H)
    bbx2 = np.clip(cx + cut_w // 2, 0, W)
    bby2 = np.clip(cy + cut_h // 2, 0, H)
    return bbx1, bby1, bbx2, bby2

def cutmix_data(x, y, alpha=1.0):
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1
    batch_size = x.size()[0]
    index = torch.randperm(batch_size).to(device)
    bbx1, bby1, bbx2, bby2 = rand_bbox(x.size(), lam)
    x[:, :, bbx1:bbx2, bby1:bby2] = x[index, :, bbx1:bbx2, bby1:bby2]
    lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / (x.size()[-1] * x.size()[-2]))
    y_a, y_b = y, y[index]
    return x, y_a, y_b, lam

# ==========================================
# 3. Utilities (Save/Load/Evaluate)
# ==========================================
def evaluate_model_with_loss(model, dataloader, criterion):
    model.eval()
    correct = 0
    total = 0
    running_loss = 0.0
    with torch.no_grad():
        for data in dataloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    acc = 100 * correct / total
    avg_loss = running_loss / len(dataloader)
    return acc, avg_loss

def save_checkpoint(model, optimizer, scheduler, epoch, best_acc, history, model_name, epochs_no_improve):
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'best_acc': best_acc,
        'history': history,
        'epochs_no_improve': epochs_no_improve
    }
    path = CHECKPOINT_DIR / f"{model_name}_epoch{epoch+1}.pth"
    torch.save(checkpoint, path)
    print(f"  Checkpoint saved: {path}")
    
def load_checkpoint(model, optimizer, scheduler, model_name):
    checkpoints = sorted(glob.glob(str(CHECKPOINT_DIR / f"{model_name}_epoch*.pth")))
    if not checkpoints:
        return None
    latest = checkpoints[-1]
    print(f"  Loading checkpoint: {latest}")
    checkpoint = torch.load(latest, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    return checkpoint

def cleanup_old_checkpoints(model_name, keep=3):
    checkpoints = sorted(glob.glob(str(CHECKPOINT_DIR / f"{model_name}_epoch*.pth")))
    if len(checkpoints) > keep:
        for chk in checkpoints[:-keep]:
            os.remove(chk)
            print(f"  Cleaned up: {os.path.basename(chk)}")

print("Helper functions loaded (Standard KD, CutMix, Mixup)")

Helper functions loaded (Standard KD, CutMix, Mixup)


In [5]:
# Cell 5: Optimized Training Loop
def train_model_optimized(model, dataloader, optimizer, scheduler, num_epochs, model_name, 
                         teacher_model=None, temp=4.0, kd_alpha=0.9,
                         patience=30, grad_clip=1.0):
    """
    Training loop with:
    - Standard KD loss (stable, proven)
    - CutMix/Mixup augmentation
    - Mixed precision training
    - Gradient clipping
    - NaN detection and recovery
    """
    
    # 1. Load Checkpoint
    checkpoint = load_checkpoint(model, optimizer, scheduler, model_name)
    if checkpoint:
        start_epoch = checkpoint['epoch'] + 1
        best_acc = checkpoint['best_acc']
        history = checkpoint['history']
        epochs_no_improve = checkpoint['epochs_no_improve']
        best_model_wts = copy.deepcopy(model.state_dict())
        print(f"  Resuming from epoch {start_epoch}, Best Acc: {best_acc:.2f}%")
    else:
        start_epoch = 0
        best_acc = 0.0
        history = {'train_loss': [], 'val_loss': [], 'val_accuracy': []}
        epochs_no_improve = 0
        best_model_wts = copy.deepcopy(model.state_dict())
        print(f"  Starting fresh training...")

    # 2. Setup
    scaler = torch.amp.GradScaler('cuda', enabled=torch.cuda.is_available())
    val_criterion = nn.CrossEntropyLoss()
    
    if teacher_model:
        teacher_model.eval()
        for param in teacher_model.parameters():
            param.requires_grad = False

    # 3. Training Loop
    for epoch in range(start_epoch, num_epochs):
        model.train()
        running_loss = 0.0
        valid_batches = 0
        
        loop = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")
        
        for inputs, labels in loop:
            inputs, labels = inputs.to(device), labels.to(device)
            
            # Randomly choose Mixup (50%) or CutMix (50%)
            use_cutmix = np.random.rand() > 0.5
            if use_cutmix:
                inputs_aug, labels_a, labels_b, lam = cutmix_data(inputs.clone(), labels, alpha=CUTMIX_ALPHA)
            else:
                inputs_aug, labels_a, labels_b, lam = mixup_data(inputs, labels, alpha=MIXUP_ALPHA)
            
            optimizer.zero_grad()
            
            with torch.amp.autocast('cuda', enabled=torch.cuda.is_available()):
                # Student Forward
                student_outputs = model(inputs_aug)
                
                # Loss Calculation
                if teacher_model:
                    # Teacher Forward (no grad)
                    with torch.no_grad():
                        teacher_outputs = teacher_model(inputs_aug)
                    
                    # Standard KD Loss with Mixup support
                    loss = kd_loss_mixup(
                        student_outputs, teacher_outputs, 
                        labels_a, labels_b, lam,
                        temp=temp, alpha=kd_alpha
                    )
                else:
                    # Standard CE for Teacher training
                    loss = lam * F.cross_entropy(student_outputs, labels_a) + \
                           (1 - lam) * F.cross_entropy(student_outputs, labels_b)
            
            # Skip batch if loss is NaN
            if torch.isnan(loss):
                loop.set_postfix(loss="NaN-skip")
                continue
            
            # Backward
            scaler.scale(loss).backward()
            
            if grad_clip > 0:
                scaler.unscale_(optimizer)
                nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
                
            scaler.step(optimizer)
            scaler.update()
            
            running_loss += loss.item()
            valid_batches += 1
            loop.set_postfix(loss=f"{loss.item():.3f}")

        # Step Scheduler
        scheduler.step()
        
        # Validation
        train_loss = running_loss / max(valid_batches, 1)
        val_acc, val_loss = evaluate_model_with_loss(model, testloader, val_criterion)
        
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['val_accuracy'].append(val_acc)
        
        current_lr = scheduler.get_last_lr()[0]
        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}% | LR: {current_lr:.6f}")
        
        # Save Best
        if val_acc > best_acc:
            best_acc = val_acc
            best_model_wts = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), MODEL_DIR / f"{model_name}.pth")
            epochs_no_improve = 0
            print(f"  New best model saved! Accuracy: {best_acc:.2f}%")
        else:
            epochs_no_improve += 1
            
        # Checkpointing
        if (epoch + 1) % CHECKPOINT_FREQUENCY == 0:
            save_checkpoint(model, optimizer, scheduler, epoch, best_acc, history, model_name, epochs_no_improve)
            cleanup_old_checkpoints(model_name)
            
        # Early Stopping
        if epochs_no_improve >= patience:
            print(f"Early stopping triggered after {epoch+1} epochs.")
            break
        
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            
    print(f"\nTraining complete. Best accuracy: {best_acc:.2f}%")
    model.load_state_dict(best_model_wts)
    return model, history

print("Training function loaded (with NaN protection)")

Training function loaded (with NaN protection)


In [6]:
# Cell 6: Initialize Models
print("Loading Teacher (EfficientNetV2-L)...")
teacher_model = efficientnet_v2_l(weights=EfficientNet_V2_L_Weights.IMAGENET1K_V1)
teacher_model.classifier[1] = nn.Linear(teacher_model.classifier[1].in_features, NUM_CLASSES)
teacher_model = teacher_model.to(device)

print("Loading Student (EfficientNetV2-S)...")
student_model = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)
student_model.classifier[1] = nn.Linear(student_model.classifier[1].in_features, NUM_CLASSES)
student_model = student_model.to(device)

print("Models loaded")

Loading Teacher (EfficientNetV2-L)...
Loading Student (EfficientNetV2-S)...
Models loaded


In [7]:
# Cell 7: Train/Load Teacher Model
print("\n" + "="*70)
print("TEACHER MODEL")
print("="*70)

teacher_path = MODEL_DIR / "teacher_model.pth"
if teacher_path.exists():
    print(f"Found existing Teacher Model: {teacher_path}")
    teacher_model.load_state_dict(torch.load(teacher_path, map_location=device))
else:
    print("Training Teacher Model (This may take a while)...")
    opt_t = optim.AdamW(teacher_model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    sch_t = optim.lr_scheduler.CosineAnnealingLR(opt_t, T_max=NUM_EPOCHS)
    teacher_model, teacher_history = train_model_optimized(
        teacher_model, trainloader, opt_t, sch_t, NUM_EPOCHS, "teacher_model", teacher_model=None
    )

# Evaluate Teacher
teacher_model.eval()
teacher_accuracy, _ = evaluate_model_with_loss(teacher_model, testloader, nn.CrossEntropyLoss())
print(f"\nTeacher Accuracy: {teacher_accuracy:.2f}%")


TEACHER MODEL
Found existing Teacher Model: outputs\models\teacher_model.pth

Teacher Accuracy: 75.75%


In [8]:
# Cell 8: Train Distilled Student (Standard KD + CutMix)
print("\n" + "="*70)
print("DISTILLED STUDENT MODEL (Standard KD + CutMix/Mixup)")
print("="*70)

# KD Parameters
KD_ALPHA = 0.9  # Weight for soft loss (0.9 = 90% soft, 10% hard)
KD_TEMP = 4.0   # Temperature for softening

student_name = "distilled_student_kd"
student_path = MODEL_DIR / f"{student_name}.pth"

if student_path.exists():
    print(f"Found existing Distilled Model: {student_path}")
    student_model.load_state_dict(torch.load(student_path, map_location=device))
    distilled_accuracy, _ = evaluate_model_with_loss(student_model, testloader, nn.CrossEntropyLoss())
    print(f"Distilled Student Accuracy: {distilled_accuracy:.2f}%")
else:
    print(f"\nStarting Knowledge Distillation...")
    print(f"  KD Alpha (soft weight): {KD_ALPHA}")
    print(f"  Temperature: {KD_TEMP}")
    print(f"  Augmentation: CutMix + Mixup")
    
    # Re-initialize student model (fresh weights from ImageNet)
    student_model = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)
    student_model.classifier[1] = nn.Linear(student_model.classifier[1].in_features, NUM_CLASSES)
    student_model = student_model.to(device)
    
    # Optimizer & Scheduler
    opt_s = optim.AdamW(student_model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    sch_s = optim.lr_scheduler.CosineAnnealingLR(opt_s, T_max=NUM_EPOCHS)
    
    # Run Training
    trained_student, distilled_history = train_model_optimized(
        model=student_model,
        dataloader=trainloader,
        optimizer=opt_s,
        scheduler=sch_s,
        num_epochs=NUM_EPOCHS,
        model_name=student_name,
        teacher_model=teacher_model,
        temp=KD_TEMP,
        kd_alpha=KD_ALPHA,
        patience=PATIENCE
    )
    
    distilled_accuracy, _ = evaluate_model_with_loss(trained_student, testloader, nn.CrossEntropyLoss())
    print(f"\nDistilled Student Final Accuracy: {distilled_accuracy:.2f}%")


DISTILLED STUDENT MODEL (Standard KD + CutMix/Mixup)

Starting Knowledge Distillation...
  KD Alpha (soft weight): 0.9
  Temperature: 4.0
  Augmentation: CutMix + Mixup
  Starting fresh training...


Epoch 1/200: 100%|██████████| 390/390 [01:11<00:00,  5.44it/s, loss=1.068]


Epoch 1/200 | Train Loss: 1.4984 | Val Loss: 2.9059 | Val Acc: 30.31% | LR: 0.001000
  New best model saved! Accuracy: 30.31%


Epoch 2/200: 100%|██████████| 390/390 [01:16<00:00,  5.10it/s, loss=1.199]


Epoch 2/200 | Train Loss: 1.3106 | Val Loss: 2.6329 | Val Acc: 36.79% | LR: 0.001000
  New best model saved! Accuracy: 36.79%


Epoch 3/200: 100%|██████████| 390/390 [01:01<00:00,  6.32it/s, loss=1.511]


Epoch 3/200 | Train Loss: 1.2492 | Val Loss: 2.7419 | Val Acc: 39.57% | LR: 0.000999
  New best model saved! Accuracy: 39.57%


Epoch 4/200: 100%|██████████| 390/390 [00:59<00:00,  6.53it/s, loss=0.918]


Epoch 4/200 | Train Loss: 1.1933 | Val Loss: 2.4072 | Val Acc: 41.78% | LR: 0.000999
  New best model saved! Accuracy: 41.78%


Epoch 5/200: 100%|██████████| 390/390 [00:59<00:00,  6.58it/s, loss=1.000]


Epoch 5/200 | Train Loss: 1.1631 | Val Loss: 2.1269 | Val Acc: 46.35% | LR: 0.000998
  New best model saved! Accuracy: 46.35%


Epoch 6/200: 100%|██████████| 390/390 [00:58<00:00,  6.65it/s, loss=0.925]


Epoch 6/200 | Train Loss: 1.1607 | Val Loss: 2.2338 | Val Acc: 46.86% | LR: 0.000998
  New best model saved! Accuracy: 46.86%


Epoch 7/200: 100%|██████████| 390/390 [01:09<00:00,  5.63it/s, loss=0.996]


Epoch 7/200 | Train Loss: 1.1789 | Val Loss: 2.3189 | Val Acc: 45.79% | LR: 0.000997


Epoch 8/200: 100%|██████████| 390/390 [01:15<00:00,  5.16it/s, loss=1.058]


Epoch 8/200 | Train Loss: 1.1542 | Val Loss: 2.1901 | Val Acc: 47.79% | LR: 0.000996
  New best model saved! Accuracy: 47.79%


Epoch 9/200: 100%|██████████| 390/390 [01:16<00:00,  5.12it/s, loss=0.911]


Epoch 9/200 | Train Loss: 1.0654 | Val Loss: 2.2431 | Val Acc: 47.01% | LR: 0.000995


Epoch 10/200: 100%|██████████| 390/390 [01:14<00:00,  5.24it/s, loss=0.912]


Epoch 10/200 | Train Loss: 1.1418 | Val Loss: 2.1168 | Val Acc: 49.49% | LR: 0.000994
  New best model saved! Accuracy: 49.49%


Epoch 11/200: 100%|██████████| 390/390 [01:10<00:00,  5.54it/s, loss=0.989]


Epoch 11/200 | Train Loss: 1.0743 | Val Loss: 2.0323 | Val Acc: 51.14% | LR: 0.000993
  New best model saved! Accuracy: 51.14%


Epoch 12/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.907]


Epoch 12/200 | Train Loss: 1.1149 | Val Loss: 1.9992 | Val Acc: 50.97% | LR: 0.000991


Epoch 13/200: 100%|██████████| 390/390 [01:10<00:00,  5.50it/s, loss=0.791]


Epoch 13/200 | Train Loss: 1.0123 | Val Loss: 2.0244 | Val Acc: 52.53% | LR: 0.000990
  New best model saved! Accuracy: 52.53%


Epoch 14/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=0.821]


Epoch 14/200 | Train Loss: 1.0614 | Val Loss: 2.0744 | Val Acc: 52.56% | LR: 0.000988
  New best model saved! Accuracy: 52.56%


Epoch 15/200: 100%|██████████| 390/390 [01:10<00:00,  5.53it/s, loss=0.861]


Epoch 15/200 | Train Loss: 1.0470 | Val Loss: 1.9567 | Val Acc: 52.16% | LR: 0.000986


Epoch 16/200: 100%|██████████| 390/390 [01:02<00:00,  6.22it/s, loss=0.841]


Epoch 16/200 | Train Loss: 1.0322 | Val Loss: 1.8937 | Val Acc: 53.57% | LR: 0.000984
  New best model saved! Accuracy: 53.57%


Epoch 17/200: 100%|██████████| 390/390 [01:12<00:00,  5.41it/s, loss=0.763]


Epoch 17/200 | Train Loss: 1.0363 | Val Loss: 1.9656 | Val Acc: 53.31% | LR: 0.000982


Epoch 18/200: 100%|██████████| 390/390 [01:13<00:00,  5.33it/s, loss=1.001]


Epoch 18/200 | Train Loss: 1.0060 | Val Loss: 2.0837 | Val Acc: 51.94% | LR: 0.000980


Epoch 19/200: 100%|██████████| 390/390 [01:12<00:00,  5.38it/s, loss=0.790]


Epoch 19/200 | Train Loss: 1.0312 | Val Loss: 1.8825 | Val Acc: 55.09% | LR: 0.000978
  New best model saved! Accuracy: 55.09%


Epoch 20/200: 100%|██████████| 390/390 [01:12<00:00,  5.36it/s, loss=0.709]


Epoch 20/200 | Train Loss: 1.0367 | Val Loss: 1.9695 | Val Acc: 54.61% | LR: 0.000976
  Checkpoint saved: outputs\checkpoints\distilled_student_kd_epoch20.pth


Epoch 21/200: 100%|██████████| 390/390 [01:12<00:00,  5.37it/s, loss=0.791]


Epoch 21/200 | Train Loss: 1.0057 | Val Loss: 1.8346 | Val Acc: 55.10% | LR: 0.000973
  New best model saved! Accuracy: 55.10%


Epoch 22/200: 100%|██████████| 390/390 [01:13<00:00,  5.31it/s, loss=0.845]


Epoch 22/200 | Train Loss: 1.0295 | Val Loss: 1.8807 | Val Acc: 55.56% | LR: 0.000970
  New best model saved! Accuracy: 55.56%


Epoch 23/200: 100%|██████████| 390/390 [01:12<00:00,  5.36it/s, loss=0.799]


Epoch 23/200 | Train Loss: 1.0326 | Val Loss: 1.8306 | Val Acc: 55.42% | LR: 0.000968


Epoch 24/200: 100%|██████████| 390/390 [01:11<00:00,  5.45it/s, loss=0.628]


Epoch 24/200 | Train Loss: 0.9970 | Val Loss: 1.8376 | Val Acc: 55.99% | LR: 0.000965
  New best model saved! Accuracy: 55.99%


Epoch 25/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.752]


Epoch 25/200 | Train Loss: 0.9927 | Val Loss: 1.8310 | Val Acc: 56.54% | LR: 0.000962
  New best model saved! Accuracy: 56.54%


Epoch 26/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.706]


Epoch 26/200 | Train Loss: 0.9864 | Val Loss: 1.8450 | Val Acc: 56.41% | LR: 0.000959


Epoch 27/200: 100%|██████████| 390/390 [01:09<00:00,  5.58it/s, loss=0.674]


Epoch 27/200 | Train Loss: 0.9552 | Val Loss: 1.8123 | Val Acc: 56.64% | LR: 0.000956
  New best model saved! Accuracy: 56.64%


Epoch 28/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.794]


Epoch 28/200 | Train Loss: 0.9634 | Val Loss: 1.8771 | Val Acc: 57.39% | LR: 0.000952
  New best model saved! Accuracy: 57.39%


Epoch 29/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=1.277]


Epoch 29/200 | Train Loss: 0.9893 | Val Loss: 1.9061 | Val Acc: 56.43% | LR: 0.000949


Epoch 30/200: 100%|██████████| 390/390 [01:10<00:00,  5.53it/s, loss=2.556]


Epoch 30/200 | Train Loss: 0.9823 | Val Loss: 1.7538 | Val Acc: 57.51% | LR: 0.000946
  New best model saved! Accuracy: 57.51%


Epoch 31/200: 100%|██████████| 390/390 [01:09<00:00,  5.59it/s, loss=0.598]


Epoch 31/200 | Train Loss: 0.9476 | Val Loss: 1.8284 | Val Acc: 56.71% | LR: 0.000942


Epoch 32/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.744]


Epoch 32/200 | Train Loss: 0.9688 | Val Loss: 1.7769 | Val Acc: 57.61% | LR: 0.000938
  New best model saved! Accuracy: 57.61%


Epoch 33/200: 100%|██████████| 390/390 [01:09<00:00,  5.58it/s, loss=0.630]


Epoch 33/200 | Train Loss: 0.9660 | Val Loss: 1.8315 | Val Acc: 57.68% | LR: 0.000934
  New best model saved! Accuracy: 57.68%


Epoch 34/200: 100%|██████████| 390/390 [01:09<00:00,  5.59it/s, loss=0.772]


Epoch 34/200 | Train Loss: 0.9945 | Val Loss: 1.8039 | Val Acc: 58.08% | LR: 0.000930
  New best model saved! Accuracy: 58.08%


Epoch 35/200: 100%|██████████| 390/390 [01:10<00:00,  5.57it/s, loss=0.758]


Epoch 35/200 | Train Loss: 0.9541 | Val Loss: 1.7235 | Val Acc: 58.52% | LR: 0.000926
  New best model saved! Accuracy: 58.52%


Epoch 36/200: 100%|██████████| 390/390 [01:09<00:00,  5.58it/s, loss=0.993]


Epoch 36/200 | Train Loss: 0.9714 | Val Loss: 1.8255 | Val Acc: 57.69% | LR: 0.000922


Epoch 37/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.711]


Epoch 37/200 | Train Loss: 0.9407 | Val Loss: 1.7128 | Val Acc: 59.14% | LR: 0.000918
  New best model saved! Accuracy: 59.14%


Epoch 38/200: 100%|██████████| 390/390 [01:10<00:00,  5.57it/s, loss=0.594]


Epoch 38/200 | Train Loss: 0.9203 | Val Loss: 1.7305 | Val Acc: 59.77% | LR: 0.000914
  New best model saved! Accuracy: 59.77%


Epoch 39/200: 100%|██████████| 390/390 [01:09<00:00,  5.60it/s, loss=0.906]


Epoch 39/200 | Train Loss: 0.9003 | Val Loss: 1.7528 | Val Acc: 59.41% | LR: 0.000909


Epoch 40/200: 100%|██████████| 390/390 [01:10<00:00,  5.54it/s, loss=0.770]


Epoch 40/200 | Train Loss: 0.9011 | Val Loss: 1.7115 | Val Acc: 60.12% | LR: 0.000905
  New best model saved! Accuracy: 60.12%
  Checkpoint saved: outputs\checkpoints\distilled_student_kd_epoch40.pth


Epoch 41/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=1.034]


Epoch 41/200 | Train Loss: 0.8993 | Val Loss: 1.7161 | Val Acc: 60.00% | LR: 0.000900


Epoch 42/200: 100%|██████████| 390/390 [01:09<00:00,  5.60it/s, loss=0.957]


Epoch 42/200 | Train Loss: 0.9508 | Val Loss: 1.6454 | Val Acc: 61.29% | LR: 0.000895
  New best model saved! Accuracy: 61.29%


Epoch 43/200: 100%|██████████| 390/390 [01:10<00:00,  5.54it/s, loss=0.756]


Epoch 43/200 | Train Loss: 0.9391 | Val Loss: 1.7656 | Val Acc: 58.61% | LR: 0.000890


Epoch 44/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=0.569]


Epoch 44/200 | Train Loss: 0.9274 | Val Loss: 1.6711 | Val Acc: 59.28% | LR: 0.000885


Epoch 45/200: 100%|██████████| 390/390 [01:09<00:00,  5.57it/s, loss=1.046]


Epoch 45/200 | Train Loss: 0.9147 | Val Loss: 1.7126 | Val Acc: 59.00% | LR: 0.000880


Epoch 46/200: 100%|██████████| 390/390 [01:09<00:00,  5.58it/s, loss=0.601]


Epoch 46/200 | Train Loss: 0.9330 | Val Loss: 1.6950 | Val Acc: 58.89% | LR: 0.000875


Epoch 47/200: 100%|██████████| 390/390 [01:09<00:00,  5.57it/s, loss=2.247]


Epoch 47/200 | Train Loss: 0.9352 | Val Loss: 1.7062 | Val Acc: 60.05% | LR: 0.000870


Epoch 48/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=1.025]


Epoch 48/200 | Train Loss: 0.9004 | Val Loss: 1.7120 | Val Acc: 60.07% | LR: 0.000864


Epoch 49/200: 100%|██████████| 390/390 [01:09<00:00,  5.59it/s, loss=0.772]


Epoch 49/200 | Train Loss: 0.9306 | Val Loss: 1.6527 | Val Acc: 60.59% | LR: 0.000859


Epoch 50/200: 100%|██████████| 390/390 [01:09<00:00,  5.61it/s, loss=0.779]


Epoch 50/200 | Train Loss: 0.9090 | Val Loss: 1.7112 | Val Acc: 58.87% | LR: 0.000854


Epoch 51/200: 100%|██████████| 390/390 [01:09<00:00,  5.60it/s, loss=0.772]


Epoch 51/200 | Train Loss: 0.8869 | Val Loss: 1.7422 | Val Acc: 59.24% | LR: 0.000848


Epoch 52/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=1.099]


Epoch 52/200 | Train Loss: 0.9101 | Val Loss: 1.6318 | Val Acc: 60.89% | LR: 0.000842


Epoch 53/200: 100%|██████████| 390/390 [01:09<00:00,  5.58it/s, loss=0.742]


Epoch 53/200 | Train Loss: 0.9386 | Val Loss: 1.7180 | Val Acc: 59.20% | LR: 0.000837


Epoch 54/200: 100%|██████████| 390/390 [01:09<00:00,  5.59it/s, loss=0.659]


Epoch 54/200 | Train Loss: 0.8742 | Val Loss: 1.7166 | Val Acc: 60.09% | LR: 0.000831


Epoch 55/200: 100%|██████████| 390/390 [01:09<00:00,  5.61it/s, loss=0.742]


Epoch 55/200 | Train Loss: 0.8906 | Val Loss: 1.6854 | Val Acc: 61.45% | LR: 0.000825
  New best model saved! Accuracy: 61.45%


Epoch 56/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=0.834]


Epoch 56/200 | Train Loss: 0.8774 | Val Loss: 1.7361 | Val Acc: 60.33% | LR: 0.000819


Epoch 57/200: 100%|██████████| 390/390 [01:10<00:00,  5.57it/s, loss=0.734]


Epoch 57/200 | Train Loss: 0.8925 | Val Loss: 1.6606 | Val Acc: 60.47% | LR: 0.000813


Epoch 58/200: 100%|██████████| 390/390 [01:10<00:00,  5.57it/s, loss=0.535]


Epoch 58/200 | Train Loss: 0.8830 | Val Loss: 1.6725 | Val Acc: 60.67% | LR: 0.000806


Epoch 59/200: 100%|██████████| 390/390 [01:09<00:00,  5.57it/s, loss=0.739]


Epoch 59/200 | Train Loss: 0.9143 | Val Loss: 1.6084 | Val Acc: 61.25% | LR: 0.000800


Epoch 60/200: 100%|██████████| 390/390 [01:09<00:00,  5.61it/s, loss=0.727]


Epoch 60/200 | Train Loss: 0.8913 | Val Loss: 1.6884 | Val Acc: 61.24% | LR: 0.000794
  Checkpoint saved: outputs\checkpoints\distilled_student_kd_epoch60.pth


Epoch 61/200: 100%|██████████| 390/390 [01:09<00:00,  5.57it/s, loss=0.838]


Epoch 61/200 | Train Loss: 0.8710 | Val Loss: 1.6241 | Val Acc: 61.05% | LR: 0.000788


Epoch 62/200: 100%|██████████| 390/390 [01:09<00:00,  5.58it/s, loss=0.822]


Epoch 62/200 | Train Loss: 0.8733 | Val Loss: 1.6083 | Val Acc: 61.85% | LR: 0.000781
  New best model saved! Accuracy: 61.85%


Epoch 63/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=0.656]


Epoch 63/200 | Train Loss: 0.8617 | Val Loss: 1.6641 | Val Acc: 61.54% | LR: 0.000775


Epoch 64/200: 100%|██████████| 390/390 [01:13<00:00,  5.30it/s, loss=3.007]


Epoch 64/200 | Train Loss: 0.8942 | Val Loss: 1.6159 | Val Acc: 61.75% | LR: 0.000768


Epoch 65/200: 100%|██████████| 390/390 [01:14<00:00,  5.25it/s, loss=0.907]


Epoch 65/200 | Train Loss: 0.8632 | Val Loss: 1.5954 | Val Acc: 63.09% | LR: 0.000761
  New best model saved! Accuracy: 63.09%


Epoch 66/200: 100%|██████████| 390/390 [01:12<00:00,  5.39it/s, loss=0.741]


Epoch 66/200 | Train Loss: 0.8477 | Val Loss: 1.6056 | Val Acc: 62.06% | LR: 0.000755


Epoch 67/200: 100%|██████████| 390/390 [01:13<00:00,  5.34it/s, loss=0.847]


Epoch 67/200 | Train Loss: 0.8356 | Val Loss: 1.6658 | Val Acc: 61.65% | LR: 0.000748


Epoch 68/200: 100%|██████████| 390/390 [01:13<00:00,  5.34it/s, loss=0.657]


Epoch 68/200 | Train Loss: 0.8517 | Val Loss: 1.6090 | Val Acc: 61.91% | LR: 0.000741


Epoch 69/200: 100%|██████████| 390/390 [01:13<00:00,  5.33it/s, loss=1.267]


Epoch 69/200 | Train Loss: 0.8425 | Val Loss: 1.5726 | Val Acc: 62.63% | LR: 0.000734


Epoch 70/200: 100%|██████████| 390/390 [01:10<00:00,  5.53it/s, loss=0.679]


Epoch 70/200 | Train Loss: 0.8725 | Val Loss: 1.6430 | Val Acc: 61.48% | LR: 0.000727


Epoch 71/200: 100%|██████████| 390/390 [01:10<00:00,  5.54it/s, loss=0.760]


Epoch 71/200 | Train Loss: 0.8866 | Val Loss: 1.5856 | Val Acc: 62.49% | LR: 0.000720


Epoch 72/200: 100%|██████████| 390/390 [01:10<00:00,  5.53it/s, loss=1.652]


Epoch 72/200 | Train Loss: 0.8409 | Val Loss: 1.5640 | Val Acc: 63.30% | LR: 0.000713
  New best model saved! Accuracy: 63.30%


Epoch 73/200: 100%|██████████| 390/390 [01:10<00:00,  5.53it/s, loss=0.725]


Epoch 73/200 | Train Loss: 0.8427 | Val Loss: 1.5660 | Val Acc: 62.93% | LR: 0.000706


Epoch 74/200: 100%|██████████| 390/390 [01:10<00:00,  5.53it/s, loss=0.748]


Epoch 74/200 | Train Loss: 0.8321 | Val Loss: 1.5975 | Val Acc: 63.16% | LR: 0.000699


Epoch 75/200: 100%|██████████| 390/390 [01:10<00:00,  5.53it/s, loss=0.580]


Epoch 75/200 | Train Loss: 0.8265 | Val Loss: 1.5721 | Val Acc: 63.28% | LR: 0.000691


Epoch 76/200: 100%|██████████| 390/390 [01:10<00:00,  5.53it/s, loss=0.730]


Epoch 76/200 | Train Loss: 0.8618 | Val Loss: 1.6365 | Val Acc: 61.60% | LR: 0.000684


Epoch 77/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.637]


Epoch 77/200 | Train Loss: 0.8377 | Val Loss: 1.6020 | Val Acc: 63.96% | LR: 0.000677
  New best model saved! Accuracy: 63.96%


Epoch 78/200: 100%|██████████| 390/390 [01:10<00:00,  5.54it/s, loss=0.626]


Epoch 78/200 | Train Loss: 0.8209 | Val Loss: 1.6624 | Val Acc: 62.98% | LR: 0.000669


Epoch 79/200: 100%|██████████| 390/390 [01:12<00:00,  5.41it/s, loss=0.713]


Epoch 79/200 | Train Loss: 0.8310 | Val Loss: 1.6002 | Val Acc: 62.76% | LR: 0.000662


Epoch 80/200: 100%|██████████| 390/390 [01:10<00:00,  5.57it/s, loss=0.750]


Epoch 80/200 | Train Loss: 0.8313 | Val Loss: 1.5658 | Val Acc: 64.02% | LR: 0.000655
  New best model saved! Accuracy: 64.02%
  Checkpoint saved: outputs\checkpoints\distilled_student_kd_epoch80.pth
  Cleaned up: distilled_student_kd_epoch20.pth


Epoch 81/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=0.729]


Epoch 81/200 | Train Loss: 0.8143 | Val Loss: 1.6058 | Val Acc: 62.94% | LR: 0.000647


Epoch 82/200: 100%|██████████| 390/390 [01:10<00:00,  5.52it/s, loss=0.588]


Epoch 82/200 | Train Loss: 0.8294 | Val Loss: 1.5289 | Val Acc: 63.61% | LR: 0.000639


Epoch 83/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=0.598]


Epoch 83/200 | Train Loss: 0.8414 | Val Loss: 1.5498 | Val Acc: 64.01% | LR: 0.000632


Epoch 84/200: 100%|██████████| 390/390 [01:10<00:00,  5.53it/s, loss=1.044]


Epoch 84/200 | Train Loss: 0.8372 | Val Loss: 1.5757 | Val Acc: 63.21% | LR: 0.000624


Epoch 85/200: 100%|██████████| 390/390 [01:10<00:00,  5.51it/s, loss=0.421]


Epoch 85/200 | Train Loss: 0.8198 | Val Loss: 1.5631 | Val Acc: 63.51% | LR: 0.000617


Epoch 86/200: 100%|██████████| 390/390 [01:10<00:00,  5.57it/s, loss=1.146]


Epoch 86/200 | Train Loss: 0.8201 | Val Loss: 1.5299 | Val Acc: 63.63% | LR: 0.000609


Epoch 87/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.647]


Epoch 87/200 | Train Loss: 0.8339 | Val Loss: 1.5481 | Val Acc: 64.08% | LR: 0.000601
  New best model saved! Accuracy: 64.08%


Epoch 88/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.594]


Epoch 88/200 | Train Loss: 0.8321 | Val Loss: 1.6647 | Val Acc: 62.81% | LR: 0.000594


Epoch 89/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=0.654]


Epoch 89/200 | Train Loss: 0.7964 | Val Loss: 1.6202 | Val Acc: 62.85% | LR: 0.000586


Epoch 90/200: 100%|██████████| 390/390 [01:09<00:00,  5.58it/s, loss=1.582]


Epoch 90/200 | Train Loss: 0.8023 | Val Loss: 1.5059 | Val Acc: 64.68% | LR: 0.000578
  New best model saved! Accuracy: 64.68%


Epoch 91/200: 100%|██████████| 390/390 [01:09<00:00,  5.62it/s, loss=0.918]


Epoch 91/200 | Train Loss: 0.8319 | Val Loss: 1.5707 | Val Acc: 63.85% | LR: 0.000570


Epoch 92/200: 100%|██████████| 390/390 [01:10<00:00,  5.54it/s, loss=0.559]


Epoch 92/200 | Train Loss: 0.8367 | Val Loss: 1.4544 | Val Acc: 65.24% | LR: 0.000563
  New best model saved! Accuracy: 65.24%


Epoch 93/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=0.689]


Epoch 93/200 | Train Loss: 0.7738 | Val Loss: 1.5126 | Val Acc: 65.38% | LR: 0.000555
  New best model saved! Accuracy: 65.38%


Epoch 94/200: 100%|██████████| 390/390 [01:09<00:00,  5.57it/s, loss=0.389]


Epoch 94/200 | Train Loss: 0.7827 | Val Loss: 1.5920 | Val Acc: 63.86% | LR: 0.000547


Epoch 95/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=0.692]


Epoch 95/200 | Train Loss: 0.8230 | Val Loss: 1.5488 | Val Acc: 64.33% | LR: 0.000539


Epoch 96/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=0.491]


Epoch 96/200 | Train Loss: 0.8287 | Val Loss: 1.4846 | Val Acc: 64.97% | LR: 0.000531


Epoch 97/200: 100%|██████████| 390/390 [01:10<00:00,  5.52it/s, loss=0.649]


Epoch 97/200 | Train Loss: 0.7966 | Val Loss: 1.5676 | Val Acc: 63.89% | LR: 0.000524


Epoch 98/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.525]


Epoch 98/200 | Train Loss: 0.7492 | Val Loss: 1.4646 | Val Acc: 65.75% | LR: 0.000516
  New best model saved! Accuracy: 65.75%


Epoch 99/200: 100%|██████████| 390/390 [01:10<00:00,  5.51it/s, loss=1.428]


Epoch 99/200 | Train Loss: 0.8046 | Val Loss: 1.5345 | Val Acc: 65.00% | LR: 0.000508


Epoch 100/200: 100%|██████████| 390/390 [01:09<00:00,  5.58it/s, loss=0.711]


Epoch 100/200 | Train Loss: 0.8091 | Val Loss: 1.4717 | Val Acc: 65.11% | LR: 0.000500
  Checkpoint saved: outputs\checkpoints\distilled_student_kd_epoch100.pth
  Cleaned up: distilled_student_kd_epoch100.pth


Epoch 101/200: 100%|██████████| 390/390 [01:09<00:00,  5.59it/s, loss=0.682]


Epoch 101/200 | Train Loss: 0.7505 | Val Loss: 1.5364 | Val Acc: 64.87% | LR: 0.000492


Epoch 102/200: 100%|██████████| 390/390 [01:10<00:00,  5.54it/s, loss=0.669]


Epoch 102/200 | Train Loss: 0.8456 | Val Loss: 1.5194 | Val Acc: 64.58% | LR: 0.000484


Epoch 103/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=0.626]


Epoch 103/200 | Train Loss: 0.7791 | Val Loss: 1.4539 | Val Acc: 65.01% | LR: 0.000476


Epoch 104/200: 100%|██████████| 390/390 [01:10<00:00,  5.57it/s, loss=1.009]


Epoch 104/200 | Train Loss: 0.7637 | Val Loss: 1.5360 | Val Acc: 64.85% | LR: 0.000469


Epoch 105/200: 100%|██████████| 390/390 [01:10<00:00,  5.52it/s, loss=1.232]


Epoch 105/200 | Train Loss: 0.7667 | Val Loss: 1.4938 | Val Acc: 65.19% | LR: 0.000461


Epoch 106/200: 100%|██████████| 390/390 [01:10<00:00,  5.52it/s, loss=0.753]


Epoch 106/200 | Train Loss: 0.7762 | Val Loss: 1.5215 | Val Acc: 65.45% | LR: 0.000453


Epoch 107/200: 100%|██████████| 390/390 [01:09<00:00,  5.58it/s, loss=0.557]


Epoch 107/200 | Train Loss: 0.7900 | Val Loss: 1.5446 | Val Acc: 65.56% | LR: 0.000445


Epoch 108/200: 100%|██████████| 390/390 [01:09<00:00,  5.60it/s, loss=0.686]


Epoch 108/200 | Train Loss: 0.7775 | Val Loss: 1.5057 | Val Acc: 64.83% | LR: 0.000437


Epoch 109/200: 100%|██████████| 390/390 [01:09<00:00,  5.58it/s, loss=0.399]


Epoch 109/200 | Train Loss: 0.7975 | Val Loss: 1.4279 | Val Acc: 65.86% | LR: 0.000430
  New best model saved! Accuracy: 65.86%


Epoch 110/200: 100%|██████████| 390/390 [01:09<00:00,  5.58it/s, loss=0.523]


Epoch 110/200 | Train Loss: 0.7440 | Val Loss: 1.4363 | Val Acc: 66.28% | LR: 0.000422
  New best model saved! Accuracy: 66.28%


Epoch 111/200: 100%|██████████| 390/390 [01:09<00:00,  5.58it/s, loss=0.622]


Epoch 111/200 | Train Loss: 0.7546 | Val Loss: 1.4882 | Val Acc: 66.40% | LR: 0.000414
  New best model saved! Accuracy: 66.40%


Epoch 112/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.642]


Epoch 112/200 | Train Loss: 0.7520 | Val Loss: 1.4922 | Val Acc: 65.33% | LR: 0.000406


Epoch 113/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.826]


Epoch 113/200 | Train Loss: 0.7984 | Val Loss: 1.4709 | Val Acc: 66.64% | LR: 0.000399
  New best model saved! Accuracy: 66.64%


Epoch 114/200: 100%|██████████| 390/390 [01:09<00:00,  5.62it/s, loss=0.582]


Epoch 114/200 | Train Loss: 0.7489 | Val Loss: 1.4005 | Val Acc: 66.54% | LR: 0.000391


Epoch 115/200: 100%|██████████| 390/390 [01:09<00:00,  5.57it/s, loss=0.620]


Epoch 115/200 | Train Loss: 0.7724 | Val Loss: 1.4000 | Val Acc: 67.57% | LR: 0.000383
  New best model saved! Accuracy: 67.57%


Epoch 116/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=1.212]


Epoch 116/200 | Train Loss: 0.7351 | Val Loss: 1.4560 | Val Acc: 66.95% | LR: 0.000376


Epoch 117/200: 100%|██████████| 390/390 [01:10<00:00,  5.57it/s, loss=1.606]


Epoch 117/200 | Train Loss: 0.7667 | Val Loss: 1.3887 | Val Acc: 67.21% | LR: 0.000368


Epoch 118/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.538]


Epoch 118/200 | Train Loss: 0.7582 | Val Loss: 1.4366 | Val Acc: 67.13% | LR: 0.000361


Epoch 119/200: 100%|██████████| 390/390 [01:10<00:00,  5.57it/s, loss=0.566]


Epoch 119/200 | Train Loss: 0.7245 | Val Loss: 1.3727 | Val Acc: 67.24% | LR: 0.000353


Epoch 120/200: 100%|██████████| 390/390 [01:09<00:00,  5.59it/s, loss=1.066]


Epoch 120/200 | Train Loss: 0.7045 | Val Loss: 1.3966 | Val Acc: 67.84% | LR: 0.000345
  New best model saved! Accuracy: 67.84%
  Checkpoint saved: outputs\checkpoints\distilled_student_kd_epoch120.pth
  Cleaned up: distilled_student_kd_epoch120.pth


Epoch 121/200: 100%|██████████| 390/390 [01:10<00:00,  5.57it/s, loss=0.341]


Epoch 121/200 | Train Loss: 0.7135 | Val Loss: 1.4588 | Val Acc: 67.76% | LR: 0.000338


Epoch 122/200: 100%|██████████| 390/390 [01:09<00:00,  5.58it/s, loss=0.709]


Epoch 122/200 | Train Loss: 0.7228 | Val Loss: 1.4143 | Val Acc: 66.90% | LR: 0.000331


Epoch 123/200: 100%|██████████| 390/390 [01:10<00:00,  5.57it/s, loss=0.562]


Epoch 123/200 | Train Loss: 0.7103 | Val Loss: 1.4418 | Val Acc: 67.68% | LR: 0.000323


Epoch 124/200: 100%|██████████| 390/390 [01:09<00:00,  5.59it/s, loss=0.586]


Epoch 124/200 | Train Loss: 0.7370 | Val Loss: 1.4510 | Val Acc: 67.59% | LR: 0.000316


Epoch 125/200: 100%|██████████| 390/390 [01:10<00:00,  5.54it/s, loss=2.852]


Epoch 125/200 | Train Loss: 0.7325 | Val Loss: 1.4428 | Val Acc: 67.40% | LR: 0.000309


Epoch 126/200: 100%|██████████| 390/390 [01:10<00:00,  5.53it/s, loss=0.615]


Epoch 126/200 | Train Loss: 0.7025 | Val Loss: 1.4357 | Val Acc: 66.90% | LR: 0.000301


Epoch 127/200: 100%|██████████| 390/390 [01:09<00:00,  5.62it/s, loss=0.361]


Epoch 127/200 | Train Loss: 0.7616 | Val Loss: 1.3908 | Val Acc: 68.01% | LR: 0.000294
  New best model saved! Accuracy: 68.01%


Epoch 128/200: 100%|██████████| 390/390 [01:10<00:00,  5.54it/s, loss=0.585]


Epoch 128/200 | Train Loss: 0.7209 | Val Loss: 1.3389 | Val Acc: 68.57% | LR: 0.000287
  New best model saved! Accuracy: 68.57%


Epoch 129/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.618]


Epoch 129/200 | Train Loss: 0.7388 | Val Loss: 1.4326 | Val Acc: 68.00% | LR: 0.000280


Epoch 130/200: 100%|██████████| 390/390 [01:10<00:00,  5.57it/s, loss=0.524]


Epoch 130/200 | Train Loss: 0.7078 | Val Loss: 1.3868 | Val Acc: 68.84% | LR: 0.000273
  New best model saved! Accuracy: 68.84%


Epoch 131/200: 100%|██████████| 390/390 [01:10<00:00,  5.57it/s, loss=0.485]


Epoch 131/200 | Train Loss: 0.6787 | Val Loss: 1.3782 | Val Acc: 69.28% | LR: 0.000266
  New best model saved! Accuracy: 69.28%


Epoch 132/200: 100%|██████████| 390/390 [01:10<00:00,  5.57it/s, loss=0.655]


Epoch 132/200 | Train Loss: 0.7061 | Val Loss: 1.3351 | Val Acc: 68.79% | LR: 0.000259


Epoch 133/200: 100%|██████████| 390/390 [01:10<00:00,  5.53it/s, loss=1.225]


Epoch 133/200 | Train Loss: 0.6889 | Val Loss: 1.3379 | Val Acc: 68.85% | LR: 0.000252


Epoch 134/200: 100%|██████████| 390/390 [01:10<00:00,  5.57it/s, loss=0.641]


Epoch 134/200 | Train Loss: 0.6852 | Val Loss: 1.3982 | Val Acc: 68.07% | LR: 0.000245


Epoch 135/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.564]


Epoch 135/200 | Train Loss: 0.7186 | Val Loss: 1.3348 | Val Acc: 68.72% | LR: 0.000239


Epoch 136/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.389]


Epoch 136/200 | Train Loss: 0.6805 | Val Loss: 1.4031 | Val Acc: 68.91% | LR: 0.000232


Epoch 137/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=0.493]


Epoch 137/200 | Train Loss: 0.7215 | Val Loss: 1.3173 | Val Acc: 69.21% | LR: 0.000225


Epoch 138/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.559]


Epoch 138/200 | Train Loss: 0.6967 | Val Loss: 1.3388 | Val Acc: 68.90% | LR: 0.000219


Epoch 139/200: 100%|██████████| 390/390 [01:09<00:00,  5.59it/s, loss=0.611]


Epoch 139/200 | Train Loss: 0.6792 | Val Loss: 1.3771 | Val Acc: 69.16% | LR: 0.000212


Epoch 140/200: 100%|██████████| 390/390 [01:09<00:00,  5.57it/s, loss=0.601]


Epoch 140/200 | Train Loss: 0.6994 | Val Loss: 1.3556 | Val Acc: 69.82% | LR: 0.000206
  New best model saved! Accuracy: 69.82%
  Checkpoint saved: outputs\checkpoints\distilled_student_kd_epoch140.pth
  Cleaned up: distilled_student_kd_epoch140.pth


Epoch 141/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.731]


Epoch 141/200 | Train Loss: 0.6754 | Val Loss: 1.2750 | Val Acc: 70.28% | LR: 0.000200
  New best model saved! Accuracy: 70.28%


Epoch 142/200: 100%|██████████| 390/390 [01:09<00:00,  5.59it/s, loss=0.555]


Epoch 142/200 | Train Loss: 0.6714 | Val Loss: 1.3832 | Val Acc: 69.19% | LR: 0.000194


Epoch 143/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=0.407]


Epoch 143/200 | Train Loss: 0.6571 | Val Loss: 1.3640 | Val Acc: 69.25% | LR: 0.000187


Epoch 144/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=0.592]


Epoch 144/200 | Train Loss: 0.7075 | Val Loss: 1.3440 | Val Acc: 69.90% | LR: 0.000181


Epoch 145/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.658]


Epoch 145/200 | Train Loss: 0.6962 | Val Loss: 1.3199 | Val Acc: 69.94% | LR: 0.000175


Epoch 146/200: 100%|██████████| 390/390 [01:09<00:00,  5.59it/s, loss=0.413]


Epoch 146/200 | Train Loss: 0.6689 | Val Loss: 1.3581 | Val Acc: 69.47% | LR: 0.000169


Epoch 147/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=0.305]


Epoch 147/200 | Train Loss: 0.6328 | Val Loss: 1.3403 | Val Acc: 70.37% | LR: 0.000163
  New best model saved! Accuracy: 70.37%


Epoch 148/200: 100%|██████████| 390/390 [01:10<00:00,  5.53it/s, loss=0.294]


Epoch 148/200 | Train Loss: 0.6671 | Val Loss: 1.2987 | Val Acc: 70.54% | LR: 0.000158
  New best model saved! Accuracy: 70.54%


Epoch 149/200: 100%|██████████| 390/390 [01:09<00:00,  5.61it/s, loss=0.871]


Epoch 149/200 | Train Loss: 0.6559 | Val Loss: 1.3645 | Val Acc: 70.17% | LR: 0.000152


Epoch 150/200: 100%|██████████| 390/390 [01:09<00:00,  5.58it/s, loss=1.054]


Epoch 150/200 | Train Loss: 0.6445 | Val Loss: 1.3164 | Val Acc: 70.05% | LR: 0.000146


Epoch 151/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.637]


Epoch 151/200 | Train Loss: 0.6873 | Val Loss: 1.3220 | Val Acc: 70.18% | LR: 0.000141


Epoch 152/200: 100%|██████████| 390/390 [01:10<00:00,  5.57it/s, loss=0.783]


Epoch 152/200 | Train Loss: 0.6314 | Val Loss: 1.3549 | Val Acc: 69.87% | LR: 0.000136


Epoch 153/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.717]


Epoch 153/200 | Train Loss: 0.6214 | Val Loss: 1.3086 | Val Acc: 70.70% | LR: 0.000130
  New best model saved! Accuracy: 70.70%


Epoch 154/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=0.583]


Epoch 154/200 | Train Loss: 0.6697 | Val Loss: 1.3356 | Val Acc: 70.55% | LR: 0.000125


Epoch 155/200: 100%|██████████| 390/390 [01:10<00:00,  5.57it/s, loss=0.326]


Epoch 155/200 | Train Loss: 0.6358 | Val Loss: 1.3111 | Val Acc: 70.66% | LR: 0.000120


Epoch 156/200: 100%|██████████| 390/390 [01:09<00:00,  5.57it/s, loss=0.350]


Epoch 156/200 | Train Loss: 0.6623 | Val Loss: 1.3307 | Val Acc: 70.11% | LR: 0.000115


Epoch 157/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=0.647]


Epoch 157/200 | Train Loss: 0.6342 | Val Loss: 1.3364 | Val Acc: 70.41% | LR: 0.000110


Epoch 158/200: 100%|██████████| 390/390 [01:09<00:00,  5.60it/s, loss=0.596]


Epoch 158/200 | Train Loss: 0.6459 | Val Loss: 1.2925 | Val Acc: 70.91% | LR: 0.000105
  New best model saved! Accuracy: 70.91%


Epoch 159/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=0.574]


Epoch 159/200 | Train Loss: 0.6426 | Val Loss: 1.2765 | Val Acc: 71.27% | LR: 0.000100
  New best model saved! Accuracy: 71.27%


Epoch 160/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=1.098]


Epoch 160/200 | Train Loss: 0.5993 | Val Loss: 1.3248 | Val Acc: 71.09% | LR: 0.000095
  Checkpoint saved: outputs\checkpoints\distilled_student_kd_epoch160.pth
  Cleaned up: distilled_student_kd_epoch160.pth


Epoch 161/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.319]


Epoch 161/200 | Train Loss: 0.6231 | Val Loss: 1.2932 | Val Acc: 71.28% | LR: 0.000091
  New best model saved! Accuracy: 71.28%


Epoch 162/200: 100%|██████████| 390/390 [01:09<00:00,  5.58it/s, loss=0.886]


Epoch 162/200 | Train Loss: 0.6358 | Val Loss: 1.2677 | Val Acc: 71.33% | LR: 0.000086
  New best model saved! Accuracy: 71.33%


Epoch 163/200: 100%|██████████| 390/390 [01:09<00:00,  5.57it/s, loss=0.295]


Epoch 163/200 | Train Loss: 0.6354 | Val Loss: 1.2776 | Val Acc: 70.81% | LR: 0.000082


Epoch 164/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=0.888]


Epoch 164/200 | Train Loss: 0.6238 | Val Loss: 1.2708 | Val Acc: 71.30% | LR: 0.000078


Epoch 165/200: 100%|██████████| 390/390 [01:09<00:00,  5.58it/s, loss=0.609]


Epoch 165/200 | Train Loss: 0.6344 | Val Loss: 1.3306 | Val Acc: 71.09% | LR: 0.000074


Epoch 166/200: 100%|██████████| 390/390 [01:09<00:00,  5.61it/s, loss=0.894]


Epoch 166/200 | Train Loss: 0.5980 | Val Loss: 1.2813 | Val Acc: 71.34% | LR: 0.000070
  New best model saved! Accuracy: 71.34%


Epoch 167/200: 100%|██████████| 390/390 [01:09<00:00,  5.58it/s, loss=0.896]


Epoch 167/200 | Train Loss: 0.6062 | Val Loss: 1.2694 | Val Acc: 71.32% | LR: 0.000066


Epoch 168/200: 100%|██████████| 390/390 [01:10<00:00,  5.54it/s, loss=0.506]


Epoch 168/200 | Train Loss: 0.6494 | Val Loss: 1.2394 | Val Acc: 71.62% | LR: 0.000062
  New best model saved! Accuracy: 71.62%


Epoch 169/200: 100%|██████████| 390/390 [01:10<00:00,  5.54it/s, loss=0.443]


Epoch 169/200 | Train Loss: 0.6387 | Val Loss: 1.2663 | Val Acc: 71.70% | LR: 0.000058
  New best model saved! Accuracy: 71.70%


Epoch 170/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=0.710]


Epoch 170/200 | Train Loss: 0.6259 | Val Loss: 1.2587 | Val Acc: 71.74% | LR: 0.000054
  New best model saved! Accuracy: 71.74%


Epoch 171/200: 100%|██████████| 390/390 [01:10<00:00,  5.54it/s, loss=0.821]


Epoch 171/200 | Train Loss: 0.6213 | Val Loss: 1.2894 | Val Acc: 71.96% | LR: 0.000051
  New best model saved! Accuracy: 71.96%


Epoch 172/200: 100%|██████████| 390/390 [01:09<00:00,  5.58it/s, loss=0.395]


Epoch 172/200 | Train Loss: 0.6356 | Val Loss: 1.2632 | Val Acc: 71.93% | LR: 0.000048


Epoch 173/200: 100%|██████████| 390/390 [01:10<00:00,  5.54it/s, loss=0.316]


Epoch 173/200 | Train Loss: 0.6048 | Val Loss: 1.2455 | Val Acc: 71.74% | LR: 0.000044


Epoch 174/200: 100%|██████████| 390/390 [01:10<00:00,  5.54it/s, loss=0.303]


Epoch 174/200 | Train Loss: 0.5934 | Val Loss: 1.2709 | Val Acc: 71.75% | LR: 0.000041


Epoch 175/200: 100%|██████████| 390/390 [01:10<00:00,  5.54it/s, loss=0.601]


Epoch 175/200 | Train Loss: 0.6089 | Val Loss: 1.2860 | Val Acc: 71.77% | LR: 0.000038


Epoch 176/200: 100%|██████████| 390/390 [01:09<00:00,  5.58it/s, loss=0.600]


Epoch 176/200 | Train Loss: 0.5929 | Val Loss: 1.2591 | Val Acc: 71.78% | LR: 0.000035


Epoch 177/200: 100%|██████████| 390/390 [01:09<00:00,  5.58it/s, loss=0.799]


Epoch 177/200 | Train Loss: 0.6419 | Val Loss: 1.2614 | Val Acc: 71.97% | LR: 0.000032
  New best model saved! Accuracy: 71.97%


Epoch 178/200: 100%|██████████| 390/390 [01:10<00:00,  5.51it/s, loss=0.332]


Epoch 178/200 | Train Loss: 0.6275 | Val Loss: 1.2329 | Val Acc: 71.95% | LR: 0.000030


Epoch 179/200: 100%|██████████| 390/390 [01:09<00:00,  5.59it/s, loss=0.551]


Epoch 179/200 | Train Loss: 0.5820 | Val Loss: 1.2801 | Val Acc: 71.83% | LR: 0.000027


Epoch 180/200: 100%|██████████| 390/390 [01:10<00:00,  5.57it/s, loss=0.302]


Epoch 180/200 | Train Loss: 0.5821 | Val Loss: 1.2435 | Val Acc: 72.20% | LR: 0.000024
  New best model saved! Accuracy: 72.20%
  Checkpoint saved: outputs\checkpoints\distilled_student_kd_epoch180.pth
  Cleaned up: distilled_student_kd_epoch180.pth


Epoch 181/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=0.546]


Epoch 181/200 | Train Loss: 0.5995 | Val Loss: 1.2872 | Val Acc: 71.93% | LR: 0.000022


Epoch 182/200: 100%|██████████| 390/390 [01:09<00:00,  5.58it/s, loss=0.595]


Epoch 182/200 | Train Loss: 0.6353 | Val Loss: 1.2570 | Val Acc: 72.07% | LR: 0.000020


Epoch 183/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.251]


Epoch 183/200 | Train Loss: 0.6003 | Val Loss: 1.2687 | Val Acc: 71.63% | LR: 0.000018


Epoch 184/200: 100%|██████████| 390/390 [01:10<00:00,  5.57it/s, loss=0.369]


Epoch 184/200 | Train Loss: 0.5946 | Val Loss: 1.2838 | Val Acc: 71.71% | LR: 0.000016


Epoch 185/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=0.604]


Epoch 185/200 | Train Loss: 0.6033 | Val Loss: 1.2969 | Val Acc: 71.68% | LR: 0.000014


Epoch 186/200: 100%|██████████| 390/390 [01:09<00:00,  5.60it/s, loss=0.263]


Epoch 186/200 | Train Loss: 0.6050 | Val Loss: 1.2349 | Val Acc: 72.35% | LR: 0.000012
  New best model saved! Accuracy: 72.35%


Epoch 187/200: 100%|██████████| 390/390 [01:10<00:00,  5.53it/s, loss=0.504]


Epoch 187/200 | Train Loss: 0.5843 | Val Loss: 1.2478 | Val Acc: 72.12% | LR: 0.000010


Epoch 188/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.532]


Epoch 188/200 | Train Loss: 0.5931 | Val Loss: 1.2631 | Val Acc: 72.19% | LR: 0.000009


Epoch 189/200: 100%|██████████| 390/390 [01:09<00:00,  5.57it/s, loss=1.427]


Epoch 189/200 | Train Loss: 0.6204 | Val Loss: 1.2347 | Val Acc: 72.22% | LR: 0.000007


Epoch 190/200: 100%|██████████| 390/390 [01:10<00:00,  5.51it/s, loss=0.360]


Epoch 190/200 | Train Loss: 0.5838 | Val Loss: 1.2519 | Val Acc: 72.27% | LR: 0.000006


Epoch 191/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=0.528]


Epoch 191/200 | Train Loss: 0.6402 | Val Loss: 1.2514 | Val Acc: 72.00% | LR: 0.000005


Epoch 192/200: 100%|██████████| 390/390 [01:10<00:00,  5.55it/s, loss=1.215]


Epoch 192/200 | Train Loss: 0.5705 | Val Loss: 1.2576 | Val Acc: 72.30% | LR: 0.000004


Epoch 193/200: 100%|██████████| 390/390 [01:13<00:00,  5.33it/s, loss=0.470]


Epoch 193/200 | Train Loss: 0.5743 | Val Loss: 1.2603 | Val Acc: 72.19% | LR: 0.000003


Epoch 194/200: 100%|██████████| 390/390 [01:10<00:00,  5.56it/s, loss=0.548]


Epoch 194/200 | Train Loss: 0.6022 | Val Loss: 1.2726 | Val Acc: 72.35% | LR: 0.000002


Epoch 195/200: 100%|██████████| 390/390 [01:10<00:00,  5.51it/s, loss=0.716]


Epoch 195/200 | Train Loss: 0.5771 | Val Loss: 1.2457 | Val Acc: 72.24% | LR: 0.000002


Epoch 196/200: 100%|██████████| 390/390 [01:10<00:00,  5.52it/s, loss=0.384]


Epoch 196/200 | Train Loss: 0.5827 | Val Loss: 1.2541 | Val Acc: 72.27% | LR: 0.000001


Epoch 197/200: 100%|██████████| 390/390 [01:10<00:00,  5.50it/s, loss=0.398]


Epoch 197/200 | Train Loss: 0.5988 | Val Loss: 1.2572 | Val Acc: 71.88% | LR: 0.000001


Epoch 198/200: 100%|██████████| 390/390 [01:11<00:00,  5.47it/s, loss=0.474]


Epoch 198/200 | Train Loss: 0.5981 | Val Loss: 1.2802 | Val Acc: 71.84% | LR: 0.000000


Epoch 199/200: 100%|██████████| 390/390 [01:10<00:00,  5.54it/s, loss=0.363]


Epoch 199/200 | Train Loss: 0.5968 | Val Loss: 1.2367 | Val Acc: 72.36% | LR: 0.000000
  New best model saved! Accuracy: 72.36%


Epoch 200/200: 100%|██████████| 390/390 [01:10<00:00,  5.54it/s, loss=0.498]


Epoch 200/200 | Train Loss: 0.5887 | Val Loss: 1.2385 | Val Acc: 72.19% | LR: 0.000000
  Checkpoint saved: outputs\checkpoints\distilled_student_kd_epoch200.pth
  Cleaned up: distilled_student_kd_epoch200.pth

Training complete. Best accuracy: 72.36%

Distilled Student Final Accuracy: 72.36%


In [9]:
# Cell 9: Results Summary
print("\n" + "="*80)
print("FINAL RESULTS SUMMARY")
print("="*80)

print("\n| Model                    | Accuracy (%) |")
print("|--------------------------|--------------|")
try:
    print(f"| Teacher (EfficientNet-L) | {teacher_accuracy:12.2f} |")
except NameError:
    print(f"| Teacher (EfficientNet-L) | {'N/A':>12} |")

try:
    print(f"| Distilled (DKD+CutMix)   | {distilled_accuracy:12.2f} |")
except NameError:
    print(f"| Distilled (DKD+CutMix)   | {'N/A':>12} |")

try:
    improvement = distilled_accuracy - teacher_accuracy
    print(f"\n{'='*80}")
    if improvement > 0:
        print(f"Student SURPASSED Teacher by: {improvement:+.2f}%")
    else:
        print(f"Gap from Teacher: {improvement:.2f}%")
    print(f"{'='*80}")
except NameError:
    pass

print(f"\nAll models saved to: {MODEL_DIR.absolute()}")
print(f"Checkpoints saved to: {CHECKPOINT_DIR.absolute()}")


FINAL RESULTS SUMMARY

| Model                    | Accuracy (%) |
|--------------------------|--------------|
| Teacher (EfficientNet-L) |        75.75 |
| Distilled (DKD+CutMix)   |        72.36 |

Gap from Teacher: -3.39%

All models saved to: d:\Projects\MasterProject\code\outputs\models
Checkpoints saved to: d:\Projects\MasterProject\code\outputs\checkpoints
