# Knowledge Distillation for EfficientNetV2 on CIFAR-100 (SOTA) - Local Version

This notebook implements **State-of-the-Art** knowledge distillation with:

- **Decoupled Knowledge Distillation (DKD)** - Separates target and non-target class knowledge
- **CutMix + Mixup** - Advanced data augmentation

---

## Requirements

```bash
pip install torch torchvision tqdm thop matplotlib numpy
```

### Expected Results:

- **Teacher (EfficientNetV2-L):** ~68-70% accuracy
- **Distilled Student (DKD):** ~69-72% accuracy

---


In [1]:
# Cell 1: Setup and Imports
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torchvision.models import efficientnet_v2_s, efficientnet_v2_l, EfficientNet_V2_S_Weights, EfficientNet_V2_L_Weights
from tqdm import tqdm
import copy
import numpy as np
import matplotlib.pyplot as plt
import glob
import random
from pathlib import Path

# Setup Directories (Local paths)
PROJECT_ROOT = Path('./outputs')
MODEL_DIR = PROJECT_ROOT / 'models'
DATA_DIR = PROJECT_ROOT / 'data'
CHECKPOINT_DIR = PROJECT_ROOT / 'checkpoints'

MODEL_DIR.mkdir(parents=True, exist_ok=True)
DATA_DIR.mkdir(parents=True, exist_ok=True)
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Directories ready at: {PROJECT_ROOT.absolute()}")

# Reproducibility
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.benchmark = True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Directories ready at: d:\Projects\MasterProject\code\outputs
Using device: cuda
GPU: NVIDIA GeForce RTX 5070 Laptop GPU


In [2]:
# Cell 2: Experiment Configuration (SOTA)
# ==========================================
# HYPERPARAMETERS
# ==========================================
NUM_EPOCHS = 200
BATCH_SIZE = 128
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.05
PATIENCE = 30

# Distillation Params (DKD)
DKD_ALPHA = 1.0             # Weight for Target Knowledge
DKD_BETA = 8.0              # Weight for Non-Target Knowledge (Crucial)
TEMPERATURE = 4.0           # Softmax Temperature

# Augmentation Params
MIXUP_ALPHA = 0.8
CUTMIX_ALPHA = 1.0
CHECKPOINT_FREQUENCY = 20
NUM_CLASSES = 100

print(f"{'='*50}")
print(f"CONFIG: Epochs={NUM_EPOCHS} | Batch={BATCH_SIZE} | Temp={TEMPERATURE}")
print(f"DKD: Alpha={DKD_ALPHA}, Beta={DKD_BETA}")
print(f"{'='*50}")

CONFIG: Epochs=200 | Batch=128 | Temp=4.0
DKD: Alpha=1.0, Beta=8.0


In [3]:
# Cell 3: Data Loading (Enhanced with AutoAugment + RandomErasing)
from torchvision.transforms import autoaugment

transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    autoaugment.AutoAugment(policy=autoaugment.AutoAugmentPolicy.CIFAR10),
    transforms.ToTensor(),
    transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761)),
    transforms.RandomErasing(p=0.25, scale=(0.02, 0.2)),  # Cutout-like augmentation
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761)),
])

trainset = torchvision.datasets.CIFAR100(root=str(DATA_DIR), train=True, download=True, transform=transform_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True, 
                                          num_workers=4, pin_memory=True, drop_last=True)

testset = torchvision.datasets.CIFAR100(root=str(DATA_DIR), train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False, 
                                         num_workers=4, pin_memory=True)

print(f"Data loaded: {len(trainset)} Training, {len(testset)} Test images")
print(f"  Augmentation: AutoAugment + RandomErasing + CutMix/Mixup")

Data loaded: 50000 Training, 10000 Test images
  Augmentation: AutoAugment + RandomErasing + CutMix/Mixup


In [4]:
# Cell 4: Helper Functions (Enhanced KD Loss with Label Smoothing)

# ==========================================
# 1. Enhanced Knowledge Distillation Loss
# With Label Smoothing for better generalization
# ==========================================
def kd_loss(student_logits, teacher_logits, labels, temp=4.0, alpha=0.7, label_smoothing=0.1):
    """
    Enhanced Knowledge Distillation Loss
    - Soft loss: KL divergence between student and teacher soft targets
    - Hard loss: Cross-entropy with label smoothing
    - alpha: weight for soft loss (1-alpha for hard loss)
    """
    # Soft targets (KL Divergence)
    soft_student = F.log_softmax(student_logits / temp, dim=1)
    soft_teacher = F.softmax(teacher_logits / temp, dim=1)
    soft_loss = F.kl_div(soft_student, soft_teacher, reduction='batchmean') * (temp ** 2)
    
    # Hard targets with Label Smoothing
    hard_loss = F.cross_entropy(student_logits, labels, label_smoothing=label_smoothing)
    
    # Combined loss
    loss = alpha * soft_loss + (1 - alpha) * hard_loss
    
    return loss

def kd_loss_mixup(student_logits, teacher_logits, labels_a, labels_b, lam, 
                  temp=4.0, alpha=0.7, label_smoothing=0.1):
    """
    Enhanced KD Loss for mixed samples (CutMix/Mixup compatible)
    With Label Smoothing
    """
    # Soft targets (KL Divergence)
    soft_student = F.log_softmax(student_logits / temp, dim=1)
    soft_teacher = F.softmax(teacher_logits / temp, dim=1)
    soft_loss = F.kl_div(soft_student, soft_teacher, reduction='batchmean') * (temp ** 2)
    
    # Hard targets with Label Smoothing (Mixed)
    hard_loss = lam * F.cross_entropy(student_logits, labels_a, label_smoothing=label_smoothing) + \
                (1 - lam) * F.cross_entropy(student_logits, labels_b, label_smoothing=label_smoothing)
    
    # Combined loss
    loss = alpha * soft_loss + (1 - alpha) * hard_loss
    
    # NaN safety check
    if torch.isnan(loss):
        return hard_loss  # Fallback to CE only
    
    return loss

# ==========================================
# 2. Augmentations: Mixup & CutMix
# ==========================================
def mixup_data(x, y, alpha=1.0):
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1
    batch_size = x.size()[0]
    index = torch.randperm(batch_size).to(device)
    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

def rand_bbox(size, lam):
    W = size[2]
    H = size[3]
    cut_rat = np.sqrt(1. - lam)
    cut_w = int(W * cut_rat)
    cut_h = int(H * cut_rat)
    cx = np.random.randint(W)
    cy = np.random.randint(H)
    bbx1 = np.clip(cx - cut_w // 2, 0, W)
    bby1 = np.clip(cy - cut_h // 2, 0, H)
    bbx2 = np.clip(cx + cut_w // 2, 0, W)
    bby2 = np.clip(cy + cut_h // 2, 0, H)
    return bbx1, bby1, bbx2, bby2

def cutmix_data(x, y, alpha=1.0):
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1
    batch_size = x.size()[0]
    index = torch.randperm(batch_size).to(device)
    bbx1, bby1, bbx2, bby2 = rand_bbox(x.size(), lam)
    x[:, :, bbx1:bbx2, bby1:bby2] = x[index, :, bbx1:bbx2, bby1:bby2]
    lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / (x.size()[-1] * x.size()[-2]))
    y_a, y_b = y, y[index]
    return x, y_a, y_b, lam

# ==========================================
# 3. Utilities (Save/Load/Evaluate)
# ==========================================
def evaluate_model_with_loss(model, dataloader, criterion):
    model.eval()
    correct = 0
    total = 0
    running_loss = 0.0
    with torch.no_grad():
        for data in dataloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    acc = 100 * correct / total
    avg_loss = running_loss / len(dataloader)
    return acc, avg_loss

def save_checkpoint(model, optimizer, scheduler, epoch, best_acc, history, model_name, epochs_no_improve):
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'best_acc': best_acc,
        'history': history,
        'epochs_no_improve': epochs_no_improve
    }
    path = CHECKPOINT_DIR / f"{model_name}_epoch{epoch+1}.pth"
    torch.save(checkpoint, path)
    print(f"  Checkpoint saved: {path}")
    
def load_checkpoint(model, optimizer, scheduler, model_name):
    checkpoints = sorted(glob.glob(str(CHECKPOINT_DIR / f"{model_name}_epoch*.pth")))
    if not checkpoints:
        return None
    latest = checkpoints[-1]
    print(f"  Loading checkpoint: {latest}")
    checkpoint = torch.load(latest, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    return checkpoint

def cleanup_old_checkpoints(model_name, keep=3):
    checkpoints = sorted(glob.glob(str(CHECKPOINT_DIR / f"{model_name}_epoch*.pth")))
    if len(checkpoints) > keep:
        for chk in checkpoints[:-keep]:
            os.remove(chk)
            print(f"  Cleaned up: {os.path.basename(chk)}")

print("Helper functions loaded (Enhanced: Label Smoothing + Optimized Alpha)")

Helper functions loaded (Enhanced: Label Smoothing + Optimized Alpha)


In [5]:
# Cell 5: Enhanced Training Loop (with LR Warmup)
def train_model_optimized(model, dataloader, optimizer, scheduler, num_epochs, model_name, 
                         teacher_model=None, temp=4.0, kd_alpha=0.7, label_smoothing=0.1,
                         patience=30, grad_clip=1.0, warmup_epochs=5):
    """
    Enhanced Training loop with:
    - Standard KD loss with Label Smoothing
    - CutMix/Mixup augmentation
    - Learning Rate Warmup
    - Mixed precision training
    - Gradient clipping
    - NaN detection and recovery
    """
    
    # 1. Load Checkpoint
    checkpoint = load_checkpoint(model, optimizer, scheduler, model_name)
    if checkpoint:
        start_epoch = checkpoint['epoch'] + 1
        best_acc = checkpoint['best_acc']
        history = checkpoint['history']
        epochs_no_improve = checkpoint['epochs_no_improve']
        best_model_wts = copy.deepcopy(model.state_dict())
        print(f"  Resuming from epoch {start_epoch}, Best Acc: {best_acc:.2f}%")
    else:
        start_epoch = 0
        best_acc = 0.0
        history = {'train_loss': [], 'val_loss': [], 'val_accuracy': []}
        epochs_no_improve = 0
        best_model_wts = copy.deepcopy(model.state_dict())
        print(f"  Starting fresh training...")

    # 2. Setup
    scaler = torch.amp.GradScaler('cuda', enabled=torch.cuda.is_available())
    val_criterion = nn.CrossEntropyLoss()
    
    # Store base LR for warmup
    base_lr = optimizer.param_groups[0]['lr']
    
    if teacher_model:
        teacher_model.eval()
        for param in teacher_model.parameters():
            param.requires_grad = False

    # 3. Training Loop
    for epoch in range(start_epoch, num_epochs):
        model.train()
        running_loss = 0.0
        valid_batches = 0
        
        # Learning Rate Warmup
        if epoch < warmup_epochs:
            warmup_lr = base_lr * (epoch + 1) / warmup_epochs
            for param_group in optimizer.param_groups:
                param_group['lr'] = warmup_lr
        
        loop = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")
        
        for inputs, labels in loop:
            inputs, labels = inputs.to(device), labels.to(device)
            
            # Randomly choose Mixup (50%) or CutMix (50%)
            use_cutmix = np.random.rand() > 0.5
            if use_cutmix:
                inputs_aug, labels_a, labels_b, lam = cutmix_data(inputs.clone(), labels, alpha=CUTMIX_ALPHA)
            else:
                inputs_aug, labels_a, labels_b, lam = mixup_data(inputs, labels, alpha=MIXUP_ALPHA)
            
            optimizer.zero_grad()
            
            with torch.amp.autocast('cuda', enabled=torch.cuda.is_available()):
                # Student Forward
                student_outputs = model(inputs_aug)
                
                # Loss Calculation
                if teacher_model:
                    # Teacher Forward (no grad)
                    with torch.no_grad():
                        teacher_outputs = teacher_model(inputs_aug)
                    
                    # Enhanced KD Loss with Label Smoothing
                    loss = kd_loss_mixup(
                        student_outputs, teacher_outputs, 
                        labels_a, labels_b, lam,
                        temp=temp, alpha=kd_alpha, label_smoothing=label_smoothing
                    )
                else:
                    # Standard CE with Label Smoothing for Teacher training
                    loss = lam * F.cross_entropy(student_outputs, labels_a, label_smoothing=label_smoothing) + \
                           (1 - lam) * F.cross_entropy(student_outputs, labels_b, label_smoothing=label_smoothing)
            
            # Skip batch if loss is NaN
            if torch.isnan(loss):
                loop.set_postfix(loss="NaN-skip")
                continue
            
            # Backward
            scaler.scale(loss).backward()
            
            if grad_clip > 0:
                scaler.unscale_(optimizer)
                nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
                
            scaler.step(optimizer)
            scaler.update()
            
            running_loss += loss.item()
            valid_batches += 1
            loop.set_postfix(loss=f"{loss.item():.3f}")

        # Step Scheduler (only after warmup)
        if epoch >= warmup_epochs:
            scheduler.step()
        
        # Validation
        train_loss = running_loss / max(valid_batches, 1)
        val_acc, val_loss = evaluate_model_with_loss(model, testloader, val_criterion)
        
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['val_accuracy'].append(val_acc)
        
        current_lr = optimizer.param_groups[0]['lr']
        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}% | LR: {current_lr:.6f}")
        
        # Save Best
        if val_acc > best_acc:
            best_acc = val_acc
            best_model_wts = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), MODEL_DIR / f"{model_name}.pth")
            epochs_no_improve = 0
            print(f"  New best model saved! Accuracy: {best_acc:.2f}%")
        else:
            epochs_no_improve += 1
            
        # Checkpointing
        if (epoch + 1) % CHECKPOINT_FREQUENCY == 0:
            save_checkpoint(model, optimizer, scheduler, epoch, best_acc, history, model_name, epochs_no_improve)
            cleanup_old_checkpoints(model_name)
            
        # Early Stopping
        if epochs_no_improve >= patience:
            print(f"Early stopping triggered after {epoch+1} epochs.")
            break
        
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            
    print(f"\nTraining complete. Best accuracy: {best_acc:.2f}%")
    model.load_state_dict(best_model_wts)
    return model, history

print("Training function loaded (Enhanced: LR Warmup + Label Smoothing)")

Training function loaded (Enhanced: LR Warmup + Label Smoothing)


In [6]:
# Cell 6: Initialize Models
print("Loading Teacher (EfficientNetV2-L)...")
teacher_model = efficientnet_v2_l(weights=EfficientNet_V2_L_Weights.IMAGENET1K_V1)
teacher_model.classifier[1] = nn.Linear(teacher_model.classifier[1].in_features, NUM_CLASSES)
teacher_model = teacher_model.to(device)

print("Loading Student (EfficientNetV2-S)...")
student_model = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)
student_model.classifier[1] = nn.Linear(student_model.classifier[1].in_features, NUM_CLASSES)
student_model = student_model.to(device)

print("Models loaded")

Loading Teacher (EfficientNetV2-L)...
Loading Student (EfficientNetV2-S)...
Models loaded


In [7]:
# Cell 7: Train/Load Teacher Model
print("\n" + "="*70)
print("TEACHER MODEL")
print("="*70)

teacher_path = MODEL_DIR / "teacher_model.pth"
if teacher_path.exists():
    print(f"Found existing Teacher Model: {teacher_path}")
    teacher_model.load_state_dict(torch.load(teacher_path, map_location=device))
else:
    print("Training Teacher Model (This may take a while)...")
    opt_t = optim.AdamW(teacher_model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    sch_t = optim.lr_scheduler.CosineAnnealingLR(opt_t, T_max=NUM_EPOCHS)
    teacher_model, teacher_history = train_model_optimized(
        teacher_model, trainloader, opt_t, sch_t, NUM_EPOCHS, "teacher_model", teacher_model=None
    )

# Evaluate Teacher
teacher_model.eval()
teacher_accuracy, _ = evaluate_model_with_loss(teacher_model, testloader, nn.CrossEntropyLoss())
print(f"\nTeacher Accuracy: {teacher_accuracy:.2f}%")


TEACHER MODEL
Found existing Teacher Model: outputs\models\teacher_model.pth

Teacher Accuracy: 75.76%


In [8]:
# Cell 8: Train Enhanced Distilled Student
print("\n" + "="*70)
print("ENHANCED DISTILLED STUDENT (v2)")
print("="*70)

# Enhanced KD Parameters
KD_ALPHA = 0.7           # More hard label signal (was 0.9)
KD_TEMP = 4.0            # Temperature for softening
LABEL_SMOOTHING = 0.1    # Regularization
WARMUP_EPOCHS = 5        # LR warmup

student_name = "distilled_student_enhanced"
student_path = MODEL_DIR / f"{student_name}.pth"

if student_path.exists():
    print(f"Found existing Enhanced Model: {student_path}")
    student_model.load_state_dict(torch.load(student_path, map_location=device))
    distilled_accuracy, _ = evaluate_model_with_loss(student_model, testloader, nn.CrossEntropyLoss())
    print(f"Enhanced Student Accuracy: {distilled_accuracy:.2f}%")
else:
    print(f"\nStarting Enhanced Knowledge Distillation...")
    print(f"  KD Alpha (soft weight): {KD_ALPHA}")
    print(f"  Temperature: {KD_TEMP}")
    print(f"  Label Smoothing: {LABEL_SMOOTHING}")
    print(f"  LR Warmup: {WARMUP_EPOCHS} epochs")
    print(f"  Augmentation: AutoAugment + RandomErasing + CutMix/Mixup")
    
    # Re-initialize student model (fresh weights from ImageNet)
    student_model = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)
    student_model.classifier[1] = nn.Linear(student_model.classifier[1].in_features, NUM_CLASSES)
    student_model = student_model.to(device)
    
    # Optimizer & Scheduler
    opt_s = optim.AdamW(student_model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    sch_s = optim.lr_scheduler.CosineAnnealingLR(opt_s, T_max=NUM_EPOCHS - WARMUP_EPOCHS)
    
    # Run Enhanced Training
    trained_student, distilled_history = train_model_optimized(
        model=student_model,
        dataloader=trainloader,
        optimizer=opt_s,
        scheduler=sch_s,
        num_epochs=NUM_EPOCHS,
        model_name=student_name,
        teacher_model=teacher_model,
        temp=KD_TEMP,
        kd_alpha=KD_ALPHA,
        label_smoothing=LABEL_SMOOTHING,
        patience=PATIENCE,
        warmup_epochs=WARMUP_EPOCHS
    )
    
    distilled_accuracy, _ = evaluate_model_with_loss(trained_student, testloader, nn.CrossEntropyLoss())
    print(f"\nEnhanced Student Final Accuracy: {distilled_accuracy:.2f}%")


ENHANCED DISTILLED STUDENT (v2)

Starting Enhanced Knowledge Distillation...
  KD Alpha (soft weight): 0.7
  Temperature: 4.0
  Label Smoothing: 0.1
  LR Warmup: 5 epochs
  Augmentation: AutoAugment + RandomErasing + CutMix/Mixup
  Starting fresh training...


Epoch 1/200: 100%|██████████| 390/390 [01:05<00:00,  5.93it/s, loss=1.797]


Epoch 1/200 | Train Loss: 2.0196 | Val Loss: 2.9395 | Val Acc: 31.13% | LR: 0.000200
  New best model saved! Accuracy: 31.13%


Epoch 2/200: 100%|██████████| 390/390 [01:16<00:00,  5.11it/s, loss=1.795]


Epoch 2/200 | Train Loss: 1.7863 | Val Loss: 2.3039 | Val Acc: 43.64% | LR: 0.000400
  New best model saved! Accuracy: 43.64%


Epoch 3/200: 100%|██████████| 390/390 [01:17<00:00,  5.06it/s, loss=3.572]


Epoch 3/200 | Train Loss: 1.6890 | Val Loss: 2.2108 | Val Acc: 45.98% | LR: 0.000600
  New best model saved! Accuracy: 45.98%


Epoch 4/200: 100%|██████████| 390/390 [01:17<00:00,  5.05it/s, loss=1.530]


Epoch 4/200 | Train Loss: 1.6473 | Val Loss: 2.1364 | Val Acc: 47.99% | LR: 0.000800
  New best model saved! Accuracy: 47.99%


Epoch 5/200: 100%|██████████| 390/390 [01:16<00:00,  5.08it/s, loss=1.607]


Epoch 5/200 | Train Loss: 1.6479 | Val Loss: 2.2271 | Val Acc: 47.82% | LR: 0.001000


Epoch 6/200: 100%|██████████| 390/390 [01:15<00:00,  5.14it/s, loss=1.233]


Epoch 6/200 | Train Loss: 1.6045 | Val Loss: 1.9863 | Val Acc: 51.15% | LR: 0.001000
  New best model saved! Accuracy: 51.15%


Epoch 7/200: 100%|██████████| 390/390 [01:16<00:00,  5.11it/s, loss=1.619]


Epoch 7/200 | Train Loss: 1.6148 | Val Loss: 2.1493 | Val Acc: 49.58% | LR: 0.001000


Epoch 8/200: 100%|██████████| 390/390 [01:16<00:00,  5.13it/s, loss=1.675]


Epoch 8/200 | Train Loss: 1.5943 | Val Loss: 1.9678 | Val Acc: 51.40% | LR: 0.000999
  New best model saved! Accuracy: 51.40%


Epoch 9/200: 100%|██████████| 390/390 [01:16<00:00,  5.11it/s, loss=1.555]


Epoch 9/200 | Train Loss: 1.5634 | Val Loss: 1.9286 | Val Acc: 54.08% | LR: 0.000999
  New best model saved! Accuracy: 54.08%


Epoch 10/200: 100%|██████████| 390/390 [01:17<00:00,  5.06it/s, loss=1.525]


Epoch 10/200 | Train Loss: 1.5585 | Val Loss: 1.9508 | Val Acc: 54.56% | LR: 0.000998
  New best model saved! Accuracy: 54.56%


Epoch 11/200: 100%|██████████| 390/390 [01:16<00:00,  5.08it/s, loss=1.659]


Epoch 11/200 | Train Loss: 1.5632 | Val Loss: 1.9278 | Val Acc: 54.64% | LR: 0.000998
  New best model saved! Accuracy: 54.64%


Epoch 12/200: 100%|██████████| 390/390 [01:16<00:00,  5.07it/s, loss=1.543]


Epoch 12/200 | Train Loss: 1.5306 | Val Loss: 1.8076 | Val Acc: 56.68% | LR: 0.000997
  New best model saved! Accuracy: 56.68%


Epoch 13/200: 100%|██████████| 390/390 [01:17<00:00,  5.06it/s, loss=1.408]


Epoch 13/200 | Train Loss: 1.4819 | Val Loss: 1.8306 | Val Acc: 56.93% | LR: 0.000996
  New best model saved! Accuracy: 56.93%


Epoch 14/200: 100%|██████████| 390/390 [01:17<00:00,  5.03it/s, loss=1.525]


Epoch 14/200 | Train Loss: 1.5050 | Val Loss: 1.8356 | Val Acc: 56.00% | LR: 0.000995


Epoch 15/200: 100%|██████████| 390/390 [01:17<00:00,  5.03it/s, loss=1.277]


Epoch 15/200 | Train Loss: 1.5144 | Val Loss: 1.8029 | Val Acc: 56.07% | LR: 0.000994


Epoch 16/200: 100%|██████████| 390/390 [01:16<00:00,  5.11it/s, loss=1.542]


Epoch 16/200 | Train Loss: 1.4920 | Val Loss: 1.8275 | Val Acc: 55.49% | LR: 0.000992


Epoch 17/200: 100%|██████████| 390/390 [01:16<00:00,  5.08it/s, loss=1.654]


Epoch 17/200 | Train Loss: 1.4937 | Val Loss: 1.7966 | Val Acc: 56.55% | LR: 0.000991


Epoch 18/200: 100%|██████████| 390/390 [01:16<00:00,  5.11it/s, loss=1.561]


Epoch 18/200 | Train Loss: 1.4895 | Val Loss: 1.8387 | Val Acc: 56.56% | LR: 0.000989


Epoch 19/200: 100%|██████████| 390/390 [01:16<00:00,  5.09it/s, loss=1.481]


Epoch 19/200 | Train Loss: 1.4943 | Val Loss: 1.7883 | Val Acc: 57.76% | LR: 0.000987
  New best model saved! Accuracy: 57.76%


Epoch 20/200: 100%|██████████| 390/390 [01:17<00:00,  5.03it/s, loss=1.254]


Epoch 20/200 | Train Loss: 1.4550 | Val Loss: 1.8097 | Val Acc: 57.68% | LR: 0.000985
  Checkpoint saved: outputs\checkpoints\distilled_student_enhanced_epoch20.pth


Epoch 21/200: 100%|██████████| 390/390 [01:16<00:00,  5.08it/s, loss=1.232]


Epoch 21/200 | Train Loss: 1.4728 | Val Loss: 1.7933 | Val Acc: 57.11% | LR: 0.000983


Epoch 22/200: 100%|██████████| 390/390 [01:17<00:00,  5.03it/s, loss=1.479]


Epoch 22/200 | Train Loss: 1.4840 | Val Loss: 1.8447 | Val Acc: 58.21% | LR: 0.000981
  New best model saved! Accuracy: 58.21%


Epoch 23/200: 100%|██████████| 390/390 [01:16<00:00,  5.07it/s, loss=1.428]


Epoch 23/200 | Train Loss: 1.4715 | Val Loss: 1.7661 | Val Acc: 57.79% | LR: 0.000979


Epoch 24/200: 100%|██████████| 390/390 [01:22<00:00,  4.74it/s, loss=1.195]


Epoch 24/200 | Train Loss: 1.4897 | Val Loss: 1.8245 | Val Acc: 57.93% | LR: 0.000977


Epoch 25/200: 100%|██████████| 390/390 [01:17<00:00,  5.04it/s, loss=1.346]


Epoch 25/200 | Train Loss: 1.4481 | Val Loss: 1.8029 | Val Acc: 59.57% | LR: 0.000974
  New best model saved! Accuracy: 59.57%


Epoch 26/200: 100%|██████████| 390/390 [01:16<00:00,  5.07it/s, loss=1.336]


Epoch 26/200 | Train Loss: 1.4495 | Val Loss: 1.7460 | Val Acc: 59.26% | LR: 0.000972


Epoch 27/200: 100%|██████████| 390/390 [01:17<00:00,  5.04it/s, loss=1.714]


Epoch 27/200 | Train Loss: 1.4519 | Val Loss: 1.8578 | Val Acc: 58.37% | LR: 0.000969


Epoch 28/200: 100%|██████████| 390/390 [01:16<00:00,  5.08it/s, loss=1.463]


Epoch 28/200 | Train Loss: 1.4365 | Val Loss: 1.7682 | Val Acc: 59.32% | LR: 0.000966


Epoch 29/200: 100%|██████████| 390/390 [01:16<00:00,  5.10it/s, loss=1.323]


Epoch 29/200 | Train Loss: 1.4417 | Val Loss: 1.7464 | Val Acc: 60.14% | LR: 0.000963
  New best model saved! Accuracy: 60.14%


Epoch 30/200: 100%|██████████| 390/390 [01:16<00:00,  5.10it/s, loss=1.359]


Epoch 30/200 | Train Loss: 1.4241 | Val Loss: 1.7976 | Val Acc: 59.01% | LR: 0.000960


Epoch 31/200: 100%|██████████| 390/390 [01:17<00:00,  5.02it/s, loss=1.242]


Epoch 31/200 | Train Loss: 1.4773 | Val Loss: 1.7065 | Val Acc: 60.53% | LR: 0.000957
  New best model saved! Accuracy: 60.53%


Epoch 32/200: 100%|██████████| 390/390 [01:16<00:00,  5.09it/s, loss=1.529]


Epoch 32/200 | Train Loss: 1.4280 | Val Loss: 1.7297 | Val Acc: 60.09% | LR: 0.000953


Epoch 33/200: 100%|██████████| 390/390 [01:16<00:00,  5.10it/s, loss=1.416]


Epoch 33/200 | Train Loss: 1.4424 | Val Loss: 1.6846 | Val Acc: 60.31% | LR: 0.000950


Epoch 34/200: 100%|██████████| 390/390 [01:17<00:00,  5.05it/s, loss=1.207]


Epoch 34/200 | Train Loss: 1.4248 | Val Loss: 1.6446 | Val Acc: 61.72% | LR: 0.000946
  New best model saved! Accuracy: 61.72%


Epoch 35/200: 100%|██████████| 390/390 [01:16<00:00,  5.10it/s, loss=2.353]


Epoch 35/200 | Train Loss: 1.4221 | Val Loss: 1.8391 | Val Acc: 59.46% | LR: 0.000943


Epoch 36/200: 100%|██████████| 390/390 [01:17<00:00,  5.05it/s, loss=1.387]


Epoch 36/200 | Train Loss: 1.4354 | Val Loss: 1.7656 | Val Acc: 59.43% | LR: 0.000939


Epoch 37/200: 100%|██████████| 390/390 [01:16<00:00,  5.11it/s, loss=1.382]


Epoch 37/200 | Train Loss: 1.4198 | Val Loss: 1.7734 | Val Acc: 59.69% | LR: 0.000935


Epoch 38/200: 100%|██████████| 390/390 [01:16<00:00,  5.09it/s, loss=1.483]


Epoch 38/200 | Train Loss: 1.4138 | Val Loss: 1.6524 | Val Acc: 60.58% | LR: 0.000931


Epoch 39/200: 100%|██████████| 390/390 [01:16<00:00,  5.12it/s, loss=1.485]


Epoch 39/200 | Train Loss: 1.4049 | Val Loss: 1.6389 | Val Acc: 61.98% | LR: 0.000927
  New best model saved! Accuracy: 61.98%


Epoch 40/200: 100%|██████████| 390/390 [01:16<00:00,  5.09it/s, loss=1.492]


Epoch 40/200 | Train Loss: 1.4053 | Val Loss: 1.6891 | Val Acc: 61.34% | LR: 0.000923
  Checkpoint saved: outputs\checkpoints\distilled_student_enhanced_epoch40.pth


Epoch 41/200: 100%|██████████| 390/390 [01:16<00:00,  5.08it/s, loss=1.472]


Epoch 41/200 | Train Loss: 1.4463 | Val Loss: 1.6985 | Val Acc: 60.72% | LR: 0.000918


Epoch 42/200: 100%|██████████| 390/390 [01:17<00:00,  5.05it/s, loss=1.314]


Epoch 42/200 | Train Loss: 1.3922 | Val Loss: 1.7450 | Val Acc: 59.93% | LR: 0.000914


Epoch 43/200: 100%|██████████| 390/390 [01:16<00:00,  5.08it/s, loss=1.282]


Epoch 43/200 | Train Loss: 1.4159 | Val Loss: 1.6888 | Val Acc: 61.13% | LR: 0.000909


Epoch 44/200: 100%|██████████| 390/390 [01:17<00:00,  5.05it/s, loss=1.207]


Epoch 44/200 | Train Loss: 1.3701 | Val Loss: 1.6240 | Val Acc: 61.95% | LR: 0.000905


Epoch 45/200: 100%|██████████| 390/390 [01:15<00:00,  5.14it/s, loss=1.044]


Epoch 45/200 | Train Loss: 1.3927 | Val Loss: 1.6691 | Val Acc: 61.18% | LR: 0.000900


Epoch 46/200: 100%|██████████| 390/390 [01:17<00:00,  5.04it/s, loss=1.328]


Epoch 46/200 | Train Loss: 1.4260 | Val Loss: 1.6524 | Val Acc: 62.06% | LR: 0.000895
  New best model saved! Accuracy: 62.06%


Epoch 47/200: 100%|██████████| 390/390 [01:16<00:00,  5.08it/s, loss=2.977]


Epoch 47/200 | Train Loss: 1.3913 | Val Loss: 1.6835 | Val Acc: 59.76% | LR: 0.000890


Epoch 48/200: 100%|██████████| 390/390 [01:16<00:00,  5.10it/s, loss=1.533]


Epoch 48/200 | Train Loss: 1.3958 | Val Loss: 1.6740 | Val Acc: 61.71% | LR: 0.000885


Epoch 49/200: 100%|██████████| 390/390 [01:16<00:00,  5.13it/s, loss=1.430]


Epoch 49/200 | Train Loss: 1.4251 | Val Loss: 1.7469 | Val Acc: 61.15% | LR: 0.000880


Epoch 50/200: 100%|██████████| 390/390 [01:16<00:00,  5.08it/s, loss=1.494]


Epoch 50/200 | Train Loss: 1.4004 | Val Loss: 1.6801 | Val Acc: 60.64% | LR: 0.000874


Epoch 51/200: 100%|██████████| 390/390 [01:16<00:00,  5.08it/s, loss=1.300]


Epoch 51/200 | Train Loss: 1.4299 | Val Loss: 1.5695 | Val Acc: 63.77% | LR: 0.000869
  New best model saved! Accuracy: 63.77%


Epoch 52/200: 100%|██████████| 390/390 [01:16<00:00,  5.07it/s, loss=1.567]


Epoch 52/200 | Train Loss: 1.4187 | Val Loss: 1.6730 | Val Acc: 62.29% | LR: 0.000863


Epoch 53/200: 100%|██████████| 390/390 [01:17<00:00,  5.03it/s, loss=1.442]


Epoch 53/200 | Train Loss: 1.4040 | Val Loss: 1.8826 | Val Acc: 61.57% | LR: 0.000858


Epoch 54/200: 100%|██████████| 390/390 [01:16<00:00,  5.08it/s, loss=1.424]


Epoch 54/200 | Train Loss: 1.3698 | Val Loss: 1.5417 | Val Acc: 62.77% | LR: 0.000852


Epoch 55/200: 100%|██████████| 390/390 [01:16<00:00,  5.12it/s, loss=1.442]


Epoch 55/200 | Train Loss: 1.4025 | Val Loss: 1.5604 | Val Acc: 63.28% | LR: 0.000846


Epoch 56/200: 100%|██████████| 390/390 [01:16<00:00,  5.10it/s, loss=1.439]


Epoch 56/200 | Train Loss: 1.3850 | Val Loss: 1.5670 | Val Acc: 63.53% | LR: 0.000841


Epoch 57/200: 100%|██████████| 390/390 [01:17<00:00,  5.06it/s, loss=1.358]


Epoch 57/200 | Train Loss: 1.3628 | Val Loss: 1.6223 | Val Acc: 62.84% | LR: 0.000835


Epoch 58/200: 100%|██████████| 390/390 [01:17<00:00,  5.04it/s, loss=1.123]


Epoch 58/200 | Train Loss: 1.3800 | Val Loss: 1.5934 | Val Acc: 61.99% | LR: 0.000829


Epoch 59/200: 100%|██████████| 390/390 [01:16<00:00,  5.07it/s, loss=1.402]


Epoch 59/200 | Train Loss: 1.4148 | Val Loss: 1.6245 | Val Acc: 63.37% | LR: 0.000822


Epoch 60/200: 100%|██████████| 390/390 [01:17<00:00,  5.05it/s, loss=1.204]


Epoch 60/200 | Train Loss: 1.3836 | Val Loss: 1.7392 | Val Acc: 61.87% | LR: 0.000816
  Checkpoint saved: outputs\checkpoints\distilled_student_enhanced_epoch60.pth


Epoch 61/200: 100%|██████████| 390/390 [01:16<00:00,  5.07it/s, loss=1.481]


Epoch 61/200 | Train Loss: 1.3658 | Val Loss: 1.5725 | Val Acc: 63.50% | LR: 0.000810


Epoch 62/200: 100%|██████████| 390/390 [01:17<00:00,  5.06it/s, loss=1.585]


Epoch 62/200 | Train Loss: 1.3929 | Val Loss: 1.6231 | Val Acc: 62.79% | LR: 0.000804


Epoch 63/200: 100%|██████████| 390/390 [01:16<00:00,  5.07it/s, loss=1.194]


Epoch 63/200 | Train Loss: 1.3561 | Val Loss: 1.6219 | Val Acc: 63.16% | LR: 0.000797


Epoch 64/200: 100%|██████████| 390/390 [01:16<00:00,  5.11it/s, loss=1.405]


Epoch 64/200 | Train Loss: 1.3586 | Val Loss: 1.6379 | Val Acc: 63.34% | LR: 0.000791


Epoch 65/200: 100%|██████████| 390/390 [01:16<00:00,  5.07it/s, loss=1.134]


Epoch 65/200 | Train Loss: 1.3710 | Val Loss: 1.5188 | Val Acc: 64.86% | LR: 0.000784
  New best model saved! Accuracy: 64.86%


Epoch 66/200: 100%|██████████| 390/390 [01:16<00:00,  5.08it/s, loss=1.427]


Epoch 66/200 | Train Loss: 1.3650 | Val Loss: 1.6543 | Val Acc: 63.28% | LR: 0.000777


Epoch 67/200: 100%|██████████| 390/390 [01:17<00:00,  5.05it/s, loss=1.506]


Epoch 67/200 | Train Loss: 1.3827 | Val Loss: 1.5576 | Val Acc: 64.44% | LR: 0.000771


Epoch 68/200: 100%|██████████| 390/390 [01:17<00:00,  5.04it/s, loss=1.364]


Epoch 68/200 | Train Loss: 1.3769 | Val Loss: 1.5973 | Val Acc: 63.36% | LR: 0.000764


Epoch 69/200: 100%|██████████| 390/390 [01:16<00:00,  5.07it/s, loss=1.280]


Epoch 69/200 | Train Loss: 1.3452 | Val Loss: 1.5308 | Val Acc: 63.88% | LR: 0.000757


Epoch 70/200: 100%|██████████| 390/390 [01:16<00:00,  5.08it/s, loss=1.340]


Epoch 70/200 | Train Loss: 1.3336 | Val Loss: 1.6187 | Val Acc: 64.18% | LR: 0.000750


Epoch 71/200: 100%|██████████| 390/390 [01:15<00:00,  5.15it/s, loss=1.457]


Epoch 71/200 | Train Loss: 1.3412 | Val Loss: 1.5626 | Val Acc: 64.48% | LR: 0.000743


Epoch 72/200: 100%|██████████| 390/390 [01:16<00:00,  5.07it/s, loss=2.507]


Epoch 72/200 | Train Loss: 1.3552 | Val Loss: 1.5928 | Val Acc: 63.87% | LR: 0.000736


Epoch 73/200: 100%|██████████| 390/390 [01:18<00:00,  4.99it/s, loss=1.042]


Epoch 73/200 | Train Loss: 1.3425 | Val Loss: 1.5236 | Val Acc: 64.33% | LR: 0.000729


Epoch 74/200: 100%|██████████| 390/390 [01:18<00:00,  4.98it/s, loss=0.947]


Epoch 74/200 | Train Loss: 1.3556 | Val Loss: 1.5615 | Val Acc: 65.21% | LR: 0.000722
  New best model saved! Accuracy: 65.21%


Epoch 75/200: 100%|██████████| 390/390 [01:17<00:00,  5.05it/s, loss=1.450]


Epoch 75/200 | Train Loss: 1.3446 | Val Loss: 1.5076 | Val Acc: 64.65% | LR: 0.000714


Epoch 76/200: 100%|██████████| 390/390 [01:16<00:00,  5.09it/s, loss=1.438]


Epoch 76/200 | Train Loss: 1.3519 | Val Loss: 1.4769 | Val Acc: 65.93% | LR: 0.000707
  New best model saved! Accuracy: 65.93%


Epoch 77/200: 100%|██████████| 390/390 [01:17<00:00,  5.03it/s, loss=1.310]


Epoch 77/200 | Train Loss: 1.3271 | Val Loss: 1.5067 | Val Acc: 65.57% | LR: 0.000700


Epoch 78/200: 100%|██████████| 390/390 [01:17<00:00,  5.03it/s, loss=1.304]


Epoch 78/200 | Train Loss: 1.3187 | Val Loss: 1.6658 | Val Acc: 63.05% | LR: 0.000692


Epoch 79/200: 100%|██████████| 390/390 [01:17<00:00,  5.03it/s, loss=1.412]


Epoch 79/200 | Train Loss: 1.3679 | Val Loss: 1.5462 | Val Acc: 65.98% | LR: 0.000685
  New best model saved! Accuracy: 65.98%


Epoch 80/200: 100%|██████████| 390/390 [01:16<00:00,  5.07it/s, loss=1.474]


Epoch 80/200 | Train Loss: 1.3590 | Val Loss: 1.5113 | Val Acc: 66.01% | LR: 0.000677
  New best model saved! Accuracy: 66.01%
  Checkpoint saved: outputs\checkpoints\distilled_student_enhanced_epoch80.pth
  Cleaned up: distilled_student_enhanced_epoch20.pth


Epoch 81/200: 100%|██████████| 390/390 [01:17<00:00,  5.05it/s, loss=1.359]


Epoch 81/200 | Train Loss: 1.3552 | Val Loss: 1.5116 | Val Acc: 65.53% | LR: 0.000670


Epoch 82/200: 100%|██████████| 390/390 [01:16<00:00,  5.08it/s, loss=1.291]


Epoch 82/200 | Train Loss: 1.3287 | Val Loss: 1.4997 | Val Acc: 65.63% | LR: 0.000662


Epoch 83/200: 100%|██████████| 390/390 [01:16<00:00,  5.12it/s, loss=1.256]


Epoch 83/200 | Train Loss: 1.3565 | Val Loss: 1.4520 | Val Acc: 66.19% | LR: 0.000655
  New best model saved! Accuracy: 66.19%


Epoch 84/200: 100%|██████████| 390/390 [01:17<00:00,  5.03it/s, loss=1.336]


Epoch 84/200 | Train Loss: 1.3477 | Val Loss: 1.5096 | Val Acc: 65.65% | LR: 0.000647


Epoch 85/200: 100%|██████████| 390/390 [01:17<00:00,  5.03it/s, loss=0.920]


Epoch 85/200 | Train Loss: 1.3367 | Val Loss: 1.5043 | Val Acc: 67.13% | LR: 0.000639
  New best model saved! Accuracy: 67.13%


Epoch 86/200: 100%|██████████| 390/390 [01:17<00:00,  5.04it/s, loss=1.379]


Epoch 86/200 | Train Loss: 1.3465 | Val Loss: 1.5285 | Val Acc: 65.21% | LR: 0.000631


Epoch 87/200: 100%|██████████| 390/390 [01:16<00:00,  5.07it/s, loss=1.292]


Epoch 87/200 | Train Loss: 1.3363 | Val Loss: 1.5070 | Val Acc: 65.66% | LR: 0.000624


Epoch 88/200: 100%|██████████| 390/390 [01:17<00:00,  5.06it/s, loss=1.026]


Epoch 88/200 | Train Loss: 1.3011 | Val Loss: 1.4572 | Val Acc: 66.46% | LR: 0.000616


Epoch 89/200: 100%|██████████| 390/390 [01:16<00:00,  5.07it/s, loss=1.359]


Epoch 89/200 | Train Loss: 1.3212 | Val Loss: 1.4740 | Val Acc: 66.25% | LR: 0.000608


Epoch 90/200: 100%|██████████| 390/390 [01:17<00:00,  5.04it/s, loss=1.387]


Epoch 90/200 | Train Loss: 1.3450 | Val Loss: 1.4568 | Val Acc: 67.56% | LR: 0.000600
  New best model saved! Accuracy: 67.56%


Epoch 91/200: 100%|██████████| 390/390 [01:16<00:00,  5.11it/s, loss=1.633]


Epoch 91/200 | Train Loss: 1.3337 | Val Loss: 1.4639 | Val Acc: 66.49% | LR: 0.000592


Epoch 92/200: 100%|██████████| 390/390 [01:17<00:00,  5.05it/s, loss=1.017]


Epoch 92/200 | Train Loss: 1.2906 | Val Loss: 1.4365 | Val Acc: 66.98% | LR: 0.000584


Epoch 93/200: 100%|██████████| 390/390 [01:16<00:00,  5.07it/s, loss=1.411]


Epoch 93/200 | Train Loss: 1.3423 | Val Loss: 1.4781 | Val Acc: 67.04% | LR: 0.000576


Epoch 94/200: 100%|██████████| 390/390 [01:16<00:00,  5.08it/s, loss=0.892]


Epoch 94/200 | Train Loss: 1.3073 | Val Loss: 1.4708 | Val Acc: 66.71% | LR: 0.000568


Epoch 95/200: 100%|██████████| 390/390 [01:16<00:00,  5.09it/s, loss=1.120]


Epoch 95/200 | Train Loss: 1.3045 | Val Loss: 1.4876 | Val Acc: 67.51% | LR: 0.000560


Epoch 96/200: 100%|██████████| 390/390 [01:16<00:00,  5.08it/s, loss=1.075]


Epoch 96/200 | Train Loss: 1.3064 | Val Loss: 1.4393 | Val Acc: 67.78% | LR: 0.000552
  New best model saved! Accuracy: 67.78%


Epoch 97/200: 100%|██████████| 390/390 [01:17<00:00,  5.05it/s, loss=1.400]


Epoch 97/200 | Train Loss: 1.2873 | Val Loss: 1.5367 | Val Acc: 65.74% | LR: 0.000544


Epoch 98/200: 100%|██████████| 390/390 [01:17<00:00,  5.06it/s, loss=1.213]


Epoch 98/200 | Train Loss: 1.2874 | Val Loss: 1.4143 | Val Acc: 67.59% | LR: 0.000536


Epoch 99/200: 100%|██████████| 390/390 [01:17<00:00,  5.02it/s, loss=1.270]


Epoch 99/200 | Train Loss: 1.3339 | Val Loss: 1.3562 | Val Acc: 68.24% | LR: 0.000528
  New best model saved! Accuracy: 68.24%


Epoch 100/200: 100%|██████████| 390/390 [01:16<00:00,  5.07it/s, loss=1.396]


Epoch 100/200 | Train Loss: 1.3076 | Val Loss: 1.3735 | Val Acc: 68.10% | LR: 0.000520
  Checkpoint saved: outputs\checkpoints\distilled_student_enhanced_epoch100.pth
  Cleaned up: distilled_student_enhanced_epoch100.pth


Epoch 101/200: 100%|██████████| 390/390 [01:16<00:00,  5.09it/s, loss=1.326]


Epoch 101/200 | Train Loss: 1.3044 | Val Loss: 1.6009 | Val Acc: 66.28% | LR: 0.000512


Epoch 102/200: 100%|██████████| 390/390 [01:17<00:00,  5.06it/s, loss=1.013]


Epoch 102/200 | Train Loss: 1.2831 | Val Loss: 1.4324 | Val Acc: 67.56% | LR: 0.000504


Epoch 103/200: 100%|██████████| 390/390 [01:17<00:00,  5.02it/s, loss=1.288]


Epoch 103/200 | Train Loss: 1.2579 | Val Loss: 1.5428 | Val Acc: 66.36% | LR: 0.000496


Epoch 104/200: 100%|██████████| 390/390 [01:17<00:00,  5.03it/s, loss=1.296]


Epoch 104/200 | Train Loss: 1.2734 | Val Loss: 1.5353 | Val Acc: 66.51% | LR: 0.000488


Epoch 105/200: 100%|██████████| 390/390 [01:17<00:00,  5.03it/s, loss=1.374]


Epoch 105/200 | Train Loss: 1.2840 | Val Loss: 1.4734 | Val Acc: 66.59% | LR: 0.000480


Epoch 106/200: 100%|██████████| 390/390 [01:16<00:00,  5.10it/s, loss=1.356]


Epoch 106/200 | Train Loss: 1.2770 | Val Loss: 1.4403 | Val Acc: 67.70% | LR: 0.000472


Epoch 107/200: 100%|██████████| 390/390 [01:16<00:00,  5.08it/s, loss=1.189]


Epoch 107/200 | Train Loss: 1.2819 | Val Loss: 1.3885 | Val Acc: 67.84% | LR: 0.000464


Epoch 108/200: 100%|██████████| 390/390 [01:16<00:00,  5.09it/s, loss=1.316]


Epoch 108/200 | Train Loss: 1.2968 | Val Loss: 1.4829 | Val Acc: 68.29% | LR: 0.000456
  New best model saved! Accuracy: 68.29%


Epoch 109/200: 100%|██████████| 390/390 [01:17<00:00,  5.06it/s, loss=0.978]


Epoch 109/200 | Train Loss: 1.2868 | Val Loss: 1.3992 | Val Acc: 67.58% | LR: 0.000448


Epoch 110/200: 100%|██████████| 390/390 [01:16<00:00,  5.09it/s, loss=1.068]


Epoch 110/200 | Train Loss: 1.2743 | Val Loss: 1.4583 | Val Acc: 67.31% | LR: 0.000440


Epoch 111/200: 100%|██████████| 390/390 [01:17<00:00,  5.04it/s, loss=1.286]


Epoch 111/200 | Train Loss: 1.2616 | Val Loss: 1.4607 | Val Acc: 68.39% | LR: 0.000432
  New best model saved! Accuracy: 68.39%


Epoch 112/200: 100%|██████████| 390/390 [01:16<00:00,  5.07it/s, loss=1.328]


Epoch 112/200 | Train Loss: 1.2809 | Val Loss: 1.6019 | Val Acc: 67.00% | LR: 0.000424


Epoch 113/200: 100%|██████████| 390/390 [01:17<00:00,  5.05it/s, loss=1.338]


Epoch 113/200 | Train Loss: 1.2760 | Val Loss: 1.4498 | Val Acc: 68.04% | LR: 0.000416


Epoch 114/200: 100%|██████████| 390/390 [01:16<00:00,  5.10it/s, loss=1.133]


Epoch 114/200 | Train Loss: 1.2813 | Val Loss: 1.3878 | Val Acc: 68.87% | LR: 0.000408
  New best model saved! Accuracy: 68.87%


Epoch 115/200: 100%|██████████| 390/390 [01:16<00:00,  5.10it/s, loss=1.253]


Epoch 115/200 | Train Loss: 1.2900 | Val Loss: 1.3940 | Val Acc: 68.28% | LR: 0.000400


Epoch 116/200: 100%|██████████| 390/390 [01:16<00:00,  5.10it/s, loss=0.897]


Epoch 116/200 | Train Loss: 1.2569 | Val Loss: 1.4319 | Val Acc: 67.49% | LR: 0.000392


Epoch 117/200: 100%|██████████| 390/390 [01:16<00:00,  5.08it/s, loss=1.236]


Epoch 117/200 | Train Loss: 1.2368 | Val Loss: 1.3847 | Val Acc: 68.88% | LR: 0.000384
  New best model saved! Accuracy: 68.88%


Epoch 118/200: 100%|██████████| 390/390 [01:17<00:00,  5.06it/s, loss=1.111]


Epoch 118/200 | Train Loss: 1.2991 | Val Loss: 1.4095 | Val Acc: 69.05% | LR: 0.000376
  New best model saved! Accuracy: 69.05%


Epoch 119/200: 100%|██████████| 390/390 [01:16<00:00,  5.08it/s, loss=1.209]


Epoch 119/200 | Train Loss: 1.2391 | Val Loss: 1.3459 | Val Acc: 69.51% | LR: 0.000369
  New best model saved! Accuracy: 69.51%


Epoch 120/200: 100%|██████████| 390/390 [01:16<00:00,  5.09it/s, loss=1.425]


Epoch 120/200 | Train Loss: 1.2396 | Val Loss: 1.4119 | Val Acc: 68.81% | LR: 0.000361
  Checkpoint saved: outputs\checkpoints\distilled_student_enhanced_epoch120.pth
  Cleaned up: distilled_student_enhanced_epoch120.pth


Epoch 121/200: 100%|██████████| 390/390 [01:16<00:00,  5.08it/s, loss=0.850]


Epoch 121/200 | Train Loss: 1.2601 | Val Loss: 1.4132 | Val Acc: 69.05% | LR: 0.000353


Epoch 122/200: 100%|██████████| 390/390 [01:17<00:00,  5.06it/s, loss=1.361]


Epoch 122/200 | Train Loss: 1.2638 | Val Loss: 1.3959 | Val Acc: 69.07% | LR: 0.000345


Epoch 123/200: 100%|██████████| 390/390 [01:13<00:00,  5.31it/s, loss=1.243]


Epoch 123/200 | Train Loss: 1.2580 | Val Loss: 1.4561 | Val Acc: 69.03% | LR: 0.000338


Epoch 124/200: 100%|██████████| 390/390 [01:12<00:00,  5.39it/s, loss=1.173]


Epoch 124/200 | Train Loss: 1.2790 | Val Loss: 1.3839 | Val Acc: 69.47% | LR: 0.000330


Epoch 125/200: 100%|██████████| 390/390 [01:11<00:00,  5.42it/s, loss=1.233]


Epoch 125/200 | Train Loss: 1.2356 | Val Loss: 1.3663 | Val Acc: 69.13% | LR: 0.000323


Epoch 126/200: 100%|██████████| 390/390 [01:12<00:00,  5.38it/s, loss=1.279]


Epoch 126/200 | Train Loss: 1.2410 | Val Loss: 1.3588 | Val Acc: 70.85% | LR: 0.000315
  New best model saved! Accuracy: 70.85%


Epoch 127/200: 100%|██████████| 390/390 [01:12<00:00,  5.39it/s, loss=0.872]


Epoch 127/200 | Train Loss: 1.2281 | Val Loss: 1.3458 | Val Acc: 69.90% | LR: 0.000308


Epoch 128/200: 100%|██████████| 390/390 [01:12<00:00,  5.40it/s, loss=1.230]


Epoch 128/200 | Train Loss: 1.2316 | Val Loss: 1.3556 | Val Acc: 70.11% | LR: 0.000300


Epoch 129/200: 100%|██████████| 390/390 [01:11<00:00,  5.42it/s, loss=1.236]


Epoch 129/200 | Train Loss: 1.2391 | Val Loss: 1.3266 | Val Acc: 70.40% | LR: 0.000293


Epoch 130/200: 100%|██████████| 390/390 [01:12<00:00,  5.39it/s, loss=1.133]


Epoch 130/200 | Train Loss: 1.2367 | Val Loss: 1.3405 | Val Acc: 70.23% | LR: 0.000286


Epoch 131/200: 100%|██████████| 390/390 [01:11<00:00,  5.44it/s, loss=1.156]


Epoch 131/200 | Train Loss: 1.2224 | Val Loss: 1.3876 | Val Acc: 69.58% | LR: 0.000278


Epoch 132/200: 100%|██████████| 390/390 [01:11<00:00,  5.42it/s, loss=1.087]


Epoch 132/200 | Train Loss: 1.2219 | Val Loss: 1.3356 | Val Acc: 70.48% | LR: 0.000271


Epoch 133/200: 100%|██████████| 390/390 [01:12<00:00,  5.40it/s, loss=2.144]


Epoch 133/200 | Train Loss: 1.2213 | Val Loss: 1.3490 | Val Acc: 70.74% | LR: 0.000264


Epoch 134/200: 100%|██████████| 390/390 [01:12<00:00,  5.40it/s, loss=1.326]


Epoch 134/200 | Train Loss: 1.2150 | Val Loss: 1.3621 | Val Acc: 70.19% | LR: 0.000257


Epoch 135/200: 100%|██████████| 390/390 [01:12<00:00,  5.39it/s, loss=1.249]


Epoch 135/200 | Train Loss: 1.2208 | Val Loss: 1.2788 | Val Acc: 70.67% | LR: 0.000250


Epoch 136/200: 100%|██████████| 390/390 [01:12<00:00,  5.40it/s, loss=0.817]


Epoch 136/200 | Train Loss: 1.2293 | Val Loss: 1.3756 | Val Acc: 69.92% | LR: 0.000243


Epoch 137/200: 100%|██████████| 390/390 [01:12<00:00,  5.40it/s, loss=1.181]


Epoch 137/200 | Train Loss: 1.2379 | Val Loss: 1.4454 | Val Acc: 70.18% | LR: 0.000236


Epoch 138/200: 100%|██████████| 390/390 [01:12<00:00,  5.38it/s, loss=1.300]


Epoch 138/200 | Train Loss: 1.2161 | Val Loss: 1.2855 | Val Acc: 71.31% | LR: 0.000229
  New best model saved! Accuracy: 71.31%


Epoch 139/200: 100%|██████████| 390/390 [01:12<00:00,  5.39it/s, loss=1.321]


Epoch 139/200 | Train Loss: 1.1960 | Val Loss: 1.3413 | Val Acc: 70.06% | LR: 0.000223


Epoch 140/200: 100%|██████████| 390/390 [01:12<00:00,  5.38it/s, loss=1.195]


Epoch 140/200 | Train Loss: 1.2089 | Val Loss: 1.2911 | Val Acc: 70.88% | LR: 0.000216
  Checkpoint saved: outputs\checkpoints\distilled_student_enhanced_epoch140.pth
  Cleaned up: distilled_student_enhanced_epoch140.pth


Epoch 141/200: 100%|██████████| 390/390 [01:12<00:00,  5.41it/s, loss=1.131]


Epoch 141/200 | Train Loss: 1.2315 | Val Loss: 1.2714 | Val Acc: 71.31% | LR: 0.000209


Epoch 142/200: 100%|██████████| 390/390 [01:12<00:00,  5.40it/s, loss=1.474]


Epoch 142/200 | Train Loss: 1.2278 | Val Loss: 1.4168 | Val Acc: 70.06% | LR: 0.000203


Epoch 143/200: 100%|██████████| 390/390 [01:12<00:00,  5.35it/s, loss=1.023]


Epoch 143/200 | Train Loss: 1.2194 | Val Loss: 1.3296 | Val Acc: 71.33% | LR: 0.000196
  New best model saved! Accuracy: 71.33%


Epoch 144/200: 100%|██████████| 390/390 [01:12<00:00,  5.38it/s, loss=1.259]


Epoch 144/200 | Train Loss: 1.2233 | Val Loss: 1.2924 | Val Acc: 71.47% | LR: 0.000190
  New best model saved! Accuracy: 71.47%


Epoch 145/200: 100%|██████████| 390/390 [01:12<00:00,  5.37it/s, loss=1.189]


Epoch 145/200 | Train Loss: 1.1957 | Val Loss: 1.3000 | Val Acc: 71.52% | LR: 0.000184
  New best model saved! Accuracy: 71.52%


Epoch 146/200: 100%|██████████| 390/390 [01:12<00:00,  5.37it/s, loss=1.023]


Epoch 146/200 | Train Loss: 1.2092 | Val Loss: 1.3184 | Val Acc: 71.26% | LR: 0.000178


Epoch 147/200: 100%|██████████| 390/390 [01:12<00:00,  5.41it/s, loss=0.812]


Epoch 147/200 | Train Loss: 1.2115 | Val Loss: 1.3033 | Val Acc: 71.64% | LR: 0.000171
  New best model saved! Accuracy: 71.64%


Epoch 148/200: 100%|██████████| 390/390 [01:12<00:00,  5.38it/s, loss=0.842]


Epoch 148/200 | Train Loss: 1.1836 | Val Loss: 1.2378 | Val Acc: 71.95% | LR: 0.000165
  New best model saved! Accuracy: 71.95%


Epoch 149/200: 100%|██████████| 390/390 [01:12<00:00,  5.38it/s, loss=1.073]


Epoch 149/200 | Train Loss: 1.1968 | Val Loss: 1.2906 | Val Acc: 72.08% | LR: 0.000159
  New best model saved! Accuracy: 72.08%


Epoch 150/200: 100%|██████████| 390/390 [01:11<00:00,  5.42it/s, loss=1.221]


Epoch 150/200 | Train Loss: 1.2001 | Val Loss: 1.2990 | Val Acc: 72.14% | LR: 0.000154
  New best model saved! Accuracy: 72.14%


Epoch 151/200: 100%|██████████| 390/390 [01:12<00:00,  5.38it/s, loss=1.326]


Epoch 151/200 | Train Loss: 1.2121 | Val Loss: 1.3113 | Val Acc: 72.19% | LR: 0.000148
  New best model saved! Accuracy: 72.19%


Epoch 152/200: 100%|██████████| 390/390 [01:12<00:00,  5.40it/s, loss=1.133]


Epoch 152/200 | Train Loss: 1.1840 | Val Loss: 1.2900 | Val Acc: 72.04% | LR: 0.000142


Epoch 153/200: 100%|██████████| 390/390 [01:12<00:00,  5.36it/s, loss=0.769]


Epoch 153/200 | Train Loss: 1.2066 | Val Loss: 1.3280 | Val Acc: 71.57% | LR: 0.000137


Epoch 154/200: 100%|██████████| 390/390 [01:12<00:00,  5.39it/s, loss=1.301]


Epoch 154/200 | Train Loss: 1.2049 | Val Loss: 1.3601 | Val Acc: 71.68% | LR: 0.000131


Epoch 155/200: 100%|██████████| 390/390 [01:12<00:00,  5.41it/s, loss=0.818]


Epoch 155/200 | Train Loss: 1.2052 | Val Loss: 1.3186 | Val Acc: 72.06% | LR: 0.000126


Epoch 156/200: 100%|██████████| 390/390 [01:12<00:00,  5.37it/s, loss=0.782]


Epoch 156/200 | Train Loss: 1.1921 | Val Loss: 1.2497 | Val Acc: 72.36% | LR: 0.000120
  New best model saved! Accuracy: 72.36%


Epoch 157/200: 100%|██████████| 390/390 [01:12<00:00,  5.35it/s, loss=1.308]


Epoch 157/200 | Train Loss: 1.1962 | Val Loss: 1.2647 | Val Acc: 72.29% | LR: 0.000115


Epoch 158/200: 100%|██████████| 390/390 [01:12<00:00,  5.35it/s, loss=1.253]


Epoch 158/200 | Train Loss: 1.1687 | Val Loss: 1.2939 | Val Acc: 72.12% | LR: 0.000110


Epoch 159/200: 100%|██████████| 390/390 [01:12<00:00,  5.35it/s, loss=1.221]


Epoch 159/200 | Train Loss: 1.1927 | Val Loss: 1.2381 | Val Acc: 73.20% | LR: 0.000105
  New best model saved! Accuracy: 73.20%


Epoch 160/200: 100%|██████████| 390/390 [01:12<00:00,  5.38it/s, loss=1.095]


Epoch 160/200 | Train Loss: 1.1694 | Val Loss: 1.2921 | Val Acc: 72.58% | LR: 0.000100
  Checkpoint saved: outputs\checkpoints\distilled_student_enhanced_epoch160.pth
  Cleaned up: distilled_student_enhanced_epoch160.pth


Epoch 161/200: 100%|██████████| 390/390 [01:12<00:00,  5.37it/s, loss=0.788]


Epoch 161/200 | Train Loss: 1.2319 | Val Loss: 1.1945 | Val Acc: 73.57% | LR: 0.000095
  New best model saved! Accuracy: 73.57%


Epoch 162/200: 100%|██████████| 390/390 [01:12<00:00,  5.38it/s, loss=1.035]


Epoch 162/200 | Train Loss: 1.1955 | Val Loss: 1.2350 | Val Acc: 73.02% | LR: 0.000091


Epoch 163/200: 100%|██████████| 390/390 [01:12<00:00,  5.36it/s, loss=0.963]


Epoch 163/200 | Train Loss: 1.1979 | Val Loss: 1.1749 | Val Acc: 73.92% | LR: 0.000086
  New best model saved! Accuracy: 73.92%


Epoch 164/200: 100%|██████████| 390/390 [01:12<00:00,  5.38it/s, loss=3.321]


Epoch 164/200 | Train Loss: 1.1795 | Val Loss: 1.2583 | Val Acc: 73.54% | LR: 0.000082


Epoch 165/200: 100%|██████████| 390/390 [01:12<00:00,  5.38it/s, loss=1.520]


Epoch 165/200 | Train Loss: 1.1762 | Val Loss: 1.2637 | Val Acc: 73.09% | LR: 0.000077


Epoch 166/200: 100%|██████████| 390/390 [01:12<00:00,  5.37it/s, loss=0.859]


Epoch 166/200 | Train Loss: 1.1717 | Val Loss: 1.2463 | Val Acc: 72.96% | LR: 0.000073


Epoch 167/200: 100%|██████████| 390/390 [01:12<00:00,  5.38it/s, loss=0.735]


Epoch 167/200 | Train Loss: 1.1695 | Val Loss: 1.2273 | Val Acc: 73.44% | LR: 0.000069


Epoch 168/200: 100%|██████████| 390/390 [01:12<00:00,  5.41it/s, loss=0.923]


Epoch 168/200 | Train Loss: 1.1550 | Val Loss: 1.2236 | Val Acc: 72.67% | LR: 0.000065


Epoch 169/200: 100%|██████████| 390/390 [01:12<00:00,  5.38it/s, loss=1.036]


Epoch 169/200 | Train Loss: 1.1692 | Val Loss: 1.2222 | Val Acc: 73.79% | LR: 0.000061


Epoch 170/200: 100%|██████████| 390/390 [01:11<00:00,  5.43it/s, loss=1.234]


Epoch 170/200 | Train Loss: 1.1635 | Val Loss: 1.2222 | Val Acc: 73.39% | LR: 0.000057


Epoch 171/200: 100%|██████████| 390/390 [01:12<00:00,  5.38it/s, loss=1.125]


Epoch 171/200 | Train Loss: 1.1433 | Val Loss: 1.2489 | Val Acc: 73.32% | LR: 0.000054


Epoch 172/200: 100%|██████████| 390/390 [01:12<00:00,  5.37it/s, loss=1.017]


Epoch 172/200 | Train Loss: 1.1782 | Val Loss: 1.2264 | Val Acc: 73.48% | LR: 0.000050


Epoch 173/200: 100%|██████████| 390/390 [01:12<00:00,  5.35it/s, loss=0.869]


Epoch 173/200 | Train Loss: 1.1687 | Val Loss: 1.2286 | Val Acc: 73.06% | LR: 0.000047


Epoch 174/200: 100%|██████████| 390/390 [01:12<00:00,  5.37it/s, loss=0.785]


Epoch 174/200 | Train Loss: 1.1494 | Val Loss: 1.2239 | Val Acc: 73.33% | LR: 0.000043


Epoch 175/200: 100%|██████████| 390/390 [01:12<00:00,  5.35it/s, loss=1.293]


Epoch 175/200 | Train Loss: 1.1525 | Val Loss: 1.2461 | Val Acc: 73.80% | LR: 0.000040


Epoch 176/200: 100%|██████████| 390/390 [01:12<00:00,  5.39it/s, loss=1.279]


Epoch 176/200 | Train Loss: 1.1265 | Val Loss: 1.2117 | Val Acc: 73.76% | LR: 0.000037


Epoch 177/200: 100%|██████████| 390/390 [01:12<00:00,  5.39it/s, loss=1.282]


Epoch 177/200 | Train Loss: 1.1618 | Val Loss: 1.2655 | Val Acc: 73.13% | LR: 0.000034


Epoch 178/200: 100%|██████████| 390/390 [01:12<00:00,  5.41it/s, loss=0.903]


Epoch 178/200 | Train Loss: 1.1368 | Val Loss: 1.2643 | Val Acc: 72.74% | LR: 0.000031


Epoch 179/200: 100%|██████████| 390/390 [01:12<00:00,  5.40it/s, loss=1.262]


Epoch 179/200 | Train Loss: 1.1657 | Val Loss: 1.2508 | Val Acc: 73.49% | LR: 0.000028


Epoch 180/200: 100%|██████████| 390/390 [01:12<00:00,  5.39it/s, loss=0.809]


Epoch 180/200 | Train Loss: 1.1363 | Val Loss: 1.2601 | Val Acc: 73.10% | LR: 0.000026
  Checkpoint saved: outputs\checkpoints\distilled_student_enhanced_epoch180.pth
  Cleaned up: distilled_student_enhanced_epoch180.pth


Epoch 181/200: 100%|██████████| 390/390 [01:11<00:00,  5.43it/s, loss=1.264]


Epoch 181/200 | Train Loss: 1.1436 | Val Loss: 1.2535 | Val Acc: 73.39% | LR: 0.000023


Epoch 182/200: 100%|██████████| 390/390 [01:11<00:00,  5.43it/s, loss=1.270]


Epoch 182/200 | Train Loss: 1.1850 | Val Loss: 1.2568 | Val Acc: 73.44% | LR: 0.000021


Epoch 183/200: 100%|██████████| 390/390 [01:12<00:00,  5.38it/s, loss=0.840]


Epoch 183/200 | Train Loss: 1.1235 | Val Loss: 1.2287 | Val Acc: 73.83% | LR: 0.000019


Epoch 184/200: 100%|██████████| 390/390 [01:12<00:00,  5.40it/s, loss=1.052]


Epoch 184/200 | Train Loss: 1.1591 | Val Loss: 1.2637 | Val Acc: 73.08% | LR: 0.000017


Epoch 185/200: 100%|██████████| 390/390 [01:12<00:00,  5.38it/s, loss=1.464]


Epoch 185/200 | Train Loss: 1.1719 | Val Loss: 1.2545 | Val Acc: 73.45% | LR: 0.000015


Epoch 186/200: 100%|██████████| 390/390 [01:12<00:00,  5.39it/s, loss=0.984]


Epoch 186/200 | Train Loss: 1.1526 | Val Loss: 1.1838 | Val Acc: 74.17% | LR: 0.000013
  New best model saved! Accuracy: 74.17%


Epoch 187/200: 100%|██████████| 390/390 [01:13<00:00,  5.34it/s, loss=1.175]


Epoch 187/200 | Train Loss: 1.1460 | Val Loss: 1.1888 | Val Acc: 74.11% | LR: 0.000011


Epoch 188/200: 100%|██████████| 390/390 [01:13<00:00,  5.33it/s, loss=1.595]


Epoch 188/200 | Train Loss: 1.1225 | Val Loss: 1.2074 | Val Acc: 74.21% | LR: 0.000009
  New best model saved! Accuracy: 74.21%


Epoch 189/200: 100%|██████████| 390/390 [01:12<00:00,  5.35it/s, loss=0.843]


Epoch 189/200 | Train Loss: 1.1574 | Val Loss: 1.2211 | Val Acc: 73.98% | LR: 0.000008


Epoch 190/200: 100%|██████████| 390/390 [01:12<00:00,  5.38it/s, loss=0.798]


Epoch 190/200 | Train Loss: 1.1214 | Val Loss: 1.2307 | Val Acc: 73.91% | LR: 0.000006


Epoch 191/200: 100%|██████████| 390/390 [01:12<00:00,  5.38it/s, loss=1.138]


Epoch 191/200 | Train Loss: 1.1312 | Val Loss: 1.2362 | Val Acc: 73.94% | LR: 0.000005


Epoch 192/200: 100%|██████████| 390/390 [01:12<00:00,  5.40it/s, loss=0.930]


Epoch 192/200 | Train Loss: 1.1341 | Val Loss: 1.2208 | Val Acc: 73.71% | LR: 0.000004


Epoch 193/200: 100%|██████████| 390/390 [01:12<00:00,  5.40it/s, loss=1.097]


Epoch 193/200 | Train Loss: 1.1467 | Val Loss: 1.2650 | Val Acc: 73.07% | LR: 0.000003


Epoch 194/200: 100%|██████████| 390/390 [01:12<00:00,  5.40it/s, loss=1.257]


Epoch 194/200 | Train Loss: 1.1641 | Val Loss: 1.2476 | Val Acc: 73.52% | LR: 0.000002


Epoch 195/200: 100%|██████████| 390/390 [01:12<00:00,  5.41it/s, loss=1.579]


Epoch 195/200 | Train Loss: 1.1480 | Val Loss: 1.2531 | Val Acc: 73.51% | LR: 0.000002


Epoch 196/200: 100%|██████████| 390/390 [01:12<00:00,  5.41it/s, loss=0.854]


Epoch 196/200 | Train Loss: 1.1602 | Val Loss: 1.2091 | Val Acc: 73.93% | LR: 0.000001


Epoch 197/200: 100%|██████████| 390/390 [01:12<00:00,  5.37it/s, loss=1.186]


Epoch 197/200 | Train Loss: 1.1466 | Val Loss: 1.2219 | Val Acc: 73.62% | LR: 0.000001


Epoch 198/200: 100%|██████████| 390/390 [01:11<00:00,  5.43it/s, loss=1.089]


Epoch 198/200 | Train Loss: 1.1348 | Val Loss: 1.2664 | Val Acc: 73.54% | LR: 0.000000


Epoch 199/200: 100%|██████████| 390/390 [01:12<00:00,  5.40it/s, loss=0.923]


Epoch 199/200 | Train Loss: 1.1275 | Val Loss: 1.1881 | Val Acc: 74.16% | LR: 0.000000


Epoch 200/200: 100%|██████████| 390/390 [01:12<00:00,  5.40it/s, loss=1.131]


Epoch 200/200 | Train Loss: 1.1472 | Val Loss: 1.2012 | Val Acc: 74.03% | LR: 0.000000
  Checkpoint saved: outputs\checkpoints\distilled_student_enhanced_epoch200.pth
  Cleaned up: distilled_student_enhanced_epoch200.pth

Training complete. Best accuracy: 74.21%

Enhanced Student Final Accuracy: 74.21%


In [9]:
# Cell 9: Results Summary
print("\n" + "="*80)
print("FINAL RESULTS SUMMARY")
print("="*80)

print("\n| Model                      | Accuracy (%) |")
print("|----------------------------|--------------|")
try:
    print(f"| Teacher (EfficientNet-L)   | {teacher_accuracy:12.2f} |")
except NameError:
    print(f"| Teacher (EfficientNet-L)   | {'N/A':>12} |")

try:
    print(f"| Enhanced Student (v2)      | {distilled_accuracy:12.2f} |")
except NameError:
    print(f"| Enhanced Student (v2)      | {'N/A':>12} |")

try:
    improvement = distilled_accuracy - teacher_accuracy
    print(f"\n{'='*80}")
    if improvement > 0:
        print(f"Student SURPASSED Teacher by: {improvement:+.2f}%")
    else:
        print(f"Gap from Teacher: {improvement:.2f}%")
    print(f"{'='*80}")
except NameError:
    pass

print(f"\nEnhancements applied:")
print(f"  - AutoAugment + RandomErasing")
print(f"  - Label Smoothing (0.1)")
print(f"  - LR Warmup (5 epochs)")
print(f"  - Optimized KD Alpha (0.7)")

print(f"\nAll models saved to: {MODEL_DIR.absolute()}")
print(f"Checkpoints saved to: {CHECKPOINT_DIR.absolute()}")


FINAL RESULTS SUMMARY

| Model                      | Accuracy (%) |
|----------------------------|--------------|
| Teacher (EfficientNet-L)   |        75.76 |
| Enhanced Student (v2)      |        74.21 |

Gap from Teacher: -1.55%

Enhancements applied:
  - AutoAugment + RandomErasing
  - Label Smoothing (0.1)
  - LR Warmup (5 epochs)
  - Optimized KD Alpha (0.7)

All models saved to: d:\Projects\MasterProject\code\outputs\models
Checkpoints saved to: d:\Projects\MasterProject\code\outputs\checkpoints
