In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import time
import sys
import os
import shutil
# from torch.cuda.amp import autocast, GradScaler # REMOVED: Old syntax

# --- 1. Device Setup ---
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# --- 2. Fancy Progress Bar ---
class FancyProgress:
    """Single-line terminal progress bar with a green-to-red colour gradient.

    The bar is redrawn in place (carriage return, no newline) on every
    ``update`` call and shows percentage, elapsed time, an ETA estimate and a
    caller-supplied suffix. Call ``finish`` once at the end to reset colours
    and move to the next line.

    Args:
        total: Number of steps that corresponds to 100%.
        bar_length: Width of the bar in character cells.
    """

    def __init__(self, total, bar_length=40):
        self.total = total
        self.bar_length = bar_length
        # Wall-clock reference used for the elapsed/ETA display.
        self.start = time.time()

    def rgb(self, r, g, b):
        """Return the ANSI 24-bit foreground colour escape for ``(r, g, b)``."""
        return f"\033[38;2;{r};{g};{b}m"

    def color_gradient(self, t):
        """Map ``t`` (clamped to [0, 1]) onto a green -> yellow -> red escape.

        The first half ramps red up while green stays at 255; the second half
        keeps red at 255 and ramps green down.
        """
        t = max(0.0, min(1.0, t))
        if t < 0.5:
            r = int(2 * t * 255)
            g = 255
        else:
            r = 255
            g = int((1 - 2 * (t - 0.5)) * 255)
        return self.rgb(r, g, 0)

    @staticmethod
    def _format_duration(seconds):
        """Format a duration in seconds as ``MM:SS`` or ``H:MM:SS``.

        Negative values are clamped to zero. Durations of an hour or more
        gain an hours field instead of silently wrapping past 59:59.
        """
        whole = max(0, int(seconds))
        hours, remainder = divmod(whole, 3600)
        minutes, secs = divmod(remainder, 60)
        if hours:
            return f"{hours}:{minutes:02d}:{secs:02d}"
        return f"{minutes:02d}:{secs:02d}"

    def update(self, step, suffix=""):
        """Redraw the bar for ``step`` completed steps.

        Args:
            step: Number of steps completed so far.
            suffix: Free-form text appended after the timing information.
        """
        elapsed = time.time() - self.start
        # Fraction complete; a non-positive total is treated as 0% progress.
        t = step / self.total if self.total > 0 else 0
        percent = t * 100
        filled = int(self.bar_length * t)
        cells = [
            self.color_gradient(i / self.bar_length) + "‚ñà" if i < filled else "\033[0m‚ñë"
            for i in range(self.bar_length)
        ]
        bar = "".join(cells)
        # Linear extrapolation: total estimated time minus time already spent.
        eta = (elapsed / t - elapsed) if t > 0 else 0
        fmt = self._format_duration
        sys.stdout.write(f"\r{bar}\033[0m {percent:5.1f}% | {fmt(elapsed)}< {fmt(eta)} | {suffix}")
        sys.stdout.flush()

    def finish(self):
        """Reset terminal colours and terminate the progress line."""
        sys.stdout.write("\033[0m\n")

# --- 3. The Model (CNN) ---
class DrowsinessNet(nn.Module):
    """Small three-stage CNN that emits one raw logit per image.

    Each stage is Conv(3x3, padding=1) -> BatchNorm -> ReLU -> MaxPool(2), so
    the spatial size is halved three times (a factor of 8 overall) while the
    channel count grows 3 -> 32 -> 64 -> 128. The classifier flattens the
    result and maps it through a 128-unit hidden layer to a single output.

    No sigmoid is applied: the output is a logit intended for
    ``nn.BCEWithLogitsLoss`` (threshold at 0.0 for a class decision).

    Args:
        img_size: Side length of the square 3-channel input images. Defaults
            to 128, which reproduces the original ``128 * 16 * 16`` classifier
            input and keeps existing checkpoints loadable.

    Raises:
        ValueError: If ``img_size`` is smaller than 8, since three 2x poolings
            would leave no spatial extent.
    """

    def __init__(self, img_size=128):
        super().__init__()
        if img_size < 8:
            raise ValueError(f"img_size must be at least 8, got {img_size}")

        # Three MaxPool2d(2, 2) layers floor-divide the side length by 8.
        pooled = img_size // 8

        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        )
        # Layer order (and therefore state_dict keys) is kept unchanged.
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Dropout(0.5),
            nn.Linear(128 * pooled * pooled, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            # No nn.Sigmoid here: BCEWithLogitsLoss applies it internally.
        )

    def forward(self, x):
        """Return logits of shape ``(batch, 1)`` for a batch of images ``x``."""
        x = self.features(x)
        x = self.classifier(x)
        return x

def _evaluate(model, loader, criterion=None):
    """Score ``model`` on every batch of ``loader`` without tracking gradients.

    Args:
        model: Network returning one logit per sample.
        loader: Iterable of ``(inputs, labels)`` batches.
        criterion: Optional loss function. When given, its per-batch value is
            averaged over the number of batches.

    Returns:
        ``(avg_loss, accuracy_percent)``. ``avg_loss`` is 0.0 when no
        criterion is supplied or the loader is empty; accuracy is 0.0 when no
        samples were seen.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs = inputs.to(device, non_blocking=True)
            # BCE-style targets: float, shaped (batch, 1) to match the logits.
            labels = labels.to(device, non_blocking=True).float().unsqueeze(1)
            outputs = model(inputs)
            if criterion is not None:
                loss_sum += criterion(outputs, labels).item()
            # logit > 0.0 is equivalent to sigmoid(logit) > 0.5
            predicted = (outputs > 0.0).float()
            n_seen += labels.size(0)
            n_correct += (predicted == labels).sum().item()

    n_batches = len(loader)
    avg_loss = loss_sum / n_batches if n_batches else 0.0
    accuracy = 100 * n_correct / n_seen if n_seen else 0.0
    return avg_loss, accuracy


def main():
    """Train, validate, test and save the drowsiness classifier.

    Reads ``data/train``, ``data/val`` and ``data/test`` (ImageFolder layout)
    under the current working directory, trains ``DrowsinessNet`` for a fixed
    number of epochs with mixed precision when running on CUDA, reports
    validation metrics per epoch and test accuracy at the end, then writes the
    weights to ``drowsiness_model_gpu.pth``.

    Exits the process with status 1 if the datasets cannot be loaded.
    """
    # Let cuDNN auto-tune convolution algorithms for the fixed input size.
    torch.backends.cudnn.benchmark = True

    use_cuda = device.type == 'cuda'

    print(f"PyTorch Version: {torch.__version__}")
    if use_cuda:
        print(f"‚úÖ Using GPU: {torch.cuda.get_device_name(0)}")
        print("üöÄ Turbo Mode: Enabled (AMP + Multi-worker)")
    else:
        print("‚ùå Running on CPU. This will be slow.")

    # Data locations
    BASE_DIR = os.getcwd()
    DATA_DIR = os.path.join(BASE_DIR, 'data')
    TRAIN_DIR = os.path.join(DATA_DIR, 'train')
    VAL_DIR = os.path.join(DATA_DIR, 'val')
    TEST_DIR = os.path.join(DATA_DIR, 'test')

    # Best-effort cleanup: Jupyter checkpoint folders would otherwise be
    # picked up by ImageFolder as an extra class directory.
    for root, dirs, _files in os.walk(DATA_DIR):
        if ".ipynb_checkpoints" in dirs:
            shutil.rmtree(os.path.join(root, ".ipynb_checkpoints"), ignore_errors=True)
            # Do not descend into the directory that was just removed.
            dirs.remove(".ipynb_checkpoints")

    IMG_SIZE = 128
    # If you get "CUDA Out of Memory", lower this to 32.
    BATCH_SIZE = 64

    transform_train = transforms.Compose([
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.RandomRotation(15),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
    ])

    transform_val = transforms.Compose([
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
    ])

    print("\nLoading Data (Parallel)...")
    try:
        train_data = datasets.ImageFolder(TRAIN_DIR, transform=transform_train)
        val_data = datasets.ImageFolder(VAL_DIR, transform=transform_val)
        test_data = datasets.ImageFolder(TEST_DIR, transform=transform_val)

        # Worker processes pre-load images; pinned memory only helps (and only
        # avoids a warning) when batches are actually copied to a CUDA device.
        train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True,
                                  num_workers=4, pin_memory=use_cuda, persistent_workers=True)
        val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False,
                                num_workers=4, pin_memory=use_cuda, persistent_workers=True)
        test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False,
                                 num_workers=4, pin_memory=use_cuda)

        print(f"‚úÖ Loaded {len(train_data)} training images.")
    except Exception as e:
        print(f"‚ùå Error loading data: {e}")
        # Non-zero status so callers/shells can detect the failure.
        sys.exit(1)

    # NOTE(review): the single-logit head and BCE loss assume exactly two
    # class folders (labels 0/1) — confirm against the dataset layout.
    model = DrowsinessNet().to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Mixed precision is only enabled on CUDA; when disabled the scaler and
    # autocast context are transparent no-ops, so the loop below is shared.
    scaler = torch.amp.GradScaler('cuda', enabled=use_cuda)

    EPOCHS = 10
    print(f"\nStarting Turbo Training for {EPOCHS} epochs...")

    for epoch in range(EPOCHS):
        model.train()
        correct = 0
        total = 0

        tracker = FancyProgress(total=len(train_loader))
        print(f"\nEpoch {epoch+1}/{EPOCHS}")

        for step, (inputs, labels) in enumerate(train_loader, start=1):
            inputs = inputs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True).float().unsqueeze(1)

            optimizer.zero_grad()

            with torch.amp.autocast(device_type=device.type, enabled=use_cuda):
                outputs = model(inputs)
                loss = criterion(outputs, labels)

            # Scaled backward pass and optimizer step.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # Read the loss once; each .item() forces a device sync.
            batch_loss = loss.item()
            # logit > 0.0 is equivalent to sigmoid(logit) > 0.5
            predicted = (outputs > 0.0).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            acc = 100 * correct / total
            tracker.update(step, suffix=f"Loss: {batch_loss:.4f} | Acc: {acc:.1f}%")

        tracker.finish()

        # Validation
        avg_val_loss, val_acc = _evaluate(model, val_loader, criterion)
        print(f"  ‚îî‚îÄ‚îÄ Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.2f}%")

    # Final Test
    print("\nüîç Final Test...")
    _, test_acc = _evaluate(model, test_loader)
    print(f"Test Acc: {test_acc:.2f}%")

    # Save
    torch.save(model.state_dict(), "drowsiness_model_gpu.pth")
    print("\nModel saved. üèÅ")

# Entry-point guard (mandatory for multiprocessing on Windows): training must
# only start when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

PyTorch Version: 2.5.1+cu121
✅ Using GPU: NVIDIA GeForce RTX 2050
🚀 Turbo Mode: Enabled (AMP + Multi-worker)

Loading Data (Parallel)...
✅ Loaded 50937 training images.

Starting Turbo Training for 10 epochs...

Epoch 1/10
[38;2;0;255;0m‚ñà[38;2;12;255;0m‚ñà[38;2;25;255;0m‚ñà[38;2;38;255;0m‚ñà[38;2;51;255;0m‚ñà[38;2;63;255;0m‚ñà[38;2;76;255;0m‚ñà[38;2;89;255;0m‚ñà[38;2;102;255;0m‚ñà[38;2;114;255;0m‚ñà[38;2;127;255;0m‚ñà[38;2;140;255;0m‚ñà[38;2;153;255;0m‚ñà[38;2;165;255;0m‚ñà[38;2;178;255;0m‚ñà[38;2;191;255;0m‚ñà[38;2;204;255;0m‚ñà[38;2;216;255;0m‚ñà[38;2;229;255;0m‚ñà[38;2;242;255;0m‚ñà[38;2;255;255;0m‚ñà[38;2;255;242;0m‚ñà[38;2;255;229;0m‚ñà[38;2;255;216;0m‚ñà[38;2;255;204;0m‚ñà[38;2;255;191;0m‚ñà[38;2;255;178;0m‚ñà[38;2;255;165;0m‚ñà[38;2;255;153;0m‚ñà[38;2;255;140;0m‚ñà[38;2;255;127;0m‚ñà[38;2;255;114;0m‚ñà[38;2;255;101;0m‚ñà[38;2;255;89;0m‚ñà[38;2;255;76;0m‚ñà[38;2;255;63;0m‚ñà[38;2;255;50;0m‚ñà[38;2;255;38;0m‚ñà[38;2;255;25;0m‚ñà[38;