# 🤟 MANO - Kaggle Training Notebook

Train the Colombian Sign Language gesture classifier using Kaggle's free GPU.

## Prerequisites

1. **Upload preprocessed tensors** to Kaggle as a dataset:
   - Go to Kaggle > Datasets > New Dataset
   - Upload `tensors_landmarks.pth` (or `tensors.pth`)
   - Name the dataset (e.g., "mano-tensors")

2. **Add the dataset** to this notebook:
   - Click "Add data" in the right sidebar
   - Search for your uploaded dataset
   - Dataset will be available at `/kaggle/input/{dataset-name}/`

## Configuration

- Models: MobileNetV2, MobileNetV3-Small
- Learning rates: 1e-3, 5e-4
- Batch size: 32
- Epochs: 30 (with early stopping)


In [None]:
# =============================================================================
# MANO - Colombian Sign Language Translator - Kaggle Training Notebook
# =============================================================================

import os
import torch

# Report the PyTorch build and whether a CUDA device is visible to this session.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if not cuda_ready:
    print("‚ö†Ô∏è No GPU detected. Enable GPU in Settings > Accelerator")
else:
    print(f"GPU: {torch.cuda.get_device_name(0)}")


In [None]:
# =============================================================================
# ‚ö†Ô∏è CONFIGURATION - UPDATE THESE VALUES
# =============================================================================

# Location of the uploaded tensor file.
# Once the dataset is attached, confirm the exact path from the listing below.
TENSOR_PATH = "/kaggle/input/mano-tensors/tensors_landmarks.pth"

# Directory (under Kaggle's working directory) where checkpoints are written
OUTPUT_DIR = "/kaggle/working/models"

# MLflow experiment that all runs are grouped under
EXPERIMENT_NAME = "V3_landmarks"

# =============================================================================

# Walk /kaggle/input and print every file with its size, to help locate the
# tensor file when TENSOR_PATH needs adjusting.
print("Available input files:")
for folder, _subdirs, names in os.walk("/kaggle/input"):
    for name in names:
        full_path = os.path.join(folder, name)
        megabytes = os.path.getsize(full_path) / 1024 / 1024
        print(f"  {full_path} ({megabytes:.1f} MB)")


In [None]:
# Load the preprocessed tensors and unpack the train/val/test splits into the
# module-level names used by the training cells.
print(f"Loading tensors from {TENSOR_PATH}...")

if not os.path.exists(TENSOR_PATH):
    # Fail fast: the later cells depend on the variables assigned below, so
    # continuing would only surface as an unrelated NameError further down.
    raise FileNotFoundError(
        f"File not found: {TENSOR_PATH}. "
        "Please update TENSOR_PATH to match your uploaded dataset path"
    )

# SECURITY NOTE: weights_only=False lets torch.load unpickle arbitrary Python
# objects, which can execute code. Only load tensor files you created yourself.
# map_location="cpu" keeps the data on the host regardless of where it was
# saved; batches are moved to the training device inside the training loop.
data = torch.load(TENSOR_PATH, map_location="cpu", weights_only=False)

# Extract data
train_images, train_labels = data['train_images'], data['train_labels']
val_images, val_labels = data['val_images'], data['val_labels']
test_images, test_labels = data['test_images'], data['test_labels']
classes = data['classes']
num_classes = data['num_classes']

print(f"\n‚úÖ Data loaded successfully!")
print(f"   Train: {train_images.shape} ({len(train_labels)} samples)")
print(f"   Val: {val_images.shape} ({len(val_labels)} samples)")
print(f"   Test: {test_images.shape} ({len(test_labels)} samples)")
print(f"   Classes ({num_classes}): {classes}")


In [None]:
# Data augmentation for training
import torchvision.transforms.v2 as T

class AugmentedTensorDataset(torch.utils.data.Dataset):
    """Index-aligned image/label dataset with optional random augmentation.

    ``images`` and ``labels`` are stored as given and indexed with the same
    position. When ``augment`` is true, each fetched image is passed through a
    random flip / rotation / affine / colour-jitter pipeline; otherwise it is
    returned untouched. (The original author describes the inputs as
    pre-normalized tensors.)
    """

    def __init__(self, images, labels, augment=False):
        self.images = images
        self.labels = labels
        self.augment = augment

        # The pipeline is only built when augmentation is requested.
        if not augment:
            self.transforms = None
        else:
            self.transforms = T.Compose([
                T.RandomHorizontalFlip(p=0.3),
                T.RandomRotation(degrees=15),
                T.RandomAffine(degrees=0, translate=(0.1, 0.1), scale=(0.9, 1.1)),
                T.ColorJitter(brightness=0.15, contrast=0.15, saturation=0.15),
            ])

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(image, label)`` at ``idx``, augmenting the image if enabled."""
        sample, target = self.images[idx], self.labels[idx]

        if not (self.augment and self.transforms):
            return sample, target

        return self.transforms(sample), target


print("‚úÖ Augmentation pipeline ready")


In [None]:
# Model and training utilities
import time
from pathlib import Path
from datetime import datetime

import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from torchvision import models
import mlflow
import mlflow.pytorch

def get_model(model_name: str, num_classes: int, pretrained: bool = True) -> nn.Module:
    """Build a MobileNet backbone whose final linear layer outputs ``num_classes``.

    Args:
        model_name: ``"mobilenet_v2"`` or ``"mobilenet_v3_small"``.
        num_classes: Number of output logits for the replacement head.
        pretrained: When true, start from the torchvision ``DEFAULT`` weights;
            otherwise the backbone is randomly initialised.

    Returns:
        The torchvision model with its last classifier layer swapped for a
        freshly initialised ``nn.Linear``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name == "mobilenet_v2":
        weights = models.MobileNet_V2_Weights.DEFAULT if pretrained else None
        model = models.mobilenet_v2(weights=weights)
        head_index = 1
    elif model_name == "mobilenet_v3_small":
        weights = models.MobileNet_V3_Small_Weights.DEFAULT if pretrained else None
        model = models.mobilenet_v3_small(weights=weights)
        head_index = 3
    else:
        raise ValueError(f"Unknown model: {model_name}")

    # Take the input width from the layer being replaced rather than
    # hard-coding it, so the new head always matches the backbone.
    in_features = model.classifier[head_index].in_features
    model.classifier[head_index] = nn.Linear(in_features, num_classes)
    return model

def train_one_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over ``loader`` and return epoch metrics.

    Args:
        model: Network to train; switched to training mode.
        loader: Iterable of ``(images, labels)`` batches.
        criterion: Loss callable applied to ``(outputs, labels)``. The loss is
            weighted by batch size, which assumes it returns a per-batch mean.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy)`` over all samples seen in the epoch.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight the batch loss by its size so the epoch average is per-sample
        running_loss += loss.item() * images.size(0)
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    if total == 0:
        # Previously this surfaced as a bare ZeroDivisionError
        raise ValueError("train_one_epoch received a loader with no samples")

    return running_loss / total, correct / total

def evaluate(model, loader, criterion, device):
    """Compute mean loss and accuracy of ``model`` over ``loader`` without training.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        loader: Iterable of ``(images, labels)`` batches.
        criterion: Loss callable applied to ``(outputs, labels)``. The loss is
            weighted by batch size, which assumes it returns a per-batch mean.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy)`` over all samples in ``loader``.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    # Gradients are not needed for evaluation
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Weight the batch loss by its size so the average is per-sample
            running_loss += loss.item() * images.size(0)
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

    if total == 0:
        # Previously this surfaced as a bare ZeroDivisionError
        raise ValueError("evaluate received a loader with no samples")

    return running_loss / total, correct / total

# Make sure the checkpoint directory and the MLflow store both exist
MLRUNS_DIR = "/kaggle/working/mlruns"
for directory in (OUTPUT_DIR, MLRUNS_DIR):
    os.makedirs(directory, exist_ok=True)

# Point MLflow at a local file store and select the experiment for all runs
MLFLOW_TRACKING_URI = "file://" + MLRUNS_DIR
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

print(f"Models will be saved to: {OUTPUT_DIR}")
print(f"MLflow tracking URI: {MLFLOW_TRACKING_URI}")


In [None]:
# =============================================================================
# TRAINING CONFIGURATION
# =============================================================================
# Hyperparameter grid and shared training settings
MODELS_TO_TRAIN = ["mobilenet_v2", "mobilenet_v3_small"]
LEARNING_RATES = [1e-3, 5e-4]
BATCH_SIZE = 32
EPOCHS = 30
WEIGHT_DECAY = 1e-4
PATIENCE = 10

# One experiment per (model, learning rate) pair
total_runs = len(MODELS_TO_TRAIN) * len(LEARNING_RATES)

# Print the configuration banner in a single call, one item per line
print(
    "=" * 60,
    "TRAINING CONFIGURATION",
    "=" * 60,
    f"Models: {MODELS_TO_TRAIN}",
    f"Learning rates: {LEARNING_RATES}",
    f"Batch size: {BATCH_SIZE}",
    f"Total experiments: {total_runs}",
    "=" * 60,
    sep="\n",
)

# Prefer the GPU when one is available, otherwise fall back to the CPU
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {DEVICE}")


In [None]:
# =============================================================================
# TRAINING LOOP WITH MLFLOW LOGGING
# =============================================================================
# Grid search over MODELS_TO_TRAIN x LEARNING_RATES. Each combination gets a
# fresh model, its own MLflow run, early stopping on validation loss, and a
# final test-set evaluation. One summary dict per run is appended to
# all_results for the results cell.
all_results = []
run_count = 0

for MODEL_NAME in MODELS_TO_TRAIN:
    for LEARNING_RATE in LEARNING_RATES:
        run_count += 1
        print("\n" + "=" * 70)
        print(f"üöÄ EXPERIMENT {run_count}/{total_runs}")
        print(f"   Model: {MODEL_NAME} | LR: {LEARNING_RATE}")
        print("=" * 70)

        # Create DataLoaders; only the training split is augmented
        train_dataset = AugmentedTensorDataset(train_images, train_labels, augment=True)
        val_dataset = AugmentedTensorDataset(val_images, val_labels, augment=False)
        test_dataset = AugmentedTensorDataset(test_images, test_labels, augment=False)

        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
        test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

        # Create fresh model
        print(f"Initializing {MODEL_NAME} with pretrained weights...")
        model = get_model(MODEL_NAME, num_classes, pretrained=True)
        model = model.to(DEVICE)

        # Count parameters
        total_params = sum(p.numel() for p in model.parameters())
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"Total parameters: {total_params:,}")

        # Optimizer and scheduler
        criterion = nn.CrossEntropyLoss()
        optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
        scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS, eta_min=LEARNING_RATE / 100)

        # Training with MLflow tracking
        run_name = f"{MODEL_NAME}_lr{LEARNING_RATE}_bs{BATCH_SIZE}"
        best_val_loss = float('inf')
        best_val_acc = 0.0
        epochs_without_improvement = 0
        best_checkpoint_path = None
        # In-memory copy of the best weights, restored before the test
        # evaluation so the reported test metrics and the registered model
        # correspond to best_val_loss rather than to the last epoch.
        best_state = None

        with mlflow.start_run(run_name=run_name):
            # Log parameters (matching Colab/train.py format)
            mlflow.log_params({
                "model_name": MODEL_NAME,
                "epochs": EPOCHS,
                "batch_size": BATCH_SIZE,
                "learning_rate": LEARNING_RATE,
                "weight_decay": WEIGHT_DECAY,
                "patience": PATIENCE,
                "num_classes": num_classes,
                "classes": ",".join(classes),
                "optimizer": "AdamW",
                "scheduler": "CosineAnnealingLR",
                "device": str(DEVICE),
                "pretrained": True,
                "augmentation": True,
                "train_samples": len(train_loader.dataset),
                "val_samples": len(val_loader.dataset),
                "test_samples": len(test_loader.dataset),
                "total_params": total_params,
                "trainable_params": trainable_params,
            })

            print("-" * 60)
            for epoch in range(1, EPOCHS + 1):
                start_time = time.time()

                # Read the LR before stepping the scheduler so the logged value
                # is the one this epoch actually trains with (reading it after
                # scheduler.step() reports the next epoch's LR).
                current_lr = scheduler.get_last_lr()[0]

                # Train
                train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, DEVICE)

                # Validate
                val_loss, val_acc = evaluate(model, val_loader, criterion, DEVICE)

                # Update scheduler
                scheduler.step()

                # Log metrics to MLflow
                mlflow.log_metrics({
                    "train_loss": train_loss,
                    "train_acc": train_acc,
                    "val_loss": val_loss,
                    "val_acc": val_acc,
                    "learning_rate": current_lr,
                }, step=epoch)

                # Console logging
                elapsed = time.time() - start_time
                print(
                    f"Epoch {epoch:3d}/{EPOCHS} | "
                    f"Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} | "
                    f"Val Loss: {val_loss:.4f} Acc: {val_acc:.4f} | "
                    f"LR: {current_lr:.6f} | "
                    f"Time: {elapsed:.1f}s"
                )

                # Save best model (based on validation loss)
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    best_val_acc = val_acc
                    epochs_without_improvement = 0

                    # clone() is required: state_dict() tensors share storage
                    # with the live parameters and would change as training
                    # continues.
                    best_state = {
                        name: tensor.detach().clone()
                        for name, tensor in model.state_dict().items()
                    }

                    # Save checkpoint
                    # NOTE(review): the filename embeds val_acc, so successive
                    # improvements with different accuracies leave earlier
                    # files in OUTPUT_DIR and add one MLflow artifact each.
                    filename = f"{MODEL_NAME}_lr{LEARNING_RATE}_bs{BATCH_SIZE}_acc{val_acc:.2f}.pth"
                    filepath = Path(OUTPUT_DIR) / filename

                    checkpoint = {
                        "model_state_dict": model.state_dict(),
                        "optimizer_state_dict": optimizer.state_dict(),
                        "epoch": epoch,
                        "val_loss": val_loss,
                        "val_acc": val_acc,
                        "model_name": MODEL_NAME,
                        "learning_rate": LEARNING_RATE,
                        "batch_size": BATCH_SIZE,
                        "classes": classes,
                        "num_classes": num_classes,
                    }
                    torch.save(checkpoint, filepath)

                    # Log checkpoint to MLflow
                    mlflow.log_artifact(str(filepath), artifact_path="checkpoints")
                    best_checkpoint_path = filepath
                    print(f"  ‚Ü≥ New best! (loss: {val_loss:.4f})")
                else:
                    epochs_without_improvement += 1

                # Early stopping
                if epochs_without_improvement >= PATIENCE:
                    print(f"\nEarly stopping at epoch {epoch} (val_loss not improving)")
                    mlflow.log_param("early_stopped_epoch", epoch)
                    break

            # Restore the best-validation weights; without this the model holds
            # whatever the final (possibly worse) epoch produced.
            if best_state is not None:
                model.load_state_dict(best_state)

            # Final evaluation on test set
            print("\nEvaluating on test set...")
            test_loss, test_acc = evaluate(model, test_loader, criterion, DEVICE)
            print(f"Test Loss: {test_loss:.4f} | Test Accuracy: {test_acc:.4f}")

            # Log final metrics
            mlflow.log_metrics({
                "best_val_loss": best_val_loss,
                "best_val_acc": best_val_acc,
                "test_loss": test_loss,
                "test_acc": test_acc,
            })

            # Register model in MLflow (only if at least one checkpoint was saved)
            if best_checkpoint_path:
                mlflow.pytorch.log_model(
                    model,
                    artifact_path="model",
                    registered_model_name=f"lsc_{MODEL_NAME}",
                )

            run_id = mlflow.active_run().info.run_id

        # Store results
        all_results.append({
            "model": MODEL_NAME,
            "lr": LEARNING_RATE,
            "best_val_loss": best_val_loss,
            "best_val_acc": best_val_acc,
            "test_acc": test_acc,
            "params": total_params,
            "checkpoint": str(best_checkpoint_path),
            "run_id": run_id,
        })

        print(f"\n‚úÖ Complete! Val Acc: {best_val_acc:.4f}, Test Acc: {test_acc:.4f}")
        print(f"   MLflow run ID: {run_id}")

        # Clear GPU memory before the next experiment
        del model, optimizer, scheduler, train_loader, val_loader, test_loader, best_state
        torch.cuda.empty_cache()

print("\n" + "=" * 70)
print("üèÅ TRAINING COMPLETE!")
print("=" * 70)


In [None]:
# =============================================================================
# RESULTS SUMMARY
# =============================================================================
import pandas as pd

print("=" * 70)
print("üìä TRAINING RESULTS")
print("=" * 70)

# Rank the experiments by their best validation loss, lowest first
results_df = pd.DataFrame(all_results).sort_values("best_val_loss", ascending=True)

# Build a string-formatted copy for printing; results_df keeps numeric values
display_df = results_df.copy()
display_df['lr'] = display_df['lr'].map("{:.0e}".format)
for metric in ('best_val_loss', 'best_val_acc', 'test_acc'):
    display_df[metric] = display_df[metric].map("{:.4f}".format)
shown_columns = ['model', 'lr', 'best_val_loss', 'best_val_acc', 'test_acc']
print(display_df[shown_columns].to_string(index=False))

# The first row after sorting is the run with the lowest validation loss
best = results_df.iloc[0]
print(
    "\nüèÜ BEST CONFIGURATION:",
    f"   Model: {best['model']}",
    f"   Learning Rate: {best['lr']}",
    f"   Val Loss: {best['best_val_loss']:.4f}",
    f"   Val Accuracy: {best['best_val_acc']:.4f}",
    f"   Test Accuracy: {best['test_acc']:.4f}",
    f"   Checkpoint: {best['checkpoint']}",
    sep="\n",
)


In [None]:
# =============================================================================
# LIST OUTPUT FILES FOR DOWNLOAD
# =============================================================================
import shutil

print("=" * 70)
print("üìÅ OUTPUT FILES")
print("=" * 70)

# List checkpoints (sorted so the listing order is stable between runs)
print(f"\nüì¶ Model Checkpoints ({OUTPUT_DIR}):")
for checkpoint_file in sorted(Path(OUTPUT_DIR).glob("*.pth")):
    size_mb = checkpoint_file.stat().st_size / 1024 / 1024
    print(f"  - {checkpoint_file.name} ({size_mb:.1f} MB)")

# List MLflow runs
print(f"\nüìä MLflow Tracking ({MLRUNS_DIR}):")
mlruns_path = Path(MLRUNS_DIR)
if mlruns_path.exists():
    # Count experiments and runs: every sub-directory except the excluded
    # names counts as an experiment, and every directory inside it as a run.
    experiments = [d for d in mlruns_path.iterdir() if d.is_dir() and d.name not in ['0', '.trash', 'models']]
    # Deliberately NOT named total_runs: that global belongs to the training
    # configuration cell and is printed in the "EXPERIMENT n/total" banner, so
    # overwriting it here would corrupt a later re-run of the training cell.
    mlflow_run_count = sum(1 for exp in experiments for run in exp.iterdir() if run.is_dir())
    print(f"  - {len(experiments)} experiment(s), {mlflow_run_count} run(s)")

    # Create zip for easy download; make_archive returns the archive's path
    mlruns_zip = shutil.make_archive("/kaggle/working/mlruns", 'zip', MLRUNS_DIR)
    zip_size = Path(mlruns_zip).stat().st_size / 1024 / 1024
    print(f"  - mlruns.zip created ({zip_size:.1f} MB)")

print("\n" + "=" * 70)
print("üì• DOWNLOAD INSTRUCTIONS")
print("=" * 70)
print("\n1. Click 'Save Version' (top right)")
print("2. Select 'Quick Save'")
print("3. After save completes, go to 'Output' tab")
print("4. Download:")
print("   - Individual .pth files from models/")
print("   - mlruns.zip (contains all MLflow tracking data)")

print("\nüí° TO USE LOCALLY:")
print("   1. Copy .pth files to: models/")
print("   2. Extract mlruns.zip to: models/mlruns/")
print("   3. Run inference:")
print("      python -m src.cv_model.inference --model models/<filename>.pth")
print("   4. View MLflow UI (use direct path on Windows, no file:// prefix):")
print("      mlflow ui --backend-store-uri models/mlruns")


In [None]:
# =============================================================================
# MLFLOW RUNS SUMMARY
# =============================================================================
print("=" * 70)
print("üìã MLFLOW RUNS SUMMARY")
print("=" * 70)

# Look the experiment up by name and tabulate its runs, if it exists
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if not experiment:
    print(f"No experiment found: {EXPERIMENT_NAME}")
else:
    runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])
    print(f"\nExperiment: {EXPERIMENT_NAME} ({len(runs)} runs)")

    # Columns of interest; only those actually present in the result are shown
    cols = [
        'params.model_name',
        'params.learning_rate',
        'params.batch_size',
        'metrics.best_val_loss',
        'metrics.best_val_acc',
        'metrics.test_acc',
        'status',
    ]
    available_cols = [c for c in cols if c in runs.columns]

    if available_cols:
        display_runs = runs[available_cols].copy()
        # Drop the "params." / "metrics." prefixes for a compact header
        display_runs.columns = [c.rsplit('.', 1)[-1] for c in available_cols]
        if 'metrics.best_val_loss' in available_cols:
            display_runs = display_runs.sort_values('best_val_loss', ascending=True)
        print(display_runs.to_string(index=False))
