# Phase 1 Training - LOCAL DRY RUN TEST

**Purpose:** Validate the entire training pipeline on local machine with CPU and minimal data.

---

## Step 0: Setup Check

Make sure you're in the correct directory and have dependencies installed.

In [1]:
import os
import sys
from pathlib import Path

# Get project root (assuming notebook is in notebooks/ folder)
notebook_dir = Path.cwd()
project_root = notebook_dir.parent if notebook_dir.name == 'notebooks' else notebook_dir

print(f"Current directory: {Path.cwd()}")
print(f"Project root: {project_root}")

# Change to project root
os.chdir(project_root)
print(f"\nChanged to: {os.getcwd()}")

# Verify we're in the right place
assert (Path.cwd() / 'src').exists(), "‚ùå 'src' folder not found! Are you in the project root?"
assert (Path.cwd() / 'requirements.txt').exists(), "‚ùå 'requirements.txt' not found!"

print("\n‚úÖ Directory setup correct!")

Current directory: /Users/cheneyyoon/Desktop/U of T/APS360/Miles/notebooks
Project root: /Users/cheneyyoon/Desktop/U of T/APS360/Miles

Changed to: /Users/cheneyyoon/Desktop/U of T/APS360/Miles

‚úÖ Directory setup correct!


## Step 1: Import All Modules

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Add src to path
sys.path.insert(0, str(Path.cwd() / 'src'))

import torch
import torch.nn as nn
import pandas as pd
import numpy as np

print("Testing imports...")
print(f"PyTorch version: {torch.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

try:
    # Import dataset adapter
    from data.dataset_adapter import (
        prepare_dataset_for_training,
        get_available_scalar_features,
        get_dataset_summary
    )
    print("‚úÖ Dataset adapter imported")
    
    from data.download import download_dataset
    from data.preprocessing import preprocess_dataset
    from data.dataset import create_train_val_test_split, create_data_loaders
    print("‚úÖ Data modules imported")

    from models.baseline import BaselineModel
    from models.fusion_model import MultimodalViralityPredictor
    print("‚úÖ Model modules imported")

    from training.utils import load_config, set_seed, get_device, save_checkpoint
    from training.evaluate import evaluate_model, print_evaluation_report
    from training.train import train_model
    print("‚úÖ Training modules imported")
    
    print("\n‚úÖ ALL IMPORTS SUCCESSFUL!")
    
except Exception as e:
    print(f"\n‚ùå IMPORT ERROR: {e}")
    print("\nTroubleshooting:")
    print("1. Make sure you installed dependencies: pip install -r requirements.txt")
    print("2. Check that all __init__.py files exist in src/ folders")
    print("3. Verify file structure matches implementation plan")
    raise


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.4 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/cheneyyoon/Desktop/U of T/APS360/Miles/venv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/cheneyyoon/Desktop/U of T/APS360/Miles/venv/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/cheneyyoon/Desktop/U of T/APS360/Miles/venv/lib/python3.12/site-packages/ipyk

Testing imports...
PyTorch version: 2.2.2
Pandas version: 2.3.3
NumPy version: 2.3.4
‚úÖ Dataset adapter imported
‚úÖ Data modules imported
‚úÖ Model modules imported
‚úÖ Training modules imported

‚úÖ ALL IMPORTS SUCCESSFUL!


## Step 2: Load Tiny Dataset Sample

In [3]:
# Path to your CSV file (adjust if needed)
csv_path = 'data/raw/youtube_shorts_tiktok_trends_2025.csv'

print(f"Loading TINY dataset from: {csv_path}")
print("(Only 100 rows for dry run)")

try:
    # Check if file exists
    if not Path(csv_path).exists():
        print(f"‚ùå File not found: {csv_path}")
        print(f"Current directory: {os.getcwd()}")
        print(f"Looking for: {Path(csv_path).absolute()}")
        raise FileNotFoundError(f"Dataset not found at {csv_path}")
    
    df_raw = pd.read_csv(csv_path, nrows=100)  # Only 100 rows!
    print(f"‚úÖ Loaded {len(df_raw)} rows")
    print(f"\nShape: {df_raw.shape}")
    print(f"\nFirst 10 columns: {list(df_raw.columns[:10])}")
    
    # Show sample
    print("\nFirst 3 rows:")
    display(df_raw.head(3))
    
except Exception as e:
    print(f"‚ùå FAILED TO LOAD DATA: {e}")
    print("\nMake sure:")
    print("1. CSV file exists at data/raw/youtube_shorts_tiktok_trends_2025.csv")
    print("2. File is not corrupted")
    print("3. You're in the project root directory")
    raise

Loading TINY dataset from: data/raw/youtube_shorts_tiktok_trends_2025.csv
(Only 100 rows for dry run)
‚úÖ Loaded 100 rows

Shape: (100, 58)

First 10 columns: ['platform', 'country', 'region', 'language', 'category', 'hashtag', 'title_keywords', 'author_handle', 'sound_type', 'music_track']

First 3 rows:


Unnamed: 0,platform,country,region,language,category,hashtag,title_keywords,author_handle,sound_type,music_track,...,traffic_source,is_weekend,row_id,engagement_total,like_rate,dislike_rate,engagement_per_1k,engagement_like_rate,engagement_comment_rate,engagement_share_rate
0,TikTok,Jp,Asia,ja,Gaming,#Lifestyle,Night Routine ‚Äî College,NextVision,trending,8bit loop,...,External,1,2e681528d17a1fe1986857942536ec27,30317,0.086159,0.004004,120.069,0.086159,0.012555,0.00783
1,TikTok,Se,Europe,sv,Food,#Sports,Morning Routine ‚Äî College,DailyVlogsDiego,trending,Street vibe,...,Search,0,2e35fa0b2978b9cae635839c1d4e9e74,30577,0.085298,0.002421,113.005,0.085298,0.00785,0.007791
2,TikTok,Za,Africa,en,Art,#Workout,Night Routine ‚Äî College,BeyondHub,licensed,Gallery pad,...,External,1,0d88a011235a82244995ef52961f9502,503,0.049154,0.001625,68.111,0.049154,0.004469,0.005146


## Step 3: Test Data Preprocessing

In [4]:
print("Testing data preparation...")

try:
    # Basic filtering
    print("Filtering data...")
    if 'language' in df_raw.columns:
        df_filtered = df_raw[df_raw['language'] == 'en'].copy()
        print(f"  After language filter: {len(df_filtered)} rows")
    else:
        df_filtered = df_raw.copy()
        print(f"  No language column, using all rows")
    
    # Drop missing values
    critical_cols = ['row_id', 'title', 'views', 'likes']
    before_count = len(df_filtered)
    df_filtered = df_filtered.dropna(subset=critical_cols)
    print(f"  After dropping nulls: {len(df_filtered)} rows (removed {before_count - len(df_filtered)})")
    
    # Prepare with adapter
    print("\nPreparing dataset with adapter...")
    df_prepared = prepare_dataset_for_training(
        df_filtered,
        text_column='title',
        create_viral_labels=True,
        viral_threshold_percentile=80.0
    )
    
    print(f"‚úÖ Dataset prepared: {df_prepared.shape}")
    
    # Get summary
    summary = get_dataset_summary(df_prepared)
    print(f"\nDataset summary:")
    print(f"  Total videos: {summary.get('total_videos', len(df_prepared))}")
    print(f"  Viral rate: {summary.get('viral_percentage', 0):.1f}%")
    
    # Show prepared columns
    print(f"\nPrepared columns ({len(df_prepared.columns)}):")
    print(list(df_prepared.columns))
    
except Exception as e:
    print(f"‚ùå DATA PREPARATION FAILED: {e}")
    import traceback
    traceback.print_exc()
    raise

INFO:data.dataset_adapter:Preparing dataset for training
INFO:data.dataset_adapter:Adapting dataset columns to expected format...
INFO:data.dataset_adapter:Renamed columns: ['row_id', 'publish_date_approx', 'hashtag']
INFO:data.dataset_adapter:‚úÖ All required columns present
INFO:data.dataset_adapter:engagement_velocity statistics:
INFO:data.dataset_adapter:  Mean: 13284.95
INFO:data.dataset_adapter:  Std: 19031.46
INFO:data.dataset_adapter:  Min: 338.14
INFO:data.dataset_adapter:  Max: 80313.67
INFO:data.dataset_adapter:Created viral labels: 4/19 viral (21.1%)
INFO:data.dataset_adapter:Viral threshold (engagement_velocity): 22970.74
INFO:data.dataset_adapter:Found 18 scalar features in dataset
INFO:data.dataset_adapter:Available scalar features (18):
INFO:data.dataset_adapter:  - views
INFO:data.dataset_adapter:  - likes
INFO:data.dataset_adapter:  - comments
INFO:data.dataset_adapter:  - shares
INFO:data.dataset_adapter:  - saves


Testing data preparation...
Filtering data...
  After language filter: 19 rows
  After dropping nulls: 19 rows (removed 0)

Preparing dataset with adapter...


INFO:data.dataset_adapter:  - engagement_rate
INFO:data.dataset_adapter:  - completion_rate
INFO:data.dataset_adapter:  - like_rate
INFO:data.dataset_adapter:  - comment_ratio
INFO:data.dataset_adapter:  - share_rate
INFO:data.dataset_adapter:  - save_rate
INFO:data.dataset_adapter:  - upload_hour
INFO:data.dataset_adapter:  - publish_dayofweek
INFO:data.dataset_adapter:  - is_weekend
INFO:data.dataset_adapter:  - duration_sec
INFO:data.dataset_adapter:  - title_length
INFO:data.dataset_adapter:  - has_emoji
INFO:data.dataset_adapter:  - creator_avg_views
INFO:data.dataset_adapter:Dataset preparation complete!
INFO:data.dataset_adapter:Shape: (19, 59)
INFO:data.dataset_adapter:Text column: title
INFO:data.dataset_adapter:Has viral labels: True
INFO:data.dataset_adapter:Has engagement_velocity: True


‚úÖ Dataset prepared: (19, 59)

Dataset summary:
  Total videos: 19
  Viral rate: 21.1%

Prepared columns (59):
['platform', 'country', 'region', 'language', 'category', 'primary_hashtag', 'title_keywords', 'author_handle', 'sound_type', 'music_track', 'week_of_year', 'duration_sec', 'views', 'likes', 'comments', 'shares', 'saves', 'engagement_rate', 'trend_label', 'source_hint', 'notes', 'device_type', 'upload_hour', 'genre', 'trend_duration_days', 'trend_type', 'engagement_velocity', 'dislikes', 'comment_ratio', 'share_rate', 'save_rate', 'like_dislike_ratio', 'publish_dayofweek', 'publish_period', 'event_season', 'tags', 'sample_comments', 'creator_avg_views', 'creator_tier', 'season', 'upload_date', 'year_month', 'title', 'title_length', 'has_emoji', 'avg_watch_time_sec', 'completion_rate', 'device_brand', 'traffic_source', 'is_weekend', 'video_id', 'engagement_total', 'like_rate', 'dislike_rate', 'engagement_per_1k', 'engagement_like_rate', 'engagement_comment_rate', 'engagement_s

## Step 4: Test Train/Val/Test Split

In [5]:
print("Testing data splits...")

try:
    train_df, val_df, test_df = create_train_val_test_split(
        df_prepared,
        train_ratio=0.7,
        val_ratio=0.15,
        test_ratio=0.15,
        stratify_column='is_viral',
        random_seed=42
    )
    
    print(f"‚úÖ Splits created:")
    print(f"  Train: {len(train_df)} samples ({100*train_df['is_viral'].mean():.1f}% viral)")
    print(f"  Val:   {len(val_df)} samples ({100*val_df['is_viral'].mean():.1f}% viral)")
    print(f"  Test:  {len(test_df)} samples ({100*test_df['is_viral'].mean():.1f}% viral)")
    print(f"\n  Total: {len(train_df) + len(val_df) + len(test_df)} (original: {len(df_prepared)})")
    
except Exception as e:
    print(f"‚ùå SPLIT FAILED: {e}")
    import traceback
    traceback.print_exc()
    raise

Testing data splits...
‚ùå SPLIT FAILED: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.


Traceback (most recent call last):
  File "/var/folders/yg/1222qhpn71sclw0r8zspmz2h0000gn/T/ipykernel_16313/1566988088.py", line 4, in <module>
    train_df, val_df, test_df = create_train_val_test_split(
                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/cheneyyoon/Desktop/U of T/APS360/Miles/src/data/dataset.py", line 305, in create_train_val_test_split
    val_df, test_df = train_test_split(
                      ^^^^^^^^^^^^^^^^^
  File "/Users/cheneyyoon/Desktop/U of T/APS360/Miles/venv/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 218, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/cheneyyoon/Desktop/U of T/APS360/Miles/venv/lib/python3.12/site-packages/sklearn/model_selection/_split.py", line 2940, in train_test_split
    train, test = next(cv.split(X=arrays[0], y=stratify))
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/cheneyyoon/Desktop/U of T/APS360/Miles/

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

## Step 5: Test Model Initialization (CPU)

In [None]:
print("Testing model initialization...")
print("(This will download BERT weights if not cached - may take 1-2 minutes)")
print()

try:
    # Get scalar features
    scalar_features = get_available_scalar_features(df_prepared)
    print(f"Using {len(scalar_features)} scalar features:")
    for feat in scalar_features:
        print(f"  - {feat}")
    
    # Initialize model on CPU
    print("\nInitializing model...")
    model = MultimodalViralityPredictor(
        num_scalar_features=len(scalar_features),
        freeze_encoders=True,
        fusion_hidden_dims=[1024, 256],
        dropout_rates=[0.3, 0.2],
        use_text=True,
        use_vision=False  # No images in this dataset
    )
    
    params = model.count_parameters()
    print(f"\n‚úÖ Model initialized on CPU")
    print(f"  Total params: {params['total']:,}")
    print(f"  Trainable: {params['trainable']:,}")
    print(f"  Frozen: {params['frozen']:,}")
    
except Exception as e:
    print(f"‚ùå MODEL INITIALIZATION FAILED: {e}")
    import traceback
    traceback.print_exc()
    raise

## Step 6: Test DataLoader Creation

In [None]:
print("Testing DataLoader creation...")

try:
    from data.dataset import ViralShortsDataset, collate_multimodal_batch
    from torch.utils.data import DataLoader
    
    # Create small dataset
    print("Creating dataset...")
    train_dataset = ViralShortsDataset(
        train_df,
        text_column='title',
        scalar_columns=scalar_features,
        label_column='is_viral',
        velocity_column='engagement_velocity',
        text_max_length=128,
        use_images=False,
        augment_images=False
    )
    
    print(f"  Dataset size: {len(train_dataset)} samples")
    
    # Create DataLoader with small batch
    print("\nCreating DataLoader...")
    train_loader = DataLoader(
        train_dataset, 
        batch_size=4,  # Small batch for testing
        shuffle=True, 
        num_workers=0,  # No multiprocessing for local testing
        collate_fn=collate_multimodal_batch
    )
    
    print(f"‚úÖ DataLoader created: {len(train_loader)} batches")
    
    # Test one batch
    print("\nLoading test batch...")
    batch = next(iter(train_loader))
    print(f"‚úÖ Batch loaded successfully")
    print(f"  Text shape: {batch['text']['input_ids'].shape}")
    print(f"  Scalars shape: {batch['scalars'].shape}")
    print(f"  Labels shape: {batch['label'].shape}")
    print(f"  Velocity shape: {batch['velocity'].shape}")
    
except Exception as e:
    print(f"‚ùå DATALOADER CREATION FAILED: {e}")
    import traceback
    traceback.print_exc()
    raise

## Step 7: Test Forward Pass

In [None]:
print("Testing forward pass...")

try:
    model.eval()
    
    print("Running inference...")
    with torch.no_grad():
        cls_logits, reg_output = model(
            text_input=batch['text'],
            image_input=None,
            scalar_features=batch['scalars']
        )
    
    print(f"\n‚úÖ Forward pass successful!")
    print(f"  Classification logits: {cls_logits.shape}")
    print(f"  Regression output: {reg_output.shape}")
    
    # Show sample predictions
    probs = torch.softmax(cls_logits, dim=1)
    print(f"\n  Sample predictions (viral probability):")
    for i in range(min(3, len(probs))):
        print(f"    Sample {i+1}: {probs[i, 1].item():.4f}")
    
except Exception as e:
    print(f"‚ùå FORWARD PASS FAILED: {e}")
    import traceback
    traceback.print_exc()
    raise

## Step 8: Test One Training Step

In [None]:
print("Testing one training step (backward pass)...")

try:
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    
    cls_criterion = nn.CrossEntropyLoss()
    reg_criterion = nn.MSELoss()
    
    # One training step
    print("Running training step...")
    optimizer.zero_grad()
    
    cls_logits, reg_output = model(
        text_input=batch['text'],
        image_input=None,
        scalar_features=batch['scalars']
    )
    
    cls_loss = cls_criterion(cls_logits, batch['label'])
    reg_loss = reg_criterion(reg_output.squeeze(), batch['velocity'])
    
    total_loss = 0.7 * cls_loss + 0.3 * reg_loss
    
    print("Running backward pass...")
    total_loss.backward()
    
    print("Updating parameters...")
    optimizer.step()
    
    print(f"\n‚úÖ Training step successful!")
    print(f"  Total loss: {total_loss.item():.4f}")
    print(f"  Classification loss: {cls_loss.item():.4f}")
    print(f"  Regression loss: {reg_loss.item():.4f}")
    
except Exception as e:
    print(f"‚ùå TRAINING STEP FAILED: {e}")
    import traceback
    traceback.print_exc()
    raise

## Step 9: Test Configuration Loading

In [None]:
print("Testing configuration loading...")

try:
    config = load_config('src/configs/training_config.yaml')
    
    print(f"\n‚úÖ Config loaded successfully")
    print(f"\nConfiguration:")
    print(f"  Model name: {config['model']['name']}")
    print(f"  Epochs: {config['training']['epochs']}")
    print(f"  Batch size: {config['data']['batch_size']}")
    print(f"  Learning rate: {config['training']['learning_rate']}")
    print(f"  Device: {config['hardware']['device']}")
    print(f"  Mixed precision: {config['hardware']['mixed_precision']}")
    
    # Validate required keys
    required_keys = ['model', 'data', 'training', 'hardware']
    for key in required_keys:
        assert key in config, f"Missing required key: {key}"
    
    print(f"\n  All required config keys present ‚úì")
    
except Exception as e:
    print(f"‚ùå CONFIG LOADING FAILED: {e}")
    import traceback
    traceback.print_exc()
    raise

## ‚úÖ Final Summary

In [None]:
print("="*70)
print("LOCAL DRY RUN VALIDATION - FINAL SUMMARY")
print("="*70)
print()
print("‚úÖ All tests passed! Your pipeline is ready.")
print()
print("What was tested:")
print("  ‚úì All module imports")
print("  ‚úì Data loading and preprocessing")
print("  ‚úì Train/val/test splitting")
print("  ‚úì Model initialization (BERT + MLP fusion)")
print("  ‚úì DataLoader creation")
print("  ‚úì Forward pass (inference)")
print("  ‚úì Backward pass (training step)")
print("  ‚úì Configuration loading")
print()
print("Environment:")
print(f"  Device: CPU")
print(f"  Test samples: {len(df_prepared)}")
print(f"  Model params: {params['total']:,}")
print()
print("Next steps:")
print("  1. Your code is validated and ready!")
print("  2. Upload to Google Drive at MyDrive/Miles/")
print("  3. Open phase1_training_colab.ipynb in Colab")
print("  4. Select A100 GPU runtime")
print("  5. Run all cells with confidence! üöÄ")
print()
print("="*70)