In [10]:
import pandas as pd

In [13]:
# Read the metadata CSV from the current working directory. The bare
# `df.head()` on the last line is the cell's displayed result (first rows).
df = pd.read_csv('df_encord_v1.csv')
df.head()

Unnamed: 0,id,video_title,video_path,video_type,event_time_sec,split
0,f28ece5faef3856b5e2fed1f78d6cf8a,anonymized_f28ece5faef3856b5e2fed1f78d6cf8a.mp4,../data/research-nvidia-data/nvidia-1/f28ece5f...,Normal,19.72,train
1,869fe26b504975fefb99701e979c3790,anonymized_869fe26b504975fefb99701e979c3790.mp4,../data/research-nvidia-data/nvidia-1/869fe26b...,Near Collision,19.08,train
2,dd8c7ddd37ac435d190b9b7824a062a5,anonymized_dd8c7ddd37ac435d190b9b7824a062a5.mp4,../data/research-nvidia-data/nvidia-1/dd8c7ddd...,Near Collision,20.08,train
3,83d1e898281daa3af960fef12a819fb8_dup,anonymized_83d1e898281daa3af960fef12a819fb8.mp4,../data/research-nvidia-data/nvidia-1/83d1e898...,Normal,8.33,train
4,a528fcc32cc15c7bf3ccedda6050e618_dup,anonymized_a528fcc32cc15c7bf3ccedda6050e618.mp4,../data/research-nvidia-data/nvidia-1/a528fcc3...,Normal,7.39,train


In [17]:
#!/usr/bin/env python3
"""
Simple debug script to test VideoDataset step by step
"""

import pandas as pd
import torch
from torch.utils.data import DataLoader
import traceback
import os

def test_step_1_basic_import():
    """Test 1: check that the augmentation module imports and builds transforms.

    Returns:
        bool: True when ``nexar_video_aug`` imports and a transform pipeline
        can be constructed for both the 'train' and 'val' modes; False on any
        exception (the message and traceback are printed).
    """
    print("=== TEST 1: Basic Imports ===")
    try:
        from nexar_video_aug import create_video_transforms
        print("✅ nexar_video_aug imported successfully")

        # Only construction is being checked, so the pipelines are discarded.
        for mode in ('train', 'val'):
            create_video_transforms(mode=mode)
        print("✅ Transforms created successfully")
    except Exception as e:
        print(f"❌ Import failed: {e}")
        traceback.print_exc()
        return False
    return True

def test_step_2_metadata():
    """Test 2: load the metadata CSV and confirm the required columns exist.

    Reads ``df_encord_v1.csv`` from the current working directory, prints the
    row count, the column list and the first two rows, then checks for the
    four columns named in ``required_cols``.

    Returns:
        bool: True when the file loads and every required column is present;
        False when a column is missing or loading raises (the message and the
        traceback are printed).
    """
    print("\n=== TEST 2: Load Metadata ===")
    try:
        metadata_df = pd.read_csv("df_encord_v1.csv")
        print(f"✅ Metadata loaded: {len(metadata_df)} rows")
        print(f"Columns: {list(metadata_df.columns)}")
        print("First few rows:")
        print(metadata_df.head(2))

        # Check required columns
        required_cols = ['video_path', 'video_type', 'id', 'split']
        missing_cols = [col for col in required_cols if col not in metadata_df.columns]
        if missing_cols:
            print(f"❌ Missing columns: {missing_cols}")
            return False

        print("✅ All required columns present")
        return True
    except Exception as e:
        print(f"❌ Metadata loading failed: {e}")
        # Emit the traceback like the other steps do, so a parser or encoding
        # error can be located and not merely named.
        traceback.print_exc()
        return False

def test_step_3_dataset_creation():
    """Test 3: build the train/val/test datasets without reading any sample.

    On success the three datasets are stored as a tuple under the module
    global ``test_datasets`` so that the later steps can reuse them.

    Returns:
        bool: True when the datasets are created; False on any exception
        (the message and traceback are printed).
    """
    print("\n=== TEST 3: Dataset Creation ===")
    try:
        from nexar_complete_with_validation import create_datasets_with_manual_split
        from nexar_video_aug import create_video_transforms

        split_kwargs = {
            'metadata_df': pd.read_csv("df_encord_v1.csv"),
            'transform_train': create_video_transforms(mode='train'),
            'transform_val': create_video_transforms(mode='val'),
            'video_path_column': 'video_path',
            'label_column': 'video_type',
            'id_column': 'id',
            'split_column': 'split',
            'sample_strategy': 'center',  # Use simpler strategy first
            'fps': 10,
            'duration': 5,
        }
        train_ds, val_ds, test_ds = create_datasets_with_manual_split(**split_kwargs)

        print("✅ Datasets created:")
        # A missing or empty split is reported as 0 rather than raising.
        for label, dataset in (("Train", train_ds), ("Val", val_ds), ("Test", test_ds)):
            print(f"  {label}: {len(dataset) if dataset else 0}")

        # Store datasets globally for other tests
        globals()['test_datasets'] = (train_ds, val_ds, test_ds)
    except Exception as e:
        print(f"❌ Dataset creation failed: {e}")
        traceback.print_exc()
        return False
    return True

def test_step_4_single_item():
    """Test 4: fetch the first sample of the cached training dataset.

    Expects the ``test_datasets`` module global written by the dataset
    creation step, and reads the 'frames', 'target' and 'id' keys of the
    returned sample.

    Returns:
        bool: True when sample 0 loads and is printed; False when the datasets
        are not cached, the training set is None or empty, or loading raises.
    """
    print("\n=== TEST 4: Single Item Loading ===")
    try:
        shared_state = globals()
        if 'test_datasets' not in shared_state:
            print("❌ Need to run dataset creation first")
            return False

        train_ds, _val_ds, _test_ds = shared_state['test_datasets']

        has_training_data = train_ds is not None and len(train_ds) != 0
        if not has_training_data:
            print("❌ No training data")
            return False

        print("Attempting to load first item...")
        sample = train_ds[0]

        print("✅ First item loaded successfully:")
        print(f"  Frames shape: {sample['frames'].shape}")
        print(f"  Target: {sample['target']}")
        print(f"  ID: {sample['id']}")
    except Exception as e:
        print(f"❌ Single item loading failed: {e}")
        traceback.print_exc()
        return False
    return True

def test_step_5_multiple_items():
    """Test 5: load up to the first three samples of the training dataset.

    Expects the ``test_datasets`` module global written by the dataset
    creation step. Each sample is indexed for its 'frames' and 'target' keys.

    Returns:
        bool: True when every requested sample loads; False when the datasets
        are not cached, the training set is None or empty, or loading raises
        (the message and traceback are printed).
    """
    print("\n=== TEST 5: Multiple Items Loading ===")
    try:
        if 'test_datasets' not in globals():
            print("❌ Need to run dataset creation first")
            return False

        train_ds, val_ds, test_ds = globals()['test_datasets']

        # Same guard as the single-item step. Without it an empty dataset
        # reports success without loading anything, and a None dataset shows
        # up as an unrelated TypeError from len().
        if train_ds is None or len(train_ds) == 0:
            print("❌ No training data")
            return False

        print("Loading first 3 items...")
        for i in range(min(3, len(train_ds))):
            print(f"Loading item {i}...")
            item = train_ds[i]
            print(f"  Item {i}: frames {item['frames'].shape}, target {item['target']}")

        print("✅ Multiple items loaded successfully")
        return True
    except Exception as e:
        print(f"❌ Multiple items loading failed: {e}")
        traceback.print_exc()
        return False

def test_step_6_dataloader():
    """Test 6: wrap the cached training dataset in a single-process DataLoader.

    Reads the ``test_datasets`` tuple from module globals and, on success,
    publishes the loader under the ``test_dataloader`` global for the batch
    loading step.

    Returns:
        bool: True when the loader is built and its length can be read; False
        when no datasets are cached or construction raises.
    """
    print("\n=== TEST 6: DataLoader Creation ===")
    try:
        shared_state = globals()
        if 'test_datasets' not in shared_state:
            print("❌ Need to run dataset creation first")
            return False

        train_ds, _val_ds, _test_ds = shared_state['test_datasets']

        print("Creating DataLoader...")
        # Most conservative configuration for debugging.
        loader_options = {
            'batch_size': 2,      # Small batch size
            'shuffle': False,     # No shuffle for debugging
            'num_workers': 0,     # No multiprocessing
            'pin_memory': False,  # Disable pin_memory
            'drop_last': False,
        }
        train_loader = DataLoader(train_ds, **loader_options)

        print(f"✅ DataLoader created: {len(train_loader)} batches")
        shared_state['test_dataloader'] = train_loader
    except Exception as e:
        print(f"❌ DataLoader creation failed: {e}")
        traceback.print_exc()
        return False
    return True

def test_step_7_batch():
    """Test 7: pull the first batch from the cached DataLoader.

    Expects the ``test_dataloader`` module global written by the DataLoader
    creation step, and reads the 'frames', 'target' and 'id' keys of the batch.

    Returns:
        bool: True when the first batch is produced and printed; False when no
        loader is cached or iteration raises.
    """
    print("\n=== TEST 7: Batch Loading ===")
    try:
        shared_state = globals()
        if 'test_dataloader' not in shared_state:
            print("❌ Need to run dataloader creation first")
            return False

        loader = shared_state['test_dataloader']

        print("Loading first batch...")
        first_batch = next(iter(loader))

        print("✅ Batch loaded successfully:")
        print(f"  Frames shape: {first_batch['frames'].shape}")
        print(f"  Targets: {first_batch['target']}")
        print(f"  IDs: {first_batch['id']}")
    except Exception as e:
        print(f"❌ Batch loading failed: {e}")
        traceback.print_exc()
        return False
    return True

def test_step_8_video_files():
    """Test 8: check that the first training videos exist on disk.

    Reads ``df_encord_v1.csv`` from the current working directory, keeps the
    rows whose ``split`` equals 'train' (case-insensitive) and checks the
    ``video_path`` of the first five with ``os.path.exists``, printing each
    path with its size in bytes.

    Returns:
        bool: True when every checked file exists; False when there are no
        training rows, a file is missing, or the check raises (the message
        and traceback are printed).
    """
    print("\n=== TEST 8: Video Files Check ===")
    try:
        metadata_df = pd.read_csv("df_encord_v1.csv")
        train_df = metadata_df[metadata_df['split'].str.lower() == 'train']

        # With no training rows the loop below would check nothing and the
        # step would still claim that all files exist.
        if train_df.empty:
            print("❌ No rows with split 'train' found in metadata")
            return False

        print("Checking first 5 video files...")
        for position, video_path in enumerate(train_df['video_path'].head(5), start=1):
            exists = os.path.exists(video_path)
            size = os.path.getsize(video_path) if exists else 0
            print(f"  {position}. {video_path}")
            print(f"     Exists: {exists}, Size: {size} bytes")

            if not exists:
                print(f"❌ File not found: {video_path}")
                return False

        print("✅ All checked video files exist")
        return True
    except Exception as e:
        print(f"❌ Video files check failed: {e}")
        traceback.print_exc()
        return False

def main():
    """Run the dataset checks in order and stop at the first one that fails.

    A step counts as failed when it returns a falsy value; its name is then
    printed and the remaining steps are skipped. The closing summary is only
    printed when every step passed.
    """
    print("DATASET DEBUG SCRIPT")
    print("=" * 50)

    pipeline = (
        test_step_1_basic_import,
        test_step_2_metadata,
        test_step_8_video_files,  # Check files before trying to load
        test_step_3_dataset_creation,
        test_step_4_single_item,
        test_step_5_multiple_items,
        test_step_6_dataloader,
        test_step_7_batch,
    )

    for step in pipeline:
        if step():
            print("")
            continue
        print(f"\n❌ FAILED AT: {step.__name__}")
        print("Stopping here to debug the issue.")
        return

    print("🎉 ALL TESTS PASSED!")
    print("The VideoDataset is working correctly.")
    print("The segfault might be in the model creation or training loop.")

# Entry point: run the dataset checks only when this code executes as __main__.
if __name__ == "__main__":
    main()

DATASET DEBUG SCRIPT
=== TEST 1: Basic Imports ===
✅ nexar_video_aug imported successfully
✅ Transforms created successfully


=== TEST 2: Load Metadata ===
✅ Metadata loaded: 14830 rows
Columns: ['id', 'video_title', 'video_path', 'video_type', 'event_time_sec', 'split']
First few rows:
                                 id  \
0  f28ece5faef3856b5e2fed1f78d6cf8a   
1  869fe26b504975fefb99701e979c3790   

                                       video_title  \
0  anonymized_f28ece5faef3856b5e2fed1f78d6cf8a.mp4   
1  anonymized_869fe26b504975fefb99701e979c3790.mp4   

                                          video_path      video_type  \
0  ../data/research-nvidia-data/nvidia-1/f28ece5f...          Normal   
1  ../data/research-nvidia-data/nvidia-1/869fe26b...  Near Collision   

   event_time_sec  split  
0           19.72  train  
1           19.08  train  
✅ All required columns present


=== TEST 8: Video Files Check ===
Checking first 5 video files...
  1. ../data/research-nvidia-data

In [19]:
#!/usr/bin/env python3
"""
Debug the model creation and forward pass
"""

import torch
import torch.nn as nn
import traceback

def test_model_import():
    """Test 1: check that ``EnhancedFrameCNN`` can be imported from ``nexar_arch``.

    Returns:
        bool: True when the import succeeds; False on any exception (the
        message and traceback are printed).
    """
    print("=== TEST 1: Model Import ===")
    imported = False
    try:
        from nexar_arch import EnhancedFrameCNN  # the import itself is the check
        print("✅ EnhancedFrameCNN imported successfully")
        imported = True
    except Exception as e:
        print(f"❌ Model import failed: {e}")
        traceback.print_exc()
    return imported

def test_model_creation():
    """Test 2: instantiate ``EnhancedFrameCNN`` and confirm it has a classifier.

    Returns:
        The constructed model on success (callers treat it as a truthy
        result), or False when construction raises or the model has no
        ``classifier`` attribute.
    """
    print("\n=== TEST 2: Model Creation ===")
    try:
        from nexar_arch import EnhancedFrameCNN

        print("Creating model...")
        model_config = {
            'base_model': 'convnext_tiny',
            'pretrained': True,
            'dropout_rate': 0.5,
            'temporal_mode': 'gru',
        }
        model = EnhancedFrameCNN(**model_config)

        print("✅ Model created successfully")
        print(f"Model type: {type(model)}")

        # Check if model has classifier
        if not hasattr(model, 'classifier'):
            print("❌ Model has no classifier attribute")
            return False
        print(f"Classifier: {model.classifier}")

        return model
    except Exception as e:
        print(f"❌ Model creation failed: {e}")
        traceback.print_exc()
        return False

def test_model_modification():
    """Test 3: replace the last classifier layer with a 3-output linear layer.

    Builds a fresh model through ``test_model_creation`` and swaps
    ``model.classifier[-1]`` for ``nn.Linear(in_features, 3)``, where
    ``in_features`` is read from the layer being replaced.

    Returns:
        The modified model on success, or False when model creation failed or
        the replacement raises.
    """
    print("\n=== TEST 3: Model Modification ===")
    try:
        model = test_model_creation()
        if model is False:
            return False

        print("Modifying classifier for 3 classes...")

        # Get feature dimension
        in_features = model.classifier[-1].in_features
        print(f"Feature dimension: {in_features}")

        # Replace last layer
        three_class_head = nn.Linear(in_features, 3)
        model.classifier[-1] = three_class_head
        print("✅ Classifier modified successfully")

        return model
    except Exception as e:
        print(f"❌ Model modification failed: {e}")
        traceback.print_exc()
        return False

def test_model_to_device():
    """Test 4: move the modified model to CUDA when available, otherwise CPU.

    Builds the model through ``test_model_modification``.

    Returns:
        tuple: ``(model, device)`` on success, or False when the earlier step
        failed or the move raises.
    """
    print("\n=== TEST 4: Model to Device ===")
    try:
        model = test_model_modification()
        if model is False:
            return False

        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        device = torch.device(device_name)
        print(f"Moving model to device: {device}")

        model = model.to(device)
        print("✅ Model moved to device successfully")

        return model, device
    except Exception as e:
        print(f"❌ Model to device failed: {e}")
        traceback.print_exc()
        return False

def test_model_forward():
    """Test 5: run one inference-mode forward pass on random input.

    Obtains ``(model, device)`` from ``test_model_to_device``, switches the
    model to eval mode and feeds it a random tensor of shape
    (2, 3, 50, 224, 224) under ``torch.no_grad()``.

    Returns:
        bool: True when the forward pass completes; False when the earlier
        step failed or the pass raises.
    """
    print("\n=== TEST 5: Model Forward Pass ===")
    try:
        model_and_device = test_model_to_device()
        if model_and_device is False:
            return False

        model, device = model_and_device

        print("Creating dummy input...")
        # batch_size=2, channels=3, frames=50, height=224, width=224
        clip_shape = (2, 3, 50, 224, 224)
        dummy_input = torch.randn(*clip_shape).to(device)
        print(f"Input shape: {dummy_input.shape}")

        print("Running forward pass...")
        model.eval()
        with torch.no_grad():
            output = model(dummy_input)

        print("✅ Forward pass successful!")
        print(f"Output shape: {output.shape}")
        print(f"Output: {output}")
    except Exception as e:
        print(f"❌ Forward pass failed: {e}")
        traceback.print_exc()
        return False
    return True

def test_model_training_mode():
    """Test 6: run one full optimisation step on random data.

    Obtains ``(model, device)`` from ``test_model_to_device``, puts the model
    in training mode and performs zero_grad / forward / cross-entropy loss /
    backward / AdamW step on a random (2, 3, 50, 224, 224) input with the
    class indices [0, 1] as targets.

    Returns:
        bool: True when the step completes; False when the earlier step
        failed or anything in the training step raises.
    """
    print("\n=== TEST 6: Training Mode ===")
    try:
        model_and_device = test_model_to_device()
        if model_and_device is False:
            return False

        model, device = model_and_device

        print("Setting model to training mode...")
        model.train()

        # Create dummy input and target
        clip_shape = (2, 3, 50, 224, 224)
        dummy_input = torch.randn(*clip_shape).to(device)
        dummy_target = torch.tensor([0, 1]).to(device)  # class indices

        print("Creating loss and optimizer...")
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

        print("Running training step...")
        optimizer.zero_grad()
        logits = model(dummy_input)
        loss = criterion(logits, dummy_target)
        loss.backward()
        optimizer.step()

        print("✅ Training step successful!")
        print(f"Loss: {loss.item():.4f}")
    except Exception as e:
        print(f"❌ Training step failed: {e}")
        traceback.print_exc()
        return False
    return True

def test_batch_from_dataset():
    """Test 7: push one real training batch through the model in eval mode.

    Builds the training dataset from ``df_encord_v1.csv``, takes the first
    batch of two samples from an unshuffled single-process DataLoader, obtains
    ``(model, device)`` from ``test_model_to_device`` and runs a forward pass
    under ``torch.no_grad()``. The string labels are mapped to class indices
    and printed; they are not fed to the model.

    Returns:
        bool: True when the forward pass completes; False when the model step
        failed or anything raises (the message and traceback are printed).
    """
    print("\n=== TEST 7: Real Batch Forward Pass ===")
    try:
        # Get a real batch from the dataset
        import pandas as pd
        from nexar_complete_with_validation import create_datasets_with_manual_split
        from nexar_video_aug import create_video_transforms
        from torch.utils.data import DataLoader

        print("Loading real dataset...")
        split_kwargs = {
            'metadata_df': pd.read_csv("df_encord_v1.csv"),
            'transform_train': create_video_transforms(mode='train'),
            'transform_val': None,
            'video_path_column': 'video_path',
            'label_column': 'video_type',
            'id_column': 'id',
            'split_column': 'split',
            'sample_strategy': 'center',
            'fps': 10,
            'duration': 5,
        }
        train_ds, _val_ds, _test_ds = create_datasets_with_manual_split(**split_kwargs)

        loader = DataLoader(train_ds, batch_size=2, shuffle=False, num_workers=0)
        batch = next(iter(loader))

        print("Got real batch from dataset")
        print(f"Batch frames shape: {batch['frames'].shape}")
        print(f"Batch targets: {batch['target']}")

        # Get model
        model_and_device = test_model_to_device()
        if model_and_device is False:
            return False
        model, device = model_and_device

        # Prepare data - convert from [B, T, H, W, C] to [B, C, T, H, W]
        # (assumes the dataset yields channels-last clips - TODO confirm)
        frames = batch['frames'].permute(0, 4, 1, 2, 3).float().to(device)
        print(f"Converted frames shape: {frames.shape}")

        # Convert targets
        class_map = {'Normal': 0, 'Near Collision': 1, 'Collision': 2}
        label_ids = [class_map[label] for label in batch['target']]
        targets = torch.tensor(label_ids).to(device)
        print(f"Targets: {targets}")

        print("Running forward pass with real data...")
        model.eval()
        with torch.no_grad():
            output = model(frames)

        print("✅ Real batch forward pass successful!")
        print(f"Output shape: {output.shape}")
        print(f"Output: {output}")
    except Exception as e:
        print(f"❌ Real batch forward pass failed: {e}")
        traceback.print_exc()
        return False
    return True

def main():
    """Run the model checks in order and stop at the first one that fails.

    A step counts as failed when it returns a falsy value; steps that return
    a model or a ``(model, device)`` tuple therefore count as passed.
    """
    print("MODEL DEBUG SCRIPT")
    print("=" * 50)

    pipeline = (
        test_model_import,
        test_model_creation,
        test_model_modification,
        test_model_to_device,
        test_model_forward,
        test_model_training_mode,
        test_batch_from_dataset,
    )

    failed_step = None
    for step in pipeline:
        if not step():
            failed_step = step
            break
        print("")

    if failed_step is not None:
        print(f"\n❌ FAILED AT: {failed_step.__name__}")
        print("Stopping here to debug the issue.")
        return

    print("🎉 ALL MODEL TESTS PASSED!")
    print("The model is working correctly.")
    print("The segfault might be in the VideoClassifier setup or distributed training.")

# Entry point: run the model checks only when this code executes as __main__.
if __name__ == "__main__":
    main()

MODEL DEBUG SCRIPT
=== TEST 1: Model Import ===
✅ EnhancedFrameCNN imported successfully


=== TEST 2: Model Creation ===
Creating model...
Created convnext_tiny with gru temporal aggregation
Feature dimension: 768
✅ Model created successfully
Model type: <class 'nexar_arch.EnhancedFrameCNN'>
Classifier: Sequential(
  (0): Linear(in_features=768, out_features=512, bias=True)
  (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU()
  (3): Dropout(p=0.5, inplace=False)
  (4): Linear(in_features=512, out_features=256, bias=True)
  (5): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (6): ReLU()
  (7): Dropout(p=0.5, inplace=False)
  (8): Linear(in_features=256, out_features=1, bias=True)
)


=== TEST 3: Model Modification ===

=== TEST 2: Model Creation ===
Creating model...
Created convnext_tiny with gru temporal aggregation
Feature dimension: 768
✅ Model created successfully
Model type: <class 'nexar_arch.En

In [21]:
#!/usr/bin/env python3
"""
Debug the VideoClassifier creation step by step
"""

import torch
import pandas as pd
import traceback
import os

def test_classifier_init():
    """Build the datasets and construct a ``VideoClassifier`` from them.

    Progress is printed as numbered stages so that a hard crash can be
    attributed to the last stage that was announced.

    Returns:
        The constructed classifier on success (callers treat it as a truthy
        result), or False on any exception (the message and traceback are
        printed).
    """
    print("=== TEST: VideoClassifier Creation ===")
    try:
        # Prepare datasets
        from nexar_complete_with_validation import create_datasets_with_manual_split
        from nexar_video_aug import create_video_transforms

        print("1. Loading metadata...")
        metadata_df = pd.read_csv("df_encord_v1.csv")

        print("2. Creating transforms...")
        transforms = {mode: create_video_transforms(mode=mode) for mode in ('train', 'val')}

        print("3. Creating datasets...")
        train_ds, val_ds, test_ds = create_datasets_with_manual_split(
            metadata_df=metadata_df,
            transform_train=transforms['train'],
            transform_val=transforms['val'],
            video_path_column='video_path',
            label_column='video_type',
            id_column='id',
            split_column='split',
            sample_strategy='center',
            fps=10,
            duration=5,
        )

        print(f"4. Datasets ready: Train={len(train_ds)}, Val={len(val_ds)}, Test={len(test_ds)}")

        # Import VideoClassifier
        print("5. Importing VideoClassifier...")
        from nexar_complete_with_validation import VideoClassifier

        print("6. Creating VideoClassifier...")
        classifier_kwargs = {
            'train_dataset': train_ds,
            'val_dataset': val_ds,
            'test_dataset': test_ds,
            'base_model': 'convnext_tiny',
            'temporal_mode': 'gru',
            'num_classes': 3,
            'batch_size': 4,  # Smaller batch size
            'learning_rate': 1e-4,
            'save_dir': 'debug_checkpoints',
        }
        classifier = VideoClassifier(**classifier_kwargs)

        print("✅ VideoClassifier created successfully!")
        return classifier

    except Exception as e:
        print(f"❌ VideoClassifier creation failed: {e}")
        traceback.print_exc()
        return False

def test_classifier_step_by_step() -> bool:
    """Re-create by hand the pieces a VideoClassifier needs, one stage at a time.

    Stages, each announced with a numbered print before it runs:
      1. build the train/val/test datasets from ``df_encord_v1.csv``;
      2. pick the device (CUDA when available, else CPU) in non-distributed mode;
      3. build three DataLoaders with ``batch_size=4`` and ``num_workers=0``;
      4. build ``EnhancedFrameCNN`` and replace its last classifier layer with a
         3-output linear layer;
      5. create a cross-entropy loss and an AdamW optimizer;
      6. run one training step on the first batch of the training loader.

    Returns:
        True when all stages complete; False on any exception (the message and
        traceback are printed).
    """
    print("=== TEST: Step by Step VideoClassifier ===")
    try:
        # Mock the VideoClassifier creation manually
        from nexar_complete_with_validation import create_datasets_with_manual_split
        from nexar_video_aug import create_video_transforms
        
        print("1. Preparing datasets...")
        metadata_df = pd.read_csv("df_encord_v1.csv")
        transform_train = create_video_transforms(mode='train')
        transform_val = create_video_transforms(mode='val')
        
        train_ds, val_ds, test_ds = create_datasets_with_manual_split(
            metadata_df=metadata_df,
            transform_train=transform_train,
            transform_val=transform_val,
            video_path_column='video_path',
            label_column='video_type', 
            id_column='id',
            split_column='split',
            sample_strategy='center',
            fps=10,
            duration=5
        )
        
        print("2. Manual setup_distributed...")
        # Simulate setup_distributed (non-distributed mode)
        # NOTE(review): local_rank, world_size, rank and is_master are assigned
        # but never read again in this function; only `distributed` (printed)
        # and `device` are used below.
        distributed = False
        local_rank = 0
        world_size = 1
        rank = 0
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        is_master = True
        
        print(f"   Device: {device}, distributed: {distributed}")
        
        print("3. Manual setup_data_loaders...")
        from torch.utils.data import DataLoader
        
        # NOTE(review): shuffle=True and no seed is set in this function, so the
        # batch used in stage 6 can differ between runs. pin_memory=True is
        # passed even when `device` fell back to CPU - confirm that is intended.
        train_loader = DataLoader(
            train_ds,
            batch_size=4,
            shuffle=True,
            num_workers=0,
            pin_memory=True,
            drop_last=True
        )
        
        val_loader = DataLoader(
            val_ds,
            batch_size=4,
            shuffle=False,
            num_workers=0,
            pin_memory=True
        )
        
        # NOTE(review): test_loader is constructed but not used afterwards.
        test_loader = DataLoader(
            test_ds,
            batch_size=4,
            shuffle=False,
            num_workers=0
        )
        
        print(f"   DataLoaders created: Train={len(train_loader)}, Val={len(val_loader)}")
        
        print("4. Manual create_model...")
        from nexar_arch import EnhancedFrameCNN
        import torch.nn as nn
        
        model = EnhancedFrameCNN(
            base_model='convnext_tiny',
            pretrained=True,
            dropout_rate=0.5,
            temporal_mode='gru'
        )
        
        # Adapt to number of classes: keep the last layer's input width and
        # replace it with a 3-output linear layer.
        feature_dim = model.classifier[-1].in_features
        model.classifier[-1] = nn.Linear(feature_dim, 3)
        
        print("   Model created, moving to device...")
        model.to(device)
        
        print("5. Creating optimizer and loss...")
        import torch.optim as optim
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.AdamW(model.parameters(), lr=1e-4)
        
        print("6. Testing one batch...")
        model.train()
        batch = next(iter(train_loader))
        
        # Moves axis 4 to position 1 before the model sees the clip; presumably
        # the dataset yields [B, T, H, W, C] frames - TODO confirm.
        frames = batch['frames'].permute(0, 4, 1, 2, 3).float().to(device)
        targets = batch['target']
        
        # Convert string targets to indices; a label outside this map raises
        # KeyError, which the handler below reports as a failure.
        class_map = {'Normal': 0, 'Near Collision': 1, 'Collision': 2}
        targets = torch.tensor([class_map[t] for t in targets]).to(device)
        
        print(f"   Input shape: {frames.shape}, targets: {targets}")
        
        # One full optimisation step: forward, loss, backward, parameter update.
        optimizer.zero_grad()
        outputs = model(frames)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        
        print(f"   ✅ Training step successful! Loss: {loss.item():.4f}")
        
        print("✅ All manual steps completed successfully!")
        return True
        
    except Exception as e:
        print(f"❌ Manual steps failed: {e}")
        traceback.print_exc()
        return False

def test_memory_usage():
    """Report process memory (RSS/VMS) around dataset and classifier creation.

    Memory is sampled with ``psutil`` at four points: at start, after
    importing ``VideoClassifier``, after building the datasets, and after
    constructing a ``VideoClassifier`` with ``batch_size=1``.

    Returns:
        bool: True when every stage completes, or when ``psutil`` itself is
        not installed (the test is then skipped); False when any stage
        raises, including a failed import of the project modules.
    """
    print("=== TEST: Memory Usage ===")

    # Only a missing psutil justifies skipping. The import is isolated so that
    # an ImportError from the project modules below is reported as a failure
    # and is not mistaken for "psutil not available".
    try:
        import psutil
    except ImportError:
        print("psutil not available, skipping memory test")
        return True

    try:
        process = psutil.Process(os.getpid())

        def print_memory():
            """Print the current resident and virtual memory sizes in MB."""
            mem = process.memory_info()
            print(f"   Memory: RSS={mem.rss / 1024**2:.1f}MB, VMS={mem.vms / 1024**2:.1f}MB")

        print("1. Initial memory:")
        print_memory()

        print("2. After imports:")
        from nexar_complete_with_validation import VideoClassifier
        print_memory()

        print("3. After dataset creation:")
        from nexar_complete_with_validation import create_datasets_with_manual_split
        from nexar_video_aug import create_video_transforms

        metadata_df = pd.read_csv("df_encord_v1.csv")
        transform_train = create_video_transforms(mode='train')
        transform_val = create_video_transforms(mode='val')

        train_ds, val_ds, test_ds = create_datasets_with_manual_split(
            metadata_df=metadata_df,
            transform_train=transform_train,
            transform_val=transform_val,
            video_path_column='video_path',
            label_column='video_type',
            id_column='id',
            split_column='split',
            sample_strategy='center',
            fps=10,
            duration=5
        )
        print_memory()

        print("4. Creating VideoClassifier with very small batch size...")
        try:
            # Bound to a name so the object is still alive when memory is
            # sampled on the next lines.
            classifier = VideoClassifier(
                train_dataset=train_ds,
                val_dataset=val_ds,
                test_dataset=test_ds,
                base_model='convnext_tiny',
                temporal_mode='gru',
                num_classes=3,
                batch_size=1,  # Very small batch
                learning_rate=1e-4,
                save_dir='debug_checkpoints'
            )
            print("5. After VideoClassifier creation:")
            print_memory()
            print("✅ Memory test completed successfully!")
            return True
        except Exception as e:
            print(f"   Error during VideoClassifier creation: {e}")
            traceback.print_exc()
            print_memory()
            return False

    except Exception as e:
        print(f"❌ Memory test failed: {e}")
        traceback.print_exc()
        return False

def main():
    """Run the VideoClassifier checks in order, stopping at the first failure.

    A step counts as failed when it returns a falsy value; a step that returns
    the constructed classifier therefore counts as passed.
    """
    print("VIDEOCLASSIFIER DEBUG SCRIPT")
    print("=" * 50)

    pipeline = (
        test_memory_usage,
        test_classifier_step_by_step,
        test_classifier_init,
    )

    for step in pipeline:
        outcome = step()
        if outcome:
            print("")
            continue
        print(f"\n❌ FAILED AT: {step.__name__}")
        print("Stopping here to debug the issue.")
        return

    print("🎉 ALL VIDEOCLASSIFIER TESTS PASSED!")

# Entry point: run the VideoClassifier checks only when this code executes as __main__.
if __name__ == "__main__":
    main()

16:01:58 [RANK:0] Data loaders created: Train=12605, Val=1113


VIDEOCLASSIFIER DEBUG SCRIPT
=== TEST: Memory Usage ===
1. Initial memory:
   Memory: RSS=2333.7MB, VMS=61276.1MB
2. After imports:
   Memory: RSS=2333.7MB, VMS=61276.1MB
3. After dataset creation:
   Memory: RSS=2337.0MB, VMS=61279.6MB
4. Creating VideoClassifier with very small batch size...


16:01:59 [RANK:0] Model created: convnext_tiny + gru, Classes: 3
16:01:59 [RANK:0] Setup complete. Device: cuda, World size: 1


Created convnext_tiny with gru temporal aggregation
Feature dimension: 768
5. After VideoClassifier creation:
   Memory: RSS=2337.3MB, VMS=61279.9MB
✅ Memory test completed successfully!

=== TEST: Step by Step VideoClassifier ===
1. Preparing datasets...
2. Manual setup_distributed...
   Device: cuda, distributed: False
3. Manual setup_data_loaders...
   DataLoaders created: Train=3151, Val=279
4. Manual create_model...
Created convnext_tiny with gru temporal aggregation
Feature dimension: 768
   Model created, moving to device...
5. Creating optimizer and loss...
6. Testing one batch...
   Input shape: torch.Size([4, 3, 50, 224, 224]), targets: tensor([1, 1, 1, 1], device='cuda:0')


16:02:03 [RANK:0] Data loaders created: Train=3151, Val=279


   ✅ Training step successful! Loss: 1.2874
✅ All manual steps completed successfully!

=== TEST: VideoClassifier Creation ===
1. Loading metadata...
2. Creating transforms...
3. Creating datasets...
4. Datasets ready: Train=12605, Val=1113, Test=1112
5. Importing VideoClassifier...
6. Creating VideoClassifier...


16:02:04 [RANK:0] Model created: convnext_tiny + gru, Classes: 3
16:02:04 [RANK:0] Setup complete. Device: cuda, World size: 1


Created convnext_tiny with gru temporal aggregation
Feature dimension: 768
✅ VideoClassifier created successfully!

🎉 ALL VIDEOCLASSIFIER TESTS PASSED!


In [None]:
#!/usr/bin/env python3
"""
Test different batch sizes to find the segfault trigger
"""

import torch
import pandas as pd
import traceback
import gc

def test_batch_size(batch_size):
    """Test specific batch size"""
    print(f"=== TESTING BATCH SIZE: {batch_size} ===")
    try:
        # Clean up memory first
        gc.collect()
        torch.cuda.empty_cache() if torch.cuda.is_available() else None
        
        from nexar_complete_with_validation import VideoClassifier, create_datasets_with_manual_split
        from nexar_video_aug import create_video_transforms
        
        print(f"1. Loading data for batch_size={batch_size}...")
        metadata_df = pd.read_csv("df_encord_v1.csv")
        transform_train = create_video_transforms(mode='train')
        transform_val = create_video_transforms(mode='val')
        
        train_ds, val_ds, test_ds = create_datasets_with_manual_split(
            metadata_df=metadata_df,
            transform_train=transform_train,
            transform_val=transform_val,
            video_path_column='video_path',
            label_column='video_type', 
            id_column='id',
            split_column='split',
            sample_strategy='center',
            fps=10,
            duration=5
        )
        
        print(f"2. Creating VideoClassifier with batch_size={batch_size}...")
        classifier = VideoClassifier(
            train_dataset=train_ds,
            val_dataset=val_ds,
            test_dataset=test_ds,
            base_model='convnext_tiny',
            temporal_mode='gru',
            num_classes=3,
            batch_size=batch_size,
            learning_rate=1e-4,
            save_dir=f'debug_checkpoints_bs{batch_size}'
        )
        
        print(f"3. Testing one training step with batch_size={batch_size}...")
        # Try to do one training step
        classifier.model.train()
        train_iter = iter(classifier.train_loader)
        batch = next(train_iter)
        
        frames = batch['frames'].permute(0, 4, 1, 2, 3).float().to(classifier.device)
        targets = batch['target']
        
        class_map = {'Normal': 0, 'Near Collision': 1, 'Collision': 2}
        targets = torch.tensor([class_map[t] for t in targets]).to(classifier.device)
        
        classifier.optimizer.zero_grad()
        outputs = classifier.model(frames)
        loss = classifier.criterion(outputs, targets)
        loss.backward()
        classifier.optimizer.step()
        
        print(f"✅ BATCH SIZE {batch_size} SUCCESS! Loss: {loss.item():.4f}")
        
        # Cleanup
        del classifier, train_ds, val_ds, test_ds
        gc.collect()
        torch.cuda.empty_cache() if torch.cuda.is_available() else None
        
        return True
        
    except Exception as e:
        print(f"❌ BATCH SIZE {batch_size} FAILED: {e}")
        traceback.print_exc()
        
        # Cleanup
        gc.collect()
        torch.cuda.empty_cache() if torch.cuda.is_available() else None
        
        return False

def test_exact_reproduction():
    """Run ``run_single_experiment`` with the settings of the failing command.

    The argument object mirrors the attributes the experiment runner is given
    by the real script, shortened to a single epoch.

    Returns:
        bool: True when the experiment call returns, False when anything in
        the attempt raises (the traceback is printed).
    """
    print("=== EXACT REPRODUCTION TEST ===")
    reproduced = False
    try:
        # Same parameters as the failing script
        from nexar_complete_with_validation import run_single_experiment

        class TestArgs:
            # --- metadata / column mapping ---
            metadata_csv = "df_encord_v1.csv"
            video_path_column = 'video_path'
            label_column = 'video_type'
            id_column = 'id'
            split_column = 'split'
            # --- clip sampling ---
            sample_strategy = 'center'
            center_time_column = None
            # --- model ---
            base_model = 'convnext_tiny'
            temporal_mode = 'gru'
            # --- optimisation ---
            epochs = 1  # Just 1 epoch
            batch_size = 8  # The problematic batch size
            learning_rate = 1e-4
            fps = 10
            duration = 5
            # --- bookkeeping ---
            experiment_name = "debug_reproduction"
            save_dir = 'debug_results'
            seed = 42
            run_grid_search = False

        run_args = TestArgs()

        print("Running exact reproduction with batch_size=8...")
        # The runner hands back a (classifier, accuracy) pair; only reaching
        # this point matters for the reproduction.
        _classifier, _test_accuracy = run_single_experiment(run_args)

        print("✅ EXACT REPRODUCTION SUCCESS!")
        reproduced = True

    except Exception as err:
        print(f"❌ EXACT REPRODUCTION FAILED: {err}")
        traceback.print_exc()

    return reproduced

def test_memory_pressure():
    """Run ``test_batch_size`` over increasing batch sizes, logging memory use.

    When ``psutil`` is installed, the process RSS and the CUDA memory currently
    allocated by torch are printed (in MB) before and after every attempt.
    Without ``psutil`` the same sweep runs, just without the memory readings.

    Returns:
        bool: True if every batch size succeeded; False at the first failing
        batch size, or if the sweep itself raised (the traceback is printed).
    """
    print("=== MEMORY PRESSURE TEST ===")

    # Guard only the optional import. Previously the whole sweep sat under
    # `except ImportError`, so an ImportError raised anywhere inside it would
    # have been mistaken for "psutil is missing" and restarted the sweep.
    try:
        import psutil
    except ImportError:
        psutil = None
        print("psutil not available, testing batch sizes without memory info")

    # Test progressively larger batch sizes
    batch_sizes = [1, 2, 4, 6, 8, 10, 12, 16]

    try:
        process = psutil.Process() if psutil is not None else None

        def print_memory():
            """Print CPU RSS and GPU allocated memory; no-op without psutil."""
            if process is None:
                return
            mem = process.memory_info()
            gpu_mem = torch.cuda.memory_allocated() / 1024**2 if torch.cuda.is_available() else 0
            print(f"   CPU: {mem.rss / 1024**2:.1f}MB, GPU: {gpu_mem:.1f}MB")

        for bs in batch_sizes:
            print(f"\nTesting batch_size={bs}")
            print_memory()

            success = test_batch_size(bs)
            print_memory()

            if not success:
                print(f"❌ FAILED AT BATCH SIZE: {bs}")
                return False

        print("✅ ALL BATCH SIZES PASSED!")
        return True

    except Exception as e:
        print(f"❌ Memory pressure test failed: {e}")
        # Show where it failed, like the other tests in this cell do.
        traceback.print_exc()
        return False

def main():
    """Probe batch size 8 first, then branch on the outcome.

    If batch size 8 trains in isolation, the full-script reproduction is tried
    next; otherwise the batch-size sweep is started.
    """
    print("BATCH SIZE DEBUG SCRIPT")
    print("=" * 50)

    # Start with the exact failing batch size
    print("Testing the exact failing scenario first...")
    if not test_batch_size(8):
        print("💡 Found it! Batch size 8 fails.")
        print("Testing smaller batch sizes to find the limit...")
        test_memory_pressure()
        return

    print("🤔 Batch size 8 works in isolation!")
    print("The issue might be elsewhere...")

    # Try exact reproduction
    if test_exact_reproduction():
        verdict = (
            "🤔 Even exact reproduction works!",
            "This suggests the issue is timing/environment specific.",
        )
    else:
        verdict = (
            "💡 Exact reproduction failed!",
            "The issue is in the full script context.",
        )
    for line in verdict:
        print(line)

# Entry point: start the batch-size debug sequence when this code runs as __main__.
if __name__ == "__main__":
    main()

BATCH SIZE DEBUG SCRIPT
Testing the exact failing scenario first...
=== TESTING BATCH SIZE: 8 ===
1. Loading data for batch_size=8...


16:04:32 [RANK:0] Data loaders created: Train=1575, Val=140


2. Creating VideoClassifier with batch_size=8...


16:04:33 [RANK:0] Model created: convnext_tiny + gru, Classes: 3
16:04:33 [RANK:0] Setup complete. Device: cuda, World size: 1


Created convnext_tiny with gru temporal aggregation
Feature dimension: 768
3. Testing one training step with batch_size=8...
✅ BATCH SIZE 8 SUCCESS! Loss: 1.2244


16:04:42 [RANK:0] Data loaders created: Train=1575, Val=140


🤔 Batch size 8 works in isolation!
The issue might be elsewhere...
=== EXACT REPRODUCTION TEST ===
Running exact reproduction with batch_size=8...
NEXAR VIDEO CLASSIFICATION - COMPREHENSIVE TRAINING WITH VALIDATION
Experiment: debug_reproduction
Base Model: convnext_tiny
Temporal Mode: gru
Epochs: 1
Batch Size: 8
Learning Rate: 0.0001
Single GPU training
Creating datasets...
Loaded metadata with 14830 rows
Using video transforms
Dataset sizes:
  Train: 12605
  Validation: 1113
  Test: 1112
Creating VideoClassifier...


16:04:43 [RANK:0] Model created: convnext_tiny + gru, Classes: 3
16:04:43 [RANK:0] Setup complete. Device: cuda, World size: 1
16:04:43 [RANK:0] Starting training for 1 epochs


Created convnext_tiny with gru temporal aggregation
Feature dimension: 768
Starting training...


Epoch 1:   0%|          | 0/1575 [00:00<?, ?it/s]

In [2]:
#!/usr/bin/env python3
"""
Test the sample_strategy bug
"""

import torch
import pandas as pd
import traceback

def test_sample_strategy(sample_strategy, center_time_column=None):
    """Test specific sample strategy"""
    print(f"=== TESTING SAMPLE STRATEGY: {sample_strategy} ===")
    print(f"Center time column: {center_time_column}")
    
    try:
        from nexar_complete_with_validation import VideoClassifier, create_datasets_with_manual_split
        from nexar_video_aug import create_video_transforms
        
        print(f"1. Loading data with sample_strategy={sample_strategy}...")
        metadata_df = pd.read_csv("df_encord_v1.csv")
        transform_train = create_video_transforms(mode='train')
        transform_val = create_video_transforms(mode='val')
        
        # First check if the required column exists
        if center_time_column and center_time_column not in metadata_df.columns:
            print(f"❌ Column '{center_time_column}' not found in metadata!")
            print(f"Available columns: {list(metadata_df.columns)}")
            return False
        
        print(f"2. Creating datasets with sample_strategy={sample_strategy}...")
        train_ds, val_ds, test_ds = create_datasets_with_manual_split(
            metadata_df=metadata_df,
            transform_train=transform_train,
            transform_val=transform_val,
            video_path_column='video_path',
            label_column='video_type', 
            id_column='id',
            split_column='split',
            sample_strategy=sample_strategy,
            center_time_column=center_time_column,
            fps=10,
            duration=5
        )
        
        print(f"3. Testing single item load...")
        item = train_ds[0]
        print(f"   Item loaded: {item['frames'].shape}, {item['target']}")
        
        print(f"4. Creating VideoClassifier...")
        classifier = VideoClassifier(
            train_dataset=train_ds,
            val_dataset=val_ds,
            test_dataset=test_ds,
            base_model='convnext_tiny',
            temporal_mode='gru',
            num_classes=3,
            batch_size=8,
            learning_rate=1e-4,
            save_dir=f'debug_checkpoints_{sample_strategy}'
        )
        
        print(f"5. Testing one training step...")
        classifier.model.train()
        train_iter = iter(classifier.train_loader)
        batch = next(train_iter)
        
        frames = batch['frames'].permute(0, 4, 1, 2, 3).float().to(classifier.device)
        targets = batch['target']
        
        class_map = {'Normal': 0, 'Near Collision': 1, 'Collision': 2}
        targets = torch.tensor([class_map[t] for t in targets]).to(classifier.device)
        
        classifier.optimizer.zero_grad()
        outputs = classifier.model(frames)
        loss = classifier.criterion(outputs, targets)
        loss.backward()
        classifier.optimizer.step()
        
        print(f"✅ SAMPLE STRATEGY {sample_strategy} SUCCESS! Loss: {loss.item():.4f}")
        return True
        
    except Exception as e:
        print(f"❌ SAMPLE STRATEGY {sample_strategy} FAILED: {e}")
        traceback.print_exc()
        return False

def test_metadata_center_debug():
    """Inspect ``event_time_sec`` and load a few clips with 'metadata_center'.

    Prints a summary of the ``event_time_sec`` column, then builds a
    ``VideoDataset`` over the first ten 'train' rows and loads up to three
    items from it.

    Returns:
        bool: True if the column exists and the items load; False if the
        column is missing or anything raises (the traceback is printed).
    """
    print("=== DEBUGGING METADATA_CENTER STRATEGY ===")

    try:
        # Load metadata and check the event_time_sec column
        metadata_df = pd.read_csv("df_encord_v1.csv")

        print("1. Checking metadata columns...")
        print(f"Columns: {list(metadata_df.columns)}")

        if 'event_time_sec' not in metadata_df.columns:
            print("❌ event_time_sec column missing!")
            return False

        event_times = metadata_df['event_time_sec']
        print("✅ event_time_sec column exists")
        print(f"Sample values: {event_times.head()}")
        print(f"Data type: {event_times.dtype}")
        print(f"Null values: {event_times.isnull().sum()}")
        print(f"Min: {event_times.min()}, Max: {event_times.max()}")

        print("2. Testing VideoDataset with metadata_center...")
        from nexar_complete_with_validation import VideoDataset
        from nexar_video_aug import create_video_transforms

        # Get a small sample for testing
        is_train_row = metadata_df['split'].str.lower() == 'train'
        train_df = metadata_df[is_train_row].head(10)

        transform_train = create_video_transforms(mode='train')

        dataset_kwargs = dict(
            video_paths=train_df['video_path'].tolist(),
            labels=train_df['video_type'].tolist(),
            video_ids=train_df['id'].tolist(),
            fps=10,
            duration=5,
            is_train=True,
            transform=transform_train,
            sample_strategy='metadata_center',
            center_time_column='event_time_sec',
            metadata_df=metadata_df,
        )
        dataset = VideoDataset(**dataset_kwargs)

        print("3. Testing individual items...")
        items_to_check = min(3, len(dataset))
        for idx in range(items_to_check):
            print(f"Loading item {idx}...")
            sample = dataset[idx]
            print(f"   Item {idx}: {sample['frames'].shape}, {sample['target']}, {sample['id']}")

        print("✅ metadata_center strategy debugging successful!")
        return True

    except Exception as err:
        print(f"❌ metadata_center debugging failed: {err}")
        traceback.print_exc()
        return False

def main():
    """Check 'metadata_center' in isolation, then try every sample strategy.

    Stops at the first strategy whose training step fails and names it; if
    every strategy passes, says the bug must be somewhere else.
    """
    print("SAMPLE STRATEGY DEBUG SCRIPT")
    print("=" * 50)

    # First debug the metadata_center strategy
    print("Debugging metadata_center strategy...")
    if not test_metadata_center_debug():
        print("Failed at metadata_center debugging!")
        return

    # (strategy, center-time column) pairs to test, in order
    strategies = [
        ('center', None),
        ('random', None),
        ('metadata_center', 'event_time_sec'),  # This is the problematic one
    ]

    for strategy, center_col in strategies:
        if test_sample_strategy(strategy, center_col):
            print("")
            continue
        print(f"\n💡 FOUND THE BUG! Sample strategy '{strategy}' fails!")
        return

    print("🤔 All sample strategies work... The bug is elsewhere!")

# Entry point: start the sample-strategy debug sequence when this code runs as __main__.
if __name__ == "__main__":
    main()

SAMPLE STRATEGY DEBUG SCRIPT
Debugging metadata_center strategy...
=== DEBUGGING METADATA_CENTER STRATEGY ===
1. Checking metadata columns...
Columns: ['id', 'video_title', 'video_path', 'video_type', 'event_time_sec', 'split']
✅ event_time_sec column exists
Sample values: 0    19.72
1    19.08
2    20.08
3     8.33
4     7.39
Name: event_time_sec, dtype: float64
Data type: float64
Null values: 0
Min: 0.08, Max: 39.98
2. Testing VideoDataset with metadata_center...
3. Testing individual items...
Loading item 0...
   Item 0: torch.Size([50, 224, 224, 3]), Normal, f28ece5faef3856b5e2fed1f78d6cf8a
Loading item 1...
   Item 1: torch.Size([50, 224, 224, 3]), Near Collision, 869fe26b504975fefb99701e979c3790
Loading item 2...
   Item 2: torch.Size([50, 224, 224, 3]), Near Collision, dd8c7ddd37ac435d190b9b7824a062a5
✅ metadata_center strategy debugging successful!
=== TESTING SAMPLE STRATEGY: center ===
Center time column: None
1. Loading data with sample_strategy=center...
2. Creating dataset

16:07:47 [RANK:0] Data loaders created: Train=1575, Val=140


   Item loaded: torch.Size([50, 224, 224, 3]), Normal
4. Creating VideoClassifier...
Created convnext_tiny with gru temporal aggregation
Feature dimension: 768


16:07:48 [RANK:0] Model created: convnext_tiny + gru, Classes: 3
16:07:48 [RANK:0] Setup complete. Device: cuda, World size: 1


5. Testing one training step...
✅ SAMPLE STRATEGY center SUCCESS! Loss: 1.2775

=== TESTING SAMPLE STRATEGY: random ===
Center time column: None
1. Loading data with sample_strategy=random...
2. Creating datasets with sample_strategy=random...
3. Testing single item load...


16:07:59 [RANK:0] Data loaders created: Train=1575, Val=140


   Item loaded: torch.Size([50, 224, 224, 3]), Normal
4. Creating VideoClassifier...


16:07:59 [RANK:0] Model created: convnext_tiny + gru, Classes: 3
16:07:59 [RANK:0] Setup complete. Device: cuda, World size: 1


Created convnext_tiny with gru temporal aggregation
Feature dimension: 768
5. Testing one training step...
✅ SAMPLE STRATEGY random SUCCESS! Loss: 1.4588

=== TESTING SAMPLE STRATEGY: metadata_center ===
Center time column: event_time_sec
1. Loading data with sample_strategy=metadata_center...
2. Creating datasets with sample_strategy=metadata_center...
3. Testing single item load...


16:08:09 [RANK:0] Data loaders created: Train=1575, Val=140


   Item loaded: torch.Size([50, 224, 224, 3]), Normal
4. Creating VideoClassifier...


16:08:10 [RANK:0] Model created: convnext_tiny + gru, Classes: 3
16:08:10 [RANK:0] Setup complete. Device: cuda, World size: 1


Created convnext_tiny with gru temporal aggregation
Feature dimension: 768
5. Testing one training step...
✅ SAMPLE STRATEGY metadata_center SUCCESS! Loss: 1.0163

🤔 All sample strategies work... The bug is elsewhere!


In [None]:
#!/usr/bin/env python3
"""
Debug the exact difference between working tests and failing main script
"""

import torch
import pandas as pd
import traceback
import argparse
from datetime import datetime
import os

def test_exact_main_script_flow():
    """Replay the main script's setup step by step, then train for one epoch.

    Uses the 'center' sampling strategy together with the augmentation
    settings of the main script.

    Returns:
        bool: True if the one-epoch ``classifier.train`` call returns, False
        if any step raises (the traceback is printed).
    """
    print("=== EXACT MAIN SCRIPT FLOW TEST ===")

    try:
        # Import everything exactly like the main script
        from nexar_complete_with_validation import (
            VideoClassifier, create_datasets_with_manual_split,
            set_random_seeds, parse_args, create_experiment_name,
            is_main_process, log_info
        )
        from nexar_video_aug import create_video_transforms

        print("1. Setting up args exactly like main script...")

        # Stand-in for what parse_args() would return
        args = argparse.Namespace(
            metadata_csv="df_encord_v1.csv",
            video_path_column='video_path',
            label_column='video_type',
            id_column='id',
            split_column='split',
            sample_strategy='center',  # Start with working strategy
            center_time_column=None,
            base_model='convnext_tiny',
            temporal_mode='gru',
            epochs=15,
            batch_size=8,
            learning_rate=1e-4,
            fps=10,
            duration=5,
            experiment_name=None,
            save_dir='model_results',
            seed=42,
            run_grid_search=False,
        )

        print("2. Following exact main script steps...")

        # Set random seeds
        set_random_seeds(args.seed)

        # Create experiment name
        if args.experiment_name is None:
            args.experiment_name = create_experiment_name(args.base_model, args.temporal_mode)

        print(f"Experiment name: {args.experiment_name}")

        # Create save directory
        if is_main_process():
            os.makedirs(args.save_dir, exist_ok=True)

        print("3. Loading metadata...")
        metadata_df = pd.read_csv(args.metadata_csv)
        log_info(f"Loaded metadata with {len(metadata_df)} rows")

        print("4. Creating transforms...")
        augmentation = dict(
            enable_custom_augmentation=True,
            brightness_range=(0.95, 1.05),
            contrast_range=(0.95, 1.05),
            saturation_range=(0.95, 1.05),
            hue_range=(-0.02, 0.02),
            rotation_range=(-3, 3),
            scale_range=(0.98, 1.02),
            horizontal_flip_prob=0.5,
            aug_probability=0.8,
        )
        transform_train = create_video_transforms(mode='train', **augmentation)
        transform_val = create_video_transforms(mode='val')
        log_info("Using video transforms")

        print("5. Creating datasets...")
        train_data, val_data, test_data = create_datasets_with_manual_split(
            metadata_df=metadata_df,
            video_path_column=args.video_path_column,
            label_column=args.label_column,
            id_column=args.id_column,
            split_column=args.split_column,
            transform_train=transform_train,
            transform_val=transform_val,
            sample_strategy=args.sample_strategy,
            center_time_column=args.center_time_column,
            fps=args.fps,
            duration=args.duration,
        )

        log_info("Dataset sizes:")
        named_splits = (
            ("Train", train_data),
            ("Validation", val_data),
            ("Test", test_data),
        )
        for split_name, split in named_splits:
            # A missing/empty split is reported as size 0
            log_info(f"  {split_name}: {len(split) if split else 0}")

        print("6. Creating VideoClassifier...")
        log_info("Creating VideoClassifier...")

        run_dir = os.path.join(args.save_dir, args.experiment_name)
        classifier = VideoClassifier(
            train_dataset=train_data,
            val_dataset=val_data,
            test_dataset=test_data,
            base_model=args.base_model,
            temporal_mode=args.temporal_mode,
            num_classes=3,
            batch_size=args.batch_size,
            learning_rate=args.learning_rate,
            save_dir=run_dir
        )

        print("7. Testing one epoch...")
        log_info("Starting training...")
        classifier.train(epochs=1)

        print("✅ EXACT MAIN SCRIPT FLOW SUCCESS!")
        return True

    except Exception as err:
        print(f"❌ EXACT MAIN SCRIPT FLOW FAILED: {err}")
        traceback.print_exc()
        return False

def test_with_metadata_center():
    """Train for one epoch using 'metadata_center' sampling on 'event_time_sec'.

    Same pipeline as the main-script flow test, but with the sampling
    settings the failing script uses.

    Returns:
        bool: True if the one-epoch ``classifier.train`` call returns, False
        if any step raises (the traceback is printed).
    """
    print("=== METADATA_CENTER MAIN SCRIPT TEST ===")

    try:
        from nexar_complete_with_validation import (
            VideoClassifier, create_datasets_with_manual_split,
            set_random_seeds, create_experiment_name, log_info
        )
        from nexar_video_aug import create_video_transforms

        # Same setup but with metadata_center
        args = argparse.Namespace(
            metadata_csv="df_encord_v1.csv",
            video_path_column='video_path',
            label_column='video_type',
            id_column='id',
            split_column='split',
            sample_strategy='metadata_center',  # This is what the real script uses
            center_time_column='event_time_sec',  # This too
            base_model='convnext_tiny',
            temporal_mode='gru',
            epochs=15,
            batch_size=8,
            learning_rate=1e-4,
            fps=10,
            duration=5,
            experiment_name="metadata_center_test",
            save_dir='model_results',
            seed=42,
        )

        set_random_seeds(args.seed)

        metadata_df = pd.read_csv(args.metadata_csv)

        # Use EXACT same transforms as main script
        augmentation = dict(
            enable_custom_augmentation=True,
            brightness_range=(0.95, 1.05),
            contrast_range=(0.95, 1.05),
            saturation_range=(0.95, 1.05),
            hue_range=(-0.02, 0.02),
            rotation_range=(-3, 3),
            scale_range=(0.98, 1.02),
            horizontal_flip_prob=0.5,
            aug_probability=0.8,
        )
        transform_train = create_video_transforms(mode='train', **augmentation)
        transform_val = create_video_transforms(mode='val')

        train_data, val_data, test_data = create_datasets_with_manual_split(
            metadata_df=metadata_df,
            video_path_column=args.video_path_column,
            label_column=args.label_column,
            id_column=args.id_column,
            split_column=args.split_column,
            transform_train=transform_train,
            transform_val=transform_val,
            sample_strategy=args.sample_strategy,
            center_time_column=args.center_time_column,
            fps=args.fps,
            duration=args.duration,
        )

        print("Creating VideoClassifier with metadata_center...")

        run_dir = os.path.join(args.save_dir, args.experiment_name)
        classifier = VideoClassifier(
            train_dataset=train_data,
            val_dataset=val_data,
            test_dataset=test_data,
            base_model=args.base_model,
            temporal_mode=args.temporal_mode,
            num_classes=3,
            batch_size=args.batch_size,
            learning_rate=args.learning_rate,
            save_dir=run_dir
        )

        print("Testing one epoch with metadata_center...")
        classifier.train(epochs=1)

        print("✅ METADATA_CENTER SUCCESS!")
        return True

    except Exception as err:
        print(f"❌ METADATA_CENTER FAILED: {err}")
        traceback.print_exc()
        return False

def test_different_transforms():
    """Build a classifier with default, then with augmented, train transforms.

    Both builds use 'metadata_center' sampling and the same validation
    transform; only construction is exercised, no training step is run.

    Returns:
        bool: True if both classifiers were constructed, False if anything
        raises (the traceback is printed).
    """
    print("=== TRANSFORMS COMPARISON TEST ===")

    try:
        from nexar_complete_with_validation import VideoClassifier, create_datasets_with_manual_split
        from nexar_video_aug import create_video_transforms

        metadata_df = pd.read_csv("df_encord_v1.csv")

        def build(train_transform, val_transform, checkpoint_dir):
            """Create the three splits and wrap them in a VideoClassifier."""
            splits = create_datasets_with_manual_split(
                metadata_df=metadata_df,
                transform_train=train_transform,
                transform_val=val_transform,
                video_path_column='video_path',
                label_column='video_type',
                id_column='id',
                split_column='split',
                sample_strategy='metadata_center',
                center_time_column='event_time_sec',
                fps=10,
                duration=5,
            )
            train_split, val_split, test_split = splits
            model = VideoClassifier(
                train_dataset=train_split,
                val_dataset=val_split,
                test_dataset=test_split,
                base_model='convnext_tiny',
                temporal_mode='gru',
                num_classes=3,
                batch_size=8,
                learning_rate=1e-4,
                save_dir=checkpoint_dir
            )
            return splits, model

        # Everything built stays referenced until the function returns.
        kept_alive = []

        # Test 1: Simple transforms (like our working tests)
        print("1. Testing with simple transforms...")
        simple_train = create_video_transforms(mode='train')
        simple_val = create_video_transforms(mode='val')
        kept_alive.append(build(simple_train, simple_val, 'debug_simple_transforms'))
        print("✅ Simple transforms work!")

        # Test 2: Complex transforms (like the main script)
        print("2. Testing with complex transforms...")
        augmentation = dict(
            enable_custom_augmentation=True,
            brightness_range=(0.95, 1.05),
            contrast_range=(0.95, 1.05),
            saturation_range=(0.95, 1.05),
            hue_range=(-0.02, 0.02),
            rotation_range=(-3, 3),
            scale_range=(0.98, 1.02),
            horizontal_flip_prob=0.5,
            aug_probability=0.8,
        )
        complex_train = create_video_transforms(mode='train', **augmentation)
        # The validation transform is shared with the first build.
        kept_alive.append(build(complex_train, simple_val, 'debug_complex_transforms'))
        print("✅ Complex transforms work too!")

        return True

    except Exception as err:
        print(f"❌ TRANSFORMS TEST FAILED: {err}")
        traceback.print_exc()
        return False

def main():
    """Run the main-script comparison tests in order, stopping at the first failure.

    Names the failing test if there is one; otherwise lists the remaining
    environment/timing hypotheses.
    """
    print("MAIN SCRIPT DIFFERENCE DEBUG")
    print("=" * 50)

    checks = [
        test_exact_main_script_flow,
        test_different_transforms,
        test_with_metadata_center,
    ]

    for check in checks:
        if check():
            print("")
            continue
        print(f"\n💡 FOUND THE ISSUE IN: {check.__name__}")
        return

    # Reached only when every check passed.
    remaining_hypotheses = (
        "🤔 All tests work! The issue must be environment/timing specific.",
        "Possible causes:",
        "- Random seed timing differences",
        "- Memory fragmentation",
        "- Race conditions in imports",
        "- CUDA context differences",
    )
    for line in remaining_hypotheses:
        print(line)

# Entry point: start the main-script comparison tests when this code runs as __main__.
if __name__ == "__main__":
    main()

16:09:27 [RANK:0] Data loaders created: Train=1575, Val=140


MAIN SCRIPT DIFFERENCE DEBUG
=== EXACT MAIN SCRIPT FLOW TEST ===
1. Setting up args exactly like main script...
2. Following exact main script steps...
Experiment name: convnext_tiny_gru_20250605_160927
3. Loading metadata...
Loaded metadata with 14830 rows
4. Creating transforms...
Using video transforms
5. Creating datasets...
Dataset sizes:
  Train: 12605
  Validation: 1113
  Test: 1112
6. Creating VideoClassifier...
Creating VideoClassifier...


16:09:27 [RANK:0] Model created: convnext_tiny + gru, Classes: 3
16:09:27 [RANK:0] Setup complete. Device: cuda, World size: 1
16:09:27 [RANK:0] Starting training for 1 epochs


Created convnext_tiny with gru temporal aggregation
Feature dimension: 768
7. Testing one epoch...
Starting training...


Epoch 1:   0%|          | 0/1575 [00:00<?, ?it/s]

COMMAND LINE ARGUMENTS DEBUG
=== DEFAULT ARGUMENTS FROM parse_args() ===


16:10:33 [RANK:0] Data loaders created: Train=1575, Val=140


Default arguments:
  base_model: convnext_tiny
  batch_size: 8
  center_time_column: None
  duration: 5
  epochs: 15
  experiment_name: None
  fps: 10
  id_column: id
  label_column: video_type
  learning_rate: 0.0001
  metadata_csv: df_encord_v1.csv
  run_grid_search: False
  sample_strategy: center
  save_dir: model_results
  seed: 42
  split_column: split
  temporal_mode: gru
  video_path_column: video_path

=== ARGUMENTS COMPARISON ===
COMPARISON:
Key differences between working tests and defaults:

🎯 KEY SUSPECTS:
  sample_strategy: center
  center_time_column: None

=== TESTING SPECIFIC FAILING COMBINATION ===
Testing configuration:
  sample_strategy: center
  center_time_column: None
  batch_size: 8
Creating datasets with exact default args...
✅ Dataset creation with default args SUCCESS!

=== TESTING WITH DEFAULT ARGUMENTS ===
Using default sample_strategy: center
Using default center_time_column: None
Running with default args...
NEXAR VIDEO CLASSIFICATION - COMPREHENSIVE TRAI

16:10:34 [RANK:0] Model created: convnext_tiny + gru, Classes: 3
16:10:34 [RANK:0] Setup complete. Device: cuda, World size: 1
16:10:34 [RANK:0] Starting training for 15 epochs


Starting training...


Epoch 1:   0%|          | 0/1575 [00:00<?, ?it/s]

In [8]:
# Build the train/val/test datasets from `df`, using its 'split' column.
# NOTE(review): `create_datasets_with_manual_split`, `create_video_transforms` and
# `df` are not defined in this cell; it relies on earlier cells having run.
train_ds, val_ds, test_ds = create_datasets_with_manual_split(
    metadata_df=df,
    # Training transform with custom augmentation enabled.
    transform_train=create_video_transforms(
        mode='train',
        enable_custom_augmentation=True,
        
        # Colour jitter ranges
        brightness_range=(0.95, 1.05),
        contrast_range=(0.95, 1.05),
        saturation_range=(0.95, 1.05),
        hue_range=(-0.02, 0.02),
        
        # NOTE(review): presumably degrees — confirm in nexar_video_aug
        rotation_range=(-3, 3),
        
        scale_range=(0.98, 1.02),
        horizontal_flip_prob=0.5,
        aug_probability=0.8,
    ),
    # Validation transform is created in 'val' mode with no extra options.
    transform_val=create_video_transforms(mode='val'),
    video_path_column='video_path',
    label_column='video_type', 
    id_column='id',
    split_column='split',
    # Sample clips with the 'metadata_center' strategy, using the 'event_time_sec' column.
    sample_strategy='metadata_center',
    center_time_column='event_time_sec',
    # fps=10 with duration=5 matches the 50-frame clips seen in the printed shapes.
    fps=10,
    duration=5
)

# Visual check of training samples.
# NOTE(review): the meaning of `m` and `rows_per_page` is defined by the dataset class — not visible here.
train_viewer = train_ds.show_batch(m=3, rows_per_page=3)

In [11]:
# Same visual check for the validation split (built with the mode='val' transform).
val_viewer = val_ds.show_batch(m=3, rows_per_page=3)

In [14]:
# Collate one shuffled training batch in the main process (num_workers=0) to inspect it.
batch_size = 12

loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0)

# Get a single batch
batch = next(iter(loader))

# Print batch information.
# The printed frames shape is (12, 50, 224, 224, 3), i.e. channels-last, and the
# targets are strings such as 'normal' / 'near_collision' / 'collision'. Code that
# maps labels with keys like 'Near Collision' would not match these values.
print("Batch contents:")
print(f"- Frames: {batch['frames'].shape}, {batch['frames'].dtype}")
print(f"- Target: {batch['target']}")
print(f"- IDs: {batch['id']}")

Batch contents:
- Frames: torch.Size([12, 50, 224, 224, 3]), torch.float32
- Target: ['normal', 'collision', 'normal', 'normal', 'normal', 'normal', 'near_collision', 'normal', 'near_collision', 'normal', 'normal', 'normal']
- IDs: ['46f7f9467fc61c2a98ee9da7e2317342_dup', '3165df9a5754fbd20beaaf9cefd5eb33', '67fe4b68900c04889f8726cca480efbf_dup', '59f3d0479624a8928c4fde20808e5992_dup', '3a3c19858906c6a5784f9629727384b2_dup', '78ac2d6033fced98f6292463d9b25eb4_dup', '3d7aeb5f61194933d12afc9d550db776', 'e0661515db4963714f10518c08c39452', 'fea90afe789a84831bfe9246a4ed658b', '1f98cee6916347d1141636e289f71727', 'c12b2668-090b-4d4e-b507-8fb4e9eb3525', 'c4863600685f763d25fafdae039159aa_dup']
