# GNN Training - All Datasets with Normalization

**Configuration:**
- Datasets: Cora, PubMed, Roman-empire, Minesweeper
- Model: GCN with `normalize=True` (self-loops enabled)
- K values: 0-8
- Seeds: 0-3
- **Total: 144 models**

**Estimated runtime on A100:** 30-60 minutes

In [None]:
# Install dependencies (optimized for Colab)
import torch
print(f'Pre-installed PyTorch: {torch.__version__}')
print(f'CUDA: {torch.version.cuda}')

# Install PyTorch Geometric (auto-detects PyTorch version)
import sys
!{sys.executable} -m pip install -q torch-geometric

# Install PyG extensions without pyg-lib (not needed for GCN)
!{sys.executable} -m pip install -q torch-scatter torch-sparse --no-index --find-links https://data.pyg.org/whl/torch-2.5.0+cu128.html

# Install other dependencies
!{sys.executable} -m pip install -q scikit-learn pandas matplotlib seaborn

print('\n✓ All dependencies installed')

# Verify PyG installation
import torch_geometric
print(f'PyTorch Geometric: {torch_geometric.__version__}')

In [None]:
# Verify GPU
#
# Reports whether CUDA is usable and, if so, the name and total memory of
# device 0.
import torch

has_gpu = torch.cuda.is_available()
print(f'GPU available: {has_gpu}')

if has_gpu:
    device_name = torch.cuda.get_device_name(0)
    print(f'GPU name: {device_name}')

    # total_memory is divided by 1e9 to print it as GB
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f'GPU memory: {total_gb:.2f} GB')

In [None]:
# Navigate to project directory
import os
import sys

# Check current directory and find project root
print(f'Starting directory: {os.getcwd()}')
!ls -la

# Try common Colab sync locations
possible_paths = [
    '/content/entropy-selection',
    '/content',
    os.getcwd(),  # Current directory
]

project_dir = None
for path in possible_paths:
    src_path = os.path.join(path, 'src')
    if os.path.exists(src_path) and os.path.isdir(src_path):
        project_dir = path
        break

if project_dir:
    os.chdir(project_dir)
    print(f'\n✓ Found project at: {project_dir}')
    print(f'Current directory: {os.getcwd()}')
    print(f'\nSource modules:')
    !ls src/
else:
    print('\n❌ ERROR: Could not find src/ directory')
    print('Available files in current directory:')
    !ls -la
    raise FileNotFoundError('Project src/ directory not found. Files may not have synced from local.')

In [None]:
# Train all models
#
# Runs `src.train_gnn` once per (dataset, K, seed) combination as a child
# process, printing a one-line status per run and a summary at the end.
# A failed run is counted and reported but does not stop the sweep.
import itertools
import subprocess
import sys
from datetime import datetime

datasets = ['Cora', 'PubMed', 'Roman-empire', 'Minesweeper']
K_values = list(range(9))  # 0-8
seeds = [0, 1, 2, 3]

total = len(datasets) * len(K_values) * len(seeds)
completed = 0
failed = 0

start_time = datetime.now()
print(f'Starting training of {total} models...')
print(f'Start time: {start_time.strftime("%H:%M:%S")}')
print('=' * 60)

runs = itertools.product(datasets, K_values, seeds)
for run_index, (dataset, K, seed) in enumerate(runs, start=1):
    # Launch with the kernel's own interpreter (the one the dependencies were
    # installed into) and pass arguments as a list so no shell is involved.
    cmd = [
        sys.executable, '-m', 'src.train_gnn',
        '--dataset', dataset,
        '--model', 'GCN',
        '--K', str(K),
        '--seed', str(seed),
    ]

    # run_index counts every attempt, so the progress label keeps advancing
    # after a failure; flush so the label is visible while the run is going.
    print(f'\n[{run_index}/{total}] {dataset} K={K} seed={seed}...', end=' ', flush=True)

    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode == 0:
        print('✓')
        completed += 1
    else:
        print('✗')
        failed += 1
        # Print the tail of stderr for debugging
        if result.stderr:
            print(f'  Error: {result.stderr[-200:]}')

end_time = datetime.now()
duration = (end_time - start_time).total_seconds() / 60

print('\n' + '=' * 60)
print(f'COMPLETE: {completed}/{total} successful, {failed} failed')
print(f'Duration: {duration:.1f} minutes')
print(f'End time: {end_time.strftime("%H:%M:%S")}')
print('=' * 60)

In [None]:
# Training complete!
#
# Prints the closing banner and the follow-up steps to run locally.
closing_lines = (
    '✓ Training complete!',
    'Results saved in results/runs/',
    '\nNext steps (run locally):',
    '1. Extract embeddings',
    '2. Run probing',
    '3. Generate plots',
)
for line in closing_lines:
    print(line)