# Hyperparameter Sweep — Entropy Selection Project

Runs `scripts/run_sweep.py` on an A100 GPU. Results are saved to Google Drive.

**Workflow:**
1. Install dependencies
2. Clone / pull repo from GitHub
3. Mount Google Drive (results saved there, survives session restarts)
4. Configure and dry-run the sweep
5. Run the sweep (use `--skip-existing` to resume across sessions)
6. Select best hyperparams → final evaluation

In [None]:
# ── 1. Install dependencies ──────────────────────────────────────────
# Installs the pinned PyTorch build, upgrades libstdc++, installs PyTorch
# Geometric plus its compiled extensions, then prints the resulting versions.
# Lines starting with `!` are IPython shell escapes, so this cell only runs
# inside a notebook kernel.
print('Installing PyTorch 2.5.1 with CUDA 12.4...')
!pip install -q torch==2.5.1 --index-url https://download.pytorch.org/whl/cu124

print('Updating system libraries...')
# All three apt steps discard stdout and stderr, so a failure here is silent.
# NOTE(review): presumably the libstdc++6 upgrade is needed by the compiled
# extension wheels installed below — confirm it is still required.
!sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y > /dev/null 2>&1
!sudo apt-get update > /dev/null 2>&1
!sudo apt-get install --only-upgrade libstdc++6 -y > /dev/null 2>&1

print('Installing PyTorch Geometric...')
!pip install -q torch-geometric
# NOTE(review): the wheel index below is for torch 2.5.0 while torch 2.5.1 is
# installed above — confirm these wheels are compatible with the pinned build.
!pip install -q torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-2.5.0+cu124.html
!pip install -q filelock

# Import the freshly installed packages so a broken install fails here, and
# report the versions / GPU that the rest of the notebook will use.
import torch, torch_geometric
print(f'\n✓ PyTorch: {torch.__version__}')
print(f'✓ PyG: {torch_geometric.__version__}')
print(f'✓ CUDA: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'✓ GPU: {torch.cuda.get_device_name(0)}')

In [None]:
# ── 2. Clone / pull repo ─────────────────────────────────────────────
import os, sys

GITHUB_REPO = 'econci474/GDL'
CLONE_DIR   = 'entropy-selection'

if os.path.exists(f'{CLONE_DIR}/src'):
    print('Repo already present — pulling latest...')
    !git -C {CLONE_DIR} pull
else:
    # Add GITHUB_TOKEN to Colab Secrets (left panel > key icon) for private repos
    try:
        from google.colab import userdata
        token = userdata.get('GITHUB_TOKEN')
        clone_url = f'https://{token}@github.com/{GITHUB_REPO}.git'
        print('Cloning with token...')
    except Exception:
        clone_url = f'https://github.com/{GITHUB_REPO}.git'
        print('Cloning public...')
    !git clone {clone_url} {CLONE_DIR}

os.chdir(CLONE_DIR)
sys.path.insert(0, os.getcwd())
print(f'\n✓ Working directory: {os.getcwd()}')

In [None]:
# ── 3. Mount Google Drive and redirect results ───────────────────────
# Results are written here after EVERY run, so they survive session restarts.
# NOTE: every relative path below ('results', 'results/runs', ...) resolves
# against the current working directory — presumably the repo root set by the
# clone cell; confirm before running this cell on its own.
from google.colab import drive
drive.mount('/content/drive')

import os
DRIVE_RESULTS = '/content/drive/MyDrive/GDL/sweep_results'
# Create the Drive-side folder up front so the symlink target exists.
os.makedirs(DRIVE_RESULTS, exist_ok=True)

# Symlink results/ -> Drive
if not os.path.islink('results'):
    if os.path.exists('results'):
        # WARNING: a real (non-symlink) results/ entry is deleted outright,
        # including anything already inside it.
        !rm -rf results
    !ln -s {DRIVE_RESULTS} results
    print(f'✓ results/ -> {DRIVE_RESULTS}')
else:
    # Already a symlink: leave it alone and just report where it points.
    print(f'✓ results/ already linked to {os.readlink("results")}')

# Created through the results/ symlink, so these folders live on Drive.
for d in ['results/runs', 'results/classifier_heads', 'results/tables', 'results/figures']:
    os.makedirs(d, exist_ok=True)
print('✓ Drive mounted and results directory ready')

In [None]:
# ── 4. Sweep configuration ───────────────────────────────────────────
# Estimated runtime: ~35 hrs on A100 with these settings.
# Use --skip-existing to resume safely across sessions.

# Multi-valued settings are single space-separated strings.
DATASETS = 'Cora PubMed Roman-empire Squirrel'
MODELS = 'GCN'
LOSS_TYPES = 'ce_only ce_plus_R'
K_VALUES = 'all'  # K=1..8
SEEDS = '0 1 2'
SPLIT_MODE = 'first'  # split 0 only for heterophilous (faster)

# Echo the configuration (plus the rough cost estimate) in one go so it is
# visible in the cell output before anything is launched.
summary_lines = [
    'Sweep configuration:',
    f'  Datasets:   {DATASETS}',
    f'  Models:     {MODELS}',
    f'  Loss types: {LOSS_TYPES}',
    f'  K values:   {K_VALUES}',
    f'  Seeds:      {SEEDS}',
    f'  Split mode: {SPLIT_MODE}',
    '',
    'Estimated total runs: ~12,480',
    'Estimated runtime:    ~35 hrs on A100',
    'Use --skip-existing to resume if session expires.',
]
print('\n'.join(summary_lines))

In [None]:
# ── 5. Dry run — preview commands ────────────────────────────────────
import subprocess, sys

def build_sweep_cmd(dry_run=True, skip_existing=False):
    """Assemble the argument list that launches ``scripts/run_sweep.py``.

    The sweep settings are read from the module-level configuration strings
    (DATASETS, MODELS, LOSS_TYPES, K_VALUES, SEEDS, SPLIT_MODE); the
    space-separated ones are split into individual arguments.

    Args:
        dry_run: When true, append ``--dry-run``.
        skip_existing: When true, append ``--skip-existing``.

    Returns:
        A list of strings suitable for ``subprocess.run``, starting with the
        current Python interpreter.
    """
    argv = [sys.executable, 'scripts/run_sweep.py']

    # Each of these flags is followed by one argument per whitespace-separated
    # token of its configuration string.
    multi_value_options = (
        ('--datasets', DATASETS),
        ('--models', MODELS),
        ('--loss-types', LOSS_TYPES),
        ('--K-values', K_VALUES),
        ('--seeds', SEEDS),
    )
    for flag, values in multi_value_options:
        argv.append(flag)
        argv.extend(values.split())

    # The split mode is passed through as a single value, unsplit.
    argv.extend(['--split-mode', SPLIT_MODE])

    # Boolean switches are appended only when enabled, in this fixed order.
    switches = (('--dry-run', dry_run), ('--skip-existing', skip_existing))
    argv.extend(flag for flag, enabled in switches if enabled)
    return argv

# Build the command with --dry-run appended, show it, then execute it.
dry_cmd = build_sweep_cmd(dry_run=True)
# The doubled newline leaves one blank line between the echoed command and
# whatever the child process prints.
print('Command:', ' '.join(dry_cmd), end='\n\n')
# Kept as the cell's final expression so the notebook also displays the
# returned CompletedProcess.
subprocess.run(dry_cmd)

In [None]:
# ── 6. Run the sweep ─────────────────────────────────────────────────
# ⚠️  Verify the dry run output above before running this cell!
#
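# RESUMING after a session restart:
#   - Re-run cells 1-5 first: reinstall deps, pull latest code, remount Drive,
#     reload the sweep configuration and redefine build_sweep_cmd (a restart
#     clears all notebook variables, and this cell needs both)
#   - Then run this cell — --skip-existing will skip already-completed runs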

import subprocess, sys, time

# Build the real command (no --dry-run); --skip-existing is always passed so
# this cell can be re-run to continue an interrupted sweep.
cmd = build_sweep_cmd(dry_run=False, skip_existing=True)
print('Starting sweep...')
print('Command:', ' '.join(cmd))
print()

# Use a monotonic clock: the run can last many hours and the elapsed time
# should not be affected by wall-clock adjustments.
t0 = time.monotonic()
result = subprocess.run(cmd)
elapsed = (time.monotonic() - t0) / 60  # minutes

# Only report success when the sweep process actually exited cleanly;
# otherwise a crashed sweep would be announced as complete.
if result.returncode == 0:
    print(f'\n✓ Sweep complete in {elapsed:.1f} min')
else:
    print(f'\n✗ Sweep FAILED after {elapsed:.1f} min — check the output above, '
          'then re-run this cell to resume')
print(f'Exit code: {result.returncode}')

In [None]:
# ── 7. Check sweep progress ──────────────────────────────────────────
# Run this any time to see how many runs have completed.
import pandas as pd
from pathlib import Path

# Summarise whatever has been recorded in the sweep results CSV so far.
sweep_csv = Path('results/sweep_results.csv')
if not sweep_csv.exists():
    # Nothing recorded yet.
    print('No sweep_results.csv yet — sweep has not started.')
else:
    df = pd.read_csv(sweep_csv)

    # Overall count, then the distinct values seen for each sweep dimension.
    print(f'Completed runs: {len(df)}')
    print(f'Datasets:  {sorted(df["dataset"].unique().tolist())}')
    print(f'Models:    {sorted(df["model"].unique().tolist())}')
    print(f'Loss types:{sorted(df["loss_type"].unique().tolist())}')

    # Row count for every (dataset, model, loss_type) combination.
    runs_per_config = df.groupby(['dataset','model','loss_type']).size()
    print(f'\nRuns per (dataset, model, loss_type):')
    print(runs_per_config.to_string())

In [None]:
# ── 8. Select best hyperparameters ───────────────────────────────────
# Run AFTER the sweep is complete.
import subprocess, sys, pandas as pd
from pathlib import Path

# Run the selection script as a child process. Its output is not captured,
# so it goes wherever this process's stdout/stderr go.
# NOTE(review): 'first' is hard-coded here; keep it in sync with SPLIT_MODE
# in the sweep configuration cell.
result = subprocess.run(
    [sys.executable, 'src/select_hyperparams.py', '--hetero-split-mode', 'first'],
    capture_output=False
)
print(f'Exit code: {result.returncode}')

best_csv = Path('results/best_hyperparams.csv')
if best_csv.exists():
    if result.returncode != 0:
        # The CSV can be left over from an earlier run, so a failed selection
        # must not be mistaken for fresh results.
        print('\n⚠️  Selection script exited with an error — the table below '
              'comes from an earlier run and may be stale.')
    best = pd.read_csv(best_csv)
    print(f'\nBest hyperparams ({len(best)} configs):')
    # Show only the identifying columns, the chosen hyperparameters and the
    # validation loss they were selected on.
    print(best[['dataset','model','loss_type','lr','weight_decay','hidden_dim','total_val_loss']].to_string(index=False))

In [None]:
# ── 9. Final test evaluation ─────────────────────────────────────────
# ⚠️  Run ONCE after selecting best hyperparams.
# This touches the test set — do not use for hyperparameter decisions!
import subprocess, sys, pandas as pd
from pathlib import Path

# Run the final evaluation script as a child process. Its output is not
# captured, so it goes wherever this process's stdout/stderr go.
# NOTE(review): seeds, K values and split mode are hard-coded here; keep them
# in sync with SEEDS, K_VALUES and SPLIT_MODE in the sweep configuration cell.
result = subprocess.run(
    [sys.executable, 'src/evaluate_final.py',
     '--from-best-hyperparams',
     '--seeds', '0', '1', '2',
     '--K-values', 'all',
     '--split-mode', 'first'],
    capture_output=False
)
print(f'Exit code: {result.returncode}')

final_csv = Path('results/tables/final_results.csv')
if final_csv.exists():
    if result.returncode != 0:
        # The CSV can be left over from an earlier run, so a failed
        # evaluation must not be mistaken for fresh test results.
        print('\n⚠️  Evaluation script exited with an error — the table below '
              'comes from an earlier run and may be stale.')
    final = pd.read_csv(final_csv)
    print(f'\nFinal results ({len(final)} rows):')
    # One row per (dataset, model, loss_type, K, seed), ordered for reading.
    print(final[['dataset','model','loss_type','K','seed','test_acc']]
          .sort_values(['dataset','model','loss_type','K'])
          .to_string(index=False))