# Hyperparameter Sweep — Entropy Selection Project

Runs `scripts/run_sweep.py` on an A100 GPU. Results are saved to Google Drive.

**Workflow:**
1. Install dependencies
2. Clone repo from GitHub
3. Mount Google Drive (results saved there)
4. Configure and run the sweep
5. (Optional) Select best hyperparams and run final evaluation

In [None]:
# ── 1. Install dependencies ──────────────────────────────────────────
# Installs a pinned PyTorch build, upgrades libstdc++6, installs PyTorch
# Geometric plus its compiled extensions, then prints the installed versions.
# NOTE: lines starting with `!` are IPython shell escapes, so this cell only
# runs inside a notebook kernel, not as a plain Python script.
print('Installing PyTorch 2.5.1 with CUDA 12.4...')
# The extra index URL selects the cu124 wheel channel.
!pip install -q torch==2.5.1 --index-url https://download.pytorch.org/whl/cu124

print('Updating system libraries...')
# All three commands discard stdout and stderr, so a failure here is silent.
# NOTE(review): presumably the newer libstdc++6 is needed by the prebuilt
# torch-scatter / torch-sparse wheels installed below — confirm.
!sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y > /dev/null 2>&1
!sudo apt-get update > /dev/null 2>&1
!sudo apt-get install --only-upgrade libstdc++6 -y > /dev/null 2>&1

print('Installing PyTorch Geometric...')
!pip install -q torch-geometric
# NOTE(review): the wheel index below is named torch-2.5.0+cu124 while
# torch 2.5.1 is pinned above; presumably the 2.5.x wheels are shared — confirm.
!pip install -q torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-2.5.0+cu124.html
!pip install -q filelock

# Import after the install commands and report what is actually available.
import torch
import torch_geometric
print(f'\n✓ PyTorch: {torch.__version__}')
print(f'✓ PyG: {torch_geometric.__version__}')
print(f'✓ CUDA: {torch.cuda.is_available()}')
# The device name is only queried when CUDA is available.
if torch.cuda.is_available():
    print(f'✓ GPU: {torch.cuda.get_device_name(0)}')

In [None]:
# ── 2. Clone repo ────────────────────────────────────────────────────
import os, sys

GITHUB_REPO = 'econci474/GDL'  # <-- your repo
CLONE_DIR   = 'entropy-selection'

if os.path.exists(f'{CLONE_DIR}/src'):
    print('Repo already cloned — pulling latest...')
    !git -C {CLONE_DIR} pull
else:
    # Use Colab Secrets: Secrets > Add > Name: GITHUB_TOKEN, Value: your PAT
    try:
        from google.colab import userdata
        token = userdata.get('GITHUB_TOKEN')
        clone_url = f'https://{token}@github.com/{GITHUB_REPO}.git'
        print('Cloning with token...')
    except Exception:
        clone_url = f'https://github.com/{GITHUB_REPO}.git'
        print('Cloning public...')
    !git clone {clone_url} {CLONE_DIR}

os.chdir(CLONE_DIR)
sys.path.insert(0, os.getcwd())
print(f'\n✓ Working directory: {os.getcwd()}')
!ls -1

In [None]:
# ── 3. Mount Google Drive and redirect results ───────────────────────
# Mounts Drive, then replaces the local `results` path with a symlink to a
# Drive folder and creates the expected subdirectories inside it.
# `results` is a relative path, so this cell acts on whatever the current
# working directory is (the clone cell chdirs into the repo).
from google.colab import drive
drive.mount('/content/drive')

import os
DRIVE_RESULTS = '/content/drive/MyDrive/GDL/sweep_results'
os.makedirs(DRIVE_RESULTS, exist_ok=True)

# Symlink results/ -> Drive so checkpoints and CSVs persist
if not os.path.islink('results'):
    if os.path.exists('results'):
        # A real `results` path is copied to Drive and then removed.
        # NOTE: cp errors are discarded (`2>/dev/null || true`) and the
        # removal runs regardless, so a failed backup loses the local copy.
        !cp -r results {DRIVE_RESULTS}/results_backup 2>/dev/null || true
        !rm -rf results
    !ln -s {DRIVE_RESULTS} results
    print(f'✓ results/ -> {DRIVE_RESULTS}')
else:
    # Already a symlink (e.g. the cell was run before): just report its target.
    print(f'✓ results/ already symlinked to {os.readlink("results")}')

# Create required subdirectories
# These are created through the symlink, i.e. inside the link's target.
for d in ['results/runs', 'results/classifier_heads', 'results/tables', 'results/figures']:
    os.makedirs(d, exist_ok=True)
print('✓ Drive mounted and results directory ready')

In [None]:
# ── 4. Sweep configuration ───────────────────────────────────────────
# Settings that decide what the sweep covers — edit before running.
# A value of 'all' sweeps everything defined in config.py; otherwise give a
# space-separated list.

DATASETS = 'all'        # e.g. 'Cora PubMed'
MODELS = 'GCN'          # 'all' means GCN GAT GraphSAGE
LOSS_TYPES = 'all'      # e.g. 'ce_only weighted_ce'
K_VALUES = 'all'        # e.g. '2 4 6 8'
SEEDS = '0 1'           # 'all' means seeds 0-3
SPLIT_MODE = 'first'    # 'first' = split 0 only for hetero (faster)

# Start with a dry run to see what would be executed.
DRY_RUN = True

# Echo the chosen settings as one aligned summary.
_summary_lines = [
    'Sweep configuration:',
    f'  Datasets:   {DATASETS}',
    f'  Models:     {MODELS}',
    f'  Loss types: {LOSS_TYPES}',
    f'  K values:   {K_VALUES}',
    f'  Seeds:      {SEEDS}',
    f'  Split mode: {SPLIT_MODE}',
    f'  Dry run:    {DRY_RUN}',
]
print('\n'.join(_summary_lines))

In [None]:
# ── 5. Dry run — preview commands ────────────────────────────────────
import subprocess, sys

def build_sweep_cmd(dry_run=True):
    """Assemble the argv list that launches ``scripts/run_sweep.py``.

    The multi-valued options are read from the module-level configuration
    strings (DATASETS, MODELS, LOSS_TYPES, K_VALUES, SEEDS); each string is
    split on whitespace into separate arguments. SPLIT_MODE is passed as a
    single value.

    Args:
        dry_run: When true, ``--dry-run`` is appended as the last argument.

    Returns:
        A list of strings suitable for ``subprocess.run``.
    """
    argv = [sys.executable, 'scripts/run_sweep.py']

    # (flag, space-separated values) pairs, in the order they are emitted.
    multi_valued = (
        ('--datasets', DATASETS),
        ('--models', MODELS),
        ('--loss-types', LOSS_TYPES),
        ('--K-values', K_VALUES),
        ('--seeds', SEEDS),
    )
    for flag, spec in multi_valued:
        argv.append(flag)
        argv.extend(spec.split())

    argv.extend(['--split-mode', SPLIT_MODE])
    if dry_run:
        argv.append('--dry-run')
    return argv

# Build the preview command (with --dry-run), show it, then execute it.
dry_cmd = build_sweep_cmd(dry_run=True)
print('Command:', ' '.join(dry_cmd), end='\n\n')

# Output is not captured, so the script's stdout/stderr are passed through.
result = subprocess.run(dry_cmd)
print(f'\nDry run exit code: {result.returncode}')

In [None]:
# ── 6. Run the sweep ─────────────────────────────────────────────────
# ⚠️  Only run this after verifying the dry run output above!
# This will take a long time. Results are saved to Drive as each run completes.

import subprocess, sys, time

# Build the real command (no --dry-run) from the configuration settings.
cmd = build_sweep_cmd(dry_run=False)
print('Starting sweep...')
print('Command:', ' '.join(cmd))
print()

# Use the monotonic clock for the duration: the wall clock may be adjusted
# while a long sweep is running.
t0 = time.monotonic()
result = subprocess.run(cmd)
elapsed = (time.monotonic() - t0) / 60

# Only announce success when the sweep script actually exited cleanly.
if result.returncode == 0:
    print(f'\n✓ Sweep complete in {elapsed:.1f} min')
else:
    print(f'\n✗ Sweep FAILED after {elapsed:.1f} min — check the output above')
print(f'Exit code: {result.returncode}')

In [None]:
# ── 7. Check sweep results ───────────────────────────────────────────
import pandas as pd
from pathlib import Path

# Summarise results/sweep_results.csv if it exists: row count, which
# datasets / models / loss types appear, and the lowest best_val_loss per
# (dataset, model, loss_type) group.
sweep_csv = Path('results/sweep_results.csv')
if not sweep_csv.exists():
    print('No sweep_results.csv yet — run the sweep first.')
else:
    df = pd.read_csv(sweep_csv)
    print(f'Sweep results: {len(df)} rows')

    # Distinct values seen in each identifying column.
    print(f'\nDatasets covered: {df["dataset"].unique().tolist()}')
    print(f'Models covered:   {df["model"].unique().tolist()}')
    print(f'Loss types:       {df["loss_type"].unique().tolist()}')

    print(f'\nBest val losses per (dataset, model, loss_type):')
    _grouped = df.groupby(['dataset','model','loss_type'])
    _lowest_val_loss = _grouped['best_val_loss'].min()
    print(_lowest_val_loss.to_string())

In [None]:
# ── 8. Select best hyperparameters ───────────────────────────────────
# Run after the sweep to pick the best config per (dataset, model, loss_type)

# Run the selection script with the current interpreter. Output is not
# captured, so the script's stdout/stderr are passed through.
# NOTE(review): the split mode is hard-coded to 'first' here rather than taken
# from SPLIT_MODE in the configuration cell — keep the two in sync.
result = subprocess.run(
    [sys.executable, 'src/select_hyperparams.py', '--hetero-split-mode', 'first'],
    capture_output=False
)
print(f'\nExit code: {result.returncode}')

best_csv = Path('results/best_hyperparams.csv')
if result.returncode != 0:
    # A CSV found after a failed run was written by some earlier run, so
    # showing it would present old selections as if they were current.
    print('\n✗ Selection failed — not displaying best_hyperparams.csv (it may be stale).')
elif best_csv.exists():
    best = pd.read_csv(best_csv)
    print(f'\nBest hyperparams ({len(best)} configs):')
    print(best[['dataset','model','loss_type','lr','weight_decay','hidden_dim','total_val_loss']].to_string(index=False))
else:
    # Make the "nothing to show" case explicit instead of printing nothing.
    print(f'\nNo {best_csv} found — nothing to display.')

In [None]:
# ── 9. Final test evaluation (run ONCE after hyperparameter selection) ─
# ⚠️  Only run this AFTER selecting best hyperparams.
# This evaluates on the test set — do not use for hyperparameter decisions!

# Run the final evaluation script with the current interpreter. Output is not
# captured, so the script's stdout/stderr are passed through.
# NOTE(review): '--split-mode' is hard-coded to 'first' here rather than taken
# from SPLIT_MODE in the configuration cell — keep the two in sync.
result = subprocess.run(
    [sys.executable, 'src/evaluate_final.py',
     '--from-best-hyperparams',
     '--seeds', 'all',
     '--K-values', 'all',
     '--split-mode', 'first'],
    capture_output=False
)
print(f'\nExit code: {result.returncode}')

final_csv = Path('results/tables/final_results.csv')
if result.returncode != 0:
    # A CSV found after a failed run was written by some earlier run, so
    # showing it would present old test numbers as if they were current.
    print('\n✗ Final evaluation failed — not displaying final_results.csv (it may be stale).')
elif final_csv.exists():
    final = pd.read_csv(final_csv)
    print(f'\nFinal results ({len(final)} rows):')
    print(final[['dataset','model','loss_type','K','seed','test_acc']]
          .sort_values(['dataset','model','loss_type','K'])
          .to_string(index=False))
else:
    # Make the "nothing to show" case explicit instead of printing nothing.
    print(f'\nNo {final_csv} found — nothing to display.')