In [None]:
!pip install transformers datasets torch seaborn scikit-learn

# Mount Google Drive so results persist across disconnects
from google.colab import drive
drive.mount('/content/drive')

import os
RESULTS_DIR = '/content/drive/MyDrive/landscape-probes-results'
os.makedirs(RESULTS_DIR, exist_ok=True)
print(f"Results will be saved to: {RESULTS_DIR}")

In [None]:
import os

if os.path.exists('/content/hedgehog'):
    # Already cloned — pull latest changes
    !cd /content/hedgehog && git pull
else:
    !git clone https://github.com/dbal0503/hedgehog.git /content/hedgehog

In [None]:
import os
os.chdir('/content/hedgehog/landscape-probes')
!pwd

In [6]:
import torch

# Fail early with an actionable message when the runtime has no GPU: the sweep
# cells in this notebook launch the training script with "--device cuda".
if not torch.cuda.is_available():
    raise RuntimeError(
        "No CUDA GPU detected. In Colab, select Runtime > Change runtime type, "
        "choose a GPU accelerator, and re-run this cell."
    )

# Report the name and memory size of GPU 0.
gpu_props = torch.cuda.get_device_properties(0)
print(torch.cuda.get_device_name(0))
# total_memory is reported in bytes; dividing by 1e9 gives decimal gigabytes.
print(f"VRAM: {gpu_props.total_memory / 1e9:.1f} GB")

Tesla T4
VRAM: 15.8 GB


In [None]:
# Run the SST-2 sweep for seeds 1-5, skipping seeds whose results file is
# already complete and resuming seeds that were interrupted part-way.
import os, json, subprocess

RESULTS_DIR = '/content/drive/MyDrive/landscape-probes-results'
# Number of entries a results file must hold for a seed to count as finished.
EXPECTED_CONFIGS = 16

for seed in range(1, 6):
    results_file = f'{RESULTS_DIR}/sst2_seed{seed}_results.json'

    # Inspect any existing results file to decide between skip / resume / fresh run.
    resume_path = None
    if os.path.exists(results_file):
        with open(results_file) as f:
            existing = json.load(f)
        if len(existing) >= EXPECTED_CONFIGS:
            print(f"Seed {seed} already complete ({len(existing)} configs), skipping.")
            continue
        print(f"Seed {seed} has {len(existing)}/{EXPECTED_CONFIGS} configs, resuming...")
        resume_path = results_file

    cmd = [
        "python3", "experiments/run_sweep.py",
        "--task", "sst2",
        "--seed", str(seed),
        "--device", "cuda",
        "--batch-size", "32",
        "--output-dir", RESULTS_DIR,
    ]
    if resume_path:
        # Hand the partial file back to the script (presumably so it continues
        # from the configs already recorded; see run_sweep.py).
        cmd += ["--resume", resume_path]

    # Surface failures instead of silently moving on: a crashed run otherwise
    # looks identical to a successful one in this cell's output.
    completed = subprocess.run(cmd)
    if completed.returncode != 0:
        print(
            f"WARNING: sweep for seed {seed} exited with code {completed.returncode}; "
            "results for this seed may be incomplete. Re-run this cell to resume."
        )

In [None]:
# Run multi-step probes sweep (probes measured at 6 points during training)
# Results saved as sst2_seed{N}_multistep_results.json — does NOT overwrite existing results
# Seeds whose multistep file is already complete are skipped; partial ones are resumed.

import os, json, subprocess

RESULTS_DIR = '/content/drive/MyDrive/landscape-probes-results'
PROBE_STEPS = "10,25,50,100,200,400"
# Number of entries a results file must hold for a seed to count as finished.
EXPECTED_CONFIGS = 16

for seed in range(1, 6):
    results_file = f'{RESULTS_DIR}/sst2_seed{seed}_multistep_results.json'

    # Inspect any existing results file to decide between skip / resume / fresh run.
    resume_path = None
    if os.path.exists(results_file):
        with open(results_file) as f:
            existing = json.load(f)
        if len(existing) >= EXPECTED_CONFIGS:
            print(f"Seed {seed} (multistep) already complete ({len(existing)} configs), skipping.")
            continue
        print(f"Seed {seed} (multistep) has {len(existing)}/{EXPECTED_CONFIGS} configs, resuming...")
        resume_path = results_file

    cmd = [
        "python3", "experiments/run_sweep.py",
        "--task", "sst2",
        "--seed", str(seed),
        "--device", "cuda",
        "--batch-size", "32",
        "--output-dir", RESULTS_DIR,
        "--probe-steps", PROBE_STEPS,
    ]
    if resume_path:
        # Hand the partial file back to the script (presumably so it continues
        # from the configs already recorded; see run_sweep.py).
        cmd += ["--resume", resume_path]

    # Surface failures instead of silently moving on: a crashed run otherwise
    # looks identical to a successful one in this cell's output.
    completed = subprocess.run(cmd)
    if completed.returncode != 0:
        print(
            f"WARNING: multistep sweep for seed {seed} exited with code {completed.returncode}; "
            "results for this seed may be incomplete. Re-run this cell to resume."
        )

In [None]:
# Sanity check: the sweeps write straight to Google Drive, so just list each
# JSON file found there together with the number of entries it contains.
import os, json
RESULTS_DIR = '/content/drive/MyDrive/landscape-probes-results'

json_names = sorted(name for name in os.listdir(RESULTS_DIR) if name.endswith('.json'))
for name in json_names:
    with open(f'{RESULTS_DIR}/{name}') as handle:
        n_entries = len(json.load(handle))
    print(f"{name}: {n_entries} configs")

In [None]:
RESULTS_DIR = '/content/drive/MyDrive/landscape-probes-results'
!python3 analysis/correlation_analysis.py --results-dir $RESULTS_DIR

In [None]:
# Download results from Google Drive as a zip (optional backup)
RESULTS_DIR = '/content/drive/MyDrive/landscape-probes-results'
!zip -r /content/results.zip $RESULTS_DIR
from google.colab import files
files.download('/content/results.zip')