In [None]:
# Remote Training on SSH with CUDA - Hydra Sweep

This notebook lets you launch a **Hydra multirun sweep** on an SSH machine with CUDA and safely disconnect while training continues.

**Sweep Configuration:**
- `model.hidden_channels`: [32, 64, 128, 256, 512]
- `model.conv_types`: 1-layer (GEN, GCN, GAT) and 2-layer combinations

**How it works:**
- Cell 2: Launches sweep as a background process with `nohup` (survives SSH disconnect)
- Cell 3: Monitor recent sweep logs
- Cell 4: Check GPU utilization
- Cell 5: Stop sweep if needed
- Cell 6: View sweep results summary

**Workflow:**
1. Run Cell 2 to start sweep
2. Disconnect from SSH/close notebook - sweep continues!
3. Reconnect later and run Cell 3 to check progress
4. Check results in `multirun/` directory when done

In [3]:
# This is for sweep

import os
from datetime import datetime

# Create logs directory if it doesn't exist
os.makedirs('logs', exist_ok=True)

# Generate unique log filename with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = f"logs/sweep_{timestamp}.log"

print(f"Starting Hydra sweep in background...")
print(f"Logs will be written to: {log_file}")

# Hydra multirun command with sweep parameters
# -m flag enables multirun mode
# Sweep over hidden_channels and conv_types
cmd = (
    f"nohup python main.py"
#    f"-m training.loss.adaptive.strategy=fixed,equal_init,equal_init_ema,ema "
#    f"training.loss.use_pi=true,false "
#    f"training.loss.use_rk4=true,false "
#    f"training.loss.use_energy=true,false "
    f"&> {log_file} &"
)

print(f"\nCommand: {cmd}")
os.system(cmd)

print("\n✓ Hydra sweep launched in background!")
print(f"\nUseful commands:")
print(f"  Monitor live:     tail -f {log_file}")
print(f"  Check status:     ps aux | grep 'python main.py'")
print(f"  Kill if needed:   pkill -f 'python main.py'")
print(f"  GPU usage:        nvidia-smi")
print(f"  Results:          ls -lh multirun/")
print(f"\nResults will be saved in: multirun/YYYY-MM-DD/HH-MM-SS/")

Starting Hydra sweep in background...
Logs will be written to: logs/sweep_20251029_102245.log

Command: nohup python main.py&> logs/sweep_20251029_102245.log &

✓ Hydra sweep launched in background!

Useful commands:
  Monitor live:     tail -f logs/sweep_20251029_102245.log
  Check status:     ps aux | grep 'python main.py'
  Kill if needed:   pkill -f 'python main.py'
  GPU usage:        nvidia-smi
  Results:          ls -lh multirun/

Results will be saved in: multirun/YYYY-MM-DD/HH-MM-SS/
[2025-10-29 10:22:47,239][__main__][INFO] - CONFIGURATION
[2025-10-29 10:22:47,243][__main__][INFO] - 
dataset:
  h5_path: data/simulations.h5
  num_samples: 100
  num_timesteps: 200
  window_length: 200
  train_ratio: 0.8
  batch_size: 4
  shuffle: true
  drop_last: false
  num_graphs: 5000
  num_steps: 6
  dt: 0.01
  wave_speed: 1.0
  damping: 1.0
  num_nodes: 100
  domain_length: 1.0
  force:
    margin: 0.1
    sign: -1.0
    location: casual
    forcing_type: middle
  scaling:
    enabled: fa

Error executing job with overrides: []
Traceback (most recent call last):
  File "/home/jupyter-gcap/wave_gnn/main.py", line 169, in main
    train_set, val_set = create_and_save_dataset(cfg, output_dir, plot_dataset=cfg.plot.plot_dataset, run_name=run_name)
  File "/home/jupyter-gcap/wave_gnn/main.py", line 88, in create_and_save_dataset
    dataset = H5RandomTimeStepDataset(
  File "/home/jupyter-gcap/wave_gnn/random_window_dataset.py", line 44, in __init__
    with h5py.File(self.h5_path, 'r') as f:
  File "/home/jupyter-gcap/.local/lib/python3.10/site-packages/h5py/_hl/files.py", line 566, in __init__
    fid = make_fid(name, mode, userblock_size, fapl, fcpl, swmr=swmr)
  File "/home/jupyter-gcap/.local/lib/python3.10/site-packages/h5py/_hl/files.py", line 241, in make_fid
    fid = h5f.open(name, flags, fapl=fapl)
  File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
  File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
  File "h5py/h5f.pyx", li

In [None]:
# Monitor the most recent sweep log
import glob
import time

# Find most recent log file
log_files = sorted(glob.glob('logs/sweep_*.log'))
if log_files:
    latest_log = log_files[-1]
    print(f"Monitoring: {latest_log}\n")
    print("="*70)
    
    # Display last 50 lines to see multiple runs
    with open(latest_log, 'r') as f:
        lines = f.readlines()
        for line in lines[-50:]:
            print(line.rstrip())
    
    print("\n" + "="*70)
    print(f"Total lines in log: {len(lines)}")
else:
    print("No sweep logs found yet.")

In [None]:
# Check GPU utilization
import subprocess

result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
print(result.stdout)

In [2]:
import subprocess

print("Stopping all training processes...")
subprocess.run(['pkill', '-f', 'python main.py'])
print("✓ Training processes stopped")

# Verify
result = subprocess.run(['pgrep', '-f', 'python main.py'], capture_output=True)
if result.stdout:
    print("⚠ Some processes still running, try again or use: kill -9 <PID>")
else:
    print("✓ All training processes terminated")

Stopping all training processes...
✓ Training processes stopped
✓ All training processes terminated
