In [None]:
# Remote Training on SSH with CUDA - Hydra Sweep

This notebook lets you launch a **Hydra multirun sweep** on an SSH machine with CUDA and safely disconnect while training continues.

**Sweep Configuration:**
- `model.hidden_channels`: [32, 64, 128, 256, 512]
- `model.conv_types`: 1-layer (GEN, GCN, GAT) and 2-layer combinations

**How it works:**
- Cell 2: Launches sweep as a background process with `nohup` (survives SSH disconnect)
- Cell 3: Monitor recent sweep logs
- Cell 4: Check GPU utilization
- Cell 5: Stop sweep if needed
- Cell 6: View sweep results summary

**Workflow:**
1. Run Cell 2 to start sweep
2. Disconnect from SSH/close notebook - sweep continues!
3. Reconnect later and run Cell 3 to check progress
4. Check results in `multirun/` directory when done

In [1]:
# This is for sweep

import os
from datetime import datetime

# Create logs directory if it doesn't exist
os.makedirs('logs', exist_ok=True)

# Generate unique log filename with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = f"logs/sweep_{timestamp}.log"

print(f"Starting Hydra sweep in background...")
print(f"Logs will be written to: {log_file}")

# Hydra multirun command with sweep parameters
# -m flag enables multirun mode
# Sweep over hidden_channels and conv_types
cmd = (
    f"nohup python main.py -m "
    f"'training.loss.adaptive.strategy=equal_init, equal_init_ema, ema, fixed' "
    f"&> {log_file} &"
)

print(f"\nCommand: {cmd}")
os.system(cmd)

print("\n✓ Hydra sweep launched in background!")
print(f"\nUseful commands:")
print(f"  Monitor live:     tail -f {log_file}")
print(f"  Check status:     ps aux | grep 'python main.py'")
print(f"  Kill if needed:   pkill -f 'python main.py'")
print(f"  GPU usage:        nvidia-smi")
print(f"  Results:          ls -lh multirun/")
print(f"\nResults will be saved in: multirun/YYYY-MM-DD/HH-MM-SS/")

Starting Hydra sweep in background...
Logs will be written to: logs/sweep_20251028_112301.log

Command: nohup python main.py -m 'training.loss.adaptive.strategy=equal_init, equal_init_ema, ema, fixed' &> logs/sweep_20251028_112301.log &

✓ Hydra sweep launched in background!

Useful commands:
  Monitor live:     tail -f logs/sweep_20251028_112301.log
  Check status:     ps aux | grep 'python main.py'
  Kill if needed:   pkill -f 'python main.py'
  GPU usage:        nvidia-smi
  Results:          ls -lh multirun/

Results will be saved in: multirun/YYYY-MM-DD/HH-MM-SS/
[2025-10-28 11:23:04,161][HYDRA] Launching 4 jobs locally
[2025-10-28 11:23:04,162][HYDRA] 	#0 : training.loss.adaptive.strategy=equal_init
[2025-10-28 11:23:04,257][__main__][INFO] - CONFIGURATION
[2025-10-28 11:23:04,260][__main__][INFO] - 
dataset:
  num_graphs: 5000
  num_steps: 6
  dt: 0.01
  train_ratio: 0.8
  batch_size: 4
  shuffle: true
  drop_last: false
  wave_speed: 1.0
  damping: 1.0
  num_nodes: 100
  domain

In [2]:
# Monitor the most recent sweep log
import glob
import time

# Find most recent log file
log_files = sorted(glob.glob('logs/sweep_*.log'))
if log_files:
    latest_log = log_files[-1]
    print(f"Monitoring: {latest_log}\n")
    print("="*70)
    
    # Display last 50 lines to see multiple runs
    with open(latest_log, 'r') as f:
        lines = f.readlines()
        for line in lines[-50:]:
            print(line.rstrip())
    
    print("\n" + "="*70)
    print(f"Total lines in log: {len(lines)}")
else:
    print("No sweep logs found yet.")

Monitoring: logs/sweep_20251028_103052.log


Total lines in log: 0
[2025-10-28 10:30:56,589][HYDRA] Launching 30 jobs locally
[2025-10-28 10:30:56,589][HYDRA] 	#0 : model.hidden_channels=32 model.conv_types=GEN
[2025-10-28 10:30:56,680][__main__][INFO] - CONFIGURATION
[2025-10-28 10:30:56,684][__main__][INFO] - 
dataset:
  num_graphs: 100
  num_steps: 6
  dt: 0.1
  train_ratio: 0.8
  batch_size: 4
  shuffle: true
  drop_last: false
  wave_speed: 1.0
  damping: 1.0
  num_nodes: 100
  domain_length: 1.0
  force:
    margin: 0.1
    sign: -1.0
    location: casual
    forcing_type: middle
  scaling:
    enabled: false
    method: standard
    per_feature: true
    epsilon: 1.0e-08
model:
  in_channels: 3
  hidden_channels: 32
  out_channels: 2
  conv_types: GEN
  final_layer_type: Linear
  activation: relu
  dropout: 0.2
  block: res
  use_bn: true
  gat_heads: 4
  cheb_K: 3
  residual: true
  use_global_pooling: true
  pooling_position: middle
  pooling_type: attention
  encoder_layers: 

In [6]:
# Check GPU utilization
import subprocess

result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
print(result.stdout)

Mon Oct 27 19:41:42 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.230.02             Driver Version: 535.230.02   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off | 00000000:01:00.0 Off |                  N/A |
| 30%   51C    P2             123W / 370W |    372MiB / 24576MiB |     15%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
import subprocess

print("Stopping all training processes...")
subprocess.run(['pkill', '-f', 'python main.py'])
print("✓ Training processes stopped")

# Verify
result = subprocess.run(['pgrep', '-f', 'python main.py'], capture_output=True)
if result.stdout:
    print("⚠ Some processes still running, try again or use: kill -9 <PID>")
else:
    print("✓ All training processes terminated")

Stopping all training processes...
✓ Training processes stopped
✓ All training processes terminated
