In [None]:
# Remote Training on SSH with CUDA

This notebook lets you launch training on an SSH machine with CUDA and safely disconnect while training continues.

**How it works:**
- Cell 2: Launches training as a background process with `nohup` (survives SSH disconnect)
- Cell 3: Monitor recent training logs
- Cell 4: Check GPU utilization
- Cell 5: Stop training if needed

**Workflow:**
1. Run Cell 2 to start training
2. Disconnect from SSH/close notebook - training continues!
3. Reconnect later and run Cell 3 to check progress
4. Check outputs in `checkpoints/` and `outputs/` directories when done

In [11]:
import os
from datetime import datetime

# Create logs directory if it doesn't exist
os.makedirs('logs', exist_ok=True)

# Generate unique log filename with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = f"logs/training_{timestamp}.log"

print(f"Starting training in background...")
print(f"Logs will be written to: {log_file}")
print(f"To monitor progress: tail -f {log_file}")
print(f"To check if still running: ps aux | grep 'python main.py'")

# Launch training as detached background process
# nohup ensures it continues after SSH disconnect
# &> redirects both stdout and stderr to log file
# & runs in background
# The process will continue even if you close this notebook or disconnect SSH
cmd = f"nohup python main.py &> {log_file} &"
os.system(cmd)

print("\n✓ Training launched in background!")
print(f"\nUseful commands:")
print(f"  Monitor live:     tail -f {log_file}")
print(f"  Check status:     ps aux | grep 'python main.py'")
print(f"  Kill if needed:   pkill -f 'python main.py'")
print(f"  GPU usage:        nvidia-smi")

Starting training in background...
Logs will be written to: logs/training_20251027_152646.log
To monitor progress: tail -f logs/training_20251027_152646.log
To check if still running: ps aux | grep 'python main.py'

✓ Training launched in background!

Useful commands:
  Monitor live:     tail -f logs/training_20251027_152646.log
  Check status:     ps aux | grep 'python main.py'
  Kill if needed:   pkill -f 'python main.py'
  GPU usage:        nvidia-smi
[2025-10-27 15:26:48,854][__main__][INFO] - CONFIGURATION
[2025-10-27 15:26:48,858][__main__][INFO] - 
dataset:
  num_graphs: 5000
  num_steps: 6
  dt: 0.01
  train_ratio: 0.8
  batch_size: 4
  shuffle: true
  drop_last: false
  wave_speed: 1.0
  damping: 1.0
  num_nodes: 100
  domain_length: 1.0
  force:
    margin: 0.1
    sign: -1.0
    location: casual
    forcing_type: middle
  scaling:
    enabled: false
    method: standard
    per_feature: true
    epsilon: 1.0e-08
model:
  in_channels: 3
  hidden_channels:
  - 64
  - 128
  out

In [None]:
# Monitor the most recent training log
import glob
import time

# Find most recent log file
log_files = sorted(glob.glob('logs/training_*.log'))
if log_files:
    latest_log = log_files[-1]
    print(f"Monitoring: {latest_log}\n")
    print("="*70)
    
    # Display last 30 lines
    with open(latest_log, 'r') as f:
        lines = f.readlines()
        for line in lines[-30:]:
            print(line.rstrip())
else:
    print("No training logs found yet.")

In [5]:
# Check GPU utilization
import subprocess

result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
print(result.stdout)

Mon Oct 27 14:04:42 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.230.02             Driver Version: 535.230.02   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off | 00000000:01:00.0 Off |                  N/A |
|  0%   51C    P2             133W / 370W |    502MiB / 24576MiB |     23%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
# Stop training if needed (kills all python main.py processes)
import subprocess

print("Stopping all training processes...")
subprocess.run(['pkill', '-f', 'python main.py'])
print("✓ Training processes stopped")

# Verify
result = subprocess.run(['pgrep', '-f', 'python main.py'], capture_output=True)
if result.stdout:
    print("⚠ Some processes still running, try again or use: kill -9 <PID>")
else:
    print("✓ All training processes terminated")

Stopping all training processes...
✓ Training processes stopped
✓ All training processes terminated
