In [None]:
# Remote Training on SSH with CUDA

This notebook lets you launch training on an SSH machine with CUDA and safely disconnect while training continues.

**How it works:**
- Cell 2: Launches training as a background process with `nohup` (survives SSH disconnect)
- Cell 3: Monitor recent training logs
- Cell 4: Check GPU utilization
- Cell 5: Stop training if needed

**Workflow:**
1. Run Cell 2 to start training
2. Disconnect from SSH/close notebook - training continues!
3. Reconnect later and run Cell 3 to check progress
4. Check outputs in `checkpoints/` and `outputs/` directories when done

In [None]:
import os
from datetime import datetime

# Create logs directory if it doesn't exist
os.makedirs('logs', exist_ok=True)

# Generate unique log filename with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = f"logs/training_{timestamp}.log"

print(f"Starting training in background...")
print(f"Logs will be written to: {log_file}")
print(f"To monitor progress: tail -f {log_file}")
print(f"To check if still running: ps aux | grep 'python main.py'")

# Launch training as detached background process
# nohup ensures it continues after SSH disconnect
# &> redirects both stdout and stderr to log file
# & runs in background
# The process will continue even if you close this notebook or disconnect SSH
cmd = (
    f"nohup python main.py  "
    f"& --filter-window hann"
    f"&> {log_file} &"
)
os.system(cmd)

print("\n✓ Training launched in background!")
print(f"\nUseful commands:")
print(f"  Monitor live:     tail -f {log_file}")
print(f"  Check status:     ps aux | grep 'python main.py'")
print(f"  Kill if needed:   pkill -f 'python main.py'")
print(f"  GPU usage:        nvidia-smi")

Starting training in background...
Logs will be written to: logs/training_20260115_184556.log
To monitor progress: tail -f logs/training_20260115_184556.log
To check if still running: ps aux | grep 'python main.py'

✓ Training launched in background!

Useful commands:
  Monitor live:     tail -f logs/training_20260115_184556.log
  Check status:     ps aux | grep 'python main.py'
  Kill if needed:   pkill -f 'python main.py'
  GPU usage:        nvidia-smi


(0.2, 0.2)
[0.         0.00436332 0.00872665 0.01308997 0.01745329]


2026-01-15 18:45:57 - INFO - SARRP CBCT GPU Reconstruction
2026-01-15 18:45:57 - INFO - ✓ GPU acceleration enabled (CuPy)
2026-01-15 18:45:57 - INFO - 
2026-01-15 18:45:57 - INFO - Step 1: Loading geometry...
2026-01-15 18:45:57 - INFO -   Geometry: SOD=353.27 mm, SDD=634.00 mm, IDD=280.73 mm
2026-01-15 18:45:57 - INFO -   Detector: 1024 × 1024 pixels
2026-01-15 18:45:57 - INFO -   Projections: 1440 angles from 0.0° to 360.0°
2026-01-15 18:45:57 - INFO -   Volume: (271, 438, 438) voxels, spacing=(0.26, 0.26, 0.26) mm
2026-01-15 18:45:57 - INFO -   Volume origin: (-35.230000000000004, -56.940000000000005, -56.940000000000005) mm
2026-01-15 18:45:57 - INFO -   Ramp filter window: shepp-logan
2026-01-15 18:45:57 - INFO -   Step 1 completed in 0.00s
2026-01-15 18:45:57 - INFO - 
2026-01-15 18:45:57 - INFO - Step 2: Loading projections...
2026-01-15 18:45:57 - INFO - Loading projections from: projections.mhd
2026-01-15 18:45:59 - INFO - Projections shape after reordering: (1440, 1024, 1024)

In [2]:
# Monitor the most recent training log
import glob
import time

# Find most recent log file
log_files = sorted(glob.glob('logs/training_*.log'))
if log_files:
    latest_log = log_files[-1]
    print(f"Monitoring: {latest_log}\n")
    print("="*70)
    
    # Display last 30 lines
    with open(latest_log, 'r') as f:
        lines = f.readlines()
        for line in lines[-30:]:
            print(line.rstrip())
else:
    print("No training logs found yet.")

Monitoring: logs/training_20251222_024019.log



In [None]:
# Check if still running
!ps aux | grep 'python main.py' | grep -v grep


In [7]:
# Check GPU utilization
import subprocess

result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
print(result.stdout)

Thu Jan 15 18:19:25 2026       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.274.02             Driver Version: 535.274.02   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off | 00000000:01:00.0 Off |                  N/A |
|  0%   42C    P8              25W / 370W |     27MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [43]:
# Stop training if needed (kills all python main.py processes)
import subprocess

print("Stopping all training processes...")
subprocess.run(['pkill', '-f', 'python main.py'])
print("✓ Training processes stopped")

# Verify
result = subprocess.run(['pgrep', '-f', 'python main_wave2D_2branch.py'], capture_output=True)
if result.stdout:
    print("⚠ Some processes still running, try again or use: kill -9 <PID>")
else:
    print("✓ All training processes terminated")

Stopping all training processes...
✓ Training processes stopped
⚠ Some processes still running, try again or use: kill -9 <PID>


In [45]:
kill -9 2508066

SyntaxError: invalid syntax (3406397356.py, line 1)