# Generate SLURM sbatch files for every configuration in CONFIGS

In [None]:
import os
from training_configs import CONFIGS
import time

In [None]:
# Take ONE timestamp and format it twice. Two independent strftime() calls
# could straddle a second (or midnight) boundary and produce date strings
# that disagree with each other.
_generation_time = time.localtime()
todays_date = time.strftime("%Y%m%d_%H%M%S", _generation_time)
todays_date_no_hours = time.strftime("%Y%m%d", _generation_time)

# Every script generated by this run is written below this directory.
OUTPUT_DIR = "configs_bamboo/" + todays_date

# exist_ok=True replaces the separate exists() check and its check-then-create race.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
def create_slurm_file(config_name, config_dict, fold, partition, time_batch, gpu, batch_size, size, output_training_dir, output_dir):
    """
    Create a SLURM batch file for one configuration/fold pair.

    The file is written to
    ``<output_dir>/<architecture>/<backbone_size>/<config_name>_fold_<fold>.sh``.
    The generated job looks for an existing checkpoint and resumes from the
    newest one, runs ``02_training_v21.py`` and exits with the exit code of
    the training process.

    Args:
        config_name: Name of the configuration (key from CONFIGS)
        config_dict: Configuration dictionary; the keys 'architecture',
            'backbone_size', 'backbone' and 'encoder_weights' are read
        fold: Fold index, used in the job name, file name and ``--fold``
        partition: Value for ``#SBATCH --partition``
        time_batch: Value for ``#SBATCH --time`` (a SLURM time string such
            as "12:00:00" or "2-00:00:00")
        gpu: Complete GPU option placed after ``#SBATCH`` (e.g. "--gres=gpu:1")
        batch_size: Value passed to the training script as ``--batch_size``
        size: Size label used as prefix of the job name
        output_training_dir: Directory passed to the training script as
            ``--output_dir`` and searched for checkpoints
        output_dir: Base directory to save the SLURM files

    Returns:
        Path of the SLURM file that was written.
    """

    # Create subdirectories based on architecture and backbone_size
    final_dir = os.path.join(
        output_dir, config_dict['architecture'], config_dict['backbone_size']
    )
    os.makedirs(final_dir, exist_ok=True)

    # Add size and fold to the configuration name
    job_name = f"{size}_{config_name}_fold_{fold}"

    # SLURM file template. This is an f-string: literal bash braces are
    # doubled and every backslash meant for bash is written as "\\".
    slurm_template = f"""#!/bin/bash
#SBATCH --job-name={job_name}
#SBATCH --partition={partition}
#SBATCH --time={time_batch}
#SBATCH {gpu}
#SBATCH --cpus-per-task=8
#SBATCH --mem=64G
#SBATCH --output=%x-%J.out
#SBATCH --error=%x-%J.err
#SBATCH --requeue

# ============================================================================
# SLURM Job with Auto-Resume Functionality
# Config: {config_name} | Fold: {fold} | Size: {size}
# ============================================================================

# Configuration variables
CONFIG_NAME="{config_name}"
CONFIG_NAME_BACKBONE="{config_dict['architecture']}_{config_dict['backbone']}_{config_dict['encoder_weights']}"
FOLD={fold}
OUTPUT_DIR="{output_training_dir}"
BATCH_SIZE={batch_size}
SIZE="{size}"
LOG_FILE="{job_name}-${{SLURM_JOB_ID}}.log"

# Set strict error handling
set -e
set -u

# Function to handle early termination signals
cleanup() {{
    echo "$(date): Job received termination signal"
    echo "Attempting graceful shutdown..."
    if [[ ! -z "${{TRAIN_PID:-}}" ]]; then
        kill -TERM $TRAIN_PID 2>/dev/null || true
        wait $TRAIN_PID 2>/dev/null || true
    fi
    exit 0
}}

trap cleanup SIGUSR1 SIGTERM SIGINT

# Load Anaconda module
source /opt/ebsofts/Anaconda3/2024.02-1/etc/profile.d/conda.sh

# Activate conda environment
conda activate myenv

# Look for existing checkpoints
echo "Checking for Existing Checkpoints..."
CHECKPOINT_PATTERN="${{OUTPUT_DIR}}/${{CONFIG_NAME_BACKBONE}}/single_fold_${{FOLD}}_*/01_model_output/checkpoint_epoch_*_fold_${{FOLD}}.pth"
RESUME_ARG=""

# Newest checkpoint first; the result is empty when the glob matches nothing
LATEST_CHECKPOINT=$(ls -t $CHECKPOINT_PATTERN 2>/dev/null | head -1 || true)

if [[ -n "$LATEST_CHECKPOINT" && -f "$LATEST_CHECKPOINT" ]]; then
    EPOCH_NUM=$(basename "$LATEST_CHECKPOINT" | sed 's/checkpoint_epoch_\\([0-9]*\\)_fold_.*/\\1/')
    echo "✓ Found checkpoint at epoch $EPOCH_NUM: $LATEST_CHECKPOINT"
    RESUME_ARG="--resume $LATEST_CHECKPOINT"
else
    echo "✓ No checkpoints found - starting fresh training"
fi

# Create output directory
mkdir -p "$OUTPUT_DIR"

# Start training. The output is copied to the log through a process
# substitution instead of a pipeline, so that $! is the PID of python3
# itself: the exit code collected below is the training exit code and
# cleanup() signals the training process.
python3 02_training_v21.py \\
    --config "$CONFIG_NAME" \\
    --fold "$FOLD" \\
    --gpu 0 \\
    --output_dir "$OUTPUT_DIR" \\
    --batch_size "$BATCH_SIZE" \\
    --no_graphics_every_epoch \\
    --verbose_logging \\
    $RESUME_ARG \\
    > >(tee "$LOG_FILE") 2>&1 &

TRAIN_PID=$!
# The "||" keeps 'set -e' from aborting before the status is reported
TRAIN_EXIT_CODE=0
wait $TRAIN_PID || TRAIN_EXIT_CODE=$?

echo "============================================================================"
echo "Training Completed: $(date) | Exit Code: $TRAIN_EXIT_CODE"
if [[ $TRAIN_EXIT_CODE -eq 0 ]]; then
    echo "Status: ✓ SUCCESS"
else
    echo "Status: ✗ FAILED - Check log: $LOG_FILE"
fi
echo "============================================================================"

exit $TRAIN_EXIT_CODE

"""

    # Write to file. The template contains non-ASCII status symbols, so the
    # encoding is fixed instead of depending on the platform default.
    filename = os.path.join(final_dir, f"{config_name}_fold_{fold}.sh")
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(slurm_template)

    print(f"Created: {filename}")
    return filename

In [None]:
# Overview of CONFIGS: overall total, then a tally per architecture
rule = "=" * 50

print(f"Total configurations: {len(CONFIGS)}")

print(rule)

# One output line per distinct architecture value found in CONFIGS
for arch in {cfg['architecture'] for cfg in CONFIGS.values()}:
    count = len([cfg for cfg in CONFIGS.values() if cfg['architecture'] == arch])
    print(f"Architecture {arch}: {count} configurations")

print(rule)

In [None]:
# classify the backbones per size
backbone_sizes = {
    # 0-25M PARAMETER RANGE - Lightweight Champions
    'small': [
        'timm-efficientnet-b3',
        'tu-efficientvit_b2.r224_in1k',
        'tu-fastvit_t8.apple_in1k',
        'tu-repvit_m1.dist_in1k',
        'tu-regnety_032.ra_in1k'
    ],

    # 25M-50M PARAMETER RANGE - Sweet Spot Performance
    'medium': [
        'tu-mambaout_small',
        'tu-efficientnetv2_rw_s.ra2_in1k',
        'tu-regnety_080.ra3_in1k',
        'timm-res2net101_26w_4s',
        'resnext50_32x4d'
    ],

    # 50M-100M PARAMETER RANGE - High Performance
    'large': [
        'tu-mambaout_base',
        'tu-efficientnetv2_rw_m.agc_in1k',
        'timm-resnest200e',
        'resnext101_32x8d',
        'timm-efficientnet-b5'
    ],

    # 100M+ PARAMETER RANGE - Foundation Model Territory
    'huge': [
        'tu-tf_efficientnetv2_xl.in21k_ft_in1k'
    ]
}

# Reverse lookup (backbone name -> size category), built once instead of
# scanning every list for every config. setdefault keeps the first category
# a backbone appears in, matching the order in which the lists are declared.
_size_by_backbone = {}
for size, backbones in backbone_sizes.items():
    for backbone in backbones:
        _size_by_backbone.setdefault(backbone, size)

# add backbone_sizes to CONFIGS (mutates the imported dictionaries in place)
for config in CONFIGS.values():
    config['backbone_size'] = _size_by_backbone.get(config['backbone'], 'unknown')

# Make sure all backbones are classified. This is an explicit raise rather
# than an assert so the check also runs under "python -O", and it reports
# every unclassified backbone at once instead of only the first one.
unclassified = sorted({str(config['backbone']) for config in CONFIGS.values()
                       if config['backbone_size'] == 'unknown'})
if unclassified:
    raise ValueError(f"Backbone(s) not classified: {', '.join(unclassified)}")

# Show the backbones per architecture and backbone_size
# (architectures are sorted so the listing is identical from run to run)
print("\nBackbones per architecture and backbone_size:")
for arch in sorted({config['architecture'] for config in CONFIGS.values()}):
    print(f"\nArchitecture: {arch}")
    for size in ['small', 'medium', 'large', 'huge']:
        backbones = set(config['backbone'] for config in CONFIGS.values()
                        if config['architecture'] == arch and config['backbone_size'] == size)
        if backbones:
            print(f"  {size.capitalize()}:")
            for backbone in sorted(backbones):
                print(f"    {backbone}")


In [None]:
# Create SLURM files for each configuration
print(f"Using output directory: {OUTPUT_DIR}")
print("-" * 50)

created_files = []

for config_name, config_dict in CONFIGS.items():
    # Read the per-config values once, before branching, so every branch
    # (including 'huge') uses THIS config's architecture rather than a value
    # left over from a previous loop iteration.
    architecture = config_dict['architecture']
    size = config_dict['backbone_size']
    batch_size = config_dict['batch_size']

    # Determine partition, time limit and GPU request. The architecture check
    # comes first: deeplabv3 configs use the same settings whatever their size.
    if architecture == 'deeplabv3':
        partition = "shared-gpu"
        time_batch = "12:00:00"
        gpu = "--gres=gpu:1,VramPerGpu:30G"
        batch_size = int(batch_size * 2)
    elif size in ('small', 'medium', 'large'):
        partition = "shared-gpu"
        time_batch = "12:00:00"
        gpu = "--gres=gpu:1,VramPerGpu:24G"
    elif size == 'huge':
        partition = "public-gpu"
        time_batch = "2-00:00:00"
        gpu = "--gpus=nvidia_a100_80gb_pcie:1"
    else:
        raise ValueError(f"Unknown backbone size: {config_dict['backbone_size']} for config {config_name}")

    output_training_dir = f"01_training_{size}_{architecture}_" + todays_date_no_hours

    # Create SLURM files for each fold (0-4)
    for fold in range(5):
        filename = create_slurm_file(
            config_name=config_name,
            config_dict=config_dict,
            fold=fold,
            partition=partition,
            time_batch=time_batch,
            gpu=gpu,
            batch_size=batch_size,
            size=size,
            output_training_dir=output_training_dir,
            output_dir=OUTPUT_DIR
        )
        created_files.append(filename)

print("-" * 50)

# Verify all SLURM files were created correctly
verified_configs = set()
for root, dirs, files in os.walk(OUTPUT_DIR):
    for file in files:
        # Only per-fold job files count. Helper scripts that may already be in
        # OUTPUT_DIR when this cell is re-run (submit_*.sh, other *.sh files
        # without a "_fold_" part) must not show up as "extra" configs.
        if not file.endswith('.sh') or 'submit_' in file or '_fold_' not in file:
            continue
        # Extract config name from filename (remove _fold_X.sh suffix)
        # Example: "config_name_fold_0.sh" -> "config_name"
        config_name = file[:-len('.sh')].rsplit('_fold_', 1)[0]
        verified_configs.add(config_name)
        # Print the full path for verification
        full_path = os.path.join(root, file)
        print(f"Verified: {full_path}")

expected_configs = set(CONFIGS.keys())

# Explicit raise instead of assert so the check is not stripped under "python -O"
if verified_configs != expected_configs:
    raise RuntimeError(
        f"Mismatch in created files. Missing: {expected_configs - verified_configs}, "
        f"Extra: {verified_configs - expected_configs}"
    )
print(f"\nSuccessfully created {len(created_files)} SLURM files for all configurations!")
print(f"All files are organized under: {OUTPUT_DIR}/architecture/backbone_size/")

In [None]:
# Optional: Create submission scripts for each category
def create_category_submission_scripts():
    """Create submission scripts for each backbone size category with fold-by-fold delays.

    For every size category that has at least one config in CONFIGS, writes
    ``<OUTPUT_DIR>/submit_<category>_models_all_folds.sh`` and marks it
    executable. The script submits every config of the category with
    ``sbatch``, one fold at a time, sleeping 60 s between folds. In fold 0
    each of the first 30 submissions is additionally followed by ``sleep 30``.

    The sbatch paths are built from OUTPUT_DIR, so they point at the files
    written by ``create_slurm_file`` and are relative to the directory this
    notebook runs in.
    """

    categories = ['small', 'medium', 'large', 'huge']
    n_folds = 5
    # Number of fold-0 submissions that get a 30 s pause after them
    stagger_limit = 30

    for category in categories:
        category_configs = [name for name, config in CONFIGS.items()
                            if config['backbone_size'] == category]

        if not category_configs:
            continue

        total_jobs = len(category_configs) * n_folds

        lines = [
            "#!/bin/bash",
            f"# Submit all {category} models with fold-by-fold delays",
            "",
            f'echo "Submitting {total_jobs} {category} model jobs ({len(category_configs)} configs × {n_folds} folds)..."',
            'echo "Strategy: Submit all fold 0, wait 1min, submit all fold 1, etc."',
            "",
        ]

        # Submit fold by fold with delays
        for fold in range(n_folds):
            lines.append(f"echo 'Submitting all FOLD {fold} jobs for {category} models...'")

            # One sbatch line per config; a single loop covers both the
            # staggered (fold 0, first `stagger_limit` configs) and plain cases.
            for index, config_name in enumerate(category_configs):
                arch = CONFIGS[config_name]['architecture']
                # Forward slashes on purpose: this path is consumed by bash.
                lines.append(f"sbatch {OUTPUT_DIR}/{arch}/{category}/{config_name}_fold_{fold}.sh")
                if fold == 0 and index < stagger_limit:
                    lines.append("sleep 30")

            lines.append(f"echo 'Submitted {len(category_configs)} jobs for fold {fold}'")

            # Add delay except after the last fold
            if fold < n_folds - 1:
                lines.append(f"echo 'Waiting 1 minute before submitting fold {fold + 1}...'")
                lines.append("sleep 60")

            lines.append("")

        lines.append(f'echo "All {category} model jobs submitted! ({total_jobs} jobs total)"')

        script_filename = os.path.join(OUTPUT_DIR, f"submit_{category}_models_all_folds.sh")
        # The script text contains a non-ASCII "×", so the encoding is explicit.
        with open(script_filename, 'w', encoding='utf-8') as f:
            f.write("\n".join(lines) + "\n")

        # Make it executable
        os.chmod(script_filename, 0o755)
        print(f"Created: {script_filename}")

In [None]:
# Create architecture-size specific submission scripts
def create_arch_size_submission_scripts():
    """Create submission scripts for each architecture-size combination with fold-by-fold delays.

    For every (architecture, backbone_size) pair present in CONFIGS, writes
    ``<OUTPUT_DIR>/submit_<size>_<arch>_models.sh`` and marks it executable.
    The script submits all configs of that pair (sorted by name) with
    ``sbatch``, one fold at a time, sleeping 60 s between folds.
    """

    n_folds = 5

    # Group config names by (architecture, size). A tuple key is used instead
    # of an "arch_size" string so that architecture names containing an
    # underscore cannot break the later unpacking.
    arch_size_combinations = {}
    for config_name, config in CONFIGS.items():
        key = (config['architecture'], config['backbone_size'])
        arch_size_combinations.setdefault(key, []).append(config_name)

    print(f"\nCreating {len(arch_size_combinations)} architecture-size specific submission scripts...")

    for (arch, size), config_names in arch_size_combinations.items():
        total_jobs = len(config_names) * n_folds

        lines = [
            "#!/bin/bash",
            f"# Submit all {size} {arch} models with fold-by-fold delays",
            "",
            f'echo "Submitting {total_jobs} {size} {arch} model jobs ({len(config_names)} configs × {n_folds} folds)..."',
            'echo "Strategy: Submit all fold 0, wait 1min, submit all fold 1, etc."',
            "",
        ]

        # Submit fold by fold with delays
        for fold in range(n_folds):
            lines.append(f"echo 'Submitting all FOLD {fold} jobs for {size} {arch} models...'")

            # Submit all configs for this fold.
            # Forward slashes on purpose: these paths are consumed by bash.
            for config_name in sorted(config_names):
                lines.append(f"sbatch {OUTPUT_DIR}/{arch}/{size}/{config_name}_fold_{fold}.sh")

            lines.append(f"echo 'Submitted {len(config_names)} jobs for fold {fold}'")

            # Add delay except after the last fold
            if fold < n_folds - 1:
                lines.append(f"echo 'Waiting 1 minute before submitting fold {fold + 1}...'")
                lines.append("sleep 60")

            lines.append("")

        lines.append(f'echo "All {size} {arch} model jobs submitted! ({total_jobs} jobs total)"')

        script_filename = os.path.join(OUTPUT_DIR, f"submit_{size}_{arch}_models.sh")
        # The script text contains a non-ASCII "×", so the encoding is explicit.
        with open(script_filename, 'w', encoding='utf-8') as f:
            f.write("\n".join(lines) + "\n")

        # Make it executable
        os.chmod(script_filename, 0o755)
        print(f"Created: {script_filename} ({len(config_names)} configs, {total_jobs} jobs)")

In [None]:
# Regenerate both families of submission scripts, then describe their behavior
print("\nRe-creating submission scripts with fold-by-fold delays...")
for build_scripts in (create_category_submission_scripts, create_arch_size_submission_scripts):
    build_scripts()

banner = "=" * 60
for message in (
    "\n" + banner,
    "UPDATED SUBMISSION SCRIPTS WITH FOLD-BY-FOLD DELAYS:",
    banner,
    "\nNow each script will:",
    "1. Submit all fold 0 jobs",
    "2. Wait 1 minute",
    "3. Submit all fold 1 jobs",
    "4. Wait 1 minute",
    "5. Continue for folds 2, 3, 4",
    "\nThis spreads the computational load more evenly over time.",
):
    print(message)

In [None]:
# Get all SLURM files organized by architecture and size
# (keys have the form "<arch>_<size>", values are lists of file paths)
slurm_files_by_arch_size = {}

# Walk through the directory structure
for root, dirs, files in os.walk(OUTPUT_DIR):
    # Job files live in OUTPUT_DIR/<arch>/<size>/. relpath gives the part of
    # the path below OUTPUT_DIR without string replacement on the full path.
    path_parts = os.path.relpath(root, OUTPUT_DIR).split(os.sep)
    if len(path_parts) < 2:
        continue
    arch, size = path_parts[0], path_parts[1]
    key = f"{arch}_{size}"

    for file in files:
        if file.endswith('.sh') and 'submit_' not in file:  # Exclude submission scripts
            slurm_files_by_arch_size.setdefault(key, []).append(os.path.join(root, file))

# Print sbatch commands for each architecture and size combination
print("\nSbatch commands per architecture and size:")
print("="*60)

for key, files in slurm_files_by_arch_size.items():
    # Split on the LAST underscore only: the size is the final component and
    # the architecture name itself may contain underscores.
    arch, size = key.rsplit('_', 1)
    print(f"\n{arch.upper()} - {size.upper()}:")
    for file in sorted(files):  # Sort for better readability
        print(f"sbatch {file}")


In [None]:

# Summary of created submission scripts
print("\n" + "="*60)
print("SUBMISSION SCRIPTS CREATED:")
print("="*60)

# The submit scripts reference the job files by paths that start with
# OUTPUT_DIR, i.e. relative to the directory this notebook runs in. The
# commands below are therefore printed with that prefix and must be launched
# from that directory, not from inside OUTPUT_DIR.
print("\n1. BY SIZE ONLY (submit all architectures of a given size):")
for size in ['small', 'medium', 'large', 'huge']:
    script_path = os.path.join(OUTPUT_DIR, f"submit_{size}_models_all_folds.sh")
    if os.path.exists(script_path):
        count = sum(1 for name, config in CONFIGS.items() if config['backbone_size'] == size)
        print(f"   bash {script_path}  # {count} configs × 5 folds = {count*5} jobs")

print("\n2. BY ARCHITECTURE + SIZE (submit specific combinations):")
arch_size_scripts = []
for root, dirs, files in os.walk(OUTPUT_DIR):
    for file in files:
        if file.startswith('submit_') and file.endswith('_models.sh') and 'all_folds' not in file:
            arch_size_scripts.append(file)

for script in sorted(arch_size_scripts):
    # Extract size and arch from filename like "submit_huge_unet_models.sh".
    # The size is the first component; everything after the first underscore
    # is the architecture, which may itself contain underscores.
    stem = script[len('submit_'):-len('_models.sh')]
    size, separator, arch = stem.partition('_')
    if separator:
        count = sum(1 for name, config in CONFIGS.items()
                    if config['backbone_size'] == size and config['architecture'] == arch)
        print(f"   bash {os.path.join(OUTPUT_DIR, script)}  # {count} configs × 5 folds = {count*5} jobs")

print(f"\nAll scripts are located in: {OUTPUT_DIR}/")
print("\nExample usage (run from the directory this notebook was executed in):")
print(f"   bash {OUTPUT_DIR}/submit_huge_unet_models.sh      # Submit only huge UNet models")
print(f"   bash {OUTPUT_DIR}/submit_small_segformer_models.sh # Submit only small Segformer models")
print(f"   bash {OUTPUT_DIR}/submit_large_models_all_folds.sh # Submit ALL large models (any architecture)")

In [None]:
def create_job_monitoring_script(output_dir):
    """Create a comprehensive job monitoring script.

    Writes ``monitor_training.sh`` into ``output_dir`` and marks it
    executable. When run, the script prints the current user's running and
    pending jobs (via ``squeue``), today's failed/cancelled/timed-out jobs
    (via ``sacct``), counts of recent checkpoint and log files below the
    current directory, and the disk usage of ``01_training_*/`` directories.

    Args:
        output_dir: Existing directory in which the script is created.

    Returns:
        Path of the monitoring script that was written.
    """
    # NOTE: this is a regular (non-raw) string. "\\n" emits a literal \n for
    # printf, "\\" at a line end emits a bash line continuation, and "\t"
    # inside the awk patterns emits a real TAB character.
    script_content = """#!/bin/bash
# Clean terminal-friendly job monitoring script

clear
echo "=================================================="
echo "           TRAINING MONITOR $(uname -n)"
echo "=================================================="
echo "Date: $(date '+%Y-%m-%d %H:%M:%S')"
echo "User: $USER"
echo ""

# Current jobs
echo "CURRENT JOBS:"
echo "--------------------------------------------------"
RUNNING=$(squeue -u $USER -t RUNNING -h | wc -l)
PENDING=$(squeue -u $USER -t PENDING -h | wc -l)

if [ "$RUNNING" -eq 0 ] && [ "$PENDING" -eq 0 ]; then
    echo "  No jobs in queue"
else
    if [ "$RUNNING" -gt 0 ]; then
        echo "  Running: $RUNNING jobs"
        echo ""
        # Fixed formatting without justification flags
        printf "  %-25s %-8s %-10s %-10s\\n" "JOB NAME" "STATE" "TIME" "TIME LEFT"
        echo "  --------------------------------------------------------------"
        squeue -u $USER -t RUNNING --format="%25j|%8T|%10M|%10L" --noheader | head -8 | awk -F '|' '{
            gsub(/^[ \t]+|[ \t]+$/, "", $1);
            gsub(/^[ \t]+|[ \t]+$/, "", $2);
            gsub(/^[ \t]+|[ \t]+$/, "", $3);
            gsub(/^[ \t]+|[ \t]+$/, "", $4);
            printf "  %-25s %-8s %-10s %-10s\\n", $1, $2, $3, $4;
        }'
        if [ "$RUNNING" -gt 8 ]; then
            echo "  ... and $((RUNNING - 8)) more running jobs"
        fi
        echo ""
    fi
    
    if [ "$PENDING" -gt 0 ]; then
        echo "  Pending: $PENDING jobs"
        # Fixed formatting without justification flags
        squeue -u $USER -t PENDING --format="%25j|%8T|%15R" --noheader | head -5 | awk -F '|' '{
            gsub(/^[ \t]+|[ \t]+$/, "", $1);
            gsub(/^[ \t]+|[ \t]+$/, "", $2);
            gsub(/^[ \t]+|[ \t]+$/, "", $3);
            printf "  %-25s %-8s %-15s\\n", $1, $2, $3;
        }'
        if [ "$PENDING" -gt 5 ]; then
            echo "  ... and $((PENDING - 5)) more pending jobs"
        fi
        echo ""
    fi
fi

# Recent failures
echo "RECENT ISSUES:"
echo "--------------------------------------------------"
TODAY=$(date '+%Y-%m-%d')

# Get failed jobs with timing information.
# mktemp gives every run its own file, so concurrent runs cannot clobber each other.
TEMP_FAILED_FILE=$(mktemp /tmp/monitor_failed_XXXXXX)
sacct -u $USER --starttime=$TODAY --format=JobID,JobName%35,State,Start,End --noheader 2>/dev/null | \\
    grep -v batch | grep -v extern | \\
    grep -E "FAILED|CANCELLED|TIMEOUT" > "$TEMP_FAILED_FILE"

FAILED_TODAY=$(wc -l < "$TEMP_FAILED_FILE" 2>/dev/null || echo "0")

if [ "$FAILED_TODAY" -eq 0 ]; then
    echo "  No failed jobs today"
else
    echo "  $FAILED_TODAY failed jobs today (showing most recent 5):"
    echo ""
    printf "    %-30s %-10s %-8s %-8s\\n" "JOB NAME" "STATE" "START" "END"
    echo "    ----------------------------------------------------------------"
    # Sort by job ID (ascending), then take the last 5, i.e. the highest job IDs
    sort -n "$TEMP_FAILED_FILE" | tail -5 | while read line; do
        job_id=$(echo "$line" | awk '{print $1}')
        job_name=$(echo "$line" | awk '{print $2}' | cut -c1-30)
        job_state=$(echo "$line" | awk '{print $3}')
        job_start=$(echo "$line" | awk '{print $4}')
        job_end=$(echo "$line" | awk '{print $5}')
        
        # Format times to show just time (HH:MM) if today, or date if older
        if [ -n "$job_start" ] && [ "$job_start" != "Unknown" ]; then
            start_time=$(date -d "$job_start" '+%H:%M' 2>/dev/null || echo "$job_start" | cut -c12-16)
        else
            start_time="--:--"
        fi
        
        if [ -n "$job_end" ] && [ "$job_end" != "Unknown" ]; then
            end_time=$(date -d "$job_end" '+%H:%M' 2>/dev/null || echo "$job_end" | cut -c12-16)
        else
            end_time="--:--"
        fi
        
        printf "    %-30s %-10s %-8s %-8s\\n" "$job_name" "$job_state" "$start_time" "$end_time"
    done
    
    # Show summary of failure types
    echo ""
    echo "  Failure breakdown:"
    FAILED_COUNT=$(grep "FAILED" "$TEMP_FAILED_FILE" | wc -l)
    TIMEOUT_COUNT=$(grep "TIMEOUT" "$TEMP_FAILED_FILE" | wc -l)
    CANCELLED_COUNT=$(grep "CANCELLED" "$TEMP_FAILED_FILE" | wc -l)
    
    if [ "$FAILED_COUNT" -gt 0 ]; then
        echo "    FAILED: $FAILED_COUNT jobs"
    fi
    if [ "$TIMEOUT_COUNT" -gt 0 ]; then
        echo "    TIMEOUT: $TIMEOUT_COUNT jobs"
    fi
    if [ "$CANCELLED_COUNT" -gt 0 ]; then
        echo "    CANCELLED: $CANCELLED_COUNT jobs"
    fi
fi
echo ""

# Clean up temp file
rm -f "$TEMP_FAILED_FILE"

# Training progress
echo "TRAINING PROGRESS:"
echo "--------------------------------------------------"

# Count recent checkpoints and logs
RECENT_CHECKPOINTS=$(find . -name "checkpoint_epoch_*.pth" -mtime -1 2>/dev/null | wc -l)
ACTIVE_LOGS=$(find . -name "training_*.log" -mtime -0.1 2>/dev/null | wc -l)

echo "  Recent checkpoints (24h): $RECENT_CHECKPOINTS"
echo "  Active training logs:     $ACTIVE_LOGS"
echo ""

# Disk usage
echo "DISK USAGE:"
echo "--------------------------------------------------"
if ls 01_training_*/ >/dev/null 2>&1; then
    echo "  Training directories:"
    du -sh 01_training_*/ 2>/dev/null | sort -hr | head -5 | while read size dir; do
        printf "    %-8s %s\\n" "$size" "$dir"
    done
    
    # Let du compute the grand total (-c): its human-readable sizes carry
    # unit suffixes (K/M/G) and cannot simply be added up as plain numbers.
    total_size=$(du -sch 01_training_*/ 2>/dev/null | tail -1 | awk '{print $1}')
    echo "  Total: $total_size"
else
    echo "  No training directories found"
fi

# Available space
avail_space=$(df -h . 2>/dev/null | tail -1 | awk '{print $4}')
used_percent=$(df -h . 2>/dev/null | tail -1 | awk '{print $5}')
echo "  Available space: $avail_space ($used_percent used)"
echo ""
"""

    script_path = os.path.join(output_dir, "monitor_training.sh")
    with open(script_path, 'w', encoding='utf-8') as f:
        f.write(script_content)
    # Make it executable
    os.chmod(script_path, 0o755)
    print(f"Created monitoring script: {script_path}")
    return script_path

In [None]:
def create_restart_failed_jobs_script(output_dir):
    """Create a script to restart failed jobs."""
    script_content = """#!/bin/bash
# Resume/restart failed jobs script with time range option

# Default to 24 hours if no argument provided
HOURS_BACK=${1:-24}

# Get the directory where this script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "Script directory: $SCRIPT_DIR"

clear
echo "=================================================="
echo "         RESTART FAILED JOBS"
echo "=================================================="
echo "Looking back: $HOURS_BACK hours"
echo ""

# Calculate start time based on hours back
if command -v date >/dev/null 2>&1; then
    if date -d "1 hour ago" >/dev/null 2>&1; then
        # GNU date (Linux)
        START_TIME=$(date -d "$HOURS_BACK hours ago" '+%Y-%m-%dT%H:%M:%S')
        START_DATE=$(date -d "$HOURS_BACK hours ago" '+%Y-%m-%d')
        START_EPOCH=$(date -d "$HOURS_BACK hours ago" '+%s')
    else
        # BSD date (macOS)
        START_TIME=$(date -v-${HOURS_BACK}H '+%Y-%m-%dT%H:%M:%S')
        START_DATE=$(date -v-${HOURS_BACK}H '+%Y-%m-%d')
        START_EPOCH=$(date -v-${HOURS_BACK}H '+%s')
    fi
else
    echo "Error: date command not available"
    exit 1
fi

echo "Searching for failed jobs since: $START_TIME"
echo ""

# Get failed jobs from the specified time range
echo "Scanning job database..."
TEMP_FILE="/tmp/failed_jobs_$.txt"

# Use the start time with sacct (try full timestamp first, fall back to date)
sacct -u $USER --starttime="$START_TIME" --format=JobID,JobName%50,State,Start,End --noheader 2>/dev/null | \
    grep -v batch | grep -v extern | \
    grep -E "FAILED|CANCELLED|TIMEOUT" > "$TEMP_FILE"

# If that didn't work, try with just the date and filter manually
if [ ! -s "$TEMP_FILE" ]; then
    sacct -u $USER --starttime="$START_DATE" --format=JobID,JobName%50,State,Start,End --noheader 2>/dev/null | \
        grep -v batch | grep -v extern | \
        grep -E "FAILED|CANCELLED|TIMEOUT" > "$TEMP_FILE"
fi

# Filter results to only include jobs started after our time threshold
FILTERED_FILE="/tmp/filtered_jobs_$.txt"
> "$FILTERED_FILE"  # Create empty file

while IFS= read -r line; do
    if [ -n "$line" ]; then
        job_start=$(echo "$line" | awk '{print $4}')
        
        # Convert job start time to epoch for comparison
        job_start_epoch=$(date -d "$job_start" +%s 2>/dev/null || echo "0")
        
        # Only include jobs that started after our threshold
        if [ "$job_start_epoch" -ge "$START_EPOCH" ]; then
            echo "$line" >> "$FILTERED_FILE"
        fi
    fi
done < "$TEMP_FILE"

# Use the filtered file
mv "$FILTERED_FILE" "$TEMP_FILE"

if [ ! -s "$TEMP_FILE" ]; then
    echo "No failed jobs found in the last $HOURS_BACK hours."
    rm -f "$TEMP_FILE"
    echo ""
    echo "Current running jobs: $(squeue -u $USER -t RUNNING -h | wc -l)"
    echo "Use './monitor_training.sh' for detailed status"
    echo "=================================================="
    exit 0
fi

# Parse and collect all failed jobs
declare -a ALL_JOB_IDS=()
declare -a ALL_JOB_NAMES=()
declare -a ALL_JOB_STATES=()
declare -a ALL_JOB_STARTS=()

job_count=0
while IFS= read -r line; do
    if [ -n "$line" ]; then
        job_id=$(echo "$line" | awk '{print $1}')
        job_name=$(echo "$line" | awk '{print $2}' | sed 's/+$//')
        job_state=$(echo "$line" | awk '{print $3}')
        job_start=$(echo "$line" | awk '{print $4}')
        
        if [ -n "$job_id" ] && [ -n "$job_name" ]; then
            job_count=$((job_count + 1))
            
            ALL_JOB_IDS+=("$job_id")
            ALL_JOB_NAMES+=("$job_name")
            ALL_JOB_STATES+=("$job_state")
            ALL_JOB_STARTS+=("$job_start")
        fi
    fi
done < "$TEMP_FILE"

rm -f "$TEMP_FILE"

if [ $job_count -eq 0 ]; then
    echo "  No failed jobs found in specified time range"
    echo "=================================================="
    exit 0
fi

echo "Found $job_count failed jobs total."
echo "Deduplicating jobs (keeping most recent attempt per job name)..."

# Create associative arrays for deduplication
declare -A LATEST_JOB_ID
declare -A LATEST_JOB_STATE  
declare -A LATEST_JOB_START
declare -A LATEST_JOB_START_EPOCH

# Process each job to find the latest attempt for each job name
for i in "${!ALL_JOB_NAMES[@]}"; do
    job_name="${ALL_JOB_NAMES[$i]}"
    job_id="${ALL_JOB_IDS[$i]}"
    job_state="${ALL_JOB_STATES[$i]}"
    job_start="${ALL_JOB_STARTS[$i]}"
    
    # Convert start time to epoch for comparison
    job_start_epoch=$(date -d "$job_start" +%s 2>/dev/null || echo "0")
    
    # Check if this is the latest attempt for this job name
    if [[ ! -v LATEST_JOB_START_EPOCH["$job_name"] ]] || [[ $job_start_epoch -gt ${LATEST_JOB_START_EPOCH["$job_name"]} ]]; then
        LATEST_JOB_ID["$job_name"]="$job_id"
        LATEST_JOB_STATE["$job_name"]="$job_state"
        LATEST_JOB_START["$job_name"]="$job_start"
        LATEST_JOB_START_EPOCH["$job_name"]="$job_start_epoch"
    fi
done

# Create deduplicated arrays grouped by status
declare -a FAILED_JOB_NAMES=()
declare -a FAILED_JOB_IDS=()
declare -a FAILED_JOB_STARTS=()

declare -a CANCELLED_JOB_NAMES=()
declare -a CANCELLED_JOB_IDS=()
declare -a CANCELLED_JOB_STARTS=()

declare -a TIMEOUT_JOB_NAMES=()
declare -a TIMEOUT_JOB_IDS=()
declare -a TIMEOUT_JOB_STARTS=()

# Group jobs by status
for job_name in "${!LATEST_JOB_ID[@]}"; do
    job_id="${LATEST_JOB_ID["$job_name"]}"
    job_state="${LATEST_JOB_STATE["$job_name"]}"
    job_start="${LATEST_JOB_START["$job_name"]}"
    
    case "$job_state" in
        "FAILED")
            FAILED_JOB_NAMES+=("$job_name")
            FAILED_JOB_IDS+=("$job_id")
            FAILED_JOB_STARTS+=("$job_start")
            ;;
        "CANCELLED")
            CANCELLED_JOB_NAMES+=("$job_name")
            CANCELLED_JOB_IDS+=("$job_id")
            CANCELLED_JOB_STARTS+=("$job_start")
            ;;
        "TIMEOUT")
            TIMEOUT_JOB_NAMES+=("$job_name")
            TIMEOUT_JOB_IDS+=("$job_id")
            TIMEOUT_JOB_STARTS+=("$job_start")
            ;;
    esac
done

dedup_count=$((${#FAILED_JOB_NAMES[@]} + ${#CANCELLED_JOB_NAMES[@]} + ${#TIMEOUT_JOB_NAMES[@]}))
echo "After deduplication: $dedup_count unique jobs (removed $((job_count - dedup_count)) duplicates)"
echo ""

# Display jobs grouped by status
echo "Failed jobs by status:"
echo "--------------------------------------------------"

if [ ${#FAILED_JOB_NAMES[@]} -gt 0 ]; then
    echo "FAILED Jobs (${#FAILED_JOB_NAMES[@]} found):"
    for i in "${!FAILED_JOB_NAMES[@]}"; do
        echo "  $((i+1)). ${FAILED_JOB_NAMES[$i]} - Started: ${FAILED_JOB_STARTS[$i]}"
    done
    echo ""
fi

if [ ${#CANCELLED_JOB_NAMES[@]} -gt 0 ]; then
    echo "CANCELLED Jobs (${#CANCELLED_JOB_NAMES[@]} found):"
    for i in "${!CANCELLED_JOB_NAMES[@]}"; do
        echo "  $((i+1)). ${CANCELLED_JOB_NAMES[$i]} - Started: ${CANCELLED_JOB_STARTS[$i]}"
    done
    echo ""
fi

if [ ${#TIMEOUT_JOB_NAMES[@]} -gt 0 ]; then
    echo "TIMEOUT Jobs (${#TIMEOUT_JOB_NAMES[@]} found):"
    for i in "${!TIMEOUT_JOB_NAMES[@]}"; do
        echo "  $((i+1)). ${TIMEOUT_JOB_NAMES[$i]} - Started: ${TIMEOUT_JOB_STARTS[$i]}"
    done
    echo ""
fi

# Check how many are already in queue
echo "Checking for jobs already in queue..."
TEMP_QUEUE_FILE="/tmp/queue_check_$$.txt"
squeue -u $USER --format="%j" --noheader > "$TEMP_QUEUE_FILE"

ALREADY_IN_QUEUE=0
ALL_JOB_NAMES_DEDUP=($(printf '%s\\n' "${FAILED_JOB_NAMES[@]}" "${CANCELLED_JOB_NAMES[@]}" "${TIMEOUT_JOB_NAMES[@]}"))

for job_name in "${ALL_JOB_NAMES_DEDUP[@]}"; do
    if grep -q "^$job_name$" "$TEMP_QUEUE_FILE"; then
        ALREADY_IN_QUEUE=$((ALREADY_IN_QUEUE + 1))
    fi
done
rm -f "$TEMP_QUEUE_FILE"

echo "SUMMARY:"
echo "--------------------------------------------------"
echo "FAILED jobs: ${#FAILED_JOB_NAMES[@]}"
echo "CANCELLED jobs: ${#CANCELLED_JOB_NAMES[@]}"
echo "TIMEOUT jobs: ${#TIMEOUT_JOB_NAMES[@]}"
echo "Total unique jobs: $dedup_count"
echo "Jobs already in queue: $ALREADY_IN_QUEUE"
echo ""

if [ $dedup_count -eq 0 ]; then
    echo "No jobs to restart."
    echo "=================================================="
    exit 1
fi

if [ $ALREADY_IN_QUEUE -eq $dedup_count ]; then
    echo "All failed jobs are already running or pending in the queue."
    echo "=================================================="
    exit 0
fi

PROCESSABLE_JOBS=$((dedup_count - ALREADY_IN_QUEUE))
echo "Will process $PROCESSABLE_JOBS jobs (skipping $ALREADY_IN_QUEUE already in queue)"
echo ""

# Ask user what to do
echo "What would you like to do?"
echo "  1) Restart ALL failed jobs"
echo "  2) Restart only FAILED jobs"
echo "  3) Restart only CANCELLED jobs"
echo "  4) Restart only TIMEOUT jobs"
echo "  5) Cancel"
echo ""
echo -n "Choose option (1-5): "
read -r choice

# Determine which jobs to process based on choice
declare -a TARGET_JOB_NAMES=()

case $choice in
    1)
        echo ""
        echo "RESTARTING all failed jobs..."
        echo "--------------------------------------------------"
        TARGET_JOB_NAMES=($(printf '%s\\n' "${FAILED_JOB_NAMES[@]}" "${CANCELLED_JOB_NAMES[@]}" "${TIMEOUT_JOB_NAMES[@]}"))
        ;;
    2)
        echo ""
        echo "RESTARTING FAILED jobs only..."
        echo "--------------------------------------------------"
        TARGET_JOB_NAMES=("${FAILED_JOB_NAMES[@]}")
        ;;
    3)
        echo ""
        echo "RESTARTING CANCELLED jobs only..."
        echo "--------------------------------------------------"
        TARGET_JOB_NAMES=("${CANCELLED_JOB_NAMES[@]}")
        ;;
    4)
        echo ""
        echo "RESTARTING TIMEOUT jobs only..."
        echo "--------------------------------------------------"
        TARGET_JOB_NAMES=("${TIMEOUT_JOB_NAMES[@]}")
        ;;
    *)
        echo "Cancelled."
        echo "=================================================="
        exit 0
        ;;
esac

if [ ${#TARGET_JOB_NAMES[@]} -eq 0 ]; then
    echo "No jobs selected for restart."
    echo "=================================================="
    exit 0
fi

# Find scripts for target jobs
echo "Finding scripts for ${#TARGET_JOB_NAMES[@]} jobs..."
echo ""

declare -a SCRIPTS_FOUND=()

for job_name in "${TARGET_JOB_NAMES[@]}"; do
    echo "Finding script for: $job_name"
    
    # Find the corresponding script
    SCRIPT=""
    
    # Method 1: Look for exact job name match
    SCRIPT=$(find "$SCRIPT_DIR" -name "*.sh" -exec grep -l "#SBATCH --job-name=$job_name" {} \; 2>/dev/null | head -1)

    if [ -n "$SCRIPT" ] && [ -f "$SCRIPT" ]; then
        RELATIVE_SCRIPT=$(echo "$SCRIPT" | sed "s|^$SCRIPT_DIR/||")
        echo "  ✓ Script found: $RELATIVE_SCRIPT"
        SCRIPTS_FOUND+=("$SCRIPT")
    else
        echo "  ✗ Script not found"
        SCRIPTS_FOUND+=("")
    fi
done

echo ""

# Execute restart for jobs with scripts found
echo "Checking current queue status..."
QUEUE_FILE="/tmp/current_queue_$$.txt"
squeue -u $USER --format="%j" --noheader > "$QUEUE_FILE"

echo "Current running/pending jobs: $(wc -l < "$QUEUE_FILE")"
echo ""

SUCCESS=0
SKIPPED=0
ALREADY_QUEUED=0

for i in "${!TARGET_JOB_NAMES[@]}"; do
    job_name="${TARGET_JOB_NAMES[$i]}"
    script="${SCRIPTS_FOUND[$i]}"
    
    # Skip if no script found
    if [ -z "$script" ]; then
        echo "  Skipping $job_name (no script found)"
        SKIPPED=$((SKIPPED + 1))
        continue
    fi
    
    # Check if job is already in queue (running or pending)
    if grep -q "^$job_name$" "$QUEUE_FILE"; then
        job_status=$(squeue -u $USER --name="$job_name" --format="%T" --noheader | head -1)
        echo "  Skipping $job_name (already $job_status)"
        ALREADY_QUEUED=$((ALREADY_QUEUED + 1))
        continue
    fi

    # RESTART job
    echo -n "  Restarting $(basename $script)... "
    result=$(sbatch "$script" 2>&1)
    
    if [ $? -eq 0 ]; then
        job_id=$(echo "$result" | grep -o '[0-9]\+')
        echo "SUCCESS (Job $job_id)"
        SUCCESS=$((SUCCESS + 1))
    else
        echo "FAILED - $result"
    fi
    
    sleep 1
done

echo ""
echo "=================================================="
echo "RESULTS:"
echo "  Successfully restarted: $SUCCESS"
echo "  Already in queue: $ALREADY_QUEUED"
echo "  Skipped (no script): $SKIPPED"
echo ""
echo "Check status: squeue -u $USER"
echo "Monitor: ./monitor_training.sh"
echo "=================================================="

# Clean up temp files
rm -f "$QUEUE_FILE"
"""
    
    script_path = os.path.join(output_dir, "restart_failed_jobs.sh")
    with open(script_path, 'w') as f:
        f.write(script_content)
    os.chmod(script_path, 0o755)
    print(f"Created restart script: {script_path}")
    return script_path

In [None]:
# Generate the helper shell scripts in OUTPUT_DIR and print a short usage
# summary. The banner rule is built once and reused for every header.
_rule = "=" * 60

print(f"\n{_rule}", "CREATING UTILITY SCRIPTS", _rule, sep="\n")

# The return values are treated as script paths below (only their basenames
# are shown, since the closing line points the user at OUTPUT_DIR itself).
monitor_script = create_job_monitoring_script(OUTPUT_DIR)
restart_script = create_restart_failed_jobs_script(OUTPUT_DIR)

print(f"\n{_rule}", "UTILITY SCRIPTS CREATED:", _rule, sep="\n")
print(f"1. Monitor jobs:        bash {os.path.basename(monitor_script)}")
print(f"2. Restart failed jobs: bash {os.path.basename(restart_script)}")
print()
print(f"All utility scripts are in: {OUTPUT_DIR}/")