# Run Confluence on an HPC

# Requirements
* Docker (plus a Docker Hub account) installed on a machine where you have sudo privileges, such that "docker --version" completes successfully, OR a GitHub account
* singularity or apptainer installed on your HPC
* Basic python environment


# Overall Tasks
### Alter 1. and 2. for your local setup
1. Git fork all of the repos you want to run, and make sure you have sudo privileges on a local machine where "docker --version" works
2. Prep an empty_mnt directory to store confluence run (requires gdown package in environment) and clone modules of interest
3. Run the "Prepare Images Locally" section of the local notebook on GitHub
4. Run the "Confluence Module SLURM Script Generator" section of this notebook on your HPC to create SLURM submission scripts for each module
5. Run the Confluence Driver Script Generator section of this notebook on your HPC to create a SLURM submission script that runs each of the modules one by one (the one click run)

---
## Functions (IGNORE)

In [27]:
# FUNCTIONS IGNORE
def build_and_push_images(repo_directory:str, target_repo_names:list, target_docker_names:list, docker_username:str, push:bool = True, custom_tag_name:str = 'latest'):
    """Build one Docker image per repository and optionally push each image.

    Each repository ``<repo_directory>/<repo_name>`` is built from its own
    ``Dockerfile`` and tagged
    ``<docker_username>/<docker_name lower-cased>:<custom_tag_name>``.

    Parameters
    ----------
    repo_directory : str
        Directory that contains the repositories.
    target_repo_names : list
        Repository directory names inside ``repo_directory``.
    target_docker_names : list
        Image names, paired position-by-position with ``target_repo_names``.
    docker_username : str
        Account/namespace used as the image prefix.
    push : bool, optional
        When True, run ``docker push`` after each successful build.
    custom_tag_name : str, optional
        Image tag (default ``'latest'``).

    Raises
    ------
    ValueError
        If the two name lists differ in length.
    RuntimeError
        If a ``docker build`` or ``docker push`` command cannot be started or
        exits with a non-zero status.
    """
    # Validate that lists are the same length
    if len(target_repo_names) != len(target_docker_names):
        raise ValueError("target_repo_names and target_docker_names must have the same length")

    for a_repo_name, a_docker_name in zip(target_repo_names, target_docker_names):
        repo_path = os.path.join(repo_directory, a_repo_name)
        docker_path = f'{docker_username}/{a_docker_name.lower()}:{custom_tag_name}'
        build_cmd = ['docker', 'build','--quiet', '-f', os.path.join(repo_path, "Dockerfile"), '-t', docker_path, repo_path]
        try:
            # check=True: without it a failing build returns normally and the
            # error handling below never triggers.
            sp.run(build_cmd, check=True)
        except (sp.SubprocessError, OSError) as e:
            raise RuntimeError(
                f"Docker build failed...\n"
                f"Build Command: {build_cmd}\n"
                f"Error: {e}"
            ) from e

        if not push:
            continue

        push_cmd = ['docker', 'push','--quiet', docker_path]
        try:
            sp.run(push_cmd, check=True)
        except (sp.SubprocessError, OSError) as e:
            raise RuntimeError(
                f"Docker push failed...\n"
                f"Push Command: {push_cmd}\n"
                f"Error: {e}"
            ) from e
            
def build_sifs_and_create_slurm_scripts(run_list, included_modules, base_dir, docker_username, build, custom_tag_name):
    """Optionally build Singularity images and write one SLURM script per module.

    For every run, the ``sh_scripts``, ``sif`` and ``report`` directories are
    created under ``<base_dir>/confluence_<run>``. One ``<module>.sh`` file is
    then written into ``sh_scripts`` for each selected module.

    Parameters
    ----------
    run_list : iterable
        Run names; each maps to ``<base_dir>/confluence_<run>``.
    included_modules : collection
        Module names (keys of ``command_dict`` below, e.g.
        ``'expanded_setfinder'``) to generate. An empty/falsy value selects
        every module.
    base_dir : path-like
        Directory that holds the ``confluence_<run>`` directories.
    docker_username : str
        Namespace the images are pulled from (``docker://<user>/<image>:<tag>``).
    build : bool
        When True, build ``<sif>/<image>.simg`` with ``singularity build -F``.
    custom_tag_name : str
        Image tag to pull.
    """
    # Keys of job_details that are bookkeeping, not #SBATCH options.
    non_sbatch_keys = ('run_command', 'module_name', 'docker_username', 'submission_prefix')

    for run in run_list:

        run_root = os.path.join(base_dir, f'confluence_{run}')
        # The mnt directory is only referenced by the generated commands; it is
        # not created here and has to exist already with the 'mnt' structure.
        mnt_dir = os.path.join(run_root, f'{run}_mnt')
        sh_dir = os.path.join(run_root, 'sh_scripts')    # generated submission scripts
        sif_dir = os.path.join(run_root, 'sif')          # built images
        report_dir = os.path.join(run_root, 'report')    # job logs
        for a_dir in (sh_dir, sif_dir, report_dir):
            os.makedirs(a_dir, exist_ok=True)

        # Create batch script details
        submission_prefix = '#SBATCH'

        job_details = {
        'partition': 'cpu-preempt',
        'nodes' : '1',
        'cpus-per-task': '1',
        'job-name': f'{run}_cfl',
        }

        command_dict = {
            'expanded_setfinder': f'singularity run --bind ' + f'{mnt_dir}/input:/data ' + os.path.join(sif_dir, 'setfinder.simg') + ' -r reaches_of_interest.json -c continent.json -e -s 17 -o /data -n /data -a MetroMan HiVDI SIC -i ${SLURM_ARRAY_TASK_ID}',
            'expanded_combine_data': f'singularity run --bind ' + f'{mnt_dir}/input:/data ' + os.path.join(sif_dir, 'combine_data.simg') + ' -d /data  -e -s 17',
            'input': f'GLOBAL_INDEX=$(( ${{OFFSET:-0}} + SLURM_ARRAY_TASK_ID ))\n\nsingularity run --bind ' + f'{mnt_dir}/input:/mnt/data ' + os.path.join(sif_dir, 'input.simg') + ' -v 17 -r /mnt/data/expanded_reaches_of_interest.json -c SWOT_L2_HR_RiverSP_D -i ${GLOBAL_INDEX}',
            'non_expanded_setfinder': f'singularity run --bind ' + f'{mnt_dir}/input:/data ' + os.path.join(sif_dir, 'setfinder.simg') + ' -c continent.json -s 17 -o /data -n /data -a MetroMan HiVDI SIC -i ${SLURM_ARRAY_TASK_ID}',
            'non_expanded_combine_data': f'singularity run --bind ' + f'{mnt_dir}/input:/data ' + os.path.join(sif_dir, 'combine_data.simg') + ' -d /data -s 17',
            'prediagnostics': f'GLOBAL_INDEX=$(( ${{OFFSET:-0}} + SLURM_ARRAY_TASK_ID ))\n\nsingularity run --bind ' + f'{mnt_dir}/input:/mnt/data/input,{mnt_dir}/diagnostics/prediagnostics:/mnt/data/output ' + os.path.join(sif_dir, f'prediagnostics.simg') + ' -i ${GLOBAL_INDEX} -r reaches.json',
            'constrained_priors': f'singularity run -c --writable-tmpfs --bind {mnt_dir}/input:/mnt/data {os.path.join(sif_dir, "priors.simg")} ' + ' -i ${SLURM_ARRAY_TASK_ID} -r constrained -p usgs riggs -g -s local',
            'unconstrained_priors': f'singularity run -c --writable-tmpfs --bind {mnt_dir}/input:/mnt/data {os.path.join(sif_dir, "priors.simg")} ' + ' -i ${SLURM_ARRAY_TASK_ID} -r unconstrained -p usgs riggs -g -s local',
            'hivdi': f'singularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe/hivdi:/mnt/data/flpe/hivdi ' + os.path.join(sif_dir, 'hivdi.simg') + ' /mnt/data/input/reaches.json --input-dir /mnt/data/input -i ${SLURM_ARRAY_TASK_ID}',
            'sic4dvar': f'GLOBAL_INDEX=$(( ${{OFFSET:-0}} + SLURM_ARRAY_TASK_ID ))\n\nsingularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe/sic4dvar:/mnt/data/output,{mnt_dir}/logs:/mnt/data/logs '+ os.path.join(sif_dir, 'sic4dvar.simg') + ' -r reaches.json --index ${GLOBAL_INDEX}',
            'metroman': f'GLOBAL_INDEX=$(( ${{OFFSET:-0}} + SLURM_ARRAY_TASK_ID ))\n\nsingularity run --env AWS_BATCH_JOB_ID="foo" --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe/metroman:/mnt/data/output ' + os.path.join(sif_dir, "metroman.simg") + ' -i ${GLOBAL_INDEX} -r metrosets.json -s local -v',
            'metroman_consolidation': f'GLOBAL_INDEX=$(( ${{OFFSET:-0}} + SLURM_ARRAY_TASK_ID ))\n\nsingularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe/metroman:/mnt/data/flpe ' + os.path.join(sif_dir, 'metroman_consolidation.simg') + ' -i ${GLOBAL_INDEX}',
            'unconstrained_momma': f'GLOBAL_INDEX=$(( ${{OFFSET:-0}} + SLURM_ARRAY_TASK_ID ))\n\nsingularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe/momma:/mnt/data/output ' + os.path.join(sif_dir, 'momma.simg') + ' -r reaches.json -m 3 -i ${GLOBAL_INDEX}',
            'constrained_momma': f'GLOBAL_INDEX=$(( ${{OFFSET:-0}} + SLURM_ARRAY_TASK_ID ))\n\nsingularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe/momma:/mnt/data/output ' + os.path.join(sif_dir, 'momma.simg') + ' -r reaches.json -m 3 -c -i ${GLOBAL_INDEX}',
            'sad': f'GLOBAL_INDEX=$(( ${{OFFSET:-0}} + SLURM_ARRAY_TASK_ID ))\n\nsingularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe/sad:/mnt/data/output ' + os.path.join(sif_dir, 'sad.simg') + ' --reachfile reaches.json --index ${GLOBAL_INDEX}',
            'moi': f'GLOBAL_INDEX=$(( ${{OFFSET:-0}} + SLURM_ARRAY_TASK_ID ))\n\nsingularity run --env AWS_BATCH_JOB_ID="foo" --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe:/mnt/data/flpe,{mnt_dir}/moi:/mnt/data/output ' + os.path.join(sif_dir, 'moi.simg') + ' -j basin.json -v -b unconstrained -i ${GLOBAL_INDEX}', # -s local
            'consensus': f'GLOBAL_INDEX=$(( ${{OFFSET:-0}} + SLURM_ARRAY_TASK_ID ))\n\nsingularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe:/mnt/data/flpe ' + os.path.join(sif_dir, 'consensus.simg') + ' --mntdir /mnt/data -r /mnt/data/input/reaches.json -i ${GLOBAL_INDEX}',
            'unconstrained_offline': f'GLOBAL_INDEX=$(( ${{OFFSET:-0}} + SLURM_ARRAY_TASK_ID ))\n\nsingularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe:/mnt/data/flpe,{mnt_dir}/moi:/mnt/data/moi,{mnt_dir}/offline:/mnt/data/output ' + os.path.join(sif_dir, 'offline.simg') + ' unconstrained timeseries integrator reaches.json ${GLOBAL_INDEX}',
            'validation': f'GLOBAL_INDEX=$(( ${{OFFSET:-0}} + SLURM_ARRAY_TASK_ID ))\n\nsingularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe:/mnt/data/flpe,{mnt_dir}/moi:/mnt/data/moi,{mnt_dir}/offline:/mnt/data/offline,{mnt_dir}/validation:/mnt/data/output ' + os.path.join(sif_dir, 'validation.simg') + ' -r reaches.json -t unconstrained -i ${GLOBAL_INDEX}',
            # 'output': f'singularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe:/mnt/data/flpe,{mnt_dir}/diagnostics:/mnt/data/diagnostics,{mnt_dir}/moi:/mnt/data/moi,{mnt_dir}/offline:/mnt/data/offline,{mnt_dir}/validation:/mnt/data/validation,{mnt_dir}/output:/mnt/data/output ' + os.path.join(sif_dir, 'output.simg') + ' -s local -j /app/metadata/metadata.json -m input prediagnostics momma hivdi neobam metroman sic4dvar sad consensus validation swot priors -v 17 -i ${SLURM_ARRAY_TASK_ID}'
            'output': f'singularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe:/mnt/data/flpe,{mnt_dir}/diagnostics:/mnt/data/diagnostics,{mnt_dir}/moi:/mnt/data/moi,{mnt_dir}/offline:/mnt/data/offline,{mnt_dir}/validation:/mnt/data/validation,{mnt_dir}/output:/mnt/data/output ' + os.path.join(sif_dir, 'output.simg') + ' -s local -j /app/metadata/metadata.json -m input prediagnostics momma metroman sic4dvar consensus swot -v 17 -i ${SLURM_ARRAY_TASK_ID}'
        }

        # Images already built for this run. Several modules share one image
        # (e.g. expanded_/non_expanded_setfinder -> setfinder), so each image is
        # built at most once.
        built_images = set()

        def create_slurm_script(job_details, build_image=False, sif_dir=sif_dir):
            """Optionally build the module's image, then write its .sh file."""
            module_name = job_details['module_name']
            prefix = job_details['submission_prefix']

            if build_image:
                # Strip the variant prefixes to get the shared image name.
                image_name = module_name.replace('expanded_', '').replace('non_', '').replace('unconstrained_', '').replace('constrained_', '')
                if image_name not in built_images:
                    result = sp.run(['singularity', 'build', '-F', os.path.join(sif_dir, image_name + '.simg'), f"docker://{job_details['docker_username']}/{image_name}:{custom_tag_name}"])
                    if result.returncode == 0:
                        built_images.add(image_name)
                    else:
                        # Keep going (the script is still written), but make the
                        # failure visible; a later module sharing the image retries.
                        print(f'WARNING: singularity build failed for {image_name} (exit code {result.returncode})')

            log_path = os.path.join(report_dir, f"{module_name}.%j_%a.out")
            script_lines = ['#!/bin/bash \n', f'{prefix} -o {log_path}' + ' \n']
            script_lines.extend(
                f'{prefix} --{item}={value} \n'
                for item, value in job_details.items()
                if item not in non_sbatch_keys
            )
            script_lines.append(job_details["run_command"])

            with open(os.path.join(sh_dir, f'{module_name}.sh'), 'w') as file:
                file.writelines(script_lines)

        for module_to_run, run_command in command_dict.items():

            # An empty/falsy included_modules means "generate everything".
            if included_modules and module_to_run not in included_modules:
                continue

            if module_to_run == 'moi':
                time_to_use = '00:30:00'
                mem_to_use = '2G'
            elif module_to_run == 'output':
                time_to_use = '05:00:00'
                mem_to_use = '4G'
            else:
                time_to_use = '00:20:00'
                mem_to_use = '4G'

            print('DIRECTORY NAME: ', run, '\nMODULE: ', module_to_run)

            # Key order matters: it is the order of the #SBATCH lines written.
            job_details.update({
                'run_command': run_command,
                'module_name': module_to_run,
                'mem': mem_to_use,
                'time': time_to_use,
                'docker_username': docker_username,
                'submission_prefix': submission_prefix,
                'job-name': f'{module_to_run}_{run}_cfl',
            })

            create_slurm_script(job_details=job_details, build_image=build, sif_dir=sif_dir)

                
def generate_slurm_driver(
    job_name: str,
    output_log_dir: str,
    partition: str,
    time_limit: str,
    nodes: int,
    ntasks: int,
    cpus_per_task: int,
    mem: str,
    run: str,
    directory: str,
    json_file: str,
    expanded_json_file: str,
    reach_json_file: str,
    basin_json_file: str,
    metroman_json_file: str,
    batch_size: int,
    concurrent_jobs: int,
    script_jobs: dict[str, str],
    scripts: list[str]
) -> str:
    """Return the text of a bash SLURM driver that submits module scripts in order.

    The generated script has three parts:

    1. ``#SBATCH`` options plus shell variables holding the run name, the
       script directory, the JSON file paths and the batching limits.
    2. A bash associative array ``script_jobs`` (script name -> job count,
       taken from the ``script_jobs`` argument) and a ``scripts`` array listing
       the scripts in submission order.
    3. A fixed loop that, per script, picks a job count (the array entry, a
       ``jq length`` of one of the JSON files, or ``default_jobs``), submits the
       work as ``sbatch --array`` batches of at most ``batch_size`` tasks with
       an ``OFFSET`` export, and polls ``squeue`` until each batch is gone,
       cancelling tasks reported as "requeued held".

    Nothing is written to disk; the caller saves the returned string.
    """
    # Part 1 + opening of the associative array, one entry per output line.
    header_lines = [
        "#!/bin/bash",
        f"#SBATCH --job-name={job_name}",
        f"#SBATCH --output={output_log_dir}/{job_name}_%j_%a.out",
        f"#SBATCH --error={output_log_dir}/{job_name}_%j_%a.err",
        f"#SBATCH --partition={partition}",
        f"#SBATCH --time={time_limit}",
        f"#SBATCH --nodes={nodes}",
        f"#SBATCH --ntasks={ntasks}",
        f"#SBATCH --cpus-per-task={cpus_per_task}",
        f"#SBATCH --mem={mem}",
        "",
        f"run='{run}'",
        'echo "Run: $run"',
        "",
        f'directory="{directory}"',
        "",
        "# Parameters",
        f'json_file="{json_file}"',
        f'expanded_json_file="{expanded_json_file}"',
        f'reach_json_file="{reach_json_file}"',
        f'basin_json_file="{basin_json_file}"',
        f'metroman_json_file="{metroman_json_file}"',
        'default_jobs=$(jq length "$json_file")',
        "",
        "# Adjust to HPC requirements",
        f"batch_size={batch_size}",
        f"concurrent_jobs={concurrent_jobs}",
        "",
        "# Map specific script names to their job counts",
        "declare -A script_jobs=(",
    ]
    preamble = "\n".join(header_lines) + "\n"

    # Part 2: associative-array entries, then the ordered scripts array.
    job_count_entries = "".join(
        f"    [{script_name}]={job_count}\n"
        for script_name, job_count in script_jobs.items()
    )
    listed_scripts = '    ' + '\n    '.join(scripts)
    scripts_section = "scripts=(\n" + listed_scripts + "\n)\n"

    # Part 3: static bash, kept as a plain raw string so the shell braces and
    # the "\$default_jobs" escape appear exactly as they are emitted.
    submission_loop = r"""




for slurm_script in "${scripts[@]}"; do
    echo "Starting submission for: $slurm_script"
    date

    # Initialize num_jobs from script_jobs array FIRST
    num_jobs="${script_jobs[$slurm_script]}"

    # Dynamic job count updates (files created during workflow)
    if [[ -s "$expanded_json_file" ]]; then
      expanded_jobs=$(jq length "$expanded_json_file")
      script_jobs["input.sh"]=$expanded_jobs
      # Update num_jobs if this is the input script
      if [[ "$slurm_script" == "input.sh" ]]; then
        num_jobs=$expanded_jobs
      fi
    fi

    if [[ -s "$basin_json_file" ]]; then
      basin_jobs=$(jq length "$basin_json_file")
      script_jobs["moi.sh"]=$basin_jobs
      if [[ "$slurm_script" == "moi.sh" ]]; then
        num_jobs=$basin_jobs
      fi
    fi

    if [[ -s "$metroman_json_file" ]]; then
      metroman_jobs=$(jq length "$metroman_json_file")
      script_jobs["metroman.sh"]=$metroman_jobs
      if [[ "$slurm_script" == "metroman.sh"  ]]; then
        num_jobs=$metroman_jobs
      fi
    fi

    # Fallback: all remaining $default_jobs modules use reaches.json once available,
    # otherwise fall back to reaches_of_interest.json
    if [[ -z "$num_jobs" || "$num_jobs" == "\$default_jobs" ]]; then
        if [[ -s "$reach_json_file" ]]; then
            num_jobs=$(jq length "$reach_json_file")
            echo "Using reach_json_file job count ($num_jobs) for $slurm_script"
        else
            num_jobs=$default_jobs
            echo "Using reaches_of_interest.json job count ($num_jobs) for $slurm_script"
        fi
    fi

    # Safety check
    if [[ -z "$num_jobs" ]]; then
        echo "Warning: No job count found for $slurm_script. Skipping."
        continue
    fi

    start=0
    while [ $start -lt $num_jobs ]; do
        end=$((start + batch_size - 1))
        if [ $end -ge $num_jobs ]; then
            end=$((num_jobs - 1))
        fi

        echo "Submitting jobs $start to $end from $slurm_script"
        job_id=$(sbatch --export=ALL,OFFSET=${start} --array=0-$((end - start))%${concurrent_jobs} "${directory}/${slurm_script}")
        # job_id=$(sbatch --array=${start}-${end}%${concurrent_jobs} "${directory}/${slurm_script}")
        job_id_number=$(echo $job_id | awk '{print $4}')

        echo "Waiting for job array $job_id_number to finish..."
        while squeue -j "$job_id_number" 2>/dev/null | grep -q "$job_id_number"; do
            job_info=$(squeue -j "${job_id_number}[]" --noheader -o "%i %T %R")
            held_tasks=$(echo "$job_info" | grep -i "requeued held" | awk '{print $1}')

            if [[ -n "$held_tasks" ]]; then
                echo "Detected held tasks in array $job_id_number:"
                echo "$held_tasks"
                for task in $held_tasks; do
                    echo "Cancelling task $task..."
                    scancel "$task"
                done
            fi

            sleep 10
        done

        echo "Batch $job_id_number has finished. Submitting next batch."
        date

        start=$((end + 1))
        sleep 5
    done      
    
done

echo "Run $run has finished successfully."
"""
    return preamble + job_count_entries + ")\n\n" + scripts_section + submission_loop



## 1. Prepare Run

---
* Assumes you have local Docker images built and pushed to DockerHub
* Download or copy empty mnt and point to necessary directories

In [28]:
import os
import shutil
import subprocess as sp
from pathlib import Path
import json
import glob
import numpy as np
import pandas as pd

# ------------------------------------------------
# PATHS: change these to point wherever you need confluence to be on this machine
RUN_NAME = 'runTest'  # specific run name, e.g. 'test'
BASE_DIR = Path('/path/confluence/')  # directory storing confluence runs
REPO_DIR = os.path.join(BASE_DIR, 'modules')  # directory storing repos, i.e. ./modules/

# Everything below works relative to BASE_DIR.
os.chdir(BASE_DIR)

src_dir = BASE_DIR / 'confluence_empty'  # template directory that gets renamed for the run
run_dir = BASE_DIR / f'confluence_{RUN_NAME}'  # new directory for this run

# ------------------------------------------------
# SETUP: GitHub and Docker (Docker must be running)
github_name = 'github_username'  # GitHub username or organization name where repos are located
docker_username = 'docker_username'
custom_tag_name = 'latest'  # image tag used for version control; 'latest' is the default
push = True  # only set True to store images on Docker Hub (one way to move them to the HPC)

# One name per mnt; list several names to prepare several preset mnts.
run_list = [RUN_NAME]


In [29]:
# Choose modules of interest to run

# INCLUDED_MODULES: names of the Confluence modules to run. The driver cell
# turns each entry into "<module>.sh", in this order, so the order here is the
# submission order. Comment an entry out to skip that module.
# The expanded_* and non_expanded_* entries each work from the single
# 'setfinder' and 'combine_data' module.
INCLUDED_MODULES = [
    'expanded_setfinder',
    'expanded_combine_data',
    'input',
    'non_expanded_setfinder',
    'non_expanded_combine_data',
    'prediagnostics',
    # 'priors',
    'metroman',
    'metroman_consolidation',
    'unconstrained_momma',
    'hivdi',
    # 'sad',
    'sic4dvar',
    'consensus',
    # 'moi',
    # 'unconstrained_offline',
    # 'validation',
    'output'
]

# TARGET_MODULES: modules to pull/build. These are the base module names
# (no expanded_/non_expanded_/unconstrained_ prefixes) -- presumably the
# repository/image names; verify against your registry.
TARGET_MODULES = [
    'setfinder',
    'combine_data',
    'input',
    'prediagnostics',
    # 'priors',
    'metroman',
    'metroman_consolidation',
    'momma',
    'hivdi',
    # 'sad',
    'sic4dvar',
    'consensus',
    # 'moi',
    # 'offline',
    # 'validation',
    'output'
]



In [33]:
###############################
## INITIAL OR NEW MNT DOWNLOAD:
# comment out after first install
###############################

## Download the empty mnt directory (holds the input data and, later, the output data)
# ! pip install gdown
! gdown 10gJwg0wsl51K_mcoXGq1uQVW34oQwrJc



In [31]:
####################
## SUBSEQUENT RUNS:
# Use this to make a new empty directory to run new reaches
####################

# tarfile is not among the notebook's top-level imports, so import it here;
# without it this cell fails with NameError.
import tarfile

## Extract the template next to where src_dir will live (<src_dir>.tar.gz)
tar_path = src_dir.with_suffix('.tar.gz')
with tarfile.open(tar_path, 'r:gz') as tar:
    tar.extractall(path=src_dir.parent)

# Rename the extracted template to your run directory
src_dir.rename(run_dir)
# Rename the internal mnt directory after the run
p = run_dir / "empty_mnt"
p.rename(p.with_name(f"{RUN_NAME}_mnt"))



RUN_NAME: svs17
REPO_DIR: /nas/cee-water/cjgleason/ellie/SWOT/confluence/modules/D
SIF_DIR: /nas/cee-water/cjgleason/ellie/SWOT/confluence/confluence_svs17/sif


In [None]:
# Locations inside the run directory
mnt_dir = run_dir / f'{RUN_NAME}_mnt'  # the mnt storing all confluence run data
report_dir = run_dir / 'report'  # job logs
sh_dir = run_dir / 'sh_scripts'  # the sh scripts that run each module
SIF_DIR = run_dir / 'sif'  # built images

# Expose the main settings as environment variables (values must be strings).
os.environ.update({
    'RUN_NAME': RUN_NAME,
    'BASE_DIR': str(BASE_DIR),
    'REPO_DIR': str(REPO_DIR),
    'SIF_DIR': str(SIF_DIR),
})

# Echo the settings so the cell output records what this run points at.
for _label, _value in (('RUN_NAME', RUN_NAME), ('REPO_DIR', REPO_DIR), ('SIF_DIR', SIF_DIR)):
    print(f'{_label}: {_value}')

## 2. Create sh Scripts

---
### Confluence Module SLURM Script Generator (RUN ON HPC, NOT LOCALLY)
* Builds simg files from your Docker Hub images and generates scripts to submit to a SLURM job scheduler

In [32]:
# Build the scripts and singularity files
#
# included_modules is matched against the generator's command names
# ('expanded_setfinder', 'unconstrained_momma', ...), so it must be
# INCLUDED_MODULES. Passing the base image names (TARGET_MODULES) silently
# skips every prefixed module, and the driver then references .sh files that
# were never written.
build_sifs_and_create_slurm_scripts(
    run_list=run_list,
    included_modules=INCLUDED_MODULES,
    base_dir=BASE_DIR,
    docker_username=docker_username,
    build=push,
    custom_tag_name=custom_tag_name,
)

DIRECTORY NAME:  svs17 
MODULE:  hivdi


INFO:    Starting build...
INFO:    Fetching OCI image...
INFO:    Extracting OCI image...
INFO:    Inserting Apptainer configuration...
INFO:    Creating SIF file...
INFO:    Build complete: /nas/cee-water/cjgleason/ellie/SWOT/confluence/confluence_svs17/sif/hivdi.simg


## 3. Create Driver Script to run multiple modules

---
### Confluence Driver Script Generator (RUN ON HPC, NOT LOCALLY)
* Creates a batch submission script that will run all of your sif files in serial
* use sbatch to submit the entire run
* Request low resources and a long time limit here: all this job does is launch the SLURM scripts you created for each module, so it is basically a job manager

In [None]:
# Create driver SLURM script for each run in run_list

# Define which modules have special (hardcoded) job counts
HARDCODED_JOBS = {
    "expanded_setfinder": "6",
    "expanded_combine_data": "1",
    "non_expanded_setfinder": "6",
    "non_expanded_combine_data": "1",
    "unconstrained_priors": "6",
    "constrained_priors": "6",
    "metroman_consolidation": "6",
    "output": "6",
}

# Modules whose job count is resolved while the driver runs: they get no entry
# in script_jobs, and the driver falls back to the length of the relevant JSON
# file (or to $default_jobs).
DYNAMIC_MODULES = [
    "input",
    "prediagnostics",
    "metroman",
    "hivdi",
    "sic4dvar",
    "unconstrained_momma",
    "constrained_momma",
    "sad",
    "moi",
    "consensus",
    "unconstrained_offline",
    "validation",
]

for run in run_list:

    run = str(run)
    # Each run has its own <BASE_DIR>/confluence_<run> directory (the layout the
    # script generator uses). Using the single global run_dir here would make
    # every run in run_list overwrite the same driver.
    directory = os.path.join(BASE_DIR, f'confluence_{run}')

    job_name = run
    output_log_dir = f"{directory}/log"
    partition = "cpu-preempt" #your partition here
    time_limit = "30:00:00"
    nodes = 1
    ntasks = 1
    cpus_per_task = 1
    mem = "5G"

    sh_directory = f"{directory}/sh_scripts"
    json_file = f"{directory}/{run}_mnt/input/reaches_of_interest.json"
    expanded_json_file = f"{directory}/{run}_mnt/input/expanded_reaches_of_interest.json"
    reach_json_file = f"{directory}/{run}_mnt/input/reaches.json"
    basin_json_file = f"{directory}/{run}_mnt/input/basin.json"
    metroman_json_file = f"{directory}/{run}_mnt/input/metrosets.json"

    # SLURM does not create the directory named in --output/--error, so make
    # sure it exists; same for the directory the driver is written into.
    os.makedirs(output_log_dir, exist_ok=True)
    os.makedirs(sh_directory, exist_ok=True)

    batch_size = 1000 # cluster specific
    concurrent_jobs = 400 # cluster specific

    # Only hardcoded modules get an explicit job count; dynamic ones are left
    # out on purpose (see DYNAMIC_MODULES above).
    script_jobs = {}
    for module in INCLUDED_MODULES:
        if module in HARDCODED_JOBS:
            script_jobs[f"{module}.sh"] = HARDCODED_JOBS[module]
        elif module not in DYNAMIC_MODULES:
            print(f"WARNING: '{module}' is in neither HARDCODED_JOBS nor DYNAMIC_MODULES; "
                  "the driver will use its fallback job count.")

    # Scripts are submitted in the same order as INCLUDED_MODULES
    scripts = [f"{module}.sh" for module in INCLUDED_MODULES]

    driver_script = generate_slurm_driver(
        job_name=job_name,
        output_log_dir=output_log_dir,
        partition=partition,
        time_limit=time_limit,
        nodes=nodes,
        ntasks=ntasks,
        cpus_per_task=cpus_per_task,
        mem=mem,
        run=run,
        directory=sh_directory,
        json_file=json_file,
        expanded_json_file=expanded_json_file,
        reach_json_file=reach_json_file,
        basin_json_file=basin_json_file,
        metroman_json_file=metroman_json_file,
        batch_size=batch_size,
        concurrent_jobs=concurrent_jobs,
        script_jobs=script_jobs,
        scripts=scripts,
    )

    # Save to file
    with open(f"{sh_directory}/slurm_driver.sh", "w") as f:
        f.write(driver_script)


# Optionally submit
# import subprocess
# subprocess.run(["sbatch", f"{sh_dir}/slurm_driver.sh"], check=True)


---
# Reach or Module Changes

#### In order to run on different Type I reaches
* modify the file at /mnt/input/reaches_of_interest.json

#### In order to change a module and test it:
### Option 1
* change the module locally, build it and push to dockerhub using the first part of this notebook and then run as usual
* You can use the run_list variable to generate more submission scripts per module to test more than one change at a time. However, whenever you submit them, they will still run one at a time; the driver just submits the next run automatically.
* Docker tag names highly recommended (custom_tag_name) for version control

### Option 2
* Use code below to generate everything in the HPC environment from cloning Git modules to running Confluence
* Docker images are built initially using GitHub container registry (ghcr.io/) and then overwritten with your HPC modules 
* This allows you to change module, re-build containers, and test as a module instantly
* Version control is handled by GitHub tag:
* Tag your local image
*      ! docker tag output:local ghcr.io/myGitAccount/moduleName:my-custom-tag
* Push to registry
*      docker push ghcr.io/myGitAccount/moduleName:my-custom-tag
* Modify tag name in function from 'latest' to 'my-custom-tag'

### Option 3
* Use a symlink to connect a previous run to a new directory
* Run module of interest using the data in previous modules (only need to run the changed module!)
* Combine 2 and 3 for efficient testing of multiple changes to one or many modules!

In [None]:
#############################################

# Example Option 2:
## Run Confluence on an HPC end-to-end

## Requirements
* GitHub account
* apptainer installed on your HPC
* Basic python environment


## Overall Tasks
* Git clone all of the repos you want to run
* Prep an empty_mnt directory to store confluence run (requires gdown package in environment)
* Run image prep function to create the images from GitHub and your cloned modules
* Create SLURM submission scripts for each module
* Run the Confluence Driver Script Generator section of this notebook on your HPC to create a SLURM submission script that runs each of the modules one by one (the one click run)



---
## Functions (IGNORE)

In [15]:
def clone_repos(github_name, repo_dir, repo_names, name_map=None, branch='main'):
    """Clone repositories from GitHub, replacing any existing local copy.

    Parameters
    ----------
    github_name : str
        GitHub username or organization name
    repo_dir : str
        Directory to clone repos into (created if missing)
    repo_names : list
        Local directory names to clone into, one per repository
    name_map : dict, optional
        Maps a local name to the GitHub repository name when the two differ.
        Names that are not in the map are used unchanged. Defaults to no
        mapping.
    branch : str or dict, optional
        Branch name to clone. Can be:
        - A string: same branch for all repos (default: 'main')
        - A dict: mapping local name to specific branch ('main' if absent)

    Notes
    -----
    Failures are reported with ``print`` and do not raise, so one bad
    repository does not stop the remaining clones.
    """
    if name_map is None:
        name_map = {}
    os.makedirs(repo_dir, exist_ok=True)

    for name in repo_names:
        path = os.path.join(repo_dir, name)
        repo_name = name_map.get(name, name)
        url = f'https://github.com/{github_name}/{repo_name}.git'

        # Determine which branch to use
        branch_name = branch.get(name, 'main') if isinstance(branch, dict) else branch

        if os.path.exists(path):
            print(f'[Remove] Deleting existing {name} to overwrite...')
            try:
                shutil.rmtree(path)  # rm -rf
            except OSError as e:
                print(f"Error: {path} : {e.strerror}")
                # git refuses to clone into an existing non-empty directory,
                # so there is no point attempting the clone.
                print(f'[Skip] Not cloning {name}: existing copy could not be removed.')
                continue

        print(f'[Clone] Cloning {name} from branch {branch_name}...')
        result = sp.run(['git', 'clone', '--branch', branch_name, url, name], cwd=repo_dir)
        if result.returncode != 0:
            print(f'[Error] git clone failed for {name} (exit code {result.returncode})')


#-------------------------------------------------

def create_slurm_scripts(run_name, mnt_dir, sif_dir, sh_dir, report_dir, included_modules):
    """Write one SLURM array-job script (``<module>.sh``) per Confluence module.

    Parameters
    ----------
    run_name : str
        Run identifier; used in each job name (``<module>_<run_name>_cfl``).
    mnt_dir : str or path-like
        Run mount directory whose sub-directories are bound into the containers.
    sif_dir : str or path-like
        Directory holding the built ``<module>.sif`` images.
    sh_dir : str or path-like
        Existing directory the ``<module>.sh`` scripts are written to.
    report_dir : str or path-like
        Directory used for each job's ``-o`` log file.
    included_modules : list or None
        Module keys to generate scripts for. An empty or ``None`` value
        generates a script for every known module.

    Notes
    -----
    Modules that are submitted in batches by the driver script compute
    ``GLOBAL_INDEX = OFFSET + SLURM_ARRAY_TASK_ID`` so each batch continues
    where the previous one stopped (``OFFSET`` defaults to 0 when unset).
    ``hivdi`` follows the same convention and runs ``hivdi.sif`` through
    apptainer, matching the image name produced by the notebook's build step.
    """
    submission_prefix = '#SBATCH'

    # SBATCH options shared by every module (each written as "--key=value").
    shared_options = {
        'partition': 'cpu-preempt',
        'nodes': '1',
        'cpus-per-task': '1',
    }

    # (time, mem) per module; anything not listed uses the default.
    resource_overrides = {
        'moi': ('00:30:00', '2G'),
        'output': ('05:00:00', '4G'),
    }
    default_resources = ('00:20:00', '4G')

    def sif(image_name):
        """Return the path of a container image inside ``sif_dir``."""
        return os.path.join(sif_dir, image_name)

    # Shell fragments kept as plain strings so their braces need no escaping.
    global_index = 'GLOBAL_INDEX=$(( ${OFFSET:-0} + SLURM_ARRAY_TASK_ID ))\n\n'
    task_id = '${SLURM_ARRAY_TASK_ID}'
    batch_id = '${GLOBAL_INDEX}'

    command_dict = {
        'expanded_setfinder': (
            f'apptainer run --bind {mnt_dir}/input:/data {sif("setfinder.sif")}'
            f' -r reaches_of_interest.json -c continent.json -e -s 17 -o /data -n /data'
            f' -a MetroMan HiVDI SIC NeoBAM -i {task_id}'
        ),
        'expanded_combine_data': (
            f'apptainer run --bind {mnt_dir}/input:/data {sif("combine_data.sif")}'
            f' -d /data  -e -s 17'
        ),
        'input': (
            f'{global_index}apptainer run --bind {mnt_dir}/input:/mnt/data {sif("input.sif")}'
            f' -v 17 -r /mnt/data/expanded_reaches_of_interest.json'
            f' -c SWOT_L2_HR_RiverSP_D -i {batch_id}'
        ),
        'non_expanded_setfinder': (
            f'apptainer run --bind {mnt_dir}/input:/data {sif("setfinder.sif")}'
            f' -c continent.json -s 17 -o /data -n /data'
            f' -a MetroMan HiVDI SIC NeoBAM -i {task_id}'
        ),
        'non_expanded_combine_data': (
            f'apptainer run --bind {mnt_dir}/input:/data {sif("combine_data.sif")}'
            f' -d /data -s 17'
        ),
        'prediagnostics': (
            f'{global_index}apptainer run --bind {mnt_dir}/input:/mnt/data/input,'
            f'{mnt_dir}/diagnostics/prediagnostics:/mnt/data/output {sif("prediagnostics.sif")}'
            f' -i {batch_id} -r reaches.json'
        ),
        'constrained_priors': (
            f'apptainer run -c --writable-tmpfs --bind {mnt_dir}/input:/mnt/data {sif("priors.sif")} '
            f' -i {task_id} -r constrained -p usgs riggs -g -s local'
        ),
        'unconstrained_priors': (
            f'apptainer run -c --writable-tmpfs --bind {mnt_dir}/input:/mnt/data {sif("priors.sif")} '
            f' -i {task_id} -r unconstrained -p usgs riggs -g -s local'
        ),
        # Uses apptainer + hivdi.sif (what the build step creates) and the
        # OFFSET-aware index so batched submissions do not repeat reaches.
        'hivdi': (
            f'{global_index}apptainer run --bind {mnt_dir}/input:/mnt/data/input,'
            f'{mnt_dir}/flpe/hivdi:/mnt/data/flpe/hivdi {sif("hivdi.sif")}'
            f' /mnt/data/input/reaches.json --input-dir /mnt/data/input -i {batch_id}'
        ),
        'sic4dvar': (
            f'{global_index}apptainer run --bind {mnt_dir}/input:/mnt/data/input,'
            f'{mnt_dir}/flpe/sic4dvar:/mnt/data/output,{mnt_dir}/logs:/mnt/data/logs {sif("sic4dvar.sif")}'
            f' -r reaches.json --index {batch_id}'
        ),
        'metroman': (
            f'{global_index}apptainer run --env AWS_BATCH_JOB_ID="foo" --bind {mnt_dir}/input:/mnt/data/input,'
            f'{mnt_dir}/flpe/metroman:/mnt/data/output {sif("metroman.sif")}'
            f' -i {batch_id} -r metrosets.json -s local -v'
        ),
        'metroman_consolidation': (
            f'{global_index}apptainer run --bind {mnt_dir}/input:/mnt/data/input,'
            f'{mnt_dir}/flpe/metroman:/mnt/data/flpe {sif("metroman_consolidation.sif")}'
            f' -i {batch_id}'
        ),
        'unconstrained_momma': (
            f'{global_index}apptainer run --bind {mnt_dir}/input:/mnt/data/input,'
            f'{mnt_dir}/flpe/momma:/mnt/data/output {sif("momma.sif")}'
            f' -r reaches.json -m 3 -i {batch_id}'
        ),
        'constrained_momma': (
            f'{global_index}apptainer run --bind {mnt_dir}/input:/mnt/data/input,'
            f'{mnt_dir}/flpe/momma:/mnt/data/output {sif("momma.sif")}'
            f' -r reaches.json -m 3 -c -i {batch_id}'
        ),
        'sad': (
            f'{global_index}apptainer run --bind {mnt_dir}/input:/mnt/data/input,'
            f'{mnt_dir}/flpe/sad:/mnt/data/output {sif("sad.sif")}'
            f' --reachfile reaches.json --index {batch_id}'
        ),
        'moi': (  # "-s local" intentionally not passed
            f'{global_index}apptainer run --env AWS_BATCH_JOB_ID="foo" --bind {mnt_dir}/input:/mnt/data/input,'
            f'{mnt_dir}/flpe:/mnt/data/flpe,{mnt_dir}/moi:/mnt/data/output {sif("moi.sif")}'
            f' -j basin.json -v -b unconstrained -i {batch_id}'
        ),
        'consensus': (
            f'{global_index}apptainer run --bind {mnt_dir}/input:/mnt/data/input,'
            f'{mnt_dir}/flpe:/mnt/data/flpe {sif("consensus.sif")}'
            f' --mntdir /mnt/data -r /mnt/data/input/reaches.json -i {batch_id}'
        ),
        'unconstrained_offline': (
            f'{global_index}apptainer run --bind {mnt_dir}/input:/mnt/data/input,'
            f'{mnt_dir}/flpe:/mnt/data/flpe,{mnt_dir}/moi:/mnt/data/moi,'
            f'{mnt_dir}/offline:/mnt/data/output {sif("offline.sif")}'
            f' unconstrained timeseries integrator reaches.json {batch_id}'
        ),
        'validation': (
            f'{global_index}apptainer run --bind {mnt_dir}/input:/mnt/data/input,'
            f'{mnt_dir}/flpe:/mnt/data/flpe,{mnt_dir}/moi:/mnt/data/moi,'
            f'{mnt_dir}/offline:/mnt/data/offline,{mnt_dir}/validation:/mnt/data/output {sif("validation.sif")}'
            f' -r reaches.json -t unconstrained -i {batch_id}'
        ),
        # Full module list for -m, if every module was run:
        #   input prediagnostics momma hivdi neobam metroman sic4dvar sad
        #   consensus validation swot priors
        'output': (
            f'apptainer run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe:/mnt/data/flpe,'
            f'{mnt_dir}/diagnostics:/mnt/data/diagnostics,{mnt_dir}/moi:/mnt/data/moi,'
            f'{mnt_dir}/offline:/mnt/data/offline,{mnt_dir}/validation:/mnt/data/validation,'
            f'{mnt_dir}/output:/mnt/data/output {sif("output.sif")}'
            f' -s local -j /app/metadata/metadata.json'
            f' -m input momma metroman sic4dvar hivdi consensus swot -v 17 -i {task_id}'
        ),
    }

    def write_slurm_script(module_name, sbatch_options, run_command):
        """Write ``<module_name>.sh``: shebang, SBATCH header lines, then the command."""
        log_path = os.path.join(report_dir, f"{module_name}.%j_%a.out")
        # `with` guarantees the file is flushed and closed even if a write fails.
        with open(os.path.join(sh_dir, f'{module_name}.sh'), 'w') as script:
            script.write('#!/bin/bash \n')
            script.write(f'{submission_prefix} -o {log_path}' + ' \n')
            for option, value in sbatch_options.items():
                script.write(f'{submission_prefix} --{option}={value} \n')
            script.write(run_command)

    for module_to_run, run_command in command_dict.items():
        # An empty/None selection means "generate every module".
        if included_modules and module_to_run not in included_modules:
            continue

        time_to_use, mem_to_use = resource_overrides.get(module_to_run, default_resources)

        print('DIRECTORY NAME: ', run_name, '\nMODULE: ', module_to_run)

        # A fresh dict per module, so nothing leaks between iterations.
        sbatch_options = {
            **shared_options,
            'job-name': f'{module_to_run}_{run_name}_cfl',
            'mem': mem_to_use,
            'time': time_to_use,
        }

        write_slurm_script(module_to_run, sbatch_options, run_command)



#-------------------------------------------------

def create_singularity_def(mod, repo_dir):
    """Generate ``Singularity.def`` for a module from its ``Dockerfile``.

    The definition bootstraps from ``ghcr.io/swot-confluence/<mod>:latest``
    and then overlays the local files named by the Dockerfile's ``COPY``
    instructions, so the image runs the code of the local clone.

    Parameters
    ----------
    mod : str
        Module name; also the sub-directory of ``repo_dir`` holding the clone.
    repo_dir : str or path-like
        Directory containing the cloned module repositories.

    Returns
    -------
    str or None
        Path of the written ``Singularity.def``, or ``None`` when the module
        has no ``Dockerfile``.

    Notes
    -----
    COPY handling: lines containing ``--from`` (multi-stage copies) are
    ignored, other ``--flag`` options such as ``--chown=...`` are dropped,
    every source of a multi-source COPY is paired with the final destination,
    and sources whose name contains ``requirements`` are skipped. Only the
    exec (JSON array) form of ENTRYPOINT is translated into ``%runscript``.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    dockerfile_path = os.path.join(repo_dir, mod, 'Dockerfile')
    def_path = os.path.join(repo_dir, mod, 'Singularity.def')

    if not os.path.exists(dockerfile_path):
        print(f'{mod}: Dockerfile x, skip')
        return None

    with open(dockerfile_path) as f:
        content = f.read()

    files = []
    entrypoint = None

    for line in content.split('\n'):
        line = line.strip()

        if line.startswith('COPY'):
            if '--from' in line:
                # Copies from another build stage have no local source file.
                continue
            # Drop option flags so they are not mistaken for the source path.
            args = [part for part in line.split()[1:] if not part.startswith('--')]
            if len(args) < 2:
                continue
            *sources, dst = args
            for src in sources:
                if 'requirements' in src:
                    continue
                # Strip only a leading "./" (a blanket replace would mangle "../x").
                if src.startswith('./'):
                    src = src[2:]
                files.append((src, dst))

        elif line.startswith('ENTRYPOINT') and '[' in line:
            matches = re.findall(r'"([^"]*)"', line)
            if matches:
                entrypoint = ' '.join(matches)

    # Singularity.def with post-processing fix
    def_content = f'''Bootstrap: docker
From: ghcr.io/swot-confluence/{mod}:latest

%files
'''
    for src, dst in files:
        def_content += f'    {src} {dst}\n'

    # Add post section to fix nested directories
    if mod == 'output':
        def_content += '''
%post
    # Fix nested output directory - copy contents up one level
    if [ -d /app/output/output ]; then
        cp -rf /app/output/output/* /app/output/
        rm -rf /app/output/output
    fi
'''

    if entrypoint:
        def_content += f'''
%runscript
    exec {entrypoint} "$@"
'''

    with open(def_path, 'w') as f:
        f.write(def_content)

    print(f'{mod}: Singularity.def created')
    return def_path

#------------------------------------------------------------
                
def generate_slurm_driver(
    job_name: str,
    output_log_dir: str,
    partition: str,
    time_limit: str,
    nodes: int,
    ntasks: int,
    cpus_per_task: int,
    mem: str,
    run: str,
    directory: str,
    json_file: str,
    expanded_json_file: str,
    reach_json_file: str,
    basin_json_file: str,
    metroman_json_file: str,
    batch_size: int,
    concurrent_jobs: int,
    script_jobs: dict[str, str],
    scripts: list[str]
) -> str:
    """Return the text of a bash "driver" job that submits module scripts in order.

    The generated script loops over ``scripts``; for each one it works out a
    job count, submits the jobs as SLURM arrays of at most ``batch_size``
    tasks (``sbatch --array=0-N%concurrent_jobs`` with ``OFFSET`` exported as
    the batch start index), and polls ``squeue`` every 10 seconds until the
    array has left the queue before submitting the next batch. Tasks whose
    ``squeue`` line contains "requeued held" are cancelled with ``scancel``.

    Job-count resolution in the generated script, per module script:
    the value from ``script_jobs`` first; ``input.sh``, ``moi.sh`` and
    ``metroman.sh`` are overridden by ``jq length`` of the expanded, basin
    and metroman JSON files when those files are non-empty; a script with no
    count uses ``jq length`` of ``reach_json_file`` when it is non-empty,
    otherwise ``jq length`` of ``json_file``.

    Parameters
    ----------
    job_name, partition, time_limit, nodes, ntasks, cpus_per_task, mem
        Values for the driver job's own ``#SBATCH`` header lines.
    output_log_dir
        Directory used in the driver's ``--output`` / ``--error`` paths.
    run
        Run label echoed by the generated script.
    directory
        Directory containing the per-module ``.sh`` scripts to submit.
    json_file, expanded_json_file, reach_json_file, basin_json_file,
    metroman_json_file
        Paths of the JSON files whose lengths provide job counts (see above).
    batch_size
        Maximum number of array tasks per ``sbatch`` submission.
    concurrent_jobs
        Maximum number of array tasks allowed to run at once (the ``%`` limit).
    script_jobs
        Mapping of script file name to a fixed job count, written into the
        bash associative array ``script_jobs``.
    scripts
        Script file names, in the order they are to be submitted.

    Returns
    -------
    str
        Complete bash script (header followed by the submission loop).
    """
    # f-string: single braces below are Python substitutions.
    slurm_header = f"""#!/bin/bash
#SBATCH --job-name={job_name}
#SBATCH --output={output_log_dir}/{job_name}_%j_%a.out
#SBATCH --error={output_log_dir}/{job_name}_%j_%a.err
#SBATCH --partition={partition}
#SBATCH --time={time_limit}
#SBATCH --nodes={nodes}
#SBATCH --ntasks={ntasks}
#SBATCH --cpus-per-task={cpus_per_task}
#SBATCH --mem={mem}

run='{run}'
echo "Run: $run"

directory="{directory}"

# Parameters
json_file="{json_file}"
expanded_json_file="{expanded_json_file}"
reach_json_file="{reach_json_file}"
basin_json_file="{basin_json_file}"
metroman_json_file="{metroman_json_file}"
default_jobs=$(jq length "$json_file")

# Adjust to HPC requirements
batch_size={batch_size}
concurrent_jobs={concurrent_jobs}

# Map specific script names to their job counts
declare -A script_jobs=(
"""

    # Inject job counts into script_jobs associative array
    for script, jobs in script_jobs.items():
        slurm_header += f"    [{script}]={jobs}\n"
    slurm_header += ")\n\n"

    # Build scripts array
    script_array = '    ' + '\n    '.join(scripts)
    scripts_block = f"""scripts=(
{script_array}
)
"""

    # Raw f-string: doubled braces are literal bash braces, and the backslash
    # in "\$default_jobs" reaches bash unchanged.
    body = rf"""{scripts_block}




for slurm_script in "${{scripts[@]}}"; do
    echo "Starting submission for: $slurm_script"
    date

    # Initialize num_jobs from script_jobs array FIRST
    num_jobs="${{script_jobs[$slurm_script]}}"

    # Dynamic job count updates (files created during workflow)
    if [[ -s "$expanded_json_file" ]]; then
      expanded_jobs=$(jq length "$expanded_json_file")
      script_jobs["input.sh"]=$expanded_jobs
      # Update num_jobs if this is the input script
      if [[ "$slurm_script" == "input.sh" ]]; then
        num_jobs=$expanded_jobs
      fi
    fi

    if [[ -s "$basin_json_file" ]]; then
      basin_jobs=$(jq length "$basin_json_file")
      script_jobs["moi.sh"]=$basin_jobs
      if [[ "$slurm_script" == "moi.sh" ]]; then
        num_jobs=$basin_jobs
      fi
    fi

    if [[ -s "$metroman_json_file" ]]; then
      metroman_jobs=$(jq length "$metroman_json_file")
      script_jobs["metroman.sh"]=$metroman_jobs
      if [[ "$slurm_script" == "metroman.sh"  ]]; then
        num_jobs=$metroman_jobs
      fi
    fi

    # Fallback: all remaining $default_jobs modules use reaches.json once available,
    # otherwise fall back to reaches_of_interest.json
    if [[ -z "$num_jobs" || "$num_jobs" == "\$default_jobs" ]]; then
        if [[ -s "$reach_json_file" ]]; then
            num_jobs=$(jq length "$reach_json_file")
            echo "Using reach_json_file job count ($num_jobs) for $slurm_script"
        else
            num_jobs=$default_jobs
            echo "Using reaches_of_interest.json job count ($num_jobs) for $slurm_script"
        fi
    fi

    # Safety check
    if [[ -z "$num_jobs" ]]; then
        echo "Warning: No job count found for $slurm_script. Skipping."
        continue
    fi

    start=0
    while [ $start -lt $num_jobs ]; do
        end=$((start + batch_size - 1))
        if [ $end -ge $num_jobs ]; then
            end=$((num_jobs - 1))
        fi

        echo "Submitting jobs $start to $end from $slurm_script"
        job_id=$(sbatch --export=ALL,OFFSET=${{start}} --array=0-$((end - start))%${{concurrent_jobs}} "${{directory}}/${{slurm_script}}")
        # job_id=$(sbatch --array=${{start}}-${{end}}%${{concurrent_jobs}} "${{directory}}/${{slurm_script}}")
        job_id_number=$(echo $job_id | awk '{{print $4}}')

        echo "Waiting for job array $job_id_number to finish..."
        while squeue -j "$job_id_number" 2>/dev/null | grep -q "$job_id_number"; do
            job_info=$(squeue -j "${{job_id_number}}[]" --noheader -o "%i %T %R")
            held_tasks=$(echo "$job_info" | grep -i "requeued held" | awk '{{print $1}}')

            if [[ -n "$held_tasks" ]]; then
                echo "Detected held tasks in array $job_id_number:"
                echo "$held_tasks"
                for task in $held_tasks; do
                    echo "Cancelling task $task..."
                    scancel "$task"
                done
            fi

            sleep 10
        done

        echo "Batch $job_id_number has finished. Submitting next batch."
        date

        start=$((end + 1))
        sleep 5
    done      
    
done

echo "Run $run has finished successfully."
"""
    return slurm_header + body


## 1. Prepare Run

In [16]:
import os
import shutil
import subprocess as sp
from pathlib import Path
import json
import glob
import numpy as np
import pandas as pd

# User settings: edit the placeholder path and names below before running.
BASE_DIR = Path('/path/confluence/') # directory storing all Confluence runs
REPO_DIR = BASE_DIR / 'modules/' # directory storing the cloned repos, i.e. ./modules/
RUN_NAME = 'runTest' # name of this specific run, e.g. 'runTest'

run_dir = BASE_DIR / f'confluence_{RUN_NAME}' # new directory for this run
src_dir = BASE_DIR / 'confluence_empty' # directory that is copied to create run_dir

HPC_username = 'your_hpc_username' # used to build the Apptainer cache path
github_name = 'your_github_name' # GitHub account the module repos are cloned from
# Work from BASE_DIR for the rest of the notebook (raises if it does not exist).
os.chdir(BASE_DIR)



In [3]:
###############################
## INITIAL OR NEW MNT DOWNLOAD:
###############################

## Download the empty /mnt directory template, which holds the input data and will hold the eventual output data
# ! pip install gdown
! gdown 10gJwg0wsl51K_mcoXGq1uQVW34oQwrJc

In [17]:
####################
## SUBSEQUENT RUNS:
####################

# Keep src_dir / run_dir as Path objects: other cells build sub-paths from
# run_dir with the `/` operator, which raises TypeError on plain strings.
src_dir = Path(BASE_DIR) / 'confluence_empty'  # initial unzipped gdown
run_dir = Path(BASE_DIR) / f'confluence_{RUN_NAME}'  # new directory for run

# Copy the contents of empty to new (preserves the initial data). copytree
# raises FileExistsError if run_dir already exists, so an existing run is
# never overwritten.
shutil.copytree(src_dir, run_dir)

p = run_dir / 'empty_mnt'  # rename internal mnt to run name
p.rename(p.with_name(f"{RUN_NAME}_mnt"))



RUN_NAME: svs17
REPO_DIR: /nas/cee-water/cjgleason/ellie/SWOT/confluence/modules/D
SIF_DIR: /nas/cee-water/cjgleason/ellie/SWOT/confluence/confluence_svs17/sif


In [None]:

# Point to necessary directories
# Coerce to Path so the `/` joins below work even when run_dir was assigned
# as a plain string (str / str raises TypeError).
run_dir = Path(run_dir)

SIF_DIR = run_dir / 'sif'  # Store built Apptainer images (.sif)
sh_dir = run_dir / 'sh_scripts'  # Store the sh scripts to run each module
report_dir = run_dir / 'report'  # Job logs
mnt_dir = run_dir / f'{RUN_NAME}_mnt'  # the mnt storing all confluence run data

# Exported so shell commands launched from the notebook can see them.
os.environ['RUN_NAME'] = RUN_NAME
os.environ['BASE_DIR'] = str(BASE_DIR)
os.environ['REPO_DIR'] = str(REPO_DIR)
os.environ['SIF_DIR'] = str(SIF_DIR)
os.environ['APPTAINER_CACHEDIR'] = f'/work/{HPC_username}/.apptainer/cache'  # uses HPC_username set above

# Fail safe for directory build
for d in [SIF_DIR, sh_dir, report_dir, REPO_DIR]:
    os.makedirs(d, exist_ok=True)

print(f'RUN_NAME: {RUN_NAME}')
print(f'REPO_DIR: {REPO_DIR}')
print(f'SIF_DIR: {SIF_DIR}')

In [18]:

# Modules to generate SLURM scripts for, in the order they will be run.
# The expanded_* and non_expanded_* entries are variants that run the
# 'setfinder' and 'combine_data' images with different arguments.
# Comment a line out to leave that module out of the run.
INCLUDED_MODULES = [
    'expanded_setfinder',
    'expanded_combine_data',
    'input',
    'non_expanded_setfinder',
    'non_expanded_combine_data',
    'prediagnostics',
    # 'priors',
    'metroman',
    'metroman_consolidation',
    'unconstrained_momma',
    'hivdi',
    # 'sad',
    'sic4dvar',
    'consensus',
    # 'moi',
    # 'unconstrained_offline',
    # 'validation',
    'output'
]

# Git modules to pull (local module names; each one is cloned and built)
TARGET_MODULES = [
    'setfinder',
    'combine_data',
    'input',
    'prediagnostics',
    # 'priors',
    'metroman',
    'metroman_consolidation',
    'momma',
    'hivdi',
    # 'sad',
    'sic4dvar',
    'consensus',
    # 'moi',
    # 'offline',
    # 'validation',
    'output'
]

# Pull working branches for certain Git repos.
# Keys must be the local module names used in TARGET_MODULES, because that is
# the name the branch is looked up by when cloning (so 'hivdi', not the
# GitHub repository name 'h2ivdi').
branch_map = {
    'setfinder': 'main',
    'combine_data': 'main',
    'input': 'input_D_products',
    'prediagnostics': 'main',
    # 'priors': 'main',
    'metroman': 'main',
    'metroman_consolidation': 'main',
    'momma': 'main',
    'hivdi': 'main',
    # 'sad': 'main',
    'sic4dvar': 'main',
    'consensus': 'main',
    # 'moi': 'main',
    # 'offline': 'main',
    # 'validation': 'main',
    'output': 'add-sword-version'
}


## 2. Clone GitHub repositories

In [19]:
# Local module name -> GitHub repository name, for the repos whose names differ.
name_map = dict(
    offline='offline-discharge-data-product-creation',
    moi='MOI',
    validation='Validation',
    hivdi='h2ivdi',
)

# Clone (or re-clone) every target module on its configured branch.
clone_repos(
    github_name=github_name,
    repo_dir=REPO_DIR,
    repo_names=TARGET_MODULES,
    name_map=name_map,
    branch=branch_map,
)

[Remove] Deleting existing metroman to overwrite...
[Clone] Cloning metroman from branch 16-discharge-timeseries-are-super-noisy...


Cloning into 'metroman'...


## 3. Build SIF (apptainer)

In [20]:
# Builds the Singularity definition and the .sif image for all repos of interest

# 1) Show the Dockerfile instructions each definition file is derived from.
for mod in TARGET_MODULES:
    dockerfile = os.path.join(REPO_DIR, mod, 'Dockerfile')
    if os.path.exists(dockerfile):
        print(f"[{mod}]")
        with open(dockerfile) as f:
            for line in f:
                if line.strip().startswith(('COPY', 'ENTRYPOINT', 'FROM')):
                    print(line.strip())
        print()
    else:
        print(f'{mod} -> (Dockerfile x)')
        print()


# 2) Write a Singularity.def next to every Dockerfile.
for mod in TARGET_MODULES:
    create_singularity_def(mod, REPO_DIR)

# 3) Build the images. The command is passed as an argument list (no shell),
#    so paths containing spaces or shell metacharacters are handled safely,
#    and the exit status is checked so a failed build is not mistaken for
#    a success.
failed_builds = []
for mod in TARGET_MODULES:
    mod_dir = os.path.join(REPO_DIR, mod)
    if not os.path.exists(os.path.join(mod_dir, 'Singularity.def')):
        print(f'{mod}: Singularity.def not found, skipping build')
        failed_builds.append(mod)
        continue

    sif_path = os.path.join(SIF_DIR, f'{mod}.sif')
    print(f'{mod}: Building...')
    build_cmd = ['apptainer', 'build', '--force', '--ignore-fakeroot-command',
                 str(sif_path), 'Singularity.def']
    try:
        # cwd replaces the former "cd <module dir> &&" shell prefix.
        result = sp.run(build_cmd, cwd=mod_dir)
    except OSError as e:
        # e.g. apptainer is not installed or not on PATH
        print(f'{mod}: could not start apptainer: {e}')
        failed_builds.append(mod)
        continue

    if result.returncode != 0:
        print(f'{mod}: build FAILED (exit code {result.returncode})')
        failed_builds.append(mod)

if failed_builds:
    print('Modules without a fresh image: ' + ', '.join(failed_builds))


[metroman]
FROM python:3.12-slim as stage0
FROM stage0 as stage1
FROM stage1 as stage2
COPY requirements.txt /app/requirements.txt
FROM stage2 as stage3
COPY ./metroman /app/metroman/
COPY ./sos_read /app/sos_read/
FROM stage3 as stage4
COPY run_metroman.py /app/run_metroman.py
ENTRYPOINT ["/app/env/bin/python3", "/app/run_metroman.py"]

metroman: Singularity.def created
metroman: Building...


INFO:    User not listed in /etc/subuid, trying root-mapped namespace
INFO:    fakeroot command not found
INFO:    Installing some packages may fail
INFO:    Starting build...
INFO:    Fetching OCI image...
INFO:    Extracting OCI image...
INFO:    Inserting Apptainer configuration...
INFO:    Copying metroman to /app/metroman/
INFO:    Copying sos_read to /app/sos_read/
INFO:    Copying run_metroman.py to /app/run_metroman.py
INFO:    Adding runscript
INFO:    Creating SIF file...
INFO:    Build complete: /nas/cee-water/cjgleason/ellie/SWOT/confluence/confluence_svs17/sif/metroman.sif


## 4. Create sh Scripts

In [12]:
# Write one <module>.sh SLURM script into sh_dir for each included module.
create_slurm_scripts(
    run_name=RUN_NAME,
    mnt_dir=mnt_dir,
    sif_dir=SIF_DIR,
    sh_dir=sh_dir,
    report_dir=report_dir,
    included_modules=INCLUDED_MODULES,
)

DIRECTORY NAME:  svs17 
MODULE:  prediagnostics
DIRECTORY NAME:  svs17 
MODULE:  sic4dvar
DIRECTORY NAME:  svs17 
MODULE:  metroman
DIRECTORY NAME:  svs17 
MODULE:  metroman_consolidation
DIRECTORY NAME:  svs17 
MODULE:  unconstrained_momma
DIRECTORY NAME:  svs17 
MODULE:  consensus
DIRECTORY NAME:  svs17 
MODULE:  output


## 5. Create Driver Script to run multiple modules
---
### Confluence Driver Script Generator (RUN ON HPC, NOT LOCALLY)
* Creates a batch submission script that will run all of your sif files in serial
* use sbatch to submit the entire run
* Request low resources and a long time limit here: this job only launches the SLURM scripts you created for each module, so it is essentially a job manager

In [14]:
# Create the driver SLURM script for this run

# Modules with a fixed (hardcoded) number of array jobs
HARDCODED_JOBS = {
    "expanded_setfinder": "6",
    "expanded_combine_data": "1",
    "non_expanded_setfinder": "6",
    "non_expanded_combine_data": "1",
    "unconstrained_priors": "6",
    "constrained_priors": "6",
    "metroman_consolidation": "6",
    "output": "6",
}

# Modules whose job count is resolved while the driver runs, from the length
# of the JSON files created during the workflow. They are deliberately left
# out of script_jobs so the driver's fallback logic applies to them; this
# list is kept for reference.
DYNAMIC_MODULES = [
    "input",
    "prediagnostics",
    "metroman",
    "hivdi",
    "sic4dvar",
    "unconstrained_momma",
    "constrained_momma",
    "sad",
    "moi",
    "consensus",
    "unconstrained_offline",
    "validation",
]



job_name = str(RUN_NAME)
output_log_dir = f"{run_dir}/log"
partition = "cpu-preempt" #your partition here
time_limit = "30:00:00"
nodes = 1
ntasks = 1
cpus_per_task = 1
mem = "10G"

directory = run_dir
sh_directory = f"{directory}/sh_scripts"
json_file = f"{directory}/{RUN_NAME}_mnt/input/reaches_of_interest.json"
expanded_json_file = f"{directory}/{RUN_NAME}_mnt/input/expanded_reaches_of_interest.json"
reach_json_file = f"{directory}/{RUN_NAME}_mnt/input/reaches.json"
basin_json_file = f"{directory}/{RUN_NAME}_mnt/input/basin.json"
metroman_json_file = f"{directory}/{RUN_NAME}_mnt/input/metrosets.json"


batch_size = 1000 # cluster specific
concurrent_jobs = 400 # cluster specific

# SLURM does not create missing directories for --output/--error, so make
# sure the driver's log directory (and the script directory) exist first.
os.makedirs(output_log_dir, exist_ok=True)
os.makedirs(sh_directory, exist_ok=True)

# Fixed job counts for the included modules that have one. Every other
# included module is resolved by the driver at run time.
script_jobs = {
    f"{module}.sh": HARDCODED_JOBS[module]
    for module in INCLUDED_MODULES
    if module in HARDCODED_JOBS
}

# Dynamically build scripts list (same order as INCLUDED_MODULES)
scripts = [f"{module}.sh" for module in INCLUDED_MODULES]

driver_script = generate_slurm_driver(
    job_name=job_name,
    output_log_dir=output_log_dir,
    partition=partition,
    time_limit=time_limit,
    nodes=nodes,
    ntasks=ntasks,
    cpus_per_task=cpus_per_task,
    mem=mem,
    run=RUN_NAME,
    directory=sh_directory,
    json_file=json_file,
    expanded_json_file=expanded_json_file,
    reach_json_file=reach_json_file,
    basin_json_file=basin_json_file,
    metroman_json_file=metroman_json_file,
    batch_size=batch_size,
    concurrent_jobs=concurrent_jobs,
    script_jobs=script_jobs,
    scripts=scripts,
)

# Save to file
with open(f"{sh_directory}/slurm_driver.sh", "w") as f:
    f.write(driver_script)


# Optionally submit within notebook
# import subprocess
# subprocess.run(["sbatch", f"{sh_directory}/slurm_driver.sh"], check=True)
