# Running PheWAS on All of Us Workbench with dsub
This notebook submits a `dsub` job on the All of Us Researcher Workbench to run the `PheTK.PheWAS()` step in a scalable, containerized GCP environment. NOTE: This approach is useful when you need to run multiple PheWAS analyses; it is not intended for a single PheWAS run.

## Import Required Libraries

In [None]:
import os


## Define dsub Job Submission Function

In [None]:
def dsub_script(
    machine_type,
    envs,
    in_params,
    out_params,
    boot_disk=100,
    disk_size=150,
    image="us.gcr.io/broad-dsp-gcr-public/terra-jupyter-aou:2.2.14",
    script="gs://fc-secure-453d144e-d2ed-4211-b422-1d3427c622c5/scripts/run_phewas.sh",
    preemptible=True,
    job_name=None,
):
    """
    Build and submit a dsub job on the All of Us Researcher Workbench.

    The dsub command line is assembled as a single shell string, printed,
    and executed with ``os.system``. Shell references embedded in the command
    (``${GOOGLE_PROJECT}``, ``${WORKSPACE_BUCKET}``, ``$(gcloud ...)``,
    ``$(date ...)``) are left for the shell to expand at submission time.

    Args:
        machine_type: Machine type passed to ``--machine-type``. If it
            contains ``"c4"`` the disk type is ``hyperdisk-balanced``,
            otherwise ``pd-ssd``.
        envs: Mapping of name -> value, each passed as ``--env NAME="value"``.
        in_params: Mapping of name -> path, each passed as
            ``--input NAME="path"``.
        out_params: Mapping of name -> path, each passed as
            ``--output NAME="path"``.
        boot_disk: Value for ``--boot-disk-size``.
        disk_size: Value for ``--disk-size``.
        image: Container image passed to ``--image``.
        script: Path of the script passed to ``--script``.
        preemptible: When true, ``--preemptible`` is added to the command.
        job_name: Value for ``--name``. Defaults to ``machine_type`` when
            omitted.

    Raises:
        RuntimeError: If the ``OWNER_EMAIL`` environment variable is missing
            or empty (it supplies the ``--user`` value).
    """
    owner_email = os.getenv("OWNER_EMAIL")
    if not owner_email:
        raise RuntimeError(
            "OWNER_EMAIL environment variable is not set; "
            "it is required to derive the dsub --user value."
        )
    dsub_user_name = owner_email.split("@")[0]

    if job_name is None:
        job_name = machine_type

    disk_type = "hyperdisk-balanced" if "c4" in machine_type else "pd-ssd"

    # {job-name}, {user-id}, ... are literal dsub placeholders, so this text
    # must never go through str.format() or an f-string. The inner single
    # quotes belong to the shell `date` call, which is why the Python literal
    # is double-quoted.
    logging_path = (
        "${WORKSPACE_BUCKET}/dsub/logs/{job-name}/{user-id}/"
        "$(date +'%Y%m%d')/{job-id}-{task-id}-{task-attempt}.log"
    )

    cmd_parts = [
        "dsub",
        "--provider google-cls-v2",
        f'--machine-type "{machine_type}"',
    ]
    if preemptible:
        cmd_parts.append("--preemptible")
    cmd_parts += [
        f'--disk-type "{disk_type}"',
        f"--boot-disk-size {boot_disk}",
        f"--disk-size {disk_size}",
        '--user-project "${GOOGLE_PROJECT}"',
        '--project "${GOOGLE_PROJECT}"',
        f'--image "{image}"',
        '--network "network"',
        '--subnetwork "subnetwork"',
        '--service-account "$(gcloud config get-value account)"',
        f'--user "{dsub_user_name}"',
        '--logging "' + logging_path + '"',
        f'--name "{job_name}"',
        '--env GOOGLE_PROJECT="${GOOGLE_PROJECT}"',
        f'--script "{script}"',
    ]
    cmd_parts += [f'--env {key}="{value}"' for key, value in envs.items()]
    cmd_parts += [f'--input {key}="{value}"' for key, value in in_params.items()]
    cmd_parts += [f'--output {key}="{value}"' for key, value in out_params.items()]

    dsub_cmd = " ".join(cmd_parts)

    print("Submitting dsub job...")
    print(dsub_cmd)

    # Run through the shell so the ${...} and $(...) references are expanded.
    exit_status = os.system(dsub_cmd)
    if exit_status != 0:
        # Surface the failure instead of letting the caller assume success.
        print(
            f"dsub exited with a non-zero status ({exit_status}); "
            "the job may not have been submitted."
        )


In [None]:
%%writefile run_phewas.sh
#!/bin/bash
# run_phewas.sh - executes PheTK.PheWAS from pre-extracted cohort and phecode count data
#
# Every setting is read from an environment variable. When the script is run
# by dsub, these are supplied through its --env / --input / --output flags:
#   COHORT          path to the cohort CSV             (--input)
#   PHECODE_COUNTS  path to the phecode count CSV      (--input)
#   PHEWAS_RESULTS  path of the results file to write  (--output)
#   SEX_COL         name of the sex-at-birth column    (--env)
#   COVARIATES      comma-separated covariate columns  (--env)

set -euo pipefail  # Exit on any command failure, unset variable, or failed pipeline stage

# Fail fast with a clear message instead of handing PheWAS an empty string
: "${COHORT:?COHORT is not set}"
: "${PHECODE_COUNTS:?PHECODE_COUNTS is not set}"
: "${PHEWAS_RESULTS:?PHEWAS_RESULTS is not set}"
: "${SEX_COL:?SEX_COL is not set}"
: "${COVARIATES:?COVARIATES is not set}"

# The heredoc delimiter is unquoted on purpose: bash substitutes every ${...}
# reference below with the environment value before Python sees the code.
python3 - <<EOF
from PheTK.PheWAS import PheWAS

phewas = PheWAS(
    phecode_version="X",
    phecode_count_csv_path="${PHECODE_COUNTS}",
    cohort_csv_path="${COHORT}",
    sex_at_birth_col="${SEX_COL}",
    male_as_one=True,
    covariate_cols="${COVARIATES}".split(","),
    independent_variable_of_interest="case",
    min_cases=10,
    min_phecode_count=2,
    output_file_name="${PHEWAS_RESULTS}"
)
phewas.run()
EOF


## Example: Submit dsub Job for PheWAS

In [None]:
# Define example parameters
MACHINE_TYPE = "c4-standard-8"
SCRIPT_PATH = "gs://your-bucket/scripts/run_phewas.sh"  # Update with your script path

# Plain settings, passed to the job as environment variables (--env).
# SNP_ID is only a label here: run_phewas.sh does not read it.
env_vars = {
    "SNP_ID": "rs78378222",
    "SEX_COL": "sex_at_birth",
    "COVARIATES": "age_at_last_event,sex_at_birth,pc0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9",
}

# Input files (--input). run_phewas.sh reads them as ${COHORT} and
# ${PHECODE_COUNTS}; without these entries both expand to empty strings.
input_params = {
    "COHORT": "gs://your-bucket/cohort_name.csv",  # Update with your cohort file
    "PHECODE_COUNTS": "gs://your-bucket/phecode_count.csv",  # Update with your phecode counts
}

# Output file (--output), read by run_phewas.sh as ${PHEWAS_RESULTS}
output_params = {
    "PHEWAS_RESULTS": "gs://your-bucket/phewas_results.csv",
}

# Submit the job
dsub_script(
    machine_type=MACHINE_TYPE,
    envs=env_vars,
    in_params=input_params,
    out_params=output_params,
    image="phetk/phetk:0.1.47",
    script=SCRIPT_PATH,
    preemptible=True,
)

print(f"Submitted job for SNP: {env_vars['SNP_ID']}")
