In [1]:
import wolf
import os
import re

# wolF tasks

In [2]:
class DIG_convert_maf(wolf.Task):
    """Convert an input MAF into the ``*.hg19.dig.maf`` file used downstream.

    Runs ``/build/convert_maf.py`` in the DIG container, passing it the MAF,
    the build the MAF is in, and a liftover chain file.

    Inputs:
        input_maf: MAF to convert. The output name is its basename with the
            last four characters dropped (e.g. a ``.maf`` extension) followed
            by ``.hg19.dig.maf``.
        ref_build: reference build of ``input_maf`` (``--input_build``).
        liftover_chainfile: chain file passed as ``--liftover_chainfile``.

    Outputs:
        dig_maf: the file matching ``*.hg19.dig.maf`` in the job directory.
    """

    name = "DIG_convert_maf"
    docker = "gcr.io/broad-getzlab-workflows/dig_docker:latest"
    resources = {
        "cpus-per-task": 2,
        "mem": "20G",
    }

    inputs = {
        "input_maf": None,
        "ref_build": None,  # ref build of input maf
        "liftover_chainfile": None,
    }

    # ${input_maf:0:-4} is a bash substring expansion that strips the last
    # four characters of the path before basename is taken.
    script = """
    python3 /build/convert_maf.py --input_maf ${input_maf} --input_build ${ref_build} --output_path $(basename ${input_maf:0:-4}).hg19.dig.maf --liftover_chainfile ${liftover_chainfile}
    """

    output_patterns = {"dig_maf": "*.hg19.dig.maf"}
    
    
class DIG_annotate_maf(wolf.Task):
    """Annotate a DIG-format mutation file with ``DigPreprocess.py``.

    Calls ``DigPreprocess.py annotMutationFile`` on the MAF and reference
    FASTA and writes ``<basename minus last four characters>.annot.txt``.

    Inputs:
        input_maf: mutation file to annotate.
        ref_fasta: reference FASTA passed to the annotator.
        ref_fasta_idx: not referenced by the script; presumably declared so
            the index is staged alongside the FASTA -- TODO confirm.

    Outputs:
        dig_maf: the file matching ``*.annot.txt``.
    """

    name = "DIG_annotate_maf"
    docker = "gcr.io/broad-getzlab-workflows/dig_docker:latest"
    resources = {
        "cpus-per-task": 2,
        "mem": "20G",
    }

    inputs = {
        "input_maf": None,
        "ref_fasta": None,
        "ref_fasta_idx": None,
    }

    script = """
    DigPreprocess.py annotMutationFile ${input_maf} ${ref_fasta} $(basename ${input_maf:0:-4}).annot.txt
    """

    output_patterns = {"dig_maf": "*.annot.txt"}


class DIG_test_coding(wolf.Task):
    """Run ``DigDriver.py geneDriver`` on an annotated mutation file.

    Results are written to the job directory with the output prefix
    ``<output_prefix>.coding``.

    Inputs:
        input_annot_maf: annotated mutation file.
        input_mut_map: mutation map passed to ``geneDriver``.
        output_prefix: prefix for the result files (``.coding`` is appended).

    Outputs:
        dig_results: the file matching ``*.coding.results.txt``.
    """

    name = "DIG_test_coding"
    docker = "gcr.io/broad-getzlab-workflows/dig_docker:latest"
    resources = {
        "cpus-per-task": 2,
        "mem": "20G",
    }

    inputs = {
        "input_annot_maf": None,
        "input_mut_map": None,
        "output_prefix": None,
    }

    script = """
    DigDriver.py geneDriver ${input_annot_maf} ${input_mut_map} --outdir . --outpfx ${output_prefix}.coding
    """

    output_patterns = {"dig_results": "*.coding.results.txt"}


class DIG_report_coding(wolf.Task):
    """Build the HTML report for the coding-region results.

    Runs ``/build/generate_dig_report_coding.py`` with the results file, the
    current directory as the second positional argument, the two gene lists,
    and the cohort name as ``--prefix_output``.

    Inputs:
        input_results: results file to report on.
        cgc_list: gene list passed as the third positional argument.
        pancan_list: gene list passed as the fourth positional argument.
        cohort: value for ``--prefix_output``.

    Outputs:
        dig_report: the file(s) matching ``*.html``.
    """

    name = "DIG_report_coding"
    docker = "gcr.io/broad-getzlab-workflows/dig_docker:latest"
    resources = {
        "cpus-per-task": 2,
        "mem": "20G",
    }

    inputs = {
        "input_results": None,
        "cgc_list": None,
        "pancan_list": None,
        "cohort": None,
    }

    script = """
    python3 /build/generate_dig_report_coding.py ${input_results} ./ ${cgc_list} ${pancan_list} --prefix_output ${cohort}
    """

    output_patterns = {"dig_report": "*.html"}


class DIG_preprocess_element_model(wolf.Task):
    """Run ``DigPreprocess.py preprocess_element_model`` for one interval set.

    The element-data and mutation-map files are first copied to fixed local
    names (``element_data.h5`` / ``mutation_map.h5``); the tool is run on those
    copies and the same two files are then collected as outputs (presumably
    the tool updates them in place -- TODO confirm).

    Inputs:
        input_bed: BED file passed as ``--f-bed``.
        input_element_data: element-data HDF5 file, copied to ``./element_data.h5``.
        input_mut_map: mutation-map HDF5 file, copied to ``./mutation_map.h5``.
        ref_fasta: reference FASTA passed to the tool.
        annot_name: annotation name passed to the tool.

    Outputs:
        output_element_data: ``element_data.h5``.
        output_mut_map: ``mutation_map.h5``.
    """

    name = 'DIG_preprocess_element_model'

    # Declared as `inputs` (not `input`) to match the attribute name used by
    # every other task in this notebook.
    inputs = {
        "input_bed" : None,
        "input_element_data" : None,
        "input_mut_map" : None,
        "ref_fasta" : None,
        "annot_name" : None
    }

    script="""
    cp ${input_mut_map} ./mutation_map.h5
    cp ${input_element_data} ./element_data.h5
    DigPreprocess.py preprocess_element_model ./element_data.h5 ./mutation_map.h5 ${ref_fasta} ${annot_name} --f-bed ${input_bed}
    """

    output_patterns = {
        "output_element_data" : "element_data.h5",
        "output_mut_map": "mutation_map.h5"
    }

    docker = "gcr.io/broad-getzlab-workflows/dig_docker:latest"
    resources = { "cpus-per-task": 4, "mem" : "20G" }


class DIG_element_model(wolf.Task):
    """Run ``DigPretrain.py elementModel`` for one annotation.

    The mutation-map and element-data files are copied to fixed local names,
    the tool is run on those copies, and the same two files are collected as
    outputs (presumably the tool updates them in place -- TODO confirm).

    Inputs:
        input_element_data: element-data HDF5 file, copied to ``./element_data.h5``.
        input_mut_map: mutation-map HDF5 file, copied to ``./mutation_map.h5``.
        annot_name: annotation name passed to the tool.

    Outputs:
        output_element_data: ``element_data.h5``.
        output_mut_map: ``mutation_map.h5``.
    """

    name = 'DIG_element_model'

    # Declared as `inputs` (not `input`) to match the attribute name used by
    # every other task in this notebook.
    inputs = {
        "input_element_data" : None,
        "input_mut_map" : None,
        "annot_name" : None
    }

    script="""
    cp ${input_mut_map} ./mutation_map.h5
    cp ${input_element_data} ./element_data.h5
    DigPretrain.py elementModel ./mutation_map.h5 ./element_data.h5 ${annot_name}
    """

    output_patterns = {
        "output_element_data" : "element_data.h5",
        "output_mut_map": "mutation_map.h5"
    }

    docker = "gcr.io/broad-getzlab-workflows/dig_docker:latest"
    resources = { "cpus-per-task": 4, "mem" : "20G" }


class DIG_test_noncoding(wolf.Task):
    """Run ``DigDriver.py elementDriver`` for one annotation / interval set.

    Results are written to the job directory using the annotation name as the
    output prefix.

    Inputs:
        input_annot_maf: annotated mutation file.
        input_mut_map: mutation map passed to ``elementDriver``.
        input_bed: BED file passed as ``--f-bed``.
        annot_name: annotation name; also used as ``--outpfx``.

    Outputs:
        dig_results: the file(s) matching ``*.results.txt``.
    """

    name = "DIG_test_noncoding"
    docker = "gcr.io/broad-getzlab-workflows/dig_docker:latest"
    resources = {
        "cpus-per-task": 2,
        "mem": "20G",
    }

    inputs = {
        "input_annot_maf": None,
        "input_mut_map": None,
        "input_bed": None,
        "annot_name": None,
    }

    script = """
    DigDriver.py elementDriver ${input_annot_maf} ${input_mut_map} ${annot_name} --f-bed ${input_bed} --outdir . --outpfx ${annot_name}
    """

    output_patterns = {"dig_results": "*.results.txt"}


class DIG_report_noncoding(wolf.Task):
    """Build the HTML report for one noncoding annotation's results.

    Runs ``/build/generate_dig_report_noncoding.py`` with the results file,
    the current directory as the second positional argument, the two gene
    lists, the annotation name, and the cohort name as ``--prefix_output``.

    Inputs:
        input_results: results file to report on.
        cgc_list: gene list passed as the third positional argument.
        pancan_list: gene list passed as the fourth positional argument.
        annot_name: annotation name passed as the fifth positional argument.
        cohort: value for ``--prefix_output``.

    Outputs:
        dig_report: the file(s) matching ``*.html``.
    """

    name = "DIG_report_noncoding"
    docker = "gcr.io/broad-getzlab-workflows/dig_docker:latest"
    resources = {
        "cpus-per-task": 2,
        "mem": "20G",
    }

    inputs = {
        "input_results": None,
        "cgc_list": None,
        "pancan_list": None,
        "annot_name": None,
        "cohort": None,
    }

    script = """
    python3 /build/generate_dig_report_noncoding.py ${input_results} ./ ${cgc_list} ${pancan_list} ${annot_name} --prefix_output ${cohort}
    """

    output_patterns = {"dig_report": "*.html"}

# wolF workflow

In [3]:
def generate_report_workflow(
    maf_file=None,
    interval_set_bed=None,
    interval_set_name=None,
    element_data="gs://getzlab-workflows-reference_files-oa/hg38/dig/element_data.h5",
    mutation_map=None,
    ref_build=None,
    ref_fasta="gs://getzlab-workflows-reference_files-oa/hg38/dig/hg19.fasta",
    ref_fasta_idx="gs://getzlab-workflows-reference_files-oa/hg38/dig/hg19.fasta.fai",
    liftover_chain_file="gs://getzlab-workflows-reference_files-oa/hg38/dig/hg38ToHg19.over.chain.gz",
    cgc_list="gs://getzlab-workflows-reference_files-oa/hg38/dig/cancer_gene_census_2024_06_20.tsv",
    pancan_list="gs://getzlab-workflows-reference_files-oa/hg38/dig/pancanatlas_genes.tsv",
):
    """Wire up the DIG coding and noncoding burden-test tasks for one MAF.

    The cohort name used for output prefixes is derived from the MAF file
    name: the extension is removed, underscores become hyphens, and the result
    is lower-cased.

    Args:
        maf_file: path to the input MAF; its basename must end in ``.txt``,
            ``.bed``, ``.tsv`` or ``.maf``.
        interval_set_bed: BED file(s) defining the noncoding elements.
        interval_set_name: annotation name(s) for ``interval_set_bed``. The
            notebook's run cell passes both as equal-length lists.
        element_data: element-data HDF5 file for the element model.
        mutation_map: mutation-map HDF5 file, used as-is for the coding test
            and as the starting point for the element model.
        ref_build: reference build of ``maf_file``.
        ref_fasta: reference FASTA, localized to disk before use.
        ref_fasta_idx: index for ``ref_fasta``, localized with it.
        liftover_chain_file: chain file for the MAF conversion step.
        cgc_list: gene list handed to both report tasks.
        pancan_list: gene list handed to both report tasks.

    Raises:
        ValueError: if ``maf_file`` is not a path whose basename has one of
            the accepted extensions (this includes ``maf_file=None``).
    """
    bad_name_msg = "maf file expected to be in DIG format with ext [.txt|.bed|.tsv|.maf]"

    # Only the failures that mean "not a usable path string" become the
    # documented ValueError; everything else (e.g. KeyboardInterrupt)
    # propagates untouched.
    try:
        name_match = re.search(r"(.*?)\.(?:txt|bed|tsv|maf)$", os.path.basename(maf_file))
    except TypeError as err:
        raise ValueError(bad_name_msg) from err
    if name_match is None:
        raise ValueError(bad_name_msg)
    cohort_name = name_match.group(1).replace("_", "-").lower()

    fasta_localization = wolf.LocalizeToDisk(
        files = {
            "ref_fasta": ref_fasta,
            "ref_fasta_idx": ref_fasta_idx
        }
    )

    # Liftover to hg19 and conversion to DIG-compatible format

    maf_hg19 = DIG_convert_maf(
        inputs = {
            "input_maf" : maf_file,
            "ref_build" : ref_build,
            "liftover_chainfile" : liftover_chain_file
        }
    )

    # DIG-compatible annotation of mutations

    annot_maf = DIG_annotate_maf(
        inputs = {
            "input_maf": maf_hg19["dig_maf"],
            "ref_fasta": fasta_localization["ref_fasta"],
            "ref_fasta_idx": fasta_localization["ref_fasta_idx"]
        }
    )

    # Building background model from interval sets and mutation map

    preproc_element = DIG_preprocess_element_model(
        inputs = {
            "input_bed" : interval_set_bed,
            "annot_name" : interval_set_name,
            "input_element_data" : element_data,
            "input_mut_map" : mutation_map,
            "ref_fasta" : fasta_localization["ref_fasta"]
        }
    )

    element_model = DIG_element_model(
        inputs = {
            "input_element_data" : preproc_element["output_element_data"],
            "input_mut_map" : preproc_element["output_mut_map"],
            "annot_name" : interval_set_name
        }
    )

    # Running statistical test and report generation for the coding region

    results_coding = DIG_test_coding(
        inputs = {
            "input_annot_maf" : annot_maf["dig_maf"],
            "input_mut_map" : mutation_map,
            "output_prefix": cohort_name
        }
    )

    report_coding = DIG_report_coding(
        inputs = {
            "input_results": results_coding["dig_results"],
            "cgc_list": cgc_list,
            "pancan_list": pancan_list,
            "cohort": cohort_name
        }
    )

    # Running statistical test and report generation for the noncoding region

    results_noncoding = DIG_test_noncoding(
        inputs = {
            "input_annot_maf" : annot_maf["dig_maf"],
            "input_mut_map" : element_model["output_mut_map"],
            "input_bed" : interval_set_bed,
            "annot_name" : interval_set_name
        }
    )

    # Bound to its own name so it no longer shadows the coding report task.
    report_noncoding = DIG_report_noncoding(
        inputs = {
            "input_results" : results_noncoding["dig_results"],
            "cgc_list": cgc_list,
            "pancan_list": pancan_list,
            "annot_name" : interval_set_name,
            "cohort" : cohort_name
        }
    )

# Running the workflow

In [4]:
# Each noncoding interval set is listed once as (BED file, annotation name) so
# the two parallel lists handed to the workflow cannot drift out of order.
interval_sets = [
    ("gs://getzlab-workflows-reference_files-oa/hg38/dig/gc19_pc.prom.bed", "promoters"),
    ("gs://getzlab-workflows-reference_files-oa/hg38/dig/gc19_pc.3utr.bed", "3-prime_UTRs"),
    ("gs://getzlab-workflows-reference_files-oa/hg38/dig/gc19_pc.5utr.bed", "5-prime_UTRs"),
]

with wolf.Workflow(workflow=generate_report_workflow) as w:
    w.run(
        maf_file="TCGA_WGS_UVM.validated.maf",
        interval_set_bed=[bed_path for bed_path, _ in interval_sets],
        interval_set_name=[annot for _, annot in interval_sets],
        # NOTE(review): this is the Kidney-RCC pretrained map while the MAF
        # file name says UVM -- confirm the pairing is intended.
        mutation_map="https://cb.csail.mit.edu/DIG/downloads/mutation_maps/Kidney-RCC_SNV_MNV_INDEL.Pretrained.h5",
        ref_build="hg38",
        RUN_NAME="DIG_burden_test",
    )

[20240924-14:28:02] [prefect] Starting Slurm controller ...
[20240924-14:28:02] [prefect] Waiting up to 60 seconds for Slurm controller to start ...
[20240924-14:28:02] [prefect] Started Slurm controller.
[20240924-14:28:02] [prefect] Workflow results disk low on space (13 GB remaining)
[20240924-14:28:03] [prefect] Enqueued workflow DIG_burden_test
[20240924-14:28:04] [DIG_burden_test:BatchLocalDisk] Job avoidance disabled for this task; overwriting output.
[20240924-14:28:04] [DIG_burden_test:BatchLocalDisk] Localizing inputs...
[20240924-14:28:05] [DIG_burden_test:BatchLocalDisk] Disk name is canine-5714e9942090819a76b081ec3fd090aa
[20240924-14:28:05] [DIG_burden_test:DIG_convert_maf] Hashing file TCGA_WGS_UVM.validated.maf; 100/230 MiB completed
[20240924-14:28:05] [DIG_burden_test:BatchLocalDisk] Found existing disk canine-5714e9942090819a76b081ec3fd090aa
[20240924-14:28:05] [DIG_burden_test:BatchLocalDisk] Task staged in /mnt/nfs/workspace/DIG_burden_test/BatchLocalDisk__2024-09-

Copying gs://getzlab-workflows-reference_files-oa/hg38/dig/cancer_gene_census_2024_06_20.tsv...
/ [1 files][  4.2 KiB/  4.2 KiB]                                                
Operation completed over 1 objects/4.2 KiB.                                      
Copying gs://getzlab-workflows-reference_files-oa/hg38/dig/pancanatlas_genes.tsv...
/ [1 files][  1.7 KiB/  1.7 KiB]                                                
Operation completed over 1 objects/1.7 KiB.                                      


[20240924-14:30:11] [DIG_burden_test:DIG_report_noncoding] Task staged in /mnt/nfs/workspace/DIG_burden_test/DIG_report_noncoding__2024-09-24--14-30-08_drmmwkq_s2nsq0q_v22fd5jdbe21k
[20240924-14:30:12] [DIG_burden_test:DIG_report_noncoding] 3 jobs submitted.
[20240924-14:33:39] [DIG_burden_test:DIG_report_coding] Finished with status COMPLETED
[20240924-14:34:13] [DIG_burden_test:DIG_report_noncoding] Finished with statuses COMPLETED: 3
[20240924-14:34:13] [prefect] Collated results from workflow DIG_burden_test
