In [1]:
import os
import yaml
import json
import pandas as pd
from pathlib import Path

In [2]:
# Directory that receives every generated CWL/YAML file in this notebook.
cwl_dir = '/diskmnt/Projects/Users/estorrs/pecgs-cnv/cwl'
# Create it (and any missing parents); an already-existing directory is fine.
os.makedirs(cwl_dir, exist_ok=True)

In [3]:
# GENOME = '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa'
# GENOME_DICT = '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.dict'
# # REF_DIR = 
# # TARGET_INTERVAL_LIST = '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources/cnv/db/IDT_xGen_Exome_Research_Panel_v1.hg38.removed_alt_chr.bed.target.preprocessed.exome.interval_list'
# TARGET_INTERVAL_LIST = '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources/cnv/db/IDT_xGen_Exome_Research_Panel_v1.hg38.removed_alt_chr.autosome_only.bed.preprocessed.exome.interval_list'
# COMMON_BIALLELIC = '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources/cnv/db/snp_database/af-only-gnomad.hg38.common_biallelic.chr1-22XY.vcf'
# PROTEIN_CODING_GENE = '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources/cnv/db/gencode.v34.annotation.gene_filtered.protein_coding.ensembl_ID_no_version.protein-coding_hgnc_filtered.duplicates_removed.ensembl_ID_removed.txt'
# POOL_OF_NORMALS = '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources/cnv/pon/create_pon/gatk4scnaPON.Normal.hdf5'
# run_cnv(normal_bam, tumor_bam, sample, out_dir, gene_level_script, merge_gene_script)

In [4]:
# Example job-input document for the cnv tool/workflow. Every path is a
# placeholder to be replaced with a real location before running.
# Each entry is built as its own dict object (never a shared instance) so the
# YAML dump does not emit anchors/aliases.
template = {
    bam_key: {'class': 'File', 'path': 'a/file/path'}
    for bam_key in ('normal_bam', 'tumor_bam')
}
template['sample'] = 'a_string'
# The reference is passed as a whole directory rather than separate
# 'genome' / 'genome_dict' File entries.
template['reference_dir'] = {'class': 'Directory', 'path': 'a/file/path'}
template.update({
    resource_key: {'class': 'File', 'path': 'a/file/path'}
    for resource_key in (
        'target_interval_list',
        'common_biallelic',
        'protein_coding_gene',
        'cytoband',
        'pool_of_normals',
    )
})

In [5]:
# Write the job-input template into cwl_dir. The context manager closes (and
# therefore flushes) the file deterministically instead of leaving the handle
# for the garbage collector.
with open(os.path.join(cwl_dir, 'template.cnv.yaml'), 'w') as template_file:
    yaml.safe_dump(template, template_file)

In [6]:
# CWL v1.0 CommandLineTool description for /pecgs-cnv/cnv/cnv.py.
# CWL orders command-line bindings by 'position': all prefixed options use
# position 0, then the positionals sample (1), tumor_bam (2), normal_bam (3).
# Positions are integers, as the CWL v1.0 CommandLineBinding schema declares,
# and match the integer positions used in 'arguments'.
cwl = {
    'class': 'CommandLineTool',
    'cwlVersion': 'v1.0',
    '$namespaces': {'sbg': 'https://www.sevenbridges.com/'},
    'id': 'cnv',
    'label': 'cnv',
    'baseCommand': ['python', '/pecgs-cnv/cnv/cnv.py'],
    'inputs': [
        {
            'id': 'sample',
            'type': 'string',
            'inputBinding': {
                'position': 1
            }
        },
        {
            'id': 'tumor_bam',
            'type': 'File',
            'inputBinding': {
                'position': 2
            }
        },
        {
            'id': 'normal_bam',
            'type': 'File',
            'inputBinding': {
                'position': 3
            }
        },
        # No inputBinding: the directory is not placed on the command line
        # itself; the --genome / --genome-dict arguments below build paths
        # inside it.
        {
            'id': 'reference_dir',
            'type': 'Directory',
        },
        {
            'id': 'target_interval_list',
            'type': 'File',
            'inputBinding': {
                'prefix': '--target-interval-list',
                'position': 0
            }
        },
        {
            'id': 'common_biallelic',
            'type': 'File',
            'inputBinding': {
                'prefix': '--common-biallelic',
                'position': 0
            }
        },
        {
            'id': 'protein_coding_gene',
            'type': 'File',
            'inputBinding': {
                'prefix': '--protein-coding-gene',
                'position': 0
            }
        },
        {
            'id': 'cytoband',
            'type': 'File',
            'inputBinding': {
                'prefix': '--cytoband',
                'position': 0
            }
        },
        {
            'id': 'pool_of_normals',
            'type': 'File',
            'inputBinding': {
                'prefix': '--pool-of-normals',
                'position': 0
            }
        },
        # needs path to be set so it works on compute1
        # NOTE(review): CWL does not pass envDef values through a shell, so the
        # trailing "$PATH" is presumably exported literally -- confirm this is
        # intended.
        {
            'id': 'environ_PATH',
            'type': 'string?',
            'default': '/miniconda/envs/cnv/bin:$PATH'
        },
    ],
    'arguments': [
        # Use the Directory's `.path` field: interpolating the Directory object
        # itself inside a longer string would insert its JSON representation
        # rather than a filesystem path.
        {
            'position': 0,
            'prefix': '--genome',
            'valueFrom': '$(inputs.reference_dir.path)/GRCh38.d1.vd1.fa'
        },
        {
            'position': 0,
            'prefix': '--genome-dict',
            'valueFrom': '$(inputs.reference_dir.path)/GRCh38.d1.vd1.dict'
        },
        # Helper scripts referenced by absolute path (same /pecgs-cnv tree as
        # the baseCommand script).
        {
            'position': 0,
            'prefix': '--gene-level-script',
            'valueFrom': '/pecgs-cnv/cnv/segment_to_geneLevel_v4.py'
        },
        {
            'position': 0,
            'prefix': '--arm-level-script',
            'valueFrom': '/pecgs-cnv/cnv/segment_to_chr_arm_level_v4.py'
        },
        # Relative output directory; the output globs below look inside it.
        {
            'position': 0,
            'prefix': '--out-dir',
            'valueFrom': 'outputs'
        },
    ],
    'outputs': [
        {
            'id': 'gene_level_cnv',
            'type': 'File',
            'outputBinding': {'glob': 'outputs/*T.geneLevel.from_seg.cn'}
        },
        {
            'id': 'arm_level_cnv',
            'type': 'File',
            'outputBinding': {'glob': 'outputs/*T.bandLevel.from_seg.cn'}
        }
    ],
    'requirements': [
        {
            'class': 'DockerRequirement',
            'dockerPull': 'estorrs/pecgs_cnv:0.0.2'
        },
        {
            'class': 'ResourceRequirement',
            'ramMin': 60000
        },
        {
            'class': 'EnvVarRequirement',
            'envDef': {
                'PATH': '$(inputs.environ_PATH)',
            }
        }
    ]
}

In [7]:
# Serialize the CommandLineTool description to cwl_dir/cnv.cwl. The context
# manager closes (and therefore flushes) the file deterministically instead of
# leaving the handle for the garbage collector.
with open(os.path.join(cwl_dir, 'cnv.cwl'), 'w') as tool_file:
    yaml.safe_dump(cwl, tool_file)

In [8]:
# t = yaml.safe_load(open('/home/estorrs/pecgs-pipeline/cwl/msisensor/msisensor_workflow.cwl'))
# t

In [9]:
# CWL v1.0 Workflow description: both BAMs go through a './stage_bam.cwl' step
# and the staged outputs feed the './cnv.cwl' step; every other workflow input
# is forwarded to the cnv step under the same id.
# The reference is supplied as 'reference_dir' (a Directory); there are no
# separate 'genome' / 'genome_dict' File inputs.
t = {
    'class': 'Workflow',
    'cwlVersion': 'v1.0',
    'id': 'cnv_workflow',
    'label': 'cnv_workflow',
    'requirements': [{'class': 'ScatterFeatureRequirement'}],
    'inputs': [
        {'id': input_id, 'type': input_type}
        for input_id, input_type in (
            ('sample', 'string'),
            ('tumor_bam', 'File'),
            ('normal_bam', 'File'),
            ('reference_dir', 'Directory'),
            ('target_interval_list', 'File'),
            ('common_biallelic', 'File'),
            ('protein_coding_gene', 'File'),
            ('cytoband', 'File'),
            ('pool_of_normals', 'File'),
        )
    ],
    'outputs': [
        {'id': output_id, 'outputSource': output_source, 'type': 'File'}
        for output_id, output_source in (
            ('gene_level_cnv', 'cnv/gene_level_cnv'),
            ('arm_level_cnv', 'cnv/arm_level_cnv'),
        )
    ],
    'steps': [
        {
            'id': 'cnv',
            # Pass-through inputs first, then the two staged BAMs.
            'in': [
                {'id': port, 'source': port}
                for port in (
                    'reference_dir',
                    'target_interval_list',
                    'common_biallelic',
                    'protein_coding_gene',
                    'cytoband',
                    'pool_of_normals',
                    'sample',
                )
            ] + [
                {'id': 'normal_bam', 'source': 'stage_normal_bam/output'},
                {'id': 'tumor_bam', 'source': 'stage_tumor_bam/output'},
            ],
            'label': 'cnv',
            'out': [{'id': 'gene_level_cnv'}, {'id': 'arm_level_cnv'}],
            'run': './cnv.cwl',
        },
    ] + [
        # One staging step per BAM; a fresh dict/list is built per iteration so
        # the YAML dump contains no anchors/aliases.
        {
            'id': step_id,
            'in': [{'id': 'BAM', 'source': bam_input}],
            'label': step_id,
            'out': [{'id': 'output'}],
            'run': './stage_bam.cwl',
        }
        for step_id, bam_input in (
            ('stage_normal_bam', 'normal_bam'),
            ('stage_tumor_bam', 'tumor_bam'),
        )
    ],
}

In [10]:
# Serialize the Workflow description to cwl_dir/cnv_workflow.cwl. The context
# manager closes (and therefore flushes) the file deterministically instead of
# leaving the handle for the garbage collector.
with open(os.path.join(cwl_dir, 'cnv_workflow.cwl'), 'w') as workflow_file:
    yaml.safe_dump(t, workflow_file)