In [1]:
import os
import re
import pandas as pd
import yaml
import uuid

###### save defaults

In [3]:
# Root directory of the shared PECGS resources; the defaults dictionaries
# below interpolate it into most of their File/Directory paths.
resources_dir = '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources'

In [4]:
# Default inputs that get dumped to the pecgs_TN_wxs_fq defaults YAML files.
# Entries are either plain scalars or CWL-style {'class', 'path'} objects.
#
# Path prefixes shared by several entries. Only *strings* are shared here:
# every entry below is still its own dict object, so yaml.safe_dump never
# emits anchors/aliases for repeated values.
_dbsnp_vcf = f'{resources_dir}/dnaseq_alignment/dbsnp/00-All.chr.vcf.gz'
_ref_dir = f'{resources_dir}/cnv/references/GRCh38.d1.vd1'
_ref_fasta = f'{_ref_dir}/GRCh38.d1.vd1.fa'
_cnv_db = f'{resources_dir}/cnv/db'
_tindaisy_params = '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params'
_tinjasmine_etc = '/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc'
_vep_cache = f'{resources_dir}/vep/20220525/vep-cache.102_GRCh38.tar.gz'
_neoscan_ref = f'{resources_dir}/neoscan/refseq_hg38_june29'
_charger_deps = f'{resources_dir}/charger/CharGer_dependencyFiles'
_charger_pancan = f'{_charger_deps}/PanCan'

pecgs_storage1_defaults = {
    # general
    'cpu': 40,

    # align dna-seq
    'known_sites': {
        'class': 'File',
        'path': _dbsnp_vcf,
        'secondaryFiles': [{'class': 'File', 'path': f'{_dbsnp_vcf}.tbi'}],
    },
    'reference': {
        'class': 'File',
        'path': _ref_fasta,
        # Index files named <fasta>.<suffix>, followed by the sequence
        # dictionary, which replaces the .fa extension instead of appending.
        'secondaryFiles': [
            {'class': 'File', 'path': f'{_ref_fasta}.{suffix}'}
            for suffix in ('amb', 'ann', 'bwt', 'fai', 'pac', 'sa')
        ] + [{'class': 'File', 'path': f'{_ref_dir}/GRCh38.d1.vd1.dict'}],
    },
    'platform': 'ILLUMINA',

    # cnv
    'common_biallelic': {'class': 'File', 'path': f'{_cnv_db}/snp_database/af-only-gnomad.hg38.common_biallelic.chr1-22XY.vcf'},
    'pool_of_normals': {'class': 'File', 'path': f'{resources_dir}/cnv/pon/pecgs_pon_v1/create_pon/gatk4scnaPON.Normal.hdf5'},
    'protein_coding_gene': {'class': 'File', 'path': f'{_cnv_db}/gencode.v34.annotation.gene_filtered.protein_coding.ensembl_ID_no_version.protein-coding_hgnc_filtered.duplicates_removed.ensembl_ID_removed.txt'},
    'cytoband': {'class': 'File', 'path': f'{_cnv_db}/cytoBand.txt'},
    'reference_dir': {'class': 'Directory', 'path': _ref_dir},
    'target_interval_list': {'class': 'File', 'path': f'{_cnv_db}/IDT_xGen_Exome_Research_Panel_v1.hg38.removed_alt_chr.autosome_only.bed.preprocessed.exome.interval_list'},

    # somaticwrapper
    # all included in other inputs

    # msisensor
    'microsatellite': {'class': 'File', 'path': '/storage1/fs1/dinglab/Active/Projects/PECGS/ref_genome/hg38.microsatellite'},
    'minimal_homopolymer_size': 1,
    'minimal_microsatellite_size': 1,

    # tindaisy
    'clinvar_annotation': {'class': 'File', 'path': f'{resources_dir}/clinvar/GRCh38/clinvar_20200706.vcf.gz'},
    'rescue_clinvar': True,
    'rescue_cosmic': True,
    'call_regions': {'class': 'File', 'path': f'{_tindaisy_params}/chrlist/GRCh38.callRegions.bed.gz'},
    'canonical_BED': {'class': 'File', 'path': f'{_tindaisy_params}/chrlist/GRCh38.callRegions.bed'},
    'pindel_config': {'class': 'File', 'path': f'{_tindaisy_params}/caller_config/pindel.WES.ini'},
    'strelka_config': {'class': 'File', 'path': f'{_tindaisy_params}/caller_config/strelka.WES.ini'},
    'varscan_config': {'class': 'File', 'path': f'{_tindaisy_params}/caller_config/varscan.WES.ini'},
    'classification_config': {'class': 'File', 'path': f'{_tindaisy_params}/filter_config/classification_filter_config.ini'},
    'af_config': {'class': 'File', 'path': f'{_tindaisy_params}/filter_config/af_filter_config.ini'},
    'centromere_bed': {'class': 'File', 'path': f'{_tindaisy_params}/centromere/ucsc-centromere.GRCh38.bed'},
    'tindaisy_vep_cache_gz': {'class': 'File', 'path': _vep_cache},
    'tindaisy_vep_cache_version': 102,
    'assembly': 'GRCh38',
    'tindaisy_chrlist': {'class': 'File', 'path': f'{_tindaisy_params}/chrlist/GRCh38.d1.vd1.chrlist.txt'},
    'tindaisy_rescue_bed': {'class': 'File', 'path': f'{resources_dir}/bed/tindaisy_vaf_rescue/299_drivers.bed'},

    # TinJasmine
    'centromere': {'class': 'File', 'path': f'{_tinjasmine_etc}/centromere/ucsc-centromere.GRCh38.bed'},
    'tinjasmine_chrlist': {'class': 'File', 'path': f'{_tinjasmine_etc}/chrlist/GRCh38.d1.vd1.chrlist-reordered.txt'},
    'pindel_config_template': {'class': 'File', 'path': f'{_tinjasmine_etc}/pindel_config_template/pindel_germline_filter_config.ini'},
    'Canonical_BED': {'class': 'File', 'path': f'{_tinjasmine_etc}/Canonical_BED/GRCh38.callRegions.bed'},
    'ROI_BED': {'class': 'File', 'path': f'{_tinjasmine_etc}/ROI_BED/Homo_sapiens.GRCh38.95.allCDS.2bpFlanks.biomart.withCHR.bed'},
    # Disabled TinJasmine filter configs, kept for reference:
    # 'varscan_filter_config': {'class': 'File', 'path': f'{_tinjasmine_etc}/varscan_filter_config/VLD_FilterVCF-varscan.config.ini'},
    # 'pindel_filter_config': {'class': 'File', 'path': f'{_tinjasmine_etc}/pindel_filter_config/VLD_FilterVCF-pindel.config.ini'},
    # 'gatk_filter_config': {'class': 'File', 'path': f'{_tinjasmine_etc}/gatk_filter_config/VLD_FilterVCF-GATK.config.ini'},
    'tinjasmine_vep_cache_gz': {'class': 'File', 'path': _vep_cache},

    # neoscan
    'neoscan_ref_dir': {'class': 'Directory', 'path': _neoscan_ref},
    'neoscan_bed': {'class': 'File', 'path': f'{_neoscan_ref}/proteome.bed'},

    # charger
    'charger_inheritance_gene_list': {'class': 'File', 'path': f'{_charger_pancan}/cancer_pred_genes_160genes_011321_curated_forCharGer.txt'},
    'charger_pp2_gene_list': {'class': 'File', 'path': f'{_charger_pancan}/160cpgs.txt'},
    'charger_pathogenic_variants': {'class': 'File', 'path': f'{_charger_pancan}/emptyRemoved_20160428_pathogenic_variants_HGVSg_VEP_grch38lifOver.vcf'},
    'charger_hotspot3d_clusters': {'class': 'File', 'path': f'{_charger_pancan}/cptac_mc3_combined_noHypers_sorted.maf.3D_Proximity.pairwise.recurrence.l0.ad10.r10.clusters'},
    'charger_clinvar_alleles': {'class': 'File', 'path': f'{_charger_deps}/clinvar_alleles.single.b38.tsv.gz'},
}

In [5]:
# Write the current defaults into the pipeline repository. The context manager
# closes the file as soon as the dump finishes, so the YAML is fully flushed to
# disk instead of depending on garbage collection of an anonymous handle.
with open('/diskmnt/Projects/Users/estorrs/pecgs-pipeline/cwl/pecgs_workflows/defaults.pecgs_TN_wxs_fq.yaml', 'w') as defaults_file:
    yaml.safe_dump(pecgs_storage1_defaults, defaults_file)

In [6]:
# Write the same defaults to the wombat compute1 template. The context manager
# closes the file as soon as the dump finishes, so the YAML is fully flushed to
# disk instead of depending on garbage collection of an anonymous handle.
with open('../wombat/templates/compute1.defaults.pecgs_TN_wxs_fq.yaml', 'w') as template_file:
    yaml.safe_dump(pecgs_storage1_defaults, template_file)

In [7]:
# Default inputs that get dumped to the pecgs_TN_wxs_bam defaults YAML files.
# Entries are either plain scalars or CWL-style {'class', 'path'} objects.
#
# Path prefixes shared by several entries. Only *strings* are shared here:
# every entry below is still its own dict object, so yaml.safe_dump never
# emits anchors/aliases for repeated values.
_ref_dir = f'{resources_dir}/cnv/references/GRCh38.d1.vd1'
_ref_fasta = f'{_ref_dir}/GRCh38.d1.vd1.fa'
_cnv_db = f'{resources_dir}/cnv/db'
_tindaisy_params = '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params'
_tinjasmine_etc = '/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc'
_vep_cache = f'{resources_dir}/vep/20220525/vep-cache.102_GRCh38.tar.gz'
_neoscan_ref = f'{resources_dir}/neoscan/refseq_hg38_june29'
_charger_deps = f'{resources_dir}/charger/CharGer_dependencyFiles'
_charger_pancan = f'{_charger_deps}/PanCan'

pecgs_storage1_defaults = {
    # general
    'cpu': 40,

    'reference': {
        'class': 'File',
        'path': _ref_fasta,
        # Index files named <fasta>.<suffix>, followed by the sequence
        # dictionary, which replaces the .fa extension instead of appending.
        'secondaryFiles': [
            {'class': 'File', 'path': f'{_ref_fasta}.{suffix}'}
            for suffix in ('amb', 'ann', 'bwt', 'fai', 'pac', 'sa')
        ] + [{'class': 'File', 'path': f'{_ref_dir}/GRCh38.d1.vd1.dict'}],
    },

    # cnv
    'common_biallelic': {'class': 'File', 'path': f'{_cnv_db}/snp_database/af-only-gnomad.hg38.common_biallelic.chr1-22XY.vcf'},
    'pool_of_normals': {'class': 'File', 'path': f'{resources_dir}/cnv/pon/pecgs_pon_v1/create_pon/gatk4scnaPON.Normal.hdf5'},
    'protein_coding_gene': {'class': 'File', 'path': f'{_cnv_db}/gencode.v34.annotation.gene_filtered.protein_coding.ensembl_ID_no_version.protein-coding_hgnc_filtered.duplicates_removed.ensembl_ID_removed.txt'},
    'cytoband': {'class': 'File', 'path': f'{_cnv_db}/cytoBand.txt'},
    'reference_dir': {'class': 'Directory', 'path': _ref_dir},
    'target_interval_list': {'class': 'File', 'path': f'{_cnv_db}/IDT_xGen_Exome_Research_Panel_v1.hg38.removed_alt_chr.autosome_only.bed.preprocessed.exome.interval_list'},

    # somaticwrapper
    # all included in other inputs

    # msisensor
    'microsatellite': {'class': 'File', 'path': '/storage1/fs1/dinglab/Active/Projects/PECGS/ref_genome/hg38.microsatellite'},
    'minimal_homopolymer_size': 1,
    'minimal_microsatellite_size': 1,

    # tindaisy
    'clinvar_annotation': {'class': 'File', 'path': f'{resources_dir}/clinvar/GRCh38/clinvar_20200706.vcf.gz'},
    'rescue_clinvar': True,
    'rescue_cosmic': True,
    'call_regions': {'class': 'File', 'path': f'{_tindaisy_params}/chrlist/GRCh38.callRegions.bed.gz'},
    'canonical_BED': {'class': 'File', 'path': f'{_tindaisy_params}/chrlist/GRCh38.callRegions.bed'},
    'pindel_config': {'class': 'File', 'path': f'{_tindaisy_params}/caller_config/pindel.WES.ini'},
    'strelka_config': {'class': 'File', 'path': f'{_tindaisy_params}/caller_config/strelka.WES.ini'},
    'varscan_config': {'class': 'File', 'path': f'{_tindaisy_params}/caller_config/varscan.WES.ini'},
    'classification_config': {'class': 'File', 'path': f'{_tindaisy_params}/filter_config/classification_filter_config.ini'},
    'af_config': {'class': 'File', 'path': f'{_tindaisy_params}/filter_config/af_filter_config.ini'},
    'centromere_bed': {'class': 'File', 'path': f'{_tindaisy_params}/centromere/ucsc-centromere.GRCh38.bed'},
    'tindaisy_vep_cache_gz': {'class': 'File', 'path': _vep_cache},
    'tindaisy_vep_cache_version': 102,
    'assembly': 'GRCh38',
    'tindaisy_chrlist': {'class': 'File', 'path': f'{_tindaisy_params}/chrlist/GRCh38.d1.vd1.chrlist.txt'},
    'tindaisy_rescue_bed': {'class': 'File', 'path': f'{resources_dir}/bed/tindaisy_vaf_rescue/299_drivers.bed'},

    # TinJasmine
    'centromere': {'class': 'File', 'path': f'{_tinjasmine_etc}/centromere/ucsc-centromere.GRCh38.bed'},
    'tinjasmine_chrlist': {'class': 'File', 'path': f'{_tinjasmine_etc}/chrlist/GRCh38.d1.vd1.chrlist-reordered.txt'},
    'pindel_config_template': {'class': 'File', 'path': f'{_tinjasmine_etc}/pindel_config_template/pindel_germline_filter_config.ini'},
    'Canonical_BED': {'class': 'File', 'path': f'{_tinjasmine_etc}/Canonical_BED/GRCh38.callRegions.bed'},
    'ROI_BED': {'class': 'File', 'path': f'{_tinjasmine_etc}/ROI_BED/Homo_sapiens.GRCh38.95.allCDS.2bpFlanks.biomart.withCHR.bed'},
    # Disabled TinJasmine filter configs, kept for reference:
    # 'varscan_filter_config': {'class': 'File', 'path': f'{_tinjasmine_etc}/varscan_filter_config/VLD_FilterVCF-varscan.config.ini'},
    # 'pindel_filter_config': {'class': 'File', 'path': f'{_tinjasmine_etc}/pindel_filter_config/VLD_FilterVCF-pindel.config.ini'},
    # 'gatk_filter_config': {'class': 'File', 'path': f'{_tinjasmine_etc}/gatk_filter_config/VLD_FilterVCF-GATK.config.ini'},
    'tinjasmine_vep_cache_gz': {'class': 'File', 'path': _vep_cache},

    # neoscan
    'neoscan_ref_dir': {'class': 'Directory', 'path': _neoscan_ref},
    'neoscan_bed': {'class': 'File', 'path': f'{_neoscan_ref}/proteome.bed'},

    # charger
    'charger_inheritance_gene_list': {'class': 'File', 'path': f'{_charger_pancan}/cancer_pred_genes_160genes_011321_curated_forCharGer.txt'},
    'charger_pp2_gene_list': {'class': 'File', 'path': f'{_charger_pancan}/160cpgs.txt'},
    'charger_pathogenic_variants': {'class': 'File', 'path': f'{_charger_pancan}/emptyRemoved_20160428_pathogenic_variants_HGVSg_VEP_grch38lifOver.vcf'},
    'charger_hotspot3d_clusters': {'class': 'File', 'path': f'{_charger_pancan}/cptac_mc3_combined_noHypers_sorted.maf.3D_Proximity.pairwise.recurrence.l0.ad10.r10.clusters'},
    'charger_clinvar_alleles': {'class': 'File', 'path': f'{_charger_deps}/clinvar_alleles.single.b38.tsv.gz'},
}

In [8]:
# Write the current defaults into the pipeline repository. The context manager
# closes the file as soon as the dump finishes, so the YAML is fully flushed to
# disk instead of depending on garbage collection of an anonymous handle.
with open('/diskmnt/Projects/Users/estorrs/pecgs-pipeline/cwl/pecgs_workflows/defaults.pecgs_TN_wxs_bam.yaml', 'w') as defaults_file:
    yaml.safe_dump(pecgs_storage1_defaults, defaults_file)

In [9]:
# Write the same defaults to the wombat compute1 template. The context manager
# closes the file as soon as the dump finishes, so the YAML is fully flushed to
# disk instead of depending on garbage collection of an anonymous handle.
with open('../wombat/templates/compute1.defaults.pecgs_TN_wxs_bam.yaml', 'w') as template_file:
    yaml.safe_dump(pecgs_storage1_defaults, template_file)

In [17]:
# Default inputs that get dumped to the pecgs_T_rna_fq defaults YAML files.
# Entries are either plain scalars or CWL-style {'class', 'path'} objects.
#
# Path prefixes shared by several entries (strings only; each entry below is
# still its own dict object).
_fusion = f'{resources_dir}/fusion'
_star_index = f'{resources_dir}/bulk_expression/STAR_GRCh38_d1_vd1_d1_GENCODE_v34'

pecgs_storage1_defaults = {
    # general
    'cpu': 40,

    # fusion
    'bwts': {'class': 'Directory', 'path': f'{_fusion}/Integrate_dependencies/bwts'},
    'filter_database': {'class': 'Directory', 'path': f'{_fusion}/FilterDatabase'},
    'fusion_annotator_dir': {'class': 'Directory', 'path': f'{_fusion}/FusionAnnotator'},
    'genome_db': {'class': 'Directory', 'path': f'{_fusion}/ericscript_dependencies/ericscript_db_homosapiens_ensembl84'},
    'genome_lib_dir': {'class': 'Directory', 'path': f'{_fusion}/STAR-Fusion_dependencies/GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play/ctat_genome_lib_build_dir'},
    'integrate_annotations': {'class': 'File', 'path': f'{_fusion}/Integrate_dependencies/annot.ensembl.GRCh38.txt'},
    'integrate_executable': {'class': 'File', 'path': f'{_fusion}/INTEGRATE_0_2_6/INTEGRATE-build/bin/Integrate'},
    'integrate_fasta': {'class': 'File', 'path': f'{_fusion}/Integrate_dependencies/STAR/hg38.fa'},

    # bulk expression
    'star_index': {'class': 'Directory', 'path': _star_index},
    'gtf': {'class': 'File', 'path': f'{_star_index}/gencode.v34.annotation.gtf'},
    'gene_info': {'class': 'File', 'path': f'{_star_index}/gencode.gene.info.v34.tsv'},
}

In [18]:
# Write the current defaults into the pipeline repository. The context manager
# closes the file as soon as the dump finishes, so the YAML is fully flushed to
# disk instead of depending on garbage collection of an anonymous handle.
with open('/diskmnt/Projects/Users/estorrs/pecgs-pipeline/cwl/pecgs_workflows/defaults.pecgs_T_rna_fq.yaml', 'w') as defaults_file:
    yaml.safe_dump(pecgs_storage1_defaults, defaults_file)

In [19]:
# Write the same defaults to the wombat compute1 template. The context manager
# closes the file as soon as the dump finishes, so the YAML is fully flushed to
# disk instead of depending on garbage collection of an anonymous handle.
with open('../wombat/templates/compute1.defaults.pecgs_T_rna_fq.yaml', 'w') as template_file:
    yaml.safe_dump(pecgs_storage1_defaults, template_file)

In [20]:
# Default inputs that get dumped to the pecgs_TN_wgs_bam template YAML.
# The reference entry here carries only the FASTA path (no secondaryFiles).
_wgs_reference = {
    'class': 'File',
    'path': f'{resources_dir}/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa',
}

pecgs_storage1_defaults = {
    # general
    # 'cpu': 40,  (commented out for this workflow)

    'reference': _wgs_reference,

    # somatic sv
    'generate_evidence_bam': True,
}

In [21]:
# Write the current defaults to the wombat compute1 template. The context
# manager closes the file as soon as the dump finishes, so the YAML is fully
# flushed to disk instead of depending on garbage collection of an anonymous
# handle.
with open('../wombat/templates/compute1.defaults.pecgs_TN_wgs_bam.yaml', 'w') as template_file:
    yaml.safe_dump(pecgs_storage1_defaults, template_file)