In [54]:
import os
import re
import pandas as pd
import yaml
import uuid

In [55]:
# Load the tab-delimited bammap that lists every available input file
# (read_table defaults to a tab separator).
bammap = pd.read_table('../tests/data/pecgs_pipeline/input.bammap')
bammap

Unnamed: 0,sample_name,case,disease,experimental_strategy,sample_type,data_path,filesize,data_format,reference,UUID,system
0,HT191P1-S1H1A3Y3.WXS.R1.T,HT191P1-S1H1A3Y3,PDAC,WXS,tumor,/scratch1/fs1/dinglab/estorrs/pecgs_resources/...,12345678,FASTQ,,78777884-505c-4b2f-9170-107662c1effa,storage1
1,HT191P1-S1H1A3Y3.WXS.R2.T,HT191P1-S1H1A3Y3,PDAC,WXS,tumor,/scratch1/fs1/dinglab/estorrs/pecgs_resources/...,12345678,FASTQ,,2699158f-25fe-4f19-a7cf-2a4e1cf687b3,storage1
2,HT191P1-S1H1A3Y3.WXS.R1.N,HT191P1-S1H1A3Y3,PDAC,WXS,blood_normal,/scratch1/fs1/dinglab/estorrs/pecgs_resources/...,12345678,FASTQ,,eafb3178-e459-44c0-a374-8e17a8107fce,storage1
3,HT191P1-S1H1A3Y3.WXS.R2.N,HT191P1-S1H1A3Y3,PDAC,WXS,blood_normal,/scratch1/fs1/dinglab/estorrs/pecgs_resources/...,12345678,FASTQ,,9923349f-dc3c-40d4-a097-803c4828f732,storage1
4,HT191P1-S1H1A3Y3.RNA-seq.R1.T,HT191P1-S1H1A3Y3,PDAC,RNA-seq,tumor,/scratch1/fs1/dinglab/estorrs/pecgs_resources/...,12345678,FASTQ,,fb1f4b8a-a56b-4bc6-b059-864c2b066dc8,storage1
5,HT191P1-S1H1A3Y3.RNA-seq.R2.T,HT191P1-S1H1A3Y3,PDAC,RNA-seq,tumor,/scratch1/fs1/dinglab/estorrs/pecgs_resources/...,12345678,FASTQ,,80ce8102-e8a4-4529-85b0-701a4dc022ea,storage1


Find every case in the bammap for which all required input files are present (here: paired tumor and normal WXS FASTQs plus paired tumor RNA-seq FASTQs).

In [56]:
# Every FASTQ pair a case must provide before it is accepted, as
# (bammap experimental_strategy, output data-type key, sample-type keyword).
REQUIRED_FASTQ_PAIRS = (
    ('WXS', 'wxs', 'tumor'),
    ('WXS', 'wxs', 'normal'),
    ('RNA-seq', 'rna-seq', 'tumor'),
)


def _find_fastq_pair(case_rows, case, strategy, data_type, sample_type):
    """Collect the R1/R2 FASTQ entries of one case for one strategy and sample type.

    Parameters
    ----------
    case_rows : pandas.DataFrame
        bammap rows belonging to a single case.
    case : str
        Case identifier; only used in the warning message.
    strategy : str
        Exact value required in the ``experimental_strategy`` column.
    data_type : str
        Label used in the warning message (e.g. ``'wxs'``).
    sample_type : str
        Lower-case keyword that must occur in the ``sample_type`` column, so
        ``'normal'`` also matches ``'blood_normal'``.

    Returns
    -------
    dict
        Maps ``'R1'`` / ``'R2'`` to ``(data_path, UUID)`` of the first matching
        row whose ``sample_name`` contains ``.r1.`` / ``.r2.`` (case-insensitive).
        A read without a matching row is left out of the dict.
    """
    # Missing sample types become '' so they never match instead of raising.
    sample_types = case_rows['sample_type'].fillna('').astype(str).str.lower()
    is_match = (
        case_rows['experimental_strategy'].eq(strategy)
        & case_rows['data_format'].eq('FASTQ')
        & sample_types.str.contains(sample_type, regex=False)
    )
    matches = case_rows[is_match]
    if matches.shape[0] > 2:
        print(f'Warning: more than 2 {data_type} {sample_type} fastqs found for case {case}')

    pair = {}
    # R1 is always looked up before R2 so the key order of the result is stable.
    for read in ('R1', 'R2'):
        tag = f'.{read.lower()}.'
        for _, row in matches.iterrows():
            if tag in str(row['sample_name']).lower():
                pair[read] = (row['data_path'], row['UUID'])
                break  # keep the first hit only
    return pair


# Rows without a case identifier cannot be assigned to a case, so they are skipped.
cases = sorted(bammap['case'].dropna().unique())
inputs = {}
for case in cases:
    case_rows = bammap[bammap['case'] == case]

    # data[data_type][sample_type] -> {'R1': (path, uuid), 'R2': (path, uuid)}
    data = {}
    for strategy, data_type, sample_type in REQUIRED_FASTQ_PAIRS:
        data.setdefault(data_type, {})[sample_type] = _find_fastq_pair(
            case_rows, case, strategy, data_type, sample_type)

    # Accept the case only when every required pair has both R1 and R2.
    if all(len(pair) == 2 for by_sample_type in data.values() for pair in by_sample_type.values()):
        inputs[case] = data
inputs

{'HT191P1-S1H1A3Y3': {'wxs': {'tumor': {'R1': ('/scratch1/fs1/dinglab/estorrs/pecgs_resources/test_samples/HT191P1-S1H1A3Y3/wxs/CCAGTAGCGT-ATGTATTGGC_S53_L002_R1_001.fastq.gz',
     '78777884-505c-4b2f-9170-107662c1effa'),
    'R2': ('/scratch1/fs1/dinglab/estorrs/pecgs_resources/test_samples/HT191P1-S1H1A3Y3/wxs/CCAGTAGCGT-ATGTATTGGC_S53_L002_R2_001.fastq.gz',
     '2699158f-25fe-4f19-a7cf-2a4e1cf687b3')},
   'normal': {'R1': ('/scratch1/fs1/dinglab/estorrs/pecgs_resources/test_samples/HT191P1-S1H1A3Y3/wxs/CATTATCGCT-CTTGAAGGTT_S34_L001_R1_001.fastq.gz',
     'eafb3178-e459-44c0-a374-8e17a8107fce'),
    'R2': ('/scratch1/fs1/dinglab/estorrs/pecgs_resources/test_samples/HT191P1-S1H1A3Y3/wxs/CATTATCGCT-CTTGAAGGTT_S34_L001_R2_001.fastq.gz',
     '9923349f-dc3c-40d4-a097-803c4828f732')}},
  'rna-seq': {'tumor': {'R1': ('/scratch1/fs1/dinglab/estorrs/pecgs_resources/test_samples/HT191P1-S1H1A3Y3/rna-seq/ht191p1-s1h1a3y3e1_1.AAGGTGTTAG-TAAGCGCGTG.HFWJGDSXY_AAGGTGTTAG-TAAGCGCGTG_L001_R1.fast

In [57]:
# Flatten the nested per-case inputs into one flat mapping per case:
#   '<datatype>_<sample type>_<read>.filepath' -> path
#   '<datatype>_<sample type>_<read>.uuid'     -> uuid
# All filepath keys are inserted before any uuid key.
run_list = {}
for case_id, by_datatype in inputs.items():
    filepaths = {}
    uuids = {}
    for datatype, by_sample_type in by_datatype.items():
        for sample_type, reads in by_sample_type.items():
            for read, (filepath, file_uuid) in reads.items():
                filepaths[f'{datatype}_{sample_type}_{read}.filepath'] = filepath
                uuids[f'{datatype}_{sample_type}_{read}.uuid'] = file_uuid
    run_list[case_id] = {**filepaths, **uuids}
run_list

{'HT191P1-S1H1A3Y3': {'wxs_tumor_R1.filepath': '/scratch1/fs1/dinglab/estorrs/pecgs_resources/test_samples/HT191P1-S1H1A3Y3/wxs/CCAGTAGCGT-ATGTATTGGC_S53_L002_R1_001.fastq.gz',
  'wxs_tumor_R2.filepath': '/scratch1/fs1/dinglab/estorrs/pecgs_resources/test_samples/HT191P1-S1H1A3Y3/wxs/CCAGTAGCGT-ATGTATTGGC_S53_L002_R2_001.fastq.gz',
  'wxs_normal_R1.filepath': '/scratch1/fs1/dinglab/estorrs/pecgs_resources/test_samples/HT191P1-S1H1A3Y3/wxs/CATTATCGCT-CTTGAAGGTT_S34_L001_R1_001.fastq.gz',
  'wxs_normal_R2.filepath': '/scratch1/fs1/dinglab/estorrs/pecgs_resources/test_samples/HT191P1-S1H1A3Y3/wxs/CATTATCGCT-CTTGAAGGTT_S34_L001_R2_001.fastq.gz',
  'rna-seq_tumor_R1.filepath': '/scratch1/fs1/dinglab/estorrs/pecgs_resources/test_samples/HT191P1-S1H1A3Y3/rna-seq/ht191p1-s1h1a3y3e1_1.AAGGTGTTAG-TAAGCGCGTG.HFWJGDSXY_AAGGTGTTAG-TAAGCGCGTG_L001_R1.fastq.gz',
  'rna-seq_tumor_R2.filepath': '/scratch1/fs1/dinglab/estorrs/pecgs_resources/test_samples/HT191P1-S1H1A3Y3/rna-seq/ht191p1-s1h1a3y3e1_1.AAG

In [58]:
# One row per case, one column per flattened input key.
run_table = pd.DataFrame.from_dict(run_list).T
run_table['sample_id'] = list(run_table.index)

# Every run gets a fresh random uuid, so run ids differ between executions.
run_table['run_uuid'] = [str(uuid.uuid4()) for _ in range(len(run_table))]

# run_id = '<sample_id>_<run_uuid>'
run_ids = [f'{sample}_{run}' for sample, run in zip(run_table.index, run_table['run_uuid'])]
run_table.index = pd.Index(run_ids, name='run_id')
run_table

Unnamed: 0_level_0,rna-seq_tumor_R1.filepath,rna-seq_tumor_R1.uuid,rna-seq_tumor_R2.filepath,rna-seq_tumor_R2.uuid,wxs_normal_R1.filepath,wxs_normal_R1.uuid,wxs_normal_R2.filepath,wxs_normal_R2.uuid,wxs_tumor_R1.filepath,wxs_tumor_R1.uuid,wxs_tumor_R2.filepath,wxs_tumor_R2.uuid,sample_id,run_uuid
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
HT191P1-S1H1A3Y3_419b8098-e4b2-4318-883d-c233cb7e06c8,/scratch1/fs1/dinglab/estorrs/pecgs_resources/...,fb1f4b8a-a56b-4bc6-b059-864c2b066dc8,/scratch1/fs1/dinglab/estorrs/pecgs_resources/...,80ce8102-e8a4-4529-85b0-701a4dc022ea,/scratch1/fs1/dinglab/estorrs/pecgs_resources/...,eafb3178-e459-44c0-a374-8e17a8107fce,/scratch1/fs1/dinglab/estorrs/pecgs_resources/...,9923349f-dc3c-40d4-a097-803c4828f732,/scratch1/fs1/dinglab/estorrs/pecgs_resources/...,78777884-505c-4b2f-9170-107662c1effa,/scratch1/fs1/dinglab/estorrs/pecgs_resources/...,2699158f-25fe-4f19-a7cf-2a4e1cf687b3,HT191P1-S1H1A3Y3,419b8098-e4b2-4318-883d-c233cb7e06c8


In [59]:
# Persist the run list as YAML and as a tab-separated table. The context manager
# closes (and flushes) the YAML file handle as soon as the dump is done instead of
# leaving it open until garbage collection.
with open('../tests/data/pecgs_pipeline/run_list.yaml', 'w') as fh:
    yaml.safe_dump(run_list, fh)
run_table.to_csv('../tests/data/pecgs_pipeline/run_list.txt', sep='\t')

If we want the BAMs to carry correct read-group IDs etc., we need to pass some sequencing information to the pipeline. This information is available in the Samplemap.csv from MGI.

In [60]:
# Samplemap for the normal sample (comma-separated, which is read_csv's default).
df = pd.read_csv('/data/pecgs/test_sample/ht191p1-s1h1a3/Samplemap_normal.csv')
df

Unnamed: 0,Instrument Data ID,Flow Cell ID,Lane,Index Sequence,Status,Library Summary,Library Name,Library Type,Sample Full Name,Species Name,Creation Event ID,Creation Date,Completion Date,Gerald Bam Path,Full Path,File1,File2
0,2903505022,HH7KNDSXY,1,AGACACCAGG-GAATCCAACA,completed,2903382347,TWCE-HT200P1-M1K2Y2D1_1-lib1,genomic library,TWCE-HT200P1-M1K2Y2D1_1,human,177990049,10/25/20 14:47,10/25/20 16:20,,/gscmnt/gc13037/lims/2857177/2020-10/csf_17799...,AGACACCAGG-GAATCCAACA_S8_L001_R1_001.fastq.gz,AGACACCAGG-GAATCCAACA_S8_L001_R2_001.fastq.gz
1,2903504698,HH7KNDSXY,1,AGACATCAAC-TAGTGATAGC,completed,2903382356,TWCE-HT128B1-S1H3A2K2Y2D1-lib1,genomic library,TWCE-HT128B1-S1H3A2K2Y2D1,human,177989994,10/25/20 14:47,10/25/20 16:14,,/gscmnt/gc13037/lims/2857177/2020-10/csf_17799...,AGACATCAAC-TAGTGATAGC_S9_L001_R2_001.fastq.gz,AGACATCAAC-TAGTGATAGC_S9_L001_R1_001.fastq.gz
2,2903505057,HH7KNDSXY,1,AGATTCAGCA-GACATGACAA,completed,2903382352,TWCE-HT062B1-S1R1A5Y1D1-lib1,genomic library,TWCE-HT062B1-S1R1A5Y1D1,human,177990056,10/25/20 14:47,10/25/20 16:19,,/gscmnt/gc13037/lims/2857177/2020-10/csf_17799...,AGATTCAGCA-GACATGACAA_S13_L001_R2_001.fastq.gz,AGATTCAGCA-GACATGACAA_S13_L001_R1_001.fastq.gz
3,2903505343,HH7KNDSXY,1,AGCAATGGCT-TAGGCCGCTA,completed,2903382337,TWCE-HT036B1-S1PDH1Y2D1-lib1,genomic library,TWCE-HT036B1-S1PDH1Y2D1,human,177990104,10/25/20 14:48,10/25/20 16:26,,/gscmnt/gc13037/lims/2857177/2020-10/csf_17799...,AGCAATGGCT-TAGGCCGCTA_S15_L001_R1_001.fastq.gz,AGCAATGGCT-TAGGCCGCTA_S15_L001_R2_001.fastq.gz
4,2903504972,HH7KNDSXY,1,AGCCACCTAC-TCACAGCTGC,completed,2903382334,TWCE-HT206B1-S1H4A2Y2D1_1-lib1,genomic library,TWCE-HT206B1-S1H4A2Y2D1_1,human,177990043,10/25/20 14:47,10/25/20 16:21,,/gscmnt/gc13037/lims/2857177/2020-10/csf_17799...,AGCCACCTAC-TCACAGCTGC_S16_L001_R2_001.fastq.gz,AGCCACCTAC-TCACAGCTGC_S16_L001_R1_001.fastq.gz
5,2903504687,HH7KNDSXY,1,AGGATTAGGC-ATGTTAGACG,completed,2903382368,TWCE-HT206B1-S1H1A2Y2D1_1-lib1,genomic library,TWCE-HT206B1-S1H1A2Y2D1_1,human,177989981,10/25/20 14:46,10/25/20 16:20,,/gscmnt/gc13037/lims/2857177/2020-10/csf_17799...,AGGATTAGGC-ATGTTAGACG_S20_L001_R1_001.fastq.gz,AGGATTAGGC-ATGTTAGACG_S20_L001_R2_001.fastq.gz
6,2903504770,HH7KNDSXY,1,AGGCAATCGC-GATCGCCTCA,completed,2903382358,TWCE-HT036B1-S1PGH2Y2D1_1-lib1,genomic library,TWCE-HT036B1-S1PGH2Y2D1_1,human,177990012,10/25/20 14:47,10/25/20 16:17,,/gscmnt/gc13037/lims/2857177/2020-10/csf_17799...,AGGCAATCGC-GATCGCCTCA_S21_L001_R1_001.fastq.gz,AGGCAATCGC-GATCGCCTCA_S21_L001_R2_001.fastq.gz
7,2903504879,HH7KNDSXY,1,CACGAGTGTC-TACTCGCCGA,completed,2903382355,TWCE-HT077B1-S1H7A2K3D1_1-lib1,genomic library,TWCE-HT077B1-S1H7A2K3D1_1,human,177990027,10/25/20 14:47,10/25/20 16:19,,/gscmnt/gc13037/lims/2857177/2020-10/csf_17799...,CACGAGTGTC-TACTCGCCGA_S23_L001_R1_001.fastq.gz,CACGAGTGTC-TACTCGCCGA_S23_L001_R2_001.fastq.gz
8,2903505327,HH7KNDSXY,1,CACTCACCTC-ATTAACCAGC,completed,2903382340,TWCE-RESL9B-3622m-PDXO-DNA-lib1,genomic library,TWCE-RESL9B-3622m-PDXO-DNA,human,177990101,10/25/20 14:48,10/25/20 16:21,,/gscmnt/gc13037/lims/2857177/2020-10/csf_17799...,CACTCACCTC-ATTAACCAGC_S24_L001_R1_001.fastq.gz,CACTCACCTC-ATTAACCAGC_S24_L001_R2_001.fastq.gz
9,2903504673,HH7KNDSXY,1,CACTTACGGC-ATAGAGTTCG,completed,2903382343,TWCE-HT163B1-S1H4A3Y1D1-lib1,genomic library,TWCE-HT163B1-S1H4A3Y1D1,human,177989977,10/25/20 14:46,10/25/20 16:23,,/gscmnt/gc13037/lims/2857177/2020-10/csf_17799...,CACTTACGGC-ATAGAGTTCG_S25_L001_R1_001.fastq.gz,CACTTACGGC-ATAGAGTTCG_S25_L001_R2_001.fastq.gz


In [61]:
# Rows whose sample name mentions HT191P1 (case-insensitive, literal match);
# rows with a missing sample name are treated as non-matches instead of raising.
df[df['Sample Full Name'].str.contains('HT191P1', case=False, regex=False, na=False)]

Unnamed: 0,Instrument Data ID,Flow Cell ID,Lane,Index Sequence,Status,Library Summary,Library Name,Library Type,Sample Full Name,Species Name,Creation Event ID,Creation Date,Completion Date,Gerald Bam Path,Full Path,File1,File2
14,2903504949,HH7KNDSXY,1,CATTATCGCT-CTTGAAGGTT,completed,2903382342,TWCE-HT191P1-JM1D1_1-lib1,genomic library,TWCE-HT191P1-JM1D1_1,human,177990041,10/25/20 14:47,10/25/20 16:19,,/gscmnt/gc13037/lims/2857177/2020-10/csf_17799...,CATTATCGCT-CTTGAAGGTT_S34_L001_R2_001.fastq.gz,CATTATCGCT-CTTGAAGGTT_S34_L001_R1_001.fastq.gz


In [62]:
# Sequencing info, keyed by run_id and then by sample type.
# NOTE(review): the run_id below embeds a run_uuid, and run_uuid values are
# produced by uuid.uuid4() in this notebook -- confirm this hard-coded key matches
# the run_id that was actually written to run_list.txt.
tumor_sequencing_info = {
    'flowcell': 'HFMFWDSXY',
    'lane': '2',
    'index_sequencer': 'CCAGTAGCGT-ATGTATTGGC',
    'library_preparation': 'TWCE-HT191P1-S1H1A3Y3D1_1-lib1',
    'platform': 'ILLUMINA',
}

# These values agree with the HT191P1 row of the Samplemap shown above
# (Flow Cell ID, Lane, Index Sequence, Library Name).
normal_sequencing_info = {
    'flowcell': 'HH7KNDSXY',
    'lane': '1',
    'index_sequencer': 'CATTATCGCT-CTTGAAGGTT',
    'library_preparation': 'TWCE-HT191P1-JM1D1_1-lib1',
    'platform': 'ILLUMINA',
}

m = {
    'HT191P1-S1H1A3Y3_23402d1f-7ea6-48d0-8553-02d5f4163aed': {
        'tumor': tumor_sequencing_info,
        'normal': normal_sequencing_info,
    },
}

In [63]:
# Write the sequencing info next to the other test inputs; the context manager
# closes (and flushes) the file handle instead of leaving it open.
with open('../tests/data/pecgs_pipeline/sequencing_info.yaml', 'w') as fh:
    yaml.safe_dump(m, fh)


###### save defaults

In [8]:
def _file_entry(path, secondary=()):
    """Return a ``{'class': 'File', 'path': path}`` entry.

    When ``secondary`` paths are given, they are attached under
    ``'secondaryFiles'`` as a list of plain File entries.
    """
    entry = {'class': 'File', 'path': path}
    if secondary:
        entry['secondaryFiles'] = [{'class': 'File', 'path': extra} for extra in secondary]
    return entry


def _dir_entry(path):
    """Return a ``{'class': 'Directory', 'path': path}`` entry."""
    return {'class': 'Directory', 'path': path}


# Default pipeline inputs, grouped by the tool that consumes them.
pecgs_storage1_defaults = {
    # general
    'cpu': 40,

    # align dna-seq
    'known_sites': _file_entry(
        '/scratch1/fs1/dinglab/estorrs/pecgs_resources/dnaseq_alignment/dbsnp/00-All.chr.vcf.gz',
        secondary=[
            '/scratch1/fs1/dinglab/estorrs/pecgs_resources/dnaseq_alignment/dbsnp/00-All.chr.vcf.gz.tbi',
        ],
    ),
    'reference': _file_entry(
        '/scratch1/fs1/dinglab/estorrs/pecgs_resources/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa',
        secondary=[
            '/scratch1/fs1/dinglab/estorrs/pecgs_resources/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa.amb',
            '/scratch1/fs1/dinglab/estorrs/pecgs_resources/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa.ann',
            '/scratch1/fs1/dinglab/estorrs/pecgs_resources/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa.bwt',
            '/scratch1/fs1/dinglab/estorrs/pecgs_resources/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa.fai',
            '/scratch1/fs1/dinglab/estorrs/pecgs_resources/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa.pac',
            '/scratch1/fs1/dinglab/estorrs/pecgs_resources/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa.sa',
            '/scratch1/fs1/dinglab/estorrs/pecgs_resources/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.dict',
        ],
    ),
    'platform': 'ILLUMINA',

    # fusion
    'bwts': _dir_entry('/scratch1/fs1/dinglab/estorrs/pecgs_resources/fusion/Integrate_dependencies/bwts'),
    'filter_database': _dir_entry('/scratch1/fs1/dinglab/estorrs/pecgs_resources/fusion/FilterDatabase'),
    'fusion_annotator_dir': _dir_entry('/scratch1/fs1/dinglab/estorrs/pecgs_resources/fusion/FusionAnnotator'),
    'genome_db': _dir_entry('/scratch1/fs1/dinglab/estorrs/pecgs_resources/fusion/ericscript_dependencies/ericscript_db_homosapiens_ensembl84'),
    'genome_lib_dir': _dir_entry('/scratch1/fs1/dinglab/estorrs/pecgs_resources/fusion/STAR-Fusion_dependencies/GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play/ctat_genome_lib_build_dir'),
    'integrate_annotations': _file_entry('/scratch1/fs1/dinglab/estorrs/pecgs_resources/fusion/Integrate_dependencies/annot.ensembl.GRCh38.txt'),
    'integrate_executable': _file_entry('/scratch1/fs1/dinglab/estorrs/pecgs_resources/fusion/INTEGRATE_0_2_6/INTEGRATE-build/bin/Integrate'),
    'integrate_fasta': _file_entry('/scratch1/fs1/dinglab/estorrs/pecgs_resources/fusion/Integrate_dependencies/STAR/hg38.fa'),

    # cnv
    'common_biallelic': _file_entry('/scratch1/fs1/dinglab/estorrs/pecgs_resources/cnv/db/snp_database/af-only-gnomad.hg38.common_biallelic.chr1-22XY.vcf'),
    'pool_of_normals': _file_entry('/scratch1/fs1/dinglab/estorrs/pecgs_resources/cnv/pon/create_pon/gatk4scnaPON.Normal.hdf5'),
    'protein_coding_gene': _file_entry('/scratch1/fs1/dinglab/estorrs/pecgs_resources/cnv/db/remaking_protein-coding/gencode.v34.annotation.gene_filterd.need_gene_symbol.no_sym.filtered_to_hgnc_protein-coding_genes.bed'),
    'reference_dir': _dir_entry('/scratch1/fs1/dinglab/estorrs/pecgs_resources/cnv/references/GRCh38.d1.vd1'),
    'target_interval_list': _file_entry('/scratch1/fs1/dinglab/estorrs/pecgs_resources/cnv/db/IDT_xGen_Exome_Research_Panel_v1.hg38.removed_alt_chr.bed.target.preprocessed.exome.interval_list'),

    # msisensor
    'microsatellite': _file_entry('/storage1/fs1/dinglab/Active/Projects/PECGS/ref_genome/hg38.microsatellite'),
    'minimal_homopolymer_size': 1,
    'minimal_microsatellite_size': 1,

    # tindaisy
    'clinvar_annotation': _file_entry('/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources/clinvar/GRCh38/clinvar_20200706.vcf.gz'),
    'rescue_clinvar': True,
    'rescue_cosmic': True,
    'call_regions': _file_entry('/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/chrlist/GRCh38.callRegions.bed.gz'),
    'canonical_BED': _file_entry('/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/chrlist/GRCh38.callRegions.bed'),
    'pindel_config': _file_entry('/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/caller_config/pindel.WES.ini'),
    'strelka_config': _file_entry('/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/caller_config/strelka.WES.ini'),
    'varscan_config': _file_entry('/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/caller_config/varscan.WES.ini'),
    'classification_config': _file_entry('/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/filter_config/classification_filter_config.ini'),
    'af_config': _file_entry('/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/filter_config/af_filter_config.ini'),
    'centromere_bed': _file_entry('/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/centromere/ucsc-centromere.GRCh38.bed'),
    'vep_cache_gz': _file_entry('/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources/vep/v99/vep-cache.99_GRCh38.tar.gz'),
    'vep_cache_version': 99,
    'assembly': 'GRCh38',
    'tindaisy_chrlist': _file_entry('/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/chrlist/GRCh38.d1.vd1.chrlist.txt'),

    # TinJasmine
    'centromere': _file_entry('/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/centromere/ucsc-centromere.GRCh38.bed'),
    'tinjasmine_chrlist': _file_entry('/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/chrlist/GRCh38.d1.vd1.chrlist-reordered.txt'),
    'pindel_config_template': _file_entry('/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/pindel_config_template/pindel_germline_filter_config.ini'),
    'Canonical_BED': _file_entry('/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/Canonical_BED/GRCh38.callRegions.bed'),
    'ROI_BED': _file_entry('/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/ROI_BED/Homo_sapiens.GRCh38.95.allCDS.2bpFlanks.biomart.withCHR.bed'),
    'varscan_filter_config': _file_entry('/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/varscan_filter_config/VLD_FilterVCF-varscan.config.ini'),
    'pindel_filter_config': _file_entry('/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/pindel_filter_config/VLD_FilterVCF-pindel.config.ini'),
    'gatk_filter_config': _file_entry('/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/gatk_filter_config/VLD_FilterVCF-GATK.config.ini'),
}

In [11]:
# Save the defaults next to the CWL workflow; the context manager closes (and
# flushes) the file handle instead of leaving it open.
with open('/home/estorrs/pecgs-pipeline/cwl/pecgs_workflows/defaults.pecgs_TN_wxs_fq_T_rna_fq.yaml', 'w') as fh:
    yaml.safe_dump(pecgs_storage1_defaults, fh)

In [12]:
# Save the same defaults as a wombat template; the context manager closes (and
# flushes) the file handle instead of leaving it open.
with open('../wombat/templates/compute1_pecgs_defaults.yaml', 'w') as fh:
    yaml.safe_dump(pecgs_storage1_defaults, fh)