In [2]:
import os
import re
import pandas as pd
import yaml
import uuid

###### C3N-05929 test case

In [3]:
bammap = pd.read_csv('../tests/data/pecgs_TN_wgs_bam/input.bammap', sep='\t')
bammap

Unnamed: 0,sample_name,case,disease,experimental_strategy,sample_type,data_path,filesize,data_format,reference,UUID,system
0,C3N-05929.WGS.T,C3N-05929,LSCC,WGS,tumor,/storage1/fs1/m.wyczalkowski/Active/Primary/CP...,12345678,BAM,hg38,d3ddd33b-c7a3-4caf-bdd7-c05e98a19d83,storage1
1,C3N-05929.WGS.N,C3N-05929,LSCC,WGS,blood_normal,/storage1/fs1/m.wyczalkowski/Active/Primary/CP...,12345678,BAM,hg38,7d8f4ede-1911-456e-9eee-2fafe7132f50,storage1


In [4]:
cases = sorted(set(bammap['case']))
inputs = {}
metadata = {}
for case in cases:
    data = {
        'wgs': {
        }
    }
    filtered = bammap[bammap['case']==case]
    # check for wxs tumor bam
    f = filtered[[True if e=='WGS' and df=='BAM' and 'tumor' in st.lower() else False
                 for e, st, df in zip(filtered['experimental_strategy'],
                                      filtered['sample_type'], filtered['data_format'])]]
    if f.shape[0] > 1:
        print(f'Warning: more than 1 wgs tumor bams found for case {case}')
    if f.shape[0]:
        data['wgs']['tumor'] = (f['data_path'].to_list()[0], f['UUID'].to_list()[0])
    
    # check for wxs normal bam
    f = filtered[[True if e=='WGS' and df=='BAM' and 'normal' in st.lower() else False
                 for e, st, df in zip(filtered['experimental_strategy'],
                                      filtered['sample_type'], filtered['data_format'])]]
    if f.shape[0] > 1:
        print(f'Warning: more than 1 wgs normal bams found for case {case}')
    if f.shape[0]:
        data['wgs']['normal'] = (f['data_path'].to_list()[0], f['UUID'].to_list()[0])
            
    # if everything is present then accept sample
    if 'tumor' in data['wgs'] and 'normal' in data['wgs']:
        inputs[case] = data
        metadata[case] = {'disease': f['disease'].to_list()[0]}
inputs, metadata

({'C3N-05929': {'wgs': {'tumor': ('/storage1/fs1/m.wyczalkowski/Active/Primary/CPTAC3.share/CPTAC3-GDC/GDC_import/data/c5d518a3-fc69-4d7b-a719-0aad38abe43a/23fa88f9-9293-406b-bc21-2928ff12bc8b_wgs_gdc_realn.bam',
     'd3ddd33b-c7a3-4caf-bdd7-c05e98a19d83'),
    'normal': ('/storage1/fs1/m.wyczalkowski/Active/Primary/CPTAC3.share/CPTAC3-GDC/GDC_import/data/823dda5b-ff15-4a66-b28b-de0632850623/e3bf416b-3d08-4d1b-bec4-808e55c7788c_wgs_gdc_realn.bam',
     '7d8f4ede-1911-456e-9eee-2fafe7132f50')}}},
 {'C3N-05929': {'disease': 'LSCC'}})

In [5]:
run_list = {}
for case, d in inputs.items():
    run_list[case] = {f'{dt}_{st}_bam.filepath': fp
                     for dt, d1 in d.items()
                     for st, (fp, uid) in d1.items()}
    run_list[case].update({f'{dt}_{st}_bam.uuid': uid
                     for dt, d1 in d.items()
                     for st, (fp, uid) in d1.items()})
run_list

{'C3N-05929': {'wgs_tumor_bam.filepath': '/storage1/fs1/m.wyczalkowski/Active/Primary/CPTAC3.share/CPTAC3-GDC/GDC_import/data/c5d518a3-fc69-4d7b-a719-0aad38abe43a/23fa88f9-9293-406b-bc21-2928ff12bc8b_wgs_gdc_realn.bam',
  'wgs_normal_bam.filepath': '/storage1/fs1/m.wyczalkowski/Active/Primary/CPTAC3.share/CPTAC3-GDC/GDC_import/data/823dda5b-ff15-4a66-b28b-de0632850623/e3bf416b-3d08-4d1b-bec4-808e55c7788c_wgs_gdc_realn.bam',
  'wgs_tumor_bam.uuid': 'd3ddd33b-c7a3-4caf-bdd7-c05e98a19d83',
  'wgs_normal_bam.uuid': '7d8f4ede-1911-456e-9eee-2fafe7132f50'}}

In [8]:
run_table = pd.DataFrame.from_dict(run_list).transpose()
run_table['case_id'] = run_table.index.to_list()
run_table['run_uuid'] = [str(uuid.uuid4()) for i in range(run_table.shape[0])]
run_table.index = [f'{c}_{u}' for c, u in zip(run_table.index, run_table['run_uuid'])]
run_table.index.name = 'run_id'
run_table

Unnamed: 0_level_0,wgs_normal_bam.filepath,wgs_normal_bam.uuid,wgs_tumor_bam.filepath,wgs_tumor_bam.uuid,case_id,run_uuid
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C3N-05929_ad046693-8a91-4c29-a97e-3f13da5e7c90,/storage1/fs1/m.wyczalkowski/Active/Primary/CP...,7d8f4ede-1911-456e-9eee-2fafe7132f50,/storage1/fs1/m.wyczalkowski/Active/Primary/CP...,d3ddd33b-c7a3-4caf-bdd7-c05e98a19d83,C3N-05929,ad046693-8a91-4c29-a97e-3f13da5e7c90


In [9]:
metadata_table = pd.DataFrame.from_dict(metadata).transpose()
metadata_table

Unnamed: 0,disease
C3N-05929,LSCC


In [10]:
for c in metadata_table.columns:
    run_table[c] = [metadata_table.loc[x, c] for x in run_table['case_id']]
run_table

Unnamed: 0_level_0,wgs_normal_bam.filepath,wgs_normal_bam.uuid,wgs_tumor_bam.filepath,wgs_tumor_bam.uuid,case_id,run_uuid,disease
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C3N-05929_ad046693-8a91-4c29-a97e-3f13da5e7c90,/storage1/fs1/m.wyczalkowski/Active/Primary/CP...,7d8f4ede-1911-456e-9eee-2fafe7132f50,/storage1/fs1/m.wyczalkowski/Active/Primary/CP...,d3ddd33b-c7a3-4caf-bdd7-c05e98a19d83,C3N-05929,ad046693-8a91-4c29-a97e-3f13da5e7c90,LSCC


In [None]:
run_table['project'] = 'TCGA'

In [11]:
filtered = run_table[['case_id', 'run_uuid', 'project', 'disease', 'wgs_normal_bam.filepath', 'wgs_normal_bam.uuid', 'wgs_tumor_bam.filepath', 'wgs_tumor_bam.uuid']]
filtered.to_csv('../tests/data/pecgs_TN_wgs_bam/run_list.txt', sep='\t')


In [67]:
# df = pd.read_csv('../tests/data/pecgs_TN_wgs_bam/run_list.txt', sep='\t', index_col=0)
# df['project'] = 'TCGA'
# df = df[['case_id', 'run_uuid', 'project', 'disease', 'wgs_normal_bam.filepath', 'wgs_normal_bam.uuid', 'wgs_tumor_bam.filepath', 'wgs_tumor_bam.uuid']]
# df.to_csv('../tests/data/pecgs_TN_wgs_bam/run_list.txt', sep='\t')

Unnamed: 0_level_0,case_id,run_uuid,disease,wgs_normal_bam.filepath,wgs_normal_bam.uuid,wgs_tumor_bam.filepath,wgs_tumor_bam.uuid
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C3N-05929_ad046693-8a91-4c29-a97e-3f13da5e7c90,C3N-05929,ad046693-8a91-4c29-a97e-3f13da5e7c90,LSCC,/storage1/fs1/m.wyczalkowski/Active/Primary/CP...,7d8f4ede-1911-456e-9eee-2fafe7132f50,/storage1/fs1/m.wyczalkowski/Active/Primary/CP...,d3ddd33b-c7a3-4caf-bdd7-c05e98a19d83


###### MMRF_1250 test case

In [30]:
bammap = pd.read_csv('../tests/data/pecgs_TN_wxs_bam/input.bammap', sep='\t')
bammap

Unnamed: 0,sample_name,case,disease,experimental_strategy,sample_type,data_path,filesize,data_format,reference,UUID,system
0,C3L-00677.WXS.T,C3L-00677,GBM,WXS,tumor,/storage1/fs1/dinglab/Active/Projects/estorrs/...,12345678,BAM,hg38,6e0bde4a-22c1-4e1f-a38a-dff275b61472,storage1
1,C3L-00677.WXS.N,C3L-00677,GBM,WXS,blood_normal,/storage1/fs1/dinglab/Active/Projects/estorrs/...,12345678,BAM,hg38,8919accb-6973-4ec5-b29f-9d895a1713d4,storage1
2,HT191P1-S1H1A3Y3.RNA-seq.R1.T,C3L-00677,GBM,RNA-seq,tumor,/storage1/fs1/dinglab/Active/Projects/estorrs/...,12345678,FASTQ,,2983df85-958b-483c-95e5-2375426879d1,storage1
3,HT191P1-S1H1A3Y3.RNA-seq.R2.T,C3L-00677,GBM,RNA-seq,tumor,/storage1/fs1/dinglab/Active/Projects/estorrs/...,12345678,FASTQ,,f840b94b-eac9-4f87-98b6-51b26b606e1a,storage1


In [31]:
cases = sorted(set(bammap['case']))
inputs = {}
metadata = {}
for case in cases:
    data = {
        'wxs': {
        }
    }
    filtered = bammap[bammap['case']==case]
    # check for wxs tumor bam
    f = filtered[[True if e=='WXS' and df=='BAM' and 'tumor' in st.lower() else False
                 for e, st, df in zip(filtered['experimental_strategy'],
                                      filtered['sample_type'], filtered['data_format'])]]
    if f.shape[0] > 1:
        print(f'Warning: more than 1 wxs tumor bams found for case {case}')
    if f.shape[0]:
        data['wxs']['tumor'] = (f['data_path'].to_list()[0], f['UUID'].to_list()[0])
    
    # check for wxs normal bam
    f = filtered[[True if e=='WXS' and df=='BAM' and 'normal' in st.lower() else False
                 for e, st, df in zip(filtered['experimental_strategy'],
                                      filtered['sample_type'], filtered['data_format'])]]
    if f.shape[0] > 1:
        print(f'Warning: more than 1 wxs normal bams found for case {case}')
    if f.shape[0]:
        data['wxs']['normal'] = (f['data_path'].to_list()[0], f['UUID'].to_list()[0])
            
    # if everything is present then accept sample
    if 'tumor' in data['wxs'] and 'normal' in data['wxs']:
        inputs[case] = data
        metadata[case] = {'disease': f['disease'].to_list()[0]}
inputs, metadata

({'C3L-00677': {'wxs': {'tumor': ('/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources/test_samples/C3L-00677/wxs/1d6c4f0f-09c4-4472-a19e-30ee7e0ecf64_gdc_realn.bam',
     '6e0bde4a-22c1-4e1f-a38a-dff275b61472'),
    'normal': ('/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources/test_samples/C3L-00677/wxs/5cee818c-61e8-4441-bd9a-5c984cde9c6b_gdc_realn.bam',
     '8919accb-6973-4ec5-b29f-9d895a1713d4')}}},
 {'C3L-00677': {'disease': 'GBM'}})

In [32]:
run_list = {}
for case, d in inputs.items():
    run_list[case] = {f'{dt}_{st}_bam.filepath': fp
                     for dt, d1 in d.items()
                     for st, (fp, uid) in d1.items()}
    run_list[case].update({f'{dt}_{st}_bam.uuid': uid
                     for dt, d1 in d.items()
                     for st, (fp, uid) in d1.items()})
run_list

{'C3L-00677': {'wxs_tumor_bam.filepath': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources/test_samples/C3L-00677/wxs/1d6c4f0f-09c4-4472-a19e-30ee7e0ecf64_gdc_realn.bam',
  'wxs_normal_bam.filepath': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources/test_samples/C3L-00677/wxs/5cee818c-61e8-4441-bd9a-5c984cde9c6b_gdc_realn.bam',
  'wxs_tumor_bam.uuid': '6e0bde4a-22c1-4e1f-a38a-dff275b61472',
  'wxs_normal_bam.uuid': '8919accb-6973-4ec5-b29f-9d895a1713d4'}}

In [33]:
run_table = pd.DataFrame.from_dict(run_list).transpose()
run_table['case_id'] = run_table.index.to_list()
run_table['run_uuid'] = [str(uuid.uuid4()) for i in range(run_table.shape[0])]
run_table.index = [f'{c}_{u}' for c, u in zip(run_table.index, run_table['run_uuid'])]
run_table.index.name = 'run_id'
run_table

Unnamed: 0_level_0,wxs_normal_bam.filepath,wxs_normal_bam.uuid,wxs_tumor_bam.filepath,wxs_tumor_bam.uuid,case_id,run_uuid
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C3L-00677_9d492e62-5b46-4b9a-a4af-cd40c74548b9,/storage1/fs1/dinglab/Active/Projects/estorrs/...,8919accb-6973-4ec5-b29f-9d895a1713d4,/storage1/fs1/dinglab/Active/Projects/estorrs/...,6e0bde4a-22c1-4e1f-a38a-dff275b61472,C3L-00677,9d492e62-5b46-4b9a-a4af-cd40c74548b9


In [34]:
metadata_table = pd.DataFrame.from_dict(metadata).transpose()
metadata_table

Unnamed: 0,disease
C3L-00677,GBM


In [35]:
for c in metadata_table.columns:
    run_table[c] = [metadata_table.loc[x, c] for x in run_table['case_id']]
run_table

Unnamed: 0_level_0,wxs_normal_bam.filepath,wxs_normal_bam.uuid,wxs_tumor_bam.filepath,wxs_tumor_bam.uuid,case_id,run_uuid,disease
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C3L-00677_9d492e62-5b46-4b9a-a4af-cd40c74548b9,/storage1/fs1/dinglab/Active/Projects/estorrs/...,8919accb-6973-4ec5-b29f-9d895a1713d4,/storage1/fs1/dinglab/Active/Projects/estorrs/...,6e0bde4a-22c1-4e1f-a38a-dff275b61472,C3L-00677,9d492e62-5b46-4b9a-a4af-cd40c74548b9,GBM


In [37]:
filtered = run_table[['case_id', 'run_uuid', 'disease', 'wxs_normal_bam.filepath', 'wxs_normal_bam.uuid', 'wxs_tumor_bam.filepath', 'wxs_tumor_bam.uuid']]
filtered.to_csv('../tests/data/pecgs_TN_wxs_bam/run_list.txt', sep='\t')

In [72]:
# df = pd.read_csv('../tests/data/pecgs_TN_wxs_bam/run_list.txt', sep='\t', index_col=0)
# df['project'] = 'TCGA'
# df = df[['case_id', 'run_uuid', 'project', 'disease', 'wxs_normal_bam.filepath', 'wxs_normal_bam.uuid', 'wxs_tumor_bam.filepath', 'wxs_tumor_bam.uuid']]
# df.to_csv('../tests/data/pecgs_TN_wxs_bam/run_list.txt', sep='\t')
# # df

In [40]:
# run_map = filtered.transpose().to_dict()
# run_map = {k: {c.replace('.filepath', ''): val
#                for c, val in v.items() if 'filepath' in c}
#            for k, v in run_map.items()}
# run_map

###### HT191P1 test case

In [41]:
bammap = pd.read_csv('../tests/data/pecgs_TN_wxs_fq/input.bammap', sep='\t')
bammap

Unnamed: 0,sample_name,case,disease,experimental_strategy,sample_type,data_path,filesize,data_format,reference,UUID,system
0,HT191P1-S1H1A3Y3.WXS.R1.T,HT191P1-S1H1A3Y3,PDAC,WXS,tumor,/storage1/fs1/dinglab/Active/Projects/estorrs/...,12345678,FASTQ,,9f3664ec-db31-46ae-b9f3-c06af39bda0e,storage1
1,HT191P1-S1H1A3Y3.WXS.R2.T,HT191P1-S1H1A3Y3,PDAC,WXS,tumor,/storage1/fs1/dinglab/Active/Projects/estorrs/...,12345678,FASTQ,,b9df3743-3d7a-4ebd-89e3-85b22bb930ac,storage1
2,HT191P1-S1H1A3Y3.WXS.R1.N,HT191P1-S1H1A3Y3,PDAC,WXS,blood_normal,/storage1/fs1/dinglab/Active/Projects/estorrs/...,12345678,FASTQ,,5932ffee-c02e-4816-8e2f-1dd42d15ee31,storage1
3,HT191P1-S1H1A3Y3.WXS.R2.N,HT191P1-S1H1A3Y3,PDAC,WXS,blood_normal,/storage1/fs1/dinglab/Active/Projects/estorrs/...,12345678,FASTQ,,7bbe5119-4d51-4810-893c-0e0797c8219b,storage1
4,HT191P1-S1H1A3Y3.RNA-seq.R1.T,HT191P1-S1H1A3Y3,PDAC,RNA-seq,tumor,/storage1/fs1/dinglab/Active/Projects/estorrs/...,12345678,FASTQ,,06955efb-3963-416e-8a61-24c298f2f108,storage1
5,HT191P1-S1H1A3Y3.RNA-seq.R2.T,HT191P1-S1H1A3Y3,PDAC,RNA-seq,tumor,/storage1/fs1/dinglab/Active/Projects/estorrs/...,12345678,FASTQ,,ddd71943-6ad8-4c96-8935-0caba920da00,storage1


looking for all samples in bammap with all input files present (in this case paired WXS fastqs and rna-seq tumor fastqs)

In [42]:
cases = sorted(set(bammap['case']))
inputs = {}
metadata = {}
for case in cases:
    data = {
        'wxs': {
            'tumor': {},
            'normal': {}
        },
        'rna-seq': {
            'tumor': {}
        }
    }
    filtered = bammap[bammap['case']==case]
    
    # check for wxs tumor fastqs
    f = filtered[[True if e=='WXS' and df=='FASTQ' and 'tumor' in st.lower() else False
                 for e, st, df in zip(filtered['experimental_strategy'],
                                      filtered['sample_type'], filtered['data_format'])]]
    if f.shape[0] > 2:
        print(f'Warning: more than 2 wxs tumor fastqs found for case {case}')
    for i, row in f.iterrows():
        if '.r1.' in row['sample_name'].lower():
            data['wxs']['tumor']['R1'] = (row['data_path'], row['UUID'])
            break
    for i, row in f.iterrows():
        if '.r2.' in row['sample_name'].lower():
            data['wxs']['tumor']['R2'] = (row['data_path'], row['UUID'])
            break
            
    # check for wxs normal fastqs
    f = filtered[[True if e=='WXS' and df=='FASTQ' and 'normal' in st.lower() else False
                 for e, st, df in zip(filtered['experimental_strategy'],
                                      filtered['sample_type'], filtered['data_format'])]]
    if f.shape[0] > 2:
        print(f'Warning: more than 2 wxs normal fastqs found for case {case}')
    for i, row in f.iterrows():
        if '.r1.' in row['sample_name'].lower():
            data['wxs']['normal']['R1'] = (row['data_path'], row['UUID'])
            break
    for i, row in f.iterrows():
        if '.r2.' in row['sample_name'].lower():
            data['wxs']['normal']['R2'] = (row['data_path'], row['UUID'])
            break
            
    # check for rna-seq tumor fastqs
    f = filtered[[True if e=='RNA-seq' and df=='FASTQ' and 'tumor' in st.lower() else False
                 for e, st, df in zip(filtered['experimental_strategy'],
                                      filtered['sample_type'], filtered['data_format'])]]
    if f.shape[0] > 2:
        print(f'Warning: more than 2 rna-seq tumor fastqs found for case {case}')
    for i, row in f.iterrows():
        if '.r1.' in row['sample_name'].lower():
            data['rna-seq']['tumor']['R1'] = (row['data_path'], row['UUID'])
            break
    for i, row in f.iterrows():
        if '.r2.' in row['sample_name'].lower():
            data['rna-seq']['tumor']['R2'] = (row['data_path'], row['UUID'])
            break
            
    
    # if everything is present then accept sample
    # dont do in real life bc pipeline is split by input, but easier this way for test case
    if len(data['wxs']['tumor']) == 2 and len(data['wxs']['normal']) == 2 and len(data['rna-seq']['tumor']) == 2:
        inputs[case] = data
        metadata[case] = {'disease': f['disease'].to_list()[0]}
inputs, metadata

({'HT191P1-S1H1A3Y3': {'wxs': {'tumor': {'R1': ('/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources/test_samples/HT191P1-S1H1A3Y3/wxs/CCAGTAGCGT-ATGTATTGGC_S53_L002_R1_001.fastq.gz',
      '9f3664ec-db31-46ae-b9f3-c06af39bda0e'),
     'R2': ('/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources/test_samples/HT191P1-S1H1A3Y3/wxs/CCAGTAGCGT-ATGTATTGGC_S53_L002_R2_001.fastq.gz',
      'b9df3743-3d7a-4ebd-89e3-85b22bb930ac')},
    'normal': {'R1': ('/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources/test_samples/HT191P1-S1H1A3Y3/wxs/CATTATCGCT-CTTGAAGGTT_S34_L001_R1_001.fastq.gz',
      '5932ffee-c02e-4816-8e2f-1dd42d15ee31'),
     'R2': ('/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources/test_samples/HT191P1-S1H1A3Y3/wxs/CATTATCGCT-CTTGAAGGTT_S34_L001_R2_001.fastq.gz',
      '7bbe5119-4d51-4810-893c-0e0797c8219b')}},
   'rna-seq': {'tumor': {'R1': ('/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources/test_samples/HT191P1-S1H1A3Y3/rna-se

In [43]:
run_list = {}
for case, d in inputs.items():
    run_list[case] = {f'{dt}_{st}_{r}.filepath': fp
                     for dt, d1 in d.items()
                     for st, d2 in d1.items()
                     for r, (fp, uid) in d2.items()}
    run_list[case].update({f'{dt}_{st}_{r}.uuid': uid
                     for dt, d1 in d.items()
                     for st, d2 in d1.items()
                     for r, (fp, uid) in d2.items()})
run_list

{'HT191P1-S1H1A3Y3': {'wxs_tumor_R1.filepath': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources/test_samples/HT191P1-S1H1A3Y3/wxs/CCAGTAGCGT-ATGTATTGGC_S53_L002_R1_001.fastq.gz',
  'wxs_tumor_R2.filepath': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources/test_samples/HT191P1-S1H1A3Y3/wxs/CCAGTAGCGT-ATGTATTGGC_S53_L002_R2_001.fastq.gz',
  'wxs_normal_R1.filepath': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources/test_samples/HT191P1-S1H1A3Y3/wxs/CATTATCGCT-CTTGAAGGTT_S34_L001_R1_001.fastq.gz',
  'wxs_normal_R2.filepath': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources/test_samples/HT191P1-S1H1A3Y3/wxs/CATTATCGCT-CTTGAAGGTT_S34_L001_R2_001.fastq.gz',
  'rna-seq_tumor_R1.filepath': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources/test_samples/HT191P1-S1H1A3Y3/rna-seq/ht191p1-s1h1a3y3e1_1.AAGGTGTTAG-TAAGCGCGTG.HFWJGDSXY_AAGGTGTTAG-TAAGCGCGTG_L001_R1.fastq.gz',
  'rna-seq_tumor_R2.filepath': '/storage1/fs1/dinglab/Active

In [44]:
run_table = pd.DataFrame.from_dict(run_list).transpose()
run_table['case_id'] = run_table.index.to_list()
run_table['run_uuid'] = [str(uuid.uuid4()) for i in range(run_table.shape[0])]
run_table.index = [f'{c}_{u}' for c, u in zip(run_table.index, run_table['run_uuid'])]
run_table.index.name = 'run_id'
run_table

Unnamed: 0_level_0,rna-seq_tumor_R1.filepath,rna-seq_tumor_R1.uuid,rna-seq_tumor_R2.filepath,rna-seq_tumor_R2.uuid,wxs_normal_R1.filepath,wxs_normal_R1.uuid,wxs_normal_R2.filepath,wxs_normal_R2.uuid,wxs_tumor_R1.filepath,wxs_tumor_R1.uuid,wxs_tumor_R2.filepath,wxs_tumor_R2.uuid,case_id,run_uuid
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
HT191P1-S1H1A3Y3_b089ade7-05a6-4dbf-97c2-b9eb97628d17,/storage1/fs1/dinglab/Active/Projects/estorrs/...,06955efb-3963-416e-8a61-24c298f2f108,/storage1/fs1/dinglab/Active/Projects/estorrs/...,ddd71943-6ad8-4c96-8935-0caba920da00,/storage1/fs1/dinglab/Active/Projects/estorrs/...,5932ffee-c02e-4816-8e2f-1dd42d15ee31,/storage1/fs1/dinglab/Active/Projects/estorrs/...,7bbe5119-4d51-4810-893c-0e0797c8219b,/storage1/fs1/dinglab/Active/Projects/estorrs/...,9f3664ec-db31-46ae-b9f3-c06af39bda0e,/storage1/fs1/dinglab/Active/Projects/estorrs/...,b9df3743-3d7a-4ebd-89e3-85b22bb930ac,HT191P1-S1H1A3Y3,b089ade7-05a6-4dbf-97c2-b9eb97628d17


In [45]:
metadata_table = pd.DataFrame.from_dict(metadata).transpose()
metadata_table

Unnamed: 0,disease
HT191P1-S1H1A3Y3,PDAC


In [46]:
for c in metadata_table.columns:
    run_table[c] = [metadata_table.loc[x, c] for x in run_table['case_id']]
run_table

Unnamed: 0_level_0,rna-seq_tumor_R1.filepath,rna-seq_tumor_R1.uuid,rna-seq_tumor_R2.filepath,rna-seq_tumor_R2.uuid,wxs_normal_R1.filepath,wxs_normal_R1.uuid,wxs_normal_R2.filepath,wxs_normal_R2.uuid,wxs_tumor_R1.filepath,wxs_tumor_R1.uuid,wxs_tumor_R2.filepath,wxs_tumor_R2.uuid,case_id,run_uuid,disease
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
HT191P1-S1H1A3Y3_b089ade7-05a6-4dbf-97c2-b9eb97628d17,/storage1/fs1/dinglab/Active/Projects/estorrs/...,06955efb-3963-416e-8a61-24c298f2f108,/storage1/fs1/dinglab/Active/Projects/estorrs/...,ddd71943-6ad8-4c96-8935-0caba920da00,/storage1/fs1/dinglab/Active/Projects/estorrs/...,5932ffee-c02e-4816-8e2f-1dd42d15ee31,/storage1/fs1/dinglab/Active/Projects/estorrs/...,7bbe5119-4d51-4810-893c-0e0797c8219b,/storage1/fs1/dinglab/Active/Projects/estorrs/...,9f3664ec-db31-46ae-b9f3-c06af39bda0e,/storage1/fs1/dinglab/Active/Projects/estorrs/...,b9df3743-3d7a-4ebd-89e3-85b22bb930ac,HT191P1-S1H1A3Y3,b089ade7-05a6-4dbf-97c2-b9eb97628d17,PDAC


wxs fastq runlist

In [47]:
filtered = run_table[['case_id', 'run_uuid', 'disease', 'wxs_normal_R1.filepath', 'wxs_normal_R1.uuid', 'wxs_normal_R2.filepath', 'wxs_normal_R2.uuid', 'wxs_tumor_R1.filepath', 'wxs_tumor_R1.uuid', 'wxs_tumor_R2.filepath', 'wxs_tumor_R2.uuid']]
filtered.to_csv('../tests/data/pecgs_TN_wxs_fq/run_list.txt', sep='\t')

rnaseq fq runlist

In [48]:
filtered = run_table[['case_id', 'run_uuid', 'disease', 'rna-seq_tumor_R1.filepath', 'rna-seq_tumor_R1.uuid', 'rna-seq_tumor_R2.filepath', 'rna-seq_tumor_R2.uuid']]
filtered.to_csv('../tests/data/pecgs_T_rna_fq/run_list.txt', sep='\t')

In [81]:
df = pd.read_csv('../tests/data/pecgs_TN_wxs_fq/run_list.txt', sep='\t', index_col=0)
# df['project'] = 'TCGA'
# df['disease'] = 'PAAD'
# df = df[['case_id', 'run_uuid', 'project', 'disease', 'wxs_normal_R1.filepath', 'wxs_normal_R1.uuid', 'wxs_normal_R2.filepath', 'wxs_normal_R2.uuid', 'wxs_tumor_R1.filepath', 'wxs_tumor_R1.uuid', 'wxs_tumor_R2.filepath', 'wxs_tumor_R2.uuid']]
# df.to_csv('../tests/data/pecgs_TN_wxs_fq/run_list.txt', sep='\t')
df

Unnamed: 0_level_0,case_id,run_uuid,project,disease,wxs_normal_R1.filepath,wxs_normal_R1.uuid,wxs_normal_R2.filepath,wxs_normal_R2.uuid,wxs_tumor_R1.filepath,wxs_tumor_R1.uuid,wxs_tumor_R2.filepath,wxs_tumor_R2.uuid
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
HT191P1-S1H1A3Y3_b089ade7-05a6-4dbf-97c2-b9eb97628d17,HT191P1-S1H1A3Y3,b089ade7-05a6-4dbf-97c2-b9eb97628d17,TCGA,PAAD,/storage1/fs1/dinglab/Active/Projects/estorrs/...,5932ffee-c02e-4816-8e2f-1dd42d15ee31,/storage1/fs1/dinglab/Active/Projects/estorrs/...,7bbe5119-4d51-4810-893c-0e0797c8219b,/storage1/fs1/dinglab/Active/Projects/estorrs/...,9f3664ec-db31-46ae-b9f3-c06af39bda0e,/storage1/fs1/dinglab/Active/Projects/estorrs/...,b9df3743-3d7a-4ebd-89e3-85b22bb930ac


In [82]:
df = pd.read_csv('../tests/data/pecgs_T_rna_fq/run_list.txt', sep='\t', index_col=0)
# df['project'] = 'TCGA'
# df['disease'] = 'PAAD'
# df = df[['case_id', 'run_uuid', 'project', 'disease', 'rna-seq_tumor_R1.filepath', 'rna-seq_tumor_R1.uuid', 'rna-seq_tumor_R2.filepath', 'rna-seq_tumor_R2.uuid']]
# df.to_csv('../tests/data/pecgs_T_rna_fq/run_list.txt', sep='\t')
df

Unnamed: 0_level_0,case_id,run_uuid,project,disease,rna-seq_tumor_R1.filepath,rna-seq_tumor_R1.uuid,rna-seq_tumor_R2.filepath,rna-seq_tumor_R2.uuid
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
HT191P1-S1H1A3Y3_b089ade7-05a6-4dbf-97c2-b9eb97628d17,HT191P1-S1H1A3Y3,b089ade7-05a6-4dbf-97c2-b9eb97628d17,TCGA,PAAD,/storage1/fs1/dinglab/Active/Projects/estorrs/...,06955efb-3963-416e-8a61-24c298f2f108,/storage1/fs1/dinglab/Active/Projects/estorrs/...,ddd71943-6ad8-4c96-8935-0caba920da00


if we want the .bams to have correct readgroup ids, etc. then we need to pass some sequencing info. this info is available in the Samplemap.csv from MGI

In [27]:
df = pd.read_csv('/data/pecgs/test_sample/ht191p1-s1h1a3/Samplemap.csv', sep=',')
df

FileNotFoundError: [Errno 2] No such file or directory: '/data/pecgs/test_sample/ht191p1-s1h1a3/Samplemap.csv'

In [13]:
data = [
    ['HT191P1-S1H1A3Y3', '419b8098-e4b2-4318-883d-c233cb7e06c8', 'wxs', 'tumor', 'HFMFWDSXY', '2', 'CCAGTAGCGT-ATGTATTGGC', 'TWCE-HT191P1-S1H1A3Y3D1_1-lib1', 'ILLUMINA'],
    ['HT191P1-S1H1A3Y3', '419b8098-e4b2-4318-883d-c233cb7e06c8', 'wxs', 'normal', 'HH7KNDSXY', '1', 'CATTATCGCT-CTTGAAGGTT', 'TWCE-HT191P1-JM1D1_1-lib1', 'ILLUMINA']
]
sq_df = pd.DataFrame(data=data, columns=['sample_id', 'run_uuid', 'experimental_strategy', 'sample_type',
                                        'flowcell', 'lane', 'index_sequencer', 'library_preparation', 'platform'])
sq_df.index = [f'{s}_{u}' for s, u in zip(sq_df['sample_id'], sq_df['run_uuid'])]
sq_df.index.name = 'run_id'
sq_df



Unnamed: 0_level_0,sample_id,run_uuid,experimental_strategy,sample_type,flowcell,lane,index_sequencer,library_preparation,platform
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
HT191P1-S1H1A3Y3_419b8098-e4b2-4318-883d-c233cb7e06c8,HT191P1-S1H1A3Y3,419b8098-e4b2-4318-883d-c233cb7e06c8,wxs,tumor,HFMFWDSXY,2,CCAGTAGCGT-ATGTATTGGC,TWCE-HT191P1-S1H1A3Y3D1_1-lib1,ILLUMINA
HT191P1-S1H1A3Y3_419b8098-e4b2-4318-883d-c233cb7e06c8,HT191P1-S1H1A3Y3,419b8098-e4b2-4318-883d-c233cb7e06c8,wxs,normal,HH7KNDSXY,1,CATTATCGCT-CTTGAAGGTT,TWCE-HT191P1-JM1D1_1-lib1,ILLUMINA


In [14]:
sq_df.to_csv('../tests/data/pecgs_TN_wxs_fq/sequencing_info.txt', sep='\t')


In [87]:
df = pd.read_csv('../tests/data/pecgs_TN_wxs_fq/sequencing_info.txt', sep='\t', index_col=0)
df


Unnamed: 0,sample_id,run_uuid,experimental_strategy,sample_type,flowcell,lane,index_sequencer,library_preparation,platform
HT191P1-S1H1A3Y3_b089ade7-05a6-4dbf-97c2-b9eb97628d17,HT191P1-S1H1A3Y3,b089ade7-05a6-4dbf-97c2-b9eb97628d17,wxs,tumor,HFMFWDSXY,2,CCAGTAGCGT-ATGTATTGGC,TWCE-HT191P1-S1H1A3Y3D1_1-lib1,ILLUMINA
HT191P1-S1H1A3Y3_b089ade7-05a6-4dbf-97c2-b9eb97628d17,HT191P1-S1H1A3Y3,b089ade7-05a6-4dbf-97c2-b9eb97628d17,wxs,normal,HH7KNDSXY,1,CATTATCGCT-CTTGAAGGTT,TWCE-HT191P1-JM1D1_1-lib1,ILLUMINA


###### save defaults

In [3]:
resources_dir = '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs_resources'

In [4]:
pecgs_storage1_defaults = {
    # general
    'cpu': 40,
    
    # align dna-seq
    'known_sites': {
        'class': 'File',
        'path': f'{resources_dir}/dnaseq_alignment/dbsnp/00-All.chr.vcf.gz',
        'secondaryFiles': [
            {'class': 'File', 'path': f'{resources_dir}/dnaseq_alignment/dbsnp/00-All.chr.vcf.gz.tbi'}
        ]
    },
    'reference': {
        'class': 'File',
        'path': f'{resources_dir}/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa',
        'secondaryFiles': [
            {'class': 'File', 'path': f'{resources_dir}/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa.amb'},
            {'class': 'File', 'path': f'{resources_dir}/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa.ann'},
            {'class': 'File', 'path': f'{resources_dir}/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa.bwt'},
            {'class': 'File', 'path': f'{resources_dir}/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa.fai'},
            {'class': 'File', 'path': f'{resources_dir}/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa.pac'},
            {'class': 'File', 'path': f'{resources_dir}/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa.sa'},
            {'class': 'File', 'path': f'{resources_dir}/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.dict'}
        ]
    },
    'platform': 'ILLUMINA',
    
    # cnv
    'common_biallelic': {
        'class': 'File',
        'path': f'{resources_dir}/cnv/db/snp_database/af-only-gnomad.hg38.common_biallelic.chr1-22XY.vcf'
    },
    'pool_of_normals': {
        'class': 'File',
        'path': f'{resources_dir}/cnv/pon/pecgs_pon_v1/create_pon/gatk4scnaPON.Normal.hdf5'
    },
    'protein_coding_gene': {
        'class': 'File',
        'path': f'{resources_dir}/cnv/db/gencode.v34.annotation.gene_filtered.protein_coding.ensembl_ID_no_version.protein-coding_hgnc_filtered.duplicates_removed.ensembl_ID_removed.txt'
    },
    'cytoband': {
        'class': 'File',
        'path': f'{resources_dir}/cnv/db/cytoBand.txt'
    },
    'reference_dir': {
        'class': 'Directory',
        'path': f'{resources_dir}/cnv/references/GRCh38.d1.vd1'
    },
    'target_interval_list': {
        'class': 'File',
        'path': f'{resources_dir}/cnv/db/IDT_xGen_Exome_Research_Panel_v1.hg38.removed_alt_chr.autosome_only.bed.preprocessed.exome.interval_list'
    },
    
    # msisensor
    'microsatellite': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Projects/PECGS/ref_genome/hg38.microsatellite'
    },
    'minimal_homopolymer_size': 1,
    'minimal_microsatellite_size': 1,
    
    # tindaisy
    'clinvar_annotation': {
        'class': 'File',
        'path': f'{resources_dir}/clinvar/GRCh38/clinvar_20200706.vcf.gz'
    },
    'rescue_clinvar': True,
    'rescue_cosmic': True,
    'call_regions': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/chrlist/GRCh38.callRegions.bed.gz'
    },
    'canonical_BED': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/chrlist/GRCh38.callRegions.bed'
    },
    'pindel_config': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/caller_config/pindel.WES.ini'
    },
    'strelka_config': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/caller_config/strelka.WES.ini'
    },
    'varscan_config': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/caller_config/varscan.WES.ini'
    },
    'classification_config': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/filter_config/classification_filter_config.ini'
    },
    'af_config': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/filter_config/af_filter_config.ini'
    },
    'centromere_bed': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/centromere/ucsc-centromere.GRCh38.bed'
    },
    'tindaisy_vep_cache_gz': {
        'class': 'File',
        'path': f'{resources_dir}/vep/20220525/vep-cache.102_GRCh38.tar.gz'
    },
    'tindaisy_vep_cache_version': 102,
    'assembly': 'GRCh38',
    'tindaisy_chrlist': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/chrlist/GRCh38.d1.vd1.chrlist.txt'
    },
    'tindaisy_rescue_bed': {
        'class': 'File',
        'path': f'{resources_dir}/bed/tindaisy_vaf_rescue/299_drivers.bed'
    },
    
    # TinJasmine
    'centromere': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/centromere/ucsc-centromere.GRCh38.bed'
    },
    'tinjasmine_chrlist': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/chrlist/GRCh38.d1.vd1.chrlist-reordered.txt'
    },
    'pindel_config_template': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/pindel_config_template/pindel_germline_filter_config.ini'
    },
    'Canonical_BED': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/Canonical_BED/GRCh38.callRegions.bed'
    },
    'ROI_BED': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/ROI_BED/Homo_sapiens.GRCh38.95.allCDS.2bpFlanks.biomart.withCHR.bed'
    },
#     'varscan_filter_config': {
#         'class': 'File',
#         'path': '/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/varscan_filter_config/VLD_FilterVCF-varscan.config.ini'
#     },
#     'pindel_filter_config': {
#         'class': 'File',
#         'path': '/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/pindel_filter_config/VLD_FilterVCF-pindel.config.ini'
#     },
#     'gatk_filter_config': {
#         'class': 'File',
#         'path': '/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/gatk_filter_config/VLD_FilterVCF-GATK.config.ini'
#     },
    'tinjasmine_vep_cache_gz': {
        'class': 'File',
        'path': f'{resources_dir}/vep/20220525/vep-cache.102_GRCh38.tar.gz'
    },
    
    # neoscan
    'neoscan_ref_dir': {
        'class': 'Directory',
        'path': f'{resources_dir}/neoscan/refseq_hg38_june29'
    },
    'neoscan_bed': {
        'class': 'File',
        'path': f'{resources_dir}/neoscan/refseq_hg38_june29/proteome.bed'
    },
    
    # charger
    'charger_inheritance_gene_list': {
        'class': 'File',
        'path': f'{resources_dir}/charger/CharGer_dependencyFiles/PanCan/cancer_pred_genes_160genes_011321_curated_forCharGer.txt'
    },
    'charger_pp2_gene_list': {
        'class': 'File',
        'path': f'{resources_dir}/charger/CharGer_dependencyFiles/PanCan/160cpgs.txt'
    },
    'charger_pathogenic_variants': {
        'class': 'File',
        'path': f'{resources_dir}/charger/CharGer_dependencyFiles/PanCan/emptyRemoved_20160428_pathogenic_variants_HGVSg_VEP_grch38lifOver.vcf'
    },
    'charger_hotspot3d_clusters': {
        'class': 'File',
        'path': f'{resources_dir}/charger/CharGer_dependencyFiles/PanCan/cptac_mc3_combined_noHypers_sorted.maf.3D_Proximity.pairwise.recurrence.l0.ad10.r10.clusters'
    },
    'charger_clinvar_alleles': {
        'class': 'File',
        'path': f'{resources_dir}/charger/CharGer_dependencyFiles/clinvar_alleles.single.b38.tsv.gz'
    },
    
}

In [5]:
yaml.safe_dump(pecgs_storage1_defaults, open('/diskmnt/Projects/Users/estorrs/pecgs-pipeline/cwl/pecgs_workflows/defaults.pecgs_TN_wxs_fq.yaml', 'w'))

In [6]:
yaml.safe_dump(pecgs_storage1_defaults, open('../wombat/templates/compute1.defaults.pecgs_TN_wxs_fq.yaml', 'w'))

In [7]:
pecgs_storage1_defaults = {
    # general
    'cpu': 40,
    
    'reference': {
        'class': 'File',
        'path': f'{resources_dir}/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa',
        'secondaryFiles': [
            {'class': 'File', 'path': f'{resources_dir}/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa.amb'},
            {'class': 'File', 'path': f'{resources_dir}/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa.ann'},
            {'class': 'File', 'path': f'{resources_dir}/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa.bwt'},
            {'class': 'File', 'path': f'{resources_dir}/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa.fai'},
            {'class': 'File', 'path': f'{resources_dir}/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa.pac'},
            {'class': 'File', 'path': f'{resources_dir}/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa.sa'},
            {'class': 'File', 'path': f'{resources_dir}/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.dict'}
        ]
    },
    
    # cnv
    'common_biallelic': {
        'class': 'File',
        'path': f'{resources_dir}/cnv/db/snp_database/af-only-gnomad.hg38.common_biallelic.chr1-22XY.vcf'
    },
    'pool_of_normals': {
        'class': 'File',
        'path': f'{resources_dir}/cnv/pon/pecgs_pon_v1/create_pon/gatk4scnaPON.Normal.hdf5'
    },
    'protein_coding_gene': {
        'class': 'File',
        'path': f'{resources_dir}/cnv/db/gencode.v34.annotation.gene_filtered.protein_coding.ensembl_ID_no_version.protein-coding_hgnc_filtered.duplicates_removed.ensembl_ID_removed.txt'
    },
    'cytoband': {
        'class': 'File',
        'path': f'{resources_dir}/cnv/db/cytoBand.txt'
    },
    'reference_dir': {
        'class': 'Directory',
        'path': f'{resources_dir}/cnv/references/GRCh38.d1.vd1'
    },
    'target_interval_list': {
        'class': 'File',
        'path': f'{resources_dir}/cnv/db/IDT_xGen_Exome_Research_Panel_v1.hg38.removed_alt_chr.autosome_only.bed.preprocessed.exome.interval_list'
    },
    
    # msisensor
    'microsatellite': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Projects/PECGS/ref_genome/hg38.microsatellite'
    },
    'minimal_homopolymer_size': 1,
    'minimal_microsatellite_size': 1,
    
    # tindaisy
    'clinvar_annotation': {
        'class': 'File',
        'path': f'{resources_dir}/clinvar/GRCh38/clinvar_20200706.vcf.gz'
    },
    'rescue_clinvar': True,
    'rescue_cosmic': True,
    'call_regions': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/chrlist/GRCh38.callRegions.bed.gz'
    },
    'canonical_BED': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/chrlist/GRCh38.callRegions.bed'
    },
    'pindel_config': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/caller_config/pindel.WES.ini'
    },
    'strelka_config': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/caller_config/strelka.WES.ini'
    },
    'varscan_config': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/caller_config/varscan.WES.ini'
    },
    'classification_config': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/filter_config/classification_filter_config.ini'
    },
    'af_config': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/filter_config/af_filter_config.ini'
    },
    'centromere_bed': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/centromere/ucsc-centromere.GRCh38.bed'
    },
    'tindaisy_vep_cache_gz': {
        'class': 'File',
        'path': f'{resources_dir}/vep/20220525/vep-cache.102_GRCh38.tar.gz'
    },
    'tindaisy_vep_cache_version': 102,
    'assembly': 'GRCh38',
    'tindaisy_chrlist': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Projects/estorrs/pecgs-pipeline/submodules/TinDaisy/params/chrlist/GRCh38.d1.vd1.chrlist.txt'
    },
    'tindaisy_rescue_bed': {
        'class': 'File',
        'path': f'{resources_dir}/bed/tindaisy_vaf_rescue/299_drivers.bed'
    },
    
    # TinJasmine
    'centromere': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/centromere/ucsc-centromere.GRCh38.bed'
    },
    'tinjasmine_chrlist': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/chrlist/GRCh38.d1.vd1.chrlist-reordered.txt'
    },
    'pindel_config_template': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/pindel_config_template/pindel_germline_filter_config.ini'
    },
    'Canonical_BED': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/Canonical_BED/GRCh38.callRegions.bed'
    },
    'ROI_BED': {
        'class': 'File',
        'path': '/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/ROI_BED/Homo_sapiens.GRCh38.95.allCDS.2bpFlanks.biomart.withCHR.bed'
    },
#     'varscan_filter_config': {
#         'class': 'File',
#         'path': '/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/varscan_filter_config/VLD_FilterVCF-varscan.config.ini'
#     },
#     'pindel_filter_config': {
#         'class': 'File',
#         'path': '/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/pindel_filter_config/VLD_FilterVCF-pindel.config.ini'
#     },
#     'gatk_filter_config': {
#         'class': 'File',
#         'path': '/storage1/fs1/dinglab/Active/Resources/Pipelines/TinJasmine/TinJasmine_etc/gatk_filter_config/VLD_FilterVCF-GATK.config.ini'
#     },
    'tinjasmine_vep_cache_gz': {
        'class': 'File',
        'path': f'{resources_dir}/vep/20220525/vep-cache.102_GRCh38.tar.gz'
    },
    
    # neoscan
    'neoscan_ref_dir': {
        'class': 'Directory',
        'path': f'{resources_dir}/neoscan/refseq_hg38_june29'
    },
    'neoscan_bed': {
        'class': 'File',
        'path': f'{resources_dir}/neoscan/refseq_hg38_june29/proteome.bed'
    },
    
    # charger
    'charger_inheritance_gene_list': {
        'class': 'File',
        'path': f'{resources_dir}/charger/CharGer_dependencyFiles/PanCan/cancer_pred_genes_160genes_011321_curated_forCharGer.txt'
    },
    'charger_pp2_gene_list': {
        'class': 'File',
        'path': f'{resources_dir}/charger/CharGer_dependencyFiles/PanCan/160cpgs.txt'
    },
    'charger_pathogenic_variants': {
        'class': 'File',
        'path': f'{resources_dir}/charger/CharGer_dependencyFiles/PanCan/emptyRemoved_20160428_pathogenic_variants_HGVSg_VEP_grch38lifOver.vcf'
    },
    'charger_hotspot3d_clusters': {
        'class': 'File',
        'path': f'{resources_dir}/charger/CharGer_dependencyFiles/PanCan/cptac_mc3_combined_noHypers_sorted.maf.3D_Proximity.pairwise.recurrence.l0.ad10.r10.clusters'
    },
    'charger_clinvar_alleles': {
        'class': 'File',
        'path': f'{resources_dir}/charger/CharGer_dependencyFiles/clinvar_alleles.single.b38.tsv.gz'
    },
}

In [8]:
yaml.safe_dump(pecgs_storage1_defaults, open('/diskmnt/Projects/Users/estorrs/pecgs-pipeline/cwl/pecgs_workflows/defaults.pecgs_TN_wxs_bam.yaml', 'w'))

In [9]:
yaml.safe_dump(pecgs_storage1_defaults, open('../wombat/templates/compute1.defaults.pecgs_TN_wxs_bam.yaml', 'w'))

In [17]:
pecgs_storage1_defaults = {
    # general
    'cpu': 40,
    
    # fusion
    'bwts': {
        'class': 'Directory',
        'path': f'{resources_dir}/fusion/Integrate_dependencies/bwts'
    },
    'filter_database': {
        'class': 'Directory',
        'path': f'{resources_dir}/fusion/FilterDatabase'
    },
    'fusion_annotator_dir': {
        'class': 'Directory',
        'path': f'{resources_dir}/fusion/FusionAnnotator'
    },
    'genome_db': {
        'class': 'Directory',
        'path': f'{resources_dir}/fusion/ericscript_dependencies/ericscript_db_homosapiens_ensembl84'
    },
    'genome_lib_dir': {
        'class': 'Directory',
        'path': f'{resources_dir}/fusion/STAR-Fusion_dependencies/GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play/ctat_genome_lib_build_dir'
    },
    'integrate_annotations': {
        'class': 'File',
        'path': f'{resources_dir}/fusion/Integrate_dependencies/annot.ensembl.GRCh38.txt'
    },
    'integrate_executable': {
        'class': 'File',
        'path': f'{resources_dir}/fusion/INTEGRATE_0_2_6/INTEGRATE-build/bin/Integrate'
    },
    'integrate_fasta': {
        'class': 'File',
        'path': f'{resources_dir}/fusion/Integrate_dependencies/STAR/hg38.fa'
    },
    
    # bulk expression
    'star_index': {
        'class': 'Directory',
        'path': f'{resources_dir}/bulk_expression/STAR_GRCh38_d1_vd1_d1_GENCODE_v34'
    },
    'gtf': {
        'class': 'File',
        'path': f'{resources_dir}/bulk_expression/STAR_GRCh38_d1_vd1_d1_GENCODE_v34/gencode.v34.annotation.gtf'
    },
    'gene_info': {
        'class': 'File',
        'path': f'{resources_dir}/bulk_expression/STAR_GRCh38_d1_vd1_d1_GENCODE_v34/gencode.gene.info.v34.tsv'
    },
}

In [18]:
yaml.safe_dump(pecgs_storage1_defaults, open('/diskmnt/Projects/Users/estorrs/pecgs-pipeline/cwl/pecgs_workflows/defaults.pecgs_T_rna_fq.yaml', 'w'))

In [19]:
yaml.safe_dump(pecgs_storage1_defaults, open('../wombat/templates/compute1.defaults.pecgs_T_rna_fq.yaml', 'w'))

In [20]:
pecgs_storage1_defaults = {
    # general
#     'cpu': 40,
    
    'reference': {
        'class': 'File',
        'path': f'{resources_dir}/cnv/references/GRCh38.d1.vd1/GRCh38.d1.vd1.fa',
    },
    
    # somatic sv
    'generate_evidence_bam': True,
}

In [21]:
yaml.safe_dump(pecgs_storage1_defaults, open('../wombat/templates/compute1.defaults.pecgs_TN_wgs_bam.yaml', 'w'))