In [70]:
import requests
import os
def download(url, file_path, chunk_size=1024 * 1024, timeout=60):
    """Stream the content at ``url`` into a local file.

    Args:
        url: Address to fetch with an HTTP GET (e.g. an access URL obtained
            from a DRS client).
        file_path: Destination path; a leading ``~`` is expanded to the
            user's home directory.
        chunk_size: Number of bytes read from the response and written to
            disk per iteration.
        timeout: Seconds to wait for the server to connect or to send data
            before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    target = os.path.expanduser(file_path)
    # Issue the request before opening the destination so that a failed
    # request neither truncates an existing file nor leaves an empty one.
    # Streaming keeps memory use flat for very large files.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail loudly instead of saving an HTTP error page as the "file".
        response.raise_for_status()
        with open(target, "wb") as file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                file.write(chunk)

### Cooccurrence on TOPMed COPDGene data


In [38]:
from fasp.workflow import sbWESClient

# WES client pointed at the BioData Catalyst GA4GH WES endpoint.
# The second argument matches the project name used for the run; the third
# is presumably the path to the local credentials file (confirm against fasp).
bdcwes = sbWESClient(
    'https://ga4gh-api.sb.biodatacatalyst.nhlbi.nih.gov/ga4gh/wes/v1',
    'forei/fasp-vus',
    '~/.keys/sbbdc_key.json',
    debug=True,
)

The workflow was run through the UI, but we can retrieve the results from here.

Prior to running the workflow, `COPDGene_phs000951_TOPMed_WGS_freeze.8.chr13.hg38.c1.vcf` was filtered down to the BRCA2 region using BCFtools.

In [49]:
# Identifier of the co-occurrence run that was launched from the UI.
vus_run = '424d1d3a-7416-4675-890a-9021cdb7361c'

# Fetch the run log for that run from the WES service.
runLog = bdcwes.GetRunLog(vus_run)

# Display the log (request, state and outputs) as the cell result.
runLog

{'request': {'tags': {},
  'workflow_params': {'name': 'cooccurrence run - 10-29-21 22:40:36',
   'project': 'forei/fasp-vus',
   'inputs': {'p2': None,
    'save_files': None,
    'gene': None,
    'chromosome': None,
    'data_directory': None,
    'ensembl_release': None,
    'pathology_file': None,
    'gnomad_file': {'path': 'drs://ga4gh-api.sb.biodatacatalyst.nhlbi.nih.gov/617c5ce7e6261a31b6d12de3',
     'name': 'gnomad_chr13_brca2.vcf',
     'class': 'File'},
    'hg_version': None,
    'vcf_file': {'path': 'drs://ga4gh-api.sb.biodatacatalyst.nhlbi.nih.gov/617c77bce6261a31b6d12f0a',
     'name': 'COPDGene_phs000951_TOPMed_WGS_freeze.8.chr13.hg38.c1.filtered.vcf',
     'class': 'File'},
    'phased': None,
    'pathogenicity_file': {'path': 'drs://ga4gh-api.sb.biodatacatalyst.nhlbi.nih.gov/617c5ce6e6261a31b6d12ddc',
     'name': 'clinvar_brca2.tsv',
     'class': 'File'}}},
  'workflow_type': 'CWL',
  'workflow_engine_params': {}},
 'state': 'COMPLETE',
 'outputs': {'vpi_file': {

### Find the DRS id of the output file

In [42]:
# Walk the run log down to the 'out_file' output and take its DRS URI.
cooccurrence_output = runLog['outputs']['out_file']
resultsDRSID = cooccurrence_output['path']
resultsDRSID

'drs://ga4gh-api.sb.biodatacatalyst.nhlbi.nih.gov/617c7ce1e6261a31b6d12f61'

In [45]:
from fasp.loc import sbbdcDRSClient

# DRS client for Seven Bridges BioData Catalyst, using the 's3' access method.
drsClient = sbbdcDRSClient('~/.keys/sevenbridges_keys.json', 's3')

# The bare object id is the final path segment of the drs:// URI.
sbDRSID = resultsDRSID.rsplit('/', 1)[-1]

# Look up the object's metadata and show it as the cell result.
fileDetails = drsClient.getObject(sbDRSID)
fileDetails

{'id': '617c7ce1e6261a31b6d12f61',
 'name': '_1_BRCA2-cooccurrences.json',
 'size': 81104,
 'checksums': [{'type': 'etag',
   'checksum': '858734d89993578a8dc87fd2401fb0f0-1'}],
 'self_uri': 'drs://ga4gh-api.sb.biodatacatalyst.nhlbi.nih.gov/617c7ce1e6261a31b6d12f61',
 'created_time': '2021-10-29T22:59:45Z',
 'updated_time': '2021-10-29T22:59:45Z',
 'mime_type': 'application/json',
 'access_methods': [{'type': 's3',
   'region': 'us-east-1',
   'access_id': 'aws-us-east-1'}]}

In [46]:
# Resolve an access URL for the results object via the 's3' access method.
url = drsClient.getAccessURL(sbDRSID, 's3')

# Save the file under its DRS-reported name, relative to the working directory.
fullPath = fileDetails['name']
download(url=url, file_path=fullPath)

In [None]:
import requests
import os
def download(url, file_path, chunk_size=1024 * 1024, timeout=60):
    """Stream the content at ``url`` into a local file.

    Args:
        url: Address to fetch with an HTTP GET (e.g. an access URL obtained
            from a DRS client).
        file_path: Destination path; a leading ``~`` is expanded to the
            user's home directory.
        chunk_size: Number of bytes read from the response and written to
            disk per iteration.
        timeout: Seconds to wait for the server to connect or to send data
            before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    target = os.path.expanduser(file_path)
    # Issue the request before opening the destination so that a failed
    # request neither truncates an existing file nor leaves an empty one.
    # Streaming keeps memory use flat for very large files.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail loudly instead of saving an HTTP error page as the "file".
        response.raise_for_status()
        with open(target, "wb") as file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                file.write(chunk)

### Load the downloaded file and show the result

In [47]:
# `json` is not imported in any visible earlier cell; import it here so the
# cell does not raise NameError on a fresh kernel.
import json

# Parse the co-occurrence results previously saved at `fullPath`.
with open(fullPath) as json_file:
    data = json.load(json_file)
# delete the local copy of the file
os.remove(fullPath)
# and look at the contents
data

{'cooccurring vus': {"(13, 32318080, 'C', 'T')": {'likelihood data': {'p1': 4.894283476898982e-05,
    'p2': 0.001,
    'n': 10195,
    'k': 2,
    'likelihood': 0.025603946524203208},
   'allele frequencies': {'maxPop': None,
    'maxPopFreq': None,
    'cohortFreq': 0.9979444009397024},
   'pathogenic variants': [[13, 32338749, 'AATTAC', 'A']]},
  "(13, 32318598, 'T', 'C')": {'likelihood data': {'p1': 4.894283476898982e-05,
    'p2': 0.001,
    'n': 9047,
    'k': 1,
    'likelihood': 0.003732503838195817},
   'allele frequencies': {'maxPop': None,
    'maxPopFreq': None,
    'cohortFreq': 0.8855716523101018},
   'pathogenic variants': [[13, 32338749, 'AATTAC', 'A']]},
  "(13, 32319654, 'A', 'G')": {'likelihood data': {'p1': 4.894283476898982e-05,
    'p2': 0.001,
    'n': 8119,
    'k': 1,
    'likelihood': 0.009026128393426513},
   'allele frequencies': {'maxPop': None,
    'maxPopFreq': None,
    'cohortFreq': 0.7947337509788567},
   'pathogenic variants': [[13, 32338749, 'AATTAC'

In [48]:
import pandas as pd

# Flatten the co-occurrence output: one flat record per VUS, combining its
# likelihood data, its allele frequencies and the number of pathogenic
# variants it co-occurs with.
flat_vus = []
for variant, details in data['cooccurring vus'].items():
    n_pathogenic = len(details['pathogenic variants'])
    record = {
        "vus": variant,
        **details['likelihood data'],
        **details['allele frequencies'],
        "no_pathogenic_coocurrs": n_pathogenic,
    }
    flat_vus.append(record)

# Build the data frame from the list of records and display it.
flat_df = pd.DataFrame(flat_vus)
flat_df

Unnamed: 0,vus,p1,p2,n,k,likelihood,maxPop,maxPopFreq,cohortFreq,no_pathogenic_coocurrs
0,"(13, 32318080, 'C', 'T')",4.9e-05,0.001,10195,2,0.025604,,,0.997944,1
1,"(13, 32318598, 'T', 'C')",4.9e-05,0.001,9047,1,0.003733,,,0.885572,1
2,"(13, 32319654, 'A', 'G')",4.9e-05,0.001,8119,1,0.009026,,,0.794734,1
3,"(13, 32321240, 'G', 'C')",4.9e-05,0.001,10196,2,0.02558,,,0.998042,1
4,"(13, 32325741, 'C', 'T')",4.9e-05,0.001,10194,2,0.025628,,,0.997847,1
5,"(13, 32331128, 'G', 'A')",4.9e-05,0.001,10194,2,0.025628,,,0.997847,1
6,"(13, 32333969, 'A', 'G')",4.9e-05,0.001,10194,2,0.025628,,,0.997847,1
7,"(13, 32338918, 'A', 'G')",4.9e-05,0.001,10194,2,0.025628,,,0.997847,1
8,"(13, 32340868, 'G', 'C')",4.9e-05,0.001,10193,2,0.025653,,,0.997749,1
9,"(13, 32343709, 'GA', 'G')",4.9e-05,0.001,3580,1,0.678061,,,0.350431,1


In [1]:
from fasp.loc import sbbdcDRSClient

# DRS object id of the filtered VCF that was used as workflow input.
filtered_vcf_drsid = '617c77bce6261a31b6d12f0a'

# DRS client for Seven Bridges BioData Catalyst, using the 's3' access method.
drsClient = sbbdcDRSClient('~/.keys/sevenbridges_keys.json', 's3')

# Show the object's metadata as the cell result.
drsClient.getObject(filtered_vcf_drsid)

{'id': '617c77bce6261a31b6d12f0a',
 'name': 'COPDGene_phs000951_TOPMed_WGS_freeze.8.chr13.hg38.c1.filtered.vcf',
 'size': 1119013993,
 'checksums': [{'type': 'etag',
   'checksum': 'f4d6956ab997599534a3453425f25f69-2'}],
 'self_uri': 'drs://ga4gh-api.sb.biodatacatalyst.nhlbi.nih.gov/617c77bce6261a31b6d12f0a',
 'created_time': '2021-10-29T22:37:48Z',
 'updated_time': '2021-10-29T22:37:48Z',
 'mime_type': 'application/json',
 'access_methods': [{'type': 's3',
   'region': 'us-east-1',
   'access_id': 'aws-us-east-1'}]}

In [3]:
import requests
import os
def download(url, file_path, chunk_size=1024 * 1024, timeout=60):
    """Stream the content at ``url`` into a local file.

    Args:
        url: Address to fetch with an HTTP GET (e.g. an access URL obtained
            from a DRS client).
        file_path: Destination path; a leading ``~`` is expanded to the
            user's home directory.
        chunk_size: Number of bytes read from the response and written to
            disk per iteration.
        timeout: Seconds to wait for the server to connect or to send data
            before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    target = os.path.expanduser(file_path)
    # Issue the request before opening the destination so that a failed
    # request neither truncates an existing file nor leaves an empty one.
    # Streaming keeps memory use flat for very large files (the filtered VCF
    # fetched with this helper is over 1 GB).
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail loudly instead of saving an HTTP error page as the "file".
        response.raise_for_status()
        with open(target, "wb") as file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                file.write(chunk)

In [6]:
# Resolve an access URL for the filtered VCF and save it under ~/Downloads.
url = drsClient.getAccessURL(filtered_vcf_drsid, 's3')
download(
    url=url,
    file_path='~/Downloads/COPDGene_phs000951_TOPMed_WGS_freeze.8.chr13.hg38.c1.filtered.vcf',
)

In [7]:
# DRS object id of the file saved below as BRCA2-ipv.json.
ipv_drs_id = '617c7ce1e6261a31b6d12f5d'

# Resolve its access URL and save it under ~/Downloads.
url = drsClient.getAccessURL(ipv_drs_id, 's3')
download(url=url, file_path='~/Downloads/BRCA2-ipv.json')

In [8]:

# DRS object id of the file saved below as BRCA2-vpi.json.
vpi_drs_id = '617c7ce1e6261a31b6d12f60'

# Resolve its access URL and save it under ~/Downloads.
url = drsClient.getAccessURL(vpi_drs_id, 's3')
download(url=url, file_path='~/Downloads/BRCA2-vpi.json')

In [9]:
# DRS object id of the file saved below as stdout.log.
stdout_drs = '617c7cdcfe2ce00231025ebb'

# Resolve its access URL and save it under ~/Downloads.
url = drsClient.getAccessURL(stdout_drs, 's3')
download(url=url, file_path='~/Downloads/stdout.log')