In [1]:
from fasp.workflow import sbcgcWESClient

# Create the WES client used for every workflow call in this notebook.
# The first argument is the same 'forei/fasp-vus' string that is sent as
# "project" in the run parameters (presumably the Seven Bridges CGC project
# id — confirm against fasp.workflow).
# NOTE(review): debug=True presumably turns on verbose client output — confirm.
cl = sbcgcWESClient('forei/fasp-vus', debug=True)



## Setting up the WES run

In [19]:
def _file_input(path, name):
    """Build one CWL ``File`` input entry from a DRS URI and a file name."""
    return {'path': path, 'name': name, 'class': 'File'}


# Workflow inputs. The key names are sent to the server exactly as written
# here (spelling included), so they are reproduced verbatim.
_workflow_inputs = {
    'save_files': 'True',
    'ensembl_release': '75',
    'nomad_file': _file_input(
        'drs://cgc-ga4gh-api.sbgenomics.com/617e05ebd1cbfe46094811ba',
        'gnomad_grch37_chr13_brca2.vcf',
    ),
    'cooccurence_app': _file_input(
        'drs://cgc-ga4gh-api.sbgenomics.com/617f288d9c5e581c1b2032b7',
        'cooccurrenceFinder7.py',
    ),
    'hg_version': '37',
    'vcf_file': _file_input(
        'drs://cgc-ga4gh-api.sbgenomics.com/617dede89c5e581c1b200dfa',
        '_1_PCA.r1.TCGAbarcode.merge.tnSwapCorrected.10389.filtered.vcf',
    ),
    'pathogenicity_file': _file_input(
        'drs://cgc-ga4gh-api.sbgenomics.com/617c5ce6e6261a31b6d12ddc',
        'clinvar_BRCA2_full.txt',
    ),
}

# Request body: the target project plus the workflow inputs.
params = {
    "project": "forei/fasp-vus",
    "inputs": _workflow_inputs,
}



Now that the request body has been formulated, it can be passed to the client's `runGenericWorkflow` function as follows.

In [4]:
import json
# Serialise the request body once, then submit it as a CWL workflow run.
# The value returned by the client is kept as run_id and is what the later
# cells pass to getTaskStatus / GetRunLog.
workflow_body = json.dumps(params)
run_id = cl.runGenericWorkflow(
    workflow_url="sbg://forei/fasp-vus/cooccurence/21",
    workflow_params=workflow_body,
    workflow_type="CWL",
    workflow_type_version="sbg:draft-2",
)
run_id

'75ccf738-def9-4e89-b585-9de64b6b65f2'

In [8]:
import dateutil.parser
# Report the current state of the run and, once it has finished, how long it took.
print(cl.getTaskStatus(run_id))
log = cl.GetRunLog(run_id)

# 'run_log' (or either timestamp inside it) can be missing or None while the
# run is still queued or executing, so both timestamps are checked before
# parsing; previously only start_time was checked and a running job crashed
# on isoparse(None).
run_times = log.get('run_log') or {}
start_time = run_times.get('start_time')
end_time = run_times.get('end_time')

if start_time and end_time:
    start = dateutil.parser.isoparse(start_time)
    end = dateutil.parser.isoparse(end_time)
    duration = end - start
    print(str(duration))
elif start_time:
    # Started but not finished: there is no duration to report yet.
    print('Run started at {} and has not finished yet'.format(start_time))

COMPLETE
0:08:51


In [9]:
# Display the complete run log for this run; it includes the original request
# (workflow_params and inputs) and, as used further below, the run's outputs.
cl.GetRunLog(run_id)

{'request': {'tags': {},
  'workflow_params': {'name': 'cooccurrence run - 10-31-21 23:55:30',
   'project': 'forei/fasp-vus',
   'inputs': {'p2': None,
    'save_files': 'True',
    'gene': None,
    'chromosome': None,
    'pathology_file': None,
    'nomad_file': {'path': 'drs://cgc-ga4gh-api.sbgenomics.com/617e05ebd1cbfe46094811ba',
     'basename': 'gnomad_grch37_chr13_brca2.vcf',
     'nameext': '.vcf',
     'class': 'File',
     'nameroot': 'gnomad_grch37_chr13_brca2'},
    'vcf_file': {'path': 'drs://cgc-ga4gh-api.sbgenomics.com/617dede89c5e581c1b200dfa',
     'basename': '_1_PCA.r1.TCGAbarcode.merge.tnSwapCorrected.10389.filtered.vcf',
     'nameext': '.vcf',
     'class': 'File',
     'nameroot': '_1_PCA.r1.TCGAbarcode.merge.tnSwapCorrected.10389.filtered'},
    'phased': None,
    'data_directory': None,
    'ensembl_release': '75',
    'hg_version': '37',
    'pathogenicity_file': {'path': 'drs://cgc-ga4gh-api.sbgenomics.com/617f27419c5e581c1b2032b1',
     'basename': 'clin

## Getting the results - via DRS
Once the run is complete, further steps can use DRS to obtain the file output from the workflow.

In [10]:
# Fetch the run log again and keep it in runLog; the next cells read the
# output file locations from its 'outputs' entry.
runLog = cl.GetRunLog(run_id)
# Show the outputs mapping (output id -> File description with a DRS 'path').
runLog['outputs']

{'vpi_file': {'path': 'drs://cgc-ga4gh-api.sbgenomics.com/617f2f06d1cbfe4609483051',
  'basename': '_12_BRCA2-vpi.json',
  'nameext': '.json',
  'class': 'File',
  'nameroot': '_12_BRCA2-vpi'},
 'pathology_output': None,
 'all_file': {'path': 'drs://cgc-ga4gh-api.sbgenomics.com/617f2f06d1cbfe4609483053',
  'basename': '_12_BRCA2-all.json',
  'nameext': '.json',
  'class': 'File',
  'nameroot': '_12_BRCA2-all'},
 'out_file': {'path': 'drs://cgc-ga4gh-api.sbgenomics.com/617f2f06d1cbfe4609483055',
  'basename': '_15_BRCA2-cooccurrences.json',
  'nameext': '.json',
  'class': 'File',
  'nameroot': '_15_BRCA2-cooccurrences'},
 'ipv_file': {'path': 'drs://cgc-ga4gh-api.sbgenomics.com/617f2f06d1cbfe4609483059',
  'basename': '_12_BRCA2-ipv.json',
  'nameext': '.json',
  'class': 'File',
  'nameroot': '_12_BRCA2-ipv'},
 'tout_file': {'path': 'drs://cgc-ga4gh-api.sbgenomics.com/617f2f06d1cbfe4609483057',
  'basename': '_12_BRCA2-tout.json',
  'nameext': '.json',
  'class': 'File',
  'nameroot':

In [11]:
# Pick the DRS URI of the 'out_file' output (the co-occurrences JSON in the
# outputs listing) — this is the file downloaded and analysed below.
resultsDRSID = runLog['outputs']['out_file']['path']
resultsDRSID

'drs://cgc-ga4gh-api.sbgenomics.com/617f2f06d1cbfe4609483055'

Use the CGC DRS Server to retrieve the results files

In [12]:
from fasp.loc import sbcgcDRSClient
# Create the DRS client for the CGC DRS server.
# NOTE(review): the first argument looks like the path to a local Seven Bridges
# key file and the second like the access method to use ('s3' is also what is
# passed to getAccessURL later) — confirm against fasp.loc.sbcgcDRSClient.
drsClient = sbcgcDRSClient('~/.keys/sevenbridges_keys.json', 's3')

### DRS GetObject
Here's how we then get the details of the file. Note that only the id portion of the DRS URI is passed to the client. Normally it would be the job of a metaresolver to look at the full URI and determine which DRS server the id should be sent to. Here we pass up the opportunity to use a metaresolver and instead extract the id manually and send it to the CGC DRS server directly.

In [13]:
# The DRS client wants only the object id, i.e. whatever follows the last '/'
# of the DRS URI.
sbDRSID = resultsDRSID.rsplit('/', 1)[-1]

# Look up the object's metadata and display it.
fileDetails = drsClient.getObject(sbDRSID)
fileDetails

{'id': '617f2f06d1cbfe4609483055',
 'name': '_15_BRCA2-cooccurrences.json',
 'size': 38886,
 'checksums': [{'type': 'etag',
   'checksum': '774cd65cc59dbb738843e2b40fa81f45-1'}],
 'self_uri': 'drs://cgc-ga4gh-api.sbgenomics.com/617f2f06d1cbfe4609483055',
 'created_time': '2021-11-01T00:04:22Z',
 'updated_time': '2021-11-01T00:04:22Z',
 'mime_type': 'application/json',
 'access_methods': [{'type': 's3',
   'region': 'us-east-1',
   'access_id': 'aws-us-east-1'}]}

In [14]:
# Ask the DRS server for a URL from which the object can be fetched.
# NOTE(review): 's3' matches the access method 'type' shown in the object
# details; whether the client expects the type or the access_id here should be
# confirmed against sbcgcDRSClient.getAccessURL.
url = drsClient.getAccessURL(sbDRSID,'s3')

### Downloading the file
Now we can use the url obtained to download the file. We'll create a small function to encapsulate the download.

In [15]:
import requests
import os
def download(url, file_path, chunk_size=1024 * 1024):
    """Download ``url`` and save the response body to ``file_path``.

    Args:
        url: URL to fetch with an HTTP GET.
        file_path: Destination path; a leading ``~`` is expanded to the
            user's home directory.
        chunk_size: Number of bytes read from the response per write.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            Nothing is written to ``file_path`` in that case.
    """
    target_path = os.path.expanduser(file_path)
    # Make the request first so a failed request neither creates an empty file
    # nor truncates an existing one.
    with requests.get(url, stream=True) as response:
        # Without this check an error page would be saved as if it were the file.
        response.raise_for_status()
        with open(target_path, "wb") as file:
            # Stream in chunks instead of holding the whole body in memory.
            for chunk in response.iter_content(chunk_size=chunk_size):
                file.write(chunk)

In [16]:
# Save the file under the name reported by the DRS server; since the name has
# no directory part in the object details shown, it lands in the current
# working directory.
# NOTE(review): the name comes from the server response and is used as a local
# path unchanged — consider os.path.basename() if the server is not trusted.
fullPath = fileDetails['name']
download(url, fullPath)


In [20]:
# Parse the downloaded co-occurrence results. JSON text is UTF-8, so the
# encoding is given explicitly instead of relying on the platform default
# (which is not UTF-8 on every system).
with open(fullPath, encoding="utf-8") as json_file:
    data = json.load(json_file)
# The local copy is kept; to delete it after loading, uncomment the next line.
#os.remove(fullPath)

In [21]:
import pandas as pd


def _flatten_vus(vus_key, vus_info):
    """Collapse one 'cooccurring vus' entry into a single flat record.

    The record starts with the variant key under "vus", followed by the
    entry's 'likelihood data' and 'allele frequencies' fields, and ends with
    the number of pathogenic variants it co-occurs with.
    """
    n_pathogenic = len(vus_info['pathogenic variants'])
    record = {"vus": vus_key}
    record.update(vus_info['likelihood data'])
    record.update(vus_info['allele frequencies'])
    record["no_pathogenic_coocurrs"] = n_pathogenic
    return record


# Flatten the co-occurrence output: one record per co-occurring VUS.
flat_vus = [
    _flatten_vus(vus_key, vus_info)
    for vus_key, vus_info in data['cooccurring vus'].items()
]

# Turn the list of records into a data frame and display it.
flat_df = pd.DataFrame(flat_vus)
flat_df

Unnamed: 0,vus,p1,p2,n,k,likelihood,maxPop,maxPopFreq,cohortFreq,no_pathogenic_coocurrs
0,"(13, 32890572, 'G', 'A')",0.000337,0.001,4541,1,0.145945,,,0.437097,1
1,"(13, 32900933, 'T', 'A')",0.000337,0.001,1022,1,1.507573,,,0.098373,1
2,"(13, 32905265, 'G', 'A')",0.000337,0.001,6165,6,11.48579,,,0.593416,6
3,"(13, 32911888, 'A', 'G')",0.000337,0.001,5369,1,0.084252,,,0.516797,1
4,"(13, 32913055, 'A', 'G')",0.000337,0.001,10385,7,2.074215,,,0.999615,7
5,"(13, 32915005, 'G', 'C')",0.000337,0.001,10382,7,2.078348,,,0.999326,7
6,"(13, 32915410, 'CAATT', 'C')",0.000337,0.001,5233,1,0.092209,,,0.503706,1
7,"(13, 32920844, 'T', 'C')",0.000337,0.001,6657,5,2.789874,,,0.640774,5
8,"(13, 32929232, 'A', 'G')",0.000337,0.001,4120,1,0.192979,,,0.396573,1
9,"(13, 32929387, 'T', 'C')",0.000337,0.001,10386,7,2.072839,,,0.999711,7


In [22]:
import pandas as pd

# Homozygous VUS output: each entry is already a flat dict of fields, so the
# variant key is simply placed in front of them under "vus".
homozygous_vus = [
    {"vus": vus_key, **vus_fields}
    for vus_key, vus_fields in data['homozygous vus'].items()
]

# Turn the list of records into a data frame and display it.
hz_df = pd.DataFrame(homozygous_vus)
hz_df

Unnamed: 0,vus,count,maxPop,maxPopFreq,cohortFreq
0,"(13, 32890572, 'G', 'A')",663,,,0.063817
1,"(13, 32900933, 'T', 'A')",367,,,0.035326
2,"(13, 32905265, 'G', 'A')",6098,,,0.586967
3,"(13, 32911888, 'A', 'G')",968,,,0.093175
4,"(13, 32913055, 'A', 'G')",10239,,,0.985562
...,...,...,...,...,...
253,"(13, 32893724, 'G', 'A')",1,,,0.000096
254,"(13, 32965764, 'C', 'T')",1,,,0.000096
255,"(13, 32905262, 'C', 'G')",1,,,0.000096
256,"(13, 32942101, 'C', 'T')",1,,,0.000096


## To do
- Submit the pathogenicity file from the local system
- Either access the gnomAD file directly from gnomAD, or supply it from the local system

## Done
- Make the container available to other WES servers by adding the Docker container to Docker Hub instead of the Seven Bridges docker repository
