## Run WES workflow on Kids First DRS files searched on NCPI FHIR server

This script demonstrates uses of the NCPI FHIR server to find patients and files from the Familial Leukemia study. The query returns DRS ids allowing the files accessed via the Kid's First Data Resource Center DRS server. The workflow is run on the Seven Bridges Cancer Genomics Cloud WES server. 

The FHIR server is accessed through the fhir-py Python client.

### Set up the FHIR client
Access is via a cookie obtained as described [here](https://github.com/NIH-NCPI/ncpi-api-fhir-service).

In [283]:
import os
import sys
import json
import requests
import pandas as pd 
import numpy as np
from pathlib import Path
from datetime import datetime
import pprint

pprint = pprint.PrettyPrinter(indent=1).pprint
# pprint() is now available to pretty-print any JSON

FHIR_SERVER = 'https://kf-api-fhir-service.kidsfirstdrc.org'

# Optional: Turn off SSL verification. Useful when dealing with a corporate proxy with self-signed certificates.
# This should be set to True unless you actually see certificate errors.
VERIFY_SSL = False

if not VERIFY_SSL:
    requests.packages.urllib3.disable_warnings()



# Kids First uses cookie-based authentication, save the cookie locally here.
full_cookie_path = os.path.expanduser('~/.keys/ncpi_prod_fhir_cookie.json')

with open(full_cookie_path) as f:
    cookies = json.load(f)
kf_cookie = cookies['Cookie']
            
# We make a requests.Session to ensure consistent headers/cookie across all the requests we make
s = requests.Session()
s.headers.update({'Accept': 'application/fhir+json'})
s.verify = VERIFY_SSL
s.cookies['AWSELBAuthSessionCookie-0'] = kf_cookie


# Test out the cookie by querying the server metadata
r = s.get(f"{FHIR_SERVER}/metadata")

if "<!DOCTYPE html>" in r.text:
    sys.stderr.write('ERROR: Could not authenticate with Kids First. The cookie may need to be updated')
    

# This helper method allows us to easily switch between printing an entire Bundle, or just the first 20 lines.
# Set truncate_for_github = False for actual use,
# or just replace the function with a `return bundle`

def print_bundle(bundle, truncate_for_github = False):
    if not truncate_for_github:
        return bundle
    
    lines = json.dumps(bundle, indent=2).split('\n')
    if len(lines) <= 20:
        return bundle
    else:   
        print('\n'.join(lines[:20]))
        print('...\nBundle truncated. Change the "print_bundle" function above to print the full content.')

In [207]:
# Resolves all pages for the bundle. Returns an array with all Bundles, including the original Bundle.
def resolve_pages(bundle):
    next_page_link = next(filter(lambda link: link['relation'] == 'next', bundle['link']), None)
    if next_page_link:
        next_page = s.get(next_page_link['url']).json()
        return [bundle] + resolve_pages(next_page)
    else:
        return [bundle]

# NOTE: No cell output.

# Function to run a query for any kind of resources.
def runQuery(query):
    r = s.get(f"{FHIR_SERVER}/{query}")
    first_bundle = r.json()
    if 'entry' not in first_bundle:
        resources = []
    else:    
        all_bundles = resolve_pages(first_bundle)
        resources = [entry['resource'] for bundle in all_bundles for entry in bundle['entry']]
        
    print(f"Total  Resources: {len(resources)}")
    return resources

In [284]:
studies = runQuery('ResearchStudy')
studies

Total  Resources: 29


[{'resourceType': 'ResearchStudy',
  'id': '687202',
  'meta': {'versionId': '1',
   'lastUpdated': '2022-06-20T14:25:32.953+00:00',
   'source': '#RrhpFw6OupcziLQu',
   'profile': ['http://hl7.org/fhir/StructureDefinition/ResearchStudy']},
  'identifier': [{'use': 'official',
    'system': 'https://kf-api-dataservice.kidsfirstdrc.org/studies/',
    'value': 'SD_HGHFVPFD'},
   {'system': 'urn:kids-first:unique-string',
    'value': 'ResearchStudy-SD_HGHFVPFD'},
   {'system': 'https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=',
    'value': 'phs000424.v8.p2'}],
  'title': 'Common Fund (CF) Genotype-Tissue Expression Project (GTEx)',
  'status': 'completed',
  'category': [{'coding': [{'system': 'http://snomed.info/sct',
      'code': '86049000',
      'display': 'Malignant neoplasm, primary (morphologic abnormality)'}],
    'text': 'CANCER'}],
  'keyword': [{'coding': [{'code': 'Kids First'}]},
   {'coding': [{'code': 'KF-GTEX'}]}]},
 {'resourceType': 'ResearchStudy'

The Ewings Sarcoma study is id 52326

In [285]:
study_id = "52326"
subjects = runQuery(f'ResearchSubject?study={study_id}')
pprint(subjects[0])

Total  Resources: 1203
{'id': '53466',
 'identifier': [{'value': 'Schiffman-232'},
                {'system': 'https://kf-api-dataservice.kidsfirstdrc.org/participants/',
                 'value': 'PT_SKYS7BYE'},
                {'system': 'urn:kids-first:unique-string',
                 'value': 'ResearchSubject-SD_YGVA0E1C-PT_SKYS7BYE'}],
 'individual': {'reference': 'Patient/51063'},
 'meta': {'lastUpdated': '2021-11-16T16:51:37.022+00:00',
          'profile': ['http://hl7.org/fhir/StructureDefinition/ResearchSubject'],
          'source': '#6cXjyvyFlK310TS1',
          'tag': [{'code': 'SD_YGVA0E1C'}],
          'versionId': '3'},
 'resourceType': 'ResearchSubject',
 'status': 'off-study',
 'study': {'reference': 'ResearchStudy/52326'}}


In [194]:
patients = runQuery(f'Patient?_has:ResearchSubject:individual:study={study_id}')
pprint(patients[0])

Total  Resources: 1203
{'extension': [{'extension': [{'url': 'text', 'valueString': 'Not Reported'}],
                'url': 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-race'},
               {'extension': [{'url': 'text', 'valueString': 'Not Reported'}],
                'url': 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity'}],
 'gender': 'female',
 'id': '51063',
 'identifier': [{'value': 'Schiffman-232'},
                {'system': 'https://kf-api-dataservice.kidsfirstdrc.org/participants/',
                 'value': 'PT_SKYS7BYE'},
                {'system': 'urn:kids-first:unique-string',
                 'value': 'Patient-SD_YGVA0E1C-PT_SKYS7BYE'}],
 'meta': {'lastUpdated': '2021-11-16T08:31:55.743+00:00',
          'profile': ['http://hl7.org/fhir/StructureDefinition/Patient'],
          'source': '#ftuteWwUU3WhzzKW',
          'tag': [{'code': 'SD_YGVA0E1C'}],
          'versionId': '2'},
 'resourceType': 'Patient'}


## Retrieve phenotypic data for each subject from FHIR
Save the results to a DataFrame

In [273]:
import pandas as pd
drs_data = []
ii=0
for pjson in patients:
    print (f"Patient/{pjson['id']}")
    drs_uri = getFiles(pjson['id'], 'bam')
    #print(drs_uri)
    if drs_uri:
        drs_id = drs_uri.split('/')[-1]
        drs_details = drsClient.getObject(drs_id)
        self_uri = drs_details['self_uri']
        url = drsClient.getAccessURL(drs_id, 's3')
        drs_data.append({'drs_uri':drs_uri, 'patient':pjson['id'], 'self_uri':self_uri, 'url':url})
    
    ii +=1
    if ii > 50: break
df = pd.DataFrame.from_dict(drs_data)
df

Patient/51063
Total  Resources: 5
# of documents for subject 51063 :5
Unauthorized for that DRS id
Patient/51052
Total  Resources: 5
# of documents for subject 51052 :5
Patient/51059
Total  Resources: 5
# of documents for subject 51059 :5
Patient/51064
Total  Resources: 0
# of documents for subject 51064 :0
Patient/51056
Total  Resources: 5
# of documents for subject 51056 :5
Patient/51051
Total  Resources: 5
# of documents for subject 51051 :5
Patient/51040
Total  Resources: 0
# of documents for subject 51040 :0
Patient/51058
Total  Resources: 7
# of documents for subject 51058 :7
Patient/51060
Total  Resources: 7
# of documents for subject 51060 :7
Patient/51049
Total  Resources: 5
# of documents for subject 51049 :5
Patient/51046
Total  Resources: 7
# of documents for subject 51046 :7
Patient/51047
Total  Resources: 5
# of documents for subject 51047 :5
Patient/51053
Total  Resources: 0
# of documents for subject 51053 :0
Patient/51048
Total  Resources: 0
# of documents for subject 

Unnamed: 0,drs_uri,patient,self_uri,url
0,drs://data.kidsfirstdrc.org/520bd083-82c1-4f00...,51063,drs://dg.F82A1A:520bd083-82c1-4f00-abf5-ca1783...,
1,drs://data.kidsfirstdrc.org/805860ce-d0e3-406c...,51052,drs://dg.F82A1A:805860ce-d0e3-406c-b652-c60f72...,https://kf-seq-data-washu.s3.amazonaws.com/Ewi...
2,drs://data.kidsfirstdrc.org/d41fce40-b916-4d28...,51059,drs://dg.F82A1A:d41fce40-b916-4d28-a294-5364ad...,https://kf-seq-data-washu.s3.amazonaws.com/Ewi...
3,drs://data.kidsfirstdrc.org/938490ec-11b6-4a16...,51056,drs://dg.F82A1A:938490ec-11b6-4a16-ac42-e81df1...,https://kf-seq-data-washu.s3.amazonaws.com/Ewi...
4,drs://data.kidsfirstdrc.org/ea26fe12-a138-4325...,51051,drs://dg.F82A1A:ea26fe12-a138-4325-a783-d881ed...,https://kf-seq-data-washu.s3.amazonaws.com/Ewi...
5,drs://data.kidsfirstdrc.org/c9b83883-fc86-4ac5...,51058,drs://dg.F82A1A:c9b83883-fc86-4ac5-95b2-cb1aab...,https://kf-seq-data-washu.s3.amazonaws.com/Ewi...
6,drs://data.kidsfirstdrc.org/98eeb3eb-7652-4d74...,51060,drs://dg.F82A1A:98eeb3eb-7652-4d74-b565-c00aa5...,https://kf-seq-data-washu.s3.amazonaws.com/Ewi...
7,drs://data.kidsfirstdrc.org/a052d1a9-e71c-4ccc...,51049,drs://dg.F82A1A:a052d1a9-e71c-4ccc-a4ab-e6cdf0...,https://kf-seq-data-washu.s3.amazonaws.com/Ewi...
8,drs://data.kidsfirstdrc.org/31ef38aa-98d9-4b98...,51046,drs://dg.F82A1A:31ef38aa-98d9-4b98-ac56-dd7a3a...,https://kf-seq-data-washu.s3.amazonaws.com/Ewi...
9,drs://data.kidsfirstdrc.org/05493152-872b-41a4...,51047,drs://dg.F82A1A:05493152-872b-41a4-b2aa-d7a23b...,https://kf-seq-data-washu.s3.amazonaws.com/Ewi...


In [164]:
patient_id = '51063'
q = f'DocumentReference?subject=Patient/51020'

print(q)
docs = runQuery(q)
print(len(docs))

#f"{FHIR_SERVER}/DocumentReference?subject=Patient/{subject_id}"

DocumentReference?subject=Patient/51020
Total  Resources: 7
7


In [165]:
q = f'Observation?subject=Patient/{patient_id}'
print(q)
observations = runQuery(q)
print(len(observations))

Observation?subject=Patient/51063
Total  Resources: 1
1


In [132]:
observations

[{'resourceType': 'Observation',
  'id': '51331',
  'meta': {'versionId': '1',
   'lastUpdated': '2021-10-14T19:15:40.213+00:00',
   'source': '#fYB2lw2s2oPs1vCF',
   'profile': ['https://nih-ncpi.github.io/ncpi-fhir-ig/StructureDefinition/family-relationship']},
  'identifier': [{'system': 'https://kf-api-dataservice.kidsfirstdrc.org/family-relationships/',
    'value': 'FR_XP292D7F'},
   {'system': 'urn:kids-first:unique-string',
    'value': 'Observation-PT_28H2AWQ8-Father-PT_MSB54TSM'}],
  'status': 'final',
  'code': {'coding': [{'system': 'http://terminology.hl7.org/CodeSystem/v3-RoleCode',
     'code': 'FAMMEMB',
     'display': 'family member'}],
   'text': 'Family relationship'},
  'subject': {'reference': 'Patient/51020'},
  'focus': [{'reference': 'Patient/51021'}],
  'valueCodeableConcept': {'coding': [{'system': 'http://terminology.hl7.org/CodeSystem/v3-RoleCode',
     'code': 'FTH',
     'display': 'father'}],
   'text': 'Father'}}]

In [92]:
import pandas as pd
mySubjects = []
for pjson in patients:
    #sjson = s.serialize()
    #patientID = sjson['individual']['reference']
    #thisSubject = {"patientID":patientID}
    #patients = client.resources('Patient')
    #p = patients.search(_id=patientID).get()
    #pjson=p.serialize()
    #print(json.dumps(pjson, indent=3))
    

    #print("Observations",'.'*20)
    patientID = f"Patient/{pjson['id']}"
    #resources = client.resources('')
    observations = runQuery(f'Observation?subject=Patient/{patientID}')
    #observations = resources.search(subject=patientID).fetch_all()
    on = 0
    csn = 0
    fmn = 0
    for ojson in observations:
        on = on+1
        #ojson=o.serialize()
        #print(json.dumps(ojson, indent=3))
        if ojson['code']['text'] == 'Clinical status':
            csn +=1
            thisSubject['ClinStatus{}'.format(csn)] = ojson['valueCodeableConcept']['text']
            #thisSubject['CS{}_code'.format(csn)] = ojson['code']['coding'][0]['code']
        '''
        elif ojson['code']['coding'][0]['code'] == 'FAMMEMB':
            fmn +=1
            thisSubject['FamMember{}'.format(fmn)] = ojson['valueCodeableConcept']['text']
            thisSubject['FamMember{}_focus'.format(fmn)] = ojson['focus'][0]['reference']
            #thisSubject['O{}_code'.format(on)] = ojson['code']['coding'][0]['code']

        else:
            thisSubject['O{}_valuetext'.format(on)] = ojson['valueCodeableConcept']['text']
            thisSubject['O{}_code'.format(on)] = ojson['code']['coding'][0]['code']

        print("."*50)
        '''

    #print("Conditions",'+'*20)
    #resources = client.resources('Condition')
    #conditions = resources.search(subject=patientID).fetch_all()
    conditions = runQuery(f'Condition?patientID={patientID}')
    cn = 0
    for cjson in conditions:
        cn += 1
        #cjson=c.serialize()
        #print(json.dumps(cjson, indent=3))
        #print("+"*50)
        thisSubject['Cndtn{}_codetext'.format(cn)] = c['code']['text']
        cdn = 0
        if 'coding' in c['code']:
            for cd in c['code']['coding']:
                cdn +=1
                thisSubject['Cndtn{}_code{}'.format(cn,cdn)] = cd['code']

    mySubjects.append(thisSubject)
    #print('_'*50)
    
df = pd.DataFrame(mySubjects)

KeyError: 'link'

In [6]:
df

Unnamed: 0,patientID,ClinStatus1,ClinStatus2,Cndtn1_codetext,Cndtn1_code1,Cndtn1_code2
0,Patient/576466,Alive,Alive,,,
1,Patient/576465,Alive,Alive,ALL,MONDO:0004967,NCIT:C3167
2,Patient/576467,Alive,Alive,,,
3,Patient/576473,Alive,Alive,NHL,MONDO:0018908,NCIT:C3211
4,Patient/576468,Alive,Alive,HL,MONDO:0004952,NCIT:C9357
5,Patient/576474,Alive,Alive,Hodgkins Lymphoma,MONDO:0004952,NCIT:C9357
6,Patient/576470,Alive,Alive,Hodgkin's Lymphoma,MONDO:0004620,NCIT:C24182
7,Patient/576469,Alive,Alive,,,
8,Patient/576472,Alive,Alive,,,
9,Patient/576450,Reported Unknown,Reported Unknown,HL,MONDO:0004952,NCIT:C9357


Use DRSMetaresolver to direct requests for DRS URIs to the right DRS server

In [5]:
from fasp.loc import DRSMetaResolver
drsClient = DRSMetaResolver()

Searching the GA4GH registry for org.ga4gh:drs services
GA4GH registry unavailable, cannot get registered DRS services.
Continuing with locally known DRS services.


Set up a function to retrieve a DRS id from the FHIR server

In [166]:
r = s.get(f"{FHIR_SERVER}/DocumentReference?subject=Patient/51063")
bundle = r.json()
print_bundle(bundle)

{'resourceType': 'Bundle',
 'id': 'ea70c81d-a496-4425-8cba-fa5a7e06b50a',
 'meta': {'lastUpdated': '2022-06-24T22:38:49.282+00:00'},
 'type': 'searchset',
 'total': 5,
 'link': [{'relation': 'self',
   'url': 'https://kf-api-fhir-service.kidsfirstdrc.org/DocumentReference?subject=Patient%2F51063'}],
 'entry': [{'fullUrl': 'https://kf-api-fhir-service.kidsfirstdrc.org/DocumentReference/394851',
   'resource': {'resourceType': 'DocumentReference',
    'id': '394851',
    'meta': {'versionId': '3',
     'lastUpdated': '2022-01-18T23:05:04.173+00:00',
     'source': '#pegoPWj2pe9YdkYf',
     'profile': ['https://nih-ncpi.github.io/ncpi-fhir-ig/StructureDefinition/drs-document-reference'],
     'tag': [{'code': 'SD_YGVA0E1C'}]},
    'identifier': [{'system': 'https://kf-api-dataservice.kidsfirstdrc.org/genomic-files/',
      'value': 'GF_61JFHQKF'},
     {'system': 'urn:kids-first:unique-string',
      'value': 'DocumentReference-SD_YGVA0E1C-GF_61JFHQKF'}],
    'status': 'current',
    'doc

In [203]:
def getFiles(subject_id, docFormat):
    #r = s.get(f"{FHIR_SERVER}/DocumentReference?subject=Patient/{subject_id}")
    #bundle = r.json()
    #resources = resources.search(subject=subject_id).limit(1000)
    # The following works for the data for certain studies/patients, but not others
    # resources.search(subject=subject_id, format__text=docFormat)
    # So we do a query just based on subject, and then filter the results ourselves
    # The following filters for the required format
    myDocs = []
    retval = None
    documents =runQuery(f"DocumentReference?subject=Patient/{subject_id}")
    #documents = list(map(lambda e: e['resource'], bundle['entry']))
    #with open("docs.json", "w") as f:
        #print(json.dump(documents, f, indent=3))
        #print(json.dumps(documents[0],indent=3))

    print("# of documents for subject {} :{}".format(subject_id, len(documents)))

    for d in documents:
        #djson = d.json()
        #print(json.dumps(d, indent=3))

        #print(d['content'][0])
        #print(d['content'][1]['format']['display'])


        if d['content'][1]['format']['display'] == docFormat:
           retval = d['content'][0]['attachment']['url']
    return retval


In [243]:
drsuri = getFiles('51020', 'bam')
drsuri

Total  Resources: 7
# of documents for subject 51020 :7


'drs://data.kidsfirstdrc.org/e0bd6fcc-7895-4392-9487-c5559ec5d9a1'

In [245]:
drsuri = getFiles('49936', 'bam')
drsuri
drs_id = drsuri.split('/')[-1]
drs_id

Total  Resources: 5
# of documents for subject 49936 :5


'c2bcf2f7-e262-4e37-9262-217a647e9186'

Iterate over each subject to submit the correspinding bam file to the Seven Bridges WES server. The worflow will execute SAMTools Stats on each file.

In [256]:
subj_id = {'individual':{'reference':'51020'}}
subjects =[subj_id]
print(subjects)

[{'individual': {'reference': '51020'}}]


In [257]:
drsuri = getFiles(subjects[0]['individual']['reference'], 'bam')
print(drsuri)
drs_id = drsuri.split('/')[-1]
drs_id

Total  Resources: 7
# of documents for subject 51020 :7
drs://data.kidsfirstdrc.org/e0bd6fcc-7895-4392-9487-c5559ec5d9a1


'e0bd6fcc-7895-4392-9487-c5559ec5d9a1'

In [266]:
from fasp.loc import kfDRSClient
drsClient = kfDRSClient("~/.keys/kf_credentials.json")

In [264]:
drsClient.getObject(drs_id)

{'access_methods': [{'access_id': 's3',
   'access_url': {'url': 's3://kf-seq-data-washu/EwingSarcoma//db3cb5310167464996be1ac9a3cd1468.bam'},
   'region': '',
   'type': 's3'}],
 'aliases': [],
 'checksums': [{'checksum': 'ad41e608ed9499b4f410ffdca88fea9e4598d5f0a1f73aebc8d1bc628f7123b8',
   'type': 'sha256'},
  {'checksum': '7b595bb3dc46b5df4107c395d12ccb47', 'type': 'md5'}],
 'created_time': '2021-07-13T17:17:51.162204',
 'description': None,
 'form': 'object',
 'id': '520bd083-82c1-4f00-abf5-ca1783d751aa',
 'mime_type': 'application/json',
 'name': 'db3cb5310167464996be1ac9a3cd1468.bam',
 'self_uri': 'drs://dg.F82A1A:520bd083-82c1-4f00-abf5-ca1783d751aa',
 'size': 59215639645,
 'updated_time': '2021-09-14T14:18:21.113043',
 'version': 'e01b4f4f'}

In [265]:
drsClient.getAccessURL(drs_id, 's3')

Unauthorized for that DRS id


In [274]:
from fasp.workflow import sbcgcWESClient
from fasp.runner import FASPRunner
import datetime

faspRunner = FASPRunner()
runNote = 'KF Ewings file retrieved from NCPI FHIR Server and computed on SB'

# Set up WES Client for WES server of our choice
settings = faspRunner.settings
wesClient = sbcgcWESClient(settings['SevenBridgesProject'])

drsClient = kfDRSClient("~/.keys/kf_credentials.json")

run_ids = []

# repeat for each row of the query
limit = 5
nn= 0
for pjson in patients:
    patientID = pjson['id']
    print (f"Patient/{patientID}")
    drsuri = getFiles(patientID, 'bam')
    
    #DRS ids stored in the NCPI FHIR server include the host prefix. Just get the id
    #subjectID = sjson['id']
    
    if drsuri:
        print("subject={}, DRS URI={}".format(patientID, drsuri))
        #drsID = drsuri.split('/')[-1]
        #Call the DRS server to get file details
        # We're going to keep the file size
        drs_id = drsuri.split('/')[-1]
        objInfo = drsClient.getObject(drs_id)
        print(objInfo)
        fileSize = objInfo['size']
        # Use DRS to get the URL
        url = drsClient.getAccessURL(drs_id,'s3')
        # Step 3 - Run a pipeline on the file at the drs url
        if url:
            outfile = "{}.txt".format(patientID)
            pipeline_id = wesClient.runWorkflow(url, outfile)
            print('submitted:{}'.format(pipeline_id))
        else:
            print("could not get URL for subject={}".format(patientID))
    else:
        print("No bam for subject={}".format(patientID))
        pipeline_id = None
        fileSize = None
        

    via = 'WES'
    note = 'WES samtools on KF test'
    time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
    faspRunner.logRun(time, via, note,  pipeline_id, outfile, str(fileSize),
        client, drsClient, wesClient)
    run_ids.append({"patientID":patientID, "run_id":pipeline_id})
    
    nn += 1
    if nn > limit:
        break

runs_df = pd.DataFrame(run_ids)
runs_df


Running /var/folders/wz/jjbxsnr13v7dkw2jkbpmqd_dly65gq/T/ipykernel_73464/261786707.py
Patient/51063
Total  Resources: 5
# of documents for subject 51063 :5
subject=51063, DRS URI=drs://data.kidsfirstdrc.org/520bd083-82c1-4f00-abf5-ca1783d751aa
{'access_methods': [{'access_id': 's3', 'access_url': {'url': 's3://kf-seq-data-washu/EwingSarcoma//db3cb5310167464996be1ac9a3cd1468.bam'}, 'region': '', 'type': 's3'}], 'aliases': [], 'checksums': [{'checksum': 'ad41e608ed9499b4f410ffdca88fea9e4598d5f0a1f73aebc8d1bc628f7123b8', 'type': 'sha256'}, {'checksum': '7b595bb3dc46b5df4107c395d12ccb47', 'type': 'md5'}], 'created_time': '2021-07-13T17:17:51.162204', 'description': None, 'form': 'object', 'id': '520bd083-82c1-4f00-abf5-ca1783d751aa', 'mime_type': 'application/json', 'name': 'db3cb5310167464996be1ac9a3cd1468.bam', 'self_uri': 'drs://dg.F82A1A:520bd083-82c1-4f00-abf5-ca1783d751aa', 'size': 59215639645, 'updated_time': '2021-09-14T14:18:21.113043', 'version': 'e01b4f4f'}
Unauthorized for that

Unnamed: 0,patientID,run_id
0,51063,350180ed-0558-427c-bf71-25d89d666a71
1,51052,8f3b22d7-a02e-461b-ae7a-dc2a11e5022a
2,51059,5aedde42-7a27-4991-844a-6fbfc17f932e
3,51064,
4,51056,46978154-0027-45d0-9818-39fe8fbf702f
5,51051,5ca73a9a-93b3-4950-93cd-7bfcea344065


Check status of each run via the WES server

In [277]:
for index, row in runs_df.iterrows():
    if row['run_id'] != None:
        status = wesClient.getTaskStatus(row['run_id'])
        runs_df.at[index, "status"] = status
    else:
        print ('no run for {}'.format(row['patientID']))
runs_df

no run for 51064


Unnamed: 0,patientID,run_id,status
0,51063,350180ed-0558-427c-bf71-25d89d666a71,COMPLETE
1,51052,8f3b22d7-a02e-461b-ae7a-dc2a11e5022a,COMPLETE
2,51059,5aedde42-7a27-4991-844a-6fbfc17f932e,COMPLETE
3,51064,,
4,51056,46978154-0027-45d0-9818-39fe8fbf702f,COMPLETE
5,51051,5ca73a9a-93b3-4950-93cd-7bfcea344065,COMPLETE


Define function to download a file and to extract the result from the output file from SAMTools Stats

In [217]:
import requests
import os
def download(url, file_path):
    '''Download a file from a URL to a local file path'''
    with open(os.path.expanduser(file_path), "wb") as file:
        response = requests.get(url)
        file.write(response.content)
        
def getStats(filePath, statsList):
    ''' Extract values from a SAMTools Stats results file'''
    ### 
    
    retDict = {}
    f = open(filePath, "r")
    for x in f:
        if x.startswith('SN'):   
            parts = x.split('\t')
            statName = parts[1].split(':')[0]
            if statName in statsList:
                retDict[statName] = parts[2].rstrip()
    return retDict
 

For the completed workflows retrieve the results file and extract the required result values. Add the result values to a DataFrame.

In [279]:
from fasp.loc import sbcgcDRSClient
results_drs= sbcgcDRSClient("/Users/forei/.keys/sevenbridges_keys.json", "s3")
statsList = []
for index, row in runs_df.iterrows():
    if row['run_id'] != None and row['status'] == 'COMPLETE':
        log = wesClient.getRunLog(row['run_id'])
        results_drs_uri = log['outputs']['statistics']['path']
        print("DRS URI for results for subject {}: {}".format(row['patientID'] ,results_drs_uri))
        resultsDRSID = results_drs_uri.split('/')[-1]
        url = results_drs.getAccessURL(resultsDRSID,'s3')
        fileName = log['outputs']['statistics']['name']
        download(url, fileName)
        statsRequired = ['insert size average','insert size standard deviation']
        stats = getStats(fileName, statsRequired)
        #print(stats)
        stats['run_id'] = row['run_id']
        stats['patientID'] = row['patientID']
        statsList.append(stats)
        os.remove(fileName)
stats_df =  pd.DataFrame(statsList)
stats_df

DRS URI for results for subject 51063: drs://cgc-ga4gh-api.sbgenomics.com/62b64d6a4e3edb6b1c397b59
DRS URI for results for subject 51052: drs://cgc-ga4gh-api.sbgenomics.com/62b67ee14e3edb6b1c3988c3
DRS URI for results for subject 51059: drs://cgc-ga4gh-api.sbgenomics.com/62b67cc24e3edb6b1c398859
DRS URI for results for subject 51056: drs://cgc-ga4gh-api.sbgenomics.com/62b67cdaf08fea477057664d
DRS URI for results for subject 51051: drs://cgc-ga4gh-api.sbgenomics.com/62b67ca44e3edb6b1c39883d


Unnamed: 0,insert size average,insert size standard deviation,run_id,patientID
0,364.3,83.7,350180ed-0558-427c-bf71-25d89d666a71,51063
1,365.8,78.9,8f3b22d7-a02e-461b-ae7a-dc2a11e5022a,51052
2,337.0,73.6,5aedde42-7a27-4991-844a-6fbfc17f932e,51059
3,405.7,92.5,46978154-0027-45d0-9818-39fe8fbf702f,51056
4,357.8,82.0,5ca73a9a-93b3-4950-93cd-7bfcea344065,51051


In [28]:
final_df = pd.merge(
    df,
    stats_df,
    how="outer",
    left_on='patientID',
    right_on='patientID'
)
final_df.drop('run_id', axis=1, inplace=True)
final_df.drop('ClinStatus2', axis=1, inplace=True)

final_df

Unnamed: 0,patientID,ClinStatus1,Cndtn1_codetext,Cndtn1_code1,Cndtn1_code2,insert size average,insert size standard deviation
0,Patient/576466,Alive,,,,360.3,112.1
1,Patient/576465,Alive,ALL,MONDO:0004967,NCIT:C3167,378.4,116.0
2,Patient/576467,Alive,,,,365.1,131.9
3,Patient/576473,Alive,NHL,MONDO:0018908,NCIT:C3211,346.4,107.9
4,Patient/576468,Alive,HL,MONDO:0004952,NCIT:C9357,350.7,113.6
5,Patient/576474,Alive,Hodgkins Lymphoma,MONDO:0004952,NCIT:C9357,371.8,98.6
6,Patient/576470,Alive,Hodgkin's Lymphoma,MONDO:0004620,NCIT:C24182,357.5,100.7
7,Patient/576469,Alive,,,,338.3,102.0
8,Patient/576472,Alive,,,,,
9,Patient/576450,Reported Unknown,HL,MONDO:0004952,NCIT:C9357,291.8,130.0
