# Launching WES workflow on DRS objects based on a Data Connect query

## Check if endpoints are available

1) Check if WES endpoints are available

In [43]:
import requests
import json

def pretty_print_json(response):
    """Print the JSON body of ``response`` in blue, indented by four spaces.

    ``response`` only needs a ``json()`` method returning JSON-serialisable data.
    """
    blue, reset = "\033[38;2;8;75;138m", "\033[0m"
    body = json.dumps(response.json(), indent=4)
    print(f"{blue}{body}{reset}")
    
def print_head(text):
    """Print the string ``text`` as a heading, in green."""
    green, reset = "\033[38;2;8;138;75m", "\033[0m"
    print(green + text + reset)

In [44]:
# Hosts to probe; each is queried on the same WES port.
node_ips = ['ga4gh-starter-kit.ilifu.ac.za', 'osdp.ace.ac.ug', 'elwazi-node.icermali.org']  # , '196.43.136.22']
wes_port = "6000"
service_info_path = "/service-info"
runs_path = "/runs"
http_method = "GET"
# Seconds to wait for a node before reporting it as unavailable (requests waits forever by default).
request_timeout = 30

for node_ip in node_ips:
    ga4gh_base_url = "http://" + node_ip + ":{}/ga4gh/{}/v1"
    wes_base_url = ga4gh_base_url.format(wes_port,"wes")
    request_url = wes_base_url+service_info_path
    print_head("{} request to {}".format(http_method, request_url))
    try:
        # GET request to service-info endpoint
        wes_service_info_resp = requests.request(http_method, request_url, timeout=request_timeout)
        # print the response
        pretty_print_json(wes_service_info_resp)
    except (requests.exceptions.RequestException, ValueError) as err:
        # An unreachable node, or a reply that is not JSON, must not stop the
        # check of the remaining nodes: report it and carry on.
        print("Endpoint not available: {} ({})".format(request_url, err))

[38;2;8;138;75mGET request to http://ga4gh-starter-kit.ilifu.ac.za:6000/ga4gh/wes/v1/service-info[0m
[38;2;8;75;138m{
    "id": "org.ga4gh.starterkit.wes",
    "name": "GA4GH Starter Kit WES Service",
    "description": "An open source, community-driven implementation of the GA4GH Workflow Execution Service (WES)API specification.",
    "contactUrl": "mailto:info@ga4gh.org",
    "documentationUrl": "https://github.com/ga4gh/ga4gh-starter-kit-wes",
    "createdAt": "2020-01-15T12:00:00Z",
    "updatedAt": "2020-01-15T12:00:00Z",
    "environment": "test",
    "version": "0.3.2",
    "type": {
        "group": "org.ga4gh",
        "artifact": "wes",
        "version": "1.0.1"
    },
    "organization": {
        "name": "Global Alliance for Genomics and Health",
        "url": "https://ga4gh.org"
    },
    "workflow_type_versions": {
        "WDL": [
            "1.0"
        ],
        "NEXTFLOW": [
            "21.04.0"
        ]
    },
    "workflow_engine_versions": {
        "NA

2) Check if DRS endpoints are available

In [45]:
# Hosts to probe; each is queried on the same DRS port.
node_ips = ['ga4gh-starter-kit.ilifu.ac.za', 'osdp.ace.ac.ug', 'elwazi-node.icermali.org']  # , '196.43.136.22']
drs_port = "5000"
service_info_path = "/service-info"
http_method = "GET"
# Seconds to wait for a node before reporting it as unavailable (requests waits forever by default).
request_timeout = 30

for node_ip in node_ips:
    ga4gh_base_url = "http://" + node_ip + ":{}/ga4gh/{}/v1"
    drs_base_url = ga4gh_base_url.format(drs_port,"drs")
    request_url = drs_base_url+service_info_path
    print_head("{} request to {}".format(http_method, request_url))
    try:
        # GET request to service-info endpoint
        drs_service_info_resp = requests.request(http_method, request_url, timeout=request_timeout)
        # print the response
        pretty_print_json(drs_service_info_resp)
    except (requests.exceptions.RequestException, ValueError) as err:
        # An unreachable node, or a reply that is not JSON, must not stop the
        # check of the remaining nodes: report it and carry on.
        print("Endpoint not available: {} ({})".format(request_url, err))

[38;2;8;138;75mGET request to http://ga4gh-starter-kit.ilifu.ac.za:5000/ga4gh/drs/v1/service-info[0m
[38;2;8;75;138m{
    "id": "org.ga4gh.starterkit.drs",
    "name": "GA4GH Starter Kit DRS Service",
    "description": "An open source, community-driven implementation of the GA4GH Data Repository Service (DRS) API specification.",
    "contactUrl": "mailto:info@ga4gh.org",
    "documentationUrl": "https://github.com/ga4gh/ga4gh-starter-kit-drs",
    "createdAt": "2020-01-15T12:00:00Z",
    "updatedAt": "2020-01-15T12:00:00Z",
    "environment": "test",
    "version": "0.3.2",
    "type": {
        "group": "org.ga4gh",
        "artifact": "drs",
        "version": "1.3.0experimental"
    },
    "organization": {
        "name": "Global Alliance for Genomics and Health",
        "url": "https://ga4gh.org"
    }
}[0m
[38;2;8;138;75mGET request to http://osdp.ace.ac.ug:5000/ga4gh/drs/v1/service-info[0m
[38;2;8;75;138m{
    "id": "org.ga4gh.starterkit.drs",
    "name": "GA4GH Starte

## Check if DRS objects exist on the endpoints

1. Check ga4gh-starter-kit.ilifu.ac.za

In [46]:
# Look up two DRS object ids on this node's DRS service.
node_ip = 'ga4gh-starter-kit.ilifu.ac.za'
drs_port = "5000"
http_method = "GET"
drs_ids = [
    'c542689dba1e54669335c8e25abe6207',
    '5a436bec951fab59dd975bcd10f316f1',
]
# DRS URL path templates; only the GET template is used in this cell.
object_path_get = "/objects/{}"
object_path_post = "/objects"
access_path = "/objects/{}/access/{}"

for drs_id in drs_ids:
    ga4gh_base_url = f"http://{node_ip}:{{}}/ga4gh/{{}}/v1"
    drs_base_url = ga4gh_base_url.format(drs_port, "drs")
    request_url = drs_base_url + object_path_get.format(drs_id)
    print_head(f"{http_method} request to {request_url}")
    # Fetch /objects/{object_id} and show the reply.
    drs_object_response = requests.request(http_method, request_url)
    pretty_print_json(drs_object_response)

[38;2;8;138;75mGET request to http://ga4gh-starter-kit.ilifu.ac.za:5000/ga4gh/drs/v1/objects/c542689dba1e54669335c8e25abe6207[0m
[38;2;8;75;138m{
    "id": "c542689dba1e54669335c8e25abe6207",
    "description": "Patient: HG01857, Country: KHV, Region: EAS, Sex: female\n",
    "created_time": "2023-07-24T13:45:58Z",
    "mime_type": "application/cram",
    "name": "HG01857.final.chrX_15494566-15607236",
    "size": 500309,
    "updated_time": "2023-07-24T13:45:58Z",
    "checksums": [
        {
            "checksum": "70ef6da9822aecf071acda427a61e31cbbf8f25b",
            "type": "sha1"
        },
        {
            "checksum": "b9e9f9a1f85de2ee57ba05d3f75de865458c2645284e42deef20de38b9e6a37c",
            "type": "sha256"
        },
        {
            "checksum": "c542689dba1e54669335c8e25abe6207",
            "type": "md5"
        }
    ],
    "self_uri": "drs://ga4gh-starter-kit.ilifu.ac.za:5000/c542689dba1e54669335c8e25abe6207",
    "access_methods": [
        {
          

2.1 Check osdp.ace.ac.ug (ACE - Uganda)

In [47]:
# Look up two DRS object ids on this node's DRS service.
node_ip = 'osdp.ace.ac.ug'
drs_port = "5000"
http_method = "GET"
drs_ids = [
    '6fa43c7de04b60c1a73a42aa2efc977d',
    'be145a60bc059c154475a2561af0df6b',
]
# DRS URL path templates; only the GET template is used in this cell.
object_path_get = "/objects/{}"
object_path_post = "/objects"
access_path = "/objects/{}/access/{}"

for drs_id in drs_ids:
    ga4gh_base_url = f"http://{node_ip}:{{}}/ga4gh/{{}}/v1"
    drs_base_url = ga4gh_base_url.format(drs_port, "drs")
    request_url = drs_base_url + object_path_get.format(drs_id)
    print_head(f"{http_method} request to {request_url}")
    # Fetch /objects/{object_id} and show the reply.
    drs_object_response = requests.request(http_method, request_url)
    pretty_print_json(drs_object_response)

[38;2;8;138;75mGET request to http://osdp.ace.ac.ug:5000/ga4gh/drs/v1/objects/6fa43c7de04b60c1a73a42aa2efc977d[0m
[38;2;8;75;138m{
    "id": "6fa43c7de04b60c1a73a42aa2efc977d",
    "description": "Patient: HG01879, Country: ACB, Region: AFR, Sex: male\n",
    "created_time": "2023-10-30T16:27:28Z",
    "mime_type": "application/cram",
    "name": "HG01879.final.chrX_15494566-15607236",
    "size": 0,
    "updated_time": "2023-10-30T16:27:28Z",
    "checksums": [
        {
            "checksum": "af30841a49bba8733cf1f070ff725dd9cdfc91f1",
            "type": "sha1"
        },
        {
            "checksum": "a61d4570f6d210220fd14b4f7744df46d6dce61df0da35782b528660def996f1",
            "type": "sha256"
        },
        {
            "checksum": "6fa43c7de04b60c1a73a42aa2efc977d",
            "type": "md5"
        }
    ],
    "self_uri": "drs://osdp.ace.ac.ug:5000/6fa43c7de04b60c1a73a42aa2efc977d",
    "access_methods": [
        {
            "access_url": {
                "ur

2.2 Check 196.43.136.22 (UVRI - Uganda)

In [None]:
# Look up two DRS object ids on this node's DRS service.
node_ip = '196.43.136.22' #UVRI - Uganda
drs_port = "5000"
http_method = "GET"
drs_ids = [
    '1050d0443c2e83f9d9a8933481dcb405',
    'be04a5a90617aeae9a05fb533f544ade',
]
# DRS URL path templates; only the GET template is used in this cell.
object_path_get = "/objects/{}"
object_path_post = "/objects"
access_path = "/objects/{}/access/{}"

for drs_id in drs_ids:
    ga4gh_base_url = f"http://{node_ip}:{{}}/ga4gh/{{}}/v1"
    drs_base_url = ga4gh_base_url.format(drs_port, "drs")
    request_url = drs_base_url + object_path_get.format(drs_id)
    print_head(f"{http_method} request to {request_url}")
    # Fetch /objects/{object_id} and show the reply.
    drs_object_response = requests.request(http_method, request_url)
    pretty_print_json(drs_object_response)

3. Check elwazi-node.icermali.org (Mali)

In [48]:
# Look up two DRS object ids on this node's DRS service.
node_ip = 'elwazi-node.icermali.org'
drs_port = "5000"
http_method = "GET"
drs_ids = [
    'a68c60133f942881983d0e15827bf88f',
    '168d353c6f474ca72e35e9209f921a59',
]
# DRS URL path templates; only the GET template is used in this cell.
object_path_get = "/objects/{}"
object_path_post = "/objects"
access_path = "/objects/{}/access/{}"

for drs_id in drs_ids:
    ga4gh_base_url = f"http://{node_ip}:{{}}/ga4gh/{{}}/v1"
    drs_base_url = ga4gh_base_url.format(drs_port, "drs")
    request_url = drs_base_url + object_path_get.format(drs_id)
    print_head(f"{http_method} request to {request_url}")
    # Fetch /objects/{object_id} and show the reply.
    drs_object_response = requests.request(http_method, request_url)
    pretty_print_json(drs_object_response)


[38;2;8;138;75mGET request to http://elwazi-node.icermali.org:5000/ga4gh/drs/v1/objects/a68c60133f942881983d0e15827bf88f[0m
[38;2;8;75;138m{
    "id": "a68c60133f942881983d0e15827bf88f",
    "description": "Patient: HG01880, Country: ACB, Region: AFR, Sex: female\n",
    "created_time": "2023-10-31T09:16:37Z",
    "mime_type": "application/cram",
    "name": "HG01880.final.chrX_15494566-15607236",
    "size": 0,
    "updated_time": "2023-10-31T09:16:37Z",
    "checksums": [
        {
            "checksum": "5747e183e64f2fe4d87da568fd30d0159086d1fd",
            "type": "sha1"
        },
        {
            "checksum": "50284d24e63c3c859f1e3d46a8aa54869e82f6fb37099926f2968fe7e64c15d7",
            "type": "sha256"
        },
        {
            "checksum": "a68c60133f942881983d0e15827bf88f",
            "type": "md5"
        }
    ],
    "self_uri": "drs://elwazi-node.icermali.org:5000/a68c60133f942881983d0e15827bf88f",
    "access_methods": [
        {
            "access_url":

## Launch workflow on test DRS objects from South Africa, Mali and Uganda (using WES on ga4gh-starter-kit.ilifu.ac.za, elwazi-node.icermali.org, osdp.ace.ac.ug or 196.43.136.22)

1. Launch workflow
- Change the node_ip to run on a different WES endpoint
- Change the input_file to run on different DRS objects

In [None]:
# WES endpoint to launch on: keep exactly one node_ip uncommented.
# node_ip = 'ga4gh-starter-kit.ilifu.ac.za'
# node_ip = 'elwazi-node.icermali.org'
node_ip = 'osdp.ace.ac.ug'
#node_ip = '196.43.136.22' #UVRI
wes_port = "6000"
service_info_path = "/service-info"
runs_path = "/runs"
ga4gh_base_url = "http://" + node_ip + ":{}/ga4gh/{}/v1"
wes_base_url = ga4gh_base_url.format(wes_port,"wes")

http_method = "POST"
request_url = wes_base_url + runs_path

nextflow_workflow_url = "https://github.com/grbot/cram-qc"
# Workflow input: space-separated DRS URIs. Keep exactly one input_file uncommented.
# input_file = "drs://ga4gh-starter-kit.ilifu.ac.za:5000/c542689dba1e54669335c8e25abe6207 drs://ga4gh-starter-kit.ilifu.ac.za:5000/5a436bec951fab59dd975bcd10f316f1"
# input_file = "drs://elwazi-node.icermali.org:5000/a68c60133f942881983d0e15827bf88f drs://elwazi-node.icermali.org:5000/168d353c6f474ca72e35e9209f921a59"
input_file = "drs://osdp.ace.ac.ug:5000/6fa43c7de04b60c1a73a42aa2efc977d drs://osdp.ace.ac.ug:5000/be145a60bc059c154475a2561af0df6b"
# input_file = "drs://196.43.136.22:5000/1050d0443c2e83f9d9a8933481dcb405 drs://196.43.136.22:5000/be04a5a90617aeae9a05fb533f544ade"
data = {
    'workflow_type': 'NEXTFLOW',
    'workflow_type_version': '21.04.0',
    'workflow_url': nextflow_workflow_url,
    # Serialise with json.dumps so any character in input_file is escaped correctly;
    # the compact separators give the same text as the former hand-written template.
    'workflow_params': json.dumps({"input": input_file}, separators=(",", ":"))
}

print_head("{} request to {}".format(http_method, request_url))

# Post a Nextflow workflow
wes_post_workflow_response = requests.request(http_method, request_url, data = data)

# print the response
pretty_print_json(wes_post_workflow_response)

# A reply without a run_id means the submission was rejected; the reply printed
# above holds the details, so stop here with an explicit error.
current_run_id = wes_post_workflow_response.json().get("run_id")
if current_run_id is None:
    raise RuntimeError("WES did not return a run_id for {}".format(request_url))

print_head("run_id = {}".format(current_run_id))

2. Check output

In [None]:
# Query /runs/{run_id} for the run whose id is held in current_run_id.
http_method = "GET"
request_url = "/".join([wes_base_url + runs_path, current_run_id])

print_head(f"{http_method} request to {request_url}")

monitor_run_response = requests.request(http_method, request_url)
pretty_print_json(monitor_run_response)

# Data Connect
1. Check service-info

In [49]:
import requests
import json

# Location of the Data Connect service.
dc_port = "8089"
dc_base_url = f"http://ga4gh-starter-kit.ilifu.ac.za:{dc_port}"

# Data Connect URL paths, relative to dc_base_url.
# The two table paths take the table name through str.format.
service_info_path = "/service-info"
tables_path = "/tables"
table_info_path = "/table/{}/info"
table_data_path = "/table/{}/data"
search_path = "/search"

def pretty_print_json(response):
    """Print the JSON body of ``response`` indented by four spaces, without colour.

    Rebinds the name of the coloured helper defined at the top of the notebook.
    """
    body = response.json()
    print(json.dumps(body, indent=4))

In [50]:
# Ask the Data Connect service for its service-info document and show it.
dc_service_info_resp = requests.get(dc_base_url + service_info_path)
pretty_print_json(dc_service_info_resp)

{
    "id": "",
    "name": "GA4GH Discovery Search API",
    "description": "",
    "documentationUrl": "",
    "contactUrl": "",
    "version": ""
}


2. Check data table we will be querying

In [51]:
# Fetch and show the "info" document (schema) of the table used by the search query.
table_name = 'trino.public.genome_ilifu'
dc_service_info_resp = requests.get(dc_base_url + table_info_path.format(table_name))
pretty_print_json(dc_service_info_resp)

{
    "name": "trino.public.genome_ilifu",
    "description": "Automatically generated schema",
    "data_model": {
        "$id": "http://ga4gh-starter-kit.ilifu.ac.za:8089/table/trino.public.genome_ilifu/info",
        "description": "Automatically generated schema",
        "$schema": "http://json-schema.org/draft-07/schema#",
        "properties": {
            "sample_id": {
                "format": "varchar",
                "type": "string",
                "$comment": "varchar"
            },
            "population_id": {
                "format": "varchar",
                "type": "string",
                "$comment": "varchar"
            },
            "super_population_id": {
                "format": "varchar",
                "type": "string",
                "$comment": "varchar"
            },
            "sex": {
                "format": "varchar",
                "type": "string",
                "$comment": "varchar"
            },
            "cram_drs_id": {
   

### Select address and id functions

In [52]:
import re

def get_address(s):
    """Return the host part of a DRS URI.

    The ``drs://`` scheme is removed and the remainder is cut at the first
    ``:`` or ``/``, so both ``drs://host:5000/object_id`` and the port-less
    form ``drs://host/object_id`` yield ``host``. (Cutting only at ``:``
    returned ``host/object_id`` for the port-less form.)

    Parameters:
        s: DRS URI string.

    Returns:
        The host name as a string.
    """
    address = s.replace("drs://","")
    # Whichever of ':' (port) or '/' (path) comes first ends the host.
    return re.split('[:/]', address, maxsplit=1)[0]
    
def get_drs_id(s):
    """Return the object id of a DRS URI: the text after its last ``/``.

    A string without any ``/`` is returned unchanged.
    """
    up_to_last_slash = re.compile('.*/')
    return up_to_last_slash.sub('', s)

### Use case 1

![Use case 1](use_case_1.png)

1. Do query
Select CRAM DRS ids for all African samples. Limit search to 10 samples for now.

In [53]:
import requests, json
# Data Connect search request: a SQL statement plus an (empty) positional parameter list.
q2 = {
    "query": (
        "select cram_drs_id from trino.public.genome_ilifu "
        "where super_population_id='AFR' limit 10"
    ),
    "parameters": [],
}
r = requests.post("http://ga4gh-starter-kit.ilifu.ac.za:8089/search", json=q2)
# Keep the decoded first page in `data` and show it.
data = r.json()
print(json.dumps(data, indent=3))

{
   "data": [],
   "pagination": {
      "next_page_url": "http://ga4gh-starter-kit.ilifu.ac.za:8089/search/v1/statement/queued/20231106_090301_00003_69yj9/ya33281a1fd05b2c025f51cf66192d2ef57733093/1?queryJobId=20231106_090301_00003_69yj9"
   }
}


2. Parse through pages to get results

In [54]:
# Walk the Data Connect pagination chain to its end and collect every row.
#
# The first reply of a search may hold no rows yet (the query is still queued or
# executing), and the rows may be spread over any number of later pages. So each
# page's "data" is accumulated, and the walk stops only when a page no longer
# offers a next_page_url. This replaces the assumption that all rows sit exactly
# two pages after the first "executing" URL.
data = r.json()  # start again from the search reply so this cell can be re-read top-down
rows = list(data.get('data') or [])
next_page = (data.get('pagination') or {}).get('next_page_url')

while next_page:
    dc_service_info_resp = requests.request("GET", next_page)
    data = dc_service_info_resp.json()
    rows.extend(data.get('data') or [])
    # "pagination" may be missing or null on the last page.
    next_page = (data.get('pagination') or {}).get('next_page_url')

# DRS URIs returned by the query: as a list, and as one space-separated string.
drs_ids = [row['cram_drs_id'] for row in rows]
drs_str = " ".join(drs_ids)


3. Map DRS server to DRS objects

In [57]:
# Group the DRS URIs by the host that serves them.
# Each host gets one record: its URIs plus bookkeeping fields that start out empty.
drs_servers = {}
for drs_id in drs_ids:
    address = get_address(drs_id)
    if address not in drs_servers:
        # First URI seen for this host: create its record.
        drs_servers[address] = {
            'drs_ids': [],
            'total_file_size': 0,
            'ingress': 0,
            'run_id': 0,
            'drs_ids_str': "",
            'outputs': {},
        }
    drs_servers[address]['drs_ids'].append(drs_id)
print (drs_servers)

{'osdp.ace.ac.ug': {'drs_ids': ['drs://osdp.ace.ac.ug:5000/6fa43c7de04b60c1a73a42aa2efc977d', 'drs://osdp.ace.ac.ug:5000/be145a60bc059c154475a2561af0df6b', 'drs://osdp.ace.ac.ug:5000/9a45659fe478e5bb39d1dd1b08bd1807', 'drs://osdp.ace.ac.ug:5000/82adbcf7cc72c31a86e65d73bf6ef81b'], 'total_file_size': 0, 'ingress': 0, 'run_id': 0, 'drs_ids_str': '', 'outputs': {}}, 'elwazi-node.icermali.org': {'drs_ids': ['drs://elwazi-node.icermali.org:5000/a68c60133f942881983d0e15827bf88f', 'drs://elwazi-node.icermali.org:5000/45ca586b0921ffedf6a63679fbaacb68', 'drs://elwazi-node.icermali.org:5000/d36019bb63182abad672205a140f7e83', 'drs://elwazi-node.icermali.org:5000/b809bb9b81a9583ec67e787b0449e9bd', 'drs://elwazi-node.icermali.org:5000/168d353c6f474ca72e35e9209f921a59'], 'total_file_size': 0, 'ingress': 0, 'run_id': 0, 'drs_ids_str': '', 'outputs': {}}, 'ga4gh-starter-kit.ilifu.ac.za': {'drs_ids': ['drs://ga4gh-starter-kit.ilifu.ac.za:5000/5a436bec951fab59dd975bcd10f316f1'], 'total_file_size': 0, 'in

### Use case 2 
### Now we initiate a run in a gather/scatter/federated manner
#### - Runs are initiated on individual nodes (calculate flagstat)
#### - Output is gathered and MultiQC is run on the flagstat results on one WES endpoint

![Use case 2](use_case_2.png)

1) Launch flagstat runs. Use the dictionary structure generated previously. Launch workflow at WES endpoint on DRS objects only from that endpoint.

In [58]:
import time
# For every server, store the space-separated list of its DRS URIs; this string is
# the workflow input. It is joined in place so that the global drs_ids list is not
# overwritten by this cell.
for drs_server in drs_servers:
    drs_servers[drs_server]['drs_ids_str'] = " ".join(drs_servers[drs_server]['drs_ids'])

# WES run states from which a run can no longer reach COMPLETE.
failed_states = ("EXECUTOR_ERROR", "SYSTEM_ERROR", "CANCELED")

# Launch the flagstat workflow on each server's own WES endpoint.
# Runs are not launched in parallel: each one is submitted and polled to
# completion before the next server is handled.
for drs_server in drs_servers:
    print ("Launching jobs on server: " + drs_server)
    wes_port = "6000"
    ga4gh_base_url = "http://" + drs_server + ":{}/ga4gh/{}/v1"
    wes_base_url = ga4gh_base_url.format(wes_port,"wes")

    runs_path = "/runs"

    http_method = "POST"
    request_url = wes_base_url + runs_path

    nextflow_workflow_url = "https://github.com/grbot/flagstat"

    input_file = drs_servers[drs_server]['drs_ids_str']

    data = {
        'workflow_type': 'NEXTFLOW',
        'workflow_type_version': '21.04.0',
        'workflow_url': nextflow_workflow_url,
        # json.dumps escapes the input correctly; compact separators keep the
        # same text as the former hand-written template.
        'workflow_params': json.dumps({"input": input_file}, separators=(",", ":"))
    }

    print_head("{} request to {}".format(http_method, request_url))

    # Post a Nextflow workflow
    wes_post_workflow_response = requests.request(http_method, request_url, data = data)

    # print the response
    pretty_print_json(wes_post_workflow_response)

    current_run_id = wes_post_workflow_response.json()["run_id"]

    print_head("run_id = {}".format(current_run_id))

    http_method = "GET"
    request_url = wes_base_url + runs_path + "/" + current_run_id

    print_head("{} request to {}".format(http_method, request_url))

    # Wait before the first status request (NOTE(review): presumably for the same
    # "Could not load WES run log" reply described in the later polling cell — confirm).
    time.sleep(15)
    monitor_run_response = requests.request(http_method, request_url)

    print(monitor_run_response.json())

    # Poll until the run is COMPLETE. A run that ends in a failure state will never
    # become COMPLETE, so stop with an error instead of polling forever.
    run_state = monitor_run_response.json()["state"]
    while run_state != "COMPLETE":
        if run_state in failed_states:
            raise RuntimeError(
                "WES run {} on {} ended in state {}".format(current_run_id, drs_server, run_state)
            )
        print("Current job status: " + run_state)
        time.sleep(5)
        monitor_run_response = requests.request(http_method, request_url)
        run_state = monitor_run_response.json()["state"]

    print("Job running status: " + run_state)
    pretty_print_json(monitor_run_response)

    # Remember the run and its output listing for the upload step.
    drs_servers[drs_server]['run_id'] = current_run_id
    drs_servers[drs_server]['outputs'] = monitor_run_response.json()["outputs"]


Launching jobs on server: osdp.ace.ac.ug
Launching jobs on server: elwazi-node.icermali.org
Launching jobs on server: ga4gh-starter-kit.ilifu.ac.za
[38;2;8;138;75mPOST request to http://osdp.ace.ac.ug:6000/ga4gh/wes/v1/runs[0m
{
    "run_id": "1e68a829-4d66-480a-b56d-c494a911c626"
}
[38;2;8;138;75mrun_id = 1e68a829-4d66-480a-b56d-c494a911c626[0m
[38;2;8;138;75mGET request to http://osdp.ace.ac.ug:6000/ga4gh/wes/v1/runs/1e68a829-4d66-480a-b56d-c494a911c626[0m
{'run_id': '1e68a829-4d66-480a-b56d-c494a911c626', 'request': {'workflow_params': {'input': 'drs://osdp.ace.ac.ug:5000/6fa43c7de04b60c1a73a42aa2efc977d drs://osdp.ace.ac.ug:5000/be145a60bc059c154475a2561af0df6b drs://osdp.ace.ac.ug:5000/9a45659fe478e5bb39d1dd1b08bd1807 drs://osdp.ace.ac.ug:5000/82adbcf7cc72c31a86e65d73bf6ef81b'}, 'workflow_type': 'NEXTFLOW', 'workflow_type_version': '21.04.0', 'workflow_url': 'https://github.com/grbot/flagstat'}, 'state': 'RUNNING', 'run_log': {'name': 'grbot/flagstat', 'cmd': ['#!/bin/bash -

{
    "run_id": "7089aa4a-2b68-4091-af80-21bfcd231a31"
}
[38;2;8;138;75mrun_id = 7089aa4a-2b68-4091-af80-21bfcd231a31[0m
[38;2;8;138;75mGET request to http://elwazi-node.icermali.org:6000/ga4gh/wes/v1/runs/7089aa4a-2b68-4091-af80-21bfcd231a31[0m
{'run_id': '7089aa4a-2b68-4091-af80-21bfcd231a31', 'request': {'workflow_params': {'input': 'drs://elwazi-node.icermali.org:5000/a68c60133f942881983d0e15827bf88f drs://elwazi-node.icermali.org:5000/45ca586b0921ffedf6a63679fbaacb68 drs://elwazi-node.icermali.org:5000/d36019bb63182abad672205a140f7e83 drs://elwazi-node.icermali.org:5000/b809bb9b81a9583ec67e787b0449e9bd drs://elwazi-node.icermali.org:5000/168d353c6f474ca72e35e9209f921a59'}, 'workflow_type': 'NEXTFLOW', 'workflow_type_version': '21.04.0', 'workflow_url': 'https://github.com/grbot/flagstat'}, 'state': 'COMPLETE', 'run_log': {'name': 'grbot/flagstat', 'cmd': ['#!/bin/bash -ue', 'samtools flagstat     -@ 1     HG01882.final.chrX_15494566-15607236.cram > HG01882.final.chrX_15494566-

{'run_id': '09edc840-b9d0-4769-b434-d5c951e24170', 'request': {'workflow_params': {'input': 'drs://ga4gh-starter-kit.ilifu.ac.za:5000/5a436bec951fab59dd975bcd10f316f1'}, 'workflow_type': 'NEXTFLOW', 'workflow_type_version': '21.04.0', 'workflow_url': 'https://github.com/grbot/flagstat'}, 'state': 'COMPLETE', 'run_log': {'name': 'grbot/flagstat', 'cmd': ['#!/bin/bash -ue', 'samtools flagstat     -@ 1     HG01883.final.chrX_15494566-15607236.cram > HG01883.final.chrX_15494566-15607236.cram.flagstat'], 'start_time': '2023-11-06T11:05:50Z', 'end_time': '2023-11-06T11:05:50Z', 'stdout': 'http://ga4gh-starter-kit.ilifu.ac.za:6000/ga4gh/wes/v1/logs/nextflow/stdout/09edc840-b9d0-4769-b434-d5c951e24170?workdirs=73%2F65a4b12cf72837c631610ec5de2fc6', 'stderr': 'http://ga4gh-starter-kit.ilifu.ac.za:6000/ga4gh/wes/v1/logs/nextflow/stderr/09edc840-b9d0-4769-b434-d5c951e24170?workdirs=73%2F65a4b12cf72837c631610ec5de2fc6', 'exit_code': 0}, 'task_logs': [{'name': 'run_flagstat', 'cmd': ['#!/bin/bash -u

2. a) Upload results to individual DRS servers and get a list of DRS objects (this approach does not work; run 2 b) instead, where the reason is explained)

In [None]:
# importlib.reload(upload_to_drs)

# drs_ids_str = ""

# for drs_server in drs_servers:

#     run_id = drs_servers[drs_server]['run_id']
#     outputs = drs_servers[drs_server]["outputs"]

#     for key in outputs:
#         if '.flagstat' in key:
#             file = outputs[key][7:]
#             file_ext = file.split(".")[-1]
#             print (file)
#             meta_d = upload_to_drs.files_metadata_test(run_id, file, file_ext)
#             upload_to_drs.add_file_to_server(meta_d, file_ext, drs_server,'5001') #adds drs object
#             drs_id = "drs://" + drs_server + ":5000/" + meta_d[0][3]
#             drs_ids_str = drs_ids_str + " "  + drs_id
            
# drs_ids_str = drs_ids_str[:-1]

    

2. b) Upload results to individual DRS servers and get a list of DRS access URLs
   

In [59]:
import importlib
import upload_to_drs

importlib.reload(upload_to_drs)

# Settings shared by every DRS lookup in this cell.
drs_port = 5000
object_path_get = "/objects/{}"
http_method = "GET"

# Access URLs of the uploaded flagstat files, in upload order.
drs_urls = []

for drs_server in drs_servers:

    run_id = drs_servers[drs_server]['run_id']
    outputs = drs_servers[drs_server]["outputs"]

    for key in outputs:
        if '.flagstat' not in key:
            continue
        print (key)
        # Drop the first 7 characters of the output location
        # (presumably the "file://" prefix — verify against the WES outputs).
        file = outputs[key][7:]
        file_ext = file.split(".")[-1]
        meta_d = upload_to_drs.files_metadata_test(run_id, file, file_ext)
        upload_to_drs.add_file_to_server(meta_d, file_ext, drs_server,'5001') #adds drs object
        drs_id = meta_d[0][3]

        # Read the new object back from the server's DRS API.
        ga4gh_base_url = f"http://{drs_server}:{{}}/ga4gh/{{}}/v1"
        drs_base_url = ga4gh_base_url.format(drs_port, "drs")
        request_url = drs_base_url + object_path_get.format(drs_id)
        drs_object_response = requests.request(http_method, request_url)
        data = drs_object_response.json()

        # DRS URIs cannot be used as input here; the files have to be streamed.
        # A DRS object only resolves to a path local to its server, so a run on
        # another server would fail. The drawback of streaming is that the
        # original file name is lost.
        access_id = data['access_methods'][1]['access_id']
        access_url = request_url + "/access/" + access_id
        drs_object_response = requests.request(http_method, access_url)
        drs_url = drs_object_response.json()["url"]
        drs_urls.append(drs_url)

drs_urls_str = " ".join(drs_urls)
print(drs_urls_str)


HG01896.final.chrX_15494566-15607236.cram.flagstat
HG01885.final.chrX_15494566-15607236.cram.flagstat
HG01894.final.chrX_15494566-15607236.cram.flagstat
HG01879.final.chrX_15494566-15607236.cram.flagstat
HG01880.final.chrX_15494566-15607236.cram.flagstat
HG01890.final.chrX_15494566-15607236.cram.flagstat
HG01882.final.chrX_15494566-15607236.cram.flagstat
HG01889.final.chrX_15494566-15607236.cram.flagstat
HG01886.final.chrX_15494566-15607236.cram.flagstat
HG01883.final.chrX_15494566-15607236.cram.flagstat
http://osdp.ace.ac.ug:5000/ga4gh/drs/v1/stream/ea4954e23c25366d466b430731589866/493eadf8-2dd8-4bd9-b821-c272cdbf68c2 http://osdp.ace.ac.ug:5000/ga4gh/drs/v1/stream/f07b256a7dd645b7c4a08e7249a4de79/dd7b0f32-7093-47ee-8389-441e34711e97 http://osdp.ace.ac.ug:5000/ga4gh/drs/v1/stream/aa26c66b0179efa8b331ad8ccb4f82fb/4ff93fe4-c155-4481-b1cb-a0c6b8b8f73d http://osdp.ace.ac.ug:5000/ga4gh/drs/v1/stream/4e28d519cd796ffe408adf054e1e71dc/b9aa8422-64de-40b8-ad71-0804f02d8807 http://elwazi-node.ice

3. Launch the workflow that will combine the results

In [60]:
# WES endpoint that runs the combining (MultiQC) workflow.
wes_port = "6000"
drs_server_central = "ga4gh-starter-kit.ilifu.ac.za"
ga4gh_base_url = "http://" + drs_server_central + ":{}/ga4gh/{}/v1"
wes_base_url = ga4gh_base_url.format(wes_port,"wes")

service_info_path = "/service-info"
runs_path = "/runs"

http_method = "POST"
request_url = wes_base_url + runs_path

nextflow_workflow_url = "https://github.com/grbot/multiqc"
# Input: the space-separated access URLs collected in step 2 b).
input_file = drs_urls_str

data = {
    'workflow_type': 'NEXTFLOW',
    'workflow_type_version': '21.04.0',
    'workflow_url': nextflow_workflow_url,
    # json.dumps escapes the input correctly; compact separators keep the same
    # text as the former hand-written template.
    'workflow_params': json.dumps({"input": input_file}, separators=(",", ":"))
}

print_head("{} request to {}".format(http_method, request_url))

# Post a Nextflow workflow
wes_post_workflow_response = requests.request(http_method, request_url, data = data)

# print the response
pretty_print_json(wes_post_workflow_response)

# A reply without a run_id means the submission was rejected; the reply printed
# above holds the details, so stop here with an explicit error.
current_run_id = wes_post_workflow_response.json().get("run_id")
if current_run_id is None:
    raise RuntimeError("WES did not return a run_id for {}".format(request_url))

print_head("run_id = {}".format(current_run_id))

[38;2;8;138;75mPOST request to http://ga4gh-starter-kit.ilifu.ac.za:6000/ga4gh/wes/v1/runs[0m
{
    "run_id": "62541103-b540-4bae-9df7-a348a887f720"
}
[38;2;8;138;75mrun_id = 62541103-b540-4bae-9df7-a348a887f720[0m


4. Poll for results

In [61]:
import time
# Right after submission the WES server may answer with an error document that
# has no "state" key, e.g.
#{
#    "timestamp": "2023-07-27T17:10:08Z",
#    "status_code": 400,
#    "error": "Bad Request",
#    "msg": "Could not load WES run log"
#}
# The delay before the first status request below avoids this.

http_method = "GET"
request_url = wes_base_url + runs_path + "/" + current_run_id

print_head("{} request to {}".format(http_method, request_url))

# WES run states from which a run can no longer reach COMPLETE.
failed_states = ("EXECUTOR_ERROR", "SYSTEM_ERROR", "CANCELED")

# Get request to /runs/{run_id}, after the delay described above
time.sleep(15)
monitor_run_response = requests.request(http_method, request_url)

print(monitor_run_response)

# Poll until the run is COMPLETE. A run that ends in a failure state will never
# become COMPLETE, so stop with an error instead of polling forever.
run_state = monitor_run_response.json()["state"]
while run_state != "COMPLETE":
    if run_state in failed_states:
        raise RuntimeError("WES run {} ended in state {}".format(current_run_id, run_state))
    print("Current job status: " + run_state)
    time.sleep(5)
    monitor_run_response = requests.request(http_method, request_url)
    run_state = monitor_run_response.json()["state"]

print("Job running status: " + run_state)
pretty_print_json(monitor_run_response)

[38;2;8;138;75mGET request to http://ga4gh-starter-kit.ilifu.ac.za:6000/ga4gh/wes/v1/runs/62541103-b540-4bae-9df7-a348a887f720[0m
<Response [200]>
Job running status: COMPLETE
{
    "run_id": "62541103-b540-4bae-9df7-a348a887f720",
    "request": {
        "workflow_params": {
            "input": "http://osdp.ace.ac.ug:5000/ga4gh/drs/v1/stream/ea4954e23c25366d466b430731589866/493eadf8-2dd8-4bd9-b821-c272cdbf68c2 http://osdp.ace.ac.ug:5000/ga4gh/drs/v1/stream/f07b256a7dd645b7c4a08e7249a4de79/dd7b0f32-7093-47ee-8389-441e34711e97 http://osdp.ace.ac.ug:5000/ga4gh/drs/v1/stream/aa26c66b0179efa8b331ad8ccb4f82fb/4ff93fe4-c155-4481-b1cb-a0c6b8b8f73d http://osdp.ace.ac.ug:5000/ga4gh/drs/v1/stream/4e28d519cd796ffe408adf054e1e71dc/b9aa8422-64de-40b8-ad71-0804f02d8807 http://elwazi-node.icermali.org:5000/ga4gh/drs/v1/stream/1651d9f8432c5091be385648f418a4f7/9910c910-c24b-4886-b34d-4554ad258f2b http://elwazi-node.icermali.org:5000/ga4gh/drs/v1/stream/daf41a6c2067981f1ca096fd3823fc0a/ed79d3ec-ec7f

5. Upload `multiqc_report.html` to central DRS server

In [62]:
import importlib
import upload_to_drs
importlib.reload(upload_to_drs)

# Decode the last polling reply once; it names the run and lists its outputs.
run_report = monitor_run_response.json()
run_id = run_report['run_id']
outputs = run_report["outputs"]

for key in outputs:
    if 'multiqc_report.html' not in key:
        continue
    print (outputs[key])
    # Drop the first 7 characters of the location (the "file://" prefix seen in the printed output).
    file = outputs[key][7:]
    file_ext = file.split(".")[-1]
    meta_d = upload_to_drs.files_metadata_test(run_id, file, file_ext)
    upload_to_drs.add_file_to_server(meta_d, file_ext, drs_server_central,'5001') #adds drs object
    # Keep the id of the uploaded object for the download step.
    drs_id = meta_d[0][3]

file:///opt/ga4gh-starter-kit-wes/wes_runs/62/54/11/62541103-b540-4bae-9df7-a348a887f720/work/1d/2d713cd24ff517d691cabad66d33f0/multiqc_report.html


6. Download `multiqc_report.html`

In [63]:
import urllib.request

# Resolve the uploaded report on the central DRS server and download it.
drs_port = 5000
object_path_get = "/objects/{}"
http_method = "GET"

ga4gh_base_url = f"http://{drs_server_central}:{{}}/ga4gh/{{}}/v1"
drs_base_url = ga4gh_base_url.format(drs_port, "drs")
request_url = drs_base_url + object_path_get.format(drs_id)

# Object metadata: take the access_id of the second access method.
drs_object_response = requests.request(http_method, request_url)
data = drs_object_response.json()
access_id = data['access_methods'][1]['access_id']
access_url = request_url + "/access/" + access_id

# The access endpoint answers with the URL the file can be fetched from.
drs_object_response = requests.request(http_method, access_url)
download_url = drs_object_response.json()["url"]
print(download_url)
urllib.request.urlretrieve(download_url, "multiqc_report_2.html")

http://ga4gh-starter-kit.ilifu.ac.za:5000/ga4gh/drs/v1/stream/e6c29b7942a4b2875ee31368ac70874e/caced522-375f-45ee-937c-100277db7a4e


('multiqc_report_2.html', <http.client.HTTPMessage at 0x7fd02edc6470>)