# Accessing DSS via DOS

Data in the HCA DSS can be accessed through the GA4GH Data Object Service (DOS) API, using the `ga4gh.dos` Python client.

In [53]:
from ga4gh.dos.client import Client
# Connect to the hosted DOS endpoint, then keep short handles to the object
# that exposes the service operations (`lc`, also available under the longer
# name `local_client`) and to the registry used to look up request models.
client = Client("https://spbnq0bc10.execute-api.us-west-2.amazonaws.com/api")
local_client = client.client
lc = local_client
models = client.models

List the data bundles offered by the service.

In [54]:
# Send a listing request with an empty body (no paging options supplied) and
# keep only the bundles from the reply.
bundle_listing = lc.ListDataBundles(body={}).result()
data_bundles = bundle_listing.data_bundles

In [55]:
# Peek at the identifier of the first bundle in the listing.
first_bundle = data_bundles[0]
print(first_bundle.id)

06c4bd47-c8e2-5045-8bae-bfad24633c87


### Get a page of results.

In [56]:
# Look up the request model by its schema name so paging options can be set,
# ask for ten bundles per page, then send the request and wait for the reply.
ListDataBundles = models.get_model('ga4ghListDataBundlesRequest')
request = ListDataBundles(page_size=10)
pending_listing = lc.ListDataBundles(body=request)
response = pending_listing.result()

In [57]:
# Only a cell's final expression is echoed, so the page length has to be
# printed explicitly; the paging token is still shown as the cell result.
print(len(response.data_bundles))
response.next_page_token[0]

u'DnF1ZXJ5VGhlbkZldGNoBQAAAAAACiO8FmhEQWY2dm5fVG0yWDZuT2piTkt1S0EAAAAAAAojvxZoREFmNnZuX1RtMlg2bk9qYk5LdUtBAAAAAAAKI8AWaERBZjZ2bl9UbTJYNm5PamJOS3VLQQAAAAAACiO9FmhEQWY2dm5fVG0yWDZuT2piTkt1S0EAAAAAAAojvhZoREFmNnZuX1RtMlg2bk9qYk5LdUtB'

### Get the next page of results.

In [58]:
# Repeat the listing, this time handing back the token returned with the
# previous page so the service continues from where that page ended.
next_token = response.next_page_token[0]
request = ListDataBundles(page_size=10, page_token=next_token)
page_2 = lc.ListDataBundles(body=request).result()

In [59]:
# Show the bundle ids of both pages, each under its label, one id per line.
for label, page in (('page 1', response), ('page 2', page_2)):
    print(label)
    print("\n".join([bundle.id for bundle in page.data_bundles]))

page 1
06c4bd47-c8e2-5045-8bae-bfad24633c87
0d6371a8-fc4f-5232-9660-e655903b17ea
0e727062-7fc9-5e46-b1e3-24537426ca4c
2277b3fc-5a75-5782-86a0-c29f13844e7d
139f30ba-62d3-50fb-9177-ab3d370e29f8
1ecf1c35-9e1e-55ef-8f42-71102c3abc33
44a8837b-4456-5709-b56b-54e23000f13a
108c3839-a48e-53d8-a765-e7bfa5da6c81
233bc61e-e9e8-5f75-a8d9-189cfced36fe
28bebda7-14b1-5c47-b9b7-52540f091866
page 2
108c3839-a48e-53d8-a765-e7bfa5da6c81
46e29f86-2983-5658-9f93-5f8aea24a4a2
492054ee-31e5-5516-ae96-fbba12fbc73d
4a51ff38-f4ea-5599-b752-8e65724864db
014a9de5-cb88-5e37-a196-b6e3ab30fff6
0583d98e-b079-51ae-affc-1c2d6200c84d
1111ec7b-675d-5c00-8aa4-7eea28f2b846
0a5f13d7-a1f5-55f6-994f-48f252ac61c7
1111ec7b-675d-5c00-8aa4-7eea28f2b846
197bc047-e917-55ed-852d-d563cdbc50e4


## Inspect a Data Bundle

In [77]:
# Fetch the bundle record for the first entry of the earlier listing, by id.
first_bundle_id = data_bundles[0].id
data_bundle = lc.GetDataBundle(data_bundle_id=first_bundle_id).result().data_bundle

In [79]:
# Print the id from the listing next to the id of the fetched bundle so the
# two can be compared by eye.
for bundle_id in (data_bundles[0].id, data_bundle.id):
    print(bundle_id)

06c4bd47-c8e2-5045-8bae-bfad24633c87
06c4bd47-c8e2-5045-8bae-bfad24633c87


In [80]:
# List the ids of the data objects that make up the bundle, one per line.
object_id_lines = "\n".join(data_bundle.data_object_ids)
print(object_id_lines)

1311414e-9f12-4596-99bd-6b06cac69025
40628e8c-6456-4d72-8600-91691cb1833d
8f3784cd-3e3f-4450-8861-e5a02c7ba554


## Download a Data Bundle

In [81]:
import requests

# https://stackoverflow.com/questions/16694907/how-to-download-large-file-in-python-with-requests-py
def download_file(url, filename):
    """Stream the resource at ``url`` into the local file ``filename``.

    The body is read in 1 KiB chunks, so the whole payload is never held in
    memory at once.

    Args:
        url: Address to fetch with an HTTP GET.
        filename: Path of the file to create or overwrite (binary mode).

    Returns:
        ``filename``, unchanged.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status; ``filename`` is not opened in that case.
    """
    # stream=True defers fetching the body until iter_content() is consumed.
    r = requests.get(url, stream=True)
    try:
        # Fail before touching the disk so an error page is never saved as data.
        r.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    finally:
        # A streamed response keeps its connection checked out until closed.
        r.close()
    return filename


In [93]:
# Resolve the third object id listed in the bundle to its data object record.
third_object_id = data_bundle.data_object_ids[2]
data_object = lc.GetDataObject(data_object_id=third_object_id).result().data_object

In [94]:
# Show the first URL recorded for the data object.
first_location = data_object.urls[0]
print(first_location.url)

https://commons-dss.ucsc-cgp-dev.org/v1/files/8f3784cd-3e3f-4450-8861-e5a02c7ba554?replica=aws


In [95]:
%time download_file(data_object.urls[0].url, data_object.id)

CPU times: user 32 ms, sys: 0 ns, total: 32 ms
Wall time: 1.29 s


u'8f3784cd-3e3f-4450-8861-e5a02c7ba554'

In [96]:
!head $data_object.id

{
    "center_name": "NYGC",
    "donor_uuid": "b8284a5b-429d-5652-8247-0257f1e2f61d",
    "program": "TOPMed",
    "project": "HapMap",
    "schema_version": "0.0.3",
    "specimen": [
        {
            "samples": [
                {
