# A notebook to get metadata for 10 HuBMAP datasets

# Import libraries

In [72]:
import argparse
import json
from pathlib import Path
from pprint import pprint

import pandas as pd
import requests

# Define functions

In [73]:
def call_entity_api(endpoint: str, hubmap_id: str, timeout: float = 30.0):
    """Fetch a record from the HuBMAP Entity API and return the parsed JSON.

    The request URL is
    ``https://entity.api.hubmapconsortium.org/{endpoint}/{hubmap_id}``.
    If a ``--url`` argument is present in ``sys.argv`` it replaces that URL
    entirely; every other command-line argument is ignored.

    Args:
        endpoint (str): Entity API endpoint to call, e.g. "entities" or
            "ancestors".
        hubmap_id (str): A HuBMAP ID.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
        json.JSONDecodeError: If the response body is not valid JSON.
    """

    # default URL for the requested endpoint and ID
    default_url = f"https://entity.api.hubmapconsortium.org/{endpoint}/{hubmap_id}"

    # Allow the URL to be overridden with `--url`. `parse_known_args` is used
    # so that the arguments the Jupyter kernel itself is launched with do not
    # make the parser exit.
    parser = argparse.ArgumentParser(
        description="Get metadata for provided HuBMAP ID")
    parser.add_argument("--url", type=str,
                        help="URL of the API", default=default_url)
    args, _ = parser.parse_known_args()
    api_url = args.url

    # send request; without a timeout a stalled server would hang the notebook
    response = requests.get(api_url, timeout=timeout)

    # fail here with a clear HTTP error instead of handing an error payload
    # back to the caller as if it were entity data
    response.raise_for_status()

    # parse to JSON
    return json.loads(response.text)

# Main 

In [74]:
# HuBMAP IDs to search for
ids = [
    "HBM948.GXMD.986",
    "HBM975.WQQQ.853",
    "HBM456.GRCM.369",
    "HBM762.RPDR.282",
    "HBM578.BDBP.672",
    "HBM675.RVGB.258",
    "HBM929.VSJQ.633",
    "HBM265.FQWZ.384",
    "HBM444.DXLZ.643",
    "HBM468.SSXX.967"
]

# Dictionary to capture the result. Every list must end up with exactly one
# entry per ID, otherwise the columns no longer line up row by row when the
# dictionary is turned into a table.
result = {
    'dataset_id': list(ids),
    'description': [],
    'donor_id': [],
    'dataset_info': [],
    'dataset_type': [],
    'group_name': [],
    'title': [],
    'uuid': []
}

# loop through ids and fetch data for ancestor and entity
for hubmap_id in ids:

    # via ancestors endpoint: keep the first ancestor of type "Donor".
    # Appending once per dataset (None when there is no donor) keeps the donor
    # columns the same length as the others.
    # NOTE(review): assumes one donor per dataset is what is wanted - confirm.
    ancestor_response = call_entity_api("ancestors", hubmap_id)
    donor = next(
        (ancestor for ancestor in ancestor_response
         if ancestor.get('entity_type') == "Donor"),
        None,
    )
    result['description'].append(donor.get('description') if donor else None)
    result['donor_id'].append(donor.get('hubmap_id') if donor else None)

    # via entity endpoint
    entity_response = call_entity_api("entities", hubmap_id)
    result['dataset_info'].append(entity_response['dataset_info'])
    result['dataset_type'].append(entity_response['dataset_type'])
    result['group_name'].append(entity_response['group_name'])
    result['title'].append(entity_response['title'])
    result['uuid'].append(entity_response['uuid'])

pprint(result)

{'dataset_id': ['HBM948.GXMD.986',
                'HBM975.WQQQ.853',
                'HBM456.GRCM.369',
                'HBM762.RPDR.282',
                'HBM578.BDBP.672',
                'HBM675.RVGB.258',
                'HBM929.VSJQ.633',
                'HBM265.FQWZ.384',
                'HBM444.DXLZ.643',
                'HBM468.SSXX.967'],
 'dataset_info': ['salmon_rnaseq_snareseq__d2f1ba23eddf1484f4a794444a6536ee_01160f557b3544a163184137530ededf_7e452891b83ca984a2f210312d90495d_4ea28b66fb08185a484df0fa0b9543dc_eea962dabda0cec78f8f909494e0e775_e8efba8d06360eeba199f195da80f01d__salmon-rnaseq-snareseq',
                  'salmon_rnaseq_snareseq__59da4436e52d9cacc743c1a9bbdd566f_c820345b025ff959e531db797030a399_19274e67c639a003ffe17953fd6b55db_d68eb55a484036b3575043bdecd64a52__salmon-rnaseq-snareseq',
                  'salmon_rnaseq_snareseq__1bb63db033b3f7206a4d8c4ac4db1564_d615cc96a8606bb6819bed2a8d7f24ac_21cfe966008f784c47d0638e80b55ee0_500af39b83bfa9eb27f78746734f3db9__salmo

# Export result

In [75]:
# Write the collected metadata to CSV. The output folder is created first
# because `to_csv` does not create missing directories.
output_path = Path("output/dataset_metadata.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)

df = pd.DataFrame(result)

df.to_csv(output_path, index=False)