# A notebook to get metadata for 10 HuBMAP datasets

# Import libraries

In [6]:
import argparse
import json
from pathlib import Path
from pprint import pprint

import pandas as pd
import requests

# Define functions

In [7]:
def call_entity_api(endpoint: str, hubmap_id: str, timeout: float = 30.0):
    """Call the HuBMAP Entity API for one ID and return the parsed JSON body.

    Args:
        endpoint (str): Entity API endpoint to call (this notebook uses
            "ancestors" and "entities").
        hubmap_id (str): A HuBMAP ID, e.g. "HBM948.GXMD.986".
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout a stalled connection would hang forever.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    # Default URL for this endpoint/ID pair
    default_url = f"https://entity.api.hubmapconsortium.org/{endpoint}/{hubmap_id}"

    # A `--url` command-line argument, when present, overrides the default
    # URL. parse_known_args() is used so that any other arguments on the
    # command line are ignored instead of causing an error.
    parser = argparse.ArgumentParser(
        description="Get metadata for provided HuBMAP ID")
    parser.add_argument("--url", type=str,
                        help="URL of the API", default=default_url)
    args, _ = parser.parse_known_args()

    # Send the request and fail loudly on an HTTP error, so an error payload
    # is never handed back to the caller as if it were entity data
    response = requests.get(args.url, timeout=timeout)
    response.raise_for_status()

    # parse to JSON
    return json.loads(response.text)

# Main: collect donor and dataset metadata for each ID

In [8]:
# HuBMAP IDs to search for
ids = [
    "HBM948.GXMD.986",
    "HBM975.WQQQ.853",
    "HBM456.GRCM.369",
    "HBM762.RPDR.282",
    "HBM578.BDBP.672",
    "HBM675.RVGB.258",
    "HBM929.VSJQ.633",
    "HBM265.FQWZ.384",
    "HBM444.DXLZ.643",
    "HBM468.SSXX.967"
]

# Dictionary to capture the result: one list per output column.
# Every column receives exactly one value per dataset inside the loop below,
# so the lists always have equal length and stay aligned row by row.
result = {
    'dataset_id': [],
    'description': [],
    'donor_id': [],
    'dataset_info': [],
    'dataset_type': [],
    'group_name': [],
    'title': [],
    'uuid': []
}

# Fields copied as-is from the entity record of each dataset
ENTITY_FIELDS = ['dataset_info', 'dataset_type', 'group_name', 'title', 'uuid']

# loop through ids and fetch data for ancestor and entity
for dataset_id in ids:
    result['dataset_id'].append(dataset_id)

    # via ancestors endpoint: use the first ancestor of type "Donor".
    # If a dataset has no Donor ancestor, None is recorded instead of
    # skipping the row, which would shift later donors onto the wrong dataset.
    ancestor_response = call_entity_api("ancestors", dataset_id)
    donor = next(
        (ancestor for ancestor in ancestor_response
         if ancestor['entity_type'] == "Donor"),
        None,
    )
    result['description'].append(donor.get('description') if donor else None)
    result['donor_id'].append(donor['hubmap_id'] if donor else None)

    # via entity endpoint
    entity_response = call_entity_api("entities", dataset_id)
    for field in ENTITY_FIELDS:
        result[field].append(entity_response[field])

# convert to DataFrame
df = pd.DataFrame(result)

df

Unnamed: 0,dataset_id,description,donor_id,dataset_info,dataset_type,group_name,title,uuid
0,HBM948.GXMD.986,37 yr old black male with intracranial hemorrh...,HBM943.SCQQ.877,salmon_rnaseq_snareseq__d2f1ba23eddf1484f4a794...,RNAseq [Salmon],University of California San Diego TMC,RNAseq [Salmon] data from the lung (right) of ...,dd648e4648238b25a7d1062669ea89af
1,HBM975.WQQQ.853,37 yr old black male with intracranial hemorrh...,HBM943.SCQQ.877,salmon_rnaseq_snareseq__59da4436e52d9cacc743c1...,RNAseq [Salmon],University of California San Diego TMC,RNAseq [Salmon] data from the lung (right) of ...,176edb4b0e16059522f6f087576fbeec
2,HBM456.GRCM.369,37 yr old black male with intracranial hemorrh...,HBM943.SCQQ.877,salmon_rnaseq_snareseq__1bb63db033b3f7206a4d8c...,RNAseq [Salmon],University of California San Diego TMC,RNAseq [Salmon] data from the lung (right) of ...,a39fed027b51d97f83cec90c63c44744
3,HBM762.RPDR.282,37 yr old black male with intracranial hemorrh...,HBM943.SCQQ.877,salmon_rnaseq_10x_sn__a8652e9e3c545e61e1ffe9d5...,RNAseq [Salmon],University of California San Diego TMC,RNAseq [Salmon] data from the lung (right) of ...,e81c9c9753998b2ca69e8bceadf2409d
4,HBM578.BDBP.672,37 yr old black male with intracranial hemorrh...,HBM943.SCQQ.877,salmon_rnaseq_10x_sn__8e5c8f0cc61aad4fcbc5cc11...,RNAseq [Salmon],University of California San Diego TMC,RNAseq [Salmon] data from the lung (right) of ...,237b8f24c23821690519af862cf25a9f
5,HBM675.RVGB.258,37 yr old black male with intracranial hemorrh...,HBM943.SCQQ.877,salmon_rnaseq_10x_sn__26b642ddbae00e7ff6570ddd...,RNAseq [Salmon],University of California San Diego TMC,RNAseq [Salmon] data from the lung (right) of ...,171bd6114652ce2a1ff0e4757b5eaef9
6,HBM929.VSJQ.633,37 yr old black male with intracranial hemorrh...,HBM943.SCQQ.877,salmon_rnaseq_10x_sn__3b1490026022f850e4d3c3fb...,RNAseq [Salmon],University of California San Diego TMC,RNAseq [Salmon] data from the lung (right) of ...,37ccf28cb5c06e8ba076f587ee4e50be
7,HBM265.FQWZ.384,37 yr old black male with intracranial hemorrh...,HBM943.SCQQ.877,salmon_rnaseq_10x_sn__0736735768692d6ca0cd9614...,RNAseq [Salmon],University of California San Diego TMC,RNAseq [Salmon] data from the lung (right) of ...,d0c1bd09443c301413f44f7108d20e08
8,HBM444.DXLZ.643,37 yr old black male with intracranial hemorrh...,HBM943.SCQQ.877,salmon_rnaseq_snareseq__11c92e3b876f5f79cd46a9...,RNAseq [Salmon],University of California San Diego TMC,RNAseq [Salmon] data from the lung (right) of ...,2dac500177620d78ca174c2c1d36a6c8
9,HBM468.SSXX.967,37 yr old black male with intracranial hemorrh...,HBM943.SCQQ.877,salmon_rnaseq_10x_sn__68e6dfa4807ca615883f73a5...,RNAseq [Salmon],University of California San Diego TMC,RNAseq [Salmon] data from the lung (right) of ...,0b590c9e3a62178da592e85572e2f1bf


# Export result

In [9]:
# Create the output directory first so the export also works on a fresh
# checkout where "output/" does not exist yet
output_path = Path("output/dataset_metadata.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)