## HuBMAP metadata analysis

In [1]:
import requests
import pandas as pd
import time
from tqdm import tqdm
import os
from pathlib import Path
import json

In [2]:
def get_hubmap_metadata(uuid: str, timeout: float = 30.0):
    """Request the portal's JSON record for one HuBMAP dataset.

    Args:
        uuid: Dataset identifier. ``.json`` is appended to the URL unless
            the value already contains the substring ``"json"``.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error.

    Returns:
        The ``requests.Response`` object. The status code is not checked
        here; callers decide how to treat non-200 responses.
    """
    URL = f"https://portal.hubmapconsortium.org/browse/dataset/{uuid}"
    if "json" not in uuid:
        URL = f"{URL}.json"

    # requests applies no timeout by default; without one a single stalled
    # connection would block a long crawl indefinitely.
    response = requests.get(URL, timeout=timeout)
    return response

In [52]:
# Smoke-test the helper with one hard-coded dataset UUID; the cell's value is the Response object.
get_hubmap_metadata(uuid="0de3181b777383b7b918d4402021fb34")

<Response [200]>

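Fetch the list of all dataset UUIDs from the portal's `datasets.tsv` export; each UUID identifies one JSON metadata record.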

In [6]:
def get_list_of_uuids():
    """Return the dataset UUIDs listed in the portal's ``datasets.tsv`` export.

    The tab-separated file is read straight from the portal URL and the
    ``uuid`` column is returned as a plain list, without its first entry.
    """
    datasets_table = pd.read_csv(
        "https://portal.hubmapconsortium.org/metadata/v0/datasets.tsv",
        sep='\t',
    )
    # NOTE(review): the first data row is skipped - presumably it is not a
    # real dataset (e.g. a column-description row); confirm against the TSV.
    return datasets_table['uuid'].iloc[1:].tolist()

In [7]:
# Fetch the dataset UUID list once and keep it in a notebook-level variable.
hubmap_uuids = get_list_of_uuids()

In [8]:
# Alternative kept for reference: use a UUID from the fetched list instead of the fixed one below.
# resp = get_hubmap_metadata(uuid=hubmap_uuids[1])
resp = get_hubmap_metadata(uuid="0de3181b777383b7b918d4402021fb34")

In [9]:
# Print every top-level field of the JSON record next to the Python type of its value.
for key,value in resp.json().items():
    print(key, type(value))

ancestor_counts <class 'dict'>
ancestor_ids <class 'list'>
ancestors <class 'list'>
contacts <class 'list'>
contains_human_genetic_sequences <class 'bool'>
contributors <class 'list'>
created_by_user_displayname <class 'str'>
created_by_user_email <class 'str'>
created_timestamp <class 'int'>
data_access_level <class 'str'>
data_types <class 'list'>
descendant_counts <class 'dict'>
descendant_ids <class 'list'>
descendants <class 'list'>
description <class 'str'>
display_subtype <class 'str'>
doi_url <class 'str'>
donor <class 'dict'>
entity_type <class 'str'>
files <class 'list'>
group_name <class 'str'>
group_uuid <class 'str'>
hubmap_id <class 'str'>
immediate_ancestors <class 'list'>
immediate_descendants <class 'list'>
index_version <class 'str'>
last_modified_timestamp <class 'int'>
mapped_consortium <class 'str'>
mapped_data_access_level <class 'str'>
mapped_data_types <class 'list'>
mapped_last_modified_timestamp <class 'str'>
mapped_metadata <class 'dict'>
mapped_status <class

In [53]:
# NOTE(review): the saved output of this cell is an Elasticsearch-style search result
# with zero hits, not a dataset record - `resp` appears to have been overwritten by a
# later Search API cell before this cell ran. Re-run top to bottom to confirm.
resp.json()

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'hits': {'hits': [],
  'max_score': None,
  'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 1}

In [54]:
def get_list_of_uuids():
    """Return the dataset UUIDs listed in the portal's ``datasets.tsv`` export.

    The tab-separated file is read straight from the portal URL and the
    ``uuid`` column is returned as a plain list, without its first entry.
    """
    datasets_table = pd.read_csv(
        "https://portal.hubmapconsortium.org/metadata/v0/datasets.tsv",
        sep='\t',
    )
    # NOTE(review): the first data row is skipped - presumably it is not a
    # real dataset (e.g. a column-description row); confirm against the TSV.
    return datasets_table['uuid'].iloc[1:].tolist()


def get_hubmap_metadata(uuid: str, timeout: float = 30.0):
    """Request the portal's JSON record for one HuBMAP dataset.

    Args:
        uuid: Dataset identifier. ``.json`` is appended to the URL unless
            the value already contains the substring ``"json"``.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error.

    Returns:
        The ``requests.Response`` object. The status code is not checked
        here; callers decide how to treat non-200 responses.
    """
    URL = f"https://portal.hubmapconsortium.org/browse/dataset/{uuid}"
    if "json" not in uuid:
        URL = f"{URL}.json"

    # requests applies no timeout by default; without one a single stalled
    # connection would block a long crawl indefinitely.
    response = requests.get(URL, timeout=timeout)
    return response


def get_all_hubmap_dataset_metadata_Sequential(pause_every: int = 100, pause_seconds: float = 5):
    """Fetch the portal metadata of every listed dataset, one request at a time.

    Args:
        pause_every: Sleep after every this many requests. ``0`` disables
            the pause.
        pause_seconds: Length of each pause in seconds.

    Returns:
        A tuple ``(all_dataset_metadata, failed_fetches)``: a dict mapping
        UUID to the decoded JSON record, and the list of UUIDs whose request
        raised, returned a non-200 status, or did not contain valid JSON.
    """
    all_dataset_metadata = {}
    failed_fetches = []
    hubmap_uuids = get_list_of_uuids()

    # Passing `total` lets tqdm draw a real progress bar with an ETA;
    # enumerate() alone hides the length from it.
    progress = tqdm(
        enumerate(hubmap_uuids, start=1),
        total=len(hubmap_uuids),
        desc="Fetching HuBMAP dataset metadata: ",
    )
    for count, uuid in progress:
        try:
            response = get_hubmap_metadata(uuid=uuid)
            if response.status_code != 200:
                failed_fetches.append(uuid)
            else:
                all_dataset_metadata[uuid] = response.json()
        except (requests.RequestException, ValueError):
            # One bad request must not discard the results collected so far.
            failed_fetches.append(uuid)

        # Counting from 1 makes the pause fire after each full batch,
        # not right after the very first request.
        if pause_every and count % pause_every == 0:
            time.sleep(pause_seconds)

    return all_dataset_metadata, failed_fetches

In [55]:
# Run the full sequential crawl; yields the UUID -> metadata dict and the list of failed UUIDs.
all_dataset_metadata, failed_fetches = get_all_hubmap_dataset_metadata_Sequential()

Fetching HuBMAP dataset metadata: : 1330it [14:10,  1.56it/s]


In [57]:
# Persist the crawled metadata as JSON so later sections can reload it without re-fetching.
with open("sample.json", "w") as outfile:
    outfile.write(json.dumps(all_dataset_metadata))

In [58]:
# UUIDs collected as failures during the crawl (an empty list means every request succeeded).
failed_fetches

[]

In [64]:
# Number of datasets for which metadata was stored.
len(all_dataset_metadata)


1330

# HuBMAP data dump exploration

In [11]:
# Reload the metadata dump written earlier; the name is reset first so a failed read
# cannot leave a stale value behind.
all_dataset_metadata = None
with open("sample.json", "r") as f:
    all_dataset_metadata = json.loads(f.read())

In [16]:
# Map each dataset UUID to the list of top-level field names present in its record.
fields = {
    dataset_uuid: list(record.keys())
    for dataset_uuid, record in all_dataset_metadata.items()
}

In [17]:
# Count in how many dataset records each top-level field name occurs.
fields_counter = {}
for k, v in fields.items():
    for feature in v:
        fields_counter[feature] = fields_counter.get(feature, 0) + 1

In [18]:
# Show the per-field occurrence counts; fields with a lower count are missing from some records.
fields_counter

{'anatomy_0': 1131,
 'anatomy_1': 1131,
 'ancestor_counts': 1321,
 'ancestor_ids': 1321,
 'ancestors': 1321,
 'contacts': 848,
 'contains_human_genetic_sequences': 1321,
 'contributors': 919,
 'created_by_user_displayname': 1321,
 'created_by_user_email': 1321,
 'created_timestamp': 1321,
 'data_access_level': 1321,
 'data_types': 1321,
 'dataset_info': 1182,
 'descendant_counts': 1321,
 'descendant_ids': 1321,
 'descendants': 1321,
 'description': 900,
 'display_subtype': 1321,
 'doi_url': 848,
 'donor': 1321,
 'entity_type': 1321,
 'files': 1321,
 'group_name': 1321,
 'group_uuid': 1321,
 'hubmap_id': 1321,
 'immediate_ancestors': 1321,
 'immediate_descendants': 1321,
 'index_version': 1321,
 'lab_dataset_id': 857,
 'last_modified_timestamp': 1321,
 'mapped_consortium': 1321,
 'mapped_data_access_level': 1321,
 'mapped_data_types': 1321,
 'mapped_last_modified_timestamp': 1321,
 'mapped_metadata': 1302,
 'mapped_status': 1321,
 'mapper_metadata': 1321,
 'metadata': 1302,
 'origin_sam

In [None]:
# NOTE(review): this displays the entire metadata dict; consider inspecting a single
# record instead to keep the notebook output small.
all_dataset_metadata

## HuBMAP restricted Data:
- Search API

In [21]:

# Ask the Search API for its indices endpoint.
BASE_URL = "https://search.api.hubmapconsortium.org/v3/"
endpoint = "indices"
URL = f"{BASE_URL}{endpoint}"
resp = requests.get(URL)

In [22]:
# Decoded JSON body of the indices request.
resp.json()

{'indices': ['entities', 'portal', 'hm_antibodies', 'files']}

In [23]:
# Unauthenticated search: donor publication status must match the phrase "Published",
# filtered to records whose origin sample has entity type "Sample".
headers = {'accept': 'application/json', 'Content-Type': 'application/json'}

DSL_Query = {
    "query": {
        "bool": {
            "must": [
                {"match_phrase": {"donor.publication_status": "Published"}},
            ],
            "filter": [
                {"match": {"origin_sample.entity_type": "Sample"}},
            ],
        }
    }
}

BASE_URL = "https://search.api.hubmapconsortium.org/v3/"
endpoint = "search"
URL = f"{BASE_URL}{endpoint}"
resp = requests.post(URL, headers=headers, json=DSL_Query)

In [24]:
# Decoded search response; the saved output reports zero hits for this query.
resp.json()

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'hits': {'hits': [],
  'max_score': None,
  'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 1}

In [25]:
# NOTE(review): the saved output is a bytes literal of the response body, which a bare
# `resp` would not display - the cell was presumably `resp.content` when it last ran.
resp

b'{"_shards":{"failed":0,"skipped":0,"successful":5,"total":5},"hits":{"hits":[],"max_score":null,"total":{"relation":"eq","value":0}},"timed_out":false,"took":1}\n'

In [48]:
# Read the bearer token from the environment so no credential is ever typed into
# (or committed with) the notebook. Set HUBMAP_NEXUS_TOKEN before starting Jupyter.
nexus_token = os.environ["HUBMAP_NEXUS_TOKEN"]
headers = {
    'Authorization': 'Bearer ' + nexus_token
}

# Authenticated search with two `should` clauses: donor publication status "New" or "QA".
DSL_Query = {
    "query": {
        "bool": {
            "should": [
                {"match_phrase": {"donor.publication_status": "New"}},
                {"match_phrase": {"donor.publication_status": "QA"}},
            ],
        }
    }
}

BASE_URL = "https://search.api.hubmapconsortium.org/v3/"
endpoint = "search"
URL = BASE_URL + endpoint
resp = requests.post(URL, headers=headers, json=DSL_Query)

In [49]:
# Decoded response of the authenticated search; the saved output again reports zero hits.
resp.json()

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'hits': {'hits': [],
  'max_score': None,
  'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 1}

##### Using Entity API

In [76]:
# Read the bearer token from the environment so no credential is ever typed into
# (or committed with) the notebook. Set HUBMAP_NEXUS_TOKEN before starting Jupyter.
nexus_token = os.environ["HUBMAP_NEXUS_TOKEN"]
headers = {
    'Authorization': 'Bearer ' + nexus_token
}

# Request provenance info for all datasets from the Entity API, as JSON.
BASE_URL = "https://entity.api.hubmapconsortium.org/"
endpoint = "datasets/prov-info"
params = {
    "format": "json"
}
URL = BASE_URL + endpoint
resp = requests.get(URL, headers=headers, params=params)



In [None]:
# Decoded prov-info response (a sequence of per-dataset records, as the next cell iterates it).
resp.json()

In [91]:
# Unique dataset UUIDs present in the prov-info response.
dt_uuid_set = {prov_record["dataset_uuid"] for prov_record in resp.json()}

In [92]:
# Number of distinct dataset UUIDs in the prov-info response.
len(dt_uuid_set)

2137

In [88]:
# Number of distinct dataset UUIDs in the portal metadata dump.
len(set(all_dataset_metadata.keys()))

1330

In [95]:
# How many UUIDs appear in both the prov-info response and the portal dump.
len(dt_uuid_set & set(all_dataset_metadata.keys()))

924

In [98]:
# How many UUIDs from the portal dump are absent from the prov-info response.
len(set(all_dataset_metadata.keys()).difference(dt_uuid_set))

406

In [102]:
# SECURITY: a real bearer token used to be hard-coded here. Read it from the
# environment instead, and revoke the token that was previously committed.
nexus_token = os.environ["HUBMAP_NEXUS_TOKEN"]
headers = {
    'Authorization': 'Bearer ' + nexus_token
}
# Sets are unordered, so this picks an arbitrary dataset UUID as the example.
ID = list(dt_uuid_set)[0]

# Request provenance info for that single dataset.
BASE_URL = "https://entity.api.hubmapconsortium.org/"
endpoint = f"datasets/{ID}/prov-info"
params = {
    "format": "json",
    "id": ID,
}
URL = BASE_URL + endpoint
resp = requests.get(URL, headers=headers, params=params)



In [None]:
# Decoded prov-info response for the single dataset requested above.
resp.json()

#### Unpublished

In [104]:
# SECURITY: a real bearer token used to be hard-coded here. Read it from the
# environment instead, and revoke the token that was previously committed.
nexus_token = os.environ["HUBMAP_NEXUS_TOKEN"]
headers = {
    'Authorization': 'Bearer ' + nexus_token
}

# Request the Entity API's list of unpublished datasets, as JSON.
BASE_URL = "https://entity.api.hubmapconsortium.org/"
endpoint = "datasets/unpublished"
params = {
    "format": "json",
}
URL = BASE_URL + endpoint
resp = requests.get(URL, headers=headers, params=params)



In [106]:
# Count the unpublished datasets in the response. The saved output of this cell is the
# integer 1942, which `[resp.json()]` cannot produce; `len(...)` is the expression that
# matches it.
len(resp.json())

1942

In [108]:
# Collect the UUID of every record in the unpublished-datasets response.
unpublished_dataset_uuids = [entry['uuid'] for entry in resp.json()]

In [None]:
# Preview the first five unpublished dataset UUIDs.
unpublished_dataset_uuids[:5]

#### Entity fetch

In [121]:
# SECURITY: a real bearer token used to be hard-coded here. Read it from the
# environment instead, and revoke the token that was previously committed.
nexus_token = os.environ["HUBMAP_NEXUS_TOKEN"]
headers = {
    'Authorization': 'Bearer ' + nexus_token
}
ID = unpublished_dataset_uuids[1]

# Fetch the full entity record for one unpublished dataset. The UUID is carried in
# the URL path; the former `params` dict was never passed to the request and has
# been dropped.
BASE_URL = "https://entity.api.hubmapconsortium.org/"
endpoint = f"entities/{ID}"
URL = BASE_URL + endpoint
resp = requests.get(URL, headers=headers)

In [None]:
# Decoded entity record for the dataset requested above.
resp.json()

#### Get all unpublished data

In [129]:
def get_unpublished_dataset_metadata(nexus_token, uuid, timeout=30.0):
    """Request one entity record from the HuBMAP Entity API.

    Args:
        nexus_token: Bearer token sent in the ``Authorization`` header.
        uuid: Identifier placed in the ``entities/{uuid}`` path.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error.

    Returns:
        The ``requests.Response`` object; the status code is not checked here.
    """
    # A literal token previously sat in a comment here; credentials must never
    # appear in source, not even commented out.
    headers = {
        'Authorization': 'Bearer ' + nexus_token
    }

    BASE_URL = "https://entity.api.hubmapconsortium.org/"
    endpoint = f"entities/{uuid}"
    URL = BASE_URL + endpoint
    # requests applies no timeout by default; bound the wait so one stalled
    # connection cannot hang a crawl over thousands of datasets.
    resp = requests.get(URL, headers=headers, timeout=timeout)
    return resp


def get_all_hubmap_unpublished_dataset_uuids(nexus_token):
    """List the UUIDs of all unpublished datasets known to the Entity API.

    Args:
        nexus_token: Bearer token sent in the ``Authorization`` header.

    Returns:
        A list of UUID strings when the request returns HTTP 200. On any
        other status a dict with a single ``"error"`` key is returned
        instead, so callers must check the type before iterating.
    """
    auth_headers = {'Authorization': 'Bearer ' + nexus_token}
    listing_url = "https://entity.api.hubmapconsortium.org/" + "datasets/unpublished"
    listing = requests.get(listing_url, headers=auth_headers, params={"format": "json"})

    # Guard clause: anything but 200 yields the error dict, not a list.
    if listing.status_code != 200:
        return {"error": "Unable to fetch unpublished dataset prov-info."}
    return [dt['uuid'] for dt in listing.json()]


def get_all_hubmap_unpublished_dataset_metadata(nexus_token):
    """Fetch the entity record of every unpublished dataset, one request at a time.

    Args:
        nexus_token: Bearer token forwarded to the Entity API helpers.

    Returns:
        A tuple ``(all_unpublished_dataset_metadata, failed_unpublished_fetches)``:
        a dict mapping UUID to the decoded JSON record, and the list of UUIDs
        whose request raised, returned a non-200 status, or was not valid JSON.

    Raises:
        RuntimeError: If the list of unpublished UUIDs could not be fetched.
    """
    unpublished_dataset_uuids = get_all_hubmap_unpublished_dataset_uuids(nexus_token)
    # The UUID helper reports failure by returning an error dict. Iterating that
    # dict would treat the key "error" as a dataset UUID, so stop here instead.
    if isinstance(unpublished_dataset_uuids, dict):
        raise RuntimeError(
            unpublished_dataset_uuids.get("error", "Unable to fetch unpublished dataset UUIDs.")
        )
    print(f"Total unpublished datasets : {len(unpublished_dataset_uuids)}")

    all_unpublished_dataset_metadata = {}
    failed_unpublished_fetches = []

    for uuid in tqdm(unpublished_dataset_uuids, desc="Fetching unpublished HuBMAP dataset metadata: "):
        try:
            resp = get_unpublished_dataset_metadata(nexus_token, uuid)
            if resp.status_code != 200:
                failed_unpublished_fetches.append(uuid)
            else:
                all_unpublished_dataset_metadata[uuid] = resp.json()
        except (requests.RequestException, ValueError):
            # One bad request must not discard the results collected so far.
            failed_unpublished_fetches.append(uuid)

    return all_unpublished_dataset_metadata, failed_unpublished_fetches

In [130]:
# SECURITY: a real bearer token used to be hard-coded here. Read it from the
# environment instead, and revoke the token that was previously committed.
nexus_token = os.environ["HUBMAP_NEXUS_TOKEN"]
all_unpublished_dataset_metadata, failed_unpublished_fetches = get_all_hubmap_unpublished_dataset_metadata(nexus_token)

Fetching unpublished HuBMAP dataset metadata:   0%|          | 0/1942 [00:00<?, ?it/s]

Total unpublished datasets : 1942


Fetching unpublished HuBMAP dataset metadata: 100%|██████████| 1942/1942 [25:01<00:00,  1.29it/s]


In [None]:
# NOTE(review): this displays every fetched record; consider inspecting a single UUID
# instead to keep the notebook output small.
all_unpublished_dataset_metadata

In [134]:
# Number of records fetched successfully; compare with the printed total to see how many failed.
len(all_unpublished_dataset_metadata)

1938

In [None]:
# Inspect the record of one specific dataset by its UUID.
all_unpublished_dataset_metadata['9b82e4f2bd429e49ec632c3132d380a5']

# Parallel fetching

In [3]:
import asyncio
from aiohttp import ClientSession

import nest_asyncio
nest_asyncio.apply()

In [20]:
async def fetch(url, session):
    """GET ``url`` through ``session`` and return the decoded JSON body.

    The HTTP status is not inspected; whatever body the server sends is
    decoded and returned.
    """
    async with session.get(url) as response:
        payload = await response.json()
    return payload


async def bound_fetch(sem, url, session):
    """Run ``fetch`` while holding ``sem``, so the semaphore caps concurrent requests."""
    async with sem:
        result = await fetch(url, session)
    return result
    
    
def get_all_hubmap_unpublished_dataset_uuids(nexus_token):
    """List the UUIDs of all unpublished datasets known to the Entity API.

    Args:
        nexus_token: Bearer token sent in the ``Authorization`` header.

    Returns:
        A list of UUID strings when the request returns HTTP 200. On any
        other status a dict with a single ``"error"`` key is returned
        instead, so callers must check the type before iterating.
    """
    auth_headers = {'Authorization': 'Bearer ' + nexus_token}
    listing_url = "https://entity.api.hubmapconsortium.org/" + "datasets/unpublished"
    listing = requests.get(listing_url, headers=auth_headers, params={"format": "json"})

    # Guard clause: anything but 200 yields the error dict, not a list.
    if listing.status_code != 200:
        return {"error": "Unable to fetch unpublished dataset prov-info."}
    return [dt['uuid'] for dt in listing.json()]


async def get_multiple_unpublished_dataset_metadata(unpublished_dataset_uuids, max_requests_per_sec, nexus_token=None):
    """Fetch the entity records of several datasets concurrently.

    Args:
        unpublished_dataset_uuids: UUIDs to request from ``entities/{uuid}``.
        max_requests_per_sec: Size of the semaphore, i.e. the maximum number
            of requests in flight at once. Despite the name, no per-second
            rate is enforced.
        nexus_token: Bearer token for the ``Authorization`` header. When
            omitted, the notebook-level ``nexus_token`` variable is used, as
            the function did before this parameter existed.

    Returns:
        A list with one decoded JSON body per UUID, in input order. HTTP
        status codes are not checked by ``fetch``.
    """
    if nexus_token is None:
        # Backward compatibility: the token used to be read from the global scope.
        nexus_token = globals()["nexus_token"]
    headers = {
        'Authorization': 'Bearer ' + nexus_token
    }
    sem = asyncio.Semaphore(max_requests_per_sec)

    async with ClientSession(headers=headers) as session:
        tasks = [
            asyncio.ensure_future(
                bound_fetch(sem, f"https://entity.api.hubmapconsortium.org/entities/{uuid}", session)
            )
            for uuid in unpublished_dataset_uuids
        ]
        return await asyncio.gather(*tasks)


def get_all_hubmap_unpublished_dataset_metadata_parallel(nexus_token, max_requests_per_sec=50):
    """Fetch every unpublished dataset's entity record in concurrent batches.

    Args:
        nexus_token: Bearer token for the Entity API.
        max_requests_per_sec: Batch size, and the number of requests allowed
            in flight at once within a batch.

    Returns:
        A list of decoded JSON bodies, one per unpublished dataset UUID.

    Raises:
        RuntimeError: If the list of unpublished UUIDs could not be fetched.
    """
    unpublished_dataset_uuids = get_all_hubmap_unpublished_dataset_uuids(nexus_token)
    # The UUID helper reports failure by returning an error dict. Slicing or
    # iterating that dict as if it were a UUID list would fail or mislead.
    if isinstance(unpublished_dataset_uuids, dict):
        raise RuntimeError(
            unpublished_dataset_uuids.get("error", "Unable to fetch unpublished dataset UUIDs.")
        )
    print(f"Total unpublished datasets : {len(unpublished_dataset_uuids)}")

    all_unpublished_dataset_metadata = []
    loop = asyncio.get_event_loop()

    for i in tqdm(range(0, len(unpublished_dataset_uuids), max_requests_per_sec), desc="Fetching unpublished data parallel"):
        batch = unpublished_dataset_uuids[i:i + max_requests_per_sec]
        # Pass the caller's token explicitly so the coroutine never depends on
        # whatever the global `nexus_token` happens to hold.
        future = asyncio.ensure_future(
            get_multiple_unpublished_dataset_metadata(batch, max_requests_per_sec, nexus_token=nexus_token)
        )
        all_unpublished_dataset_metadata += loop.run_until_complete(future)

    return all_unpublished_dataset_metadata

In [21]:
%%time 

nexus_token = "Ag7paXD5MqbJ0na8VMvNa09an5N3e8GQGK0xVNr32ppk34bXawHWC27Gxkk2W7JP9qq6BVBxb9bw86Cno2apBtBxvr"
all_unpublished_dataset_metadata = get_all_hubmap_unpublished_dataset_metadata_parallel(nexus_token, max_requests_per_sec=200)

Fetching unpublished data parallel:   0%|          | 0/10 [00:00<?, ?it/s]

Total unpublished datasets : 1949


Fetching unpublished data parallel: 100%|██████████| 10/10 [02:05<00:00, 12.51s/it]

CPU times: user 4.78 s, sys: 1.5 s, total: 6.27 s
Wall time: 2min 10s





In [12]:
# Number of response bodies collected by the parallel crawl.
len(all_unpublished_dataset_metadata)

1948

In [16]:
# Index the list of records by their 'uuid' field for direct lookup.
all_unpublished_dataset_metadata_dict = {
    record['uuid']: record
    for record in all_unpublished_dataset_metadata
}

In [None]:
# Inspect the record of one specific dataset by its UUID.
all_unpublished_dataset_metadata_dict['9b82e4f2bd429e49ec632c3132d380a5']

In [36]:
from datetime import datetime

# Build a sortable timestamp string: year_month_day_hour_minute_second.
now = datetime.now()

# `%m` is the month; the previous `%M` in the second position repeated the minute,
# so the "date" part of the stamp was wrong.
current_time = now.strftime("%Y_%m_%d_%H_%M_%S")
print("Current Time =", current_time)

Current Time = 2023_10_01_12_10_19
