In [1]:
# Imports
import os
import sys
import json

import asyncio
from aiohttp import ClientSession
import requests

from tqdm import tqdm
import time
from datetime import datetime

from config import *
import pandas as pd

import nest_asyncio
nest_asyncio.apply()

In [2]:

async def fetch(url, session):
    """GET ``url`` through ``session`` and return the decoded JSON body.

    ``session`` must offer ``get(url)`` usable as an async context manager
    whose response object exposes an awaitable ``json()``.
    """
    async with session.get(url) as resp:
        payload = await resp.json()
        return payload

async def bound_fetch(sem, url, session):
    """Run ``fetch(url, session)`` while holding ``sem``.

    The semaphore is acquired before the request starts and released once the
    decoded result is available, which caps how many fetches run at once.
    """
    async with sem:
        result = await fetch(url, session)
        return result
    
def get_all_hubmap_dataset_uuids(nexus_token):
    """Collect the UUIDs of published and unpublished HuBMAP datasets.

    Published UUIDs are read from the portal's ``datasets.tsv`` export;
    unpublished UUIDs come from the entity API's ``datasets/unpublished``
    listing, which is called with ``nexus_token`` as a bearer token.

    Parameters
    ----------
    nexus_token : str
        Token placed in the ``Authorization: Bearer ...`` header.

    Returns
    -------
    dict
        ``{"published_uuids": [...], "unpublished_uuids": [...]}``.

    Raises
    ------
    requests.HTTPError
        If the unpublished-datasets request answers with a 4xx/5xx status.
    """
    # Published data
    hubmap_metadata_general_url = "https://portal.hubmapconsortium.org/metadata/v0/datasets.tsv"
    df_raw = pd.read_csv(hubmap_metadata_general_url, delimiter='\t')
    # The first data row is skipped on purpose by the slice below.
    # NOTE(review): presumably that row is a column-description row - confirm.
    published_dataset_uuids = df_raw['uuid'][1:].tolist()

    # Unpublished data
    headers = {
        'Authorization': 'Bearer ' + nexus_token
    }
    params = {
        "format" : "json",
    }

    BASE_URL = "https://entity.api.hubmapconsortium.org/"
    endpoint = "datasets/unpublished"
    URL = BASE_URL + endpoint
    resp = requests.get(URL, headers=headers, params=params)
    # Surface an expired/invalid token (or any server error) as an HTTPError
    # here, instead of failing later while iterating over an error payload.
    resp.raise_for_status()

    unpublished_dataset_uuids = [dt['uuid'] for dt in resp.json()]

    return {
        "published_uuids" : published_dataset_uuids,
        "unpublished_uuids" : unpublished_dataset_uuids,
    }

##################################################################################################################################
##################################################################################################################################
# Published data

async def fetch_published_dataset_metadata_parallel(published_dataset_uuids, max_requests_per_sec, nexus_token):
    """Fetch the portal JSON document of every UUID in the batch concurrently.

    One ``ClientSession`` is shared by all requests. ``max_requests_per_sec``
    sizes the semaphore, so it limits how many requests are in flight at once;
    no time-based throttling happens here. ``nexus_token`` is accepted but not
    used by this function.

    Returns the list of decoded JSON bodies produced by ``asyncio.gather``.
    """
    limiter = asyncio.Semaphore(max_requests_per_sec)

    async with ClientSession() as session:
        pending = [
            asyncio.ensure_future(
                bound_fetch(
                    limiter,
                    f"https://portal.hubmapconsortium.org/browse/dataset/{uuid}.json",
                    session,
                )
            )
            for uuid in published_dataset_uuids
        ]
        # Awaited inside the ``async with`` so the session stays open until
        # every request has finished.
        return await asyncio.gather(*pending)


def get_published_hubmap_metadata_parallel(nexus_token, published_uuids, max_requests_per_sec):
    """Download metadata for all published UUIDs, one batch at a time.

    ``published_uuids`` is cut into consecutive slices of
    ``max_requests_per_sec`` items. Each slice is fetched concurrently and the
    event loop is run until that slice completes before the next one starts.

    Returns a flat list with the results of every batch, in batch order.
    """
    collected = []
    batch_starts = range(0, len(published_uuids), max_requests_per_sec)

    for start in tqdm(batch_starts, desc="Fetching published data parallel"):
        batch = published_uuids[start:start + max_requests_per_sec]
        loop = asyncio.get_event_loop()
        batch_future = asyncio.ensure_future(
            fetch_published_dataset_metadata_parallel(batch, max_requests_per_sec, nexus_token)
        )
        collected.extend(loop.run_until_complete(batch_future))

    return collected


##################################################################################################################################
##################################################################################################################################
# Unpublished data

async def fetch_unpublished_dataset_metadata_parallel(unpublished_dataset_uuids, max_requests_per_sec, nexus_token):
    """Fetch the entity-API record of every UUID in the batch concurrently.

    All requests share one ``ClientSession`` that carries ``nexus_token`` as a
    bearer token. ``max_requests_per_sec`` sizes the semaphore, so it limits
    how many requests are in flight at once; no time-based throttling happens
    here.

    Returns the list of decoded JSON bodies produced by ``asyncio.gather``.
    """
    auth_headers = {'Authorization': 'Bearer ' + nexus_token}
    limiter = asyncio.Semaphore(max_requests_per_sec)

    async with ClientSession(headers=auth_headers) as session:
        pending = [
            asyncio.ensure_future(
                bound_fetch(
                    limiter,
                    f"https://entity.api.hubmapconsortium.org/entities/{uuid}",
                    session,
                )
            )
            for uuid in unpublished_dataset_uuids
        ]
        # Awaited inside the ``async with`` so the session stays open until
        # every request has finished.
        return await asyncio.gather(*pending)


def get_unpublished_hubmap_metadata_parallel(nexus_token, unpublished_uuids, max_requests_per_sec):
    """Download metadata for all unpublished UUIDs, one batch at a time.

    ``unpublished_uuids`` is cut into consecutive slices of
    ``max_requests_per_sec`` items. Each slice is fetched concurrently and the
    event loop is run until that slice completes before the next one starts.

    Returns a flat list with the results of every batch, in batch order.
    """
    collected = []
    batch_starts = range(0, len(unpublished_uuids), max_requests_per_sec)

    for start in tqdm(batch_starts, desc="Fetching unpublished data parallel"):
        batch = unpublished_uuids[start:start + max_requests_per_sec]
        loop = asyncio.get_event_loop()
        batch_future = asyncio.ensure_future(
            fetch_unpublished_dataset_metadata_parallel(batch, max_requests_per_sec, nexus_token)
        )
        collected.extend(loop.run_until_complete(batch_future))

    return collected

In [9]:

def get_all_hubmap_metadata(filename, max_requests_per_sec, nexus_token):
    """Fetch metadata for every published and unpublished dataset.

    The UUID lists are resolved first, then each list is downloaded in
    batches, and finally each result list is indexed by the record's
    ``'uuid'`` field.

    ``filename`` is accepted but not used: nothing is written to disk here.

    Returns
    -------
    dict
        ``{"published_metadata": {uuid: record},
        "unpublished_metadata": {uuid: record}}``.
    """
    uuids_by_status = get_all_hubmap_dataset_uuids(nexus_token)

    published_records = get_published_hubmap_metadata_parallel(
        nexus_token, uuids_by_status["published_uuids"], max_requests_per_sec
    )
    unpublished_records = get_unpublished_hubmap_metadata_parallel(
        nexus_token, uuids_by_status["unpublished_uuids"], max_requests_per_sec
    )

    # Records sharing a uuid collapse to the last one seen.
    return {
        "published_metadata": {record['uuid']: record for record in published_records},
        "unpublished_metadata": {record['uuid']: record for record in unpublished_records},
    }

In [10]:
# Download every dataset's metadata; all three settings come from `configs`.
all_dataset_metadata = get_all_hubmap_metadata(
    configs["output_filename"],
    configs["max_requests_per_sec"],
    configs["nexus_token"],
)

Fetching published data parallel: 100%|██████████| 27/27 [00:48<00:00,  1.81s/it]
Fetching unpublished data parallel: 100%|██████████| 39/39 [02:13<00:00,  3.42s/it]


In [14]:
# Prints the number of published records, then the number of distinct keys.
# Dict keys are unique by construction, so the two lines always agree.
print(
    len(all_dataset_metadata['published_metadata']),
    len(set(all_dataset_metadata['published_metadata'])),
    sep="\n",
)

1330
1330


In [15]:
# Prints the number of unpublished records, then the number of distinct keys.
# Dict keys are unique by construction, so the two lines always agree.
print(
    len(all_dataset_metadata['unpublished_metadata']),
    len(set(all_dataset_metadata['unpublished_metadata'])),
    sep="\n",
)

1946
1946


# Data for CTPOP Paper

#### “dataset-id”: { “dataset-data-access-level”: “”, “sample-id”: “”, “sample-category”: “”, “sample-data-access-level”: “”, “rui-location”: “” }


### Devin's Pseudo Code

##### For Private Datasets (unpublished) - Steps 1 and 4
##### For Public Datasets (published) - all steps

```
    1.												
    If the dataset-id has an “ancestor” attribute, find the FIRST ancestor entity in the “ancestor_id” list with:
    “entity_type” = “Sample”
    a “rui_location” attribute, 

    then return its “hubmap_id”, “rui_location”, and “sample_category” values

    2.													
    Otherwise, find the FIRST ancestor entity in the “ancestor_id” list that has all of the following:
    “entity_type” = “Sample”
    “sample_category” = “Block”

    then return its “hubmap_id” and “sample_category” values

    3.													
    Otherwise, find the LAST ancestor entity in the “ancestor_id” list that has all of the following:
    “entity_type” = “Sample”
    “sample_category” DOES NOT EQUAL (!=) “Organ”

    then return its “hubmap_id” and “sample_category” values

    4.													
    Else, return NULL
```



In [25]:
# Load the previously saved metadata dumps.
# NOTE(review): no visible cell writes these two files - confirm how they are
# produced before relying on a fresh "Restart & Run All".
# The files are opened explicitly as UTF-8 so that decoding does not depend on
# the platform's default locale encoding.

# Private Datasets:
with open("hubmap_metadata_unpublished.json", "r", encoding="utf-8") as f:
    unpublished_metadata = json.load(f)

# Public Datasets:
with open("hubmap_metadata_published.json", "r", encoding="utf-8") as f:
    published_metadata = json.load(f)

In [22]:
# Show the key of the last entry in the unpublished metadata.
list(unpublished_metadata)[-1]

'9442fd8dcb0040b929ef1c048bf181ef'

In [45]:
def get_entites(uuid):
    """Return the decoded JSON answer of the entity API's ``entities/{uuid}``.

    The request is authenticated with ``configs['nexus_token']`` as a bearer
    token. The HTTP status is not inspected; whatever JSON body the server
    sends back is returned as-is.
    """
    auth_headers = {'Authorization': 'Bearer ' + configs['nexus_token']}
    entity_url = f"https://entity.api.hubmapconsortium.org/entities/{uuid}"
    return requests.get(entity_url, headers=auth_headers).json()

def get_ancestors(uuid):
    """Return the decoded JSON answer of the entity API's ``ancestors/{uuid}``.

    The request is authenticated with ``configs['nexus_token']`` as a bearer
    token.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status (for example when the token
        has expired part-way through a long run).
    """
    headers = {
        'Authorization': 'Bearer ' + configs['nexus_token']
    }
    url = f"https://entity.api.hubmapconsortium.org/ancestors/{uuid}"
    resp = requests.get(url, headers=headers)
    # Callers iterate over the result and call ``.get`` on every item. An error
    # payload would make that fail with an unrelated AttributeError, so the
    # HTTP failure is reported here instead.
    resp.raise_for_status()
    return resp.json()

#### Unpublished datasets

In [79]:
# Unpublished datasets:
# Only CASE 1 of the lookup is applied here; anything that does not match keeps
# the blank defaults (CASE 4).
unpublished = {}
for uuid, metadata in tqdm(unpublished_metadata.items(), desc="Populating : "):
    entry = {
        "dataset-data-access-level": metadata["data_access_level"],
        "sample-id": "",
        "sample-category": "",
        "sample-data-access-level": "",
        "rui-location": "",
    }
    unpublished[uuid] = entry

    # Without a "direct_ancestors" field no ancestors request is made.
    if "direct_ancestors" not in metadata:
        continue

    # CASE 1: first ancestor that is a Sample and carries a "rui_location".
    ancestors = get_ancestors(uuid)
    for ancestor in ancestors:
        is_sample = ancestor.get("entity_type", "") == "Sample"
        if is_sample and "rui_location" in ancestor:
            # "hubmap-id" is not among the defaults, so only matched entries
            # end up with that key.
            entry.update({
                "sample-id": ancestor.get("lab_tissue_sample_id", ""),
                "sample-category": ancestor.get("sample_category", ""),
                "sample-data-access-level": ancestor.get("data_access_level", ""),
                "hubmap-id": ancestor.get("hubmap_id", ""),
                "rui-location": ancestor.get("rui_location", ""),
            })
            break

    # CASE 4 - No updates.
    


Populating : 100%|██████████| 1946/1946 [28:08<00:00,  1.15it/s]


In [80]:
# Display the whole mapping built for the unpublished datasets.
unpublished

{'9b82e4f2bd429e49ec632c3132d380a5': {'dataset-data-access-level': 'consortium',
  'sample-id': '117',
  'sample-category': 'block',
  'sample-data-access-level': 'consortium',
  'rui-location': {'@context': 'https://hubmapconsortium.github.io/hubmap-ontology/ccf-context.jsonld',
   '@id': 'http://purl.org/ccf/0.5/004c977d-dc98-472b-b02e-4209f8b41387',
   '@type': 'SpatialEntity',
   'ccf_annotations': [],
   'creation_date': '2/12/2020 10:12:17 AM',
   'creator': 'Jeff Spraggins',
   'creator_first_name': 'Jeff',
   'creator_last_name': 'Spraggins',
   'dimension_units': 'millimeter',
   'label': 'SpatialEntity for Female, Age 66, BMI 32.2',
   'placement': {'@context': 'https://hubmapconsortium.github.io/hubmap-ontology/ccf-context.jsonld',
    '@id': 'http://purl.org/ccf/0.5/004c977d-dc98-472b-b02e-4209f8b41387_placement',
    '@type': 'SpatialPlacement',
    'placement_date': '2/12/2020 10:12:17 AM',
    'rotation_order': 'XYZ',
    'rotation_units': 'degree',
    'scaling_units': 

#### Published datasets

In [77]:
# Published datasets:
# CASES 1-3 are tried in priority order; the first one that matches fills the
# entry, and if none matches the blank defaults stay (CASE 4).
published = {}
for uuid, metadata in tqdm(published_metadata.items(), desc="Populating : "):
    entry = {
        "dataset-data-access-level": metadata["data_access_level"],
        "sample-id": "",
        "sample-category": "",
        "sample-data-access-level": "",
        "rui-location": "",
    }
    published[uuid] = entry

    # Without an "ancestors" field no ancestors request is made.
    if "ancestors" not in metadata:
        continue

    ancestors = get_ancestors(uuid)

    # Each ``else`` below belongs to its ``for`` loop: it runs only when that
    # loop finished without hitting ``break``, i.e. when the case did not match.

    # CASE 1: first Sample ancestor that carries a "rui_location".
    for ancestor in ancestors:
        if ancestor.get("entity_type", "") == "Sample" and "rui_location" in ancestor:
            entry.update({
                "sample-id": ancestor.get("lab_tissue_sample_id", ""),
                "sample-category": ancestor.get("sample_category", ""),
                "sample-data-access-level": ancestor.get("data_access_level", ""),
                "hubmap-id": ancestor.get("hubmap_id", ""),
                "rui-location": ancestor.get("rui_location", ""),
            })
            break
    else:
        # CASE 2: first Sample ancestor whose category is "block".
        # NOTE(review): "sample-category" stays blank here although the written
        # steps say this case returns it - confirm this is intended.
        for ancestor in ancestors:
            if (
                ancestor.get("entity_type", "") == "Sample"
                and ancestor.get("sample_category", "").lower() == "block"
            ):
                entry.update({
                    "sample-id": ancestor.get("lab_tissue_sample_id", ""),
                    "hubmap-id": ancestor.get("hubmap_id", ""),
                })
                break
        else:
            # CASE 3: last Sample ancestor whose category is not "organ".
            # NOTE(review): "sample-id" stays blank here, unlike CASE 2 -
            # confirm this is intended.
            for ancestor in ancestors[::-1]:
                if (
                    ancestor.get("entity_type", "") == "Sample"
                    and ancestor.get("sample_category", "").lower() != "organ"
                ):
                    entry.update({
                        "hubmap-id": ancestor.get("hubmap_id", ""),
                        "sample-category": ancestor.get("sample_category", ""),
                    })
                    break

    # CASE 4:
    # Pass



                





Populating : 100%|██████████| 1330/1330 [11:37<00:00,  1.91it/s]


In [78]:
# Display the whole mapping built for the published datasets.
published

{'ba41e71358136f6a202114681a217a95': {'dataset-data-access-level': 'public',
  'sample-id': 'W101 heart, LV',
  'sample-category': 'block',
  'sample-data-access-level': 'public',
  'rui-location': {'@context': 'https://hubmapconsortium.github.io/hubmap-ontology/ccf-context.jsonld',
   '@id': 'http://purl.org/ccf/1.5/191180d6-5e32-4978-af11-25ec2ceb52bd',
   '@type': 'SpatialEntity',
   'ccf_annotations': ['http://purl.obolibrary.org/obo/UBERON_0002084'],
   'creation_date': '2022-02-16',
   'creator': 'Andreas Bueckle',
   'creator_first_name': 'Andreas',
   'creator_last_name': 'Bueckle',
   'dimension_units': 'millimeter',
   'placement': {'@context': 'https://hubmapconsortium.github.io/hubmap-ontology/ccf-context.jsonld',
    '@id': 'http://purl.org/ccf/1.5/191180d6-5e32-4978-af11-25ec2ceb52bd_placement',
    '@type': 'SpatialPlacement',
    'placement_date': '2022-02-16',
    'rotation_order': 'XYZ',
    'rotation_units': 'degree',
    'scaling_units': 'ratio',
    'target': 'http