## HuBMAP metadata analysis

In [50]:
import requests
import pandas as pd
import time
from tqdm import tqdm
import os
from pathlib import Path
import json

In [51]:
def get_hubmap_metadata(uuid: str, timeout: float = 30.0):
    """Request the HuBMAP portal's JSON document for a single dataset.

    Args:
        uuid: Dataset identifier. If it already ends in ``.json`` it is used
            unchanged; otherwise the ``.json`` suffix is appended.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection raises instead of blocking forever.

    Returns:
        The ``requests.Response`` object. The status code is not inspected
        here; callers decide how to treat non-200 replies.
    """
    # An explicit suffix test replaces the looser substring check.
    resource = uuid if uuid.endswith(".json") else f"{uuid}.json"
    url = f"https://portal.hubmapconsortium.org/browse/dataset/{resource}"
    return requests.get(url, timeout=timeout)

In [52]:
get_hubmap_metadata(uuid="0de3181b777383b7b918d4402021fb34")

<Response [200]>

Fetch the list of all dataset UUIDs (each UUID identifies one dataset JSON document on the portal).

In [6]:
def get_list_of_uuids():
    """Return the dataset UUIDs listed in the HuBMAP portal's datasets TSV.

    The TSV is downloaded on every call. The first data row is dropped before
    the ``uuid`` column is turned into a list (presumably because that row
    holds column descriptions rather than a dataset -- TODO confirm).
    """
    datasets_tsv_url = "https://portal.hubmapconsortium.org/metadata/v0/datasets.tsv"
    datasets_table = pd.read_csv(datasets_tsv_url, sep="\t")
    uuid_column = datasets_table["uuid"]
    return uuid_column.iloc[1:].tolist()

In [7]:
hubmap_uuids = get_list_of_uuids()

In [8]:
# resp = get_hubmap_metadata(uuid=hubmap_uuids[1])
resp = get_hubmap_metadata(uuid="0de3181b777383b7b918d4402021fb34")

In [9]:
for key,value in resp.json().items():
    print(key, type(value))

ancestor_counts <class 'dict'>
ancestor_ids <class 'list'>
ancestors <class 'list'>
contacts <class 'list'>
contains_human_genetic_sequences <class 'bool'>
contributors <class 'list'>
created_by_user_displayname <class 'str'>
created_by_user_email <class 'str'>
created_timestamp <class 'int'>
data_access_level <class 'str'>
data_types <class 'list'>
descendant_counts <class 'dict'>
descendant_ids <class 'list'>
descendants <class 'list'>
description <class 'str'>
display_subtype <class 'str'>
doi_url <class 'str'>
donor <class 'dict'>
entity_type <class 'str'>
files <class 'list'>
group_name <class 'str'>
group_uuid <class 'str'>
hubmap_id <class 'str'>
immediate_ancestors <class 'list'>
immediate_descendants <class 'list'>
index_version <class 'str'>
last_modified_timestamp <class 'int'>
mapped_consortium <class 'str'>
mapped_data_access_level <class 'str'>
mapped_data_types <class 'list'>
mapped_last_modified_timestamp <class 'str'>
mapped_metadata <class 'dict'>
mapped_status <class

In [53]:
resp.json()

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'hits': {'hits': [],
  'max_score': None,
  'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 1}

In [54]:
def get_list_of_uuids():
    """Download the portal's datasets TSV and return its ``uuid`` column as a list.

    The first data row is skipped (presumably a description row rather than a
    real dataset -- TODO confirm against the TSV).
    """
    datasets_tsv_url = "https://portal.hubmapconsortium.org/metadata/v0/datasets.tsv"
    datasets_table = pd.read_csv(datasets_tsv_url, sep="\t")
    uuid_column = datasets_table["uuid"]
    return uuid_column.iloc[1:].tolist()


def get_hubmap_metadata(uuid: str, timeout: float = 30.0):
    """Request the HuBMAP portal's JSON document for a single dataset.

    Args:
        uuid: Dataset identifier. If it already ends in ``.json`` it is used
            unchanged; otherwise the ``.json`` suffix is appended.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection raises instead of blocking forever.

    Returns:
        The ``requests.Response`` object. The status code is not inspected
        here; callers decide how to treat non-200 replies.
    """
    # An explicit suffix test replaces the looser substring check.
    resource = uuid if uuid.endswith(".json") else f"{uuid}.json"
    url = f"https://portal.hubmapconsortium.org/browse/dataset/{resource}"
    return requests.get(url, timeout=timeout)


def get_all_hubmap_dataset_metadata_Sequential(batch_size: int = 100, pause_seconds: float = 5):
    """Fetch portal metadata for every listed dataset, one request at a time.

    Args:
        batch_size: Number of requests between pauses. A value of zero or
            less disables pausing.
        pause_seconds: Length of each pause, in seconds.

    Returns:
        A ``(all_dataset_metadata, failed_fetches)`` tuple:
        ``all_dataset_metadata`` maps each UUID that answered with HTTP 200 to
        its parsed JSON body, and ``failed_fetches`` lists the UUIDs that
        answered with any other status code.
    """
    all_dataset_metadata = {}
    failed_fetches = []
    hubmap_uuids = get_list_of_uuids()

    # Wrap the list itself (not the enumerate object) so tqdm can read its
    # length and show a percentage / ETA instead of a bare iteration count.
    progress = tqdm(hubmap_uuids, desc="Fetching HuBMAP dataset metadata: ")
    for i, uuid in enumerate(progress):
        response = get_hubmap_metadata(uuid=uuid)
        if response.status_code != 200:
            failed_fetches.append(uuid)
        else:
            all_dataset_metadata[uuid] = response.json()

        # Pause after each full batch; i > 0 avoids a pointless sleep before
        # any batch has been fetched.
        if batch_size > 0 and i > 0 and i % batch_size == 0:
            time.sleep(pause_seconds)

    return all_dataset_metadata, failed_fetches

In [55]:
all_dataset_metadata, failed_fetches = get_all_hubmap_dataset_metadata_Sequential()

Fetching HuBMAP dataset metadata: : 1330it [14:10,  1.56it/s]


In [57]:
# Persist the fetched metadata so later cells can reload it without re-crawling.
with open("sample.json", "w") as dump_file:
    json.dump(all_dataset_metadata, dump_file)

In [58]:
failed_fetches

[]

In [64]:
len(all_dataset_metadata.keys())


1330

# HuBMAP data dump exploration

In [11]:
# Reload the metadata dump from sample.json; the name is reset first so a
# failed read cannot leave a stale value behind.
all_dataset_metadata = None
with open("sample.json", "r") as dump_file:
    all_dataset_metadata = json.load(dump_file)

In [16]:
# Map every dataset UUID to the list of top-level keys present in its record.
fields = {
    dataset_uuid: list(record.keys())
    for dataset_uuid, record in all_dataset_metadata.items()
}

In [17]:
# Count in how many dataset records each top-level field name appears.
fields_counter = {}
for field_names in fields.values():
    for name in field_names:
        fields_counter[name] = fields_counter.get(name, 0) + 1

In [18]:
fields_counter

{'anatomy_0': 1131,
 'anatomy_1': 1131,
 'ancestor_counts': 1321,
 'ancestor_ids': 1321,
 'ancestors': 1321,
 'contacts': 848,
 'contains_human_genetic_sequences': 1321,
 'contributors': 919,
 'created_by_user_displayname': 1321,
 'created_by_user_email': 1321,
 'created_timestamp': 1321,
 'data_access_level': 1321,
 'data_types': 1321,
 'dataset_info': 1182,
 'descendant_counts': 1321,
 'descendant_ids': 1321,
 'descendants': 1321,
 'description': 900,
 'display_subtype': 1321,
 'doi_url': 848,
 'donor': 1321,
 'entity_type': 1321,
 'files': 1321,
 'group_name': 1321,
 'group_uuid': 1321,
 'hubmap_id': 1321,
 'immediate_ancestors': 1321,
 'immediate_descendants': 1321,
 'index_version': 1321,
 'lab_dataset_id': 857,
 'last_modified_timestamp': 1321,
 'mapped_consortium': 1321,
 'mapped_data_access_level': 1321,
 'mapped_data_types': 1321,
 'mapped_last_modified_timestamp': 1321,
 'mapped_metadata': 1302,
 'mapped_status': 1321,
 'mapper_metadata': 1321,
 'metadata': 1302,
 'origin_sam

In [None]:
all_dataset_metadata

## HuBMAP restricted Data:
- Search API

In [21]:

# Ask the Search API which indices it exposes (no Authorization header is sent).
BASE_URL = "https://search.api.hubmapconsortium.org/v3/"
endpoint = "indices"
URL = f"{BASE_URL}{endpoint}"
resp = requests.get(URL)

In [22]:
resp.json()

{'indices': ['entities', 'portal', 'hm_antibodies', 'files']}

In [23]:
# Unauthenticated search request: JSON in, JSON out.
headers = {'accept': 'application/json', 'Content-Type': 'application/json'}

# The query is assembled from two named clauses so each condition is readable
# on its own line.
published_donor_clause = {"match_phrase": {"donor.publication_status": "Published"}}
sample_origin_clause = {"match": {"origin_sample.entity_type": "Sample"}}
DSL_Query = {
    "query": {
        "bool": {
            "must": [published_donor_clause],
            "filter": [sample_origin_clause],
        }
    }
}

BASE_URL = "https://search.api.hubmapconsortium.org/v3/"
endpoint = "search"
URL = f"{BASE_URL}{endpoint}"
resp = requests.post(URL, headers=headers, json=DSL_Query)

In [24]:
resp.json()

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'hits': {'hits': [],
  'max_score': None,
  'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 1}

In [25]:
resp

b'{"_shards":{"failed":0,"skipped":0,"successful":5,"total":5},"hits":{"hits":[],"max_score":null,"total":{"relation":"eq","value":0}},"timed_out":false,"took":1}\n'

In [48]:
# The bearer token is read from the environment so the credential is never
# stored in the notebook; a missing variable fails loudly with KeyError.
nexus_token = os.environ["HUBMAP_NEXUS_TOKEN"]
headers = {
    'Authorization': 'Bearer ' + nexus_token
}

# Ask for documents whose donor.publication_status matches "New" or "QA".
DSL_Query = {
  "query": {
    "bool": {
      "should": [
        {
          "match_phrase": {
            "donor.publication_status": "New"
          }
        },
        {
          "match_phrase": {
            "donor.publication_status": "QA"
          }
        },
      ],
    }
  }
}


BASE_URL = "https://search.api.hubmapconsortium.org/v3/"
endpoint = "search"
URL = BASE_URL + endpoint
resp = requests.post(URL, headers=headers, json=DSL_Query)

In [49]:
resp.json()

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'hits': {'hits': [],
  'max_score': None,
  'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 1}

##### Using Entity API

In [76]:
# The bearer token is read from the environment so the credential is never
# stored in the notebook; a missing variable fails loudly with KeyError.
nexus_token = os.environ["HUBMAP_NEXUS_TOKEN"]
headers = {
    'Authorization': 'Bearer ' + nexus_token
}

# Request provenance info for datasets from the Entity API, as JSON.
BASE_URL = "https://entity.api.hubmapconsortium.org/"
endpoint = "datasets/prov-info"
params = {
    "format" : "json"
}
URL = BASE_URL + endpoint
resp = requests.get(URL, headers=headers, params=params)



In [77]:
resp.json()

[{'dataset_created_by_email': 'hubmap@hubmapconsortium.org',
  'dataset_data_types': ['Autofluorescence Microscopy'],
  'dataset_date_time_created': '2022-10-11 17:57:39',
  'dataset_date_time_modified': '2022-10-11 17:57:39',
  'dataset_group_name': 'Vanderbilt TMC',
  'dataset_group_uuid': '73bb26e4-ed43-11e8-8f19-0a7c1eab007a',
  'dataset_hubmap_id': 'HBM858.MFFT.638',
  'dataset_modified_by_email': 'hubmap@hubmapconsortium.org',
  'dataset_portal_url': 'https://portal.hubmapconsortium.org/browse/dataset/34056fbd3adb97f19a653390869ea1de',
  'dataset_status': 'New',
  'dataset_uuid': '34056fbd3adb97f19a653390869ea1de',
  'donor_group_name': ['Vanderbilt TMC'],
  'donor_hubmap_id': ['HBM758.JRSC.348'],
  'donor_submission_id': ['VAN0007'],
  'donor_uuid': ['96a667104f92a38f5d4f97c38d94e738'],
  'first_sample_hubmap_id': ['HBM493.PMZQ.923'],
  'first_sample_portal_url': ['https://portal.hubmapconsortium.org/browse/sample/93b51537a638fb29c7df9584beeecd4b'],
  'first_sample_submission_id

In [91]:
# Unique dataset UUIDs present in the prov-info response.
dt_uuid_set = {entry["dataset_uuid"] for entry in resp.json()}

In [92]:
len(dt_uuid_set)

2137

In [88]:
set(all_dataset_metadata.keys()).__len__()

1330

In [95]:
len(dt_uuid_set.intersection(set(all_dataset_metadata.keys())))

924

In [98]:
len(set(all_dataset_metadata.keys()) - dt_uuid_set)

406

In [102]:
# The bearer token is read from the environment so the credential is never
# stored in the notebook; a missing variable fails loudly with KeyError.
nexus_token = os.environ["HUBMAP_NEXUS_TOKEN"]
headers = {
    'Authorization': 'Bearer ' + nexus_token
}
# NOTE: a set has no stable order, so this picks an arbitrary dataset UUID and
# the selected dataset can differ between kernel sessions.
ID = list(dt_uuid_set)[0]

# Request provenance info for that one dataset.
BASE_URL = "https://entity.api.hubmapconsortium.org/"
endpoint = f"datasets/{ID}/prov-info"
params = {
    "format" : "json",
    "id" : ID,
}
URL = BASE_URL + endpoint
resp = requests.get(URL, headers=headers, params=params)



In [103]:
resp.json()

{'dataset_created_by_email': 'Daniel.cotter@stanford.edu',
 'dataset_data_types': ['CODEX'],
 'dataset_date_time_created': '2022-08-30 20:08:04',
 'dataset_date_time_modified': '2022-08-30 20:08:04',
 'dataset_group_name': 'Stanford TMC',
 'dataset_group_uuid': 'def5fd76-ed43-11e8-b56a-0e8017bdda58',
 'dataset_hubmap_id': 'HBM285.VFDT.966',
 'dataset_modified_by_email': 'Daniel.cotter@stanford.edu',
 'dataset_portal_url': 'https://portal.hubmapconsortium.org/browse/dataset/7034950e109586361e73c1b9ddb81346',
 'dataset_status': 'New',
 'dataset_uuid': '7034950e109586361e73c1b9ddb81346',
 'donor_group_name': ['Stanford TMC'],
 'donor_hubmap_id': ['HBM275.QBCZ.562'],
 'donor_submission_id': ['STAN0014'],
 'donor_uuid': ['142d18f0a749d3d02d8b1addbca15589'],
 'first_sample_hubmap_id': ['HBM438.JGJH.887'],
 'first_sample_portal_url': ['https://portal.hubmapconsortium.org/browse/sample/5058beb6cdf7e9445d3e3d944822762e'],
 'first_sample_submission_id': ['STAN0014-SI-2-1'],
 'first_sample_type':

#### Unpublished

In [104]:
# The bearer token is read from the environment so the credential is never
# stored in the notebook; a missing variable fails loudly with KeyError.
nexus_token = os.environ["HUBMAP_NEXUS_TOKEN"]
headers = {
    'Authorization': 'Bearer ' + nexus_token
}

# List the unpublished datasets visible to this token.
BASE_URL = "https://entity.api.hubmapconsortium.org/"
endpoint = "datasets/unpublished"
params = {
    "format" : "json",
}
URL = BASE_URL + endpoint
resp = requests.get(URL, headers=headers, params=params)



In [106]:
# Number of unpublished datasets returned by the Entity API.
len(resp.json())

1942

In [108]:
unpublished_dataset_uuids = [dt['uuid'] for dt in resp.json()]

In [112]:
unpublished_dataset_uuids[:5]

['9b82e4f2bd429e49ec632c3132d380a5',
 '34056fbd3adb97f19a653390869ea1de',
 'a7446cfb37adfac308eb69ec307dd69a',
 '4176c40f5512a6e8fbc9c6975ddec2b5',
 '2fdbf2be6b297eb1951b11db5b79cadb']

#### Entity fetch

In [121]:
# The bearer token is read from the environment so the credential is never
# stored in the notebook; a missing variable fails loudly with KeyError.
nexus_token = os.environ["HUBMAP_NEXUS_TOKEN"]
headers = {
    'Authorization': 'Bearer ' + nexus_token
}
ID = unpublished_dataset_uuids[1]

# The entity id travels in the URL path; the request carries no query
# parameters, so the previously unused params dict is gone.
BASE_URL = "https://entity.api.hubmapconsortium.org/"
endpoint = f"entities/{ID}"
URL = BASE_URL + endpoint
resp = requests.get(URL, headers=headers)



In [122]:
resp.json()

{'contains_human_genetic_sequences': False,
 'created_by_user_displayname': 'HuBMAP Process',
 'created_by_user_email': 'hubmap@hubmapconsortium.org',
 'created_by_user_sub': '3e7bce63-129d-33d0-8f6c-834b34cd382e',
 'created_timestamp': 1665511059282,
 'data_access_level': 'consortium',
 'data_types': ['AF'],
 'description': 'Autofluorescence Microscopy collected from the left kidney of a 66 year old White male donor\nby the Biomolecular Multimodal Imaging Center (BIOMC) at Vanderbilt University. BIOMIC is a Tissue Mapping Center\nthat is part of the NIH funded Human Biomolecular Atlas Program (HuBMAP). Autofluorescence images were\ncollected with a Carl Zeiss microscopy AxioScan.Z1 using 3 channels/filters --\nDAPI (excitation: 335-383 nm/emission: 420-470 nm), eGFP (excitation: 450-490 nm/emission: 500-550 nm),\nand dsRed (excitation: 538-562 nm/emission: 570-640 nm). Support was provided by the NIH Common Fund\nand National Institute of Diabetes and Digestive and Kidney Diseases (U5

#### Get all unpublished data

In [129]:
def get_unpublished_dataset_metadata(nexus_token, uuid, timeout: float = 30.0):
    """Fetch one entity record from the HuBMAP Entity API.

    Args:
        nexus_token: Bearer token sent in the ``Authorization`` header.
        uuid: Identifier of the entity to fetch; placed in the URL path.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection raises instead of blocking forever.

    Returns:
        The ``requests.Response`` object. The status code is not inspected
        here; callers decide how to treat non-200 replies.
    """
    headers = {
        'Authorization': 'Bearer ' + nexus_token
    }
    url = f"https://entity.api.hubmapconsortium.org/entities/{uuid}"
    return requests.get(url, headers=headers, timeout=timeout)


def get_all_hubmap_unpublished_dataset_uuids(nexus_token, timeout: float = 60.0):
    """Return the UUIDs of the unpublished datasets listed by the Entity API.

    Args:
        nexus_token: Bearer token sent in the ``Authorization`` header.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection raises instead of blocking forever.

    Returns:
        A list with the ``uuid`` value of every item in the JSON response.

    Raises:
        RuntimeError: If the API answers with a status other than 200. An
            error dict used to be returned here, but callers take ``len()`` of
            the result and iterate it as a UUID list, so the failure would
            have been treated as a one-element list of dataset ids.
    """
    headers = {
        'Authorization': 'Bearer ' + nexus_token
    }
    url = "https://entity.api.hubmapconsortium.org/datasets/unpublished"
    resp = requests.get(url, headers=headers, params={"format": "json"}, timeout=timeout)
    if resp.status_code != 200:
        raise RuntimeError(
            f"Unable to fetch unpublished dataset prov-info (HTTP {resp.status_code})."
        )
    return [dt['uuid'] for dt in resp.json()]


def get_all_hubmap_unpublished_dataset_metadata(nexus_token):
    """Fetch the entity record of every unpublished dataset, one at a time.

    Prints the number of unpublished datasets, then requests each one with the
    given token.

    Returns:
        A ``(metadata, failures)`` tuple: ``metadata`` maps each UUID that
        answered with HTTP 200 to its parsed JSON body, and ``failures`` lists
        the UUIDs that answered with any other status code.
    """
    uuids = get_all_hubmap_unpublished_dataset_uuids(nexus_token)
    print(f"Total unpublished datasets : {len(uuids)}")

    metadata_by_uuid, failed_uuids = {}, []
    progress = tqdm(uuids, desc="Fetching unpublished HuBMAP dataset metadata: ")
    for dataset_uuid in progress:
        reply = get_unpublished_dataset_metadata(nexus_token, dataset_uuid)
        if reply.status_code == 200:
            metadata_by_uuid[dataset_uuid] = reply.json()
            continue
        failed_uuids.append(dataset_uuid)

    return metadata_by_uuid, failed_uuids

In [130]:
# The bearer token is read from the environment so the credential is never
# stored in the notebook; a missing variable fails loudly with KeyError.
nexus_token = os.environ["HUBMAP_NEXUS_TOKEN"]
all_unpublished_dataset_metadata, failed_unpublished_fetches = get_all_hubmap_unpublished_dataset_metadata(nexus_token)

Fetching unpublished HuBMAP dataset metadata:   0%|          | 0/1942 [00:00<?, ?it/s]

Total unpublished datasets : 1942


Fetching unpublished HuBMAP dataset metadata: 100%|██████████| 1942/1942 [25:01<00:00,  1.29it/s]


In [None]:
all_unpublished_dataset_metadata

In [134]:
len(all_unpublished_dataset_metadata)

1938

In [133]:
all_unpublished_dataset_metadata['9b82e4f2bd429e49ec632c3132d380a5']

{'contains_human_genetic_sequences': False,
 'created_by_user_displayname': 'HuBMAP Process',
 'created_by_user_email': 'hubmap@hubmapconsortium.org',
 'created_by_user_sub': '3e7bce63-129d-33d0-8f6c-834b34cd382e',
 'created_timestamp': 1665511029046,
 'data_access_level': 'consortium',
 'data_types': ['AF'],
 'description': 'Autofluorescence Microscopy collected from the left kidney of a 66 year old White male donor\nby the Biomolecular Multimodal Imaging Center (BIOMC) at Vanderbilt University. BIOMIC is a Tissue Mapping Center\nthat is part of the NIH funded Human Biomolecular Atlas Program (HuBMAP). Autofluorescence images were\ncollected with a Carl Zeiss microscopy AxioScan.Z1 using 3 channels/filters --\nDAPI (excitation: 335-383 nm/emission: 420-470 nm), eGFP (excitation: 450-490 nm/emission: 500-550 nm),\nand dsRed (excitation: 538-562 nm/emission: 570-640 nm). Support was provided by the NIH Common Fund\nand National Institute of Diabetes and Digestive and Kidney Diseases (U5