In [2]:
import pandas as pd
import requests
import re
import os
import warnings
from tqdm import tqdm

# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and data-quality
# warnings are hidden too — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")
# Show all columns when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)


In [9]:
def load_data(path):
    """Read the CSV at ``path`` and keep only its 'HuBMAP ID' column.

    Rows whose ID is missing are discarded and the index is renumbered
    from 0, so the result is a single-column DataFrame of usable IDs.
    """
    table = pd.read_csv(path)
    ids_only = table[['HuBMAP ID']]
    present = ids_only.dropna()
    return present.reset_index(drop=True)

def create_directory(directory):
    """Create ``directory`` (including missing parents) if the path is absent.

    Nothing happens when the path already exists. A confirmation line is
    printed only when this call found the path missing.

    Args:
        directory: Path of the directory to create.
    """
    if os.path.exists(directory):
        return
    # exist_ok=True: something else may create the directory between the
    # check above and this call; that race should not raise FileExistsError.
    os.makedirs(directory, exist_ok=True)
    print(f"Created directory: {directory}")

def get_last_uuid(hubmap_id, timeout=10):
    """Query the HuBMAP entity API for ``hubmap_id`` and return the last UUID in the reply.

    The response body is scanned as text for every ``"uuid": "<hex>"`` pair
    and the final match is returned.

    Args:
        hubmap_id: Identifier appended to the entity API URL.
        timeout: Seconds to wait for the HTTP request (default 10).

    Returns:
        The last matched UUID string, or None when the request fails, the
        status is not 200, or the body contains no UUID.
    """
    base_url = "https://entity.api.hubmapconsortium.org/entities/"
    url = f"{base_url}{hubmap_id}"
    try:
        r = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network-level failure (timeout, DNS, connection reset, ...): treat
        # as "no UUID". Any other exception is a programming error and is
        # allowed to surface instead of being recorded as a missing value.
        return None

    if r.status_code != 200:
        return None

    # NOTE(review): this takes the LAST "uuid" found anywhere in the body,
    # not the top-level "uuid" field of the parsed JSON — confirm that the
    # last occurrence is really the one wanted.
    uuids = re.findall(r'"uuid"\s*:\s*"([a-f0-9\-]+)"', r.text)
    return uuids[-1] if uuids else None


In [10]:
# Input crosswalk table.
# NOTE(review): absolute, user-specific path — the notebook only runs as-is
# on a machine where this file exists.
csv_path = "/u/sbdubey/crosswalks.csv"
# Keep just the non-null 'HuBMAP ID' column (see load_data).
df = load_data(csv_path)
df.head()


Unnamed: 0,HuBMAP ID
0,HBM235.VKNJ.237
1,HBM238.GTNW.259
2,HBM242.LSCK.393
3,HBM284.SBPR.357
4,HBM285.VFDT.966


In [12]:
# One API lookup per HuBMAP ID, in row order; tqdm wraps the column so the
# progress bar advances as each ID is consumed.
df['last_uuid'] = list(
    map(get_last_uuid, tqdm(df['HuBMAP ID'], desc="Fetching UUIDs"))
)


Fetching UUIDs:   0%|          | 0/64 [00:00<?, ?it/s]

Fetching UUIDs: 100%|██████████| 64/64 [00:28<00:00,  2.23it/s]


In [13]:
# Persist the HuBMAP ID -> last_uuid table.
# NOTE(review): absolute, user-specific output path.
out_path = "/u/sbdubey/hubmap_uuid_map.csv"
# index=False: the row index is not written as a column.
df.to_csv(out_path, index=False)
print(f"Saved: {out_path}")
df.head()


Saved: /u/sbdubey/hubmap_uuid_map.csv


Unnamed: 0,HuBMAP ID,last_uuid
0,HBM235.VKNJ.237,c8ee4bd8f052b50fab2e9e9365793374
1,HBM238.GTNW.259,a7ecf4fd858a58dcaa3bab5df455065b
2,HBM242.LSCK.393,a702c94500a2f737b722e34f6df7e7dd
3,HBM284.SBPR.357,a9d4471b41cd0d8185e12dd6c5a48e96
4,HBM285.VFDT.966,7034950e109586361e73c1b9ddb81346


# Extract the Descendants

In [5]:
# Reload the ID -> UUID table from the CSV written by the save cell above,
# replacing the in-memory `df`. After this round-trip, rows whose lookup
# returned None hold NaN in 'last_uuid'.
# NOTE(review): this cell's execution count (5) is lower than the cells
# around it — re-run the notebook top to bottom to confirm the order works.
df = pd.read_csv('/u/sbdubey/hubmap_uuid_map.csv')

In [21]:
def fetch_json(uuid):
    """Download the portal's JSON description of the dataset ``uuid``.

    Args:
        uuid: Dataset UUID, interpolated into the portal URL.

    Returns:
        The decoded JSON body on HTTP 200. None when ``uuid`` is missing
        (None or NaN), the request fails, the status is not 200, or the
        body is not valid JSON.
    """
    # A missing ID is None or float NaN (NaN is the only value that is not
    # equal to itself). Skip the request instead of fetching ".../nan.json".
    if uuid is None or (isinstance(uuid, float) and uuid != uuid):
        return None

    url = f"https://portal.hubmapconsortium.org/browse/dataset/{uuid}.json"
    try:
        r = requests.get(url, timeout=10)
        return r.json() if r.status_code == 200 else None
    except (requests.RequestException, ValueError):
        # RequestException: network-level failure.
        # ValueError: body could not be decoded as JSON.
        # Anything else (including KeyboardInterrupt) is deliberately not
        # caught, so a long fetch loop can still be interrupted.
        return None


In [22]:
def extract_fields(js):
    """Summarise the descendant information in a dataset JSON payload.

    Args:
        js: Decoded JSON object for a dataset, or None when the fetch failed.

    Returns:
        A dict with two keys:
          * "total_descendants": sum of the values under
            ``descendant_counts.entity_type``; 0 when that section is absent
            or null; None when it is present but not a mapping.
          * "first_descendant_id": first element of ``descendant_ids``, or
            None when the list is absent, null or empty.
        Both values are None when ``js`` is not a JSON object.
    """
    if not isinstance(js, dict):
        return {"total_descendants": None, "first_descendant_id": None}

    # `or {}` also covers a key that is present with a JSON null value,
    # which `.get(key, {})` alone would pass through as None.
    counts_section = js.get("descendant_counts") or {}
    if isinstance(counts_section, dict):
        counts = counts_section.get("entity_type", {})
    else:
        counts = None
    total = sum(counts.values()) if isinstance(counts, dict) else None

    desc_ids = js.get("descendant_ids")
    has_ids = isinstance(desc_ids, (list, tuple)) and len(desc_ids) > 0
    first_id = desc_ids[0] if has_ids else None

    return {
        "total_descendants": total,
        "first_descendant_id": first_id
    }


In [23]:
# For every UUID (in row order): download its portal JSON and reduce it to
# the two descendant summary fields. One dict per row is collected.
results = [
    extract_fields(fetch_json(uid))
    for uid in tqdm(df["last_uuid"], desc="Fetching Descendants")
]

# Build the frame once from the collected records, then preview it.
desc_df = pd.DataFrame(results)
desc_df.head()


Fetching Descendants: 100%|██████████| 64/64 [00:10<00:00,  5.85it/s]


Unnamed: 0,total_descendants,first_descendant_id
0,6,10c0c11280c00f324259fe38e2291ee4
1,6,8b21db2002a5179b03532d183a4885eb
2,6,6bdd149dc47782aefdd0e23599708183
3,6,550f3ef14b113c24fd21b8b0750bf078
4,6,aa54aad994ca8a64fa52b3f3945c01b7


In [24]:
# Place the descendant columns next to the ID/UUID columns. axis=1 aligns
# rows by index label, so this relies on df and desc_df sharing the same index.
final_df = pd.concat([df, desc_df], axis=1)

# NOTE(review): absolute, user-specific output path.
out_path = "/u/sbdubey/hubmap_descendants_map.csv"
final_df.to_csv(out_path, index=False)

print(f"Saved: {out_path}")
final_df.head()


Saved: /u/sbdubey/hubmap_descendants_map.csv


Unnamed: 0,HuBMAP ID,last_uuid,total_descendants,first_descendant_id
0,HBM235.VKNJ.237,c8ee4bd8f052b50fab2e9e9365793374,6,10c0c11280c00f324259fe38e2291ee4
1,HBM238.GTNW.259,a7ecf4fd858a58dcaa3bab5df455065b,6,8b21db2002a5179b03532d183a4885eb
2,HBM242.LSCK.393,a702c94500a2f737b722e34f6df7e7dd,6,6bdd149dc47782aefdd0e23599708183
3,HBM284.SBPR.357,a9d4471b41cd0d8185e12dd6c5a48e96,6,550f3ef14b113c24fd21b8b0750bf078
4,HBM285.VFDT.966,7034950e109586361e73c1b9ddb81346,6,aa54aad994ca8a64fa52b3f3945c01b7


In [None]:
# Display only the fully populated rows (any row containing a missing value
# is dropped). The result is not assigned, so final_df itself is unchanged.
# NOTE(review): if the intent was to also drop empty-string cells, convert
# them to NaN first — confirm what this cell was meant to show.
final_df.dropna()

In [20]:
# Display up to the first 64 rows of final_df.
# NOTE(review): the saved output under this cell lists columns
# (descendants_raw, descendant_uuids, num_descendants) that none of the
# cells above create — it looks like stale output from an earlier revision;
# re-run to refresh it.
final_df.head(64)

Unnamed: 0,HuBMAP ID,last_uuid,descendants_raw,descendant_uuids,num_descendants
0,HBM235.VKNJ.237,c8ee4bd8f052b50fab2e9e9365793374,"[{'entity_type': 'Dataset', 'uuid': '10c0c1128...","[10c0c11280c00f324259fe38e2291ee4, 77ab3588032...",6
1,HBM238.GTNW.259,a7ecf4fd858a58dcaa3bab5df455065b,"[{'entity_type': 'Dataset', 'uuid': '8b21db200...","[8b21db2002a5179b03532d183a4885eb, 77ab3588032...",6
2,HBM242.LSCK.393,a702c94500a2f737b722e34f6df7e7dd,"[{'entity_type': 'Dataset', 'uuid': '6bdd149dc...","[6bdd149dc47782aefdd0e23599708183, 77ab3588032...",6
3,HBM284.SBPR.357,a9d4471b41cd0d8185e12dd6c5a48e96,"[{'entity_type': 'Dataset', 'uuid': '550f3ef14...","[550f3ef14b113c24fd21b8b0750bf078, 77ab3588032...",6
4,HBM285.VFDT.966,7034950e109586361e73c1b9ddb81346,"[{'entity_type': 'Dataset', 'uuid': 'aa54aad99...","[aa54aad994ca8a64fa52b3f3945c01b7, 77ab3588032...",6
...,...,...,...,...,...
59,HBM945.FSHR.864,11b4f413984624557c2b1a4566b0d71a,"[{'entity_type': 'Dataset', 'uuid': 'dceadbb36...","[dceadbb36871071f30c308ca091fbdc8, aea510e37bc...",7
60,HBM953.KMTG.758,262419be2d6fc1fbbbdcbf9645d7b11a,"[{'entity_type': 'Dataset', 'uuid': 'a9afd0984...","[a9afd0984321fcd25e72bc049f434a89, 77ab3588032...",6
61,HBM964.FPNH.767,59f7fa785f9dbd46f45758e22454e912,"[{'entity_type': 'Dataset', 'uuid': '9d8677a79...","[9d8677a799797c76a8205988cd3888b2, aea510e37bc...",7
62,HBM974.CNWK.327,f7b14f3298c0676d8cc5b127e72c8bc7,"[{'entity_type': 'Dataset', 'uuid': 'b38730b26...","[b38730b2633e0b088619f9bcd514ba13, 77ab3588032...",6
