# Method 1: (preferred approach)

**This method uses the Descendants API, which I think only works if you are a HuBMAP member. If you are not, use Method 2 (the Entity API) instead.**

In [29]:
import pandas as pd
import requests
import json
import os
from tqdm import tqdm

In [30]:
def load_data(path):
    """Read the CSV at ``path`` and return only its 'HuBMAP ID' column.

    Rows whose 'HuBMAP ID' is missing are dropped and the index is renumbered
    from 0, so the result is a one-column DataFrame.
    """
    id_column = pd.read_csv(path)[['HuBMAP ID']]
    return id_column.dropna().reset_index(drop=True)

# Load the spleen ID list; load_data keeps only the non-null 'HuBMAP ID' column.
input_df = load_data("/u/sbdubey/deepcell_spleen.csv")
# Preview the first rows.
input_df.head()


Unnamed: 0,HuBMAP ID
0,HBM244.TJLK.223
1,HBM267.BZKT.867
2,HBM337.FSXL.564
3,HBM342.FSLD.938
4,HBM355.JDLK.244


In [31]:
# SECURITY: never paste a bearer token into the notebook. The token that used
# to be hardcoded here must be treated as leaked and revoked.
# Export HUBMAP_TOKEN in the shell that launches Jupyter instead.
TOKEN = os.environ.get("HUBMAP_TOKEN", "")
BASE = "https://entity.api.hubmapconsortium.org/descendants/"

headers = {
    "Content-Type": "application/json"
}
if TOKEN:
    headers["Authorization"] = f"Bearer {TOKEN}"
# Without a token the request is sent unauthenticated rather than with an empty
# "Bearer " credential. NOTE(review): presumably only public data is readable
# that way -- confirm against the Entity API documentation.

def fetch_descendants(hubmap_id):
    """Fetch the descendants of a HuBMAP entity from the descendants endpoint.

    Sends a GET request to ``BASE + hubmap_id`` using the module-level
    ``headers`` and a 10-second timeout.

    Args:
        hubmap_id: Identifier appended verbatim to ``BASE``.

    Returns:
        The decoded JSON body on HTTP 200. ``None`` for any other status code,
        for a network/timeout failure, or when the body is not valid JSON.
    """
    url = BASE + hubmap_id
    try:
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code != 200:
            return None
        return r.json()
    except (requests.RequestException, ValueError):
        # Best-effort lookup: a failed request or an undecodable body counts as
        # "no data". Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # genuine programming errors propagate, so the loop can be interrupted.
        return None


In [32]:
def extract_codex_ids(
    descendants_json,
    dataset_type="CODEX [Cytokit + SPRM]",
    status="Published",
    modified_by="Karl Burke",
):
    """Return the HuBMAP IDs of descendants that match the given filters.

    Args:
        descendants_json: Decoded descendants response; expected to be a list
            of dicts. Anything that is not a list yields ``[]``.
        dataset_type: Required value of each entry's ``dataset_type``.
        status: Required value of each entry's ``status``.
        modified_by: Required value of ``last_modified_user_displayname``.
            The default is the condition the spleen dataset required; other
            datasets may need a different value.

        Passing ``None`` for any of the three filters disables that filter.

    Returns:
        A list of ``hubmap_id`` strings, in input order. Entries that are not
        dicts, or that have no string ``hubmap_id``, are skipped so the result
        can always be passed to ``", ".join(...)``.
    """
    if not isinstance(descendants_json, list):
        return []

    wanted = {
        "dataset_type": dataset_type,
        "status": status,
        "last_modified_user_displayname": modified_by,
    }
    # Only filters with a concrete value take part in the comparison.
    active = {key: value for key, value in wanted.items() if value is not None}

    result = []
    for entry in descendants_json:
        if not isinstance(entry, dict):
            continue
        if any(entry.get(key) != value for key, value in active.items()):
            continue
        hubmap_id = entry.get("hubmap_id")
        if isinstance(hubmap_id, str) and hubmap_id:
            result.append(hubmap_id)
    return result


In [33]:
# Look up each input ID's descendants and keep the matching CODEX IDs:
# one output row per input ID, with None when nothing matched or the fetch failed.
output_rows = []

for source_id in tqdm(input_df['HuBMAP ID'], desc="Processing"):
    payload = fetch_descendants(source_id)
    matches = extract_codex_ids(payload) if payload else []
    found = ", ".join(matches) if matches else None
    output_rows.append({"Input_HuBMAP_ID": source_id, "Found_CODEX_IDs": found})

output_df = pd.DataFrame(output_rows)
output_df.head()


Processing: 100%|██████████| 25/25 [00:12<00:00,  1.96it/s]


Unnamed: 0,Input_HuBMAP_ID,Found_CODEX_IDs
0,HBM244.TJLK.223,HBM339.LBCC.963
1,HBM267.BZKT.867,HBM946.WMTC.283
2,HBM337.FSXL.564,HBM279.RTXC.523
3,HBM342.FSLD.938,HBM496.ZJFC.554
4,HBM355.JDLK.244,HBM626.KXRZ.238


In [34]:
# Display the full mapping table, not just the head() preview.
output_df

Unnamed: 0,Input_HuBMAP_ID,Found_CODEX_IDs
0,HBM244.TJLK.223,HBM339.LBCC.963
1,HBM267.BZKT.867,HBM946.WMTC.283
2,HBM337.FSXL.564,HBM279.RTXC.523
3,HBM342.FSLD.938,HBM496.ZJFC.554
4,HBM355.JDLK.244,HBM626.KXRZ.238
5,HBM374.LLKS.325,HBM443.TZCQ.232
6,HBM389.PKHL.936,HBM863.FDNH.844
7,HBM427.SMGB.866,HBM898.LWCS.878
8,HBM432.LLCF.677,HBM573.GQRD.788
9,HBM498.TCSV.345,HBM455.PWQW.883


In [35]:
# Persist the input-ID -> CODEX-descendant mapping and report the path that was
# actually written (the old message named a different file).
spleen_out_path = "descendant_hubmapID_spleen_01.csv"
output_df.to_csv(spleen_out_path, index=False)
print(f"Saved: {spleen_out_path}")

Saved: codex_results.csv


**Check whether the files are present in the `processed_ross` folder (I did not run this for spleen)**

In [18]:
# Check whether each Input_HuBMAP_ID has a matching file in the processed_ross folder
import glob
import os

# The part of each file name before the first '_' is taken as its ID
processed_files = glob.glob("dataset_hiskers/processed_ross/*.csv")
processed_ids = [os.path.basename(path).split('_')[0] for path in processed_files]

results = []
for input_id in output_df['Input_HuBMAP_ID']:
    # Convert ID format: HBM496.ZJFC.554 -> HBM496ZJFC554
    id_no_dots = input_id.replace('.', '')
    in_processed = id_no_dots in processed_ids

    # The file name is only reported when a file with that ID prefix was found
    if in_processed:
        processed_filename = f"{id_no_dots}_deepcell_population.csv"
    else:
        processed_filename = None

    results.append({
        'Input_HuBMAP_ID': input_id,
        'In_Processed_Ross': in_processed,
        'Processed_Filename': processed_filename
    })

check_input_df = pd.DataFrame(results)
print(f"Total Input IDs: {len(check_input_df)}")
print(f"Found in processed_ross: {check_input_df['In_Processed_Ross'].sum()}")
check_input_df

Total Input IDs: 64
Found in processed_ross: 0


Unnamed: 0,Input_HuBMAP_ID,In_Processed_Ross,Processed_Filename
0,HBM235.VKNJ.237,False,
1,HBM238.GTNW.259,False,
2,HBM242.LSCK.393,False,
3,HBM284.SBPR.357,False,
4,HBM285.VFDT.966,False,
...,...,...,...
59,HBM945.FSHR.864,False,
60,HBM953.KMTG.758,False,
61,HBM964.FPNH.767,False,
62,HBM974.CNWK.327,False,


# Method 2: using Entity API (open to all) 
**Note: I did not try this method on this data**

In [1]:
import pandas as pd
import requests
import re
import os
import warnings
from tqdm import tqdm

# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Let pandas display all columns of wide frames instead of truncating them.
pd.set_option('display.max_columns', None)


In [2]:
def load_data(path):
    """Load the CSV at ``path`` and keep just the non-null 'HuBMAP ID' values.

    Returns a one-column DataFrame whose index is renumbered from 0.
    """
    table = pd.read_csv(path)
    ids = table[['HuBMAP ID']].dropna()
    return ids.reset_index(drop=True)

def create_directory(directory):
    """Create ``directory`` (with any missing parents) unless the path exists.

    Prints a confirmation line only when the directory is actually created.
    """
    if os.path.exists(directory):
        return
    os.makedirs(directory)
    print(f"Created directory: {directory}")

def get_last_uuid(hubmap_id):
    """Return the last ``"uuid"`` value found in the entity response text.

    Requests ``/entities/<hubmap_id>`` without authentication and scans the
    raw response text with a regex instead of parsing the JSON.

    Returns:
        The last matched uuid string, or ``None`` when the status is not 200,
        no uuid is found, or any exception occurs.
    """
    base_url = "https://entity.api.hubmapconsortium.org/entities/"
    try:
        response = requests.get(f"{base_url}{hubmap_id}", timeout=10)
        if response.status_code != 200:
            return None
        matches = re.findall(r'"uuid"\s*:\s*"([a-f0-9\-]+)"', response.text)
        if not matches:
            return None
        return matches[-1]
    except Exception:
        return None


In [4]:
# Input for Method 2: the intestine ID list (load_data keeps only the non-null 'HuBMAP ID' column).
csv_path = "/u/sbdubey/deepcell_intestine.csv"
df = load_data(csv_path)
df.head()


Unnamed: 0,HuBMAP ID
0,HBM235.VKNJ.237
1,HBM238.GTNW.259
2,HBM242.LSCK.393
3,HBM284.SBPR.357
4,HBM285.VFDT.966


In [5]:
# Resolve one UUID per HuBMAP ID (one HTTP request each); get_last_uuid returns None for failed lookups.
df['last_uuid'] = [
    get_last_uuid(hid) for hid in tqdm(df['HuBMAP ID'], desc="Fetching UUIDs")
]


Fetching UUIDs: 100%|██████████| 64/64 [01:13<00:00,  1.15s/it]


In [6]:
# Save the HuBMAP ID -> UUID mapping to CSV (without the index column).
out_path = "/u/sbdubey/hubmap_uuid_map.csv"
df.to_csv(out_path, index=False)
print(f"Saved: {out_path}")
df.head()


Saved: /u/sbdubey/hubmap_uuid_map.csv


Unnamed: 0,HuBMAP ID,last_uuid
0,HBM235.VKNJ.237,c8ee4bd8f052b50fab2e9e9365793374
1,HBM238.GTNW.259,a7ecf4fd858a58dcaa3bab5df455065b
2,HBM242.LSCK.393,a702c94500a2f737b722e34f6df7e7dd
3,HBM284.SBPR.357,a9d4471b41cd0d8185e12dd6c5a48e96
4,HBM285.VFDT.966,7034950e109586361e73c1b9ddb81346


# Extract the Descendants

In [7]:
# Reload the saved HuBMAP ID -> UUID mapping from disk.
df = pd.read_csv('/u/sbdubey/hubmap_uuid_map.csv')

In [8]:
def fetch_json(uuid):
    """Fetch the portal's JSON document for a dataset UUID.

    Sends an unauthenticated GET to
    ``https://portal.hubmapconsortium.org/browse/dataset/<uuid>.json`` with a
    10-second timeout.

    Returns:
        The decoded JSON body on HTTP 200. ``None`` for any other status code,
        for a network/timeout failure, or when the body is not valid JSON.
    """
    url = f"https://portal.hubmapconsortium.org/browse/dataset/{uuid}.json"
    try:
        r = requests.get(url, timeout=10)
        if r.status_code != 200:
            return None
        return r.json()
    except (requests.RequestException, ValueError):
        # Best-effort lookup: a failed request or an undecodable body counts as
        # "no data". Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # genuine programming errors propagate, so the loop can be interrupted.
        return None


In [9]:
def extract_fields(js):
    """Summarise the descendant information in a dataset JSON document.

    Args:
        js: Decoded JSON document (a dict), or ``None`` when the fetch failed.

    Returns:
        A dict with two keys:

        * ``total_descendants``: sum of the values under
          ``descendant_counts.entity_type``; ``0`` when ``descendant_counts``
          is absent, ``None`` when either level is present but not a dict.
        * ``first_descendant_id``: first element of ``descendant_ids``, or
          ``None`` when that field is missing, empty, or not a list.

        Both values are ``None`` when ``js`` is not a dict.
    """
    if not isinstance(js, dict):
        return {"total_descendants": None, "first_descendant_id": None}

    # A present-but-null "descendant_counts" must not crash the chained lookup.
    descendant_counts = js.get("descendant_counts", {})
    if isinstance(descendant_counts, dict):
        counts = descendant_counts.get("entity_type", {})
    else:
        counts = None
    total = sum(counts.values()) if isinstance(counts, dict) else None

    # Same for "descendant_ids": only index into a real, non-empty sequence.
    desc_ids = js.get("descendant_ids", [])
    if isinstance(desc_ids, (list, tuple)) and len(desc_ids) > 0:
        first_id = desc_ids[0]
    else:
        first_id = None

    return {
        "total_descendants": total,
        "first_descendant_id": first_id
    }


In [10]:
# One record of descendant info per UUID, in the same order as df["last_uuid"].
results = [
    extract_fields(fetch_json(uid))
    for uid in tqdm(df["last_uuid"], desc="Fetching Descendants")
]

desc_df = pd.DataFrame(results)
desc_df.head()


Fetching Descendants: 100%|██████████| 64/64 [00:11<00:00,  5.68it/s]


Unnamed: 0,total_descendants,first_descendant_id
0,6,10c0c11280c00f324259fe38e2291ee4
1,6,8b21db2002a5179b03532d183a4885eb
2,6,6bdd149dc47782aefdd0e23599708183
3,6,550f3ef14b113c24fd21b8b0750bf078
4,6,aa54aad994ca8a64fa52b3f3945c01b7


In [11]:
# Put the descendant columns next to the ID/UUID columns; concat with axis=1 aligns rows by index label.
final_df = pd.concat([df, desc_df], axis=1)

# Save the combined table to CSV (without the index column).
out_path = "/u/sbdubey/hubmap_descendants_map.csv"
final_df.to_csv(out_path, index=False)

print(f"Saved: {out_path}")
final_df.head()


Saved: /u/sbdubey/hubmap_descendants_map.csv


Unnamed: 0,HuBMAP ID,last_uuid,total_descendants,first_descendant_id
0,HBM235.VKNJ.237,c8ee4bd8f052b50fab2e9e9365793374,6,10c0c11280c00f324259fe38e2291ee4
1,HBM238.GTNW.259,a7ecf4fd858a58dcaa3bab5df455065b,6,8b21db2002a5179b03532d183a4885eb
2,HBM242.LSCK.393,a702c94500a2f737b722e34f6df7e7dd,6,6bdd149dc47782aefdd0e23599708183
3,HBM284.SBPR.357,a9d4471b41cd0d8185e12dd6c5a48e96,6,550f3ef14b113c24fd21b8b0750bf078
4,HBM285.VFDT.966,7034950e109586361e73c1b9ddb81346,6,aa54aad994ca8a64fa52b3f3945c01b7


**`first_descendant_id` shows the real descendant UUID.
The count is 6 due to some weird/gibberish data in the JSON that I didn’t understand,
but `first_descendant_id` is the correct descendant.**
