## A notebook to check millitome datasets against hra-pop

In [7]:
# install and import libraries
%pip install pandas requests

import glob
import json
from io import StringIO
from pathlib import Path
from pprint import pprint

import pandas as pd
import requests

Note: you may need to restart the kernel to use updated packages.


In [8]:
# Set up authentication. Note that you need to provide the token yourself
# (first line of data/token.txt).
with open('data/token.txt', "r") as file_object:
  # readline() keeps the trailing newline; strip it so it does not end up
  # inside the Authorization header value built below.
  TOKEN = file_object.readline().strip()

# Bearer-token header sent with every request made in this notebook
headers = {'Authorization': f'Bearer {TOKEN}'}

## Get hra-pop Universe data from Sankey

In [9]:
# get sankey data
sankey_url = 'https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/refs/heads/main/output-data/v0.11.1/reports/universe-ad-hoc/sankey.csv'

# read Sankey report from hra-pop as CSV file
# low_memory=False parses each column in a single pass so dtypes are inferred
# consistently instead of chunk by chunk.
sankey = pd.read_csv(sankey_url, low_memory=False)

# keep only the HuBMAP rows, restricted to the dataset ID plus RUI / atlas status columns.
# .copy() makes this an independent frame, so adding columns to it is not a
# write into a slice of `sankey` (which triggers SettingWithCopyWarning).
sankey_hubmap = sankey.loc[
  sankey['portal'] == "HuBMAP",
  ['portal', 'unique_dataset_id','is_rui_registered','is_atlas_dataset']
].copy()
sankey_hubmap['has_millitome_block'] = False
sankey_hubmap

  sankey = pd.read_csv(sankey_url)


Unnamed: 0,portal,unique_dataset_id,is_rui_registered,is_atlas_dataset,has_millitome_block
5,HuBMAP,http://purl.org/ccf/10.1016/j.cell.2022.12.028...,True,False,False
88,HuBMAP,http://purl.org/ccf/UFL0006-SP-1-2-1,True,False,False
89,HuBMAP,http://purl.org/ccf/UFL0006-SP-1-3-1,True,False,False
90,HuBMAP,http://purl.org/ccf/UFL0007-SP-1-1-1,True,False,False
91,HuBMAP,http://purl.org/ccf/UFL0007-SP-2-2-1,True,False,False
...,...,...,...,...,...
15584,HuBMAP,https://entity.api.hubmapconsortium.org/entiti...,True,False,False
15585,HuBMAP,https://entity.api.hubmapconsortium.org/entiti...,True,True,False
15586,HuBMAP,https://entity.api.hubmapconsortium.org/entiti...,True,False,False
15587,HuBMAP,https://entity.api.hubmapconsortium.org/entiti...,True,False,False


In [10]:
# transform unique_dataset_id column and add to set
# Keep only the text after the final '/' of each ID. The vectorized .str
# accessor passes missing values through as NaN instead of raising, and
# .assign() builds a new frame rather than writing into the existing one,
# so re-running this cell is safe.
sankey_hubmap = sankey_hubmap.assign(
  unique_dataset_id_stripped=sankey_hubmap['unique_dataset_id'].str.rsplit('/', n=1).str[-1]
)

# Set of stripped IDs used for membership checks later; missing values are dropped
sankey_ids = set(sankey_hubmap['unique_dataset_id_stripped'].dropna())
sankey_ids

{'37988db44acc8d0780e4e31cd057e789',
 '6ae023b460185e73a654b43fb5b1da39',
 'a2bf1303a535bba11900932213b0f987',
 'WD-76845-043',
 'aa637516132ad79fe7df054b52147495',
 'd52b2a1c42cee03c016a19864feae511',
 '2746ce8d450265ace2c6aee762c8dfda',
 '96c782b502cdb23122b2e7fb376d949d',
 '4c2e3e48e301e09dfb97950e376cf5bc',
 'd44fee159a959c2ffe840e24fe2ccfe5',
 'df7b5c077c2fc442f56d8667fe8ea37c',
 'eaad67a6c6e891ea72cc397c26bd607f',
 '6420fcb2995e718d313b4b2b2cb72878',
 'b449bd94ba1498ba8501d419eb985f51',
 'rui_locations.jsonld#HBM937.MZGP.398',
 '166ea9ac6dbfa53966c60176b92d35ca',
 '1de1a4a2ff1295e88b51008ef199f11f',
 '66921ae1bb19ad3e3f4b18b741244a57',
 '1c6c94b1fb4da454fed05c1c728c751f',
 '6a0522bf6f1e7207516b04c8f3399a28',
 '060dfa0fdf2b840864f62d2cd1a7a456',
 'ab07fe937fe55a6a5eb0b7de9c28396b',
 '0f35f3bd23a8f773c7213d558338e74d',
 'd55e751748bd094ff5b0b55befb08d41',
 'd7abd52f93203c91a63bd9060e1b3012',
 'a8560706a0a6bcffd974ed4dd8082571',
 'c945cbd75f2da373d1ccfe86694f9c34',
 'be0cbd5e3801085

## Get millitome data

In [11]:
# load data
# sorted() makes the concatenation order deterministic; glob itself returns
# matches in arbitrary (file-system dependent) order.
csv_files = sorted(glob.glob('data/*.csv'))

# pd.concat on an empty list fails with an unhelpful "No objects to concatenate",
# so fail early with a message that names the pattern that matched nothing.
if not csv_files:
    raise FileNotFoundError("No CSV files found matching 'data/*.csv'")

# Read each CSV file into its own dataframe
dfs = [pd.read_csv(csv_path) for csv_path in csv_files]

# Concatenate all dataframes into a single dataframe
combined_df = pd.concat(dfs, ignore_index=True)

# identify HuBMAP IDs from all the columns
combined_df.columns

Index(['HuBMAP Sample ID', 'Donor', 'Link', 'Lab ID', 'Parent organ type',
       'BLOCK Location', 'Assay Group', 'millitome_ID', 'HubMAP ID',
       'Submission ID', 'sample_lab_id', 'TIssue Block HuBMAP IDs',
       'RUI Location', 'Id'],
      dtype='object')

In [12]:
# get HuBMAP IDs
# Columns that hold HuBMAP IDs; names must match the column labels of
# combined_df exactly (including their inconsistent capitalisation).
keep = ['HuBMAP Sample ID','HubMAP ID','TIssue Block HuBMAP IDs']

# One list of unique, non-null IDs per ID column that is present in the data;
# columns listed in `keep` but absent from combined_df are simply skipped.
id_lists = [
  combined_df[col_name].dropna().unique().tolist()
  for col_name in combined_df.columns
  if col_name in keep
]

# Flatten the per-column lists. dict.fromkeys() removes IDs that appear in more
# than one column while preserving first-seen order, so each ID is looked up
# only once downstream.
hubmap_ids_flat = list(dict.fromkeys(
  hubmap_id for sub_list in id_lists for hubmap_id in sub_list
))
hubmap_ids_flat

['HBM692.LCNB.765',
 'HBM792.WJLJ.923',
 'HBM256.MTWQ.585',
 'HBM568.LZHR.425',
 'HBM252.QVCK.893',
 'HBM495.WLDR.795',
 'HBM588.TNGJ.868',
 'HBM644.XFGJ.857',
 'HBM385.QFHD.475',
 'HBM334.CHVK.238',
 'HBM587.DFCW.749',
 'HBM995.CQKZ.339',
 'HBM966.WGFQ.597',
 'HBM553.WQWN.884',
 'HBM374.SRZD.953',
 'HBM762.SPJF.928',
 'HBM776.DNJZ.945',
 'HBM459.VMJC.864',
 'HBM696.HWTM.483',
 'HBM992.ZLZB.786',
 'HBM759.ZTGN.372',
 'HBM379.BGFQ.837',
 'HBM476.NQXM.289',
 'HBM872.TSRG.986',
 'HBM997.HRCG.585',
 'HBM567.DVNV.954',
 'HBM779.SKXG.842',
 'HBM399.TBPX.343',
 'HBM337.QSBH.972',
 'HBM324.HZVF.467',
 'HBM269.BZSG.442',
 'HBM278.XLBD.662',
 'HBM235.XDSM.559',
 'HBM727.WSZX.242',
 'HBM927.XWTL.358',
 'HBM255.FFJQ.856',
 'HBM369.WRDC.345',
 'HBM724.GMZM.797',
 'HBM474.TQQQ.496',
 'HBM367.JCKP.625',
 'HBM785.MVWL.456',
 'HBM773.MKCR.985',
 'HBM827.MZXW.224',
 'HBM947.XDKM.768',
 'HBM224.TTJB.522',
 'HBM299.VQZH.986',
 'HBM367.VSDK.374',
 'HBM863.NCVL.825',
 'HBM256.ZGQF.566',
 'HBM353.RTXH.756',


## Create dictionary of HuBMAP IDs -> UUIDs

In [13]:
entity_url = 'https://entity.api.hubmapconsortium.org/entities/'

# initialize dict to hold mapping of HuBMAP ID to UUID
hubmap_id_uuid_dict = {}

# Look up each HuBMAP ID and record the 'uuid' field of the returned entity.
# (Loop variable is not called `id`, which would shadow the builtin for the
# rest of the kernel session.)
for hubmap_id in hubmap_ids_flat:
  # timeout so a stalled connection cannot hang the notebook indefinitely
  response = requests.get(f'{entity_url}{hubmap_id}', headers=headers, timeout=60)
  # Raise on HTTP error statuses here, with the failing URL in the message,
  # rather than failing below with a KeyError on 'uuid'.
  response.raise_for_status()
  entity = response.json()
  hubmap_id_uuid_dict[hubmap_id] = entity['uuid']

## Check against HuBMAP Portal to get descendant dataset IDs

In [14]:
# portal URL to get descendants
url = 'https://entity.api.hubmapconsortium.org/descendants/'

# initialize dict of three parallel lists: one entry per (tissue block, descendant dataset) pair
result = {
  'tissue_blocks':[],
  'datasets':[],
  'is_in_hra_pop_universe':[]
}

# loop through millitome tissue block IDs and get descendant datasets
for tissue_block in hubmap_id_uuid_dict:
  uuid = hubmap_id_uuid_dict[tissue_block]

  request_url = f'{url}{uuid}'

  # timeout so a stalled connection cannot hang the notebook indefinitely
  response = requests.get(request_url, headers=headers, timeout=60)
  # Raise on HTTP error statuses instead of iterating over an error payload below
  response.raise_for_status()

  descendants = response.json()

  # keep only descendants whose entity_type is "Dataset"
  for entity in descendants:
    if entity['entity_type'] == "Dataset":
      result['tissue_blocks'].append(uuid)
      result['datasets'].append(entity['uuid'])
      # append (not assign): all three lists must grow in lockstep. Assigning
      # here replaced the list with a single bool, so every exported row
      # carried the value computed for the last dataset only.
      result['is_in_hra_pop_universe'].append(entity['uuid'] in sankey_ids)

# print result
pprint(result)

{'datasets': ['dff05c80d45b38db1505bac63c8cd972',
              '23418f8577fb803588a3fb387329a452',
              '8cd1104314bc8c73ea40f84a1aac309d',
              'a88d8eb0b1135c28626b705094d3fa48',
              '47971ceaefc00c5c8e6624042829f2dc',
              '39c820bde703f5960af105a2ed9e5a06',
              'bf7a4b7c36980de5b8a3794cdf147607',
              '83dc9aed80c1777c9cef7a5e506e5c9e',
              '38b60caa71979886aa51bb1228a80ab7',
              '4f54ccaba498e734be76f8999a42d517',
              '71773b51bbfa8cf3236d6855aa443f62',
              '28188e88ef749766b5fefd4671b3bd18',
              '2473f80b3067febcaab60417d7e613aa',
              'eccc21a1c60d0cff673e9008adafc0b0',
              '77ddab8e0831d9bc6f91b10677b8451a',
              'a9908ae31dfca8322d0e20c521964cb2',
              'b8a02ec8f08a803afdb9c194326e2c2c',
              '400bcaea205e55d3e898064fd528b179',
              'ac78ee31ee1e5c2300bcb63327d91df4',
              'a7113acfefd0171922b5df16ade53074',


## Export to CSV

In [15]:
# Convert dictionary to DataFrame (one row per tissue block / dataset pair)
df = pd.DataFrame(result)

# Export to CSV
output_path = Path('output/millitome_datasets_in_hra_pop.csv')
# to_csv does not create missing directories, so make sure the target folder exists
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print("CSV file has been created successfully.")

CSV file has been created successfully.
