## A notebook to check millitome datasets against hra-pop

In [1]:
# install and import libraries
%pip install pandas requests

import pandas as pd
import glob
import requests
from pprint import pprint
from io import StringIO

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Set up authentication, note that you need to provide this yourself!
# The bearer token is read from the first line of data/token.txt.
with open('data/token.txt', "r") as file_object:
  # readline() keeps the trailing newline; strip it so the Authorization
  # header value does not end in "\n" (which is rejected as an invalid header)
  TOKEN = file_object.readline().strip()

headers = {'Authorization': f'Bearer {TOKEN}'}

In [3]:
# load data
# sorted() makes the file order -- and therefore the row and column order of
# the combined dataframe -- reproducible; glob.glob returns paths in
# arbitrary, filesystem-dependent order.
csv_files = sorted(glob.glob('data/*.csv'))

# Fail with a clear message instead of pd.concat's generic
# "No objects to concatenate" when no CSV files are present
if not csv_files:
    raise FileNotFoundError("No CSV files found matching 'data/*.csv'")

# Create an empty list to store the dataframes
dfs = []

# Loop through each CSV file and read it into a dataframe
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)

# Concatenate all dataframes into a single dataframe
combined_df = pd.concat(dfs, ignore_index=True)

# identify HuBMAP IDs from all the columns
combined_df.columns

Index(['HuBMAP Sample ID', 'Donor', 'Link', 'Lab ID', 'Parent organ type',
       'BLOCK Location', 'Assay Group', 'millitome_ID', 'HubMAP ID',
       'Submission ID', 'sample_lab_id', 'TIssue Block HuBMAP IDs',
       'RUI Location', 'Id'],
      dtype='object')

In [4]:
# get HuBMAP IDs
id_lists = []

# Columns that hold HuBMAP IDs, spelled exactly as they appear in the CSV headers
keep = ['HuBMAP Sample ID','HubMAP ID','TIssue Block HuBMAP IDs']

# Collect the unique, non-null values of every ID column present in combined_df
for col_name in combined_df.columns:
  if col_name in keep:
    id_lists.append(combined_df[col_name].dropna().unique().tolist())

# Flatten the per-column lists. dict.fromkeys drops IDs that occur in more than
# one column while keeping first-seen order, so each ID is only looked up once
# downstream. The loop variable is not called `id` to avoid shadowing the builtin.
hubmap_ids_flat = list(dict.fromkeys(
  hubmap_id for sub_list in id_lists for hubmap_id in sub_list
))
hubmap_ids_flat

['HBM692.LCNB.765',
 'HBM792.WJLJ.923',
 'HBM256.MTWQ.585',
 'HBM568.LZHR.425',
 'HBM252.QVCK.893',
 'HBM495.WLDR.795',
 'HBM588.TNGJ.868',
 'HBM644.XFGJ.857',
 'HBM385.QFHD.475',
 'HBM334.CHVK.238',
 'HBM587.DFCW.749',
 'HBM995.CQKZ.339',
 'HBM966.WGFQ.597',
 'HBM553.WQWN.884',
 'HBM374.SRZD.953',
 'HBM762.SPJF.928',
 'HBM776.DNJZ.945',
 'HBM459.VMJC.864',
 'HBM696.HWTM.483',
 'HBM992.ZLZB.786',
 'HBM759.ZTGN.372',
 'HBM379.BGFQ.837',
 'HBM476.NQXM.289',
 'HBM872.TSRG.986',
 'HBM997.HRCG.585',
 'HBM567.DVNV.954',
 'HBM779.SKXG.842',
 'HBM399.TBPX.343',
 'HBM337.QSBH.972',
 'HBM324.HZVF.467',
 'HBM269.BZSG.442',
 'HBM278.XLBD.662',
 'HBM235.XDSM.559',
 'HBM727.WSZX.242',
 'HBM927.XWTL.358',
 'HBM255.FFJQ.856',
 'HBM369.WRDC.345',
 'HBM724.GMZM.797',
 'HBM474.TQQQ.496',
 'HBM367.JCKP.625',
 'HBM785.MVWL.456',
 'HBM773.MKCR.985',
 'HBM827.MZXW.224',
 'HBM947.XDKM.768',
 'HBM224.TTJB.522',
 'HBM299.VQZH.986',
 'HBM367.VSDK.374',
 'HBM863.NCVL.825',
 'HBM256.ZGQF.566',
 'HBM353.RTXH.756',


## Create dictionary of HuBMAP IDs -> UUIDs

In [5]:
# Entity API endpoint; appending a HuBMAP ID returns that entity's JSON record
entity_url = 'https://entity.api.hubmapconsortium.org/entities/'

# initialize dict to hold mapping of HuBMAP ID to UUID
hubmap_id_uuid_dict = {}

# `hubmap_id` rather than `id`, so the builtin id() is not shadowed for the
# rest of the notebook
for hubmap_id in hubmap_ids_flat:
  # timeout prevents a stalled connection from hanging the notebook indefinitely
  response = requests.get(f'{entity_url}{hubmap_id}', headers=headers, timeout=60)
  # Surface HTTP errors (e.g. bad token, unknown ID) here instead of as an
  # opaque KeyError on 'uuid' below
  response.raise_for_status()
  entity = response.json()
  hubmap_id_uuid_dict[hubmap_id] = entity['uuid']

Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response code: <Response [200]>
Response

## Query the HuBMAP Entity API to get descendant dataset IDs

In [None]:
# Entity API endpoint; appending an entity UUID returns its descendants
url = 'https://entity.api.hubmapconsortium.org/descendants/'

# initialize dict of parallel lists: one (tissue block UUID, dataset UUID) pair
# per descendant dataset. 'is_in_hra_pop_universe' is not filled in by this cell.
result = {
  'tissue_blocks':[],
  'datasets':[],
  'is_in_hra_pop_universe':[]
}

# loop through millitome tissue block IDs and get descendant datasets
for tissue_block, uuid in hubmap_id_uuid_dict.items():
  request_url = f'{url}{uuid}'

  # timeout prevents a stalled connection from hanging the notebook indefinitely
  response = requests.get(request_url, headers=headers, timeout=60)
  # Stop on HTTP errors; otherwise the error payload would be iterated below
  # as if it were a list of entities and fail with a confusing TypeError
  response.raise_for_status()

  descendants = response.json()

  # keep only descendants that are datasets
  for entity in descendants:
    if entity.get('entity_type') == "Dataset":
      result['tissue_blocks'].append(uuid)
      result['datasets'].append(entity['uuid'])

pprint(result)

checking b406b6179f920d2cf00b2c5e2ef52835
https://entity.api.hubmapconsortium.org/descendants/b406b6179f920d2cf00b2c5e2ef52835
Response code: <Response [200]>




checking dc8846a8b3d71adfc79e1c84fe92bbe2
https://entity.api.hubmapconsortium.org/descendants/dc8846a8b3d71adfc79e1c84fe92bbe2
Response code: <Response [200]>





checking 95a6836e2b53eef8cc79d89b455c0a87
https://entity.api.hubmapconsortium.org/descendants/95a6836e2b53eef8cc79d89b455c0a87
Response code: <Response [200]>




checking 1a244380c0450b8fd6c704506e9b6afc
https://entity.api.hubmapconsortium.org/descendants/1a244380c0450b8fd6c704506e9b6afc
Response code: <Response [200]>




checking ecdfaf78a3a7ba008d36034d228e35c6
https://entity.api.hubmapconsortium.org/descendants/ecdfaf78a3a7ba008d36034d228e35c6
Response code: <Response [200]>




checking d1112bfafa84458a9a1f320b65e10dbc
https://entity.api.hubmapconsortium.org/descendants/d1112bfafa84458a9a1f320b65e10dbc
Response code: <Response [200]>




checking dd943ec6c1ae

## Get hra-pop Universe data from Sankey

In [None]:
# get sankey data
sankey_url = 'https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/refs/heads/main/output-data/v0.11.1/reports/universe-ad-hoc/sankey.csv'

# read Sankey report from hra-pop as CSV file
# low_memory=False parses each column in a single pass, so dtypes are inferred
# consistently (presumably what the warning emitted on this line was about --
# pandas' mixed-type DtypeWarning; confirm against the full warning text)
sankey = pd.read_csv(sankey_url, low_memory=False)

# filter out HuBMAP datasets and their RUI, atlas status, then keep relevant columns
# A single .loc selection followed by .assign yields a new dataframe, so adding
# the flag column cannot trigger SettingWithCopyWarning or touch `sankey`.
sankey_hubmap = (
  sankey
  .loc[
    sankey['portal'] == "HuBMAP",
    ['portal', 'unique_dataset_id', 'is_rui_registered', 'is_atlas_dataset'],
  ]
  # every row starts as "no millitome block"
  .assign(has_millitome_block=False)
)
sankey_hubmap

  sankey = pd.read_csv(sankey_url)


Unnamed: 0,portal,unique_dataset_id,is_rui_registered,is_atlas_dataset,has_millitome_block
5,HuBMAP,http://purl.org/ccf/10.1016/j.cell.2022.12.028...,True,False,False
88,HuBMAP,http://purl.org/ccf/UFL0006-SP-1-2-1,True,False,False
89,HuBMAP,http://purl.org/ccf/UFL0006-SP-1-3-1,True,False,False
90,HuBMAP,http://purl.org/ccf/UFL0007-SP-1-1-1,True,False,False
91,HuBMAP,http://purl.org/ccf/UFL0007-SP-2-2-1,True,False,False
...,...,...,...,...,...
15584,HuBMAP,https://entity.api.hubmapconsortium.org/entiti...,True,False,False
15585,HuBMAP,https://entity.api.hubmapconsortium.org/entiti...,True,True,False
15586,HuBMAP,https://entity.api.hubmapconsortium.org/entiti...,True,False,False
15587,HuBMAP,https://entity.api.hubmapconsortium.org/entiti...,True,False,False


## Compare millitome datasets against hra-pop datasets