## A notebook to check millitome datasets against hra-pop

In [140]:
# install and import libraries
%pip install pandas requests

import glob
import json
import os
from io import StringIO
from pprint import pprint

import pandas as pd
import requests

Note: you may need to restart the kernel to use updated packages.


In [141]:
# Set up authentication, note that you need to provide this yourself!
# The token is taken from the first line of data/token.txt.
with open('data/token.txt', "r") as file_object:
  # readline() keeps the trailing newline; strip it so it does not end up
  # inside the Authorization header value.
  TOKEN = file_object.readline().strip()

# Fail here rather than on the first API call if the file had no token in it.
if not TOKEN:
  raise ValueError("data/token.txt is empty; put your API token on its first line")

headers = {'Authorization': f'Bearer {TOKEN}'}

## Get hra-pop Universe data from Sankey

In [142]:
# get sankey data
sankey_url = 'https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/refs/heads/main/output-data/v0.11.1/reports/universe-ad-hoc/sankey.csv'

# read Sankey report from hra-pop as CSV file
# low_memory=False: infer each column's dtype from the whole file instead of
# chunk by chunk, which avoids pandas' mixed-dtype warning on this report.
sankey = pd.read_csv(sankey_url, low_memory=False)

# keep only the HuBMAP rows, with their RUI and atlas status columns
# .loc + .copy() gives an independent frame, so adding a column below does not
# write into a slice of `sankey`.
sankey_hubmap = sankey.loc[
  sankey['portal'] == "HuBMAP",
  ['portal', 'unique_dataset_id', 'is_rui_registered', 'is_atlas_dataset']
].copy()
sankey_hubmap['has_millitome_block'] = False
sankey_hubmap

  sankey = pd.read_csv(sankey_url)


Unnamed: 0,portal,unique_dataset_id,is_rui_registered,is_atlas_dataset,has_millitome_block
5,HuBMAP,http://purl.org/ccf/10.1016/j.cell.2022.12.028...,True,False,False
88,HuBMAP,http://purl.org/ccf/UFL0006-SP-1-2-1,True,False,False
89,HuBMAP,http://purl.org/ccf/UFL0006-SP-1-3-1,True,False,False
90,HuBMAP,http://purl.org/ccf/UFL0007-SP-1-1-1,True,False,False
91,HuBMAP,http://purl.org/ccf/UFL0007-SP-2-2-1,True,False,False
...,...,...,...,...,...
15584,HuBMAP,https://entity.api.hubmapconsortium.org/entiti...,True,False,False
15585,HuBMAP,https://entity.api.hubmapconsortium.org/entiti...,True,True,False
15586,HuBMAP,https://entity.api.hubmapconsortium.org/entiti...,True,False,False
15587,HuBMAP,https://entity.api.hubmapconsortium.org/entiti...,True,False,False


In [143]:
# keep only the last '/'-separated segment of each unique_dataset_id,
# store it as a new column, then collect the distinct values in a set
sankey_hubmap['unique_dataset_id_stripped'] = [
  dataset_id.split('/')[-1] for dataset_id in sankey_hubmap['unique_dataset_id']
]

sankey_ids = set(sankey_hubmap['unique_dataset_id_stripped'])
sankey_ids

{'4a682d67bea887e5bb1ade2bd137489e',
 'b2fa01d2afd1d4f8c71d59bc3cbe9f84',
 '61df0bd2e6ac362380910bec92890f00',
 'rui_locations.jsonld#HBM823.KKKP.894',
 'WD-76845-073',
 '130df85d80c6e9adcbfdf8e374bfa163',
 'f2647d9533956fdb12da6b1fc6254441',
 '4c5234a24a4a21ca98f74d15b3dfb4a7',
 '453b06e1f62ec3e54d8380e4e974a5a5',
 'ed5d4bbf40edd765ef8d1f378a0d7b35',
 '0ca0721ae67b78b5eed7c0ef20dc6ac1',
 'd1e29bf7808a8daf6be8f555cd24eb9f',
 'd1a6bed45735b4154e10d66c3277ce3f',
 '399643b5aed6b71edee96d4bf9e0d306',
 '275c81f5f8b4f53f925f9742cb440efa',
 '7e4c13acb523952bdfd19a4892729462',
 '5302ee81cb9692dfda92b2c5dad37198',
 '9c2bd56273b1400e9249eaec12290b14',
 '04968c1fe0149ee367b0e53af55763e4',
 '1f99c0bc4fd3a60bb569410878e4a817',
 'a91cdb02917f0ebfb508f5548c553c3a',
 'bc68fe67a089ab19c1449de6d0703d71',
 'c15f813973ff7b6f1a7e674022119733',
 '19be408fb97c09f603c906921d7b0945',
 '8f27d93b71147597f38de827f233930e',
 'd52b2a1c42cee03c016a19864feae511',
 '81326ba2085657141f9932f135154e39',
 'acd20b9d6b00deb

## Get millitome data

In [144]:
# load data
# sorted(): glob returns matches in arbitrary order, and the file order decides
# the row order of combined_df (and of everything derived from it).
csv_files = sorted(glob.glob('data/*.csv'))

# pd.concat() on an empty list fails with an unhelpful message; say what is wrong.
if not csv_files:
  raise FileNotFoundError("No CSV files found matching 'data/*.csv'")

# Read every CSV file into its own dataframe
dfs = [pd.read_csv(file) for file in csv_files]

# Concatenate all dataframes into a single dataframe
combined_df = pd.concat(dfs, ignore_index=True)

# identify HuBMAP IDs from all the columns
combined_df.columns

Index(['HuBMAP Sample ID', 'Donor', 'Link', 'Lab ID', 'Parent organ type',
       'BLOCK Location', 'Assay Group', 'millitome_ID', 'HubMAP ID',
       'Submission ID', 'sample_lab_id', 'TIssue Block HuBMAP IDs',
       'RUI Location', 'Id'],
      dtype='object')

In [145]:
# get HuBMAP IDs
# columns of combined_df that hold HuBMAP IDs (spelling matches the CSV headers)
keep = ['HuBMAP Sample ID','HubMAP ID','TIssue Block HuBMAP IDs']

# one list of distinct, non-null values per ID column
# NOTE(review): assumes every cell holds exactly one ID - confirm for 'TIssue Block HuBMAP IDs'
id_lists = [
  combined_df[col_name].dropna().unique().tolist()
  for col_name in combined_df.columns
  if col_name in keep
]

# Flatten and drop IDs that occur in more than one column, keeping first-seen
# order (dict keys preserve insertion order), so each ID is listed only once.
hubmap_ids_flat = list(dict.fromkeys(
  hubmap_id for sub_list in id_lists for hubmap_id in sub_list
))
hubmap_ids_flat

['HBM692.LCNB.765',
 'HBM792.WJLJ.923',
 'HBM256.MTWQ.585',
 'HBM568.LZHR.425',
 'HBM252.QVCK.893',
 'HBM495.WLDR.795',
 'HBM588.TNGJ.868',
 'HBM644.XFGJ.857',
 'HBM385.QFHD.475',
 'HBM334.CHVK.238',
 'HBM587.DFCW.749',
 'HBM995.CQKZ.339',
 'HBM966.WGFQ.597',
 'HBM553.WQWN.884',
 'HBM374.SRZD.953',
 'HBM762.SPJF.928',
 'HBM776.DNJZ.945',
 'HBM459.VMJC.864',
 'HBM696.HWTM.483',
 'HBM992.ZLZB.786',
 'HBM759.ZTGN.372',
 'HBM379.BGFQ.837',
 'HBM476.NQXM.289',
 'HBM872.TSRG.986',
 'HBM997.HRCG.585',
 'HBM567.DVNV.954',
 'HBM779.SKXG.842',
 'HBM399.TBPX.343',
 'HBM337.QSBH.972',
 'HBM324.HZVF.467',
 'HBM269.BZSG.442',
 'HBM278.XLBD.662',
 'HBM235.XDSM.559',
 'HBM727.WSZX.242',
 'HBM927.XWTL.358',
 'HBM255.FFJQ.856',
 'HBM369.WRDC.345',
 'HBM724.GMZM.797',
 'HBM474.TQQQ.496',
 'HBM367.JCKP.625',
 'HBM785.MVWL.456',
 'HBM773.MKCR.985',
 'HBM827.MZXW.224',
 'HBM947.XDKM.768',
 'HBM224.TTJB.522',
 'HBM299.VQZH.986',
 'HBM367.VSDK.374',
 'HBM863.NCVL.825',
 'HBM256.ZGQF.566',
 'HBM353.RTXH.756',


## Create dictionary of HuBMAP IDs -> UUIDs

In [146]:
entity_url = 'https://entity.api.hubmapconsortium.org/entities/'

# initialize dict to hold mapping of HuBMAP ID to UUID
hubmap_id_uuid_dict = {}

# Look up each HuBMAP ID and record the 'uuid' field of the returned entity.
for hubmap_id in hubmap_ids_flat:
  # timeout: without one, requests waits indefinitely on a stalled connection
  response = requests.get(f'{entity_url}{hubmap_id}', headers=headers, timeout=60)
  # surface HTTP errors (bad token, unknown ID) here, with the URL in the message,
  # instead of as a KeyError on 'uuid' below
  response.raise_for_status()
  hubmap_id_uuid_dict[hubmap_id] = response.json()['uuid']

## Load organ look-up

In [147]:
# organ code -> organ name mapping, read from data/organ-lookup.json
organ_lookup = dict()
with open('data/organ-lookup.json') as lookup_file:
  organ_lookup = json.loads(lookup_file.read())
pprint(organ_lookup)

{'LF': 'left fallopian tube',
 'LO': 'left ovary',
 'RF': 'right fallopian tube',
 'RO': 'right ovary',
 'UT': 'uterus'}


## Check against HuBMAP Portal to get descendant dataset IDs + other metadata

In [148]:
# portal URL to get descendants
api_base = 'https://entity.api.hubmapconsortium.org'
endpoint_entity = f'{api_base}/entities/'
endpoint_descendants = f'{api_base}/descendants/'
endpoint_ancestors = f'{api_base}/ancestors/'

# seconds to wait on a single request before giving up
REQUEST_TIMEOUT = 60

# output columns, in the order they appear in the exported table
result_columns = [
  'tissue_blocks',
  'datasets',
  'is_in_hra_pop_universe',
  'dataset_type',
  'organ',
  'age',
  'race',
  'sex',
  'lead',
  'email',
]


def _fetch_json(url):
  """Return the decoded JSON body of an authenticated GET request.

  Raises requests.HTTPError on a non-success status, so a failed call is
  reported as such rather than as a confusing error further down.
  """
  response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
  response.raise_for_status()
  return response.json()


def _organ_label(ancestors):
  """Return the organ name for the first Sample ancestor that has an organ code.

  The code is translated through organ_lookup; a code missing from the
  look-up is returned unchanged. Returns "" when no Sample ancestor has one.
  """
  for ancestor in ancestors:
    if ancestor['entity_type'] == 'Sample' and ancestor.get('organ'):
      code = ancestor['organ']
      return organ_lookup.get(code, code)
  return ""


def _donor_demographics(ancestors):
  """Return (age, race, sex) for the first Donor ancestor, "" for anything missing.

  NOTE(review): like the original code, this takes the first three
  'data_value' entries of 'organ_donor_data' positionally as age, race, sex -
  confirm that the API guarantees this order.
  """
  for ancestor in ancestors:
    if ancestor['entity_type'] == 'Donor':
      metadata = ancestor.get('metadata') or {}
      entries = metadata.get('organ_donor_data') or []
      labels = [entry['data_value'] for entry in entries][:3]
      # pad so that unpacking into three values always works
      labels += [""] * (3 - len(labels))
      return tuple(labels)
  return ("", "", "")


# One record per (tissue block, descendant dataset). Building whole records
# keeps every column the same length, which pd.DataFrame(result) requires.
rows = []

# loop through millitome tissue block UUIDs and get descendant datasets
for uuid in hubmap_id_uuid_dict.values():
  descendants = _fetch_json(endpoint_descendants + uuid)
  datasets = [d for d in descendants if d['entity_type'] == "Dataset"]

  # no datasets means no rows for this block; skip the two extra requests
  if not datasets:
    continue

  # lead name + email address come from the tissue block entity itself
  entity = _fetch_json(endpoint_entity + uuid)

  # organ and donor age/race/sex come from the block's ancestors
  ancestors = _fetch_json(endpoint_ancestors + uuid)
  organ = _organ_label(ancestors)
  age, race, sex = _donor_demographics(ancestors)

  for dataset in datasets:
    rows.append({
      'tissue_blocks': uuid,
      'datasets': dataset['uuid'],
      'is_in_hra_pop_universe': dataset['uuid'] in sankey_ids,
      'dataset_type': dataset['dataset_type'],
      'organ': organ,
      'age': age,
      'race': race,
      'sex': sex,
      'lead': entity['created_by_user_displayname'],
      'email': entity['created_by_user_email'],
    })

# dict of equal-length column lists, keyed by column name
result = {column: [row[column] for row in rows] for column in result_columns}

## Export to CSV

In [149]:
# Convert dictionary to DataFrame
df = pd.DataFrame(result)

# Export to CSV
output_path = 'output/millitome_datasets_in_hra_pop.csv'
# to_csv does not create missing folders; make sure output/ exists first
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print("CSV file has been created successfully.")

CSV file has been created successfully.
