# A notebook to compile counts for the HRApop paper

# Import libraries

In [2]:
%pip install pandas numpy requests

import pandas as pd
import numpy as np
import requests
import io
from pprint import pprint


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Set global variables

In [3]:
# HRApop release whose output data is read, and the git branch it is read from;
# both are interpolated into every raw.githubusercontent.com URL in this notebook.
hra_pop_version = 'v1.0'
branch = "main"

# Load data

In [4]:
# Location of the Sankey report CSV for the configured branch and HRApop version
sankey_csv_url = (
    "https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/"
    f"refs/heads/{branch}/output-data/{hra_pop_version}"
    "/reports/universe-ad-hoc/sankey.csv"
)
sankey = pd.read_csv(sankey_csv_url)

sankey

  sankey = pd.read_csv(


Unnamed: 0,portal,study_paper,doi,lead_author,is_azimuth_reference,donor_id,donor_sex,donor_age,donor_development_stage,donor_race,...,unique_dataset_id,link_to_h5ad_file,sc_transcriptomics_or_sc_proteomics,cell_type_annotation_tool,omap_id,number_of_cells_total,number_of_unique_cell_types,hubmap_dataset_publication_status,is_rui_registered,is_atlas_dataset
0,HCA,,,,,TSP27,Female,56.0,,,...,hhttps://api.cellxgene.cziscience.com/dp/v1/co...,https://cellxgene.cziscience.com/e/a357414d-20...,,,,,,,True,False
1,KPMP,,http://dx.doi.org/10.1681/ASN.2016091027,,,Donor1,Male,,,,...,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,http://dx.doi.org/10.1681/ASN.2016091027,,,,,,,True,False
2,KPMP,,http://dx.doi.org/10.1681/ASN.2016091027,,,Donor2,Male,,,,...,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,http://dx.doi.org/10.1681/ASN.2016091027,,,,,,,True,False
3,KPMP,,http://dx.doi.org/10.1681/ASN.2016091027,,,Donor3,Male,,,,...,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,http://dx.doi.org/10.1016/j.trsl.2017.07.006,,,,,,,True,False
4,HRA,,,,,Donor1,Female,38.0,,,...,http://purl.org/ccf/1.5/omap-1#Donor1_TissueBl...,https://hubmapconsortium.github.io/ccf-release...,,,,,,,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22189,SPARC,,https://sparc.science/datasets/390/version/1?d...,,,Donor1,Female,49.0,,,...,https://sparc.science/datasets/390/version/1?d...,https://sparc.science/datasets/390/version/1?d...,,,,,,,True,False
22190,SPARC,,https://sparc.science/datasets/390/version/1?d...,,,Donor1,Female,49.0,,,...,https://sparc.science/datasets/390/version/1?d...,https://sparc.science/datasets/390/version/1?d...,,,,,,,True,False
22191,SPARC,,https://sparc.science/datasets/390/version/1?d...,,,Donor1,Female,49.0,,,...,https://sparc.science/datasets/390/version/1?d...,https://sparc.science/datasets/390/version/1?d...,,,,,,,True,False
22192,KPMP,,https://www.nature.com/articles/s41467-023-389...,,,Donor1,Male,,,,...,https://zenodo.org/records/7653239#Donor1_Tiss...,https://zenodo.org/records/7653239,,,,,,,True,False


In [5]:
# unique cells: the three cell-count summary CSVs live in the same reports folder
universe_reports_url = f'https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/refs/heads/{branch}/output-data/{hra_pop_version}/reports/universe-ad-hoc'

universe_sc_transcriptomics_cell_counts = pd.read_csv(
    f'{universe_reports_url}/universe-sc-transcriptomics-cell-counts.csv',
    index_col=False,
)
universe_sc_proteomics_cell_counts = pd.read_csv(
    f'{universe_reports_url}/universe-sc-proteomics-cell-counts.csv',
    index_col=False,
)
universe_sc_transcriptomics_cell_instance_counts = pd.read_csv(
    f'{universe_reports_url}/universe-sc-transcriptomics-cell-instance-counts.csv',
    index_col=False,
)

# Pre-processing steps

## Simplify rows with multiple annotations for getting accurate counts

In [6]:
tool_replacement = "sc_transcriptomics with Cell Summary"

# The three annotation tools collapse into one label; a missing tool becomes "No Cell Summary"
annotation_tool_labels = {
    tool: tool_replacement for tool in ('azimuth', 'celltypist', 'popv')
}
annotation_tool_labels[np.nan] = "No Cell Summary"

sankey['cell_type_annotation_tool'] = (
    sankey['cell_type_annotation_tool'].replace(annotation_tool_labels)
)

## Manually fix missing cell type annotation values for SenNet atlas datasets

See GitHub issue: https://github.com/x-atlas-consortia/hra-pop/issues/91

In [7]:
# Boolean mask: rows from the SenNet portal that are flagged as atlas datasets
is_sennet_portal = sankey['portal'] == "SenNet"
is_flagged_atlas = sankey['is_atlas_dataset'] == True
criteria = is_sennet_portal & is_flagged_atlas

# Give those rows the cell-summary label (2 SenNet atlas datasets as of HRApop v0.10.2)
sankey.loc[criteria, 'cell_type_annotation_tool'] = tool_replacement

# Drop rows that are exact duplicates of an earlier row
sankey = sankey.drop_duplicates()

# Get counts for HRApop paper

The following sections provide counts of datasets and other metrics for HRApop v1.0.

## Report numbers for Highlights

In [8]:
# All datasets downloaded and retrieved from extraction sites:
# the distinct values of 'unique_dataset_id' in the Sankey table
all_datasets = sankey['unique_dataset_id'].unique()

print("Number of UNIVERSE datasets: {}".format(len(all_datasets)))

Number of UNIVERSE datasets: 16293


In [9]:
# All sc-proteomics: distinct (dataset_id, tool) pairs whose tool is "sc_proteomics"
# NOTE(review): the other counts in this notebook key on 'unique_dataset_id';
# confirm that 'dataset_id' is the intended key here.
is_sc_proteomics = sankey['cell_type_annotation_tool'] == "sc_proteomics"
all_sc_proteomics = sankey.loc[
    is_sc_proteomics, ['dataset_id', 'cell_type_annotation_tool']
].drop_duplicates()

print(f"Number of sc-proteomics datasets: {len(all_sc_proteomics)}")

Number of sc-proteomics datasets: 104


In [10]:
# Atlas rows whose annotation tool is anything but sc_proteomics, reduced to one row per dataset
# NOTE(review): the filter keeps atlas datasets only, although the message does not say so.
is_atlas_row = sankey['is_atlas_dataset'] == True
is_not_sc_proteomics = sankey['cell_type_annotation_tool'] != 'sc_proteomics'
atlas_sc_transcriptomics = sankey[is_atlas_row & is_not_sc_proteomics].drop_duplicates(
    subset=['unique_dataset_id'])

print(f'Number of sc-transcriptomics datasets: {len(atlas_sc_transcriptomics)}')

Number of sc-transcriptomics datasets: 558


In [11]:
# All datasets with cell summary: distinct dataset IDs among rows carrying the collapsed tool label
has_collapsed_tool_label = sankey['cell_type_annotation_tool'] == tool_replacement
sc_transcriptomics_with_cell_summary = sankey.loc[
    has_collapsed_tool_label, 'unique_dataset_id'
].drop_duplicates()

print("Number of sc-transcriptomics datasets with cell summary: "
      f"{len(sc_transcriptomics_with_cell_summary)}")

Number of sc-transcriptomics datasets with cell summary: 6953


In [12]:
# Organs in HRApop Atlas: distinct 'organ_name' values among atlas rows
atlas_rows_mask = sankey['is_atlas_dataset'] == True
organs_in_hra_pop = sankey.loc[atlas_rows_mask, 'organ_name'].unique()

print(f"Unique organs in HRApop Atlas: {len(organs_in_hra_pop)}")

Unique organs in HRApop Atlas: 17


In [13]:
# Organs (m/f) in HRApop Atlas: distinct 'organ_name_glb_file' values among atlas rows
atlas_rows_only = sankey['is_atlas_dataset'] == True
organs_in_hra_pop_sex = sankey.loc[atlas_rows_only, 'organ_name_glb_file'].unique()

print(f"Organs (m/f) in HRApop Atlas: {len(organs_in_hra_pop_sex)}")

Organs (m/f) in HRApop Atlas: 31


In [14]:
# Volume covered by HRApop tissue blocks: keep one row per atlas dataset, then total the volumes
atlas_rows = sankey.loc[sankey['is_atlas_dataset'] == True]
one_row_per_atlas_dataset = atlas_rows.drop_duplicates(subset=['unique_dataset_id'])
volume = one_row_per_atlas_dataset['tissue_block_volume'].sum()

print(f"Volume covered by HRApop tissue blocks: {volume}")

Volume covered by HRApop tissue blocks: 12052677.858


## Report numbers for Sankey/experimental data

In [15]:
# NOTE: `sankey` holds more rows than datasets (a dataset can appear on several
# rows), so every figure printed below counts distinct 'unique_dataset_id'
# values rather than rows.
lacks_cell_summary = sankey['cell_type_annotation_tool'] == "No Cell Summary"
has_cell_summary = sankey['cell_type_annotation_tool'] != "No Cell Summary"
has_extraction_site = sankey['is_rui_registered'] == True
lacks_extraction_site = sankey['is_rui_registered'] == False

# atlas datasets
atlas = sankey.loc[sankey['is_atlas_dataset'] == True]['unique_dataset_id'].unique()
print(f"Atlas datasets: {len(atlas)}\n")

# datasets with extraction site but without cell summary
no_cell_summary = sankey.loc[has_extraction_site & lacks_cell_summary]['unique_dataset_id'].unique()
print(f"Datasets with extraction site but without cell summary: {len(no_cell_summary)}\n")

# datasets with cell summary but without extraction site
# (`no_rui` stays a DataFrame; only the reported figure is de-duplicated)
no_rui = sankey.loc[lacks_extraction_site & has_cell_summary]
no_rui_dataset_count = len(no_rui['unique_dataset_id'].unique())
print(f"Datasets with cell summary but without extraction site: {no_rui_dataset_count}\n")

# datasets with cell summary
cell_summary = sankey.loc[has_cell_summary]
cell_summary_dataset_count = len(cell_summary['unique_dataset_id'].unique())
print(f"Datasets with cell summary: {cell_summary_dataset_count}\n")

# datasets with neither
non_atlas_without_either = sankey.loc[lacks_cell_summary & lacks_extraction_site]
neither_dataset_count = len(non_atlas_without_either['unique_dataset_id'].unique())
print(f"Datasets with neither: {neither_dataset_count}\n")

# non-atlas datasets total
non_atlas_total = sankey.loc[(
    sankey['is_atlas_dataset'] == False)]['unique_dataset_id'].unique()
print(f"Non-atlas datasets total: {len(non_atlas_total)}\n")

# unique cells: each figure is the first value of a column in the count frames loaded earlier
transcriptomics_counts = universe_sc_transcriptomics_cell_counts
proteomics_counts = universe_sc_proteomics_cell_counts

sc_transcriptomics_cell_counts = transcriptomics_counts[
    'universe_sc_transcriptomics_cell_count'].iloc[0]
print(
    'Unique cells from sc-transcriptomics datasets in the Universe: '
    f'{sc_transcriptomics_cell_counts}\n')

sc_transcriptomics_cell_counts_preannotated = transcriptomics_counts[
    'universe_sc_transcriptomics_preannotated_cell_count'].iloc[0]
print(
    'Unique cells from sc-transcriptomics datasets in the Universe (preannotated): '
    f'{sc_transcriptomics_cell_counts_preannotated}\n')

sc_proteomics_cell_counts = proteomics_counts[
    'universe_sc_proteomics_cell_count'].iloc[0]
print(
    'Unique cells from sc-proteomics datasets in the Universe: '
    f'{sc_proteomics_cell_counts}\n')

Atlas datasets: 662

Datasets with extraction site but without cell summary: 5672

Datasets with cell summary but without extraction site: 6395

Datasets with cell summary: 7057

Datasets with neither: 3564

Non-atlas datasets total: 15631

Unique cells from sc-transcriptomics datasets in the Universe: 40645506

Unique cells from sc-transcriptomics datasets in the Universe (preannotated): 40712979

Unique cells from sc-proteomics datasets in the Universe: 16576863



In [16]:
# h5ad files: number of distinct values in 'link_to_h5ad_file'
unique_h5ad_file_count = sankey['link_to_h5ad_file'].nunique()
print(f'Unique h5ad files: {unique_h5ad_file_count}')

Unique h5ad files: 6077


## AS Counts

In [17]:
# Cell types per anatomical structure (AS) report from the atlas-ad-hoc folder
as_report_url = (
    "https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/"
    f"refs/heads/{branch}/output-data/{hra_pop_version}"
    "/reports/atlas-ad-hoc/cell-types-in-anatomical-structurescts-per-as.csv"
)
df_as_data = pd.read_csv(as_report_url)

# Show the loaded frame
df_as_data

Unnamed: 0,organ,as,as_label,sex,tool,modality,cell_id,cell_label,cell_count,cell_percentage,dataset_count
0,large intestine,http://purl.obolibrary.org/obo/UBERON_0001052,rectum,Female,celltypist,sc_transcriptomics,https://purl.org/ccf/ASCTB-TEMP_colonocyte,Colonocyte,1.205,0.147653,3
1,large intestine,http://purl.obolibrary.org/obo/UBERON_0001052,rectum,Female,celltypist,sc_transcriptomics,https://purl.org/ccf/ASCTB-TEMP_iga-plasma-cell,IgA plasma cell,1.182,0.144835,3
2,large intestine,http://purl.obolibrary.org/obo/UBERON_0001052,rectum,Female,celltypist,sc_transcriptomics,https://purl.org/ccf/ASCTB-TEMP_best4-epithelial,BEST4+ epithelial,0.699,0.085651,3
3,large intestine,http://purl.obolibrary.org/obo/UBERON_0001052,rectum,Female,celltypist,sc_transcriptomics,https://purl.org/ccf/ASCTB-TEMP_activated-cd4-t,Activated CD4 T,0.690,0.084548,3
4,large intestine,http://purl.obolibrary.org/obo/UBERON_0001052,rectum,Female,celltypist,sc_transcriptomics,https://purl.org/ccf/ASCTB-TEMP_ta,TA,0.540,0.066168,3
...,...,...,...,...,...,...,...,...,...,...,...
8891,lung,http://purl.org/sig/ont/fma/fma7508,Left posterior basal segmental bronchus,Male,sc_proteomics,sc_proteomics,http://purl.obolibrary.org/obo/CL_0000097,Mast Cell,15322.464,0.024702,1
8892,lung,http://purl.org/sig/ont/fma/fma7508,Left posterior basal segmental bronchus,Male,sc_proteomics,sc_proteomics,http://purl.obolibrary.org/obo/CL_4033039,CD8+ T Cell,3691.176,0.005951,1
8893,lung,http://purl.org/sig/ont/fma/fma7508,Left posterior basal segmental bronchus,Male,sc_proteomics,sc_proteomics,https://purl.org/ccf/ASCTB-TEMP_lymphatic-endo...,Lymphatic Endothelial (and some immune cells),1753.956,0.002828,1
8894,lung,http://purl.org/sig/ont/fma/fma7508,Left posterior basal segmental bronchus,Male,sc_proteomics,sc_proteomics,https://purl.org/ccf/ASCTB-TEMP_basal-epitheli...,Basal Epithelial Cell,970.104,0.001564,1


In [18]:
# unique AS
# NOTE(review): the message says "AS IDs" but the count is taken over 'as_label';
# confirm whether the 'as' IRI column was intended.
unique_as_labels = df_as_data['as_label'].unique()
print(f'Number of unique AS IDs in HRApop {hra_pop_version}: {len(unique_as_labels)}')

Number of unique AS IDs in HRApop v1.0: 73


In [19]:
# Distinct (AS label, sex) combinations
unique_as_by_sex = df_as_data.loc[:, ['as_label', 'sex']].drop_duplicates()
print(f'Number of unique AS in HRApop {hra_pop_version} separated by sex: {len(unique_as_by_sex)}')

Number of unique AS in HRApop v1.0 separated by sex: 112


In [20]:
# Distinct organs among atlas rows whose annotation tool is not sc_proteomics
atlas_non_proteomics_rows = sankey[
    (sankey['is_atlas_dataset'] == True)
    & (sankey['cell_type_annotation_tool'] != 'sc_proteomics')
]
print(f"Number of organs covered by sc-transcriptomics: {atlas_non_proteomics_rows['organ_name'].nunique()}")

Number of organs covered by sc-transcriptomics: 17


In [21]:
# Distinct AS labels among rows whose tool is sc_proteomics
sc_proteomics_as_rows = df_as_data[df_as_data['tool'] == 'sc_proteomics']
print(f"Number of AS covered by sc-proteomics: {sc_proteomics_as_rows['as_label'].nunique()}")

Number of AS covered by sc-proteomics: 16


## Extraction sites

In [22]:
# Extraction-site IRIs for the whole universe (universe-ad-hoc report)
url = (
    'https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/'
    f'refs/heads/{branch}/output-data/{hra_pop_version}'
    '/reports/universe-ad-hoc/extraction-sites.csv'
)

universe_extraction_sites = pd.read_csv(url)
universe_extraction_sites

Unnamed: 0,extraction_site
0,http://purl.org/ccf/1.5/e21afc3c-3a7c-4717-90d...
1,http://purl.org/ccf/1.5/e240c0a4-5e53-4464-832...
2,http://purl.org/ccf/1.5/e43280c9-840b-481f-a8a...
3,http://purl.org/ccf/1.5/e4853294-071d-4bef-bff...
4,http://purl.org/ccf/1.5/e4a44b76-53fd-4c88-9cc...
...,...
1127,http://purl.org/ccf/1.5/bc516774-fcd9-4022-bc5...
1128,http://purl.org/ccf/1.5/c7265539-ebc3-4a1e-893...
1129,http://purl.org/ccf/1.5/c7ed4142-ddad-4cf5-b83...
1130,http://purl.org/ccf/1.5/f2a0eb6b-7729-40ac-baa...


In [23]:
def get_etraction_site_and_mesh_collisions(iri: str, timeout: float = 30):
    """Takes an IRI, gets the extraction site and its mesh collisions.

    The IRI is sent to the ``extraction-site`` endpoint; the JSON document that
    comes back is then POSTed to the ``collisions`` endpoint. Non-OK responses,
    request errors and malformed payloads are reported with ``print`` and leave
    the returned sets as they were (empty unless already filled).

    Args:
        iri (str): IRI for the extraction site
        timeout (float): Seconds to wait on each HTTP request before giving up.
            Defaults to 30.

    Returns:
        tuple[set, set]: ``(representation_of values, organ values)`` gathered
        from the collisions returned for this extraction site.
    """
    api_extraction_site_url = 'https://apps.humanatlas.io/api/v1/extraction-site'
    api_collisions_url = 'https://apps.humanatlas.io/api/v1/collisions'
    headers = {
        "accept": "application/json",
        "content-type": "application/json"
    }

    # initialize result
    result = (set(), set())

    try:
        # Passing the IRI via `params` URL-encodes it, so characters such as
        # '#' or '&' inside the IRI are not mistaken for URL syntax.
        response = requests.get(
            api_extraction_site_url, params={'iri': iri}, timeout=timeout)
        if response.ok:
            print(f'Successfully got extraction site data for {iri}!')
            extraction_site = response.json()

            response = requests.post(
                api_collisions_url, headers=headers, json=extraction_site,
                timeout=timeout)
            if response.ok:
                print(f"Successfully got mesh collisions for {extraction_site['@id']}!")
                mesh = response.json()

                # Build both lists before touching `result`, so a malformed
                # collision entry cannot leave it half-updated.
                mesh_iris = [collision['representation_of'] for collision in mesh]
                organ_iris = [collision['organ'] for collision in mesh]
                result[0].update(mesh_iris)
                result[1].update(organ_iris)
            else:
                print(f"Request failed with status code {response.status_code}")
        else:
            print(f"Request failed with status code {response.status_code}")
    except (requests.exceptions.RequestException, KeyError, TypeError, ValueError) as e:
        # Covers network failures and timeouts as well as payloads that are not
        # JSON or lack the expected keys; one bad site must not abort a batch.
        print(f"An error occurred: {e}")

    print(f"Returning {result}")
    print()
    return result

In [24]:
# Accumulators for every AS IRI and organ seen across all extraction sites
unique_iris = dict(anatomical_structures=set(), organs=set())

# Query the API once per extraction site; each call yields a (set, set) tuple
results = universe_extraction_sites['extraction_site'].apply(
    get_etraction_site_and_mesh_collisions
)

# Merge each per-site pair of sets into the accumulators
for result in results:
    print(f'now working with {result}')
    if not (isinstance(result, tuple) and len(result) == 2):
        continue
    as_set, organ_set = result
    unique_iris['anatomical_structures'].update(as_set)
    unique_iris['organs'].update(organ_set)

# Print results
pprint(unique_iris)

Successfully got extraction site data for http://purl.org/ccf/1.5/e21afc3c-3a7c-4717-90d5-e5739fd8432c!
Successfully got mesh collisions for http://purl.org/ccf/1.5/e21afc3c-3a7c-4717-90d5-e5739fd8432c!
Returning ({'http://purl.org/sig/ont/fma/fma15828'}, {'spleen-male'})

Successfully got extraction site data for http://purl.org/ccf/1.5/e240c0a4-5e53-4464-8320-ad775f2c4bf7!
Successfully got mesh collisions for http://purl.org/ccf/1.5/e240c0a4-5e53-4464-8320-ad775f2c4bf7!
Returning ({'http://purl.obolibrary.org/obo/UBERON_0002115'}, {'small-intestine-female'})

Successfully got extraction site data for http://purl.org/ccf/1.5/e43280c9-840b-481f-a8a6-90114c2c5cc9!
Successfully got mesh collisions for http://purl.org/ccf/1.5/e43280c9-840b-481f-a8a6-90114c2c5cc9!
Returning (set(), set())

Successfully got extraction site data for http://purl.org/ccf/1.5/e4853294-071d-4bef-bff3-2fd52a5c6af0!
Successfully got mesh collisions for http://purl.org/ccf/1.5/e4853294-071d-4bef-bff3-2fd52a5c6af0!


In [25]:
# Sizes of the two accumulated IRI sets
as_covered_count = len(unique_iris["anatomical_structures"])
organs_covered_count = len(unique_iris["organs"])

print(f'Number of AS covered across extraction sites in Universe: {as_covered_count}')

print(f'Number of organs covered across extraction sites in Universe: {organs_covered_count}')

Number of AS covered across extraction sites in Universe: 164
Number of organs covered across extraction sites in Universe: 49


# Random queries

In [26]:
# get ATLAS datasets with donors < 18
donor_is_under_18 = sankey['donor_age'] < 18
row_is_in_atlas = sankey['is_atlas_dataset'] == True
underage = sankey[donor_is_under_18 & row_is_in_atlas]
underage

Unnamed: 0,portal,study_paper,doi,lead_author,is_azimuth_reference,donor_id,donor_sex,donor_age,donor_development_stage,donor_race,...,unique_dataset_id,link_to_h5ad_file,sc_transcriptomics_or_sc_proteomics,cell_type_annotation_tool,omap_id,number_of_cells_total,number_of_unique_cell_types,hubmap_dataset_publication_status,is_rui_registered,is_atlas_dataset
