# A notebook to compile counts for the HRApop paper

# Import libraries

In [2]:
%pip install pandas numpy requests

import pandas as pd
import numpy as np
import requests
import io

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Set global variables

In [3]:
# HRApop release tag; interpolated into the local sankey report path and into
# the printed summaries further down the notebook.
hra_pop_version = "v0.11.1"

# Load data

In [4]:
# Location of the sankey report inside the local hra-pop checkout for the
# configured release
sankey_csv_path = f"../../hra-pop/output-data/{hra_pop_version}/reports/universe-ad-hoc/sankey.csv"
sankey = pd.read_csv(sankey_csv_path)

# Preview the first rows of the report
sankey.head()

  sankey = pd.read_csv(f"../../hra-pop/output-data/{hra_pop_version}/reports/universe-ad-hoc/sankey.csv")


Unnamed: 0,portal,study_paper,doi,lead_author,is_azimuth_reference,donor_id,donor_sex,donor_age,donor_development_stage,donor_race,...,unique_dataset_id,link_to_h5ad_file,sc_transcriptomics_or_sc_proteomics,cell_type_annotation_tool,omap_id,number_of_cells_total,number_of_unique_cell_types,hubmap_dataset_publication_status,is_rui_registered,is_atlas_dataset
0,KPMP,,http://dx.doi.org/10.1681/ASN.2016091027,,,Donor1,Male,,,,...,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,http://dx.doi.org/10.1681/ASN.2016091027,,,,,,,True,False
1,KPMP,,http://dx.doi.org/10.1681/ASN.2016091027,,,Donor2,Male,,,,...,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,http://dx.doi.org/10.1681/ASN.2016091027,,,,,,,True,False
2,KPMP,,http://dx.doi.org/10.1681/ASN.2016091027,,,Donor3,Male,,,,...,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,http://dx.doi.org/10.1016/j.trsl.2017.07.006,,,,,,,True,False
3,HRA,,,,,Donor1,Female,38.0,,,...,http://purl.org/ccf/1.5/omap-1#Donor1_TissueBl...,https://hubmapconsortium.github.io/ccf-release...,,,,,,,True,False
4,"HIRN, ESPACE",,https://doi.org/10.5281/zenodo.7742474,,,Donor1,Male,,,,...,http://purl.org/ccf/1.5/omap-6#Donor1_TissueBl...,https://zenodo.org/record/7742474,,,,,,,True,False


In [5]:
# Unique-cell reports published with the HRApop release.
# The release tag is taken from `hra_pop_version` rather than being repeated in
# each URL, so these downloads always match the release used for the sankey
# report.
universe_reports_url = (
    "https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/refs/heads/main/"
    f"output-data/{hra_pop_version}/reports/universe-ad-hoc"
)

universe_sc_transcriptomics_cell_counts = pd.read_csv(
    f"{universe_reports_url}/universe-sc-transcriptomics-cell-counts.csv", index_col=False)
universe_sc_proteomics_cell_counts = pd.read_csv(
    f"{universe_reports_url}/universe-sc-proteomics-cell-counts.csv", index_col=False)
universe_sc_transcriptomics_cell_instance_counts = pd.read_csv(
    f"{universe_reports_url}/universe-sc-transcriptomics-cell-instance-counts.csv", index_col=False)

# Pre-processing steps

## Simplify rows with multiple annotations for getting accurate counts

In [6]:
# Single label applied to every sc-transcriptomics annotation tool
tool_replacement = "sc_transcriptomics with Cell Summary"

# azimuth, celltypist and popv all collapse to the shared label; missing tool
# values are labelled "No Cell Summary"
annotation_tool_labels = dict.fromkeys(
    ('azimuth', 'celltypist', 'popv'), tool_replacement)
annotation_tool_labels[np.nan] = "No Cell Summary"

sankey['cell_type_annotation_tool'] = sankey['cell_type_annotation_tool'].replace(
    annotation_tool_labels)

## Manually fix missing cell type annotation values for SenNet atlas datasets

See GitHub issue: https://github.com/x-atlas-consortia/hra-pop/issues/91

In [7]:
# Boolean mask: rows from the SenNet portal that are flagged as atlas datasets
criteria = sankey['portal'].eq("SenNet") & sankey['is_atlas_dataset'].eq(True)

# Label the matching rows as having a cell summary
# (2 such datasets as of HRApop v0.10.2)
sankey.loc[criteria, 'cell_type_annotation_tool'] = tool_replacement

# Remove rows that are exact duplicates of an earlier row
sankey = sankey.drop_duplicates()

# Get counts for HRApop paper

The following sections provide counts of datasets and other metrics for HRApop v0.11.1 (the release set in `hra_pop_version`).

## Report numbers for Highlights

In [8]:
# Distinct dataset IDs in the sankey report (all datasets downloaded and
# retrieved from extraction sites)
all_datasets = sankey.loc[:, 'unique_dataset_id'].unique()

universe_dataset_total = len(all_datasets)
print(f"Number of UNIVERSE datasets: {universe_dataset_total}")

Number of UNIVERSE datasets: 11817


In [9]:
# Rows whose annotation tool is sc_proteomics, reduced to distinct
# (dataset_id, tool) pairs.
# NOTE(review): this de-duplicates on `dataset_id` while the neighbouring
# counts use `unique_dataset_id` — confirm that is intended.
is_sc_proteomics = sankey['cell_type_annotation_tool'] == "sc_proteomics"
all_sc_proteomics = sankey.loc[
    is_sc_proteomics, ['dataset_id', 'cell_type_annotation_tool']
].drop_duplicates()

sc_proteomics_dataset_total = len(all_sc_proteomics)
print(f"Number of sc-proteomics datasets: {sc_proteomics_dataset_total}")

Number of sc-proteomics datasets: 107


In [10]:
# Distinct dataset IDs whose annotation tool carries the shared
# "with Cell Summary" label
has_cell_summary = sankey['cell_type_annotation_tool'] == tool_replacement
sc_transcriptomics_with_cell_summary = sankey.loc[
    has_cell_summary, 'unique_dataset_id'].drop_duplicates()

cell_summary_dataset_total = len(sc_transcriptomics_with_cell_summary)
print(f"Number of sc-transcriptomics datasets with cell summary: {cell_summary_dataset_total}")

Number of sc-transcriptomics datasets with cell summary: 5587


## Report numbers for Sankey/experimental data

In [11]:
# Boolean masks shared by the counts in this cell
is_atlas = sankey['is_atlas_dataset'] == True
is_rui_registered = sankey['is_rui_registered'] == True
is_not_rui_registered = sankey['is_rui_registered'] == False
lacks_cell_summary = sankey['cell_type_annotation_tool'] == "No Cell Summary"
has_cell_summary = sankey['cell_type_annotation_tool'] != "No Cell Summary"

# atlas datasets
atlas = sankey.loc[is_atlas]['unique_dataset_id'].unique()
print(f"Atlas datasets: {len(atlas)}\n")

# datasets with extraction site but without cell summary
no_cell_summary = sankey.loc[
    is_rui_registered & lacks_cell_summary]['unique_dataset_id'].unique()
print(f"Datasets with extraction site but without cell summary: {len(no_cell_summary)}\n")

# datasets with cell summary but without extraction site
# `no_rui` stays a DataFrame; the reported figure counts distinct dataset IDs
# so it is computed the same way as the other dataset counts in this cell.
no_rui = sankey.loc[is_not_rui_registered & has_cell_summary]
no_rui_dataset_total = len(no_rui['unique_dataset_id'].unique())
print(f"Datasets with cell summary but without extraction site: {no_rui_dataset_total}\n")

# datasets with neither
# Also reported as a count of distinct dataset IDs rather than of rows.
non_atlas_without_either = sankey.loc[lacks_cell_summary & is_not_rui_registered]
neither_dataset_total = len(non_atlas_without_either['unique_dataset_id'].unique())
print(f"Datasets with neither: {neither_dataset_total}\n")

# non-atlas datasets total
non_atlas_total = sankey.loc[(
    sankey['is_atlas_dataset'] == False)]['unique_dataset_id'].unique()
print(f"Non-atlas datasets total: {len(non_atlas_total)}\n")

# unique cells (first row of each single-purpose report)
sc_transcriptomics_cell_counts = universe_sc_transcriptomics_cell_counts[
    'universe_sc_transcriptomics_cell_count'].iloc[0]
print(
    f'Unique cells from sc-transcriptomics datasets in the Universe: {sc_transcriptomics_cell_counts}\n')

sc_transcriptomics_cell_counts_preannotated = universe_sc_transcriptomics_cell_counts[
    'universe_sc_transcriptomics_preannotated_cell_count'].iloc[0]
print(
    f'Unique cells from sc-transcriptomics datasets in the Universe (preannotated): {sc_transcriptomics_cell_counts_preannotated}\n')

sc_proteomics_cell_counts = universe_sc_proteomics_cell_counts[
    'universe_sc_proteomics_cell_count'].iloc[0]
print(
    f'Unique cells from sc-proteomics datasets in the Universe: {sc_proteomics_cell_counts}\n')

# Total cells across atlas datasets.
# The previous expression indexed the frame with `'is_atlas_dataset' == True`
# (a str compared with a bool, i.e. the constant False) and then with an empty
# column name, so it raised a KeyError instead of selecting rows.
# NOTE(review): assumes `number_of_cells_total` is the intended column and that
# it holds a per-dataset total, so one row per `unique_dataset_id` is summed —
# confirm before reporting this figure.
atlas_cell_count = (
    sankey.loc[is_atlas]
    .drop_duplicates(subset='unique_dataset_id')['number_of_cells_total']
    .sum()
)

Atlas datasets: 619

Datasets with extraction site but without cell summary: 3799

Datasets with cell summary but without extraction site: 5075

Datasets with neither: 2324

Non-atlas datasets total: 11198

Unique cells from sc-transcriptomics datasets in the Universe: 33996049

Unique cells from sc-transcriptomics datasets in the Universe (preannotated): 34066061

Unique cells from sc-proteomics datasets in the Universe: 17547511



## AS Counts

In [12]:
# Fetch the per-anatomical-structure cell type table from grlc as CSV.
# A timeout keeps the cell from hanging indefinitely if the endpoint stalls.
as_response = requests.get(
    'https://grlc.io/api-git/hubmapconsortium/ccf-grlc/subdir/hra-pop/cell_types_in_anatomical_structurescts_per_as',
    headers={
      'Accept': 'text/csv'
    },
    timeout=300,
)
# Stop here on an HTTP error rather than parsing an error page as CSV
as_response.raise_for_status()
as_data = as_response.text

# Read the CSV data
df_as_data_grlc = pd.read_csv(io.StringIO(as_data))

# Display the DataFrame
df_as_data_grlc

Unnamed: 0,organ,as,as_label,sex,tool,modality,cell_id,cell_label,cell_count,cell_percentage
0,Spatial entity of male colon,http://purl.obolibrary.org/obo/UBERON_0001153,caecum,Female,celltypist,sc_transcriptomics,https://purl.org/ccf/ASCTB-TEMP_smc-plpp2-,SMC (PLPP2+),92.848,0.114140
1,Spatial entity of female colon,http://purl.obolibrary.org/obo/UBERON_0001153,caecum,Female,celltypist,sc_transcriptomics,https://purl.org/ccf/ASCTB-TEMP_smc-plpp2-,SMC (PLPP2+),92.848,0.114140
2,Spatial entity of male colon,http://purl.obolibrary.org/obo/UBERON_0001153,caecum,Female,celltypist,sc_transcriptomics,https://purl.org/ccf/ASCTB-TEMP_cycling-plasma...,Cycling plasma cell,63.448,0.077998
3,Spatial entity of female colon,http://purl.obolibrary.org/obo/UBERON_0001153,caecum,Female,celltypist,sc_transcriptomics,https://purl.org/ccf/ASCTB-TEMP_cycling-plasma...,Cycling plasma cell,63.448,0.077998
4,Spatial entity of male colon,http://purl.obolibrary.org/obo/UBERON_0001153,caecum,Female,celltypist,sc_transcriptomics,https://purl.org/ccf/ASCTB-TEMP_mesoderm-1-hand1-,Mesoderm 1 (HAND1+),57.400,0.070563
...,...,...,...,...,...,...,...,...,...,...
8015,Spatial entity of male male reproductive system,http://purl.obolibrary.org/obo/UBERON_8410027,central zone of prostate,Male,popv,sc_transcriptomics,http://purl.obolibrary.org/obo/CL_0000625,"CD8-positive, alpha-beta T cell",52.962,0.018737
8016,Spatial entity of male male reproductive system,http://purl.obolibrary.org/obo/UBERON_8410027,central zone of prostate,Male,popv,sc_transcriptomics,http://purl.obolibrary.org/obo/CL_0000576,monocyte,52.052,0.018415
8017,Spatial entity of male male reproductive system,http://purl.obolibrary.org/obo/UBERON_8410027,central zone of prostate,Male,popv,sc_transcriptomics,http://purl.obolibrary.org/obo/CL_0000235,macrophage,34.762,0.012298
8018,Spatial entity of male male reproductive system,http://purl.obolibrary.org/obo/UBERON_8410027,central zone of prostate,Male,popv,sc_transcriptomics,http://purl.obolibrary.org/obo/CL_0000066,epithelial cell,11.830,0.004185


In [13]:
# Number of distinct anatomical-structure labels in the grlc table.
# The count is computed first so the f-string contains no nested quotes of the
# same type, which are a syntax error before Python 3.12.
# NOTE(review): the message says "AS IDs" but the count is over `as_label`,
# not the `as` column — confirm which is intended.
grlc_as_count = len(df_as_data_grlc['as_label'].unique())
print(f'Number of unique AS IDs in HRApop {hra_pop_version} on grlc: {grlc_as_count}')

Number of unique AS IDs in HRApop v0.11.1 on grlc: 28


In [14]:
# load Table S3 for the configured release.
# The release tag comes from `hra_pop_version` instead of being hard-coded in
# the URL; the timeout keeps the cell from hanging if the download stalls.
table_s3_response = requests.get(
    'https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/refs/heads/main/'
    f'output-data/{hra_pop_version}/reports/atlas/table-s3.csv',
    timeout=60,
)
# Stop here on an HTTP error rather than parsing an error page as CSV
table_s3_response.raise_for_status()
table_s3 = table_s3_response.text

# Read the CSV data
df_as_data_reports = pd.read_csv(io.StringIO(table_s3))

# Display the DataFrame
df_as_data_reports

Unnamed: 0,organ,as_label,cell_label,cell_id,mean_cell_count,cell_count
0,VHFHeart,Posteromedial head of posterior papillary musc...,Adip1,http://purl.obolibrary.org/obo/CL_0000136,33.000000,792
1,VHFHeart,Posteromedial head of posterior papillary musc...,Adip2,http://purl.obolibrary.org/obo/CL_0000136,20.500000,246
2,VHFHeart,Posteromedial head of posterior papillary musc...,Adipocyte,http://purl.obolibrary.org/obo/CL_0000136,20.166667,726
3,VHFHeart,Posteromedial head of posterior papillary musc...,Arterial Endothelial,http://purl.obolibrary.org/obo/CL_1000413,84.333333,3036
4,VHFHeart,Posteromedial head of posterior papillary musc...,Atrial Cardiomyocyte,http://purl.obolibrary.org/obo/CL_0002129,4.000000,72
...,...,...,...,...,...,...
6689,VHMUrinaryBladder,trigone of urinary bladder,myofibroblast cell,http://purl.obolibrary.org/obo/CL_0000186,338.000000,1014
6690,VHMUrinaryBladder,trigone of urinary bladder,pericyte,http://purl.obolibrary.org/obo/CL_0000669,199.500000,798
6691,VHMUrinaryBladder,trigone of urinary bladder,plasma cell,http://purl.obolibrary.org/obo/CL_0000786,1.333333,4
6692,VHMUrinaryBladder,trigone of urinary bladder,smooth muscle cell,http://purl.obolibrary.org/obo/CL_0000192,11714.750000,46859


In [15]:
# Number of distinct anatomical-structure labels in Table S3.
# The count is computed first so the f-string contains no nested quotes of the
# same type, which are a syntax error before Python 3.12.
# NOTE(review): the message says "AS IDs" but the count is over `as_label` —
# confirm the wording.
reports_as_count = len(df_as_data_reports['as_label'].unique())
print(
    f'Number of unique AS IDs in HRApop {hra_pop_version} in reports: {reports_as_count}')

Number of unique AS IDs in HRApop v0.11.1 in reports: 58
