# A notebook to compile counts for the HRApop paper

# Import libraries

In [1]:
%pip install pandas numpy requests 

import pandas as pd
import numpy as np
import requests
import io
from pprint import pprint

Note: you may need to restart the kernel to use updated packages.


# Set global variables

In [2]:
hra_pop_version = "v1.0"
branch = 'main'

# Load data

In [3]:
sankey = pd.read_csv(
    f"https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/refs/heads/{branch}/output-data/{hra_pop_version}/reports/universe-ad-hoc/sankey.csv")

sankey

  sankey = pd.read_csv(


Unnamed: 0,portal,study_paper,doi,lead_author,is_azimuth_reference,donor_id,donor_sex,donor_age,donor_development_stage,donor_race,...,unique_dataset_id,link_to_h5ad_file,sc_transcriptomics_or_sc_proteomics,cell_type_annotation_tool,omap_id,number_of_cells_total,number_of_unique_cell_types,hubmap_dataset_publication_status,is_rui_registered,is_atlas_dataset
0,HCA,,,,,TSP27,Female,56.0,,,...,hhttps://api.cellxgene.cziscience.com/dp/v1/co...,https://cellxgene.cziscience.com/e/a357414d-20...,,,,,,,True,False
1,KPMP,,http://dx.doi.org/10.1681/ASN.2016091027,,,Donor1,Male,,,,...,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,http://dx.doi.org/10.1681/ASN.2016091027,,,,,,,True,False
2,KPMP,,http://dx.doi.org/10.1681/ASN.2016091027,,,Donor2,Male,,,,...,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,http://dx.doi.org/10.1681/ASN.2016091027,,,,,,,True,False
3,KPMP,,http://dx.doi.org/10.1681/ASN.2016091027,,,Donor3,Male,,,,...,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,http://dx.doi.org/10.1016/j.trsl.2017.07.006,,,,,,,True,False
4,HRA,,,,,Donor1,Female,38.0,,,...,http://purl.org/ccf/1.5/omap-1#Donor1_TissueBl...,https://hubmapconsortium.github.io/ccf-release...,,,,,,,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22189,SPARC,,https://sparc.science/datasets/390/version/1?d...,,,Donor1,Female,49.0,,,...,https://sparc.science/datasets/390/version/1?d...,https://sparc.science/datasets/390/version/1?d...,,,,,,,True,False
22190,SPARC,,https://sparc.science/datasets/390/version/1?d...,,,Donor1,Female,49.0,,,...,https://sparc.science/datasets/390/version/1?d...,https://sparc.science/datasets/390/version/1?d...,,,,,,,True,False
22191,SPARC,,https://sparc.science/datasets/390/version/1?d...,,,Donor1,Female,49.0,,,...,https://sparc.science/datasets/390/version/1?d...,https://sparc.science/datasets/390/version/1?d...,,,,,,,True,False
22192,KPMP,,https://www.nature.com/articles/s41467-023-389...,,,Donor1,Male,,,,...,https://zenodo.org/records/7653239#Donor1_Tiss...,https://zenodo.org/records/7653239,,,,,,,True,False


In [4]:
# unique cells
universe_sc_transcriptomics_cell_counts = pd.read_csv(
    f'https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/refs/heads/{branch}/output-data/{hra_pop_version}/reports/universe-ad-hoc/universe-sc-transcriptomics-cell-counts.csv', index_col=False)
universe_sc_proteomics_cell_counts = pd.read_csv(
    f'https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/refs/heads/{branch}/output-data/{hra_pop_version}/reports/universe-ad-hoc/universe-sc-proteomics-cell-counts.csv', index_col=False)
universe_sc_transcriptomics_cell_instance_counts = pd.read_csv(
    f'https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/refs/heads/{branch}/output-data/{hra_pop_version}/reports/universe-ad-hoc/universe-sc-transcriptomics-cell-instance-counts.csv', index_col=False)

# Pre-processing steps

## Simplify rows with multiple annotations for getting accurate counts

In [5]:
tool_replacement = "sc_transcriptomics with Cell Summary"

sankey['cell_type_annotation_tool'] = sankey['cell_type_annotation_tool'].replace({
    'azimuth': tool_replacement,
    'celltypist': tool_replacement,
    'popv': tool_replacement,
    np.nan: "No Cell Summary"
})

## Manually fix missing cell type annotation values for SenNet atlas datasets

See GitHub issue: https://github.com/x-atlas-consortia/hra-pop/issues/91

In [6]:
# Define the indexing criteria
criteria = (sankey['portal'] == "SenNet") & (
    sankey['is_atlas_dataset'] == True)

# Apply the change to the SenNet atlas datasets (2 as of HRApop v0.10.2)
sankey.loc[criteria, 'cell_type_annotation_tool'] = tool_replacement

sankey = sankey.drop_duplicates()

# Get counts for HRApop paper

The following sections provide counts of datasets and other metrics for HRApop v0.10.2.

## Report numbers for Highlights

In [7]:
# All datasets downloaded and retrieved from extraction sites
all_datasets = sankey['unique_dataset_id'].unique()

print(f"Number of UNIVERSE datasets: {len(all_datasets)}")

Number of UNIVERSE datasets: 16293


In [8]:
# All sc-proteomics
all_sc_proteomics = sankey[['dataset_id', 'cell_type_annotation_tool']
                           ].loc[sankey['cell_type_annotation_tool'] == "sc_proteomics"].drop_duplicates()

print(f"Number of sc-proteomics datasets: {len(all_sc_proteomics)}")

Number of sc-proteomics datasets: 104


In [9]:
print(
    f'Number of sc-transcriptomics datasets: {len(sankey[(sankey['is_atlas_dataset'] == True) & (sankey['cell_type_annotation_tool'] != 'sc_proteomics')].drop_duplicates(subset=['unique_dataset_id']))}')

Number of sc-transcriptomics datasets: 558


In [10]:
# All datasets with cell summary
sc_transcriptomics_with_cell_summary = sankey[['unique_dataset_id', 'cell_type_annotation_tool']].loc[
    sankey['cell_type_annotation_tool'] == tool_replacement]['unique_dataset_id'].drop_duplicates()

print(f"Number of sc-transcriptomics datasets with cell summary: {len(sc_transcriptomics_with_cell_summary)}")

Number of sc-transcriptomics datasets with cell summary: 6953


In [11]:
# Organs in HRApop Atlas
organs_in_hra_pop = sankey.loc[sankey['is_atlas_dataset']
                                == True]['organ_name'].unique()
print(
    f"Unique organs in HRApop Atlas: {len(organs_in_hra_pop)}")

Unique organs in HRApop Atlas: 17


In [12]:
# Organs (m/f) in HRApop Atlas
organs_in_hra_pop_sex = sankey.loc[sankey['is_atlas_dataset']
                               == True]['organ_name_glb_file'].unique()
print(
    f"Organs (m/f) in HRApop Atlas: {len(organs_in_hra_pop_sex)}")

Organs (m/f) in HRApop Atlas: 31


In [13]:
# Volume covered by HRApop tissue blocks
volume = sankey.loc[sankey['is_atlas_dataset']== True].drop_duplicates(subset=['unique_dataset_id'])['tissue_block_volume'].sum()
print(
    f"Volume covered by HRApop tissue blocks: {volume}")

Volume covered by HRApop tissue blocks: 12052677.858


## Report numbers for Sankey/experimental data

In [14]:
# atlas datasets
atlas = sankey.loc[sankey['is_atlas_dataset'] == True]['unique_dataset_id'].unique()
print(f"Atlas datasets: {len(atlas)}\n")

# datasets with extraction site but without cell summary
no_cell_summary = sankey.loc[(sankey['is_rui_registered'] == True) & (
    sankey['cell_type_annotation_tool'] == "No Cell Summary")]['unique_dataset_id'].unique()
print(f"Datasets with extraction site but without cell summary: {
      len(no_cell_summary)}\n")

# datasets with cell summary but without extraction site
no_rui = sankey.loc[(sankey['is_rui_registered'] ==False) & (
    sankey['cell_type_annotation_tool'] != "No Cell Summary")]
print(f"Datasets with cell summary but without extraction site: {
      len(no_rui)}\n")

# datasets with cell summary 
cell_summary = sankey.loc[sankey['cell_type_annotation_tool'] != "No Cell Summary"]
print(f"Datasets with cell summary: {
      len(cell_summary)}\n")

# datasets with neither
non_atlas_without_either = sankey.loc[(sankey['cell_type_annotation_tool'] == "No Cell Summary") & (sankey['is_rui_registered'] == False)]
print(f"Datasets with neither: {len(non_atlas_without_either)}\n")

# non-atlas datasets total
non_atlas_total = sankey.loc[(
    sankey['is_atlas_dataset'] == False)]['unique_dataset_id'].unique()
print(f"Non-atlas datasets total: {len(non_atlas_total)}\n")

# unique cells
sc_transcriptomics_cell_counts = universe_sc_transcriptomics_cell_counts[
    'universe_sc_transcriptomics_cell_count'].iloc[0]
print(
    f'Unique cells from sc-transcriptomics datasets in the Universe: {sc_transcriptomics_cell_counts}\n')

sc_transcriptomics_cell_counts_preannotated = universe_sc_transcriptomics_cell_counts[
    'universe_sc_transcriptomics_preannotated_cell_count'].iloc[0]
print(
    f'Unique cells from sc-transcriptomics datasets in the Universe (preannotated): {sc_transcriptomics_cell_counts_preannotated}\n')

sc_proteomics_cell_counts = universe_sc_proteomics_cell_counts[
    'universe_sc_proteomics_cell_count'].iloc[0]
print(
    f'Unique cells from sc-proteomics datasets in the Universe: {sc_proteomics_cell_counts}\n')

Atlas datasets: 662

Datasets with extraction site but without cell summary: 5672

Datasets with cell summary but without extraction site: 6395

Datasets with cell summary: 7057

Datasets with neither: 3564

Non-atlas datasets total: 15631

Unique cells from sc-transcriptomics datasets in the Universe: 40645506

Unique cells from sc-transcriptomics datasets in the Universe (preannotated): 40712979

Unique cells from sc-proteomics datasets in the Universe: 16576863



In [15]:
# h5ad files
print(f'Unique h5ad files: {sankey['link_to_h5ad_file'].nunique()}')

Unique h5ad files: 6077


## AS Counts

In [16]:
# Read the CSV data
df_as_data = pd.read_csv(
    f'https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/refs/heads/{branch}/output-data/{hra_pop_version}/reports/atlas-ad-hoc/cell-types-in-anatomical-structurescts-per-as.csv')

# Display the DataFrame
df_as_data

Unnamed: 0,organ,as,as_label,sex,tool,modality,cell_id,cell_label,cell_count,cell_percentage,dataset_count
0,large intestine,http://purl.obolibrary.org/obo/UBERON_0001052,rectum,Female,celltypist,sc_transcriptomics,https://purl.org/ccf/ASCTB-TEMP_colonocyte,Colonocyte,1.205,0.147653,3
1,large intestine,http://purl.obolibrary.org/obo/UBERON_0001052,rectum,Female,celltypist,sc_transcriptomics,https://purl.org/ccf/ASCTB-TEMP_iga-plasma-cell,IgA plasma cell,1.182,0.144835,3
2,large intestine,http://purl.obolibrary.org/obo/UBERON_0001052,rectum,Female,celltypist,sc_transcriptomics,https://purl.org/ccf/ASCTB-TEMP_best4-epithelial,BEST4+ epithelial,0.699,0.085651,3
3,large intestine,http://purl.obolibrary.org/obo/UBERON_0001052,rectum,Female,celltypist,sc_transcriptomics,https://purl.org/ccf/ASCTB-TEMP_activated-cd4-t,Activated CD4 T,0.690,0.084548,3
4,large intestine,http://purl.obolibrary.org/obo/UBERON_0001052,rectum,Female,celltypist,sc_transcriptomics,https://purl.org/ccf/ASCTB-TEMP_ta,TA,0.540,0.066168,3
...,...,...,...,...,...,...,...,...,...,...,...
8891,lung,http://purl.org/sig/ont/fma/fma7508,Left posterior basal segmental bronchus,Male,sc_proteomics,sc_proteomics,http://purl.obolibrary.org/obo/CL_0000097,Mast Cell,15322.464,0.024702,1
8892,lung,http://purl.org/sig/ont/fma/fma7508,Left posterior basal segmental bronchus,Male,sc_proteomics,sc_proteomics,http://purl.obolibrary.org/obo/CL_4033039,CD8+ T Cell,3691.176,0.005951,1
8893,lung,http://purl.org/sig/ont/fma/fma7508,Left posterior basal segmental bronchus,Male,sc_proteomics,sc_proteomics,https://purl.org/ccf/ASCTB-TEMP_lymphatic-endo...,Lymphatic Endothelial (and some immune cells),1753.956,0.002828,1
8894,lung,http://purl.org/sig/ont/fma/fma7508,Left posterior basal segmental bronchus,Male,sc_proteomics,sc_proteomics,https://purl.org/ccf/ASCTB-TEMP_basal-epitheli...,Basal Epithelial Cell,970.104,0.001564,1


In [17]:
# unique AS
print(f'Number of unique AS IDs in HRApop {hra_pop_version}: {len(df_as_data['as_label'].unique())}')

Number of unique AS IDs in HRApop v1.0: 73


In [18]:
unique_as_by_sex = df_as_data[['as_label', 'sex']].drop_duplicates()
print(
    f'Number of unique AS in HRApop {hra_pop_version} separated by sex: {len(unique_as_by_sex)}')

Number of unique AS in HRApop v1.0 separated by sex: 112


In [19]:
print(f'Number of organs covered by sc-transcriptomics: {sankey[(sankey['is_atlas_dataset'] == True) & (sankey['cell_type_annotation_tool'] != 'sc_proteomics')]['organ_name'].nunique()}')

Number of organs covered by sc-transcriptomics: 17


In [20]:
print(f'Number of AS covered by sc-proteomics: {df_as_data[df_as_data['tool'] == 'sc_proteomics']['as_label'].nunique()}')

Number of AS covered by sc-proteomics: 16


## Get counts for HRA 10th release

In [21]:
sankey.groupby(['organ_name', 'is_atlas_dataset'])[
    'unique_dataset_id'].nunique().reset_index()

Unnamed: 0,organ_name,is_atlas_dataset,unique_dataset_id
0,Left knee,False,4
1,Right knee,False,22
2,Set of lactiferous glands in left breast,True,2
3,Set of lactiferous glands in right breast,False,4
4,Set of lactiferous glands in right breast,True,3
5,blood vasculature,False,2
6,brain,False,41
7,heart,False,427
8,heart,True,111
9,large intestine,False,410


## Crosswalks

In [22]:
crosswalk_azimuth = pd.read_csv(
    'https://cdn.humanatlas.io/digital-objects/ctann/azimuth/v1.2/assets/azimuth-crosswalk.csv', skiprows=10)
crosswalk_azimuth

Unnamed: 0,Organ_Level,Organ_ID,Annotation_Label,Annotation_Label_ID,CL_Label,CL_ID,CL_Match
0,Heart_L2,UBERON:0000948,Adipocyte,AZ:0000001,adipocyte,CL:0000136,skos:exactMatch
1,Heart_L2,UBERON:0000948,Arterial Endothelial,AZ:0000002,endothelial cell of artery,CL:1000413,skos:exactMatch
2,Heart_L2,UBERON:0000948,Atrial Cardiomyocyte,AZ:0000003,regular atrial cardiac myocyte,CL:0002129,skos:exactMatch
3,Heart_L2,UBERON:0000948,B,AZ:0000004,B cell,CL:0000236,skos:exactMatch
4,Heart_L2,UBERON:0000948,Capillary Endothelial,AZ:0000005,capillary endothelial cell,CL:0002144,skos:exactMatch
...,...,...,...,...,...,...,...
759,Kidney,UBERON:0002113,Peritubular Capilary Endothelial,,peritubular capillary endothelial cell,CL:1001033,skos:exactMatch
760,Bone_marrow,UBERON:0002371,CD8 Effector_1,,"effector CD8-positive, alpha-beta T cell:1",CL:0001050,skos:narrowMatch
761,Bone_marrow,UBERON:0002371,CD8 Effector_2,,"effector CD8-positive, alpha-beta T cell:2",CL:0001050,skos:narrowMatch
762,Bone_marrow,UBERON:0002371,CD8 Effector_3,,"effector CD8-positive, alpha-beta T cell:3",CL:0001050,skos:narrowMatch


In [23]:
crosswalk_celltypist = pd.read_csv(
    'https://cdn.humanatlas.io/digital-objects/ctann/celltypist/v1.1/assets/celltypist-crosswalk.csv', skiprows=10)
crosswalk_celltypist

Unnamed: 0,Organ_Level,Organ_ID,Annotation_Label,Annotation_Label_ID,CL_Label,CL_ID,CL_Match
0,blood_L1,UBERON:0000178,Age-associated B cells,CT:0000001,B cell:age-associated,CL:0000236,skos:narrowMatch
1,blood_L1,UBERON:0000178,C1 non-classical monocytes,CT:0000002,non-classical monocyte:C1,CL:0000875,skos:narrowMatch
2,blood_L1,UBERON:0000178,CD16+ NK cells,CT:0000003,"CD16-positive, CD56-dim natural killer cell, h...",CL:0000939,skos:exactMatch
3,blood_L1,UBERON:0000178,CD16- NK cells,CT:0000004,"CD16-negative, CD56-bright natural killer cell...",CL:0000938,skos:exactMatch
4,blood_L1,UBERON:0000178,Classical monocytes,CT:0000005,classical monocyte,CL:0000860,skos:exactMatch
...,...,...,...,...,...,...,...
888,Small_Intestine,UBERON:0002108,myofibroblast,,myofibroblast cell,CL:0000186,skos:exactMatch
889,Small_Intestine,UBERON:0002108,myofibroblast (RSPO2+),,myofibroblast cell:RSPO2+,CL:0000186,skos:narrowMatch
890,Small_Intestine,UBERON:0002108,pDC,,plasmacytoid dendritic cell,CL:0000784,skos:exactMatch
891,Small_Intestine,UBERON:0002108,venous capillary,,pre-venule capillary cell,CL:4047030,skos:exactMatch


In [24]:
crosswalk_popv = pd.read_csv(
    'https://cdn.humanatlas.io/digital-objects/ctann/popv/v1.2/assets/popv-crosswalk.csv', skiprows=10)
crosswalk_popv

Unnamed: 0,Organ_Level,Organ_ID,Annotation_Label,Annotation_Label_ID,CL_Label,CL_ID,CL_Match
0,blood,UBERON:0000178,CD141-positive myeloid dendritic cell,PV:0000001,CD141-positive myeloid dendritic cell,CL:0002394,skos:exactMatch
1,blood,UBERON:0000178,"CD4-positive, alpha-beta memory T cell",PV:0000002,"CD4-positive, alpha-beta memory T cell",CL:0000897,skos:exactMatch
2,blood,UBERON:0000178,"CD8-positive, alpha-beta T cell",PV:0000003,"CD8-positive, alpha-beta T cell",CL:0000625,skos:exactMatch
3,blood,UBERON:0000178,"CD8-positive, alpha-beta cytokine secreting ef...",PV:0000004,"CD8-positive, alpha-beta cytokine secreting ef...",CL:0000908,skos:exactMatch
4,blood,UBERON:0000178,T cell,PV:0000005,T cell,CL:0000084,skos:exactMatch
...,...,...,...,...,...,...,...
442,prostate gland,UBERON:0002367,bronchial epithelial cell,,epithelial cell,CL:0000066,skos:narrowMatch
443,thymus,UBERON:0002370,"CD4-positive, CD25-positive, alpha-beta regula...",,"CD4-positive, CD25-positive, alpha-beta regula...",CL:0000792,skos:exactMatch
444,thymus,UBERON:0002370,"CD4-positive, alpha-beta T cell",,"CD4-positive, alpha-beta T cell",CL:0000624,skos:exactMatch
445,bone marrow,UBERON:0002371,"B cell, CD19-positive",,"B cell, CD19-positive",CL:0001201,skos:exactMatch


In [25]:
crosswalk_vccf = pd.read_csv(
    'https://cdn.humanatlas.io/digital-objects/ctann/vccf/v1.0/assets/vccf-crosswalk.csv', skiprows=10)
crosswalk_vccf

Unnamed: 0,Organ_Level,Organ_ID,Annotation_Label,Annotation_Label_ID,CL_Label,CL_ID,CL_Match
0,bonemarrow-codex-chop_L1,UBERON:0002371,VCCF:0000001,mesenchymal cell,mesenchymal cell,CL:0008019,skos:exactMatch
1,bonemarrow-codex-chop_L1,UBERON:0002371,VCCF:0000002,unknown cell,cell:unknown,CL:0000000,skos:narrowMatch
2,bonemarrow-codex-chop_L1,UBERON:0002371,VCCF:0000003,immune cell,leukocyte,CL:0000738,skos:exactMatch
3,bonemarrow-codex-chop_L1,UBERON:0002371,VCCF:0000004,hematopoeitic precursor cell,hematopoietic precursor cell,CL:0008001,skos:exactMatch
4,bonemarrow-codex-chop_L1,UBERON:0002371,VCCF:0000005,endothelial cell,endothelial cell,CL:0000115,skos:exactMatch
...,...,...,...,...,...,...,...
497,tonsil-codex-stanford_L3,UBERON:0002372,VCCF:0000498,squamous epithelial cell,squamous epithelial cell,CL:0000076,skos:exactMatch
498,tonsil-codex-stanford_L3,UBERON:0002372,VCCF:0000499,stromal cell,stromal cell,CL:0000499,skos:exactMatch
499,tonsil-codex-stanford_L3,UBERON:0002372,VCCF:0000500,t cell,T cell,CL:0000084,skos:exactMatch
500,,,VCCF:0000501,endothelial cell,endothelial cell,CL:0000115,skos:exactMatch


In [26]:
# extract CL IDs and labels with tool
extract = ['Organ_ID','Annotation_Label','Annotation_Label_ID','CL_ID', 'CL_Label', 'CL_Match']

# Extract the columns from each DataFrame
az_selected = crosswalk_azimuth[extract].assign(tool='azimuth')
ct_selected = crosswalk_celltypist[extract].assign(tool='celltypist')
popv_selected = crosswalk_popv[extract].assign(tool='popv')
vccf_selected = crosswalk_vccf[extract].assign(tool='vccf')

# Concatenate them into one DataFrame
df_crosswalks_combined = pd.concat(
    [az_selected, ct_selected, popv_selected, vccf_selected], ignore_index=True)

df_crosswalks_combined

Unnamed: 0,Organ_ID,Annotation_Label,Annotation_Label_ID,CL_ID,CL_Label,CL_Match,tool
0,UBERON:0000948,Adipocyte,AZ:0000001,CL:0000136,adipocyte,skos:exactMatch,azimuth
1,UBERON:0000948,Arterial Endothelial,AZ:0000002,CL:1000413,endothelial cell of artery,skos:exactMatch,azimuth
2,UBERON:0000948,Atrial Cardiomyocyte,AZ:0000003,CL:0002129,regular atrial cardiac myocyte,skos:exactMatch,azimuth
3,UBERON:0000948,B,AZ:0000004,CL:0000236,B cell,skos:exactMatch,azimuth
4,UBERON:0000948,Capillary Endothelial,AZ:0000005,CL:0002144,capillary endothelial cell,skos:exactMatch,azimuth
...,...,...,...,...,...,...,...
2601,UBERON:0002372,VCCF:0000498,squamous epithelial cell,CL:0000076,squamous epithelial cell,skos:exactMatch,vccf
2602,UBERON:0002372,VCCF:0000499,stromal cell,CL:0000499,stromal cell,skos:exactMatch,vccf
2603,UBERON:0002372,VCCF:0000500,t cell,CL:0000084,T cell,skos:exactMatch,vccf
2604,,VCCF:0000501,endothelial cell,CL:0000115,endothelial cell,skos:exactMatch,vccf


In [27]:
# get counts
print(f'Unique Annotation_Labels across all crosswalks: {df_crosswalks_combined['Annotation_Label'].nunique()}')

Unique Annotation_Labels across all crosswalks: 1615


In [28]:
print(f'Unique CL IDs across all crosswalks: {df_crosswalks_combined['CL_ID'].nunique()}')

Unique CL IDs across all crosswalks: 495


In [29]:
# types of matches
df_crosswalks_combined.groupby('CL_Match').size()

CL_Match
skos:exactMatch     1923
skos:narrowMatch     683
dtype: int64

In [None]:
print(f'Number of unique crosswalking operations: {len(df_crosswalks_combined)}')

Number unique crosswalking operations: 2606


In [60]:
print(f'Number of unique CTs: {df_crosswalks_combined['CL_Label'].nunique()}')

Number of unique CTs: 885


In [62]:
print(f'Number of unique organs: {df_crosswalks_combined['Organ_ID'].nunique()}')

Number of unique organs: 36


In [31]:
# not crosswalked
df_not_crosswalked = pd.read_csv(
    f'https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/refs/heads/{branch}/output-data/{hra_pop_version}/reports/atlas-ad-hoc/unmapped-cell-ids.csv')
df_not_crosswalked

Unnamed: 0,tool,organ,cell_label,dataset_count
0,azimuth,UBERON:0002048,Alveolar M�� CCL3+,2
1,azimuth,UBERON:0002048,Alveolar M�� proliferating,12
2,azimuth,UBERON:0002048,Interstitial M�� perivascular,55
3,azimuth,UBERON:0002048,Monocyte-derived M��,80
4,azimuth,UBERON:0002048,Non-classical monocytes,78
...,...,...,...,...
283,sc_proteomics,unknown,Lymphatic Endothelial (and some immune cells),1
284,sc_proteomics,unknown,MPO+,1
285,sc_proteomics,unknown,Neutrophils/Monocytes,4
286,sc_proteomics,unknown,P53,10


In [32]:
print(
    f'Number of cell labels from CTann tools and sc-proteomics data that were not crosswalked: {df_not_crosswalked['cell_label'].nunique()}')

Number of cell labels from CTann tools and sc-proteomics data that were not crosswalked: 154


In [33]:
# number of cell IDs aggregated tp higher levels
df_level_1_2 = pd.read_csv(
    f'https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/refs/heads/{branch}/output-data/{hra_pop_version}/reports/atlas-ad-hoc/cell-types-level-mapping.csv')
df_level_1_2

Unnamed: 0,cell_label,cell_id,level_1_cell_id,level_1_cell_label,level_2_cell_id,level_2_cell_label
0,cell,http://purl.obolibrary.org/obo/CL_0000000,http://purl.obolibrary.org/obo/CL_0000000,no mapped parent cell,http://purl.obolibrary.org/obo/CL_0000000,no mapped parent cell
1,hematopoietic stem cell,http://purl.obolibrary.org/obo/CL_0000037,http://purl.obolibrary.org/obo/CL_0000988,hematopoietic cell,http://purl.obolibrary.org/obo/CL_0000988,hematopoietic cell
2,fibroblast,http://purl.obolibrary.org/obo/CL_0000057,http://purl.obolibrary.org/obo/CL_0002320,connective tissue cell,http://purl.obolibrary.org/obo/CL_0000499,stromal cell
3,epithelial cell,http://purl.obolibrary.org/obo/CL_0000066,http://purl.obolibrary.org/obo/CL_0000066,epithelial cell,http://purl.obolibrary.org/obo/CL_0000066,epithelial cell
4,blood vessel endothelial cell,http://purl.obolibrary.org/obo/CL_0000071,http://purl.obolibrary.org/obo/CL_0000115,endothelial cell,http://purl.obolibrary.org/obo/CL_0000115,endothelial cell
...,...,...,...,...,...,...
196,lung migratory dendritic cell,http://purl.obolibrary.org/obo/CL_4033045,http://purl.obolibrary.org/obo/CL_0000988,hematopoietic cell,http://purl.obolibrary.org/obo/CL_0000451,dendritic cell
197,respiratory tract suprabasal cell,http://purl.obolibrary.org/obo/CL_4033048,http://purl.obolibrary.org/obo/CL_0000066,epithelial cell,http://purl.obolibrary.org/obo/CL_0000066,epithelial cell
198,cycling macrophage,http://purl.obolibrary.org/obo/CL_4033076,http://purl.obolibrary.org/obo/CL_0000988,hematopoietic cell,http://purl.obolibrary.org/obo/CL_0000235,macrophage
199,cycling alveolar macrophage,http://purl.obolibrary.org/obo/CL_4033077,http://purl.obolibrary.org/obo/CL_0000988,hematopoietic cell,http://purl.obolibrary.org/obo/CL_0000235,macrophage


In [34]:
print(
    f'Number of unique  cell IDs aggregated to higher levels: {df_level_1_2['cell_label'].nunique()}')
print(
    f'Number of unique cell IDs in level 1: {df_level_1_2['level_1_cell_label'].nunique()}')
print(
    f'Number of unique cell IDs in level 2: {df_level_1_2['level_2_cell_label'].nunique()}')

Number of unique  cell IDs aggregated to higher levels: 201
Number of unique cell IDs in level 1: 9
Number of unique cell IDs in level 2: 19


In [35]:
# CTs per organ per tool from crosswalks. Alt source: https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/105da2e49b1b10d6531fb0fc302bd4cbb3c197eb/output-data/v1.0/reports/hra/ct-per-organ-per-tool.csv


In [36]:
# from crosswalks
df_crosswalks_combined.groupby(['tool','CL_ID']).size()

tool     CL_ID     
azimuth  AZ:0000557    1
         AZ:0000560    1
         AZ:0000568    1
         AZ:0000569    1
         AZ:0000570    1
                      ..
vccf     CL:4033054    1
         CL:4033068    1
         CL:4033092    1
         CL:4047054    1
         CL:4052009    4
Length: 814, dtype: int64

In [37]:
df_crosswalks_combined

Unnamed: 0,Organ_ID,Annotation_Label,Annotation_Label_ID,CL_ID,CL_Label,CL_Match,tool
0,UBERON:0000948,Adipocyte,AZ:0000001,CL:0000136,adipocyte,skos:exactMatch,azimuth
1,UBERON:0000948,Arterial Endothelial,AZ:0000002,CL:1000413,endothelial cell of artery,skos:exactMatch,azimuth
2,UBERON:0000948,Atrial Cardiomyocyte,AZ:0000003,CL:0002129,regular atrial cardiac myocyte,skos:exactMatch,azimuth
3,UBERON:0000948,B,AZ:0000004,CL:0000236,B cell,skos:exactMatch,azimuth
4,UBERON:0000948,Capillary Endothelial,AZ:0000005,CL:0002144,capillary endothelial cell,skos:exactMatch,azimuth
...,...,...,...,...,...,...,...
2601,UBERON:0002372,VCCF:0000498,squamous epithelial cell,CL:0000076,squamous epithelial cell,skos:exactMatch,vccf
2602,UBERON:0002372,VCCF:0000499,stromal cell,CL:0000499,stromal cell,skos:exactMatch,vccf
2603,UBERON:0002372,VCCF:0000500,t cell,CL:0000084,T cell,skos:exactMatch,vccf
2604,,VCCF:0000501,endothelial cell,CL:0000115,endothelial cell,skos:exactMatch,vccf


In [38]:
tally = (
    df_crosswalks_combined.groupby(['Organ_ID', 'tool'])['CL_Label']
    .nunique()
    .unstack(fill_value=0)
    .reset_index()
)

def try_get_organ_label(id:str):
  """_summary_

  Args:
      id (str): _description_
  """
  
  # Taken from https://github.com/hubmapconsortium/hra-workflows-runner/blob/main/src%2Fgtex%2Fdownloader.js#L24-L39
  ORGAN_MAPPING = {
      "bladder": "UBERON:0001255",
      "blood": "UBERON:0000178",
      "bone_marrow": "UBERON:0002371",
      "eye": "UBERON:0000970",
      "heart": "UBERON:0000948",
      "large_intestine": "UBERON:0000059",
      "liver": "UBERON:0002107",
      "lung": "UBERON:0002048",
      # or mesenteric lymph node (UBERON:0002509)?
      "lymph_node": "UBERON:0000029",
      "mammary": "UBERON:0001911",
      "pancreas": "UBERON:0001264",
      "prostate": "UBERON:0002367",
      "skin": "UBERON:0002097",
      "small_intestine": "UBERON:0002108",
      "spleen": "UBERON:0002106",
      "thymus": "UBERON:0002370",
      "trachea": "UBERON:0003126",
      "uterus": "UBERON:0000995",
      "vasculature": "UBERON:0004537",
      "breast": "UBERON:0001911",
      "esophagus mucosa": "UBERON:0002469",
      "esophagus muscularis": "UBERON:0004648",
      "skeletal muscle": "UBERON:0001134",
  }
  
  try:
    value_to_find = id
    keys = [k for k, v in ORGAN_MAPPING.items() if v == value_to_find]
   # Output: ['a', 'c']
    return keys
  except:
    return ""
  

tally['Organ_Label'] = tally['Organ_ID'].apply(lambda id: try_get_organ_label(id))
tally

tool,Organ_ID,azimuth,celltypist,popv,vccf,Organ_Label
0,UBERON:0000006,0,12,0,0,[]
1,UBERON:0000029,0,30,22,36,[lymph_node]
2,UBERON:0000059,0,133,18,0,[large_intestine]
3,UBERON:0000079,0,0,13,0,[]
4,UBERON:0000160,0,43,0,34,[]
5,UBERON:0000167,0,0,0,37,[]
6,UBERON:0000178,77,27,20,0,[blood]
7,UBERON:0000948,25,91,5,0,[heart]
8,UBERON:0000970,0,0,30,0,[eye]
9,UBERON:0000995,0,0,13,0,[uterus]


In [39]:
def get_uberon_label(uberon_id: str) -> str:
    """Fetch the label for a given UBERON ID using the OLS API."""
    base_url = "https://www.ebi.ac.uk/ols/api/ontologies/uberon/terms"
    iri = f"http://purl.obolibrary.org/obo/{uberon_id.replace(':', '_')}"

    response = requests.get(base_url, params={"iri": iri})
    print("Request URL:", response.url)

    if response.ok:
        data = response.json()
        terms = data.get("_embedded", {}).get("terms", [])
        if terms:
            return terms[0].get("label", "Label not found")
        else:
            return "Label not found in response"
    else:
        return f"Error: {response.status_code}"


# Example
label = get_uberon_label("UBERON:0002450")
print("Label:", label)

Request URL: https://www.ebi.ac.uk/ols4/api/ontologies/uberon/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FUBERON_0002450
Label: decidua


# Random queries

In [40]:
# get ATLAS datasets with donors < 18
underage = sankey[(sankey['donor_age'] < 18) & (sankey['is_atlas_dataset'] == True)]
underage

Unnamed: 0,portal,study_paper,doi,lead_author,is_azimuth_reference,donor_id,donor_sex,donor_age,donor_development_stage,donor_race,...,unique_dataset_id,link_to_h5ad_file,sc_transcriptomics_or_sc_proteomics,cell_type_annotation_tool,omap_id,number_of_cells_total,number_of_unique_cell_types,hubmap_dataset_publication_status,is_rui_registered,is_atlas_dataset


In [41]:
sankey

Unnamed: 0,portal,study_paper,doi,lead_author,is_azimuth_reference,donor_id,donor_sex,donor_age,donor_development_stage,donor_race,...,unique_dataset_id,link_to_h5ad_file,sc_transcriptomics_or_sc_proteomics,cell_type_annotation_tool,omap_id,number_of_cells_total,number_of_unique_cell_types,hubmap_dataset_publication_status,is_rui_registered,is_atlas_dataset
0,HCA,,,,,TSP27,Female,56.0,,,...,hhttps://api.cellxgene.cziscience.com/dp/v1/co...,https://cellxgene.cziscience.com/e/a357414d-20...,,No Cell Summary,,,,,True,False
1,KPMP,,http://dx.doi.org/10.1681/ASN.2016091027,,,Donor1,Male,,,,...,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,http://dx.doi.org/10.1681/ASN.2016091027,,No Cell Summary,,,,,True,False
2,KPMP,,http://dx.doi.org/10.1681/ASN.2016091027,,,Donor2,Male,,,,...,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,http://dx.doi.org/10.1681/ASN.2016091027,,No Cell Summary,,,,,True,False
3,KPMP,,http://dx.doi.org/10.1681/ASN.2016091027,,,Donor3,Male,,,,...,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,http://dx.doi.org/10.1016/j.trsl.2017.07.006,,No Cell Summary,,,,,True,False
4,HRA,,,,,Donor1,Female,38.0,,,...,http://purl.org/ccf/1.5/omap-1#Donor1_TissueBl...,https://hubmapconsortium.github.io/ccf-release...,,No Cell Summary,,,,,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22189,SPARC,,https://sparc.science/datasets/390/version/1?d...,,,Donor1,Female,49.0,,,...,https://sparc.science/datasets/390/version/1?d...,https://sparc.science/datasets/390/version/1?d...,,No Cell Summary,,,,,True,False
22190,SPARC,,https://sparc.science/datasets/390/version/1?d...,,,Donor1,Female,49.0,,,...,https://sparc.science/datasets/390/version/1?d...,https://sparc.science/datasets/390/version/1?d...,,No Cell Summary,,,,,True,False
22191,SPARC,,https://sparc.science/datasets/390/version/1?d...,,,Donor1,Female,49.0,,,...,https://sparc.science/datasets/390/version/1?d...,https://sparc.science/datasets/390/version/1?d...,,No Cell Summary,,,,,True,False
22192,KPMP,,https://www.nature.com/articles/s41467-023-389...,,,Donor1,Male,,,,...,https://zenodo.org/records/7653239#Donor1_Tiss...,https://zenodo.org/records/7653239,,No Cell Summary,,,,,True,False


In [42]:
# dataset IDs for sc-proteomics
dois = sankey[sankey['cell_type_annotation_tool'] =='sc_proteomics']
dois['doi'].unique()

array(['https://doi.org/10.1016/j.cell.2022.12.028', nan,
       'https://doi.org/10.1038/s42003-023-04991-z',
       'https://doi.org/10.1038/s41586-023-05915-x'], dtype=object)

In [43]:
# numbers for 10th 2/3D Datasets: https://docs.google.com/spreadsheets/d/1xG4stdTZW37pmgX4kAMOnsqtjHv_tbiDbGET2Tcokuk/edit?gid=1213346061#gid=1213346061
print(f'Universe datasets for brain: ')

sankey[sankey['organ_name'].str.contains('brain', na=False)]

sankey[(sankey['organ_name'].str.contains('lymph', na=False)) & (sankey['is_atlas_dataset'] == False)]['unique_dataset_id'].nunique()

Universe datasets for brain: 


99

In [44]:
# 16,293 datasets -- all human & healthy, all ages. For how many can we do US1/2 prediction?
df_us1 = pd.read_csv(f'https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/refs/heads/{branch}/output-data/{hra_pop_version}/reports/atlas/application-a1.csv')

#US#2: most similar AS
df_us2_as = pd.read_csv(
    f'https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/refs/heads/{branch}/output-data/{hra_pop_version}/reports/atlas/application-a2p1.csv')

#US#2: most similar extraction site
df_us2_es = pd.read_csv(
    f'https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/refs/heads/{branch}/output-data/{hra_pop_version}/reports/atlas/application-a2p3.csv')

In [45]:
df_us1

Unnamed: 0,sample,rui_location,organ,organId,sex,tool,modality,cell_id,cell_label,cell_count,percentage_of_total
0,http://purl.org/ccf/1.5/omap-6#Donor1_TissueBl...,http://purl.org/ccf/1.5/168abaaf-f115-4606-a65...,pancreas,UBERON:0001264,Male,azimuth,sc_transcriptomics,http://purl.obolibrary.org/obo/CL_0000738,immune,39.193500,0.375514
1,http://purl.org/ccf/1.5/omap-6#Donor1_TissueBl...,http://purl.org/ccf/1.5/168abaaf-f115-4606-a65...,pancreas,UBERON:0001264,Male,celltypist,sc_transcriptomics,http://purl.obolibrary.org/obo/CL_0000173,delta,35.688270,0.341930
2,http://purl.org/ccf/1.5/omap-6#Donor1_TissueBl...,http://purl.org/ccf/1.5/168abaaf-f115-4606-a65...,pancreas,UBERON:0001264,Male,popv,sc_transcriptomics,http://purl.obolibrary.org/obo/CL_0000115,endothelial cell,30.008910,0.287516
3,http://purl.org/ccf/1.5/omap-6#Donor1_TissueBl...,http://purl.org/ccf/1.5/168abaaf-f115-4606-a65...,pancreas,UBERON:0001264,Male,popv,sc_transcriptomics,http://purl.obolibrary.org/obo/CL_0000763,myeloid cell,29.609580,0.283690
4,http://purl.org/ccf/1.5/omap-6#Donor1_TissueBl...,http://purl.org/ccf/1.5/168abaaf-f115-4606-a65...,pancreas,UBERON:0001264,Male,celltypist,sc_transcriptomics,http://purl.obolibrary.org/obo/CL_0000171,alpha_immature,29.136300,0.279155
...,...,...,...,...,...,...,...,...,...,...,...
60112,https://zenodo.org/records/7653239#Donor2_Tiss...,http://purl.org/ccf/1.5/b392110d-05a0-477e-840...,left kidney,UBERON:0004538,Female,azimuth,sc_transcriptomics,http://purl.obolibrary.org/obo/CL_4030020,Connecting Tubule Intercalated Type A,0.186666,0.000077
60113,https://zenodo.org/records/7653239#Donor2_Tiss...,http://purl.org/ccf/1.5/b392110d-05a0-477e-840...,left kidney,UBERON:0004538,Female,azimuth,sc_transcriptomics,http://purl.obolibrary.org/obo/CL_1000412,Afferent / Efferent Arteriole Endothelial,0.109182,0.000045
60114,https://zenodo.org/records/7653239#Donor2_Tiss...,http://purl.org/ccf/1.5/b392110d-05a0-477e-840...,left kidney,UBERON:0004538,Female,azimuth,sc_transcriptomics,http://purl.obolibrary.org/obo/CL_0001058,Plasmacytoid Dendritic,0.109182,0.000045
60115,https://zenodo.org/records/7653239#Donor2_Tiss...,http://purl.org/ccf/1.5/b392110d-05a0-477e-840...,left kidney,UBERON:0004538,Female,azimuth,sc_transcriptomics,http://purl.obolibrary.org/obo/CL_0002201,Intercalated Type B,0.093333,0.000039


In [46]:
df_us2_as

Unnamed: 0,dataset,reported_organ,sex,tool,modality,as,as_tool,similarity
0,https://api.cellxgene.cziscience.com/dp/v1/col...,http://purl.obolibrary.org/obo/UBERON_0002048,Male,popv,sc_transcriptomics,http://purl.obolibrary.org/obo/UBERON_0002084,azimuth,0.344000
1,https://api.cellxgene.cziscience.com/dp/v1/col...,http://purl.obolibrary.org/obo/UBERON_0002048,Male,popv,sc_transcriptomics,http://purl.obolibrary.org/obo/UBERON_0002097,celltypist,0.382837
2,https://api.cellxgene.cziscience.com/dp/v1/col...,http://purl.obolibrary.org/obo/UBERON_0002048,Male,celltypist,sc_transcriptomics,http://purl.obolibrary.org/obo/UBERON_0005457,popv,0.107954
3,https://api.cellxgene.cziscience.com/dp/v1/col...,http://purl.obolibrary.org/obo/UBERON_0002048,Male,popv,sc_transcriptomics,http://purl.obolibrary.org/obo/UBERON_0005457,popv,0.472117
4,https://api.cellxgene.cziscience.com/dp/v1/col...,http://purl.obolibrary.org/obo/UBERON_0002048,Male,popv,sc_transcriptomics,http://purl.obolibrary.org/obo/UBERON_0005469,popv,0.472117
...,...,...,...,...,...,...,...,...
103968,https://api.cellxgene.cziscience.com/dp/v1/col...,http://purl.obolibrary.org/obo/UBERON_0002048,Female,popv,sc_transcriptomics,http://purl.org/sig/ont/fma/fma7508,azimuth,0.162705
103969,https://api.cellxgene.cziscience.com/dp/v1/col...,http://purl.obolibrary.org/obo/UBERON_0002048,Female,popv,sc_transcriptomics,http://purl.org/sig/ont/fma/fma7402,azimuth,0.529518
103970,https://api.cellxgene.cziscience.com/dp/v1/col...,http://purl.obolibrary.org/obo/UBERON_0002048,Female,azimuth,sc_transcriptomics,http://purl.org/sig/ont/fma/fma7402,azimuth,0.630345
103971,https://api.cellxgene.cziscience.com/dp/v1/col...,http://purl.obolibrary.org/obo/UBERON_0002048,Female,azimuth,sc_transcriptomics,http://purl.org/sig/ont/fma/fma7402,popv,0.475920


In [47]:
df_us2_es

Unnamed: 0,dataset,reported_organ,sex,tool,modality,corridor,corridor_tool,similarity
0,https://api.cellxgene.cziscience.com/dp/v1/col...,http://purl.obolibrary.org/obo/UBERON_0002048,Female,azimuth,sc_transcriptomics,http://purl.org/ccf/1.5/50709361-c670-4ccf-932...,azimuth,0.665025
1,https://api.cellxgene.cziscience.com/dp/v1/col...,http://purl.obolibrary.org/obo/UBERON_0002048,Female,popv,sc_transcriptomics,http://purl.org/ccf/1.5/50709361-c670-4ccf-932...,popv,0.587656
2,https://api.cellxgene.cziscience.com/dp/v1/col...,http://purl.obolibrary.org/obo/UBERON_0002048,Female,azimuth,sc_transcriptomics,http://purl.org/ccf/1.5/6acd66b8-2659-4626-bef...,azimuth,0.484705
3,https://api.cellxgene.cziscience.com/dp/v1/col...,http://purl.obolibrary.org/obo/UBERON_0002048,Female,celltypist,sc_transcriptomics,http://purl.org/ccf/1.5/50709361-c670-4ccf-932...,celltypist,0.480317
4,https://api.cellxgene.cziscience.com/dp/v1/col...,http://purl.obolibrary.org/obo/UBERON_0002048,Female,popv,sc_transcriptomics,http://purl.org/ccf/1.5/2cfde066-3472-4cf0-b1c...,popv,0.462221
...,...,...,...,...,...,...,...,...
171789,https://entity.api.sennetconsortium.org/entiti...,http://purl.obolibrary.org/obo/UBERON_0001013,Female,popv,sc_transcriptomics,http://purl.org/ccf/1.5/04baf323-eda0-4f72-bea...,celltypist,0.141409
171790,https://entity.api.sennetconsortium.org/entiti...,http://purl.obolibrary.org/obo/UBERON_0001013,Female,popv,sc_transcriptomics,http://purl.org/ccf/1.5/2156f837-2ab2-4305-8e7...,celltypist,0.141409
171791,https://entity.api.sennetconsortium.org/entiti...,http://purl.obolibrary.org/obo/UBERON_0001013,Female,popv,sc_transcriptomics,http://purl.org/ccf/1.5/2816c343-c908-45bf-896...,celltypist,0.141409
171792,https://entity.api.sennetconsortium.org/entiti...,http://purl.obolibrary.org/obo/UBERON_0001013,Female,popv,sc_transcriptomics,http://purl.org/ccf/1.5/e13cd910-c2df-4b83-ad6...,celltypist,0.141409


In [48]:
print(f'Unique extraction sites for which we predict US#1: {df_us1['rui_location'].nunique()}.')

Unique extraction sites for which we predict US#1: 617.


In [49]:
print(f'Unique datasets for US#2 (most similar AS): {df_us2_as['dataset'].nunique()}.')

Unique datasets for US#2 (most similar AS): 5453.


In [50]:
print(
    f'Unique datasets for US#2 (most similar extraction site/corridor): {df_us2_es['dataset'].nunique()}.')

Unique datasets for US#2 (most similar extraction site/corridor): 5450.


In [51]:
# How many are adult?
sankey[sankey['donor_age'] >= 18]['unique_dataset_id'].nunique()

5944

## Extraction sites

In [52]:
url = f'https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/refs/heads/{branch}/output-data/{hra_pop_version}/reports/universe-ad-hoc/extraction-sites.csv'

universe_extraction_sites = pd.read_csv(url)
universe_extraction_sites

Unnamed: 0,extraction_site
0,http://purl.org/ccf/1.5/e21afc3c-3a7c-4717-90d...
1,http://purl.org/ccf/1.5/e240c0a4-5e53-4464-832...
2,http://purl.org/ccf/1.5/e43280c9-840b-481f-a8a...
3,http://purl.org/ccf/1.5/e4853294-071d-4bef-bff...
4,http://purl.org/ccf/1.5/e4a44b76-53fd-4c88-9cc...
...,...
1127,http://purl.org/ccf/1.5/bc516774-fcd9-4022-bc5...
1128,http://purl.org/ccf/1.5/c7265539-ebc3-4a1e-893...
1129,http://purl.org/ccf/1.5/c7ed4142-ddad-4cf5-b83...
1130,http://purl.org/ccf/1.5/f2a0eb6b-7729-40ac-baa...


In [53]:
def get_etraction_site_and_mesh_collisions(iri:str):
  """Takes an IRI, gets the extraction site and mesh collisions

  Args:
      iri (str): IRI for the extraction site
  """
  # initialize result
  result = (set(), set())
  
  # loop through extraction site IDs and get extraction site data
  api_extraction_site_base = 'https://apps.humanatlas.io/api/v1/extraction-site?iri='
  api_collisions_base = 'https://apps.humanatlas.io/api/v1/collisions'
  
  try:
    response = requests.get(api_extraction_site_base+iri)
    if response.ok:
        print(f'Successfully got extraction site data for {iri}!')
        extraction_site = response.json()
        try:
          headers = {
              "accept": "application/json",
              "content-type": "application/json"
          }
          data = extraction_site
          response = requests.post(api_collisions_base, headers=headers, json=data)
          if response.ok:
              print(f'Successfully got mesh collisions for {extraction_site['@id']}!')
              mesh = response.json()
              mesh_iris = [collision['representation_of'] for collision in mesh]
              result[0].update(mesh_iris)
              
              organ_iris = [collision['organ'] for collision in mesh]
              result[1].update(organ_iris)
              
          else:
              print(f"Request failed with status code {response.status_code}")
        except requests.exceptions.RequestException as e:
          print(f"An error occurred: {e}")
    else: 
      print(f"Request failed with status code {response.status_code}")
  except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")
    
  print(f"Returning {result}")
  print()
  return result

In [54]:
unique_iris = {
    'anatomical_structures': set(),
    'organs': set()
}

# Apply the function to each row
results = universe_extraction_sites['extraction_site'].apply(
    lambda iri: get_etraction_site_and_mesh_collisions(iri)
)

# Unpack the tuple of sets and update each unique set
for result in results:
    print(f'now working with {result}')
    if isinstance(result, tuple) and len(result) == 2:
        as_set, organ_set = result
        unique_iris['anatomical_structures'].update(as_set)
        unique_iris['organs'].update(organ_set)

# Print results
pprint(unique_iris)

Successfully got extraction site data for http://purl.org/ccf/1.5/e21afc3c-3a7c-4717-90d5-e5739fd8432c!
Successfully got mesh collisions for http://purl.org/ccf/1.5/e21afc3c-3a7c-4717-90d5-e5739fd8432c!
Returning ({'http://purl.org/sig/ont/fma/fma15828'}, {'spleen-male'})

Successfully got extraction site data for http://purl.org/ccf/1.5/e240c0a4-5e53-4464-8320-ad775f2c4bf7!
Successfully got mesh collisions for http://purl.org/ccf/1.5/e240c0a4-5e53-4464-8320-ad775f2c4bf7!
Returning ({'http://purl.obolibrary.org/obo/UBERON_0002115'}, {'small-intestine-female'})

Successfully got extraction site data for http://purl.org/ccf/1.5/e43280c9-840b-481f-a8a6-90114c2c5cc9!
Successfully got mesh collisions for http://purl.org/ccf/1.5/e43280c9-840b-481f-a8a6-90114c2c5cc9!
Returning (set(), set())

Successfully got extraction site data for http://purl.org/ccf/1.5/e4853294-071d-4bef-bff3-2fd52a5c6af0!
Successfully got mesh collisions for http://purl.org/ccf/1.5/e4853294-071d-4bef-bff3-2fd52a5c6af0!


KeyboardInterrupt: 

In [None]:
print(
    f'Number of AS covered across extraction sites in Universe: {len(unique_iris['anatomical_structures'])}')

print(
    f'Number of organs covered across extraction sites in Universe: {len(unique_iris['organs'])}')

Number of AS covered across extraction sites in Universe: 164
Number of organs covered across extraction sites in Universe: 49
