## Build tables for HRApop paper

## Global settings and variables

In [36]:
# HRApop release tag (not referenced by the cells visible in this part of the notebook).
HRA_POP_VERSION = 'v0.11.1'
# Name of the results folder — presumably where exported tables go; confirm against the export cells.
OUTPUT = 'output'

## Install and import libraries 

In [37]:
%pip install requests pandas

import io
from pathlib import Path

import pandas as pd
import requests

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Table SdataTableForAs

In [38]:
# Request the table as CSV from the grlc.io endpoint.
# A timeout keeps the cell from hanging forever if the service stalls; the value is
# deliberately generous because the endpoint's response time is not known here.
as_response = requests.get(
    'https://grlc.io/api-git/hubmapconsortium/ccf-grlc/subdir/hra-pop/cell_types_in_anatomical_structurescts_per_as',
    headers={'Accept': 'text/csv'},
    timeout=300,
)

# Stop here on an HTTP error instead of handing an error page to the CSV parser.
as_response.raise_for_status()
as_data = as_response.text

# Parse the CSV text into a DataFrame
df_as_data_grlc = pd.read_csv(io.StringIO(as_data))

# Display the DataFrame
df_as_data_grlc

Unnamed: 0,organ,as,as_label,sex,tool,modality,cell_id,cell_label,cell_count,cell_percentage
0,Spatial entity of male colon,http://purl.obolibrary.org/obo/UBERON_0001153,caecum,Female,celltypist,sc_transcriptomics,https://purl.org/ccf/ASCTB-TEMP_smc-plpp2-,SMC (PLPP2+),92.848,0.114140
1,Spatial entity of female colon,http://purl.obolibrary.org/obo/UBERON_0001153,caecum,Female,celltypist,sc_transcriptomics,https://purl.org/ccf/ASCTB-TEMP_smc-plpp2-,SMC (PLPP2+),92.848,0.114140
2,Spatial entity of male colon,http://purl.obolibrary.org/obo/UBERON_0001153,caecum,Female,celltypist,sc_transcriptomics,https://purl.org/ccf/ASCTB-TEMP_cycling-plasma...,Cycling plasma cell,63.448,0.077998
3,Spatial entity of female colon,http://purl.obolibrary.org/obo/UBERON_0001153,caecum,Female,celltypist,sc_transcriptomics,https://purl.org/ccf/ASCTB-TEMP_cycling-plasma...,Cycling plasma cell,63.448,0.077998
4,Spatial entity of male colon,http://purl.obolibrary.org/obo/UBERON_0001153,caecum,Female,celltypist,sc_transcriptomics,https://purl.org/ccf/ASCTB-TEMP_mesoderm-1-hand1-,Mesoderm 1 (HAND1+),57.400,0.070563
...,...,...,...,...,...,...,...,...,...,...
8015,Spatial entity of male male reproductive system,http://purl.obolibrary.org/obo/UBERON_8410027,central zone of prostate,Male,popv,sc_transcriptomics,http://purl.obolibrary.org/obo/CL_0000625,"CD8-positive, alpha-beta T cell",52.962,0.018737
8016,Spatial entity of male male reproductive system,http://purl.obolibrary.org/obo/UBERON_8410027,central zone of prostate,Male,popv,sc_transcriptomics,http://purl.obolibrary.org/obo/CL_0000576,monocyte,52.052,0.018415
8017,Spatial entity of male male reproductive system,http://purl.obolibrary.org/obo/UBERON_8410027,central zone of prostate,Male,popv,sc_transcriptomics,http://purl.obolibrary.org/obo/CL_0000235,macrophage,34.762,0.012298
8018,Spatial entity of male male reproductive system,http://purl.obolibrary.org/obo/UBERON_8410027,central zone of prostate,Male,popv,sc_transcriptomics,http://purl.obolibrary.org/obo/CL_0000066,epithelial cell,11.830,0.004185


In [39]:
# Collapse the per-cell-type rows into one summary row per
# (organ, as, as_label, sex, tool, modality) combination.
group_keys = ['organ', 'as', 'as_label', 'sex', 'tool', 'modality']

aggregated_df = (
    df_as_data_grlc
    .groupby(group_keys)
    .agg(
        unique_cell_id_count=('cell_id', 'nunique'),          # number of distinct cell_ids
        sum_of_cell_counts=('cell_count', 'sum'),             # total of cell_count
        mean_cell_percentage=('cell_percentage', 'mean'),     # average of cell_percentage
    )
    .reset_index()
    # 'as' and 'sex' take part in the grouping but are left out of the resulting table,
    # so several rows can share the same remaining key columns.
    .drop(columns=['as', 'sex'])
)

print("Aggregated DataFrame:")
aggregated_df

Aggregated DataFrame:


Unnamed: 0,organ,as_label,tool,modality,unique_cell_id_count,sum_of_cell_counts,mean_cell_percentage
0,Spatial entity of female colon,caecum,celltypist,sc_transcriptomics,131,813.456,0.007634
1,Spatial entity of female colon,caecum,popv,sc_transcriptomics,16,652.064,0.062500
2,Spatial entity of female colon,caecum,sc_proteomics,sc_proteomics,25,2110.864,0.040000
3,Spatial entity of female colon,caecum,sc_proteomics,sc_proteomics,21,1722349.035,0.047619
4,Spatial entity of female colon,ascending colon,celltypist,sc_transcriptomics,131,3704.130,0.007634
...,...,...,...,...,...,...,...
195,Spatial entity of male spleen,hilum of spleen,popv,sc_transcriptomics,19,72.824,0.052632
196,Spatial entity of male spleen,hilum of spleen,sc_proteomics,sc_proteomics,12,761.660,0.083333
197,Spatial entity of male urinary bladder,trigone of urinary bladder,popv,sc_transcriptomics,14,32985.979,0.071429
198,Spatial entity of male urinary bladder,fundus of urinary bladder,popv,sc_transcriptomics,14,13432.645,0.071429


In [40]:
# Export to CSV
# Build the destination from the OUTPUT setting instead of repeating the folder name,
# and create the folder first: to_csv does not create missing directories.
output_dir = Path(OUTPUT)
output_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): this writes the full per-cell-type table (df_as_data_grlc), not
# aggregated_df — confirm which one the paper table is meant to contain.
df_as_data_grlc.to_csv(output_dir / 'table_SdataTableForAs.csv', index=False)