## A notebook to retrieve mean biomarker expression values from HRApop for a list of genes

In [13]:
# Install and import libraries
%pip install pandas pyld requests

import pandas as pd
import requests
from pyld import jsonld
from pprint import pprint


Note: you may need to restart the kernel to use updated packages.


In [14]:
# load data
data = pd.read_csv("data/Priority.Genes.HRA.20241010.csv")
data

Unnamed: 0,Priority,Ensembl_ID,chromosome_name,start_position,end_position,external_gene_name,gene_biotype
0,high,ENSG00000021826,2,211342406,211543831,CPS1,protein_coding
1,high,ENSG00000003989,8,17354597,17428082,SLC7A2,protein_coding
2,high,ENSG00000154122,5,14704910,14871887,ANKH,protein_coding
3,high,ENSG00000141485,17,6588032,6616886,SLC13A5,protein_coding
4,high,ENSG00000112499,6,160592093,160698670,SLC22A2,protein_coding
...,...,...,...,...,...,...,...
114,medium,ENSG00000198650,16,71599563,71611033,TAT,protein_coding
115,medium,ENSG00000157045,16,15131710,15149921,NTAN1,protein_coding
116,medium,ENSG00000175564,11,73711326,73720480,UCP3,protein_coding
117,medium,ENSG00000138823,4,100484918,100545156,MTTP,protein_coding


In [15]:
# extract list of unique Ensembl IDs
ensembl_unique = list(data['Ensembl_ID'].unique())
ensembl_unique

['ENSG00000021826',
 'ENSG00000003989',
 'ENSG00000154122',
 'ENSG00000141485',
 'ENSG00000112499',
 'ENSG00000198569',
 'ENSG00000142494',
 'ENSG00000021488',
 'ENSG00000139209',
 'ENSG00000164756',
 'ENSG00000130173',
 'ENSG00000165795',
 'ENSG00000196660',
 'ENSG00000123612',
 'ENSG00000149577',
 'ENSG00000135740',
 'ENSG00000127948',
 'ENSG00000141505',
 'ENSG00000197375',
 'ENSG00000152270',
 'ENSG00000137204',
 'ENSG00000112394',
 'ENSG00000181856',
 'ENSG00000132170',
 'ENSG00000171105',
 'ENSG00000109511',
 'ENSG00000134240',
 'ENSG00000090857',
 'ENSG00000005882',
 'ENSG00000158571',
 'ENSG00000163631',
 'ENSG00000197249',
 'ENSG00000154262',
 'ENSG00000138678',
 'ENSG00000100889',
 'ENSG00000081479',
 'ENSG00000135423',
 'ENSG00000106633',
 'ENSG00000117054',
 'ENSG00000196616',
 'ENSG00000213398',
 'ENSG00000084110',
 'ENSG00000138075',
 'ENSG00000130164',
 'ENSG00000166035',
 'ENSG00000175445',
 'ENSG00000087237',
 'ENSG00000165029',
 'ENSG00000134571',
 'ENSG00000073060',


In [16]:
# load atlas enriched dataset graph
url = 'https://cdn.humanatlas.io/digital-objects/graph/hra-pop/latest/assets/atlas-enriched-dataset-graph.jsonld'


# Fetch JSON-LD data from HRA LOD server
response = requests.get(url)


# Check if the request was successful
if response.status_code == 200:
    json_ld_data = response.json()  # Load the JSON-LD data
else:
    raise Exception(f"Failed to retrieve data. Status code: {
                    response.status_code}")

# print first donor
pprint(json_ld_data['@graph'][0])

{'@id': 'https://entity.api.hubmapconsortium.org/entities/1628b6f7eb615862322d6274a6bc9fa0',
 '@type': 'Donor',
 'age': '67',
 'bmi': '30.2',
 'consortium_name': 'HuBMAP',
 'description': 'Entered 12/27/2019, Yiing Lin, TMC-Stanford',
 'label': 'Female, Age 67, BMI 30.2',
 'link': 'https://portal.hubmapconsortium.org/browse/donor/1628b6f7eb615862322d6274a6bc9fa0',
 'provider_name': 'TMC-Stanford',
 'provider_uuid': 'def5fd76-ed43-11e8-b56a-0e8017bdda58',
 'race': 'White',
 'samples': [{'@id': 'https://entity.api.hubmapconsortium.org/entities/0b43d8d0dbbc5e3923a8b963650ab8e3',
              '@type': 'Sample',
              'datasets': [],
              'description': '3 x 30 x 15 millimeter, 15 millimeter, 1 '
                             'Sections',
              'label': 'Registered 2/16/2022, amir Bahmani, TMC-Stanford',
              'link': 'https://portal.hubmapconsortium.org/browse/sample/0b43d8d0dbbc5e3923a8b963650ab8e3',
              'rui_location': {'@context': 'https://hubma

In [17]:
# initialize dict for result
result = {
  'sex': [],
  'organ_id': [],
  'dataset_id': [],
  'ensembl_id' : [],
  'mean_gene_expr_value': [],
  'cell_label': [],
  'annotation_method': []
}

# iterate over ds-graph for hra-pop, get ensembl IDs and match against ensembl_unique
for donor in json_ld_data['@graph']:
  for sample in donor['samples']:
    for data in sample['datasets']:
      for cell_summary in data['summaries']:
        if cell_summary['@type'] == 'CellSummary':
          for summary in cell_summary['summary']:
            if 'gene_expr' in summary.keys():
              for gene in summary['gene_expr']:
                if 'ensembl_id' in gene:
                  if gene['ensembl_id'] in ensembl_unique:
                    result['sex'].append(cell_summary['sex'])
                    result['organ_id'].append(data['organ_id'])
                    result['dataset_id'].append(data['@id'])
                    result['ensembl_id'].append(gene['ensembl_id'])
                    result['mean_gene_expr_value'].append(
                        gene['mean_gene_expr_value'])
                    result['cell_label'] = summary['cell_label']
                    result['annotation_method'].append(
                        cell_summary['annotation_method'])
                    
# convert to data frame 
df = pd.DataFrame(result).drop_duplicates() 

# export to CSV
df.to_csv("output/ensembl_ids_hra_pop.csv", index=False)

# print df
df

Unnamed: 0,sex,organ_id,dataset_id,ensembl_id,mean_gene_expr_value,cell_label,annotation_method
0,Female,http://purl.obolibrary.org/obo/UBERON_0000948,https://api.cellxgene.cziscience.com/dp/v1/col...,ENSG00000134571,1.999749,Monocyte-derived Mφ,azimuth
1,Female,http://purl.obolibrary.org/obo/UBERON_0000948,https://api.cellxgene.cziscience.com/dp/v1/col...,ENSG00000154262,1.644089,Monocyte-derived Mφ,azimuth
2,Female,http://purl.obolibrary.org/obo/UBERON_0000948,https://api.cellxgene.cziscience.com/dp/v1/col...,ENSG00000100979,1.261290,Monocyte-derived Mφ,azimuth
3,Female,http://purl.obolibrary.org/obo/UBERON_0000948,https://api.cellxgene.cziscience.com/dp/v1/col...,ENSG00000062282,1.121837,Monocyte-derived Mφ,azimuth
4,Female,http://purl.obolibrary.org/obo/UBERON_0000948,https://api.cellxgene.cziscience.com/dp/v1/col...,ENSG00000145703,1.038063,Monocyte-derived Mφ,azimuth
...,...,...,...,...,...,...,...
572,Male,http://purl.obolibrary.org/obo/UBERON_0002048,https://api.cellxgene.cziscience.com/dp/v1/col...,ENSG00000152270,2.104608,Monocyte-derived Mφ,celltypist
573,Male,http://purl.obolibrary.org/obo/UBERON_0002048,https://api.cellxgene.cziscience.com/dp/v1/col...,ENSG00000145703,0.996712,Monocyte-derived Mφ,celltypist
574,Male,http://purl.obolibrary.org/obo/UBERON_0002048,https://api.cellxgene.cziscience.com/dp/v1/col...,ENSG00000165029,2.914111,Monocyte-derived Mφ,celltypist
575,Male,http://purl.obolibrary.org/obo/UBERON_0002048,https://api.cellxgene.cziscience.com/dp/v1/col...,ENSG00000196616,3.298286,Monocyte-derived Mφ,azimuth
