In [2]:
import pandas as pd
import biorosetta as br
import hpotk
import typing
from hpotk import TermId
import os
import warnings
warnings.filterwarnings('ignore')

This notebook collects the sources to integrate HPO and HRA as a knowledge graph (KG). The KG will be agregated in the form of nodes and edges. In thecase of HPO we have the following nodes:

- Genes
- Phenotypes
- Anatomical structures (AS)

The nodes tables will have the following format: iri | label | type | source

The edge table explains the relationships between theser nodes. In HPO, The only direct relationship between the above mentioned nodes is between genes and phenotypes, we will just focus on this and will work on the other relatiosnhips later. 

The edges table will have the following format: subject | predicate | object | source

# Genes Table

##### Load  genes_to_phenotype.txt file form HPO: https://hpo.jax.org/data/ontology

version:v2025-08-11

In [2]:
hpo_genes_to_phenotype = pd.read_csv("./data/genes_to_phenotype_v2025-08-11.txt", sep="\t")
hpo_genes_to_phenotype.head()
hpo_genes_to_phenotype['ncbi_gene_id'] = hpo_genes_to_phenotype['ncbi_gene_id'].astype(str)
hpo_genes_to_phenotype.head()

Unnamed: 0,ncbi_gene_id,gene_symbol,hpo_id,hpo_name,frequency,disease_id
0,10,NAT2,HP:0000007,Autosomal recessive inheritance,-,OMIM:243400
1,10,NAT2,HP:0001939,Abnormality of metabolism/homeostasis,-,OMIM:243400
2,16,AARS1,HP:0002460,Distal muscle weakness,15/15,OMIM:613287
3,16,AARS1,HP:0002451,Limb dystonia,3/3,OMIM:616339
4,16,AARS1,HP:0008619,Bilateral sensorineural hearing impairment,HP:0040283,ORPHA:33364


### Subset HPO to those related to kidney:

In [3]:
store = hpotk.configure_ontology_store()
hpo = store.load_hpo(release='v2025-05-06')

def get_all_decendant_hpo_term_dict(
    hpo: hpotk.MinimalOntology, parent_TermID: str = "HP:0003674"
) -> typing.Set[TermId]:
    """
    Retrieve all descendant onset TermIds from a specified parent term ID in an ontology.

    :param parent_TermID: The parent term ID to find descendants for, defaults to 'HP:0003674'.
    :param hpo: The ontology instance containing term relationships.
    :return: A set of onset TermIds derived from the given parent term ID.
    """
    descendant_term_id_dict = dict()
    for term in hpo.graph.get_descendants(parent_TermID):
        descendant_term_id_dict[term.value] = hpo.get_term_name(term)
    return descendant_term_id_dict

#### 'HP:0000077' corresponds to 'Abnormality of the kidney'
##### We assume that all the terms that are the decendants of 'Abnormality of the kidney' are kidney related HPO terms
#### We subset the HPO term - phenotype relationships table to those terms related to kidney

In [4]:

kidney_related_hpo_dict = get_all_decendant_hpo_term_dict(hpo=hpo, parent_TermID='HP:0000077')


In [5]:
hpo_genes_to_phenotype_kidney = hpo_genes_to_phenotype[hpo_genes_to_phenotype['hpo_id'].isin(kidney_related_hpo_dict.keys())]

In [6]:
hpo_genes_to_phenotype_kidney

Unnamed: 0,ncbi_gene_id,gene_symbol,hpo_id,hpo_name,frequency,disease_id
1444,54,ACP5,HP:0005576,Tubulointerstitial fibrosis,-,OMIM:607944
1446,54,ACP5,HP:0012622,Chronic kidney disease,HP:0040282,ORPHA:1855
1482,54,ACP5,HP:0000790,Hematuria,HP:0040283,ORPHA:1855
2079,60,ACTB,HP:0000126,Hydronephrosis,HP:0040282,ORPHA:2995
2423,71,ACTG1,HP:0000126,Hydronephrosis,HP:0040282,ORPHA:2995
...,...,...,...,...,...,...
391610,105259599,-,HP:0002667,Nephroblastoma,-,OMIM:194071
391618,105259599,-,HP:0000121,Nephrocalcinosis,-,OMIM:130650
391620,105259599,-,HP:0000105,Enlarged kidney,-,OMIM:130650
391638,105259599,-,HP:0000803,Renal cortical cysts,-,OMIM:130650


### Map the NCBI IDs and the gene symbols to HGNC IDs

In [7]:
# get the idmapper from biorosetta
idmap = br.IDMapper([br.EnsemblBiomartMapper(),br.HGNCBiomartMapper(),br.MyGeneMapper()])

- Loading lookup tables from cache (use function EnsemblBiomartMapper.download_data() to force new download)
- Loading lookup tables from cache (use function HGNCBiomartMapper.download_data() to force new download)


In [8]:
NCBI_geneSymbol_HGNC_mapping = hpo_genes_to_phenotype[['ncbi_gene_id', 'gene_symbol']]
NCBI_geneSymbol_HGNC_mapping.drop_duplicates(inplace=True)
NCBI_geneSymbol_HGNC_mapping.loc[:, 'NCBI_to_HGNC'] = list(idmap.convert(NCBI_geneSymbol_HGNC_mapping['ncbi_gene_id'],'entr','hgnc'))
NCBI_geneSymbol_HGNC_mapping.loc[:, 'geneSymbol_to_HGNC'] = list(idmap.convert(NCBI_geneSymbol_HGNC_mapping['gene_symbol'],'symb','hgnc'))
NCBI_geneSymbol_HGNC_mapping

One or more sources do not support the requested input/output ID type mapping: mygene
Mapping will be executed using the following source(s): ensembl,hgnc
One or more sources do not support the requested input/output ID type mapping: mygene
Mapping will be executed using the following source(s): ensembl,hgnc


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NCBI_geneSymbol_HGNC_mapping.drop_duplicates(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NCBI_geneSymbol_HGNC_mapping.loc[:, 'NCBI_to_HGNC'] = list(idmap.convert(NCBI_geneSymbol_HGNC_mapping['ncbi_gene_id'],'entr','hgnc'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NCBI_geneSymbol_HGNC_mapping.loc[:, 'geneSymbol_to_HGNC'] = list(idma

Unnamed: 0,ncbi_gene_id,gene_symbol,NCBI_to_HGNC,geneSymbol_to_HGNC
0,10,NAT2,HGNC:7646,HGNC:7646
2,16,AARS1,HGNC:20,HGNC:20
222,18,ABAT,HGNC:23,HGNC:23
262,19,ABCA1,HGNC:29,HGNC:29
323,20,ABCA2,HGNC:32,HGNC:32
...,...,...,...,...
391666,105371045,PERCC1,HGNC:52293,HGNC:52293
391695,105804841,-,,
391712,109580095,-,,
391714,111365204,-,,


In [9]:
NCBI_geneSymbol_HGNC_mapping.loc[NCBI_geneSymbol_HGNC_mapping['gene_symbol']=='-',:].shape[0]

6

In [10]:
NCBI_geneSymbol_HGNC_mapping.loc[NCBI_geneSymbol_HGNC_mapping['geneSymbol_to_HGNC']=='N/A',:].shape[0]

19

In [11]:
NCBI_geneSymbol_HGNC_mapping.loc[
    (NCBI_geneSymbol_HGNC_mapping['geneSymbol_to_HGNC'] != 'N/A') & 
    (NCBI_geneSymbol_HGNC_mapping['NCBI_to_HGNC'] == 'N/A')
]

Unnamed: 0,ncbi_gene_id,gene_symbol,NCBI_to_HGNC,geneSymbol_to_HGNC
21039,831,CAST,,HGNC:1515
89461,3500,IGHG1,,HGNC:5525
89505,3501,IGHG2,,HGNC:5526
89552,3507,IGHM,,HGNC:5541
89651,3514,IGKC,,HGNC:5716
115095,4511,MT-TC,,HGNC:7477
117100,4549,MT-RNR1,,HGNC:7470
117189,4556,MT-TE,,HGNC:7479
117245,4558,MT-TF,,HGNC:7481
117382,4564,MT-TH,,HGNC:7487


In [12]:
NCBI_geneSymbol_HGNC_mapping.loc[
    (NCBI_geneSymbol_HGNC_mapping['gene_symbol']=='-') & 
    (NCBI_geneSymbol_HGNC_mapping['NCBI_to_HGNC'] != 'N/A')
]

Unnamed: 0,ncbi_gene_id,gene_symbol,NCBI_to_HGNC,geneSymbol_to_HGNC


#### the best way of mapping NCBI - gene symbol - HGNC is thorugh gene symbols.

- 6 NCBI genes do not have any corresponding symbol or genes
- 41 NCBI genes do not have corresponding HGNC ID
- 19 gene symbols do not have any corresponding HGNC ID

Therefore, we can use gene symbols as the bridge and in this way we map 5167/5186 genes 

In [13]:
NCBI_geneSymbol_HGNC_mapping_complete = NCBI_geneSymbol_HGNC_mapping.loc[NCBI_geneSymbol_HGNC_mapping['geneSymbol_to_HGNC']!='N/A',:]
NCBI_geneSymbol_HGNC_mapping_complete.tail()

Unnamed: 0,ncbi_gene_id,gene_symbol,NCBI_to_HGNC,geneSymbol_to_HGNC
391420,101101692,HELLPAR,,HGNC:43984
391452,101928376,IL12A-AS1,HGNC:49094,HGNC:49094
391537,101929726,MYMX,HGNC:52391,HGNC:52391
391666,105371045,PERCC1,HGNC:52293,HGNC:52293
391730,120766137,HRURF,HGNC:55085,HGNC:55085


### map hpo_genes_to_phenotype_kidney ncbi ids to HGNC though gene symbols using NCBI_geneSymbol_HGNC_mapping lookup table

In [14]:
# remove the rows without any gene symbol
hpo_genes_to_phenotype_kidney_annotated = hpo_genes_to_phenotype_kidney.copy()
hpo_genes_to_phenotype_kidney_annotated = hpo_genes_to_phenotype_kidney_annotated.loc[hpo_genes_to_phenotype_kidney_annotated['gene_symbol']!='-',:]
hpo_genes_to_phenotype_kidney_annotated

Unnamed: 0,ncbi_gene_id,gene_symbol,hpo_id,hpo_name,frequency,disease_id
1444,54,ACP5,HP:0005576,Tubulointerstitial fibrosis,-,OMIM:607944
1446,54,ACP5,HP:0012622,Chronic kidney disease,HP:0040282,ORPHA:1855
1482,54,ACP5,HP:0000790,Hematuria,HP:0040283,ORPHA:1855
2079,60,ACTB,HP:0000126,Hydronephrosis,HP:0040282,ORPHA:2995
2423,71,ACTG1,HP:0000126,Hydronephrosis,HP:0040282,ORPHA:2995
...,...,...,...,...,...,...
391441,101101692,HELLPAR,HP:0001919,Acute kidney injury,HP:0040284,ORPHA:244242
391454,101928376,IL12A-AS1,HP:0100820,Glomerulopathy,HP:0040283,ORPHA:117
391462,101928376,IL12A-AS1,HP:0000083,Renal insufficiency,HP:0040283,ORPHA:117
391463,101928376,IL12A-AS1,HP:0000099,Glomerulonephritis,HP:0040284,ORPHA:117


In [15]:

# add HGNC annotations
hpo_genes_to_phenotype_kidney_annotated['HGNC'] = [NCBI_geneSymbol_HGNC_mapping_complete[NCBI_geneSymbol_HGNC_mapping_complete['gene_symbol']==x].loc[:, 'geneSymbol_to_HGNC'].values[0] for x in hpo_genes_to_phenotype_kidney_annotated['gene_symbol']]
hpo_genes_to_phenotype_kidney_annotated

Unnamed: 0,ncbi_gene_id,gene_symbol,hpo_id,hpo_name,frequency,disease_id,HGNC
1444,54,ACP5,HP:0005576,Tubulointerstitial fibrosis,-,OMIM:607944,HGNC:124
1446,54,ACP5,HP:0012622,Chronic kidney disease,HP:0040282,ORPHA:1855,HGNC:124
1482,54,ACP5,HP:0000790,Hematuria,HP:0040283,ORPHA:1855,HGNC:124
2079,60,ACTB,HP:0000126,Hydronephrosis,HP:0040282,ORPHA:2995,HGNC:132
2423,71,ACTG1,HP:0000126,Hydronephrosis,HP:0040282,ORPHA:2995,HGNC:144
...,...,...,...,...,...,...,...
391441,101101692,HELLPAR,HP:0001919,Acute kidney injury,HP:0040284,ORPHA:244242,HGNC:43984
391454,101928376,IL12A-AS1,HP:0100820,Glomerulopathy,HP:0040283,ORPHA:117,HGNC:49094
391462,101928376,IL12A-AS1,HP:0000083,Renal insufficiency,HP:0040283,ORPHA:117,HGNC:49094
391463,101928376,IL12A-AS1,HP:0000099,Glomerulonephritis,HP:0040284,ORPHA:117,HGNC:49094


In [16]:
gene_nodes_table = pd.DataFrame({
    'iri':['http://identifiers.org/hgnc/'+ a.split(':')[-1] for a in hpo_genes_to_phenotype_kidney_annotated['HGNC']],
    'label': hpo_genes_to_phenotype_kidney_annotated['gene_symbol'],
    'type':'http://purl.bioontology.org/ontology/HGNC/gene',
    'source':'https://purl.humanatlas.io/vocab/hp'
})
gene_nodes_table.drop_duplicates(inplace=True)

In [17]:
gene_nodes_table.to_csv('../input-csvs/hpo-kidney-genes-nodes.csv', index=False)

# Phenotypes Table

In [18]:
phenotype_nodes_table = pd.DataFrame({
    'iri':['http://purl.obolibrary.org/obo/HP_'+ a.split(':')[-1] for a in hpo_genes_to_phenotype_kidney_annotated['hpo_id']],
    'label': hpo_genes_to_phenotype_kidney_annotated['hpo_name'],
    'type':'http://purl.obolibrary.org/obo/HP_0000118', # phenotypic abnormality
    'source':'https://purl.humanatlas.io/vocab/hp'
})

phenotype_nodes_table_complete = phenotype_nodes_table.drop_duplicates()
phenotype_nodes_table_complete = phenotype_nodes_table_complete.reset_index(drop=True)
phenotype_nodes_table_complete.head()

Unnamed: 0,iri,label,type,source
0,http://purl.obolibrary.org/obo/HP_0005576,Tubulointerstitial fibrosis,http://purl.obolibrary.org/obo/HP_0000118,https://purl.humanatlas.io/vocab/hp
1,http://purl.obolibrary.org/obo/HP_0012622,Chronic kidney disease,http://purl.obolibrary.org/obo/HP_0000118,https://purl.humanatlas.io/vocab/hp
2,http://purl.obolibrary.org/obo/HP_0000790,Hematuria,http://purl.obolibrary.org/obo/HP_0000118,https://purl.humanatlas.io/vocab/hp
3,http://purl.obolibrary.org/obo/HP_0000126,Hydronephrosis,http://purl.obolibrary.org/obo/HP_0000118,https://purl.humanatlas.io/vocab/hp
4,http://purl.obolibrary.org/obo/HP_0000003,Multicystic kidney dysplasia,http://purl.obolibrary.org/obo/HP_0000118,https://purl.humanatlas.io/vocab/hp


In [19]:
phenotype_nodes_table_complete.to_csv('../input-csvs/hpo-kidney-phenotypes-nodes.csv', index=False)

# AS table:

- Bruce retrived all the AS UBERON terms in HPO: https://api.triplydb.com/s/2r-5G7lyW
    - HPO_AS_uberon.csv

- Bruce also prepared this file mapping: AS -> Organ Mapping https://grlc.io/api-git/hubmapconsortium/ccf-grlc/subdir/hra/as-parts.csv?location=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FUBERON_0002113 . The corresponding SPARQL is here: https://github.com/hubmapconsortium/ccf-grlc/blob/main/hra/as-parts.rq

    - The last part of the URL correspond to the ueberon term of Kidney: UBERON_0002113
    - as-parts.csv 
    


In [20]:
HPO_AS = pd.read_csv("./data/HPO_AS_uberon.csv")
HPO_AS.head()

Unnamed: 0,iri,label,type,source
0,http://purl.obolibrary.org/obo/UBERON_0000019,camera-type eye,http://purl.obolibrary.org/obo/UBERON_0000061,https://purl.humanatlas.io/vocab/hp
1,http://purl.obolibrary.org/obo/UBERON_0000955,brain,http://purl.obolibrary.org/obo/UBERON_0000061,https://purl.humanatlas.io/vocab/hp
2,http://purl.obolibrary.org/obo/UBERON_0000966,retina,http://purl.obolibrary.org/obo/UBERON_0000061,https://purl.humanatlas.io/vocab/hp
3,http://purl.obolibrary.org/obo/UBERON_0001017,central nervous system,http://purl.obolibrary.org/obo/UBERON_0000061,https://purl.humanatlas.io/vocab/hp
4,http://purl.obolibrary.org/obo/UBERON_0001607,sphincter pupillae,http://purl.obolibrary.org/obo/UBERON_0000061,https://purl.humanatlas.io/vocab/hp


In [21]:
kidney_as_parts = pd.read_csv("./data/as-parts.csv")
kidney_as_parts.head()

Unnamed: 0,part_iri,part_label
0,http://purl.obolibrary.org/obo/UBERON_0000074,renal glomerulus
1,http://purl.obolibrary.org/obo/UBERON_0000362,renal medulla
2,http://purl.obolibrary.org/obo/UBERON_0001225,cortex of kidney
3,http://purl.obolibrary.org/obo/UBERON_0001228,renal papilla
4,http://purl.obolibrary.org/obo/UBERON_0001229,renal corpuscle


In [22]:
HPO_AS_kidney = HPO_AS[HPO_AS['iri'].isin(kidney_as_parts['part_iri'])]
HPO_AS_kidney.head()

Unnamed: 0,iri,label,type,source
139,http://purl.obolibrary.org/obo/UBERON_0004100,renal collecting system,http://purl.obolibrary.org/obo/UBERON_0000061,https://purl.humanatlas.io/vocab/hp
215,http://purl.obolibrary.org/obo/UBERON_0000074,renal glomerulus,http://purl.obolibrary.org/obo/UBERON_0000061,https://purl.humanatlas.io/vocab/hp
251,http://purl.obolibrary.org/obo/UBERON_0002113,kidney,http://purl.obolibrary.org/obo/UBERON_0000061,https://purl.humanatlas.io/vocab/hp
544,http://purl.obolibrary.org/obo/UBERON_0004538,left kidney,http://purl.obolibrary.org/obo/UBERON_0000061,https://purl.humanatlas.io/vocab/hp
545,http://purl.obolibrary.org/obo/UBERON_0004539,right kidney,http://purl.obolibrary.org/obo/UBERON_0000061,https://purl.humanatlas.io/vocab/hp


In [23]:
HPO_AS_kidney.reset_index(drop=True, inplace=True)
HPO_AS_kidney.head()

Unnamed: 0,iri,label,type,source
0,http://purl.obolibrary.org/obo/UBERON_0004100,renal collecting system,http://purl.obolibrary.org/obo/UBERON_0000061,https://purl.humanatlas.io/vocab/hp
1,http://purl.obolibrary.org/obo/UBERON_0000074,renal glomerulus,http://purl.obolibrary.org/obo/UBERON_0000061,https://purl.humanatlas.io/vocab/hp
2,http://purl.obolibrary.org/obo/UBERON_0002113,kidney,http://purl.obolibrary.org/obo/UBERON_0000061,https://purl.humanatlas.io/vocab/hp
3,http://purl.obolibrary.org/obo/UBERON_0004538,left kidney,http://purl.obolibrary.org/obo/UBERON_0000061,https://purl.humanatlas.io/vocab/hp
4,http://purl.obolibrary.org/obo/UBERON_0004539,right kidney,http://purl.obolibrary.org/obo/UBERON_0000061,https://purl.humanatlas.io/vocab/hp


In [24]:
HPO_AS_kidney.head().to_csv('../input-csvs/hpo-kidney-as-nodes.csv', index=False)

# Edge tables

 Subject | Predicate | Object


In [25]:
hpo_genes_to_phenotype_kidney_annotated['hgnc_iri']= ['http://identifiers.org/hgnc/'+ a.split(':')[-1] for a in hpo_genes_to_phenotype_kidney_annotated['HGNC']]
hpo_genes_to_phenotype_kidney_annotated.head()

Unnamed: 0,ncbi_gene_id,gene_symbol,hpo_id,hpo_name,frequency,disease_id,HGNC,hgnc_iri
1444,54,ACP5,HP:0005576,Tubulointerstitial fibrosis,-,OMIM:607944,HGNC:124,http://identifiers.org/hgnc/124
1446,54,ACP5,HP:0012622,Chronic kidney disease,HP:0040282,ORPHA:1855,HGNC:124,http://identifiers.org/hgnc/124
1482,54,ACP5,HP:0000790,Hematuria,HP:0040283,ORPHA:1855,HGNC:124,http://identifiers.org/hgnc/124
2079,60,ACTB,HP:0000126,Hydronephrosis,HP:0040282,ORPHA:2995,HGNC:132,http://identifiers.org/hgnc/132
2423,71,ACTG1,HP:0000126,Hydronephrosis,HP:0040282,ORPHA:2995,HGNC:144,http://identifiers.org/hgnc/144


In [26]:


edge_table = pd.DataFrame({'subject': hpo_genes_to_phenotype_kidney_annotated['hgnc_iri'],
                            'predicate': 'https://purl.humanatlas.io/vocab/hp#has_modifier', 
                            'object': ['http://purl.obolibrary.org/obo/HP_' + a.split(':')[-1] for a in hpo_genes_to_phenotype_kidney_annotated['hpo_id']],
                            'source':'https://purl.humanatlas.io/vocab/hp'
                            })
edge_table

Unnamed: 0,subject,predicate,object,source
1444,http://identifiers.org/hgnc/124,https://purl.humanatlas.io/vocab/hp#has_modifier,http://purl.obolibrary.org/obo/HP_0005576,https://purl.humanatlas.io/vocab/hp
1446,http://identifiers.org/hgnc/124,https://purl.humanatlas.io/vocab/hp#has_modifier,http://purl.obolibrary.org/obo/HP_0012622,https://purl.humanatlas.io/vocab/hp
1482,http://identifiers.org/hgnc/124,https://purl.humanatlas.io/vocab/hp#has_modifier,http://purl.obolibrary.org/obo/HP_0000790,https://purl.humanatlas.io/vocab/hp
2079,http://identifiers.org/hgnc/132,https://purl.humanatlas.io/vocab/hp#has_modifier,http://purl.obolibrary.org/obo/HP_0000126,https://purl.humanatlas.io/vocab/hp
2423,http://identifiers.org/hgnc/144,https://purl.humanatlas.io/vocab/hp#has_modifier,http://purl.obolibrary.org/obo/HP_0000126,https://purl.humanatlas.io/vocab/hp
...,...,...,...,...
391441,http://identifiers.org/hgnc/43984,https://purl.humanatlas.io/vocab/hp#has_modifier,http://purl.obolibrary.org/obo/HP_0001919,https://purl.humanatlas.io/vocab/hp
391454,http://identifiers.org/hgnc/49094,https://purl.humanatlas.io/vocab/hp#has_modifier,http://purl.obolibrary.org/obo/HP_0100820,https://purl.humanatlas.io/vocab/hp
391462,http://identifiers.org/hgnc/49094,https://purl.humanatlas.io/vocab/hp#has_modifier,http://purl.obolibrary.org/obo/HP_0000083,https://purl.humanatlas.io/vocab/hp
391463,http://identifiers.org/hgnc/49094,https://purl.humanatlas.io/vocab/hp#has_modifier,http://purl.obolibrary.org/obo/HP_0000099,https://purl.humanatlas.io/vocab/hp


In [27]:
edge_table.to_csv('../input-csvs/hpo-kidney-genes-to-phenotype-edges.csv', index=False)