In [8]:
import os 
from bioportal_client import BioPortalClient
import pandas as pd

In [9]:
def get_bioportal_mappings(from_ontology, to_ontology):
    api_key = "225a7f07-744e-41fc-b8e8-7c3d9b11a100"
    client = BioPortalClient(api_key)
    return client.get_mappings(from_ontology, to_ontology)

# Getting the Mappings

### HPO to MESH

In [10]:
hpo_mesh = get_bioportal_mappings('HP', 'MESH')

In [11]:
len(hpo_mesh)

923

In [12]:
hpo_mesh

{'http://purl.bioontology.org/ontology/MESH/D007007': 'http://purl.obolibrary.org/obo/HP_0000966',
 'http://purl.bioontology.org/ontology/MESH/D065707': 'http://purl.obolibrary.org/obo/HP_0010636',
 'http://purl.bioontology.org/ontology/MESH/D012162': 'http://purl.obolibrary.org/obo/HP_0000546',
 'http://purl.bioontology.org/ontology/MESH/D006628': 'http://purl.obolibrary.org/obo/HP_0001007',
 'http://purl.bioontology.org/ontology/MESH/D009845': 'http://purl.obolibrary.org/obo/HP_0000798',
 'http://purl.bioontology.org/ontology/MESH/D004716': 'http://purl.obolibrary.org/obo/HP_0025636',
 'http://purl.bioontology.org/ontology/MESH/D006327': 'http://purl.obolibrary.org/obo/HP_0012722',
 'http://purl.bioontology.org/ontology/MESH/D014973': 'http://purl.obolibrary.org/obo/HP_0000991',
 'http://purl.bioontology.org/ontology/MESH/D018908': 'http://purl.obolibrary.org/obo/HP_0001324',
 'http://purl.bioontology.org/ontology/MESH/D020069': 'http://purl.obolibrary.org/obo/HP_0030834',
 'http://p

In [13]:
hpo_mesh_df = pd.DataFrame.from_dict(hpo_mesh, orient='index', columns = ['HPO'])
hpo_mesh_df.reset_index(inplace = True)

In [14]:
hpo_mesh_df['HPO']=hpo_mesh_df['HPO'].apply(lambda row: row.split('_')[-1])
hpo_mesh_df['MESH']=hpo_mesh_df['index'].apply(lambda row: row.split('/')[-1])
hpo_mesh_df.drop(columns = 'index', inplace = True)

In [15]:
hpo_mesh_df.head(3)

Unnamed: 0,HPO,MESH
0,966,D007007
1,10636,D065707
2,546,D012162


### DOID to OMIM

In [16]:
doid_omim = get_bioportal_mappings("DOID", "OMIM")

In [17]:
do_omim_df = pd.DataFrame.from_dict(doid_omim, orient='index', columns = ['OMIM'])
do_omim_df.reset_index(inplace = True)

In [18]:
do_omim_df['DOID']=do_omim_df['index'].apply(lambda row: row.split('_')[-1])
do_omim_df['OMIM']=do_omim_df['OMIM'].apply(lambda row: row.split('/')[-1])
do_omim_df.drop(columns = 'index', inplace = True)

In [19]:
do_omim_df.head(3)

Unnamed: 0,OMIM,DOID
0,MTHU039740,8986
1,617904,80291
2,MTHU013769,1572


### DOID to ORPHA

In [20]:
doid_orpha = get_bioportal_mappings('DOID', 'ORDO')

In [21]:
do_orpha_df = pd.DataFrame.from_dict(doid_orpha, orient='index', columns = ['ORPHA'])
do_orpha_df.reset_index(inplace = True)

In [22]:
do_orpha_df['DOID']=do_orpha_df['index'].apply(lambda row: row.split('_')[-1])
do_orpha_df['ORPHA']=do_orpha_df['ORPHA'].apply(lambda row: row.split('_')[-1])
do_orpha_df.drop(columns = 'index', inplace = True)

In [23]:
do_orpha_df.head(3)

Unnamed: 0,ORPHA,DOID
0,364559,1934
1,90652,111784
2,178,3302


## HPO to DO mappings

In [24]:
hpo_do = get_bioportal_mappings("HP", "DOID")
# For some reason it is returning a mixed mapping,
# containing SYMP, HPO AND DO TO HPO.
# I clean the mapping by doing a partial string match

In [25]:
hpo_do_df = pd.DataFrame.from_dict(hpo_do, orient='index', columns = ['HPO'])
hpo_do_df.reset_index(inplace = True)

In [26]:
hpo_do_df = hpo_do_df.loc[hpo_do_df['index'].str.contains('DOID', na = False)]

In [27]:
hpo_do_df['DOID']=hpo_do_df['index'].apply(lambda row: row.split('_')[-1])
hpo_do_df['HPO']=hpo_do_df['HPO'].apply(lambda row: row.split('_')[-1])

In [28]:
hpo_do_df.drop(columns = 'index', inplace = True)

In [29]:
hpo_do_df

Unnamed: 0,HPO,DOID
0,0030050,8986
2,0002343,1572
4,0010762,3302
6,0100324,419
8,0011950,2942
...,...,...
1454,0004810,1342
1455,0012474,807
1456,0001004,4977
1458,0012197,3892


## Mappings parsed
\
\
#### HPO to Mesh  =  hpo_mesh_df
\
#### OMIM to DO   =  do_omim_df
\
#### ORPHA to DO  =  do_orpha_df
\
#### HPO to DO = hpo_do_df

# Parsing the Edges

In [30]:
hpo_annot = pd.read_csv('hpo_anot.txt', delimiter = "\t", skiprows = 2, header = None)

In [31]:
header_key = ["HPO_ID", "HPO_LABEL", "gen_id", "gen_name", "info_from_source", "source", "disease_id"]

In [32]:
hpo_annot.columns = header_key
hpo_annot.drop(columns = 'info_from_source', inplace = True)

In [33]:
hpo_annot

Unnamed: 0,HPO_ID,HPO_LABEL,gen_id,gen_name,source,disease_id
0,HP:0000002,Abnormality of body height,6138,RPL15,orphadata,ORPHA:124
1,HP:0000002,Abnormality of body height,7227,TRPS1,mim2gene,OMIM:190351
2,HP:0000002,Abnormality of body height,3835,KIF22,mim2gene,OMIM:603546
3,HP:0000002,Abnormality of body height,4001,LMNB1,orphadata,ORPHA:2514
4,HP:0000002,Abnormality of body height,545,ATR,orphadata,ORPHA:808
...,...,...,...,...,...,...
1021766,HP:0032754,Focal aware sensory seizure,64780,MICAL1,mim2gene,OMIM:600512
1021767,HP:0032754,Focal aware sensory seizure,5649,RELN,mim2gene,OMIM:600512
1021768,HP:0032759,Focal sensory seizure with vestibular features,9211,LGI1,mim2gene,OMIM:600512
1021769,HP:0032759,Focal sensory seizure with vestibular features,64780,MICAL1,mim2gene,OMIM:600512


### The messy part

We don't have HPO on SPOKE, but we have MESH. We can use mappings from HPO to MESH, and just use the MESH nodes to build edges.

Edges from HPO can be connected to DOID (Diseases) but only through OMIM-DOID mappings or ORPHA-DOID mappings. 

### HPO / MESH ---> (edge) --> to OMIM/DOID

In [34]:
hpo_edge_omim = hpo_annot.loc[hpo_annot['disease_id'].str.contains('OMIM', na = False)]

In [35]:
hpo_edge_omim['OMIM'] = hpo_edge_omim['disease_id'].apply(lambda row : row.split(':')[-1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hpo_edge_omim['OMIM'] = hpo_edge_omim['disease_id'].apply(lambda row : row.split(':')[-1])


In [36]:
hpo_edge_omim['HPO'] = hpo_edge_omim['HPO_ID'].apply(lambda row : row.split(':')[-1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hpo_edge_omim['HPO'] = hpo_edge_omim['HPO_ID'].apply(lambda row : row.split(':')[-1])


In [37]:
hpo_mesh_edge_to_omim = hpo_edge_omim.merge(hpo_mesh_df, how = 'inner', on = 'HPO')

In [38]:
hpo_mesh_edge_to_omim['OMIM'] = hpo_mesh_edge_to_omim['disease_id'].apply(lambda row : row.split(':')[-1])

In [39]:
hpo_mesh_edge_to_omim_do = hpo_mesh_edge_to_omim.merge(do_omim_df, how = 'inner', on = 'OMIM')

#### HPO / MESH ---> (edge) --> to OMIM/DOID result

In [40]:
hpo_mesh_edge_to_omim_do.drop(columns=['gen_name', 'gen_id', 'HPO_ID', 'source', 'disease_id'], inplace = True)

In [41]:
hpo_mesh_edge_to_omim_do

Unnamed: 0,HPO_LABEL,OMIM,HPO,MESH,DOID
0,Bladder diverticulum,194050,0000015,C562406,1928
1,Bladder diverticulum,194050,0000015,C562406,1928
2,Vesicoureteral reflux,194050,0000076,D014718,1928
3,Vesicoureteral reflux,194050,0000076,D014718,1928
4,Renal insufficiency,194050,0000083,D051437,1928
...,...,...,...,...,...
10966,Pilomatrixoma,132600,0030434,D018296,5374
10967,Fundus albipunctatus,136880,0030642,C562733,11105
10968,Fundus albipunctatus,136880,0030642,C562733,11105
10969,Fundus albipunctatus,136880,0030642,C562733,11105


In [42]:
hpo_mesh_edge_to_omim_do.drop_duplicates(inplace=True)

In [43]:
hpo_mesh_edge_to_omim_do.to_csv('hpo_mesh_edge_to_omim_do.csv')

### HPO / DOID (mapped)---> (edge) --> to OMIM/DOID

In [44]:
hpo_do_edge_omim = hpo_edge_omim.merge(hpo_do_df, how = 'inner', on = 'HPO')

In [45]:
hpo_do_edge_omim

Unnamed: 0,HPO_ID,HPO_LABEL,gen_id,gen_name,source,disease_id,OMIM,HPO,DOID
0,HP:0000011,Neurogenic bladder,2697,GJA1,mim2gene,OMIM:164200,164200,0000011,12143
1,HP:0000011,Neurogenic bladder,2019,EN1,mim2gene,OMIM:619218,619218,0000011,12143
2,HP:0000011,Neurogenic bladder,6885,MAP3K7,mim2gene,OMIM:617137,617137,0000011,12143
3,HP:0000011,Neurogenic bladder,10466,COG5,mim2gene,OMIM:613612,613612,0000011,12143
4,HP:0000011,Neurogenic bladder,55823,VPS11,mim2gene,OMIM:616683,616683,0000011,12143
...,...,...,...,...,...,...,...,...,...
24195,HP:0032409,Subcortical band heterotopia,5048,PAFAH1B1,mim2gene,OMIM:607432,607432,0032409,0111169
24196,HP:0032409,Subcortical band heterotopia,79633,FAT4,mim2gene,OMIM:615546,615546,0032409,0111169
24197,HP:0032409,Subcortical band heterotopia,387119,CEP85L,mim2gene,OMIM:618873,618873,0032409,0111169
24198,HP:0032564,Ileitis,5970,RELA,mim2gene,OMIM:618287,618287,0032564,0060189


In [46]:
hpo_do_edge_omim.rename(columns = {'DOID' : 'DOID_MAP'}, inplace = True)

In [47]:
hpo_do_edge_omim['OMIM'] = hpo_do_edge_omim['disease_id'].apply(lambda row : row.split(':')[-1])

In [48]:
hpo_do_edge_omim_do = hpo_do_edge_omim.merge(do_omim_df, how = 'inner', on = 'OMIM')

In [49]:
hpo_do_edge_omim_do.rename(columns = {'DOID' : 'EDGES_TO_DOID'}, inplace = True)

#### This DOID --> (EDGE) --> DOID, might seem weird, but a disease like "aortic aneurysm" presents in syndromes like "Marfan Syndrome".

In [50]:
hpo_do_edge_omim_do

Unnamed: 0,HPO_ID,HPO_LABEL,gen_id,gen_name,source,disease_id,OMIM,HPO,DOID_MAP,EDGES_TO_DOID
0,HP:0000011,Neurogenic bladder,2697,GJA1,mim2gene,OMIM:164200,164200,0000011,12143,0060291
1,HP:0000175,Cleft palate,2697,GJA1,mim2gene,OMIM:164200,164200,0000175,674,0060291
2,HP:0000252,Microcephaly,2697,GJA1,mim2gene,OMIM:164200,164200,0000252,10907,0060291
3,HP:0410030,Cleft lip,2697,GJA1,mim2gene,OMIM:164200,164200,0410030,9296,0060291
4,HP:0000501,Glaucoma,2697,GJA1,mim2gene,OMIM:164200,164200,0000501,1686,0060291
...,...,...,...,...,...,...,...,...,...,...
8526,HP:0030078,Lung adenocarcinoma,1548,CYP2A6,mim2gene,OMIM:211980,211980,0030078,3910,1324
8527,HP:0030078,Lung adenocarcinoma,1956,EGFR,mim2gene,OMIM:211980,211980,0030078,3910,1324
8528,HP:0030078,Lung adenocarcinoma,5519,PPP2R1B,mim2gene,OMIM:211980,211980,0030078,3910,1324
8529,HP:0030078,Lung adenocarcinoma,673,BRAF,mim2gene,OMIM:211980,211980,0030078,3910,1324


In [51]:
hpo_do_edge_omim_do.drop(columns = ['gen_name', 'gen_id', 'HPO_ID', 'source', 'disease_id'], inplace = True)

In [52]:
hpo_do_edge_omim_do.drop_duplicates(inplace=True)

In [53]:
hpo_do_edge_omim_do.to_csv('hpo_do_edge_to_omim_do.csv')

### HPO / MESH --> (edge) --> ORPHA/DOID

In [54]:
hpo_edge_orpha = hpo_annot.loc[hpo_annot['disease_id'].str.contains('ORPHA', na = False)]

In [55]:
hpo_edge_orpha

Unnamed: 0,HPO_ID,HPO_LABEL,gen_id,gen_name,source,disease_id
0,HP:0000002,Abnormality of body height,6138,RPL15,orphadata,ORPHA:124
3,HP:0000002,Abnormality of body height,4001,LMNB1,orphadata,ORPHA:2514
4,HP:0000002,Abnormality of body height,545,ATR,orphadata,ORPHA:808
5,HP:0000002,Abnormality of body height,1499,CTNNB1,orphadata,ORPHA:54595
8,HP:0000002,Abnormality of body height,50937,CDON,orphadata,ORPHA:280200
...,...,...,...,...,...,...
1021745,HP:0032679,Focal non-motor seizure,81704,DOCK8,orphadata,ORPHA:178469
1021748,HP:0032679,Focal non-motor seizure,10369,CACNG2,orphadata,ORPHA:178469
1021751,HP:0032680,Focal cognitive seizure,8131,NPRL3,orphadata,ORPHA:98820
1021752,HP:0032680,Focal cognitive seizure,9681,DEPDC5,orphadata,ORPHA:98820


In [56]:
hpo_edge_orpha['ORPHA'] = hpo_edge_orpha['disease_id'].apply(lambda row : row.split(':')[-1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hpo_edge_orpha['ORPHA'] = hpo_edge_orpha['disease_id'].apply(lambda row : row.split(':')[-1])


In [57]:
hpo_edge_orpha['HPO'] = hpo_edge_orpha['HPO_ID'].apply(lambda row : row.split(':')[-1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hpo_edge_orpha['HPO'] = hpo_edge_orpha['HPO_ID'].apply(lambda row : row.split(':')[-1])


In [58]:
hpo_mesh_edge_to_orpha = hpo_edge_orpha.merge(hpo_mesh_df, how = 'inner', on = 'HPO')

In [59]:
hpo_mesh_edge_to_orpha_do = hpo_mesh_edge_to_orpha.merge(do_orpha_df, how = 'inner', on = 'ORPHA')

In [60]:
hpo_mesh_edge_to_orpha_do.drop(columns = ['gen_name', 'gen_id', 'HPO_ID', 'source', 'disease_id'], inplace = True)

In [61]:
hpo_mesh_edge_to_orpha_do

Unnamed: 0,HPO_LABEL,ORPHA,HPO,MESH,DOID
0,Bladder diverticulum,198,0000015,C562406,0111272
1,Pectus carinatum,198,0000768,D066166,0111272
2,Osteoporosis,198,0000939,D010024,0111272
3,Jaundice,198,0000952,D007565,0111272
4,Brachydactyly,198,0001156,D059327,0111272
...,...,...,...,...,...
20130,Hypercholesterolemia,79506,0003124,D006937,0111368
20131,Nevus,840,0003764,D009506,5445
20132,Epidermal nevus,840,0010816,C580062,5445
20133,Vascular calcification,289601,0004934,D061205,0111582


In [62]:
hpo_mesh_edge_to_orpha_do.drop_duplicates(inplace=True)

In [63]:
hpo_mesh_edge_to_orpha_do.to_csv('hpo_mesh_edge_to_orpha_do.csv')

### HPO / DOID (mapped)---> (edge) --> to ORPHA/DOID

In [64]:
hpo_edge_orpha

Unnamed: 0,HPO_ID,HPO_LABEL,gen_id,gen_name,source,disease_id,ORPHA,HPO
0,HP:0000002,Abnormality of body height,6138,RPL15,orphadata,ORPHA:124,124,0000002
3,HP:0000002,Abnormality of body height,4001,LMNB1,orphadata,ORPHA:2514,2514,0000002
4,HP:0000002,Abnormality of body height,545,ATR,orphadata,ORPHA:808,808,0000002
5,HP:0000002,Abnormality of body height,1499,CTNNB1,orphadata,ORPHA:54595,54595,0000002
8,HP:0000002,Abnormality of body height,50937,CDON,orphadata,ORPHA:280200,280200,0000002
...,...,...,...,...,...,...,...,...
1021745,HP:0032679,Focal non-motor seizure,81704,DOCK8,orphadata,ORPHA:178469,178469,0032679
1021748,HP:0032679,Focal non-motor seizure,10369,CACNG2,orphadata,ORPHA:178469,178469,0032679
1021751,HP:0032680,Focal cognitive seizure,8131,NPRL3,orphadata,ORPHA:98820,98820,0032680
1021752,HP:0032680,Focal cognitive seizure,9681,DEPDC5,orphadata,ORPHA:98820,98820,0032680


In [65]:
hpo_do_edge_orpha = hpo_edge_orpha.merge(hpo_do_df, how = 'inner', on = 'HPO')

In [66]:
hpo_do_edge_orpha.rename(columns = {'DOID' : 'DOID_MAP'}, inplace = True)

In [67]:
hpo_do_edge_orpha_do = hpo_do_edge_orpha.merge(do_orpha_df, how = 'inner', on = "ORPHA")

In [68]:
hpo_do_edge_orpha_do.rename(columns = {'DOID' : 'EDGES_TO_DOID'}, inplace = True)

In [69]:
hpo_do_edge_orpha_do.drop(columns = ['gen_name', 'gen_id', 'HPO_ID', 'source', 'disease_id'], inplace = True )

In [70]:
hpo_do_edge_orpha_do.drop_duplicates(inplace = True)

In [71]:
hpo_do_edge_orpha_do.to_csv('hpo_do_edge_orpha_do.csv')

### HPO/SPOKE Matching

In [72]:
hpo_spoke_match = pd.read_csv("hpo_spoke_match.csv")

In [73]:
hpo_spoke_match.head(10)

Unnamed: 0,HPO_ID,hpo,node_id,node_name,node_type,similarity
0,HP:0003113,Hypochloremia,C0085680,Hypochloraemia,SideEffect,96
1,HP:0001067,Neurofibromas,C0027830,Neurofibroma,SideEffect,96
2,HP:0001067,Neurofibromas,DOID:962,neurofibroma,Disease,96
3,HP:0004409,Hyposmia,C2364082,Hyposmia,SideEffect,100
4,HP:0001386,Joint swelling,C0152031,Joint swelling,SideEffect,100
5,HP:0032192,Hydatidiform mole,C0020217,Hydatidiform mole,SideEffect,100
6,HP:0032192,Hydatidiform mole,C0549315,Benign hydatidiform mole,SideEffect,95
7,HP:0000966,Hypohidrosis,C0020620,Hypohidrosis,SideEffect,100
8,HP:0000966,Hypohidrosis,DOID:11155,hypohidrosis,Disease,100
9,HP:0002671,Basal cell carcinoma,C0007117,Basal cell carcinoma,SideEffect,100


Similarity at 95 or below is hit or miss. It seems that 96 and above is the sweet spot

In [74]:
hpo_spoke_match = hpo_spoke_match.loc[hpo_spoke_match['similarity'] >= 96]

In [75]:
# Let's decrease complexity by using only disease

In [76]:
##hpo_spoke_match = hpo_spoke_match.loc[hpo_spoke_match['node_type'] == 'Disease']

Now let's merge it with all the anottations (Edges)

In [77]:
hpo_spoke_annot = hpo_spoke_match.merge(hpo_annot, how = 'inner', on='HPO_ID')

In [78]:
hpo_spoke_annot

Unnamed: 0,HPO_ID,hpo,node_id,node_name,node_type,similarity,HPO_LABEL,gen_id,gen_name,source,disease_id
0,HP:0003113,Hypochloremia,C0085680,Hypochloraemia,SideEffect,96,Hypochloremia,1811,SLC26A3,mim2gene,OMIM:214700
1,HP:0003113,Hypochloremia,C0085680,Hypochloraemia,SideEffect,96,Hypochloremia,7809,BSND,orphadata,ORPHA:89938
2,HP:0003113,Hypochloremia,C0085680,Hypochloraemia,SideEffect,96,Hypochloremia,3758,KCNJ1,mim2gene,OMIM:241200
3,HP:0003113,Hypochloremia,C0085680,Hypochloraemia,SideEffect,96,Hypochloremia,6557,SLC12A1,mim2gene,OMIM:601678
4,HP:0003113,Hypochloremia,C0085680,Hypochloraemia,SideEffect,96,Hypochloremia,1187,CLCNKA,orphadata,ORPHA:89938
...,...,...,...,...,...,...,...,...,...,...,...
199291,HP:0001264,Spastic diplegia,DOID:10965,spastic diplegia,Disease,100,Spastic diplegia,3028,HSD17B10,orphadata,ORPHA:391428
199292,HP:0001264,Spastic diplegia,DOID:10965,spastic diplegia,Disease,100,Spastic diplegia,8260,NAA10,mim2gene,OMIM:309800
199293,HP:0001264,Spastic diplegia,DOID:10965,spastic diplegia,Disease,100,Spastic diplegia,1615,DARS1,mim2gene,OMIM:615281
199294,HP:0001264,Spastic diplegia,DOID:10965,spastic diplegia,Disease,100,Spastic diplegia,8050,PDHX,orphadata,ORPHA:255182


### Let's merge on OMIM

In [79]:
hpo_spoke_annot.drop(columns = ['similarity', 'node_type', 'gen_id', 'gen_name', 'source'], inplace = True)

In [80]:
hpo_spoke_annot_omim = hpo_spoke_annot.loc[hpo_spoke_annot['disease_id'].str.contains('OMIM', na = False)]

In [81]:
hpo_spoke_annot_omim

Unnamed: 0,HPO_ID,hpo,node_id,node_name,HPO_LABEL,disease_id
0,HP:0003113,Hypochloremia,C0085680,Hypochloraemia,Hypochloremia,OMIM:214700
2,HP:0003113,Hypochloremia,C0085680,Hypochloraemia,Hypochloremia,OMIM:241200
3,HP:0003113,Hypochloremia,C0085680,Hypochloraemia,Hypochloremia,OMIM:601678
5,HP:0003113,Hypochloremia,C0085680,Hypochloraemia,Hypochloremia,OMIM:602522
6,HP:0003113,Hypochloremia,C0085680,Hypochloraemia,Hypochloremia,OMIM:613090
...,...,...,...,...,...,...
199289,HP:0001264,Spastic diplegia,DOID:10965,spastic diplegia,Spastic diplegia,OMIM:233400
199290,HP:0001264,Spastic diplegia,DOID:10965,spastic diplegia,Spastic diplegia,OMIM:619065
199292,HP:0001264,Spastic diplegia,DOID:10965,spastic diplegia,Spastic diplegia,OMIM:309800
199293,HP:0001264,Spastic diplegia,DOID:10965,spastic diplegia,Spastic diplegia,OMIM:615281


In [82]:
hpo_spoke_annot_omim['OMIM'] = hpo_spoke_annot_omim['disease_id'].apply(lambda x: x.split(':')[-1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hpo_spoke_annot_omim['OMIM'] = hpo_spoke_annot_omim['disease_id'].apply(lambda x: x.split(':')[-1])


In [83]:
hpo_spoke_annot_omim = hpo_spoke_annot_omim.merge(do_omim_df, how = 'inner', on = 'OMIM')

In [84]:
hpo_spoke_annot_omim.to_csv('hpo_spoke_annot_omim')

In [85]:
hpo_spoke_annot_omim

Unnamed: 0,HPO_ID,hpo,node_id,node_name,HPO_LABEL,disease_id,OMIM,DOID
0,HP:0001067,Neurofibromas,C0027830,Neurofibroma,Neurofibromas,OMIM:606764,606764,9253
1,HP:0001067,Neurofibromas,C0027830,Neurofibroma,Neurofibromas,OMIM:606764,606764,9253
2,HP:0001067,Neurofibromas,C0027830,Neurofibroma,Neurofibromas,OMIM:606764,606764,9253
3,HP:0001067,Neurofibromas,DOID:962,neurofibroma,Neurofibromas,OMIM:606764,606764,9253
4,HP:0001067,Neurofibromas,DOID:962,neurofibroma,Neurofibromas,OMIM:606764,606764,9253
...,...,...,...,...,...,...,...,...
25369,HP:0000028,Cryptorchidism,DOID:11383,cryptorchidism,Cryptorchidism,OMIM:614279,614279,0111773
25370,HP:0000787,Nephrolithiasis,C0392525,Nephrolithiasis,Nephrolithiasis,OMIM:614723,614723,0060350
25371,HP:0000787,Nephrolithiasis,DOID:585,nephrolithiasis,Nephrolithiasis,OMIM:614723,614723,0060350
25372,HP:0000252,Microcephaly,C0025958,Microcephaly,Microcephaly,OMIM:211180,211180,0050684


In [86]:
hpo_spoke_annot_omim.duplicated().sum()

3145

A lot of duplciated rows, we will drop them

In [87]:
hpo_spoke_annot_omim.drop_duplicates(inplace = True)

In [88]:
hpo_spoke_annot_omim

Unnamed: 0,HPO_ID,hpo,node_id,node_name,HPO_LABEL,disease_id,OMIM,DOID
0,HP:0001067,Neurofibromas,C0027830,Neurofibroma,Neurofibromas,OMIM:606764,606764,9253
3,HP:0001067,Neurofibromas,DOID:962,neurofibroma,Neurofibromas,OMIM:606764,606764,9253
6,HP:0002015,Dysphagia,C0011168,Dysphagia,Dysphagia,OMIM:606764,606764,9253
9,HP:0004796,Gastrointestinal obstruction,C0236124,Gastrointestinal obstruction,Gastrointestinal obstruction,OMIM:606764,606764,9253
12,HP:0005214,Intestinal obstruction,C0021843,Intestinal obstruction,Intestinal obstruction,OMIM:606764,606764,9253
...,...,...,...,...,...,...,...,...
25368,HP:0000028,Cryptorchidism,DOID:11383,cryptorchidism,Cryptorchidism,OMIM:614279,614279,0111773
25370,HP:0000787,Nephrolithiasis,C0392525,Nephrolithiasis,Nephrolithiasis,OMIM:614723,614723,0060350
25371,HP:0000787,Nephrolithiasis,DOID:585,nephrolithiasis,Nephrolithiasis,OMIM:614723,614723,0060350
25372,HP:0000252,Microcephaly,C0025958,Microcephaly,Microcephaly,OMIM:211180,211180,0050684


In [89]:
hpo_spoke_annot_omim.to_csv('hpo_spoke_annot_omim.csv')

### Let's merge on ORPHA

In [90]:
hpo_spoke_annot_orpha = hpo_spoke_annot.loc[hpo_spoke_annot['disease_id'].str.contains('ORPHA', na = False)]

In [91]:
hpo_spoke_annot_orpha['ORPHA'] = hpo_spoke_annot_orpha['disease_id'].apply(lambda x: x.split(':')[-1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hpo_spoke_annot_orpha['ORPHA'] = hpo_spoke_annot_orpha['disease_id'].apply(lambda x: x.split(':')[-1])


In [92]:
hpo_spoke_annot_orpha = hpo_spoke_annot_orpha.merge(do_orpha_df, how = 'inner', on = 'ORPHA')

In [93]:
hpo_spoke_annot_orpha

Unnamed: 0,HPO_ID,hpo,node_id,node_name,HPO_LABEL,disease_id,ORPHA,DOID
0,HP:0001067,Neurofibromas,C0027830,Neurofibroma,Neurofibromas,ORPHA:2495,2495,3565
1,HP:0001067,Neurofibromas,C0027830,Neurofibroma,Neurofibromas,ORPHA:2495,2495,3565
2,HP:0001067,Neurofibromas,C0027830,Neurofibroma,Neurofibromas,ORPHA:2495,2495,3565
3,HP:0001067,Neurofibromas,C0027830,Neurofibroma,Neurofibromas,ORPHA:2495,2495,3565
4,HP:0001067,Neurofibromas,C0027830,Neurofibroma,Neurofibromas,ORPHA:2495,2495,3565
...,...,...,...,...,...,...,...,...
45172,HP:0002289,Alopecia universalis,DOID:0050634,alopecia universalis,Alopecia universalis,ORPHA:701,701,0050634
45173,HP:0000453,Choanal atresia,DOID:9574,choanal atresia,Choanal atresia,ORPHA:1200,1200,0080695
45174,HP:0000453,Choanal atresia,DOID:9574,choanal atresia,Choanal atresia,ORPHA:1200,1200,0080695
45175,HP:0001561,Polyhydramnios,C0020224,Polyhydramnios,Polyhydramnios,ORPHA:2300,2300,14671


In [94]:
hpo_spoke_annot_orpha.duplicated().sum()

30992

In [95]:
hpo_spoke_annot_orpha.drop_duplicates(inplace = True)

In [96]:
hpo_spoke_annot_orpha

Unnamed: 0,HPO_ID,hpo,node_id,node_name,HPO_LABEL,disease_id,ORPHA,DOID
0,HP:0001067,Neurofibromas,C0027830,Neurofibroma,Neurofibromas,ORPHA:2495,2495,3565
12,HP:0001067,Neurofibromas,DOID:962,neurofibroma,Neurofibromas,ORPHA:2495,2495,3565
24,HP:0002354,Memory impairment,C0233794,Memory impairment,Memory impairment,ORPHA:2495,2495,3565
36,HP:0001342,Cerebral hemorrhage,C2937358,Cerebral haemorrhage,Cerebral hemorrhage,ORPHA:2495,2495,3565
48,HP:0000505,Visual impairment,C3665347,Visual impairment,Visual impairment,ORPHA:2495,2495,3565
...,...,...,...,...,...,...,...,...
45171,HP:0001596,Alopecia,DOID:987,alopecia,Alopecia,ORPHA:701,701,0050634
45172,HP:0002289,Alopecia universalis,DOID:0050634,alopecia universalis,Alopecia universalis,ORPHA:701,701,0050634
45173,HP:0000453,Choanal atresia,DOID:9574,choanal atresia,Choanal atresia,ORPHA:1200,1200,0080695
45175,HP:0001561,Polyhydramnios,C0020224,Polyhydramnios,Polyhydramnios,ORPHA:2300,2300,14671


In [97]:
hpo_spoke_annot_orpha.to_csv('hpo_spoke_annot_orpha.csv')

In [98]:
hpo_mesh_edge_to_orpha_do

Unnamed: 0,HPO_LABEL,ORPHA,HPO,MESH,DOID
0,Bladder diverticulum,198,0000015,C562406,0111272
1,Pectus carinatum,198,0000768,D066166,0111272
2,Osteoporosis,198,0000939,D010024,0111272
3,Jaundice,198,0000952,D007565,0111272
4,Brachydactyly,198,0001156,D059327,0111272
...,...,...,...,...,...
20129,Hypercholesterolemia,79506,0003124,D006937,0111368
20131,Nevus,840,0003764,D009506,5445
20132,Epidermal nevus,840,0010816,C580062,5445
20133,Vascular calcification,289601,0004934,D061205,0111582


In [99]:
hpo_mesh_edge_to_omim_do

Unnamed: 0,HPO_LABEL,OMIM,HPO,MESH,DOID
0,Bladder diverticulum,194050,0000015,C562406,1928
2,Vesicoureteral reflux,194050,0000076,D014718,1928
4,Renal insufficiency,194050,0000083,D051437,1928
6,Nephrocalcinosis,194050,0000121,D009397,1928
8,Otitis media,194050,0000388,D010033,1928
...,...,...,...,...,...
10955,Erythema,617525,0010783,D004890,0080249
10956,Hepatitis,114550,0012115,D006505,684
10965,Hemothorax,262850,0012151,D006491,0060601
10966,Pilomatrixoma,132600,0030434,D018296,5374


In [103]:
concatenado = pd.concat([hpo_mesh_edge_to_omim_do, hpo_mesh_edge_to_orpha_do], ignore_index= True)

In [106]:
concatenado.duplicated().sum()

0