In [1]:
import os
import pandas as pd
from py2neo import Graph
from dotenv import load_dotenv
import warnings
from functools import reduce
warnings.simplefilter('ignore')

In [2]:
#variables to connect to neo4j
uri = 'bolt://localhost:7687'
load_dotenv('../.env')
NEO4J_PASS = os.getenv("NEO4J_PASS")
NEO4J_USER = os.getenv("NEO4J_USER")

In [3]:
#set neo4j driver
graph = Graph("bolt://localhost:7687", auth=(NEO4J_USER, NEO4J_PASS))

In [None]:
NODES = ["DOID", "EFO", "HPO", "ICD9CM", "ICD10CM", "MEDDRA", "MEDGEN", "MESH", "MONDO", "NCIT", "SNOMEDCT", "ORPHANET", "UMLS"]

In [4]:
dfs_without_xref = {}
for n in range (0, 1000):
  limit = 500000
  skip = limit * n
  query_without_xref = f"""
    MATCH (n)-[:altLabel|:prefLabel]-(b) 
    RETURN n.source_id as id, b.name as synonym
    SKIP {skip}
    LIMIT {limit}
  """
  #get synonyms without cross referencing
  df_synonym_without_xref = (
    pd.DataFrame(
      graph.run(query_without_xref), columns = ['id', 'synonyms_without_xref']
    )
  )

  if df_synonym_without_xref.shape[0] > 0:
    df_synonym_without_xref = (
      df_synonym_without_xref 
      .groupby(['id'])['synonyms_without_xref'].agg(lambda x : list(set(list(x))))
      .reset_index()
    )

    df_synonym_without_xref['count_without_xref'] = [len(x) for x in df_synonym_without_xref['synonyms_without_xref']]

    df_synonym_without_xref = df_synonym_without_xref[['id', 'synonyms_without_xref', 'count_without_xref']]

    dfs_without_xref.append(df_synonym_without_xref)
  else:
    break

In [5]:
df_without_xref = reduce(lambda df1, df2: pd.concat([df1, df2]), dfs_without_xref)
print(df_without_xref.shape)
print(df_without_xref.describe())
df_without_xref.head()

(584695, 3)
       count_without_xref
count       584695.000000
mean             3.347118
std              5.187865
min              1.000000
25%              1.000000
50%              2.000000
75%              3.000000
max            823.000000


Unnamed: 0,id,synonyms_without_xref,count_without_xref
0,10,"[Cholera d/t vib cholerae, Cholera due to vibr...",2.0
1,11,"[Cholera due to vibrio cholerae el tor, Choler...",2.0
2,19,"[Cholera NOS, Cholera, unspecified]",2.0
3,20,[Typhoid fever],1.0
4,21,"[Paratyphoid fever A, Paratyphoid fever a]",2.0


In [6]:
dfs_with_xref= []


for node in NODES:
  for n in range (0, 5000):
    limit = 200000
    skip = limit * n

    query_with_xref = f"""
      MATCH (b1)-[:altLabel|:prefLabel]-(n:{node})-[:hasDbXref]-(n2)-[:altLabel|:prefLabel]-(b2)
      RETURN n.source_id as id, b1.name as synonym, b2.name as synonym_xref
      SKIP {skip}
      LIMIT {limit}
    """

    #get synonyms with cross referencing
    df_synonym_with_xref = (
      pd.DataFrame(
        graph.run(query_with_xref), columns = ['id', 'synonym', 'synonym_xref']
      )
    )
    if df_synonym_with_xref.shape[0] > 0 :
      df_synonym_with_xref = (
        df_synonym_with_xref
        .groupby(['id'])['synonym', 'synonym_xref'].agg(lambda x:list(x))
        .reset_index()
      )

      df_synonym_with_xref['synonyms_with_xref'] = df_synonym_with_xref['synonym'] + df_synonym_with_xref['synonym_xref']

      df_synonym_with_xref['synonyms_with_xref'] = [list(set(x)) for x in df_synonym_with_xref['synonyms_with_xref']]

      df_synonym_with_xref['count_with_xref'] = [len(x) for x in df_synonym_with_xref['synonyms_with_xref']]

      df_synonym_with_xref =  df_synonym_with_xref[['id', 'synonyms_with_xref', 'count_with_xref']]

      dfs_with_xref.append(df_synonym_with_xref)
    else:
      break


In [7]:
df_with_xref = reduce(lambda df1, df2: pd.concat([df1, df2]), dfs_with_xref)
print(df_with_xref.shape)
print(df_with_xref.describe())
df_with_xref.head()

(431345, 3)
       count_with_xref
count    431345.000000
mean          7.436695
std          26.212870
min           1.000000
25%           2.000000
50%           2.000000
75%           5.000000
max        1276.000000


Unnamed: 0,id,synonyms_with_xref,count_with_xref
0,DOID_0001816,"[Angiosarcoma non-metastatic, angiosarcoma, He...",39
1,DOID_0050052,"[RMSF, Choix, exanthematic typhus of sao Paulo...",12
2,DOID_0050117,[Adenovirus infection in conditions classified...,4
3,DOID_0050118,[California encephalitis virus infection neuro...,37
4,DOID_0050120,"[Lymphohistiocytosis, Familial Hemophagocytic,...",79


In [8]:
#comparison
difference = (
  df_with_xref
  .merge(
    right = df_without_xref,
    how = 'inner',
    on = 'id'
  )
)

difference['difference'] = difference['count_with_xref'] - difference['count_without_xref']

difference = difference.sort_values(by='difference', ascending=False)
difference.head()

Unnamed: 0,id,synonyms_with_xref,count_with_xref,synonyms_without_xref,count_without_xref,difference
11327,75989,"[Laband-Zimmermann syndrome, fetal warfarin sy...",1276,"[Other specified congenital anomalies, Specfie...",2.0,1274.0
67359,G600,"[MARS Charcot-Marie-Tooth disease type 2, Char...",1107,[Hereditary motor and sensory neuropathy],1.0,1106.0
20978,Q870,[Facio-auriculo-vertebral spectrum (disorder) ...,1007,[Congenital malformation syndromes predominant...,1.0,1006.0
66969,G114,"[spastic paraplegia 44, autosomal recessive, a...",776,[Hereditary spastic paraplegia],1.0,775.0
571415,MONDO_0018908,[Partial trisomy of the short arm of chromosom...,703,"[non-Hodgkin lymphoma, non-Hodgkin's lymphoma ...",5.0,698.0


In [9]:
difference.describe()

Unnamed: 0,count_with_xref,count_without_xref,difference
count,659218.0,659218.0,659218.0
mean,7.160461,4.34071,2.819752
std,24.836467,11.811921,22.099402
min,1.0,1.0,-294.0
25%,2.0,2.0,0.0
50%,3.0,2.0,0.0
75%,5.0,4.0,0.0
max,1276.0,823.0,1274.0


In [10]:
difference[difference['difference'] < 0].head()

Unnamed: 0,id,synonyms_with_xref,count_with_xref,synonyms_without_xref,count_without_xref,difference
174751,C0036341,"[Schizophrenic Disorder, schizophrenia with or...",31,[schizophrenia with or without an affective di...,32.0,-1.0
482056,C0157917,"[Pauciarticular JRA, Pauciarticular juvenile r...",13,"[Pauciarticular JRA, Pauciarticular juvenile a...",14.0,-1.0
371641,C0260845,[[V]Issue of repeat prescription for medicatio...,8,[[V]Issue of repeat prescription for medicatio...,9.0,-1.0
203210,C0406047,[Local Infection of Skin and Subcutaneous Tiss...,7,[Local Infection of Skin and Subcutaneous Tiss...,8.0,-1.0
59990,C8587,"[precancerous polyps, Precancerous Polyp]",2,"[precancerous polyps, Precancerous Polyp, Othe...",3.0,-1.0


In [None]:
difference.to_csv('difference.csv')