In [49]:
import xml.etree.ElementTree as ET
import pandas as pd
from utilities.preprocessing import parse_gaf,parse_kgml,get_go_term_description,kegg_symbols_and_names
from neo4j import GraphDatabase
import os
import dask.dataframe as dd
from dotenv import load_dotenv
load_dotenv()


kgml_path = 'data/KGML/hsa05012.xml'
gaf_path='data/GAF/goa_human.gaf'


def parse_kgml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    pathway_id = root.get('number')
    pathway_name = root.get('title')
    
    entries = {
        'gene': [],
        'compound': []
    }
    interactions = []

    for entry in root.findall('entry'):
        entry_type = entry.get('type')
        
        if entry_type in ['gene', 'compound']:
            entry_id = entry.get('id')
            entry_names = entry.get('name').split()
            
            graphics = entry.find('graphics')
            entry_symbols = graphics.get('name').split(', ') if graphics is not None else []
            # Remove '...' from symbols if present
            entry_symbols = [symbol.rstrip('...') for symbol in entry_symbols]

            entries[entry_type].append({
                f'{entry_type}_id': entry_id,
                f'{entry_type}_names': entry_names,
                f'{entry_type}_symbols': entry_symbols
            })

    for relation in root.findall('relation'):
        entry1 = relation.get('entry1')
        entry2 = relation.get('entry2')
        relation_type = relation.get('type')
        subtypes = [subtype.get('name') for subtype in relation.findall('subtype')]

        interactions.append({
            'entry1': entry1,
            'entry2': entry2,
            'type': relation_type,
            'subtypes': subtypes
        })

    pathway_data = {
        'pathway_id': pathway_id,
        'pathway_name': pathway_name,
        'genes': entries['gene'],
        'compounds': entries['compound'],
        'interactions': interactions
    }

    return pathway_data



In [50]:
pathway_data=parse_kgml(kgml_path)
pathway_data

{'pathway_id': '05012',
 'pathway_name': 'Parkinson disease',
 'genes': [{'gene_id': '5',
   'gene_names': ['hsa:120892'],
   'gene_symbols': ['LRRK2', 'AURA17', 'DARDARIN', 'PARK8', 'RIPK7', 'ROCO2']},
  {'gene_id': '11',
   'gene_names': ['hsa:7317', 'hsa:7318'],
   'gene_symbols': ['UBA1',
    'A1S9',
    'A1S9T',
    'A1ST',
    'AMCX1',
    'CFAP124',
    'GXP1',
    'POC20',
    'SMAX2',
    'UBA1A',
    'UBE1',
    'UBE1X',
    'VEXAS']},
  {'gene_id': '12',
   'gene_names': ['hsa:6233', 'hsa:7311', 'hsa:7314', 'hsa:7316'],
   'gene_symbols': ['RPS27A',
    'CEP80',
    'HEL112',
    'S27A',
    'UBA80',
    'UBC',
    'UBCEP1',
    'UBCEP80',
    'eS31']},
  {'gene_id': '13',
   'gene_names': ['hsa:7332', 'hsa:9246'],
   'gene_symbols': ['UBE2L3', 'E2-F1', 'L-UBC', 'UBCH7', 'UbcM4']},
  {'gene_id': '14',
   'gene_names': ['hsa:6233', 'hsa:7311', 'hsa:7314', 'hsa:7316'],
   'gene_symbols': ['RPS27A',
    'CEP80',
    'HEL112',
    'S27A',
    'UBA80',
    'UBC',
    'UBCEP1',
  

In [52]:
kegg_genes=pathway_data['genes']
kegg_genes

[{'gene_id': '5',
  'gene_names': ['hsa:120892'],
  'gene_symbols': ['LRRK2', 'AURA17', 'DARDARIN', 'PARK8', 'RIPK7', 'ROCO2']},
 {'gene_id': '11',
  'gene_names': ['hsa:7317', 'hsa:7318'],
  'gene_symbols': ['UBA1',
   'A1S9',
   'A1S9T',
   'A1ST',
   'AMCX1',
   'CFAP124',
   'GXP1',
   'POC20',
   'SMAX2',
   'UBA1A',
   'UBE1',
   'UBE1X',
   'VEXAS']},
 {'gene_id': '12',
  'gene_names': ['hsa:6233', 'hsa:7311', 'hsa:7314', 'hsa:7316'],
  'gene_symbols': ['RPS27A',
   'CEP80',
   'HEL112',
   'S27A',
   'UBA80',
   'UBC',
   'UBCEP1',
   'UBCEP80',
   'eS31']},
 {'gene_id': '13',
  'gene_names': ['hsa:7332', 'hsa:9246'],
  'gene_symbols': ['UBE2L3', 'E2-F1', 'L-UBC', 'UBCH7', 'UbcM4']},
 {'gene_id': '14',
  'gene_names': ['hsa:6233', 'hsa:7311', 'hsa:7314', 'hsa:7316'],
  'gene_symbols': ['RPS27A',
   'CEP80',
   'HEL112',
   'S27A',
   'UBA80',
   'UBC',
   'UBCEP1',
   'UBCEP80',
   'eS31']},
 {'gene_id': '15',
  'gene_names': ['hsa:5071'],
  'gene_symbols': ['PRKN', 'AR-JP', 'L

In [53]:
kegg_df=pd.DataFrame(kegg_genes,columns=['gene_id','gene_symbols'])
kegg_exploded=kegg_df.explode('gene_symbols')
kegg_exploded

Unnamed: 0,gene_id,gene_symbols
0,5,LRRK2
0,5,AURA17
0,5,DARDARIN
0,5,PARK8
0,5,RIPK7
...,...,...
102,735,IPOA
102,735,SOD
102,735,STAHP
102,735,hSod1


In [54]:

column_names = [
    'DB', 'DB_Object_ID', 'DB_Object_Symbol', 'Qualifier', 'GO_ID', 
    'DB:Reference', 'Evidence_Code', 'With_or_From', 'Aspect', 
    'DB_Object_Name', 'DB_Object_Synonym', 'DB_Object_Type', 'Taxon', 
    'Date', 'Assigned_By', 'Annotation_Extension', 'Gene_Product_Form_ID'
]
# Read the GAF file into a DataFrame, skipping comment lines
gaf_df = pd.read_csv(gaf_path, sep='\t', comment='!', header=None, names=column_names, dtype=str)
gaf_df=gaf_df[gaf_df['DB_Object_Synonym'].notna()]
gaf_df['DB_Object_Synonym'] = gaf_df['DB_Object_Synonym'].apply(lambda x: x.split('|') if pd.notna(x) else [])
gaf_df['Taxon'] = gaf_df['Taxon'].apply(lambda x: x.split('|') if pd.notna(x) else [])


In [34]:
# check if synonyms list has duplicates in each line-yes!
gaf_df[gaf_df['DB_Object_Synonym'].apply(set).apply(len)!=gaf_df['DB_Object_Synonym'].apply(len)]['DB_Object_Synonym']

53628     [SERF1A, SERF1B, FAM2A, FAM2B, SERF1, SERF1, S...
53629     [SERF1A, SERF1B, FAM2A, FAM2B, SERF1, SERF1, S...
53630     [SERF1A, SERF1B, FAM2A, FAM2B, SERF1, SERF1, S...
53631     [SERF1A, SERF1B, FAM2A, FAM2B, SERF1, SERF1, S...
53632     [SERF1A, SERF1B, FAM2A, FAM2B, SERF1, SERF1, S...
                                ...                        
701167     [MAGEA2, MAGEA2B, MAGE2, MAGE2, MAGEA2, MAGEA2A]
701362    [CCL4L1, CCL4L2, CCL4L, CCL4L, LAG1, SCYA4L1, ...
704881    [PABPC1L2A, PABPC1L2B, PABPC1L2, PABPC1L2, RBM...
705150             [HSFY1, HSFY2, HSF2L, HSF2L, HSFY, HSFY]
706480    [CCL4L1, CCL4L2, CCL4L, CCL4L, LAG1, SCYA4L1, ...
Name: DB_Object_Synonym, Length: 221, dtype: object

In [55]:
gaf_exploded=gaf_df.explode('DB_Object_Synonym')
gaf_exploded.head()

Unnamed: 0,DB,DB_Object_ID,DB_Object_Symbol,Qualifier,GO_ID,DB:Reference,Evidence_Code,With_or_From,Aspect,DB_Object_Name,DB_Object_Synonym,DB_Object_Type,Taxon,Date,Assigned_By,Annotation_Extension,Gene_Product_Form_ID
0,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0003723,GO_REF:0000043,IEA,UniProtKB-KW:KW-0694,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,[taxon:9606],20240408,UniProt,,
1,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0046872,GO_REF:0000043,IEA,UniProtKB-KW:KW-0479,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,[taxon:9606],20240408,UniProt,,
2,UniProtKB,A0A024RBG1,NUDT4B,located_in,GO:0005829,GO_REF:0000052,IDA,,C,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,[taxon:9606],20230619,HPA,,
3,UniProtKB,A0A075B6H7,IGKV3-7,involved_in,GO:0002250,GO_REF:0000043,IEA,UniProtKB-KW:KW-1064,P,Probable non-functional immunoglobulin kappa v...,IGKV3-7,protein,[taxon:9606],20240408,UniProt,,
4,UniProtKB,A0A075B6H7,IGKV3-7,located_in,GO:0005886,GO_REF:0000044,IEA,UniProtKB-SubCell:SL-0039,C,Probable non-functional immunoglobulin kappa v...,IGKV3-7,protein,[taxon:9606],20240408,UniProt,,


In [60]:
#merge kegg and gaf on common synonyms
merged=pd.merge(kegg_exploded,gaf_exploded,left_on='gene_symbols', right_on='DB_Object_Synonym', how='inner')[['gene_id','Qualifier','GO_ID','Aspect','DB_Object_Name','DB_Object_Synonym','DB_Object_Type']].drop_duplicates()
merged

Unnamed: 0,gene_id,Qualifier,GO_ID,Aspect,DB_Object_Name,DB_Object_Synonym,DB_Object_Type
0,5,acts_upstream_of_or_within,GO:0034260,P,Leucine-rich repeat serine/threonine-protein k...,LRRK2,protein
1,5,enables,GO:0000149,F,Leucine-rich repeat serine/threonine-protein k...,LRRK2,protein
2,5,enables,GO:0000287,F,Leucine-rich repeat serine/threonine-protein k...,LRRK2,protein
3,5,enables,GO:0003779,F,Leucine-rich repeat serine/threonine-protein k...,LRRK2,protein
4,5,enables,GO:0003924,F,Leucine-rich repeat serine/threonine-protein k...,LRRK2,protein
...,...,...,...,...,...,...,...
60807,735,located_in,GO:0070062,C,Insulin-like growth factor-binding protein com...,ALS,protein
60809,735,part_of,GO:0042567,C,Insulin-like growth factor-binding protein com...,ALS,protein
60810,735,enables,GO:0005520,F,Insulin-like growth factor-binding protein com...,ALS,protein
60811,735,is_active_in,GO:0031012,C,Insulin-like growth factor-binding protein com...,ALS,protein


In [61]:
kegg_names=kegg_symbols_and_names('https://www.kegg.jp/entry/pathway+hsa05012')
kegg_names

{'ADORA2A': 'adenosine A2a receptor',
 'GNAL': 'G protein subunit alpha L',
 'GNAS': 'GNAS complex locus',
 'DRD2': 'dopamine receptor D2',
 'GNAI1': 'G protein subunit alpha i1',
 'GNAI3': 'G protein subunit alpha i3',
 'GNAI2': 'G protein subunit alpha i2',
 'ADCY5': 'adenylate cyclase 5',
 'PRKACA': 'protein kinase cAMP-activated catalytic subunit alpha',
 'PRKACB': 'protein kinase cAMP-activated catalytic subunit beta',
 'PRKACG': 'protein kinase cAMP-activated catalytic subunit gamma',
 'DRD1': 'dopamine receptor D1',
 'UBA1': 'ubiquitin like modifier activating enzyme 1',
 'UBA7': 'ubiquitin like modifier activating enzyme 7',
 'UBA52': 'ubiquitin A-52 residue ribosomal protein fusion product 1',
 'RPS27A': 'ribosomal protein S27a',
 'UBB': 'ubiquitin B',
 'UBC': 'ubiquitin C',
 'UBE2L3': 'ubiquitin conjugating enzyme E2 L3',
 'UBE2L6': 'ubiquitin conjugating enzyme E2 L6',
 'UBE2J2': 'ubiquitin conjugating enzyme E2 J2',
 'UBE2J1': 'ubiquitin conjugating enzyme E2 J1',
 'UBE2G2'

In [62]:
aux=merged[['gene_id','DB_Object_Synonym','DB_Object_Name']].drop_duplicates()
aux

Unnamed: 0,gene_id,DB_Object_Synonym,DB_Object_Name
0,5,LRRK2,Leucine-rich repeat serine/threonine-protein k...
512,5,PARK8,Leucine-rich repeat serine/threonine-protein k...
1024,11,UBA1,Ubiquitin-like modifier-activating enzyme 1
1075,11,A1S9T,Ubiquitin-like modifier-activating enzyme 1
1126,11,UBE1,Ubiquitin-like modifier-activating enzyme 1
...,...,...,...
60546,724,SLC39A11,Zinc transporter ZIP11
60557,724,C17orf26,Zinc transporter ZIP11
60568,724,ZIP11,Zinc transporter ZIP11
60579,735,SOD1,Superoxide dismutase [Cu-Zn]


In [63]:
#example of gene with multiple names and symbols both from KEGG and GO
from_kegg=pd.DataFrame(list(kegg_names.items()), columns=['Gene_Symbol', 'Gene_Name'])
kegg_go=pd.merge(from_kegg,aux,left_on='Gene_Symbol',right_on='DB_Object_Synonym').drop_duplicates()
kegg_go[kegg_go['gene_id']=='12']

Unnamed: 0,Gene_Symbol,Gene_Name,gene_id,DB_Object_Synonym,DB_Object_Name
13,RPS27A,ribosomal protein S27a,12,RPS27A,Ubiquitin-ribosomal protein eS31 fusion protein
24,UBC,ubiquitin C,12,UBC,Polyubiquitin-C


In [64]:
#example from GO
ids=merged[['gene_id','DB_Object_Name']].drop_duplicates()['DB_Object_Name'].index
merged.loc[ids]

Unnamed: 0,gene_id,Qualifier,GO_ID,Aspect,DB_Object_Name,DB_Object_Synonym,DB_Object_Type
0,5,acts_upstream_of_or_within,GO:0034260,P,Leucine-rich repeat serine/threonine-protein k...,LRRK2,protein
1024,11,part_of,GO:0000792,C,Ubiquitin-like modifier-activating enzyme 1,UBA1,protein
1177,12,enables,GO:0003723,F,Ubiquitin-ribosomal protein eS31 fusion protein,RPS27A,protein
2343,12,enables,GO:0003723,F,Polyubiquitin-C,UBC,protein
3563,13,enables,GO:0003713,F,Ubiquitin-conjugating enzyme E2 L3,UBE2L3,protein
...,...,...,...,...,...,...,...
60406,715,enables,GO:0005515,F,Histone-lysine N-methyltransferase 2B,TRX2,protein
60453,715,enables,GO:0003723,F,Thioredoxin,TXN,protein
60546,724,enables,GO:0005375,F,Zinc transporter ZIP11,SLC39A11,protein
60579,735,acts_upstream_of_or_within,GO:0043410,P,Superoxide dismutase [Cu-Zn],SOD1,protein


In [65]:
#gene_ids map to different Names 
merged[['gene_id','DB_Object_Name']].drop_duplicates()

Unnamed: 0,gene_id,DB_Object_Name
0,5,Leucine-rich repeat serine/threonine-protein k...
1024,11,Ubiquitin-like modifier-activating enzyme 1
1177,12,Ubiquitin-ribosomal protein eS31 fusion protein
2343,12,Polyubiquitin-C
3563,13,Ubiquitin-conjugating enzyme E2 L3
...,...,...
60406,715,Histone-lysine N-methyltransferase 2B
60453,715,Thioredoxin
60546,724,Zinc transporter ZIP11
60579,735,Superoxide dismutase [Cu-Zn]


In [66]:
#let's aggregate all synonyms and names by gene_id
agg_names_synonyms=merged[['gene_id','DB_Object_Name','DB_Object_Synonym']].groupby('gene_id').agg(lambda x:list(set(x))).reset_index()
agg_names_synonyms

Unnamed: 0,gene_id,DB_Object_Name,DB_Object_Synonym
0,11,[Ubiquitin-like modifier-activating enzyme 1],"[UBA1, UBE1, A1S9T]"
1,12,"[Polyubiquitin-C, Ubiquitin-ribosomal protein ...","[UBCEP1, UBA80, UBC, RPS27A]"
2,13,[Ubiquitin-conjugating enzyme E2 L3],"[UBCH7, UBE2L3]"
3,139,[Ubiquitin-conjugating enzyme E2 J2],"[NCUBE2, UBE2J2]"
4,14,"[Polyubiquitin-C, Ubiquitin-ribosomal protein ...","[UBCEP1, UBA80, UBC, RPS27A]"
...,...,...,...
98,711,[Sodium-dependent dopamine transporter],"[DAT1, SLC6A3]"
99,714,[Kelch-like ECH-associated protein 1],"[KLHL19, KEAP1]"
100,715,"[Thioredoxin, Thioredoxin, mitochondrial, Hist...","[TRX2, TXN, TXN2]"
101,724,[Zinc transporter ZIP11],"[ZIP11, C17orf26, SLC39A11]"


In [69]:
#merge merged with agg_names_synonym so that all gene_id entries have consistent names and symbols
merged=pd.merge(merged[['gene_id','Qualifier','GO_ID','Aspect','DB_Object_Type']],agg_names_synonyms)
merged

Unnamed: 0,gene_id,Qualifier,GO_ID,Aspect,DB_Object_Type,DB_Object_Name,DB_Object_Synonym
0,5,acts_upstream_of_or_within,GO:0034260,P,protein,[Leucine-rich repeat serine/threonine-protein ...,"[LRRK2, PARK8]"
1,5,enables,GO:0000149,F,protein,[Leucine-rich repeat serine/threonine-protein ...,"[LRRK2, PARK8]"
2,5,enables,GO:0000287,F,protein,[Leucine-rich repeat serine/threonine-protein ...,"[LRRK2, PARK8]"
3,5,enables,GO:0003779,F,protein,[Leucine-rich repeat serine/threonine-protein ...,"[LRRK2, PARK8]"
4,5,enables,GO:0003924,F,protein,[Leucine-rich repeat serine/threonine-protein ...,"[LRRK2, PARK8]"
...,...,...,...,...,...,...,...
14369,735,located_in,GO:0070062,C,protein,"[Superoxide dismutase [Cu-Zn], Insulin-like gr...","[SOD1, ALS]"
14370,735,part_of,GO:0042567,C,protein,"[Superoxide dismutase [Cu-Zn], Insulin-like gr...","[SOD1, ALS]"
14371,735,enables,GO:0005520,F,protein,"[Superoxide dismutase [Cu-Zn], Insulin-like gr...","[SOD1, ALS]"
14372,735,is_active_in,GO:0031012,C,protein,"[Superoxide dismutase [Cu-Zn], Insulin-like gr...","[SOD1, ALS]"


In [70]:
num_cores = os.cpu_count()
num_cores

8

In [252]:
def go_id_description(df:pd.DataFrame)->pd.DataFrame:
    """
    Process a DataFrame to get GO term descriptions.
    
    Args:
        df (pd.DataFrame): DataFrame with at least one column 'GO_ID'.
    
    Returns:
        pd.DataFrame: DataFrame with 'GO_ID', 'GO_label', and 'GO_definition'.
    """
    if not pd.api.types.is_string_dtype(df['GO_ID']):
        raise ValueError("'GO_ID' column must be of type string.")
    
    go_df=df[['GO_ID']].drop_duplicates()

    # Convert the Pandas DataFrame to a Dask DataFrame
    num_cores = os.cpu_count()
    ddf = dd.from_pandas(go_df, npartitions=num_cores)

    # Define a function to apply to each partition
    def apply_go_term_description(partition):
        results = [get_go_term_description(go_id) for go_id in partition['GO_ID']]
        result_df = pd.DataFrame(results)
        return pd.concat([partition.reset_index(drop=True), result_df], axis=1)
    
    # define metadata structure for Dask computation
    meta = pd.DataFrame({
        'GO_ID': pd.Series(dtype='str'),
        'GO_label': pd.Series(dtype='str'),
        'GO_definition': pd.Series(dtype='str')
    })

    go_ddf = ddf.map_partitions(apply_go_term_description, meta=meta)
    
    return go_ddf.compute()

In [71]:
go_df=merged[['GO_ID']].drop_duplicates()

# Convert the Pandas DataFrame to a Dask DataFrame
ddf = dd.from_pandas(go_df, npartitions=num_cores)

# Define a function to apply to each partition
def apply_go_term_description(partition):
    results = [get_go_term_description(go_id) for go_id in partition['GO_ID']]
    result_df = pd.DataFrame(results)
    return pd.concat([partition.reset_index(drop=True), result_df], axis=1)


meta = pd.DataFrame({
    'GO_ID': pd.Series(dtype='str'),
    'GO_label': pd.Series(dtype='str'),
    'GO_definition': pd.Series(dtype='str')
})

go_ddf = ddf.map_partitions(apply_go_term_description, meta=meta)

# Compute the result
go_df = go_ddf.compute()
go_df

Unnamed: 0,GO_ID,GO_label,GO_definition
0,GO:0034260,negative regulation of GTPase activity,Any process that stops or reduces the rate of ...
1,GO:0000149,SNARE binding,Binding to a SNARE (soluble N-ethylmaleimide-s...
2,GO:0000287,magnesium ion binding,Binding to a magnesium (Mg) ion.
3,GO:0003779,actin binding,Binding to monomeric or multimeric forms of ac...
4,GO:0003924,GTPase activity,Catalysis of the reaction: GTP + H2O = GDP + H...
...,...,...,...
223,GO:0099610,action potential initiation,The initiating cycle of an action potential. I...
224,GO:0005777,peroxisome,A small organelle enclosed by a single membran...
225,GO:0042567,insulin-like growth factor ternary complex,"A complex of three proteins, which in animals ..."
226,GO:0005520,insulin-like growth factor binding,"Binding to an insulin-like growth factor, any ..."


In [72]:
aspect_dict={"P":"Biological Process", "F":"Molecular Function", "C":"Cellular Component"}

In [73]:
merged=pd.merge(merged,go_df,on='GO_ID')
merged=merged.replace({'Aspect':aspect_dict})
merged

Unnamed: 0,gene_id,Qualifier,GO_ID,Aspect,DB_Object_Type,DB_Object_Name,DB_Object_Synonym,GO_label,GO_definition
0,5,acts_upstream_of_or_within,GO:0034260,Biological Process,protein,[Leucine-rich repeat serine/threonine-protein ...,"[LRRK2, PARK8]",negative regulation of GTPase activity,Any process that stops or reduces the rate of ...
1,5,enables,GO:0000149,Molecular Function,protein,[Leucine-rich repeat serine/threonine-protein ...,"[LRRK2, PARK8]",SNARE binding,Binding to a SNARE (soluble N-ethylmaleimide-s...
2,5,enables,GO:0000287,Molecular Function,protein,[Leucine-rich repeat serine/threonine-protein ...,"[LRRK2, PARK8]",magnesium ion binding,Binding to a magnesium (Mg) ion.
3,5,enables,GO:0003779,Molecular Function,protein,[Leucine-rich repeat serine/threonine-protein ...,"[LRRK2, PARK8]",actin binding,Binding to monomeric or multimeric forms of ac...
4,5,enables,GO:0003924,Molecular Function,protein,[Leucine-rich repeat serine/threonine-protein ...,"[LRRK2, PARK8]",GTPase activity,Catalysis of the reaction: GTP + H2O = GDP + H...
...,...,...,...,...,...,...,...,...,...
14369,735,located_in,GO:0070062,Cellular Component,protein,"[Superoxide dismutase [Cu-Zn], Insulin-like gr...","[SOD1, ALS]",extracellular exosome,A vesicle that is released into the extracellu...
14370,735,part_of,GO:0042567,Cellular Component,protein,"[Superoxide dismutase [Cu-Zn], Insulin-like gr...","[SOD1, ALS]",insulin-like growth factor ternary complex,"A complex of three proteins, which in animals ..."
14371,735,enables,GO:0005520,Molecular Function,protein,"[Superoxide dismutase [Cu-Zn], Insulin-like gr...","[SOD1, ALS]",insulin-like growth factor binding,"Binding to an insulin-like growth factor, any ..."
14372,735,is_active_in,GO:0031012,Cellular Component,protein,"[Superoxide dismutase [Cu-Zn], Insulin-like gr...","[SOD1, ALS]",extracellular matrix,A structure lying external to one or more cell...


In [258]:
#LOAD config.yaml
def kegg_go_integration(kgml_path: str, gaf_path: str) -> pd.DataFrame:
    """
    Integrates KEGG and GO data.

    Args:
        kgml_path (str): Path to the KEGG KGML file.
        gaf_path (str): Path to the GO GAF file.

    Returns:
        pd.DataFrame: Merged DataFrame with integrated KEGG and GO data.
    """
    # Parse KEGG data
    kegg_genes=parse_kgml(kgml_path)['genes']
    kegg_df=pd.DataFrame(kegg_genes,columns=['gene_id','gene_symbols'])
    kegg_exploded_df=kegg_df.explode('gene_symbols')

    # Parse GAF data
    gaf_df=parse_gaf(gaf_path)[['Qualifier','GO_ID','Aspect','DB_Object_Name','DB_Object_Synonym','DB_Object_Type']]
    gaf_exploded_df=gaf_df.explode('DB_Object_Synonym')

    # Merge KEGG and GAF on common synonyms
    merged_df=pd.merge(kegg_exploded_df,gaf_exploded_df,left_on='gene_symbols',right_on='DB_Object_Synonym',how='inner').drop_duplicates()

    # Aggregate all synonyms and names by gene_id
    agg_names_synonyms_df=merged_df[['gene_id','DB_Object_Name','DB_Object_Synonym']].groupby('gene_id').agg(lambda x:list(set(x))).reset_index()

    # Merge merged_df with agg_names_synonyms_df so that all gene_id entries have consistent names and symbols
    merged_df=pd.merge(merged_df[['gene_id','Qualifier','GO_ID','Aspect','DB_Object_Type']],agg_names_synonyms_df)

    #Get dataframe with GO_IDs and their descriptions
    go_df=go_id_description(merged_df)
    
    # Merge GO descriptions and replace aspect values
    merged_df=pd.merge(merged_df,go_df,on='GO_ID').replace({'Aspect':aspect_dict})

    return merged_df


    




    

In [260]:
merged

Unnamed: 0,gene_id,Qualifier,GO_ID,Aspect,DB_Object_Type,DB_Object_Name,DB_Object_Synonym,GO_label,GO_definition
0,5,acts_upstream_of_or_within,GO:0034260,Biological Process,protein,[Leucine-rich repeat serine/threonine-protein ...,"[LRRK2, PARK8]",negative regulation of GTPase activity,Any process that stops or reduces the rate of ...
1,5,enables,GO:0000149,Molecular Function,protein,[Leucine-rich repeat serine/threonine-protein ...,"[LRRK2, PARK8]",SNARE binding,Binding to a SNARE (soluble N-ethylmaleimide-s...
2,5,enables,GO:0000287,Molecular Function,protein,[Leucine-rich repeat serine/threonine-protein ...,"[LRRK2, PARK8]",magnesium ion binding,Binding to a magnesium (Mg) ion.
3,5,enables,GO:0003779,Molecular Function,protein,[Leucine-rich repeat serine/threonine-protein ...,"[LRRK2, PARK8]",actin binding,Binding to monomeric or multimeric forms of ac...
4,5,enables,GO:0003924,Molecular Function,protein,[Leucine-rich repeat serine/threonine-protein ...,"[LRRK2, PARK8]",GTPase activity,Catalysis of the reaction: GTP + H2O = GDP + H...
...,...,...,...,...,...,...,...,...,...
14369,735,located_in,GO:0070062,Cellular Component,protein,"[Superoxide dismutase [Cu-Zn], Insulin-like gr...","[SOD1, ALS]",extracellular exosome,A vesicle that is released into the extracellu...
14370,735,part_of,GO:0042567,Cellular Component,protein,"[Superoxide dismutase [Cu-Zn], Insulin-like gr...","[SOD1, ALS]",insulin-like growth factor ternary complex,"A complex of three proteins, which in animals ..."
14371,735,enables,GO:0005520,Molecular Function,protein,"[Superoxide dismutase [Cu-Zn], Insulin-like gr...","[SOD1, ALS]",insulin-like growth factor binding,"Binding to an insulin-like growth factor, any ..."
14372,735,is_active_in,GO:0031012,Cellular Component,protein,"[Superoxide dismutase [Cu-Zn], Insulin-like gr...","[SOD1, ALS]",extracellular matrix,A structure lying external to one or more cell...


In [None]:
merged2=kegg_go_integration(kgml_path,gaf_path)
merged2.head(20)

Names in KEGG and GO don't match (examples in 4930)

In [125]:
'phosphoinositide-3-kinase regulatory subunit 1' in set(gaf_df['DB_Object_Name'].apply(lambda x:x.lower()))

False

In [127]:
'protein kinase C zeta'.lower() in set(gaf_df['DB_Object_Name'].apply(lambda x:x.lower()))

False

In [114]:
symbols=set(gaf_new['DB_Object_Symbol'])
synonyms=set(gaf_new['DB_Object_Synonym'].explode())
len(synonyms-symbols),len(synonyms),len(symbols)

(23361, 42944, 19583)

In [108]:
df=gaf_new[['DB_Object_ID','DB_Object_Name']]
df.drop_duplicates().groupby('DB_Object_Name').count().value_counts()

DB_Object_ID
1               19640
2                   8
3                   1
8                   1
12                  1
15                  1
Name: count, dtype: int64

In [None]:

uri = os.getenv("NEO4J_URI")
user = os.getenv("NEO4J_USER")
password = os.getenv("NEO4J_PASSWORD")
api_key = os.getenv("OPENAI_API_KEY")


In [77]:
class KGMLGAFImporter:
    def __init__(self, driver):
        self.driver = driver

    def close(self):
        self.driver.close()

    def create_gene_node(self, tx, gene,disease_id):
        tx.run(
            """
            MERGE (g:Gene {unique_id: $unique_id})
            SET g.synonyms = $synonyms
            SET g.names= $names
            """,
            unique_id=f"{disease_id}_{gene['gene_id']}",
            names=gene['DB_Object_Name'],
            synonyms=gene['DB_Object_Synonym'],
            type='gene'
        )

    def create_disease_node(self, tx, disease):
        tx.run(
            """
            MERGE (d:Disease {disease_id: $disease_id}) 
            SET d.name = $name
            """,
            disease_id=disease['disease_id'], 
            name=disease['name']
        )
    
    def create_go_node(self,tx,gene):
        tx.run(
            """
            MERGE (a:GO_Annotation {qualifier:$qualifier, GO_ID:$GO_ID})
            SET a.aspect=$aspect,
                a.object_type=$object_type,
                a.name=$GO_label,
                a.definition=$GO_definition 
            """,
            qualifier=gene['Qualifier'],
            GO_ID=gene['GO_ID'],
            aspect=gene['Aspect'],
            object_type=gene['DB_Object_Type'],	
            GO_label=gene['GO_label'],
            GO_definition=gene['GO_definition']
        )

    def create_gene_interaction(self, tx, interaction,disease_id):
        tx.run(
            """
            MATCH (g1:Gene {unique_id: $entry_1})
            MATCH (g2:Gene {unique_id: $entry_2})
            MERGE (g1)-[r:INTERACTS_WITH {type: $interaction_type, subtypes: $subtypes}]->(g2)
            """,
            entry_1=f"{disease_id}_{interaction['entry1']}",
            entry_2=f"{disease_id}_{interaction['entry2']}",
            interaction_type=interaction['type'],
            subtypes=interaction['subtypes']
        )

    def create_disease_association(self, tx, gene, disease_id, evidence):
        tx.run(
            """
            MATCH (g:Gene {unique_id: $unique_id})
            MATCH (d:Disease {disease_id: $disease_id})
            MERGE (g)-[r:ASSOCIATED_WITH {evidence: $evidence}]->(d)
            """,
            unique_id=f"{disease_id}_{gene['gene_id']}",
            disease_id=disease_id,
            evidence=evidence
        )
    
    def create_go_association(self,tx,gene,disease_id):
            tx.run(
            """
            MATCH (g:Gene {unique_id: $unique_id})
            MATCH (a:GO_Annotation {qualifier:$qualifier, GO_ID:$GO_ID})
            MERGE (g)-[r:HAS_GO_ANNOTATION]->(a)
            """,
            unique_id=f"{disease_id}_{gene['gene_id']}",
            qualifier=gene['Qualifier'],
            GO_ID=gene['GO_ID']
           )

    def import_data(self, kgml_files):
        with self.driver.session() as session:
            for kgml_file_path in kgml_files:
                pathway_data = parse_kgml(kgml_file_path)
                disease_id = pathway_data['pathway_id']
                disease_name = pathway_data['pathway_name']
                # Create disease node
                session.write_transaction(self.create_disease_node, {
                    'disease_id': disease_id,
                    'name': disease_name
                })
                merged=kegg_go_integration(kgml_file_path,gaf_path)
                # Create gene nodes and their associations with the disease
                for _,df in merged.groupby('gene_id'):
                    for _, gene in df.iterrows():
                        session.write_transaction(self.create_gene_node, gene,disease_id)
                        session.write_transaction(self.create_go_node, gene)
                        session.write_transaction(self.create_disease_association, gene, disease_id, evidence="from KGML")
                        session.write_transaction(self.create_go_association, gene,disease_id)
                # Create interactions between genes
                for interaction in pathway_data['interactions']:
                    session.write_transaction(self.create_gene_interaction, interaction,disease_id)


# Connect to the Neo4j database
driver = GraphDatabase.driver(uri, auth=(user, password))

def create_constraints(driver):
    with driver.session() as session:
        session.run("CREATE CONSTRAINT FOR (d:Disease) REQUIRE d.disease_id IS UNIQUE;")

kgml_files = ['data/KGML/hsa05012.xml']

# Create constraints and import data
importer = KGMLGAFImporter(driver)
importer.import_data(kgml_files)
importer.close()


  session.write_transaction(self.create_disease_node, {
  session.write_transaction(self.create_gene_node, gene,disease_id)
  session.write_transaction(self.create_go_node, gene)
  session.write_transaction(self.create_disease_association, gene, disease_id, evidence="from KGML")
  session.write_transaction(self.create_go_association, gene,disease_id)
  session.write_transaction(self.create_gene_interaction, interaction,disease_id)


In [270]:

from neo4j import GraphDatabase
from utilities.preprocessing import parse_kgml
import pandas as pd
from pydantic import BaseModel, Field, ValidationError, field_validator
from typing import List, Optional, Dict, Any
from neo4j import GraphDatabase,Transaction


class GeneData(BaseModel):
    """
    Pydantic model for Gene data with fields validated and default values handled.

    Attributes:
        gene_id: Unique identifier for the gene.
        Qualifier: Qualifier for the gene annotation.
        GO_ID: Gene Ontology ID.
        Aspect: Aspect of the gene ontology.
        DB_Object_Type: Type of the database object.
        DB_Object_Name: List of names for the database object.
        DB_Object_Synonym: List of synonyms for the database object.
        GO_label: Label for the Gene Ontology term.
        GO_definition: Definition for the Gene Ontology term.
    """
    gene_id: str
    Qualifier: Optional[str]
    GO_ID: Optional[str]
    Aspect: Optional[str]
    DB_Object_Type: Optional[str]
    DB_Object_Name: Optional[List[str]] = Field(default_factory=list)
    DB_Object_Synonym: Optional[List[str]] = Field(default_factory=list)
    GO_label: Optional[str]
    GO_definition: Optional[str]

    @field_validator('*', mode='before')
    def handle_nan(cls, v: Any) -> Any:
        """
        Handle NaN values by converting them to None.
        If the value is a list, it iterates over the elements and replaces NaN with None.
        
        Args:
            v: The value to check and potentially modify.
        
        Returns:
            The modified value with NaN values handled.
        """
        if isinstance(v, list):
            return [None if pd.isna(i) else i for i in v]
        if pd.isna(v):
            return None
        return v

    @field_validator('DB_Object_Name', 'DB_Object_Synonym', mode='before')
    def convert_to_list(cls, v:Optional[Any]) -> List[Any]:
        """
        Convert None values to an empty list for specific fields.
        
        Args:
            v: The value to check and potentially modify.
        
        Returns:
            An empty list if the value is None, otherwise the original value.
        """
        if v is None:
            return []
        return v


class Disease(BaseModel):
    """
    Pydantic model for Disease data with required fields.

    Attributes:
        disease_id: Unique identifier for the disease.
        name: Name of the disease.
    """
    disease_id: str
    name: str


class GeneInteraction(BaseModel):
    """
    Pydantic model for Gene Interaction data with fields validated and default values handled.

    Attributes:
        entry1: Identifier for the first gene in the interaction.
        entry2: Identifier for the second gene in the interaction.
        type: Type of interaction.
        subtypes: List of subtypes for the interaction.
    """
    entry1: str
    entry2: str
    type: str
    subtypes: Optional[List[str]] = Field(default_factory=list)


class KGMLGAFImporter:
    """
    Class to import data from KGML and GAF files into a Neo4j database.

    Attributes:
        driver: Neo4j driver for database connection.
    """
    def __init__(self, driver: GraphDatabase.driver):
        """
        Initialize the importer with a Neo4j driver.

        Args:
            driver: Neo4j driver for database connection.
        """
        self.driver = driver

    def close(self):
        """
        Close the Neo4j driver connection.
        """
        self.driver.close()

    def create_gene_node(self, tx: Transaction, gene: Dict[str, Any], disease_id: str):
        """
        Create a Gene node in the Neo4j database.

        Args:
            tx: Neo4j transaction object.
            gene: Dictionary containing gene data.
            disease_id: ID of the associated disease.
        """
        try:
            gene_model = GeneData(**gene)
        except ValidationError as e:
            print(f"Validation error for gene {gene['gene_id']}: {e}")
            return

        tx.run(
            """
            MERGE (g:Gene {unique_id: $unique_id})
            SET g.synonyms = $synonyms,
                g.names = $names
            """,
            unique_id=f"{disease_id}_{gene_model.gene_id}",
            names=gene_model.DB_Object_Name,
            synonyms=gene_model.DB_Object_Synonym,
        )

    def create_disease_node(self, tx: Transaction, disease: Dict[str, Any]):
        """
        Create a Disease node in the Neo4j database.

        Args:
            tx: Neo4j transaction object.
            disease: Dictionary containing disease data.
        """
        try:
            disease_model = Disease(**disease)
        except ValidationError as e:
            print(f"Validation error for disease {disease['disease_id']}: {e}")
            return

        tx.run(
            """
            MERGE (d:Disease {disease_id: $disease_id})
            SET d.name = $name
            """,
            disease_id=disease_model.disease_id,
            name=disease_model.name
        )

    def create_go_node(self, tx: Transaction, gene: Dict[str, Any]):
        """
        Create a GO Annotation node in the Neo4j database.

        Args:
            tx: Neo4j transaction object.
            gene: Dictionary containing gene data.
        """
        try:
            gene_model = GeneData(**gene)
        except ValidationError as e:
            print(f"Validation error for gene {gene['gene_id']}: {e}")
            return

        tx.run(
            """
            MERGE (a:GO_Annotation {qualifier: $qualifier, GO_ID: $GO_ID})
            SET a.aspect = $aspect,
                a.object_type = $object_type,
                a.name = $GO_label,
                a.definition = $GO_definition
            """,
            qualifier=gene_model.Qualifier,
            GO_ID=gene_model.GO_ID,
            aspect=gene_model.Aspect,
            object_type=gene_model.DB_Object_Type,
            GO_label=gene_model.GO_label,
            GO_definition=gene_model.GO_definition
        )

    def create_gene_interaction(self, tx: Transaction, interaction: Dict[str, Any], disease_id: str):
        """
        Create an INTERACTS_WITH relationship between Gene nodes in the Neo4j database.

        Args:
            tx: Neo4j transaction object.
            interaction: Dictionary containing interaction data.
            disease_id: ID of the associated disease.
        """
        try:
            interaction_model = GeneInteraction(**interaction)
        except ValidationError as e:
            print(f"Validation error for interaction {interaction['entry1']} -> {interaction['entry2']}: {e}")
            return

        tx.run(
            """
            MATCH (g1:Gene {unique_id: $entry_1})
            MATCH (g2:Gene {unique_id: $entry_2})
            MERGE (g1)-[r:INTERACTS_WITH {type: $interaction_type, subtypes: $subtypes}]->(g2)
            """,
            entry_1=f"{disease_id}_{interaction_model.entry1}",
            entry_2=f"{disease_id}_{interaction_model.entry2}",
            interaction_type=interaction_model.type,
            subtypes=interaction_model.subtypes
        )

    def create_disease_association(self, tx: Transaction, gene: Dict[str, Any], disease_id: str, evidence: str):
        """
        Create an ASSOCIATED_WITH relationship between Gene and Disease nodes in the Neo4j database.

        Args:
            tx: Neo4j transaction object.
            gene: Dictionary containing gene data.
            disease_id: ID of the associated disease.
            evidence: Evidence for the association.
        """
        try:
            gene_model = GeneData(**gene)
        except ValidationError as e:
            print(f"Validation error for gene {gene['gene_id']}: {e}")
            return

        tx.run(
            """
            MATCH (g:Gene {unique_id: $unique_id})
            MATCH (d:Disease {disease_id: $disease_id})
            MERGE (g)-[r:ASSOCIATED_WITH {evidence: $evidence}]->(d)
            """,
            unique_id=f"{disease_id}_{gene_model.gene_id}",
            disease_id=disease_id,
            evidence=evidence
        )

    def create_go_association(self, tx: Transaction, gene: Dict[str, Any], disease_id: str):
        """
        Create a HAS_GO_ANNOTATION relationship between Gene and GO Annotation nodes in the Neo4j database.

        Args:
            tx: Neo4j transaction object.
            gene: Dictionary containing gene data.
            disease_id: ID of the associated disease.
        """
        try:
            gene_model = GeneData(**gene)
        except ValidationError as e:
            print(f"Validation error for gene {gene['gene_id']}: {e}")
            return

        tx.run(
            """
            MATCH (g:Gene {unique_id: $unique_id})
            MATCH (a:GO_Annotation {qualifier: $qualifier, GO_ID: $GO_ID})
            MERGE (g)-[r:HAS_GO_ANNOTATION]->(a)
            """,
            unique_id=f"{disease_id}_{gene_model.gene_id}",
            qualifier=gene_model.Qualifier,
            GO_ID=gene_model.GO_ID
        )

    def import_data(self, kgml_files: List[str]):
        """
        Import data from KGML files and a merged DataFrame into the Neo4j database.

        Args:
            kgml_files: List of file paths to KGML files.
            merged: DataFrame containing merged data.
        """
        with self.driver.session() as session:
            for kgml_file_path in kgml_files:
                pathway_data = parse_kgml(kgml_file_path)
                disease_id = pathway_data['pathway_id']
                disease_name = pathway_data['pathway_name']
                
                # Create disease node
                session.write_transaction(self.create_disease_node, {
                    'disease_id': disease_id,
                    'name': disease_name
                })

                #Create dataframe integrating KGML genes and associated GO terms
                merged=kegg_go_integration(kgml_file_path,gaf_path)

                # Create gene nodes and their associations with the disease
                for _, df in merged.groupby('gene_id'):
                    for _, gene in df.iterrows():
                        session.write_transaction(self.create_gene_node, gene.to_dict(), disease_id)
                        session.write_transaction(self.create_go_node, gene.to_dict())
                        session.write_transaction(self.create_disease_association, gene.to_dict(), disease_id, evidence="from KGML")
                        session.write_transaction(self.create_go_association, gene.to_dict(), disease_id)
                        
                # Create interactions between genes
                for interaction in pathway_data['interactions']:
                    session.write_transaction(self.create_gene_interaction, interaction, disease_id)



# Connect to the Neo4j database
driver = GraphDatabase.driver(uri, auth=(user, password))

def create_constraints(driver):
    with driver.session() as session:
        session.run("CREATE CONSTRAINT FOR (d:Disease) REQUIRE d.disease_id IS UNIQUE;")

kgml_files = [
              'data/KGML/hsa04930.xml',
              'data/KGML/hsa05010.xml',
              'data/KGML/hsa05012.xml',
              'data/KGML/hsa05210.xml'
              ]

# Create constraints and import data
importer = KGMLGAFImporter(driver)
importer.import_data(kgml_files)
importer.close()


  session.write_transaction(self.create_disease_node, {
  session.write_transaction(self.create_gene_node, gene.to_dict(), disease_id)
  session.write_transaction(self.create_go_node, gene.to_dict())
  session.write_transaction(self.create_disease_association, gene.to_dict(), disease_id, evidence="from KGML")
  session.write_transaction(self.create_go_association, gene.to_dict(), disease_id)
  session.write_transaction(self.create_gene_interaction, interaction, disease_id)
  session.write_transaction(self.create_disease_node, {
  session.write_transaction(self.create_gene_node, gene.to_dict(), disease_id)
  session.write_transaction(self.create_go_node, gene.to_dict())
  session.write_transaction(self.create_disease_association, gene.to_dict(), disease_id, evidence="from KGML")
  session.write_transaction(self.create_go_association, gene.to_dict(), disease_id)
  session.write_transaction(self.create_gene_interaction, interaction, disease_id)
  session.write_transaction(self.create_dise

In [79]:
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain.embeddings.openai import OpenAIEmbeddings


In [94]:
vector_index = Neo4jVector.from_existing_graph(
    OpenAIEmbeddings(api_key=api_key),
    url=uri,
    username=user,
    password=password,
    index_name='go_ids',
    node_label="GO_Annotation",
    text_node_properties=['qualifier', 'name', 'definition','aspect'],
    embedding_node_property='embedding',
)
response = vector_index.similarity_search(
    "(Spermatogenesis-associated protein 2) inhibition of (Kinesin light chain 3)",k=7,
    #filter={'GO_ID':{"$in":["GO:0018","001634"]}}
)



In [152]:
response

[Document(metadata={'object_type': 'protein', 'GO_ID': 'GO:0016938'}, page_content='\nqualifier: part_of\nname: kinesin I complex\ndefinition: A complex of two kinesin heavy chains and two kinesin light chains.\naspect: Cellular Component'),
 Document(metadata={'object_type': 'protein', 'GO_ID': 'GO:0005871'}, page_content='\nqualifier: part_of\nname: kinesin complex\ndefinition: Any complex that includes a dimer of molecules from the kinesin superfamily, a group of related proteins that contain an extended region of predicted alpha-helical coiled coil in the main chain that likely produces dimerization. The native complexes of several kinesin family members have also been shown to contain additional peptides, often designated light chains as all of the noncatalytic subunits that are currently known are smaller than the chain that contains the motor unit. Kinesin complexes generally possess a force-generating enzymatic activity, or motor, which converts the free energy of the gamma pho

In [150]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
vector_qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(api_key=api_key),
    chain_type="stuff",
    retriever=vector_index.as_retriever()
)

vector_qa.run(
    "Which GO_ID corresponding to (Spermatogenesis-associated protein 2) describes best the following interaction:(Spermatogenesis-associated protein 2) inhibition of (Kinesin light chain 3)"
)

"I don't have the specific GO_ID for the interaction between Spermatogenesis-associated protein 2 and Kinesin light chain 3."

** BASE CLASS AND DERIVED CLASSES

In [64]:
import yaml
from langchain_community.graphs.neo4j_graph import Neo4jGraph
from neo4j.exceptions import CypherSyntaxError
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain.embeddings.openai import OpenAIEmbeddings
import textwrap
from openai import OpenAI
import os
from dotenv import load_dotenv
load_dotenv()

uri = os.getenv("NEO4J_URI")
user = os.getenv("NEO4J_USER")
password = os.getenv("NEO4J_PASSWORD")
api_key = os.getenv("OPENAI_API_KEY")


In [74]:


class BaseAgent:
    def __init__(self, uri: str, user: str, password: str, api_key: str, config_path: str):
        """
        Initialize the BaseAgent with Neo4jGraph, OpenAI API key, and configuration.

        Args:
            uri (str): URI for the Neo4j database.
            user (str): Username for the Neo4j database.
            password (str): Password for the Neo4j database.
            api_key (str): API key for OpenAI.
            config_path (str): Path to the configuration YAML file.
        """
        self.kg = Neo4jGraph(url=uri, username=user, password=password)
        self.api_key = api_key
        self.config = self.load_config(config_path)
        self.schema = textwrap.fill(self.kg.schema, 60)

    def load_config(self, path: str) -> dict:
        """
        Load configuration from a YAML file.

        Args:
            path (str): Path to the YAML configuration file.

        Returns:
            dict: Configuration dictionary.
        """
        with open(path, 'r') as file:
            return yaml.safe_load(file)

    def generate(self, prompt_template: str, temperature: float = 0, **kwargs) -> str:
        """
        Generate a response using the OpenAI API.

        Args:
            prompt_template (str): Template for the prompt to be sent to OpenAI.
            temperature (float, optional): Sampling temperature for OpenAI completion. Defaults to 0.
            **kwargs: Additional keyword arguments for formatting the prompt.

        Returns:
            str: Generated response from OpenAI.
        """
        prompt = prompt_template.format(**kwargs)
        messages = [
            {"role": "system", "content": prompt},
            {"role": "user", "content": kwargs.get('question', '')},
        ]

        client = OpenAI(api_key=self.api_key)

        completions = client.chat.completions.create(
            model="gpt-4o",
            temperature=temperature,
            messages=messages,
            max_tokens=500
        )
        return completions.choices[0].message.content

    def run_cypher_query(self, cypher_statement: str, retry: bool = True) -> dict:
        """
        Execute a Cypher query against the Neo4j database.

        Args:
            cypher_statement (str): Cypher query to be executed.
            retry (bool, optional): Whether to retry the query in case of a Cypher syntax error. Defaults to True.

        Returns:
            dict: Result of the Cypher query.

        Raises:
            CypherSyntaxError: If the Cypher query has a syntax error and retry is False.
        """
        try:
            print(cypher_statement)
            return self.kg.query(cypher_statement)
        except CypherSyntaxError as e:
            if retry:
                print("Retrying due to Cypher syntax error...")
                return self.generate_cypher_retry(cypher_statement, str(e))
            else:
                raise e

    def generate_cypher_retry(self, cypher_statement: str, error_message: str) -> str:
        """
        Generate a corrected Cypher query in case of a syntax error.

        Args:
            cypher_statement (str): Original Cypher query that caused the syntax error.
            error_message (str): Error message from the Cypher syntax error.

        Returns:
            str: Corrected Cypher query.
        """
        retry_prompt = self.current_prompts['retry_prompt']
        return self.generate(retry_prompt, cypher_statement=cypher_statement, error_message=error_message)
    
    def set_prompt_context(self, context: str):
        """
        Set the current prompt context based on the configuration.

        Args:
            context (str): Context key to set the current prompts.

        Raises:
            ValueError: If the context is not found in the configuration.
        """
        if context in self.config['prompts']:
            self.current_prompts = self.config['prompts'][context]
        else:
            raise ValueError(f"Context '{context}' not found in configuration.")
 



In [97]:
class DiseaseAssociation(BaseAgent):
    def __init__(self, uri: str, user: str, password: str, api_key: str, config_path: str):
        """
        Initialize the DiseaseAssociation with Neo4jGraph, OpenAI API key, and configuration.

        Args:
            uri (str): URI for the Neo4j database.
            user (str): Username for the Neo4j database.
            password (str): Password for the Neo4j database.
            api_key (str): API key for OpenAI.
            config_path (str): Path to the configuration YAML file.
        """        
        super().__init__(uri, user, password, api_key, config_path)
        
    def generate_response(self, question: str) -> str:
        """
        Generate a response for a given question about disease associations.

        Args:
            question (str): The question to be answered.

        Returns:
            str: The generated response from OpenAI.
        """
        # Generate initial Cypher statement
        self.set_prompt_context('disease_association')
        initial_prompt = self.current_prompts['initial_prompt']
        cypher_statement = self.generate(initial_prompt, schema=self.schema, question=question)
        
        # Execute Cypher query and get results
        cypher_result = self.run_cypher_query(cypher_statement)
        print(cypher_result)

        # Generate final response
        final_prompt = self.current_prompts['final_prompt']
        final_response = self.generate(final_prompt, question=question, information=cypher_result)
        
        return final_response

# Usage example
custom_query = DiseaseAssociation(
    uri=uri,
    user=user,
    password=password,
    api_key=api_key,
    config_path='config.yaml'
)
custom_query.generate_response("What diseases is Insulin associated with?")

MATCH (g:Gene)-[:ASSOCIATED_WITH]->(d:Disease)
WHERE 'Insulin' IN g.names OR 'Insulin' IN g.synonyms
RETURN g.names as gene_names, g.synonyms as gene_synonyms, d.name as disease_name, d.disease_id as KEGG_pathway
[{'gene_names': ['Insulin'], 'gene_synonyms': ['INS'], 'disease_name': 'Alzheimer disease', 'KEGG_pathway': '05010'}, {'gene_names': ['Insulin'], 'gene_synonyms': ['INS'], 'disease_name': 'Type II diabetes mellitus', 'KEGG_pathway': '04930'}, {'gene_names': ['Insulin'], 'gene_synonyms': ['INS'], 'disease_name': 'Type II diabetes mellitus', 'KEGG_pathway': '04930'}]


"Insulin, also known as 'INS', is associated with Alzheimer disease and Type II diabetes mellitus. It appears in pathway 05010 of the KEGG database for Alzheimer disease and in pathway 04930 for Type II diabetes mellitus."

**TESTING

In [37]:
from typing import Any, Dict, List
from utilities.preprocessing import dict_to_frozenset



class DownstreamInteraction(BaseAgent):
    vector_index = None

    def __init__(self, uri: str, user: str, password: str, api_key: str, config_path: str):
        """
        Initialize the DownstreamInteraction with Neo4jGraph, OpenAI API key, and configuration.

        Args:
            uri (str): URI for the Neo4j database.
            user (str): Username for the Neo4j database.
            password (str): Password for the Neo4j database.
            api_key (str): API key for OpenAI.
            config_path (str): Path to the configuration YAML file.
        """
        super().__init__(uri, user, password, api_key, config_path)
        if DownstreamInteraction.vector_index is None:
            DownstreamInteraction.vector_index = Neo4jVector.from_existing_graph(
                OpenAIEmbeddings(api_key=api_key),
                url=uri,
                username=user,
                password=password,
                index_name='go_ids',
                node_label="GO_Annotation",
                text_node_properties=['qualifier', 'name', 'definition','aspect'],
                embedding_node_property='embedding',
            )

    def get_go_ids(self, unique_id: str) -> List[str]:
        """
        Retrieve GO IDs associated with a gene unique ID.

        Args:
            unique_id (str): Unique ID of the gene.

        Returns:
            List[str]: List of GO IDs associated with the gene.
        """
        go_result = self.kg.query(
            """
            MATCH (g: Gene {unique_id:$unique_id})-[:HAS_GO_ANNOTATION]->(a: GO_Annotation)
            RETURN collect(a.GO_ID) as GO_ID
            """,
            params={'unique_id': unique_id} 
        )
        go_list = go_result[0]['GO_ID']
        return go_list

    def perform_similarity_search(self, interaction: Dict[str, Any], go_list: List[str]) -> str:
        """
        Perform a similarity search using the interaction details and GO IDs.

        Args:
            interaction (Dict[str, Any]): Interaction details.
            go_list (List[str]): List of GO IDs.

        Returns:
            str: Description of the interaction from the similarity search.
        """
        start_node_names=interaction['start'].get('names')
        end_node_names=interaction['end'].get('names')
        subtype=interaction['subtypes']
        interaction_type=self.config['interaction_type_dict'].get(interaction['type'])

        #form crude search query consisting of start and end node gene names, as well as (sub)type of the interaction connecting them (eg activation, inhibition etc).
        search_query = f"{start_node_names}, {subtype}, {end_node_names}, {interaction_type}"

        #perform similarity search based on the search_query- filter for GO_IDs relevant to the first node.
        response = DownstreamInteraction.vector_index.similarity_search(
            search_query, k=1,
            filter={'GO_ID': {"$in": go_list}}
        )
        return response[0].page_content

    def process_interaction(self, interaction: Dict[str, Any]) -> str:
        """
        Process an interaction to generate a descriptive response.

        Args:
            interaction (Dict[str, Any]): Interaction details.

        Returns:
            str: Generated response describing the interaction.
        """
        unique_id = interaction['start'].get('unique_id')
        go_list = self.get_go_ids(unique_id)
        interaction_description = self.perform_similarity_search(interaction,go_list)
        final_prompt = self.current_prompts['final_prompt']
        start_node_names=[interaction['start'].get('names')]
        end_node_names=[interaction['end'].get('names')]
        question=f"Please describe the interaction of {start_node_names} with {end_node_names} using the description {interaction_description}."
        final_response = self.generate(
            final_prompt,
            temperature=0.5,
            question=question
        )
        return final_response

    def generate_response(self, question: str) -> List[List[str]]:
        """
        Generate a response for a given question about downstream interactions.

        Args:
            question (str): The question to be answered.

        Returns:
            List[List[str]]: List of all distinct downstream interaction paths from the specified gene node.
        """
        # Set the context for downstream interaction prompts
        self.set_prompt_context('downstream_interaction')
        
        # Generate initial Cypher statement
        initial_prompt = self.current_prompts['initial_prompt']
        cypher_statement = self.generate(initial_prompt, schema=self.schema, question=question)
        
        # Execute Cypher query and get results
        cypher_result = self.run_cypher_query(cypher_statement)
        
        processed_interactions = {}
        all_paths_list = []
        for path in cypher_result[0]['interactions']:
            path_list=[]
            for interaction in path:
                interaction_frozenset = dict_to_frozenset(interaction)
        
                # Check if this interaction has already been processed
                if interaction_frozenset in processed_interactions:
                    # Retrieve the result from the dictionary and append it to path_description
                    path_list.append(processed_interactions[interaction_frozenset])
                    continue
                interaction_result=self.process_interaction(interaction)
                path_list.append(interaction_result)
                processed_interactions[interaction_frozenset] = interaction_result
            all_paths_list.append(path_list)

        if not all_paths_list:
            return "I could not find the answer in the database. Please try again."
        
        print("The following are the distinct downstream interaction paths from the specified gene node:")
        return all_paths_list



In [38]:
downstream_query = DownstreamInteraction(
    uri=uri,
    user=user,
    password=password,
    api_key=api_key,
    config_path='config.yaml'
)

downstream_query.generate_response('What are the downstream interactions of PARK7?')




MATCH (start: Gene)
WHERE ('PARK7' IN start.names) or ('PARK7' IN start.synonyms)
CALL apoc.path.expandConfig(start, {relationshipFilter: 'INTERACTS_WITH>',minLevel: 1, uniqueness: 'NODE_PATH',bfs: false}) 
YIELD path
WITH path
WHERE NOT EXISTS {
    MATCH (lastNode)-[:INTERACTS_WITH]->(:Gene)
    WHERE lastNode = last(nodes(path))
}
WITH path, [rel IN relationships(path) | {start: startNode(rel), end: endNode(rel), type: rel.type, subtypes: rel.subtypes}] AS relationships
RETURN collect(relationships) AS interactions
The following are the distinct downstream interaction paths from the specified gene node:


[['Parkinson disease protein 7 interacts with Nuclear factor erythroid 2-related factor 2 (also known as Heme-binding protein 1) via enabling protein binding. This interaction involves the binding of Parkinson disease protein 7 to another protein.',
  'Nuclear factor erythroid 2-related factor 2 (also known as Heme-binding protein 1) interacts with Thioredoxin (also known as Thioredoxin, mitochondrial, Histone-lysine N-methyltransferase 2B) via enabling RNA polymerase II-specific DNA-binding transcription factor binding. This interaction involves binding to a sequence-specific DNA binding RNA polymerase II transcription factor, which selectively and non-covalently interacts with a specific DNA sequence to modulate transcription. This interaction falls under the aspect of Molecular Function.'],
 ['Parkinson disease protein 7 interacts with Cellular tumor antigen p53 via enabling protein binding. This interaction involves the binding of Parkinson disease protein 7 to Cellular tumor antig

In [80]:
from langchain.agents import AgentExecutor, create_react_agent
from langchain.tools import Tool
from langchain import hub
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import Neo4jChatMessageHistory
from openai import OpenAI
from langchain.agents import AgentType

from langchain.agents import initialize_agent

tools = [
    Tool.from_function(
        name="Disease Association",
        description="For when you need to ask questions like 'What disease is gene X associated with?'. The question will be a string. Return a string.",
        func=DiseaseAssociation(uri,user,password,api_key,config_path='config.yaml').generate_response
    ),

    Tool.from_function(
        name="Downstream Interaction",
        description="For answering text based questions like 'What are the downstream interactions of gene X in disease pathway Y?' The question will be a String. Return a String",
        func=DownstreamInteraction(uri,user,password,api_key,config_path='config.yaml').generate_response
    )
]

llm = ChatOpenAI(temperature=0, model="gpt-4o")

agent= initialize_agent(
    tools, 
    llm, 
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    handle_parsing_errors=True,
    verbose = True)

In [82]:
agent('What disease is PARK7 associated with?? Provide associated evidence')



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo determine the disease associated with the gene PARK7 and provide associated evidence, I will use the Disease Association tool.

Action: Disease Association
Action Input: What disease is gene PARK7 associated with? Provide associated evidence.[0mMATCH (g:Gene)-[r:ASSOCIATED_WITH]->(d:Disease)
WHERE 'PARK7' IN g.synonyms
RETURN g.names as gene_names, g.synonyms as gene_synonyms, d.name as disease_name, d.disease_id as KEGG_pathway, r.evidence as association_evidence
[{'gene_names': ['Parkinson disease protein 7'], 'gene_synonyms': ['PARK7'], 'disease_name': 'Parkinson disease', 'KEGG_pathway': '05012', 'association_evidence': 'from KGML'}, {'gene_names': ['Parkinson disease protein 7'], 'gene_synonyms': ['PARK7'], 'disease_name': 'Parkinson disease', 'KEGG_pathway': '05012', 'association_evidence': 'from KGML'}]

Observation: [36;1m[1;3mGene PARK7, also known as 'Parkinson disease protein 7', is associated with Parkinson 

{'input': 'What disease is PARK7 associated with??Provide associated evidence',
 'output': "Gene PARK7, also known as 'Parkinson disease protein 7', is associated with Parkinson disease. This association is supported by evidence from the KEGG pathway 05012, as indicated in the KGML."}

In [84]:
agent('What are the downstream interactions of PARK7 ? Write all the interactions that your tool provides, do not summarize or omit information')



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo answer the question about the downstream interactions of PARK7, I will use the Downstream Interaction tool.

Action: Downstream Interaction
Action Input: What are the downstream interactions of PARK7?[0mMATCH (start: Gene)
WHERE 'PARK7' IN start.synonyms
CALL apoc.path.expandConfig(start, {relationshipFilter: 'INTERACTS_WITH>',minLevel: 1, uniqueness: 'NODE_PATH',bfs: false}) 
YIELD path
WITH path
WHERE NOT EXISTS {
    MATCH (lastNode)-[:INTERACTS_WITH]->(:Gene)
    WHERE lastNode = last(nodes(path))
}
WITH path, [rel IN relationships(path) | {start: startNode(rel), end: endNode(rel), type: rel.type, subtypes: rel.subtypes}] AS relationships
RETURN collect(relationships) AS interactions
The following are the distinct downstream interaction paths from the specified gene node:

Observation: [33;1m[1;3m[['Parkinson disease protein 7 interacts with Nuclear factor erythroid 2-related factor 2 (also known as Heme-binding pro

{'input': 'What are the downstream interactions of PARK7 ? Write all the interactions that your tool provides, do not summarize or omit information',
 'output': 'The downstream interactions of PARK7 are as follows:\n\n1. PARK7 interacts with Nuclear factor erythroid 2-related factor 2 (Heme-binding protein 1) via enabling protein binding. This interaction involves the binding to a protein, which is a molecular function. Nuclear factor erythroid 2-related factor 2 interacts with Thioredoxin via enabling RNA polymerase II-specific DNA-binding transcription factor binding.\n\n2. PARK7 interacts with Cellular tumor antigen p53 via enabling protein binding. Cellular tumor antigen p53 interacts with Dual specificity protein phosphatase 1 in a process involved in the DNA damage response, specifically signal transduction by the p53 class mediator resulting in the transcription of the p21 class mediator.\n\n3. PARK7 interacts with Bcl-2-like protein 1 via enabling protein binding. Bcl-2-like pr

In [89]:
from typing import Callable

class CustomAgent(BaseAgent):
    """
    CustomAgent class that inherits from BaseAgent to handle questions related to disease associations 
    and downstream interactions. It uses OpenAI to classify questions and selects the appropriate tool 
    (DiseaseAssociation or DownstreamInteraction) to generate responses.

    Example usage:
    config_path = 'config.yaml'
    custom_agent = CustomAgent(uri, user, password, api_key, config_path)
    response = custom_agent.ask("What are the downstream interactions of gene INS in the pathway Type II diabetes mellitus?")
    print(response)
    """
    def __init__(self, uri: str, user: str, password: str, api_key: str, config_path: str):
        """
        Initialize the CustomAgent with Neo4jGraph, OpenAI API key, and configuration.

        Args:
            uri (str): URI for the Neo4j database.
            user (str): Username for the Neo4j database.
            password (str): Password for the Neo4j database.
            api_key (str): API key for OpenAI.
            config_path (str): Path to the configuration YAML file.
        """
        super().__init__(uri, user, password, api_key, config_path)
        self.disease_association_agent = DiseaseAssociation(uri, user, password, api_key,config_path)
        self.downstream_interaction_agent = DownstreamInteraction(uri, user, password, api_key,config_path)

    def classify_question(self, question: str) -> str:
        """
        Classify the question using OpenAI to determine which tool to use.

        Args:
            question (str): The question to be classified.

        Returns:
            str: The classification result.
        """
        classification_prompt = self.config['prompts']['classification_prompt']
        prompt = classification_prompt.format(question=question)
        messages = [
            {"role": "system", "content": prompt},
            {"role": "user", "content": question},
        ]
        client = OpenAI(api_key=self.api_key)
        completions = client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
            temperature=0,
            max_tokens=10
        )
        return completions.choices[0].message.content.strip().lower()

    def select_tool(self, question: str) -> Callable:
        """
        Select the appropriate tool based on the classified question.

        Args:
            question (str): The question to be answered.

        Returns:
            Callable: The selected tool's generate_response method.
        """
        category = self.classify_question(question)
        if "disease" in category:
            return self.disease_association_agent.generate_response
        elif "downstream" in category:
            return self.downstream_interaction_agent.generate_response
        else:
            raise ValueError("No appropriate tool found for the given question.")

    def ask(self, question: str) -> str:
        """
        Generate a response for a given question by selecting the appropriate tool.

        Args:
            question (str): The question to be answered.

        Returns:
            str: The generated response.
        """
        tool_func = self.select_tool(question)
        return tool_func(question)




In [96]:
config_path = 'config.yaml'
custom_agent = CustomAgent(uri, user, password, api_key,config_path)
response = custom_agent.ask("What are the benefits of good weather?")
print(response)

ValueError: No appropriate tool found for the given question.

In [108]:
import json
tool_selection_dataset = [
    {"question": "What disease is PRKN associated with?", "label": "disease_association"},
    {"question": "What are the downstream interactions of gene Y?", "label": "downstream_interaction"},
    {"question": "Is gene GNAI1 associated with Parkinson disease?", "label": "disease_association"},
    {"question": "What genes interact with gene SNCA downstream?", "label": "downstream_interaction"},
    {"question": "Who won the Euro Cup in 2024?", "label": "none"},
    {"question": "Is gene Alpha-synuclein associated with Parkinson's?", "label": "disease_association"},
    {"question": "What downstream genes are affected by gene Z?", "label": "downstream_interaction"},
    {"question": "What is the capital of France?", "label": "none"},
    {"question": "What is the weather like?", "label": "none"},
    {"question": "What would happend if Insulin is activated in the Type II diabetes mellitus pathway?", "label": "downstream_interaction"},
    {"question": "Are there any connections between Caspace-3 and Alzheimer Disease?","label": "disease_association"},
    {"question": "Which gene is linked to Alzheimer's disease?", "label": "disease_association"},
    {"question": "What downstream effects does gene BCL2 have?", "label": "downstream_interaction"},
    {"question": "Is BRCA1 associated with Colorectal cancer?", "label": "disease_association"},
    {"question": "What pathways are activated by gene TP53?", "label": "downstream_interaction"},
    {"question": "Is there a link between gene APP and Alzheimer's?", "label": "disease_association"},
    {"question": "What downstream targets are influenced by gene MFN1?", "label": "downstream_interaction"},
    {"question": "What environmental factors influence gene mutations?", "label": "none"}
]

with open('evaluation_datasets/tool_selection_dataset.json', 'w') as f:
    json.dump(tool_selection_dataset, f, indent=4)


In [110]:

# Load the dataset from a JSON file
with open('evaluation_datasets/tool_selection_dataset.json', 'r') as f:
    loaded_dataset = json.load(f)

# Check if the dataset is loaded correctly
print(loaded_dataset)

[{'question': 'What disease is PRKN associated with?', 'label': 'disease_association'}, {'question': 'What are the downstream interactions of gene Y?', 'label': 'downstream_interaction'}, {'question': 'Is gene GNAI1 associated with Parkinson disease?', 'label': 'disease_association'}, {'question': 'What genes interact with gene SNCA downstream?', 'label': 'downstream_interaction'}, {'question': 'Who won the Euro Cup in 2024?', 'label': 'none'}, {'question': "Is gene Alpha-synuclein associated with Parkinson's?", 'label': 'disease_association'}, {'question': 'What downstream genes are affected by gene Z?', 'label': 'downstream_interaction'}, {'question': 'What is the capital of France?', 'label': 'none'}, {'question': 'What is the weather like?', 'label': 'none'}, {'question': 'What would happend if Insulin is activated in the Type II diabetes mellitus pathway?', 'label': 'downstream_interaction'}, {'question': 'Are there any connections between Caspace-3 and Alzheimer Disease?', 'label

In [118]:
from typing import List, Dict
import pandas as pd
def evaluate_tool_selection(agent: CustomAgent, dataset: List[Dict[str, str]]) -> float:
    """
    Evaluate the tool selection performance of the CustomAgent.

    Args:
        agent (CustomAgent): The CustomAgent to be evaluated.
        dataset (List[Dict[str, str]]): The labeled dataset containing questions and expected tool labels.

    Returns:
        float: The accuracy of the tool selection.
    """
    correct_count = 0
    results = []

    for data in dataset:
        question = data["question"]
        label = data["label"]
        
        try:
            selected_tool_func = agent.select_tool(question)
            selected_tool = "disease_association" if selected_tool_func == agent.disease_association_agent.generate_response else "downstream_interaction"
        except ValueError:
            selected_tool = "none"
        
        # Determine if the tool selection was correct
        is_correct = (selected_tool == label)
        if is_correct:
            correct_count += 1

        # Append the result to the list
        results.append({
            "question": question,
            "expected_label": label,
            "predicted_label": selected_tool,
            "is_correct": is_correct
        })

    accuracy = correct_count / len(dataset)
    print(f"Accuracy={accuracy}")
    df = pd.DataFrame(results)

    return df

# Usage
config_path = 'config.yaml'
custom_agent = CustomAgent(uri, user, password, api_key, config_path)
evaluate_tool_selection(custom_agent, loaded_dataset)


Accuracy=0.9444444444444444


Unnamed: 0,question,expected_label,predicted_label,is_correct
0,What disease is PRKN associated with?,disease_association,disease_association,True
1,What are the downstream interactions of gene Y?,downstream_interaction,downstream_interaction,True
2,Is gene GNAI1 associated with Parkinson disease?,disease_association,disease_association,True
3,What genes interact with gene SNCA downstream?,downstream_interaction,downstream_interaction,True
4,Who won the Euro Cup in 2024?,none,none,True
5,Is gene Alpha-synuclein associated with Parkin...,disease_association,disease_association,True
6,What downstream genes are affected by gene Z?,downstream_interaction,downstream_interaction,True
7,What is the capital of France?,none,none,True
8,What is the weather like?,none,none,True
9,What would happend if Insulin is activated in ...,downstream_interaction,downstream_interaction,True


In [122]:
import json

# Sample questions and their corresponding expected Cypher queries
downstream_interaction_examples = [
    {
        "question": "Can you give me the paths downstream of Growth factor receptor-bound protein 2 in the Colorectal cancer?",
        "expected_cypher_query": """
            MATCH (start: Gene)-[:ASSOCIATED_WITH]->(d:Disease {name: 'Colorectal cancer'})
            WHERE "Growth factor receptor-bound protein 2" IN start.names
            CALL apoc.path.expandConfig(start, {relationshipFilter: 'INTERACTS_WITH>',minLevel: 1, uniqueness: 'NODE_PATH',bfs: false}) 
            YIELD path
            WITH path
            WHERE NOT EXISTS {
                MATCH (lastNode)-[:INTERACTS_WITH]->(:Gene)
                WHERE lastNode = last(nodes(path))
            }
            WITH path, [rel IN relationships(path) | {start: startNode(rel), end: endNode(rel), type: rel.type, subtypes: rel.subtypes}] AS relationships
            RETURN collect(relationships) AS interactions
        """
    },
    {
        "question": "What effects should I expect downstream of 'MCH5' in the Alzheimer pathway?",
        "expected_cypher_query": """
            MATCH (start: Gene)-[:ASSOCIATED_WITH]->(d:Disease {name: 'Alzheimer disease'})
            WHERE "MCH5" IN start.synonyms
            CALL apoc.path.expandConfig(start, {relationshipFilter: 'INTERACTS_WITH>',minLevel: 1, uniqueness: 'NODE_PATH',bfs: false}) 
            YIELD path
            WITH path
            WHERE NOT EXISTS {
                MATCH (lastNode)-[:INTERACTS_WITH]->(:Gene)
                WHERE lastNode = last(nodes(path))
            }
            WITH path, [rel IN relationships(path) | {start: startNode(rel), end: endNode(rel), type: rel.type, subtypes: rel.subtypes}] AS relationships
            RETURN collect(relationships) AS interactions
        """
    },
    {
        "question": "What will happen if Insulin is activated in the Type II diabetes mellitus pathway?",
        "expected_cypher_query": """
            MATCH (start: Gene)-[:ASSOCIATED_WITH]->(d:Disease {name: 'Type II diabetes mellitus'})
            WHERE "Insulin" IN start.names
            CALL apoc.path.expandConfig(start, {relationshipFilter: 'INTERACTS_WITH>',minLevel: 1, uniqueness: 'NODE_PATH',bfs: false}) 
            YIELD path
            WITH path
            WHERE NOT EXISTS {
                MATCH (lastNode)-[:INTERACTS_WITH]->(:Gene)
                WHERE lastNode = last(nodes(path))
            }
            WITH path, [rel IN relationships(path) | {start: startNode(rel), end: endNode(rel), type: rel.type, subtypes: rel.subtypes}] AS relationships
            RETURN collect(relationships) AS interactions
        """
    },
    {
        "question": "What are the downstream interactions of gene PARK7?",
        "expected_cypher_query": """
            MATCH (start: Gene)
            WHERE ('PARK7' IN start.names) OR ('PARK7' IN start.synonyms)
            CALL apoc.path.expandConfig(start, {relationshipFilter: 'INTERACTS_WITH>', minLevel: 1, uniqueness: 'NODE_PATH', bfs: false}) 
            YIELD path
            WITH path
            WHERE NOT EXISTS {
                MATCH (lastNode)-[:INTERACTS_WITH]->(:Gene)
                WHERE lastNode = last(nodes(path))
            }
            WITH path, [rel IN relationships(path) | {start: startNode(rel), end: endNode(rel), type: rel.type, subtypes: rel.subtypes}] AS relationships
            RETURN collect(relationships) AS interactions
        """
    },
    {
        "question": "What downstream genes are affected by Kelch-like ECH-associated protein 1?",
        "expected_cypher_query": """
            MATCH (start: Gene)
            WHERE 'Kelch-like ECH-associated protein 1' IN start.names
            CALL apoc.path.expandConfig(start, {relationshipFilter: 'INTERACTS_WITH>', minLevel: 1, uniqueness: 'NODE_PATH', bfs: false}) 
            YIELD path
            WITH path
            WHERE NOT EXISTS {
                MATCH (lastNode)-[:INTERACTS_WITH]->(:Gene)
                WHERE lastNode = last(nodes(path))
            }
            WITH path, [rel IN relationships(path) | {start: startNode(rel), end: endNode(rel), type: rel.type, subtypes: rel.subtypes}] AS relationships
            RETURN collect(relationships) AS interactions
        """
    }
]

# Save the dataset to a JSON file
with open('downstream_interaction_dataset.json', 'w') as f:
    json.dump(downstream_interaction_examples, f, indent=4)

In [None]:
import json
import pandas as pd

def evaluate_run_cypher_query(agent: CustomAgent, dataset: list, context: str) -> pd.DataFrame:
    """
    Evaluate the run_cypher_query method of the CustomAgent.

    Args:
        agent (CustomAgent): The CustomAgent to be tested.
        dataset (list): List of questions and their expected Cypher queries.
        context (str): The prompt context to set for the agent.

    Returns:
        pd.DataFrame: DataFrame containing the questions, generated Cypher queries, expected Cypher queries,
                      agent results, manual results, and whether the results match.
    """
    results = []

    for item in dataset:
        question = item["question"]
        expected_cypher_query = item["expected_cypher_query"].strip()

        # Set the appropriate context
        agent.set_prompt_context(context)

        # Generate Cypher query using the agent
        generated_cypher_query = agent.generate(agent.current_prompts['initial_prompt'], schema=agent.schema, question=question).strip()

        # Run the Cypher query using run_cypher_query method
        agent_result = agent.run_cypher_query(generated_cypher_query)

        # Run the expected Cypher query manually using kg.query
        manual_result = agent.kg.query(expected_cypher_query)

        # Compare the results
        results.append({
            "question": question,
            "generated_cypher_query": generated_cypher_query,
            "expected_cypher_query": expected_cypher_query,
            "agent_result": json.dumps(agent_result, indent=4),
            "manual_result": json.dumps(manual_result, indent=4),
            "match": agent_result == manual_result
        })

    # Create a DataFrame from the results
    df = pd.DataFrame(results)
    return df

# Load the downstream interaction dataset from a JSON file
with open('evaluation_datasets/downstream_interaction_dataset.json', 'r') as f:
    downstream_dataset = json.load(f)

# Example usage for downstream interactions
config_path = 'config.yaml'
custom_agent = CustomAgent(uri, user, password, api_key, config_path)
downstream_evaluation_df = evaluate_run_cypher_query(custom_agent, downstream_dataset, 'downstream_interaction')

# Set pandas display options to show all content
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)

# Print the DataFrame
print("Downstream Interaction Evaluation:")
downstream_evaluation_df


In [124]:
downstream_evaluation_df


Unnamed: 0,question,generated_cypher_query,expected_cypher_query,agent_result,manual_result,match
0,Can you give me the paths downstream of Growth factor receptor-bound protein 2 in the Colorectal cancer?,"MATCH (start: Gene)-[:ASSOCIATED_WITH]->(d:Disease {name: 'Colorectal cancer'})\nWHERE ('Growth factor receptor-bound protein 2' IN start.names) or ('Growth factor receptor-bound protein 2' IN start.synonyms)\nCALL apoc.path.expandConfig(start, {relationshipFilter: 'INTERACTS_WITH>',minLevel: 1, uniqueness: 'NODE_PATH',bfs: false}) \nYIELD path\nWITH path\nWHERE NOT EXISTS {\n MATCH (lastNode)-[:INTERACTS_WITH]->(:Gene)\n WHERE lastNode = last(nodes(path))\n}\nWITH path, [rel IN relationships(path) | {start: startNode(rel), end: endNode(rel), type: rel.type, subtypes: rel.subtypes}] AS relationships\nRETURN collect(relationships) AS interactions","MATCH (start: Gene)-[:ASSOCIATED_WITH]->(d:Disease {name: 'Colorectal cancer'})\n WHERE ""Growth factor receptor-bound protein 2"" IN start.names\n CALL apoc.path.expandConfig(start, {relationshipFilter: 'INTERACTS_WITH>',minLevel: 1, uniqueness: 'NODE_PATH',bfs: false}) \n YIELD path\n WITH path\n WHERE NOT EXISTS {\n MATCH (lastNode)-[:INTERACTS_WITH]->(:Gene)\n WHERE lastNode = last(nodes(path))\n }\n WITH path, [rel IN relationships(path) | {start: startNode(rel), end: endNode(rel), type: rel.type, subtypes: rel.subtypes}] AS relationships\n RETURN collect(relationships) AS interactions","[\n {\n ""interactions"": [\n [\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""05210_155"",\n ""names"": [\n ""Growth factor receptor-bound protein 2""\n ],\n ""synonyms"": [\n ""ASH"",\n ""GRB2""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05210_154"",\n ""names"": [\n ""Hepatocyte growth factor"",\n ""Son of sevenless homolog 1"",\n ""Erythroid transcription factor""\n ],\n ""synonyms"": [\n ""SOS1"",\n ""GF1"",\n ""HGF""\n ]\n }\n },\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""05210_154"",\n ""names"": [\n ""Hepatocyte growth factor"",\n ""Son of sevenless homolog 1"",\n ""Erythroid transcription factor""\n ],\n ""synonyms"": [\n ""SOS1"",\n ""GF1"",\n ""HGF""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05210_152"",\n ""names"": [\n ""GTPase HRas"",\n ""GTPase NRas""\n ],\n ""synonyms"": [\n ""HRAS1"",\n ""HRAS""\n ]\n }\n },\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""05210_152"",\n ""names"": [\n ""GTPase HRas"",\n ""GTPase NRas""\n ],\n ""synonyms"": [\n ""HRAS1"",\n ""HRAS""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05210_162"",\n ""names"": [\n ""Serine/threonine-protein kinase A-Raf""\n ],\n ""synonyms"": [\n ""ARAF"",\n ""ARAF1"",\n ""PKS2""\n ]\n }\n },\n {\n ""subtypes"": [\n ""activation"",\n ""phosphorylation""\n ],\n ""start"": {\n ""unique_id"": ""05210_162"",\n ""names"": [\n ""Serine/threonine-protein kinase A-Raf""\n ],\n ""synonyms"": [\n ""ARAF"",\n ""ARAF1"",\n ""PKS2""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05210_161"",\n ""names"": [\n ""Dual specificity mitogen-activated protein kinase kinase 1"",\n ""Ras-related protein Rab-8A""\n ],\n ""synonyms"": [\n ""PRKMK1"",\n ""MAP2K1"",\n ""MEK1"",\n ""MEL""\n ]\n }\n },\n {\n ""subtypes"": [\n ""activation"",\n ""phosphorylation""\n ],\n ""start"": {\n ""unique_id"": ""05210_161"",\n ""names"": [\n ""Dual specificity mitogen-activated protein kinase kinase 1"",\n ""Ras-related protein Rab-8A""\n ],\n ""synonyms"": [\n ""PRKMK1"",\n ""MAP2K1"",\n ""MEK1"",\n ""MEL""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05210_160"",\n ""names"": [\n ""Mitogen-activated protein kinase 1"",\n ""Ephrin type-B receptor 2""\n ],\n ""synonyms"": [\n ""ERK"",\n ""ERK2"",\n ""MAPK1"",\n ""PRKM2"",\n ""PRKM1""\n ]\n }\n }\n ]\n ]\n }\n]","[\n {\n ""interactions"": [\n [\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""05210_155"",\n ""names"": [\n ""Growth factor receptor-bound protein 2""\n ],\n ""synonyms"": [\n ""ASH"",\n ""GRB2""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05210_154"",\n ""names"": [\n ""Hepatocyte growth factor"",\n ""Son of sevenless homolog 1"",\n ""Erythroid transcription factor""\n ],\n ""synonyms"": [\n ""SOS1"",\n ""GF1"",\n ""HGF""\n ]\n }\n },\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""05210_154"",\n ""names"": [\n ""Hepatocyte growth factor"",\n ""Son of sevenless homolog 1"",\n ""Erythroid transcription factor""\n ],\n ""synonyms"": [\n ""SOS1"",\n ""GF1"",\n ""HGF""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05210_152"",\n ""names"": [\n ""GTPase HRas"",\n ""GTPase NRas""\n ],\n ""synonyms"": [\n ""HRAS1"",\n ""HRAS""\n ]\n }\n },\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""05210_152"",\n ""names"": [\n ""GTPase HRas"",\n ""GTPase NRas""\n ],\n ""synonyms"": [\n ""HRAS1"",\n ""HRAS""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05210_162"",\n ""names"": [\n ""Serine/threonine-protein kinase A-Raf""\n ],\n ""synonyms"": [\n ""ARAF"",\n ""ARAF1"",\n ""PKS2""\n ]\n }\n },\n {\n ""subtypes"": [\n ""activation"",\n ""phosphorylation""\n ],\n ""start"": {\n ""unique_id"": ""05210_162"",\n ""names"": [\n ""Serine/threonine-protein kinase A-Raf""\n ],\n ""synonyms"": [\n ""ARAF"",\n ""ARAF1"",\n ""PKS2""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05210_161"",\n ""names"": [\n ""Dual specificity mitogen-activated protein kinase kinase 1"",\n ""Ras-related protein Rab-8A""\n ],\n ""synonyms"": [\n ""PRKMK1"",\n ""MAP2K1"",\n ""MEK1"",\n ""MEL""\n ]\n }\n },\n {\n ""subtypes"": [\n ""activation"",\n ""phosphorylation""\n ],\n ""start"": {\n ""unique_id"": ""05210_161"",\n ""names"": [\n ""Dual specificity mitogen-activated protein kinase kinase 1"",\n ""Ras-related protein Rab-8A""\n ],\n ""synonyms"": [\n ""PRKMK1"",\n ""MAP2K1"",\n ""MEK1"",\n ""MEL""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05210_160"",\n ""names"": [\n ""Mitogen-activated protein kinase 1"",\n ""Ephrin type-B receptor 2""\n ],\n ""synonyms"": [\n ""ERK"",\n ""ERK2"",\n ""MAPK1"",\n ""PRKM2"",\n ""PRKM1""\n ]\n }\n }\n ]\n ]\n }\n]",True
1,What effects should I expect downstream of 'MCH5' in the Alzheimer pathway?,"MATCH (start: Gene)-[:ASSOCIATED_WITH]->(d:Disease {name: 'Alzheimer disease'})\nWHERE 'MCH5' IN start.synonyms\nCALL apoc.path.expandConfig(start, {relationshipFilter: 'INTERACTS_WITH>',minLevel: 1, uniqueness: 'NODE_PATH',bfs: false}) \nYIELD path\nWITH path\nWHERE NOT EXISTS {\n MATCH (lastNode)-[:INTERACTS_WITH]->(:Gene)\n WHERE lastNode = last(nodes(path))\n}\nWITH path, [rel IN relationships(path) | {start: startNode(rel), end: endNode(rel), type: rel.type, subtypes: rel.subtypes}] AS relationships\nRETURN collect(relationships) AS interactions","MATCH (start: Gene)-[:ASSOCIATED_WITH]->(d:Disease {name: 'Alzheimer disease'})\n WHERE ""MCH5"" IN start.synonyms\n CALL apoc.path.expandConfig(start, {relationshipFilter: 'INTERACTS_WITH>',minLevel: 1, uniqueness: 'NODE_PATH',bfs: false}) \n YIELD path\n WITH path\n WHERE NOT EXISTS {\n MATCH (lastNode)-[:INTERACTS_WITH]->(:Gene)\n WHERE lastNode = last(nodes(path))\n }\n WITH path, [rel IN relationships(path) | {start: startNode(rel), end: endNode(rel), type: rel.type, subtypes: rel.subtypes}] AS relationships\n RETURN collect(relationships) AS interactions","[\n {\n ""interactions"": [\n [\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""05010_87"",\n ""names"": [\n ""Caspase-8""\n ],\n ""synonyms"": [\n ""MCH5"",\n ""CASP8""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05010_88"",\n ""names"": [\n ""BH3-interacting domain death agonist""\n ],\n ""synonyms"": [\n ""BID""\n ]\n }\n }\n ],\n [\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""05010_87"",\n ""names"": [\n ""Caspase-8""\n ],\n ""synonyms"": [\n ""MCH5"",\n ""CASP8""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05010_34"",\n ""names"": [\n ""Caspase-3""\n ],\n ""synonyms"": [\n ""CPP32"",\n ""CASP3""\n ]\n }\n }\n ]\n ]\n }\n]","[\n {\n ""interactions"": [\n [\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""05010_87"",\n ""names"": [\n ""Caspase-8""\n ],\n ""synonyms"": [\n ""MCH5"",\n ""CASP8""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05010_88"",\n ""names"": [\n ""BH3-interacting domain death agonist""\n ],\n ""synonyms"": [\n ""BID""\n ]\n }\n }\n ],\n [\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""05010_87"",\n ""names"": [\n ""Caspase-8""\n ],\n ""synonyms"": [\n ""MCH5"",\n ""CASP8""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05010_34"",\n ""names"": [\n ""Caspase-3""\n ],\n ""synonyms"": [\n ""CPP32"",\n ""CASP3""\n ]\n }\n }\n ]\n ]\n }\n]",True
2,What will happen if Insulin is activated in the Type II diabetes mellitus pathway?,"MATCH (start: Gene)-[:ASSOCIATED_WITH]->(d:Disease {name: 'Type II diabetes mellitus'})\nWHERE ""Insulin"" IN start.names\nCALL apoc.path.expandConfig(start, {relationshipFilter: 'INTERACTS_WITH>',minLevel: 1, uniqueness: 'NODE_PATH',bfs: false}) \nYIELD path\nWITH path\nWHERE NOT EXISTS {\n MATCH (lastNode)-[:INTERACTS_WITH]->(:Gene)\n WHERE lastNode = last(nodes(path))\n}\nWITH path, [rel IN relationships(path) | {start: startNode(rel), end: endNode(rel), type: rel.type, subtypes: rel.subtypes}] AS relationships\nRETURN collect(relationships) AS interactions","MATCH (start: Gene)-[:ASSOCIATED_WITH]->(d:Disease {name: 'Type II diabetes mellitus'})\n WHERE ""Insulin"" IN start.names\n CALL apoc.path.expandConfig(start, {relationshipFilter: 'INTERACTS_WITH>',minLevel: 1, uniqueness: 'NODE_PATH',bfs: false}) \n YIELD path\n WITH path\n WHERE NOT EXISTS {\n MATCH (lastNode)-[:INTERACTS_WITH]->(:Gene)\n WHERE lastNode = last(nodes(path))\n }\n WITH path, [rel IN relationships(path) | {start: startNode(rel), end: endNode(rel), type: rel.type, subtypes: rel.subtypes}] AS relationships\n RETURN collect(relationships) AS interactions","[\n {\n ""interactions"": [\n [\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""04930_17"",\n ""names"": [\n ""Insulin""\n ],\n ""synonyms"": [\n ""INS""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""04930_16"",\n ""names"": [\n ""Insulin receptor""\n ],\n ""synonyms"": [\n ""INSR""\n ]\n }\n },\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""04930_16"",\n ""names"": [\n ""Insulin receptor""\n ],\n ""synonyms"": [\n ""INSR""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""04930_15"",\n ""names"": [\n ""Insulin receptor substrate 1""\n ],\n ""synonyms"": [\n ""IRS1""\n ]\n }\n }\n ],\n [\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""04930_17"",\n ""names"": [\n ""Insulin""\n ],\n ""synonyms"": [\n ""INS""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""04930_16"",\n ""names"": [\n ""Insulin receptor""\n ],\n ""synonyms"": [\n ""INSR""\n ]\n }\n },\n {\n ""subtypes"": [\n ""indirect effect""\n ],\n ""start"": {\n ""unique_id"": ""04930_16"",\n ""names"": [\n ""Insulin receptor""\n ],\n ""synonyms"": [\n ""INSR""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""04930_23"",\n ""names"": [\n ""Mitogen-activated protein kinase 1"",\n ""Ephrin type-B receptor 2""\n ],\n ""synonyms"": [\n ""ERK"",\n ""ERK2"",\n ""MAPK1"",\n ""PRKM2"",\n ""PRKM1""\n ]\n }\n },\n {\n ""subtypes"": [\n ""indirect effect""\n ],\n ""start"": {\n ""unique_id"": ""04930_23"",\n ""names"": [\n ""Mitogen-activated protein kinase 1"",\n ""Ephrin type-B receptor 2""\n ],\n ""synonyms"": [\n ""ERK"",\n ""ERK2"",\n ""MAPK1"",\n ""PRKM2"",\n ""PRKM1""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""04930_31"",\n ""names"": [\n ""Insulin receptor substrate 1""\n ],\n ""synonyms"": [\n ""IRS1""\n ]\n }\n }\n ],\n [\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""04930_17"",\n ""names"": [\n ""Insulin""\n ],\n ""synonyms"": [\n ""INS""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""04930_16"",\n ""names"": [\n ""Insulin receptor""\n ],\n ""synonyms"": [\n ""INSR""\n ]\n }\n },\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""04930_16"",\n ""names"": [\n ""Insulin receptor""\n ],\n ""synonyms"": [\n ""INSR""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""04930_31"",\n ""names"": [\n ""Insulin receptor substrate 1""\n ],\n ""synonyms"": [\n ""IRS1""\n ]\n }\n }\n ]\n ]\n }\n]","[\n {\n ""interactions"": [\n [\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""04930_17"",\n ""names"": [\n ""Insulin""\n ],\n ""synonyms"": [\n ""INS""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""04930_16"",\n ""names"": [\n ""Insulin receptor""\n ],\n ""synonyms"": [\n ""INSR""\n ]\n }\n },\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""04930_16"",\n ""names"": [\n ""Insulin receptor""\n ],\n ""synonyms"": [\n ""INSR""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""04930_15"",\n ""names"": [\n ""Insulin receptor substrate 1""\n ],\n ""synonyms"": [\n ""IRS1""\n ]\n }\n }\n ],\n [\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""04930_17"",\n ""names"": [\n ""Insulin""\n ],\n ""synonyms"": [\n ""INS""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""04930_16"",\n ""names"": [\n ""Insulin receptor""\n ],\n ""synonyms"": [\n ""INSR""\n ]\n }\n },\n {\n ""subtypes"": [\n ""indirect effect""\n ],\n ""start"": {\n ""unique_id"": ""04930_16"",\n ""names"": [\n ""Insulin receptor""\n ],\n ""synonyms"": [\n ""INSR""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""04930_23"",\n ""names"": [\n ""Mitogen-activated protein kinase 1"",\n ""Ephrin type-B receptor 2""\n ],\n ""synonyms"": [\n ""ERK"",\n ""ERK2"",\n ""MAPK1"",\n ""PRKM2"",\n ""PRKM1""\n ]\n }\n },\n {\n ""subtypes"": [\n ""indirect effect""\n ],\n ""start"": {\n ""unique_id"": ""04930_23"",\n ""names"": [\n ""Mitogen-activated protein kinase 1"",\n ""Ephrin type-B receptor 2""\n ],\n ""synonyms"": [\n ""ERK"",\n ""ERK2"",\n ""MAPK1"",\n ""PRKM2"",\n ""PRKM1""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""04930_31"",\n ""names"": [\n ""Insulin receptor substrate 1""\n ],\n ""synonyms"": [\n ""IRS1""\n ]\n }\n }\n ],\n [\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""04930_17"",\n ""names"": [\n ""Insulin""\n ],\n ""synonyms"": [\n ""INS""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""04930_16"",\n ""names"": [\n ""Insulin receptor""\n ],\n ""synonyms"": [\n ""INSR""\n ]\n }\n },\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""04930_16"",\n ""names"": [\n ""Insulin receptor""\n ],\n ""synonyms"": [\n ""INSR""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""04930_31"",\n ""names"": [\n ""Insulin receptor substrate 1""\n ],\n ""synonyms"": [\n ""IRS1""\n ]\n }\n }\n ]\n ]\n }\n]",True
3,What are the downstream interactions of gene PARK7?,"MATCH (start: Gene)\nWHERE ('PARK7' IN start.names) OR ('PARK7' IN start.synonyms)\nCALL apoc.path.expandConfig(start, {relationshipFilter: 'INTERACTS_WITH>', minLevel: 1, uniqueness: 'NODE_PATH', bfs: false}) \nYIELD path\nWITH path\nWHERE NOT EXISTS {\n MATCH (lastNode)-[:INTERACTS_WITH]->(:Gene)\n WHERE lastNode = last(nodes(path))\n}\nWITH path, [rel IN relationships(path) | {start: startNode(rel), end: endNode(rel), type: rel.type, subtypes: rel.subtypes}] AS relationships\nRETURN collect(relationships) AS interactions","MATCH (start: Gene)\n WHERE ('PARK7' IN start.names) OR ('PARK7' IN start.synonyms)\n CALL apoc.path.expandConfig(start, {relationshipFilter: 'INTERACTS_WITH>', minLevel: 1, uniqueness: 'NODE_PATH', bfs: false}) \n YIELD path\n WITH path\n WHERE NOT EXISTS {\n MATCH (lastNode)-[:INTERACTS_WITH]->(:Gene)\n WHERE lastNode = last(nodes(path))\n }\n WITH path, [rel IN relationships(path) | {start: startNode(rel), end: endNode(rel), type: rel.type, subtypes: rel.subtypes}] AS relationships\n RETURN collect(relationships) AS interactions","[\n {\n ""interactions"": [\n [\n {\n ""subtypes"": [\n ""missing interaction""\n ],\n ""start"": {\n ""unique_id"": ""05012_19"",\n ""names"": [\n ""Parkinson disease protein 7""\n ],\n ""synonyms"": [\n ""PARK7""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05012_347"",\n ""names"": [\n ""Nuclear factor erythroid 2-related factor 2"",\n ""Heme-binding protein 1""\n ],\n ""synonyms"": [\n ""NFE2L2"",\n ""NRF2"",\n ""HEBP1""\n ]\n }\n },\n {\n ""subtypes"": [\n ""expression""\n ],\n ""start"": {\n ""unique_id"": ""05012_347"",\n ""names"": [\n ""Nuclear factor erythroid 2-related factor 2"",\n ""Heme-binding protein 1""\n ],\n ""synonyms"": [\n ""NFE2L2"",\n ""NRF2"",\n ""HEBP1""\n ]\n },\n ""type"": ""GErel"",\n ""end"": {\n ""unique_id"": ""05012_362"",\n ""names"": [\n ""Thioredoxin"",\n ""Thioredoxin, mitochondrial"",\n ""Histone-lysine N-methyltransferase 2B""\n ],\n ""synonyms"": [\n ""TRX2"",\n ""TXN"",\n ""TXN2""\n ]\n }\n }\n ],\n [\n {\n ""subtypes"": [\n ""missing interaction""\n ],\n ""start"": {\n ""unique_id"": ""05012_19"",\n ""names"": [\n ""Parkinson disease protein 7""\n ],\n ""synonyms"": [\n ""PARK7""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05012_338"",\n ""names"": [\n ""Cellular tumor antigen p53""\n ],\n ""synonyms"": [\n ""P53"",\n ""TP53""\n ]\n }\n },\n {\n ""subtypes"": [\n ""expression""\n ],\n ""start"": {\n ""unique_id"": ""05012_338"",\n ""names"": [\n ""Cellular tumor antigen p53""\n ],\n ""synonyms"": [\n ""P53"",\n ""TP53""\n ]\n },\n ""type"": ""GErel"",\n ""end"": {\n ""unique_id"": ""05012_361"",\n ""names"": [\n ""Dual specificity protein phosphatase 1""\n ],\n ""synonyms"": [\n ""PTPN10"",\n ""CL100"",\n ""MKP1"",\n ""DUSP1""\n ]\n }\n }\n ],\n [\n {\n ""subtypes"": [\n ""missing interaction""\n ],\n ""start"": {\n ""unique_id"": ""05012_326"",\n ""names"": [\n ""Parkinson disease protein 7""\n ],\n ""synonyms"": [\n ""PARK7""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05012_327"",\n ""names"": [\n ""Bcl-2-like protein 1""\n ],\n ""synonyms"": [\n ""BCL2L"",\n ""BCL2L1"",\n ""BCLX""\n ]\n }\n },\n {\n ""subtypes"": [\n ""missing interaction""\n ],\n ""start"": {\n ""unique_id"": ""05012_327"",\n ""names"": [\n ""Bcl-2-like protein 1""\n ],\n ""synonyms"": [\n ""BCL2L"",\n ""BCL2L1"",\n ""BCLX""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05012_316"",\n ""names"": [\n ""Apoptosis regulator BAX""\n ],\n ""synonyms"": [\n ""BAX"",\n ""BCL2L4""\n ]\n }\n },\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""05012_316"",\n ""names"": [\n ""Apoptosis regulator BAX""\n ],\n ""synonyms"": [\n ""BAX"",\n ""BCL2L4""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05012_25"",\n ""names"": [\n ""Cytochrome c""\n ],\n ""synonyms"": [\n ""CYC"",\n ""CYCS""\n ]\n }\n },\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""05012_25"",\n ""names"": [\n ""Cytochrome c""\n ],\n ""synonyms"": [\n ""CYC"",\n ""CYCS""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05012_152"",\n ""names"": [\n ""Caspase-9""\n ],\n ""synonyms"": [\n ""CASP9"",\n ""MCH6""\n ]\n }\n },\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""05012_152"",\n ""names"": [\n ""Caspase-9""\n ],\n ""synonyms"": [\n ""CASP9"",\n ""MCH6""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05012_153"",\n ""names"": [\n ""Caspase-3""\n ],\n ""synonyms"": [\n ""CPP32"",\n ""CASP3""\n ]\n }\n }\n ]\n ]\n }\n]","[\n {\n ""interactions"": [\n [\n {\n ""subtypes"": [\n ""missing interaction""\n ],\n ""start"": {\n ""unique_id"": ""05012_19"",\n ""names"": [\n ""Parkinson disease protein 7""\n ],\n ""synonyms"": [\n ""PARK7""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05012_347"",\n ""names"": [\n ""Nuclear factor erythroid 2-related factor 2"",\n ""Heme-binding protein 1""\n ],\n ""synonyms"": [\n ""NFE2L2"",\n ""NRF2"",\n ""HEBP1""\n ]\n }\n },\n {\n ""subtypes"": [\n ""expression""\n ],\n ""start"": {\n ""unique_id"": ""05012_347"",\n ""names"": [\n ""Nuclear factor erythroid 2-related factor 2"",\n ""Heme-binding protein 1""\n ],\n ""synonyms"": [\n ""NFE2L2"",\n ""NRF2"",\n ""HEBP1""\n ]\n },\n ""type"": ""GErel"",\n ""end"": {\n ""unique_id"": ""05012_362"",\n ""names"": [\n ""Thioredoxin"",\n ""Thioredoxin, mitochondrial"",\n ""Histone-lysine N-methyltransferase 2B""\n ],\n ""synonyms"": [\n ""TRX2"",\n ""TXN"",\n ""TXN2""\n ]\n }\n }\n ],\n [\n {\n ""subtypes"": [\n ""missing interaction""\n ],\n ""start"": {\n ""unique_id"": ""05012_19"",\n ""names"": [\n ""Parkinson disease protein 7""\n ],\n ""synonyms"": [\n ""PARK7""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05012_338"",\n ""names"": [\n ""Cellular tumor antigen p53""\n ],\n ""synonyms"": [\n ""P53"",\n ""TP53""\n ]\n }\n },\n {\n ""subtypes"": [\n ""expression""\n ],\n ""start"": {\n ""unique_id"": ""05012_338"",\n ""names"": [\n ""Cellular tumor antigen p53""\n ],\n ""synonyms"": [\n ""P53"",\n ""TP53""\n ]\n },\n ""type"": ""GErel"",\n ""end"": {\n ""unique_id"": ""05012_361"",\n ""names"": [\n ""Dual specificity protein phosphatase 1""\n ],\n ""synonyms"": [\n ""PTPN10"",\n ""CL100"",\n ""MKP1"",\n ""DUSP1""\n ]\n }\n }\n ],\n [\n {\n ""subtypes"": [\n ""missing interaction""\n ],\n ""start"": {\n ""unique_id"": ""05012_326"",\n ""names"": [\n ""Parkinson disease protein 7""\n ],\n ""synonyms"": [\n ""PARK7""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05012_327"",\n ""names"": [\n ""Bcl-2-like protein 1""\n ],\n ""synonyms"": [\n ""BCL2L"",\n ""BCL2L1"",\n ""BCLX""\n ]\n }\n },\n {\n ""subtypes"": [\n ""missing interaction""\n ],\n ""start"": {\n ""unique_id"": ""05012_327"",\n ""names"": [\n ""Bcl-2-like protein 1""\n ],\n ""synonyms"": [\n ""BCL2L"",\n ""BCL2L1"",\n ""BCLX""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05012_316"",\n ""names"": [\n ""Apoptosis regulator BAX""\n ],\n ""synonyms"": [\n ""BAX"",\n ""BCL2L4""\n ]\n }\n },\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""05012_316"",\n ""names"": [\n ""Apoptosis regulator BAX""\n ],\n ""synonyms"": [\n ""BAX"",\n ""BCL2L4""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05012_25"",\n ""names"": [\n ""Cytochrome c""\n ],\n ""synonyms"": [\n ""CYC"",\n ""CYCS""\n ]\n }\n },\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""05012_25"",\n ""names"": [\n ""Cytochrome c""\n ],\n ""synonyms"": [\n ""CYC"",\n ""CYCS""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05012_152"",\n ""names"": [\n ""Caspase-9""\n ],\n ""synonyms"": [\n ""CASP9"",\n ""MCH6""\n ]\n }\n },\n {\n ""subtypes"": [\n ""activation""\n ],\n ""start"": {\n ""unique_id"": ""05012_152"",\n ""names"": [\n ""Caspase-9""\n ],\n ""synonyms"": [\n ""CASP9"",\n ""MCH6""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05012_153"",\n ""names"": [\n ""Caspase-3""\n ],\n ""synonyms"": [\n ""CPP32"",\n ""CASP3""\n ]\n }\n }\n ]\n ]\n }\n]",True
4,What downstream genes are affected by Kelch-like ECH-associated protein 1?,"MATCH (start: Gene)\nWHERE ('Kelch-like ECH-associated protein 1' IN start.names) OR ('Kelch-like ECH-associated protein 1' IN start.synonyms)\nCALL apoc.path.expandConfig(start, {relationshipFilter: 'INTERACTS_WITH>', minLevel: 1, uniqueness: 'NODE_PATH', bfs: false}) \nYIELD path\nWITH path\nWHERE NOT EXISTS {\n MATCH (lastNode)-[:INTERACTS_WITH]->(:Gene)\n WHERE lastNode = last(nodes(path))\n}\nWITH path, [rel IN relationships(path) | {start: startNode(rel), end: endNode(rel), type: rel.type, subtypes: rel.subtypes}] AS relationships\nRETURN collect(relationships) AS interactions","MATCH (start: Gene)\n WHERE 'Kelch-like ECH-associated protein 1' IN start.names\n CALL apoc.path.expandConfig(start, {relationshipFilter: 'INTERACTS_WITH>', minLevel: 1, uniqueness: 'NODE_PATH', bfs: false}) \n YIELD path\n WITH path\n WHERE NOT EXISTS {\n MATCH (lastNode)-[:INTERACTS_WITH]->(:Gene)\n WHERE lastNode = last(nodes(path))\n }\n WITH path, [rel IN relationships(path) | {start: startNode(rel), end: endNode(rel), type: rel.type, subtypes: rel.subtypes}] AS relationships\n RETURN collect(relationships) AS interactions","[\n {\n ""interactions"": [\n [\n {\n ""subtypes"": [\n ""inhibition""\n ],\n ""start"": {\n ""unique_id"": ""05012_714"",\n ""names"": [\n ""Kelch-like ECH-associated protein 1""\n ],\n ""synonyms"": [\n ""KLHL19"",\n ""KEAP1""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05012_347"",\n ""names"": [\n ""Nuclear factor erythroid 2-related factor 2"",\n ""Heme-binding protein 1""\n ],\n ""synonyms"": [\n ""NFE2L2"",\n ""NRF2"",\n ""HEBP1""\n ]\n }\n },\n {\n ""subtypes"": [\n ""expression""\n ],\n ""start"": {\n ""unique_id"": ""05012_347"",\n ""names"": [\n ""Nuclear factor erythroid 2-related factor 2"",\n ""Heme-binding protein 1""\n ],\n ""synonyms"": [\n ""NFE2L2"",\n ""NRF2"",\n ""HEBP1""\n ]\n },\n ""type"": ""GErel"",\n ""end"": {\n ""unique_id"": ""05012_362"",\n ""names"": [\n ""Thioredoxin"",\n ""Thioredoxin, mitochondrial"",\n ""Histone-lysine N-methyltransferase 2B""\n ],\n ""synonyms"": [\n ""TRX2"",\n ""TXN"",\n ""TXN2""\n ]\n }\n }\n ]\n ]\n }\n]","[\n {\n ""interactions"": [\n [\n {\n ""subtypes"": [\n ""inhibition""\n ],\n ""start"": {\n ""unique_id"": ""05012_714"",\n ""names"": [\n ""Kelch-like ECH-associated protein 1""\n ],\n ""synonyms"": [\n ""KLHL19"",\n ""KEAP1""\n ]\n },\n ""type"": ""PPrel"",\n ""end"": {\n ""unique_id"": ""05012_347"",\n ""names"": [\n ""Nuclear factor erythroid 2-related factor 2"",\n ""Heme-binding protein 1""\n ],\n ""synonyms"": [\n ""NFE2L2"",\n ""NRF2"",\n ""HEBP1""\n ]\n }\n },\n {\n ""subtypes"": [\n ""expression""\n ],\n ""start"": {\n ""unique_id"": ""05012_347"",\n ""names"": [\n ""Nuclear factor erythroid 2-related factor 2"",\n ""Heme-binding protein 1""\n ],\n ""synonyms"": [\n ""NFE2L2"",\n ""NRF2"",\n ""HEBP1""\n ]\n },\n ""type"": ""GErel"",\n ""end"": {\n ""unique_id"": ""05012_362"",\n ""names"": [\n ""Thioredoxin"",\n ""Thioredoxin, mitochondrial"",\n ""Histone-lysine N-methyltransferase 2B""\n ],\n ""synonyms"": [\n ""TRX2"",\n ""TXN"",\n ""TXN2""\n ]\n }\n }\n ]\n ]\n }\n]",True
