In [1]:
import os
import openai
import networkx as nx
import xml.etree.ElementTree as ET
import requests
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains import GraphCypherQAChain
from langchain.graphs import Neo4jGraph
from neo4j import GraphDatabase

In [3]:
PARENT_PATH = Path.cwd().parent
if 'publishingchatgptpocweb' not in str(PARENT_PATH):
    PARENT_PATH = PARENT_PATH / 'publishingchatgptpocweb'

CORE_DIRECTORY = PARENT_PATH / 'core'
DATA_DIRECTORY = PARENT_PATH / 'data'

SPAQRQL_QUERY_FILE_PATH = CORE_DIRECTORY / 'sparql_query_template.sparql'
JATS_DATA_DIRECTORY_PATH = DATA_DIRECTORY / 'raw'
BASIC_GRAPH_SAVED_FILE_PATH = DATA_DIRECTORY / 'processed' / 'basic_graph.gml'
OPEN_AI_GRAPH_SAVED_FILE_PATH = DATA_DIRECTORY / 'processed' / 'open_ai_graph.gml'
USE_OPEN_AI=True

In [4]:
OPEN_AI_API_SECRET = os.getenv('PublishingChatGPT_Open_AI_API_Secret')
openai.api_key = OPEN_AI_API_SECRET

In [5]:
graph = Neo4jGraph(
    url=os.getenv('PublishingChatGPT_Neo4J_URL'), username=os.getenv('PublishingChatGPT_Neo4J_UserName'), password=os.getenv('PublishingChatGPT_Neo4J_Password')
)

In [6]:
# graph.query(
#     """
# MERGE (a:Article {pan: "20183000178", title: "The ebola epidemic in West Africa: proceedings of a workshop.", abstract: "This proceedings summarizes the presentations and discussions of the workshop convened by the Forum of Microbial Threats in March 2015, and consists of the information presented, questions raised, and improvements recommended by individual workshop participants. Chapter 2 outlines the trajectory and response to the most recent Ebola outbreak that occurred in West Africa. It includes lessons learned from previous outbreaks that could have been applied to this one, the distinction between this and previous outbreaks, and the challenges in Ebola treatment and control. Chapter 3 discusses case examples of emergency response and preparedness around the world. Finally, Chapter 4 examines current and future research opportunities to study the transmission, diagnosis, proper containment of the ill, and vaccine development. It also discusses the importance of developing a global health risk framework in case of another Ebola-like disease outbreak."})

# MERGE (auth1:Author {name: "A.Mack"})
# MERGE (auth2:Author {name: "M. R. Snair"})
# MERGE (auth3:Author {name: "C.Mundaca-Shah"})

# MERGE (a)-[:HAS_AUTHOR]->(auth1)
# MERGE (a)-[:HAS_AUTHOR]->(auth2)
# MERGE (a)-[:HAS_AUTHOR]->(auth3)

# MERGE (c1:Concept {name: "diagnosis", uri: "https://id.cabi.org/cabt/38337"})
# MERGE (c2:Concept {name: "Ebolavirus", uri: "https://id.cabi.org/cabt/41510"})
# MERGE (c3:Concept {name: "West Africa", uri: "https://id.cabi.org/cabt/124364"})
# MERGE (c4:Concept {name: "Filoviridae", uri: "https://id.cabi.org/cabt/48196"})
# MERGE (c5:Concept {name: "Bundibugyo ebolaviru", uri: "https://id.cabi.org/cabt/275494"})
# MERGE (c6:Concept {name: "Africa South of Sahara", uri: "https://id.cabi.org/cabt/6730"})
# MERGE (c7:Concept {name: "Sahel", uri: "https://id.cabi.org/cabt/18171"})
# MERGE (c8:Concept {name: "Benin", uri: "https://id.cabi.org/cabt/18171"})

# MERGE (a)-[:HAS_PREFERRED_TERM]->(c1)
# MERGE (a)-[:HAS_ORGANISM_TERM]->(c2)
# MERGE (a)-[:HAS_GEOGRAPHIC_TERM]->(c3)

# MERGE (c2)-[:BROADER_TERM]->(c4)
# MERGE (c2)-[:NARROWER_TERM]->(c5)

# MERGE (c3)-[:BROADER_TERM]->(c6)
# MERGE (c3)-[:NARROWER_TERM]->(c8)
# MERGE (c3)-[:RELATED_TERM]->(c7)

# """
# )

graph.refresh_schema()

In [7]:
print(graph.get_schema)


        Node properties are the following:
        [{'properties': [{'property': 'publishing_date', 'type': 'STRING'}, {'property': 'document_type', 'type': 'STRING'}, {'property': 'vector', 'type': 'LIST'}, {'property': 'info', 'type': 'STRING'}, {'property': 'isbn', 'type': 'STRING'}, {'property': 'id', 'type': 'STRING'}, {'property': 'pan', 'type': 'STRING'}, {'property': 'title', 'type': 'STRING'}, {'property': 'source', 'type': 'STRING'}, {'property': 'uri', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}], 'labels': 'PublishingDataChunk'}]
        Relationship properties are the following:
        []
        The relationships are the following:
        ['(:PublishingDataChunk)-[:HAS_GEOGRAPHIC_TERM]->(:PublishingDataChunk)', '(:PublishingDataChunk)-[:HAS_PREFERRED_TERM]->(:PublishingDataChunk)', '(:PublishingDataChunk)-[:HAS_ORGANISM_TERM]->(:PublishingDataChunk)', '(:PublishingDataChunk)-[:RELATED_TERM]->(:PublishingDataChunk)', '(:PublishingDataChunk)-[:NARROWER_TERM

In [37]:
from langchain.prompts.prompt import PromptTemplate


# CYPHER_GENERATION_TEMPLATE = """Task:Generate Cypher statement to query a graph database.
# Instructions:
# Use only the provided relationship types and properties in the schema.
# Do not use any other relationship types or properties that are not provided.
# Schema:
# {schema}
# Note: Do not include any explanations or apologies in your responses.
# Ensure to handle the results of the Cypher query correctly in the subsequent stages of your application to convey the correct results to the user.
# Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
# Do not include any text except the generated Cypher statement.
# Examples: Here are a few examples of generated Cypher statements for particular questions:
# # What organisms are mentioned in article about ebola epidemic in West Africa?
# MATCH (a:Article)
# WHERE a.title =~ '.*ebola epidemic in West Africa.*'
# MATCH (a)-[:HAS_ORGANISM_TERM]->(concept:Concept)
# RETURN concept.name

# # Which articles are written by M. R. Snair?
# MATCH (a:Author)
# WHERE a.name =~ '.*M\\. R\\. Snair.*'
# MATCH (a)-[:HAS_AUTHOR]-(article:Article)
# RETURN article.title

# # Who are the authors for article ebola epidemic in West Africa?
# MATCH (a:Article)
# WHERE a.title =~ '.*ebola epidemic in West Africa.*'
# MATCH (a)-[:HAS_AUTHOR]->(author:Author)
# RETURN author.name

# The question is:
# {question}"""

CYPHER_GENERATION_TEMPLATE = """Task:Generate Cypher statement to query a graph database.
Instructions:
Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided.
Schema:
{schema}
Note: Do not include any explanations or apologies in your responses.
Ensure to handle the results of the Cypher query correctly in the subsequent stages of your application to convey the correct results to the user.
Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.
Examples: Here are a few examples of generated Cypher statements for particular questions:
# What organisms are mentioned in article about ebola epidemic in West Africa?
MATCH (a:Article)
WHERE a.title =~ '.*ebola epidemic in West Africa.*'
MATCH (a)-[:HAS_ORGANISM_TERM]->(concept:Concept)
RETURN concept.name

#Give me 5 most latest published articles
MATCH (a:PublishingDataChunk)
WHERE a.document_type = 'article'
RETURN a.title, a.publishing_date
ORDER BY a.publishing_date DESC
LIMIT 5

# Which articles are written by M. R. Snair?
MATCH (a:PublishingDataChunk)
WHERE a.info CONTAINS '\n    Authors:\n        - M. R. Snair'
WITH split(a.info, '\n') AS lines
WITH lines, [i IN range(0, size(lines) - 1) WHERE toLower(trim(lines[i])) CONTAINS 'title:'] AS titleIndices
WITH lines, titleIndices, titleIndices[0] AS titleIdx
RETURN trim(replace(lines[titleIdx], "Title:", "")) AS Article_Title


# Who are the authors for article ebola epidemic in West Africa?
MATCH (a:PublishingDataChunk)
WHERE toLower(a.info) CONTAINS 'title: the ebola epidemic in west africa'
AND toLower(a.info) CONTAINS 'authors:'
WITH split(a.info, '\n') AS lines
WITH lines, [i IN range(0, size(lines) - 1) WHERE toLower(trim(lines[i])) STARTS WITH 'authors'] AS authorLineIndices
UNWIND authorLineIndices AS idx
WITH lines[idx + 1] AS authorLine
RETURN trim(replace(authorLine, "-", "")) AS Author_Name

The question is:
{question}"""



CYPHER_GENERATION_PROMPT = PromptTemplate(
    input_variables=["schema", "question"], template=CYPHER_GENERATION_TEMPLATE
)

chain = GraphCypherQAChain.from_llm(
    ChatOpenAI(model=os.getenv('PublishingChatGPT_Open_AI_Model'), openai_api_key=OPEN_AI_API_SECRET, temperature=0), graph=graph, verbose=True, cypher_prompt=CYPHER_GENERATION_PROMPT
)

# chain = GraphCypherQAChain.from_llm(
#      graph=graph,
#      cypher_llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo", openai_api_key=OPEN_AI_API_SECRET),
#      qa_llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo", openai_api_key=OPEN_AI_API_SECRET),
#      cypher_prompt=CYPHER_GENERATION_PROMPT,
#      verbose=True,
# )

In [28]:
chain.run("What organisms are mentioned in article about ebola epidemic in West Africa?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (a:PublishingDataChunk)
WHERE a.title =~ '.*ebola epidemic in West Africa.*'
MATCH (a)-[:HAS_ORGANISM_TERM]->(concept:PublishingDataChunk)
RETURN concept.name[0m
Full Context:
[32;1m[1;3m[{'concept.name': 'man'}, {'concept.name': 'Ebolavirus'}][0m

[1m> Finished chain.[0m


'The organisms mentioned in the article about the Ebola epidemic in West Africa are man and Ebolavirus.'

In [29]:
chain.run("Who are the authors for article ebola epidemic in West Africa?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (a:PublishingDataChunk)
WHERE toLower(a.info) CONTAINS 'title: the ebola epidemic in west africa'
AND toLower(a.info) CONTAINS 'authors:'
WITH split(a.info, '
') AS lines
WITH lines, [i IN range(0, size(lines) - 1) WHERE toLower(trim(lines[i])) STARTS WITH 'authors'] AS authorLineIndices
UNWIND authorLineIndices AS idx
WITH lines[idx + 1] AS authorLine
RETURN trim(replace(authorLine, "-", "")) AS Author_Name[0m
Full Context:
[32;1m[1;3m[{'Author_Name': 'A. Mack'}, {'Author_Name': 'M. R. Snair'}, {'Author_Name': 'C. MundacaShah'}][0m

[1m> Finished chain.[0m


'The authors for the article on the Ebola epidemic in West Africa are A. Mack, M. R. Snair, and C. MundacaShah.'

In [31]:
chain.run("Which article is written by M. R. Snair?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (a:PublishingDataChunk)
WHERE a.info CONTAINS '
    Authors:
        - M. R. Snair'
WITH split(a.info, '
') AS lines
WITH lines, [i IN range(0, size(lines) - 1) WHERE toLower(trim(lines[i])) CONTAINS 'title:'] AS titleIndices
WITH lines, titleIndices, titleIndices[0] AS titleIdx
RETURN trim(replace(lines[titleIdx], "Title:", "")) AS Article_Title[0m
Full Context:
[32;1m[1;3m[{'Article_Title': 'The ebola epidemic in West Africa: proceedings of a workshop.'}][0m

[1m> Finished chain.[0m


"I'm sorry, but I don't have any information about an article written by M. R. Snair."

In [38]:
chain.run("Give me 5 most latest published articles")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (a:PublishingDataChunk)
WHERE a.document_type = 'article'
RETURN a.title, a.publishing_date
ORDER BY a.publishing_date DESC
LIMIT 5[0m
Full Context:
[32;1m[1;3m[{'a.title': 'Special Issue: British HIV Association guidelines for the management of HIV-2 2021.', 'a.publishing_date': '2022-09-29'}, {'a.title': 'Micronutrient malnutrition across the life course, sarcopenia and frailty. Winter Conference Live 2020, 8-9 December 2020.', 'a.publishing_date': '2022-09-27'}, {'a.title': 'Irish Section Conference 2021: Nutrition, health and ageing - translating science into practice, Online, 22-24 June 2021.', 'a.publishing_date': '2022-09-27'}, {'a.title': 'Global Health: New Trends. Coimbra Health School Annual Meeting 2021, Coimbra, Portugal, 17-19 June 2021.', 'a.publishing_date': '2022-09-26'}, {'a.title': 'IEA World Congress of Epidemiology 2021, 3-6 September 2021.', 'a.publishing_date': '2022-09-2

'The five most recently published articles are:\n\n1. "Special Issue: British HIV Association guidelines for the management of HIV-2 2021." published on September 29, 2022.\n2. "Micronutrient malnutrition across the life course, sarcopenia and frailty. Winter Conference Live 2020, 8-9 December 2020." published on September 27, 2022.\n3. "Irish Section Conference 2021: Nutrition, health and ageing - translating science into practice, Online, 22-24 June 2021." published on September 27, 2022.\n4. "Global Health: New Trends. Coimbra Health School Annual Meeting 2021, Coimbra, Portugal, 17-19 June 2021." published on September 26, 2022.\n5. "IEA World Congress of Epidemiology 2021, 3-6 September 2021." published on September 26, 2022.'