In [54]:
import os
import openai
import requests
import re
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains import GraphCypherQAChain
from langchain.graphs import Neo4jGraph
from langchain.document_loaders import TextLoader
from neo4j import GraphDatabase
from langchain.vectorstores import Neo4jVector
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import PromptTemplate

In [55]:
PARENT_PATH = Path.cwd().parent
if 'publishingchatgptpocweb' not in str(PARENT_PATH):
    PARENT_PATH = PARENT_PATH / 'publishingchatgptpocweb'

DATA_DIRECTORY = PARENT_PATH / 'data'

JATS_DATA_DIRECTORY_PATH = DATA_DIRECTORY / 'raw'
ARTICLES_DATA_DIRECTORY_PATH = DATA_DIRECTORY / 'processed' / 'articles'
CONCEPTS_DATA_DIRECTORY_PATH = DATA_DIRECTORY / 'processed' / 'concepts'

In [56]:
OPEN_AI_API_SECRET = os.getenv('Open__AI__API__Secret')
openai.api_key = OPEN_AI_API_SECRET

In [57]:
embeddings = OpenAIEmbeddings(openai_api_key=OPEN_AI_API_SECRET, model="text-embedding-ada-002")
local_index = Neo4jVector.from_existing_index(
    embeddings,
    url=os.getenv('Local__Neo4J__URL'),
    username=os.getenv('Local__Neo4J__UserName'),
    password=os.getenv('Local__Neo4J__Password'),
    database=os.getenv('Local__Neo4J__Database'),
    index_name=os.getenv('Local__Neo4J__PrimaryIndexName'),  
    text_node_property="info", 
)

In [60]:

TEMPLATE = """As EVA (Expert Virtual Assistance), your role is to assist the user with their queries regarding articles, concepts, etc. 
This response should be extracted based on the provided A&I records enclosed between <ctx></ctx> tags.
The response should be divided into two parts, Answer part and Source part.
Ensure to adhere to the guidelines and example below to develop well-organized response.

1. **Use only the provided A&I records for your response**:   

2. **Do not use any other knowledge**:
   Avoid using data or knowledge that is not present in the provided A&I records.

3. **Structure and Content of Answer part**:   
   Ensure to identify and encapsulate all possible Thesaurus Concepts within anchor tags, linking them to the correct concept URIs.
   Concept URI usually looks like 'https://id.cabi.org/cabt/74668'.
   Never mention Article Source in this part

4. **Structure and Content of Source part**:
   Beneath your Answer part, Provide at least 1 to at maximum 3 source articles from which the above answer part has been derived from.
   Article Source usually looks like 'https://www.cabidigitallibrary.org/doi/10.5555/20183000178'.
   Never mention Concept URI in this part

5. **Avoid Mentioning the Absence of Information**:
   If the provided A&I records lacks sufficient information to answer a query, refrain from stating "not mentioned in the A&I records." Instead, indicate the limitations of your training.

6. **Do not include any explanations or apologies in your responses**   

Example:
Q: What can you tell me about molecular epidemiology?
A: <p><a href="Concept URI 1">Molecular epidemiology</a> leverages <a href="Concept URI 2">molecular biology</a> techniques and genetic information to elucidate disease dynamics in populations, affording a deeper comprehension of disease patterns and <a href="Concept URI 3">transmission</a> dynamics.</p>
<p>..next para but structured similarly as above</p>
... (continue for other paragraphs)
<br/>
<h5>Source Articles</h5>
<ul>
   <li><a href="Article Source 1">Articles Title 1</a></li>
   <li><a href="Article Source 2">Articles Title 2</a></li>
   <li><a href="Article Source 3">Articles Title 3</a></li>
</ul>

------
<ctx>
{summaries}
</ctx>
------
Q: {question}
A: 
"""
    
PROMPT = PromptTemplate(
    template=TEMPLATE, input_variables=["summaries", "question"]
)
chain_type_kwargs = {"prompt": PROMPT,  "verbose": False}


llm = ChatOpenAI(
    model_name="gpt-4",
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
    temperature=0,
    openai_api_key=OPEN_AI_API_SECRET,
)

chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff", 
    retriever=local_index.as_retriever(),
    chain_type_kwargs=chain_type_kwargs
)

def test(question):
    return chain({"question": question},return_only_outputs=False)

In [51]:
# pan doesn't work
test("Which article has pan 20183000178?")

The article with PAN 20183000178 is not mentioned in the provided A&I records.

{'question': 'Which article has pan 20183000178?',
 'answer': 'The article with PAN 20183000178 is not mentioned in the provided A&I records.',
 'sources': ''}

In [40]:
# title works
test("Which article is about Environmental Transmission and Control of COVID-19?")

<p>The article titled <a href="https://www.cabidigitallibrary.org/doi/10.5555/20210326597">Special Issue: Environmental Transmission and Control of COVID-19</a> is specifically focused on the environmental transmission and behavior of SARS-CoV-2.</p>

<h5>Source Articles</h5>
<ul>
   <li><a href="https://www.cabidigitallibrary.org/doi/10.5555/20210326597">Special Issue: Environmental Transmission and Control of COVID-19</a></li>
</ul>

{'question': 'Which article is about Environmental Transmission and Control of COVID-19?',
 'answer': '<p>The article titled <a href="https://www.cabidigitallibrary.org/doi/10.5555/20210326597">Special Issue: Environmental Transmission and Control of COVID-19</a> is specifically focused on the environmental transmission and behavior of SARS-CoV-2.</p>\n\n<h5>Source Articles</h5>\n<ul>\n   <li><a href="https://www.cabidigitallibrary.org/doi/10.5555/20210326597">Special Issue: Environmental Transmission and Control of COVID-19</a></li>\n</ul>',
 'sources': ''}

In [42]:
# publishing date and location works
test("Which article was published in Washington, DC on 17th of January, 2018?")

<p>The article titled "Preventing bullying through science, policy, and practice" was published in Washington, DC on the 17th of January, 2018.</p>

<h5>Source Articles</h5>
<ul>
   <li><a href="https://www.cabidigitallibrary.org/doi/10.5555/20183014770">Preventing bullying through science, policy, and practice</a></li>
</ul>

{'question': 'Which article was published in Washington, DC on 17th of January, 2018?',
 'answer': '<p>The article titled "Preventing bullying through science, policy, and practice" was published in Washington, DC on the 17th of January, 2018.</p>\n\n<h5>Source Articles</h5>\n<ul>\n   <li><a href="https://www.cabidigitallibrary.org/doi/10.5555/20183014770">Preventing bullying through science, policy, and practice</a></li>\n</ul>',
 'sources': ''}

In [43]:
test("Which article has isbn 9780309440677?")
# docs_with_score = local_index.similarity_search_with_score("Which article has isbn 9780309440677?")
# docs_with_score = local_index.similarity_search_with_score("Which article has giant reed as preferred term?")
# for doc, score in docs_with_score:
#     print("-" * 80)
#     print("Score: ", score)
#     print(doc.page_content)
#     print("-" * 80)

The A&I records do not mention any article with the ISBN 9780309440677.

{'question': 'Which article has isbn 9780309440677?',
 'answer': 'The A&I records do not mention any article with the ISBN 9780309440677.',
 'sources': ''}

In [61]:
test("What can you tell me about Metrics that matter for population health action?")

<p>The "Metrics that matter for population health action" is a workshop summary that discusses the United States' efforts to improve population health. It describes the current metrics landscape, including several important milestones (events and publications). The document also presents examples of how metrics are being used to drive improvements in <a href="https://id.cabi.org/cabt/74668">population health</a> in <a href="https://id.cabi.org/cabt/74669">communities</a>. Furthermore, it discusses the uses of metrics to assess <a href="https://id.cabi.org/cabt/74670">health equity</a> at the population level.</p>

<h5>Source Articles</h5>
<ul>
   <li><a href="https://www.cabidigitallibrary.org/doi/10.5555/20183000177">Metrics that matter for population health action: workshop summary</a></li>
</ul>

{'question': 'What can you tell me about Metrics that matter for population health action?',
 'answer': '<p>The "Metrics that matter for population health action" is a workshop summary that discusses the United States\' efforts to improve population health. It describes the current metrics landscape, including several important milestones (events and publications). The document also presents examples of how metrics are being used to drive improvements in <a href="https://id.cabi.org/cabt/74668">population health</a> in <a href="https://id.cabi.org/cabt/74669">communities</a>. Furthermore, it discusses the uses of metrics to assess <a href="https://id.cabi.org/cabt/74670">health equity</a> at the population level.</p>\n\n<h5>Source Articles</h5>\n<ul>\n   <li><a href="https://www.cabidigitallibrary.org/doi/10.5555/20183000177">Metrics that matter for population health action: workshop summary</a></li>\n</ul>',
 'sources': ''}