### **Load Environment variables from .env file**

In [1]:
from langchain_openai import AzureOpenAIEmbeddings
from dotenv import load_dotenv
import os
from IPython.display import display, HTML, JSON, Markdown, Image
from neo4j import GraphDatabase
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter
from langchain import PromptTemplate
from langchain_core.messages import HumanMessage
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.vectorstores import Neo4jVector
from tenacity import retry, wait_random_exponential, stop_after_attempt
from openai import AzureOpenAI

# test
import textwrap
import pandas as pd

# Populate the process environment from a local .env file (if one exists).
load_dotenv()

# --- Azure OpenAI settings (each is None when the variable is not set) ---
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_DEPLOYMENT_ENDPOINT = os.environ.get("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_GPT4_32k_DEPLOYMENT_NAME = os.environ.get("OPENAI_GPT4_32k_DEPLOYMENT_NAME")
OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.environ.get("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
api_version = "2024-02-01"

# --- Neo4j connection settings ---
NEO4J_URI = os.environ.get("NEO4J_URI")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")

# Embedding client used by calc_embeddings and the vector index.
# No API key is passed explicitly here.
# NOTE(review): presumably the key is resolved from the environment by the
# library -- confirm.
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
    model=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
    openai_api_version=api_version,
    chunk_size=1,
)

# Chat-completions client used by call_openAI.
# NOTE(review): this client pins API version "2023-05-15" while the embeddings
# client above uses `api_version` ("2024-02-01") -- confirm the difference is
# intentional.
clientOpenAI = AzureOpenAI(
    api_key=OPENAI_API_KEY,
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
    api_version="2023-05-15",
)


In [2]:
def call_openAI_for_NER(question, answer):
    """Send a question plus supporting information to the chat model.

    Args:
        question: The user's question; interpolated into the user message.
        answer: The supporting information; interpolated into the user message.

    Returns:
        Whatever ``call_openAI`` returns for the assembled messages.

    NOTE(review): despite the "NER" name, the system prompt is the same
    answer-the-question prompt used by ``call_openAI_for_final_answer`` and
    contains no entity-extraction instruction -- confirm this is intended.
    """
    system_message = {
        "role": "system",
        "content": "You are a HELPFUL assistant answering users questions. Answer the question using the provided information and do not add anything else.",
    }
    user_message = {
        "role": "user",
        "content": f"Question: {question}\nInformation: {answer}",
    }
    return call_openAI([system_message, user_message])


def call_openAI_for_final_answer(question, answer):
    """Ask the chat model to answer ``question`` using only ``answer``.

    Args:
        question: The user's question; interpolated into the user message.
        answer: The supporting information the model is told to rely on.

    Returns:
        Whatever ``call_openAI`` returns for the assembled messages.
    """
    system_message = {
        "role": "system",
        "content": "You are a HELPFUL assistant answering users questions. Answer the question using the provided information and do not add anything else.",
    }
    user_message = {
        "role": "user",
        "content": f"Question: {question}\nInformation: {answer}",
    }
    return call_openAI([system_message, user_message])



@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def call_openAI(messages):
    """Run one chat completion and return the text of the first choice.

    The tenacity decorator retries the call with a randomised exponential
    wait (min=1, max=20) and stops after 6 attempts.

    Args:
        messages: Chat messages passed through unchanged to the
            chat-completions API.

    Returns:
        The ``content`` of the first choice's message.
    """
    # Sampling/request parameters, kept together so they are easy to tune.
    request_options = dict(
        model=OPENAI_GPT4_32k_DEPLOYMENT_NAME,
        messages=messages,
        temperature=0.7,
        max_tokens=800,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )
    completion = clientOpenAI.chat.completions.create(**request_options)
    first_choice = completion.choices[0]
    return first_choice.message.content

# The tenacity decorator adds delays and retries around the embeddings call
# to avoid hitting throttling limits.
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def calc_embeddings(text):
    """Return the embedding of ``text`` from the module-level ``embeddings`` client.

    The call is retried with a randomised exponential wait (min=1, max=20)
    and stops after 6 attempts.

    Args:
        text: The string to embed; newlines are replaced with spaces first.

    Returns:
        The result of ``embeddings.embed_query`` for the cleaned text.
    """
    # replace newlines, which can negatively affect performance.
    txt = text.replace("\n", " ")
    return embeddings.embed_query(txt)

def prettyprint(text: str) -> None:
    """Print ``text`` wrapped to a width of 60 characters.

    Args:
        text: The string to wrap and print.

    Returns:
        None. The wrapped text is written to stdout, not returned.
    """
    print(textwrap.fill(text, 60))

In [3]:
# Split each PDF into chunks of at most 5000 characters with a 100-character
# overlap. No separators are passed, so the splitter's defaults apply
# (documented as ["\n\n", "\n", " ", ""]).
import uuid  # not used in this cell; kept in case other cells rely on it

splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000,
    chunk_overlap=100,
)

pdf_folder_path = "./data/test/"
print(os.listdir(pdf_folder_path))

# NOTE(review): `llm_transformer` and `graph` are not defined in the visible
# cells -- they must be created before this cell runs.
for file in os.listdir(pdf_folder_path):
    if not file.endswith('.pdf'):
        continue

    pdf_path = os.path.join(pdf_folder_path, file)
    loader = PyPDFLoader(pdf_path)
    # chunk the file
    pages = loader.load_and_split(text_splitter=splitter)
    graph_documents = llm_transformer.convert_to_graph_documents(pages)

    # Guard against PDFs that yield nothing: indexing [0] below would raise
    # IndexError and abort ingestion of the remaining files.
    if not graph_documents:
        print(f"No graph documents extracted from {file}; skipping.")
        continue

    graph.add_graph_documents(graph_documents,baseEntityLabel=True,include_source=True)
    # Show only the first chunk's extraction as a quick sanity check.
    print(f"Nodes:{graph_documents[0].nodes}")
    print(f"Relationships:{graph_documents[0].relationships}")



['20071229X02007.pdf', '20071231X02009.pdf', '20080102X00002.pdf']
Nodes:[Node(id='National Transportation Safety Board', type='Organization'), Node(id='Tallahassee, Fl', type='Location'), Node(id='Nyc08Ca055', type='Accident'), Node(id='12/08/2007, 1730 Est', type='Date_time'), Node(id='N5921D', type='Aircraft_registration'), Node(id='Piper Pa-22-150', type='Aircraft'), Node(id='Part 91: General Aviation - Personal', type='Flight_rules'), Node(id='Pilot', type='Person'), Node(id='1,500-Foot-Long Turf Strip', type='Airstrip'), Node(id='2,319 Hours', type='Flight_experience'), Node(id='17 Hours', type='Recent_flight_experience'), Node(id='Obstacles', type='Object')]
Relationships:[Relationship(source=Node(id='National Transportation Safety Board', type='Organization'), target=Node(id='Nyc08Ca055', type='Accident'), type='INVESTIGATED'), Relationship(source=Node(id='Nyc08Ca055', type='Accident'), target=Node(id='Tallahassee, Fl', type='Location'), type='OCCURRED_IN'), Relationship(source

In [6]:
# Build a vector index over the existing `Document` nodes in Neo4j, reading
# their `text` property and using `embedding` as the embedding property.
# The connection URL comes from NEO4J_URI (loaded from the environment in the
# configuration cell), matching how the username and password are supplied,
# instead of relying on a separate `url` variable.
vector_index = Neo4jVector.from_existing_graph(
    embeddings,
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding",
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD
)

In [11]:
from langchain.chains import GraphCypherQAChain

# Natural-language question to be translated into Cypher by the chain.
query = "How many aviation accidents occured in France?"

# NOTE(review): `graph` and `llm` are not defined in the visible cells -- they
# must exist before this cell runs.
chain = GraphCypherQAChain.from_llm(
    llm=llm,
    graph=graph,
    top_k=2,
    verbose=True,
)

# NOTE(review): in the recorded run the generated Cypher filters on an exact
# match (l.id = "France"), while ingested Location ids look like
# 'Tallahassee, Fl'. A count of 0 may reflect id formatting rather than an
# absence of accidents -- verify before trusting the answer.
response = chain.invoke({"query": query})
response



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (a:Accident)-[:OCCURRED_IN]->(l:Location) WHERE l.id = "France" RETURN COUNT(a)[0m
Full Context:
[32;1m[1;3m[{'COUNT(a)': 0}][0m

[1m> Finished chain.[0m


{'query': 'How many aviation accidents occured in France?',
 'result': 'There were no aviation accidents recorded in France.'}