### **Load environment variables from the .env file and create the Azure OpenAI clients**

In [1]:
from langchain_openai import AzureOpenAIEmbeddings
from dotenv import load_dotenv
import os
from IPython.display import display, HTML, JSON, Markdown, Image
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from tenacity import retry, wait_random_exponential, stop_after_attempt
from openai import AzureOpenAI
import textwrap
import pandas as pd


# Read the .env file into the process environment, then pick out the Azure OpenAI settings.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_DEPLOYMENT_ENDPOINT = os.environ.get("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_GPT4_32k_DEPLOYMENT_NAME = os.environ.get("OPENAI_GPT4_32k_DEPLOYMENT_NAME")
OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.environ.get("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
api_version = "2024-02-01"

# Embeddings client, sending one text per request (chunk_size=1).
# NOTE(review): no API key is passed here, so it presumably comes from the environment - confirm.
embeddings = AzureOpenAIEmbeddings(
    chunk_size=1,
    openai_api_version=api_version,
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
    model=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
)

# Chat-completions client used to get answers from the GPT-4 deployment.
# NOTE(review): this client pins "2023-05-15" instead of reusing `api_version` above - confirm it is intentional.
clientOpenAI = AzureOpenAI(
    api_version="2023-05-15",
    api_key=OPENAI_API_KEY,
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
)


In [2]:
def call_openAI_for_final_answer(question, answer):
    """Ask the chat model to answer `question` from the supplied information.

    Args:
        question: The user's question, placed after "Question: " in the prompt.
        answer: The information the model is told to rely on, placed after
            "Information: " in the prompt.

    Returns:
        Whatever `call_openAI` returns for the assembled system + user messages.
    """
    system_prompt = "You are a HELPFUL assistant answering users questions. Answer the question using the provided information and do not add anything else."
    user_prompt = f"Question: {question}\nInformation: {answer}"
    return call_openAI([
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ])

@retry(stop=stop_after_attempt(6), wait=wait_random_exponential(min=1, max=20))
def call_openAI(messages):
    """Send `messages` to the GPT-4 deployment and return the first reply's text.

    The call is wrapped by tenacity: any exception triggers a retry with a random
    exponential wait (bounds 1 and 20), for at most 6 attempts.

    Args:
        messages: Chat messages (dicts with "role" and "content") passed to the
            chat completions API unchanged.

    Returns:
        The `content` of the first choice in the response.
    """
    request_options = dict(
        model=OPENAI_GPT4_32k_DEPLOYMENT_NAME,
        messages=messages,
        temperature=0.7,
        max_tokens=800,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )
    completion = clientOpenAI.chat.completions.create(**request_options)
    first_choice = completion.choices[0]
    return first_choice.message.content

# We use the tenacity library to create delays and retries when calling OpenAI embeddings to avoid hitting throttling limits
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def calc_embeddings(text):
    """Return the embedding of `text` computed by the module-level `embeddings` client.

    Any exception triggers a retry with a random exponential wait (bounds 1 and 20),
    for at most 6 attempts.

    Args:
        text: The string to embed.

    Returns:
        The result of `embeddings.embed_query` for the newline-free text.
    """
    # Replace newlines, which can negatively affect performance.
    return embeddings.embed_query(text.replace("\n", " "))

def prettyprint(text: str, width: int = 60) -> None:
    """Print `text` wrapped so that no line is longer than `width` characters.

    Args:
        text: The text to wrap and print.
        width: Maximum line length; defaults to 60, the value previously hard-coded.

    Returns:
        None. The wrapped text is written to stdout, not returned.
    """
    print(textwrap.fill(text, width))

In [3]:
# Use GPT-4 to extract entities from a text
def extract_entities(text):
    """Ask the chat model which entities appear in `text`.

    Args:
        text: The text to analyse; sent to the model as the user message.

    Returns:
        The model's reply as returned by `call_openAI`. The system prompt asks for
        a JSON object, but nothing here parses or validates the reply.
    """
    system_prompt = """
        You are an assistant designed to extract entities from a text. 
        Users will paste in a string of text and you will respond with entities you have extracted from the text as a JSON object.
        Here is an example of your output format:{aircraft_make:'',accident_number:'',aircraft_damage:'',city:'', state:'', country:'', phase_of_operation:'', pilot_flight_hours:'', injuries:[], engine_manufacturer:'', probable_cause:'', findings:[], ocurrence:[]}
    """
    return call_openAI([
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ])

In [4]:
# Split each PDF into chunks of at most 5000 characters with a 100-character overlap
# (separators tried in order: ["\n\n", "\n", " ", ""])
import uuid

splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000,
    chunk_overlap=100,
)

pdf_folder_path = "./data/test/"
# List the folder once, sorted, so the processing order is the same on every run
folder_entries = sorted(os.listdir(pdf_folder_path))
print(folder_entries)

# Rows are collected in plain lists and turned into DataFrames once at the end,
# which avoids growing a DataFrame one row at a time.
embedding_rows = []
entity_rows = []

for file in folder_entries:
    if not file.endswith('.pdf'):
        continue

    # Generate a unique id shared by every chunk of this document
    document_id = uuid.uuid4()
    loader = PyPDFLoader(os.path.join(pdf_folder_path, file))

    # `pages` holds the splitter's chunks (not physical PDF pages). The name is kept
    # at notebook level because a later cell passes `pages` to graph.add_graph_documents.
    pages = loader.load_and_split(text_splitter=splitter)
    print("Number of pages: ", len(pages))

    # The embedding column is left empty here; a later cell fills it in
    for page in pages:
        embedding_rows.append({
            'document_id': document_id,
            'document_name': file,
            'content': page.page_content,
            'embedding': "",
        })
    all_pages = ''.join(page.page_content for page in pages)

    # Extract entities from the whole document - we assume the document is less than 32k tokens (we are using GPT4 32k model)
    entities = extract_entities(all_pages)
    print("Entities: ", entities)
    entity_rows.append({
        'document_id': document_id,
        'document_name': file,
        'entities': entities,
    })

embeddings_df = pd.DataFrame(embedding_rows, columns=['document_id', 'document_name', 'content', 'embedding'])
entities_df = pd.DataFrame(entity_rows, columns=['document_id', 'document_name', 'entities'])


['20071229X02007.pdf', '20071231X02009.pdf', '20080102X00002.pdf', '20080102X00005.pdf', '20080117X00064.pdf', '20080117X00072.pdf', '20080122X00080.pdf', '20080123X00097.pdf', '20080124X00100.pdf', '20080124X00101.pdf', 'processed', 'test']
Number of pages:  4
Entities:  {"aircraft_make":"Piper PA-22-150","accident_number":"NYC08CA055","aircraft_damage":"Substantial","city":"Tallahassee", "state":"FL", "country":"USA", "phase_of_operation":"Approach", "pilot_flight_hours":2319, "injuries":[1], "engine_manufacturer":"Lycoming", "probable_cause":"The pilot's misjudged clearance over obstacles during final approach.", "findings":["In flight collision with object", "Tree(s)", "Altitude/Clearance - Misjudged"], "ocurrence":["In flight collision with object"]}
Number of pages:  4
Entities:  {"aircraft_make": "Beech A23", "accident_number": "SEA08CA049", "aircraft_damage": "Substantial", "city": "Death Valley", "state": "CA", "country": "US", "phase_of_operation": ["LANDING - FLARE/TOUCHDOWN

In [None]:
# Calculate the embeddings using the OpenAI ada deployment (one calc_embeddings call per row)
embeddings_df["embedding"] = embeddings_df["content"].apply(calc_embeddings)

# Save the dataframes to csv files.
# The embeddings file was previously written as 'embedddings.csv', which the cell that
# reloads './data/processed/embeddings.csv' could never find.
os.makedirs('./data/processed', exist_ok=True)
entities_df.to_csv('./data/processed/entities.csv', index=False)
embeddings_df.to_csv('./data/processed/embeddings.csv', index=False)


In [9]:
# Reload the saved entities and embeddings tables from disk into pandas dataframes
import pandas as pd

entities_df, embeddings_df = map(
    pd.read_csv,
    ("./data/processed/entities.csv", "./data/processed/embeddings.csv"),
)


In [11]:
import json


def _get_or_create_by_name(node_cls, name):
    """Return the first `node_cls` node with this `name`, creating and saving one if none exists."""
    node = node_cls.nodes.first_or_none(name=name)
    if node is None:
        node = node_cls(name=name).save()
    return node


# Add entities to the graph: one Accident per document, linked Accident -> City -> State -> Country
for _, row in entities_df.iterrows():
    document_name = row["document_name"]

    try:
        entities = json.loads(row["entities"])
    except (TypeError, json.JSONDecodeError) as exc:
        # The entities column holds raw model output, which may be missing or not valid
        # JSON; report the document and carry on instead of aborting the whole load.
        print(f"[WARN] skipping {document_name}: entities are not valid JSON ({exc})")
        continue

    # NOTE(review): this lookup filters on `document_name`, but the node below is created
    # with `name=document_name` - confirm against the Accident model which property is
    # intended, otherwise existing accidents may never be matched.
    accident = Accident.nodes.first_or_none(document_name=document_name)
    if accident is not None:
        # Already in the graph: leave the node and its relationships untouched
        continue

    accident = Accident(name=document_name)
    accident.accident_number = get_json_key(entities, "accident_number")
    accident.aircraft_make = get_json_key(entities, "aircraft_make")
    accident.aircraft_damage = get_json_key(entities, "aircraft_damage")
    accident.phase_of_operation = get_json_key(entities, "phase_of_operation")
    accident.save()

    city = _get_or_create_by_name(City, get_json_key(entities, "city"))
    accident.city.connect(city)

    state = _get_or_create_by_name(State, get_json_key(entities, "state"))
    city.state.connect(state)

    country = _get_or_create_by_name(Country, get_json_key(entities, "country"))
    state.country.connect(country)

In [14]:
from langchain_community.graphs import Neo4jGraph
# Connect to Neo4j, load documents into the graph and build a vector index over them.
# NOTE(review): NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD and Neo4jVector are not defined or
# imported in the cells visible here - confirm an earlier cell provides them.
# NOTE(review): verify that Neo4jGraph can be constructed without arguments and that it
# exposes connect(uri=, user=, password=); a later cell's recorded output is
# "NameError: name 'graph' is not defined", so this cell may never have completed.
graph = Neo4jGraph()
graph.connect(uri=NEO4J_URI, user=NEO4J_USERNAME, password=NEO4J_PASSWORD)

# `pages` is the variable left behind by the PDF ingestion loop, so unless it is reassigned
# in a cell not shown it only holds the chunks of the last PDF processed.
# NOTE(review): confirm add_graph_documents accepts these loader chunks as-is.
graph.add_graph_documents(
  pages, 
  baseEntityLabel=True, 
  include_source=True
)

# NOTE(review): NEO4J_URI is passed to connect() above as a complete URI but is interpolated
# here as a bare host name - both usages cannot be right; confirm which form the value has.
neo4j_url = "neo4j+s://{}:7687".format(NEO4J_URI)


# Vector index built from the existing graph using the module-level `embeddings` client.
# Per the arguments: hybrid search over nodes labelled "Document", reading their "text"
# property and using "embedding" as the embedding property.
vector_index = Neo4jVector.from_existing_graph(
    embeddings,
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding",
    url=neo4j_url,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD
)


def query_neo4j_vector(query):
    """Return the text of the best match for `query` in the Neo4j vector index.

    Args:
        query: The search string passed to `vector_index.similarity_search`.

    Returns:
        The `page_content` of the first result, or None when the search fails
        or returns no results (a message is printed in both cases).
    """
    try:
        results = vector_index.similarity_search(query)
    except Exception as e:
        # Best-effort lookup: report the failure and return None instead of raising
        print(f"[ERROR] {e}")
        return None

    # The driver is intentionally left open. `vector_index` is shared with later cells,
    # and closing its private `_driver` after one query would leave them (and the next
    # call to this function) with a closed connection.
    if not results:
        print(f"[ERROR] no results for query: {query!r}")
        return None
    return results[0].page_content

In [18]:
# Fetch the three chunks closest to a sample question.
# Inspect results[0].page_content to read the best match.
query = "How many accident reports are there?"

results = vector_index.similarity_search(query=query, k=3)

In [19]:
from langchain.chains import GraphCypherQAChain

# Question-answering chain over the Neo4j graph, built from `llm` and `graph`.
# NOTE(review): `llm` is not defined in the cells visible here, and the recorded output
# of this cell is a NameError for `graph` - both must exist before this cell is run.
chain = GraphCypherQAChain.from_llm(llm=llm, graph=graph, verbose=True)
response = chain.invoke(
    {"query": "How many accident reports are there?"}
)
response

NameError: name 'graph' is not defined

In [None]:
from langchain.chains import RetrievalQA

# Question-answering chain that retrieves its context through the Neo4j vector index
qa_graph_chain = RetrievalQA.from_chain_type(
    llm, retriever=vector_index.as_retriever(), verbose=True
)

# NOTE(review): this question is about the French Revolution, while the documents processed
# in this notebook are aircraft accident reports - it looks copied from another example.
# The chain is run with .invoke(), as the GraphCypherQAChain cell does, instead of calling
# the chain object directly.
result = qa_graph_chain.invoke({"query": "Considering the financial crisis and resistance to reform by the Ancien Régime, how did the convocation of the Estates General in May 1789 specifically address these multifaceted issues?"})
result["result"]