### **Load Environment variables from .env file**

In [2]:
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from dotenv import load_dotenv
import os
from IPython.display import display, HTML, JSON, Markdown, Image
from neo4j import GraphDatabase
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter
from langchain import PromptTemplate
from langchain_core.messages import HumanMessage
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from tenacity import retry, wait_random_exponential, stop_after_attempt

import openai
import textwrap
import pandas as pd

# Load settings from a local .env file into the process environment.
load_dotenv()
# Azure OpenAI credentials, endpoint and deployment names. os.getenv returns
# None for any variable that is not set, so a missing entry only surfaces
# later when a client is used.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_DEPLOYMENT_ENDPOINT = os.getenv("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_GPT4_32k_DEPLOYMENT_NAME = os.getenv("OPENAI_GPT4_32k_DEPLOYMENT_NAME")
OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.getenv("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
# API version shared by the LangChain chat model and embeddings client below.
api_version = "2024-02-01"

# Neo4j connection settings (not used by the code visible in this section).
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# LangChain chat model bound to the deployment named by
# OPENAI_GPT4_32k_DEPLOYMENT_NAME.
llm = AzureChatOpenAI(
    model=OPENAI_GPT4_32k_DEPLOYMENT_NAME,
    azure_deployment=OPENAI_GPT4_32k_DEPLOYMENT_NAME,
    api_key=OPENAI_API_KEY,
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
    openai_api_version=api_version,
)

# OpenAI SDK client used for direct chat-completion calls.
# NOTE(review): this pins "2023-09-01-preview" while the other two clients use
# `api_version` -- confirm whether the difference is intentional.
client = openai.AzureOpenAI(
        azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
        api_key=OPENAI_API_KEY,
        api_version="2023-09-01-preview"
    )

# LangChain embeddings client for the deployment named by
# OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME.
# NOTE(review): no api_key is passed here, unlike `llm` and `client`;
# presumably the key is picked up from the environment -- confirm.
embeddings = AzureOpenAIEmbeddings(
    model=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
    openai_api_version=api_version,
    chunk_size = 1
)


In [17]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def call_openAI(text):
    """Send a chat conversation to the GPT-4 deployment and return the reply.

    Calls are retried with randomized exponential back-off (1-20 s, at most
    six attempts) so that throttling errors do not abort the run.

    Args:
        text: Despite the name, this is passed straight through as the
            ``messages`` argument of the chat-completions API, so it must be
            a list of ``{"role": ..., "content": ...}`` dicts.

    Returns:
        The message content of the first choice in the response.

    Raises:
        tenacity.RetryError: If all six attempts fail.
    """
    # The chat-completions API lives on the OpenAI SDK `client`; the LangChain
    # `llm` wrapper has no `.chat` attribute, so calling it here always failed.
    response = client.chat.completions.create(
        model=OPENAI_GPT4_32k_DEPLOYMENT_NAME,
        messages=text,
        temperature=0.0,
    )
    return response.choices[0].message.content

# Tenacity adds randomized exponential back-off and retries around the OpenAI
# embeddings call to avoid hitting throttling limits.
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def calc_embeddings(text):
    """Return the embedding vector for ``text``.

    Args:
        text: The string to embed.

    Returns:
        Whatever ``embeddings.embed_query`` returns for the cleaned text.

    Raises:
        tenacity.RetryError: If all six attempts fail.
    """
    # Replace newlines, which can negatively affect performance.
    return embeddings.embed_query(text.replace("\n", " "))

def prettyprint(text: str, width: int = 60) -> None:
    """Print ``text`` wrapped to at most ``width`` characters per line.

    Args:
        text: The text to wrap and print.
        width: Maximum line length; defaults to 60, the previously
            hard-coded value.

    Returns:
        None. The wrapped text is written to standard output only.
    """
    print(textwrap.fill(text, width))

In [12]:
# Same back-off policy as the other OpenAI helpers in this file; reraise=True
# keeps the original exception type visible to callers once retries run out.
@retry(
    wait=wait_random_exponential(min=1, max=20),
    stop=stop_after_attempt(6),
    reraise=True,
)
def extract_entities(text):
    """Ask the GPT-4 deployment to extract entities from ``text``.

    Args:
        text: The document text, sent as the user message.

    Returns:
        The raw message content of the first choice. The prompt asks for a
        JSON object, but the string is returned unparsed and unvalidated.

    Raises:
        Exception: Whatever the underlying API call raised on the final
            attempt, after up to six tries.
    """

    system_message = """
        You are an assistant designed to extract entities from a text. 
        Users will paste in a string of text and you will respond with entities you have extracted from the text as a JSON object.
        Here is an example of your output format:{aircraft_make:[],accident_number:[],aircraft_damage:[],location:[], phase_of_operation:[], pilot_flight_hours:[], injuries:[], engine_manufacturer:[], probable_cause:[], findings:[], ocurrence:[]}
    """

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": text},
    ]

    # NOTE(review): max_tokens reserves 20000 tokens for the completion; if the
    # deployment has a 32k context window that leaves far less than 32k for the
    # prompt -- confirm this matches the document sizes being sent.
    response = client.chat.completions.create(
        model=OPENAI_GPT4_32k_DEPLOYMENT_NAME,
        messages=messages,
        temperature=0.7,
        max_tokens=20000,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )

    return response.choices[0].message.content

In [14]:
# Split text into chunks of up to 5000 characters with a 100-character overlap.
# No separators are passed, so the splitter's defaults apply
# (presumably ["\n\n", "\n", " ", ""] -- confirm against the LangChain docs).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000,
    chunk_overlap=100,
)

pdf_folder_path = "./data/test/"
# List the folder once and reuse the result for both the printout and the loop,
# so the two always agree.
folder_entries = os.listdir(pdf_folder_path)
print(folder_entries)

# Read the PDF files, split them into chunks, record every chunk for embedding
# and extract entities from each whole document.
embeddings_df = pd.DataFrame(columns=['document_name', 'content', 'embedding'])
entities_df = pd.DataFrame(columns=['document_name', 'entities'])

for file in folder_entries:
    # Match the extension case-insensitively so e.g. "REPORT.PDF" is not skipped.
    if not file.lower().endswith('.pdf'):
        continue

    pdf_path = os.path.join(pdf_folder_path, file)
    loader = PyPDFLoader(pdf_path)
    # Despite the name, `pages` holds the chunks produced by `splitter`.
    pages = loader.load_and_split(text_splitter=splitter)
    print("Number of pages: ", len(pages))

    for page in pages:
        # Save every chunk; the embedding column is left empty here and is
        # filled in by a later cell.
        embeddings_df.loc[len(embeddings_df.index)] = [file, page.page_content, ""]

    # Join once instead of growing a string with += inside the loop.
    all_pages = "".join(page.page_content for page in pages)

    # Extract entities from the whole document - we assume the document is less
    # than 32k tokens (we are using the GPT4 32k model).
    entities = extract_entities(all_pages)
    print("Entities: ", entities)
    entities_df.loc[len(entities_df.index)] = [file, entities]


['20071229X02007.pdf']
Number of pages:  4
Entities:  {"location":["Tallahassee, FL"],"accident_number":["NYC08CA055"],"date_time":["12/08/2007, 1730 EST"],"aircraft":["Piper PA-22-150"],"aircraft_damage":["Substantial"],"injuries":["1 Minor"],"pilot_age":["85"],"pilot_flight_hours":["2319 hours"],"engine_manufacturer":["Lycoming"],"probable_cause":["The pilot's misjudged clearance over obstacles during final approach."],"findings":["IN FLIGHT COLLISION WITH OBJECT"],"occurrence":["APPROACH"]}


In [19]:
# Calculate the embeddings for every chunk using the OpenAI ada deployment.
# calc_embeddings is passed directly; wrapping it in a lambda adds nothing.
embeddings_df["embedding"] = embeddings_df.content.apply(calc_embeddings)

# Make sure the output folder exists; to_csv does not create missing directories.
embeddings_csv_path = './data/embeddings/adx_embeddings.csv'
os.makedirs(os.path.dirname(embeddings_csv_path), exist_ok=True)
embeddings_df.to_csv(embeddings_csv_path, index=False)
print(embeddings_df.head(2))

        document_name                                            content  \
0  20071229X02007.pdf  Page 1 of 4\nNational Transportation Safety Bo...   
1  20071229X02007.pdf  Page 2 of 4 NYC08CA055Factual Information\nPil...   

                                           embedding  
0  [-0.007549772970378399, -0.01652275212109089, ...  
1  [0.006805323529988527, -0.002616920042783022, ...  
