### Create entities and embeddings from Aviation Reports for Hybrid RAG

In [15]:

from dotenv import load_dotenv
import os
from tenacity import retry, wait_random_exponential, stop_after_attempt
from openai import AzureOpenAI
import textwrap
import pandas as pd

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import AzureOpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from tenacity import retry, wait_random_exponential, stop_after_attempt

# Load variables from the .env file into the process environment.
load_dotenv()

# Azure OpenAI settings (each is None when the variable is not set).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_DEPLOYMENT_ENDPOINT = os.environ.get("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_GPT4_32k_DEPLOYMENT_NAME = os.environ.get("OPENAI_GPT4_32k_DEPLOYMENT_NAME")
OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.environ.get("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
api_version = "2024-02-01"

# Azure AD / Kusto settings. They are only read here; nothing in this
# section of the notebook uses them.
AAD_TENANT_ID = os.environ.get("AAD_TENANT_ID")
KUSTO_CLUSTER = os.environ.get("KUSTO_CLUSTER")
KUSTO_DATABASE = os.environ.get("KUSTO_DATABASE")
KUSTO_TABLE = os.environ.get("KUSTO_TABLE")
KUSTO_MANAGED_IDENTITY_APP_ID = os.environ.get("KUSTO_MANAGED_IDENTITY_APP_ID")
KUSTO_MANAGED_IDENTITY_SECRET = os.environ.get("KUSTO_MANAGED_IDENTITY_SECRET")

# Embedding client used by calc_embeddings.
# NOTE(review): no API key is passed explicitly; presumably the library picks
# it up from the environment - confirm.
embeddings = AzureOpenAIEmbeddings(
    model=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
    openai_api_version=api_version,
    chunk_size=1,
)

# Chat-completion client used by call_openAI.
# NOTE(review): this client pins API version "2023-05-15" while the embedding
# client above uses `api_version` ("2024-02-01") - confirm the difference is
# intentional.
clientOpenAI = AzureOpenAI(
    api_key=OPENAI_API_KEY,
    api_version="2023-05-15",
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
)


In [16]:
# tenacity re-runs the chat-completion call with randomized exponential
# backoff (wait bounds min=1, max=20) and stops after 6 attempts, to ride out
# throttling limits
@retry(stop=stop_after_attempt(6), wait=wait_random_exponential(min=1, max=20))
def call_openAI(messages):
    """Send chat messages to the GPT-4 deployment and return the reply text.

    Args:
        messages: Chat messages, passed unchanged to
            ``clientOpenAI.chat.completions.create``.

    Returns:
        The ``content`` of the first choice in the response.
    """
    completion = clientOpenAI.chat.completions.create(
        messages=messages,
        model=OPENAI_GPT4_32k_DEPLOYMENT_NAME,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# we use the tenacity library to create delays and retries when calling openAI embeddings to avoid hitting throttling limits
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def calc_embeddings(text):
    """Return the embedding of ``text`` computed by the module-level ``embeddings`` client.

    Args:
        text: The string to embed.

    Returns:
        Whatever ``embeddings.embed_query`` returns for the flattened text.
    """
    # replace newlines, which can negatively affect performance.
    flattened = text.replace("\n", " ")
    return embeddings.embed_query(flattened)

# Use GPT-4 to extract entities from a text
def extract_entities(text):
    """Ask the chat model to extract aviation-report entities from ``text``.

    Args:
        text: The report text, sent as the user message.

    Returns:
        The model's reply as returned by ``call_openAI``. The prompt asks for
        a JSON object, but the reply is returned as-is and is not parsed or
        validated here.
    """
    # The example in the prompt is itself valid JSON (double-quoted keys and
    # values) so that it agrees with the instruction to answer with a JSON object.
    system_message = """
        You are an assistant designed to extract entities from a text.
        Users will paste in a string of text and you will respond with entities you have extracted from the text as a JSON object.
        Here is an example of your output format:
        {"aircraft_make": "", "accident_number": "", "aircraft_damage": "", "city": "", "state": "", "country": "", "phase_of_operation": "", "pilot_flight_hours": "", "engine_manufacturer": ""}
    """
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": text},
    ]
    return call_openAI(messages)


def call_openAI_for_final_answer(question, answer):
    """Have the chat model answer ``question`` from the supplied information.

    Args:
        question: The user's question.
        answer: The information the model is told to base its answer on.

    Returns:
        The reply text returned by ``call_openAI``.
    """
    system_turn = {
        "role": "system",
        "content": "You are a HELPFUL assistant answering users questions. Answer the question using the provided information and do not add anything else.",
    }
    # The user turn carries both the question and the supporting information.
    user_turn = {
        "role": "user",
        "content": f"Question: {question}\nInformation: {answer}",
    }
    return call_openAI([system_turn, user_turn])

def prettyprint(text: str) -> None:
    """Print ``text`` wrapped to a width of 60 characters.

    Args:
        text: The text to wrap and print.

    Returns:
        None. The wrapped text is written to standard output only.
    """
    print(textwrap.fill(text, 60))

In [17]:
# Split the PDF text into chunks of up to 10000 characters with a 100 character overlap.
# The splitter is built with its default separators (no `separators` argument is passed).
import uuid

splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=100)

pdf_folder_path = "./data/"
folder_entries = os.listdir(pdf_folder_path)
print(folder_entries)

# Rows are collected in plain lists and converted to DataFrames once after the
# loop, instead of growing a DataFrame one row at a time.
embedding_rows = []
entity_rows = []

# Read the pdf files, split them into chunks and extract entities per document.
for pdf_name in folder_entries:
    # Anything that is not a .pdf file (e.g. the 'processed' sub-folder) is skipped.
    if not pdf_name.endswith('.pdf'):
        continue

    # One GUID per document links its chunk rows to its entities row.
    # (Named document_id so the builtin `id` is not shadowed.)
    document_id = uuid.uuid4()
    loader = PyPDFLoader(os.path.join(pdf_folder_path, pdf_name))

    # Load the PDF and split it with the splitter above. The printed count is
    # the number of documents returned by load_and_split.
    pages = loader.load_and_split(text_splitter=splitter)
    print("Number of pages: ", len(pages))

    # The embedding column is left empty here and filled in for all rows below.
    embedding_rows.extend(
        [document_id, pdf_name, page.page_content, ""] for page in pages
    )

    # extract entities from the whole document - we assume the document is less
    # than 32k tokens (we are using GPT4 32k model)
    all_pages = "".join(page.page_content for page in pages)
    entities = extract_entities(all_pages)
    print("Entities: ", entities)
    entity_rows.append([document_id, pdf_name, entities])

embeddings_df = pd.DataFrame(
    embedding_rows,
    columns=['document_id', 'document_name', 'content', 'embedding'],
)
entities_df = pd.DataFrame(
    entity_rows,
    columns=['document_id', 'document_name', 'entities'],
)

# calculate the embeddings using openAI ada
embeddings_df["embedding"] = embeddings_df["content"].apply(calc_embeddings)

['20071229X02007.pdf', '20071231X02009.pdf', '20080102X00002.pdf', '20080102X00005.pdf', '20080117X00064.pdf', '20080117X00072.pdf', '20080122X00080.pdf', '20080123X00097.pdf', '20080124X00100.pdf', '20080124X00101.pdf', '20080124X00101a.pdf', 'processed']
Number of pages:  4
Entities:  {"aircraft_make":"Piper PA-22-150","accident_number":"NYC08CA055","aircraft_damage":"Substantial","city":"Tallahassee","state":"FL","country":"USA","phase_of_operation":"APPROACH","pilot_flight_hours":"2319","engine_manufacturer":"Lycoming"}
Number of pages:  4
Entities:  {"city": "Death Valley", "state": "CA", "accident_number": "SEA08CA049", "date_time": "11/29/2007, 0830 PST", "aircraft_registration": "N8770M", "aircraft_make": "Beech A23", "aircraft_damage": "Substantial", "phase_of_operation": "LANDING - FLARE/TOUCHDOWN", "pilot_flight_hours": "119 hours", "engine_manufacturer": "Continental"}
Number of pages:  3
Entities:  {"aircraft_make":"Reims F172 M","accident_number":"DEN08WA042","aircraft_da

In [1]:
# save the dataframes to csv files
# NOTE: entities_df and embeddings_df are created by the PDF-processing cell;
# that cell must have been run in the same session, otherwise this fails with
# a NameError.
# Create the output folder first so to_csv does not fail when it is missing.
os.makedirs('./data/processed/', exist_ok=True)
entities_df.to_csv('./data/processed/entities.csv', index=False)
embeddings_df.to_csv('./data/processed/embeddings.csv', index=False)

NameError: name 'entities_df' is not defined