### **Load Environment variables from .env file**

In [5]:
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from dotenv import load_dotenv
import os
from IPython.display import display, HTML, JSON, Markdown, Image
from neo4j import GraphDatabase
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter
from langchain import PromptTemplate
from langchain_core.messages import HumanMessage
from langchain_openai import AzureOpenAIEmbeddings
from langchain.vectorstores import Neo4jVector
from langchain_community.document_loaders import PyPDFLoader
from tenacity import retry, wait_random_exponential, stop_after_attempt

import openai
import textwrap
import pandas as pd

# Load settings from the local .env file into the process environment.
load_dotenv()

# Azure OpenAI settings; os.getenv returns None for any variable that is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_DEPLOYMENT_ENDPOINT = os.getenv("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_GPT4_32k_DEPLOYMENT_NAME = os.getenv("OPENAI_GPT4_32k_DEPLOYMENT_NAME")
OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.getenv("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
api_version = "2024-02-01"

# Neo4j connection settings.
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# LangChain chat-model wrapper around the GPT-4 deployment.
llm = AzureChatOpenAI(
    model=OPENAI_GPT4_32k_DEPLOYMENT_NAME,
    azure_deployment=OPENAI_GPT4_32k_DEPLOYMENT_NAME,
    api_key=OPENAI_API_KEY,
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
    openai_api_version=api_version,
)

# Raw OpenAI SDK client (exposes chat.completions.create directly).
# NOTE(review): pinned to "2023-09-01-preview" while the LangChain objects use
# `api_version` above — confirm the difference is intentional.
client = openai.AzureOpenAI(
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
    api_key=OPENAI_API_KEY,
    api_version="2023-09-01-preview",
)

# define embeddings
# The key is passed explicitly, like for `llm` and `client`, so all three
# clients authenticate with the same credential instead of relying on an
# implicit environment lookup.
embeddings = AzureOpenAIEmbeddings(
    model=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
    api_key=OPENAI_API_KEY,
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
    openai_api_version=api_version,
    chunk_size=1,
)


In [None]:
# Retried by tenacity with randomized exponential waits (min=1, max=20) for up
# to 6 attempts.
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def call_openAI(text):
    """Run a chat completion against the GPT-4 deployment and return the reply.

    Args:
        text: Passed unchanged as the ``messages`` argument of the chat
            completions call.

    Returns:
        The message content of the first choice in the response.
    """
    # Use the OpenAI SDK `client`: the LangChain `llm` wrapper does not expose
    # a `chat.completions` API, so calling it there fails on every attempt.
    response = client.chat.completions.create(
        model=OPENAI_GPT4_32k_DEPLOYMENT_NAME,
        messages=text,
        temperature=0.0,
    )
    return response.choices[0].message.content

# We use the tenacity library to add delays and retries when calling the OpenAI
# embeddings endpoint, to avoid hitting throttling limits.
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def calc_embeddings(text):
    """Return the embedding of ``text`` computed by the module-level ``embeddings``.

    Args:
        text: String to embed; newlines are replaced with spaces first.

    Returns:
        Whatever ``embeddings.embed_query`` returns for the cleaned text.
    """
    # replace newlines, which can negatively affect performance.
    return embeddings.embed_query(text.replace("\n", " "))

def prettyprint(text: str, width: int = 60) -> None:
    """Print ``text`` re-wrapped so that no line is longer than ``width``.

    Args:
        text: The text to wrap and print.
        width: Maximum line length; defaults to 60 columns.

    Returns:
        None. The wrapped text is written to standard output only.
    """
    print(textwrap.fill(text, width))

In [6]:
import pandas as pd
# Load the embeddings table from CSV.
embeddings_df = pd.read_csv("./data/embeddings/adx_embeddings.csv")
# NOTE(review): this preview is not the cell's last expression, so it is
# presumably not rendered in the notebook — confirm whether it is still wanted.
embeddings_df.head(2)

# Load the per-document entities table; the rendered output shows the columns
# document_id, document_name and entities (a JSON string per row).
entities_df = pd.read_csv("./data/embeddings/adx_entities.csv")
entities_df.head(2)

Unnamed: 0,document_id,document_name,entities
0,c42d8f8c-6bad-494d-b95d-1c04ac9686c4,20071229X02007.pdf,"{""aircraft_make"": ""Piper PA-22-150"", ""accident..."
1,4ffeca39-3124-425b-b6c0-db4895da6280,20071231X02009.pdf,"{""aircraft_make"": ""Beech A23"", ""accident_numbe..."


In [2]:
#Define the graph schema
from neomodel import (config, StructuredNode, StringProperty, IntegerProperty,UniqueIdProperty, RelationshipTo)

from urllib.parse import quote


def _build_neomodel_url(username, password, uri, default_scheme="neo4j+s", default_port=7687):
    """Build the ``scheme://user:password@host:port`` URL neomodel connects with.

    Args:
        username: Database user, inserted as-is.
        password: Database password; percent-encoded so characters such as
            ``:`` or ``/`` cannot corrupt the URL structure.
        uri: Either a bare host (``example.databases.neo4j.io``) or a full
            URI (``neo4j+s://example.databases.neo4j.io``), with or without
            an explicit port.
        default_scheme: Scheme used when ``uri`` does not carry one.
        default_port: Port appended when ``uri`` does not carry one.

    Returns:
        The assembled connection URL as a string.
    """
    uri = str(uri)
    if "://" in uri:
        # A full URI was supplied: keep its scheme instead of nesting it
        # inside a second one.
        scheme, address = uri.split("://", 1)
    else:
        scheme, address = default_scheme, uri
    address = address.rstrip("/")
    if ":" not in address:
        address = f"{address}:{default_port}"
    return f"{scheme}://{username}:{quote(str(password), safe='')}@{address}"


url = _build_neomodel_url(NEO4J_USERNAME, NEO4J_PASSWORD, NEO4J_URI)

config.DATABASE_URL = url

class Country(StructuredNode):
    """Graph node for a country; ``name`` is required and has a unique index."""
    name = StringProperty(unique_index=True, required=True)

class State(StructuredNode):
    """Graph node for a state; ``name`` is required."""
    name = StringProperty(required=True)
    # outgoing FROM_COUNTRY relationship to a Country node
    country = RelationshipTo(Country, 'FROM_COUNTRY')

class City(StructuredNode):
    """Graph node for a city; ``name`` is required."""
    name = StringProperty(required=True)
    # outgoing FROM_STATE relationship to a State node
    state = RelationshipTo(State, 'FROM_STATE')

class Accident(StructuredNode):
    """Graph node for an accident: identifiers, aircraft details and its city."""
    # accident_number and document_name each carry a unique index
    accident_number = StringProperty(unique_index=True)
    document_name = StringProperty(unique_index=True)
    # descriptive fields are indexed and default to the empty string
    aircraft_make = StringProperty(index=True, default="")
    aircraft_damage = StringProperty(index=True, default="")
    phase_of_operation = StringProperty(index=True, default="")

    # outgoing IN_CITY relationship to a City node
    city = RelationshipTo(City, 'IN_CITY')

class Document(StructuredNode):
    """Graph node for a source document, keyed by a uniquely indexed name."""
    document_name = StringProperty(unique_index=True)

    # outgoing REPORTING_ACCIDENT relationship to an Accident node
    accident = RelationshipTo(Accident, 'REPORTING_ACCIDENT')

class Chunk(StructuredNode):
    """Graph node for one numbered chunk of a document."""
    # Indexed but deliberately not unique: a unique index on document_name
    # would allow only one Chunk per document, which contradicts having a
    # chunk_number at all.
    document_name = StringProperty(index=True)
    chunk_number = IntegerProperty()

    # outgoing REPORTING_ACCIDENT relationship to a Document node
    # NOTE(review): the relationship type is the same one Document uses for
    # Accident and looks copy-pasted — confirm before renaming, since any
    # stored relationships would use this label.
    document = RelationshipTo(Document, 'REPORTING_ACCIDENT')

def get_json_key(json_object, key):
    """Look up ``key`` in ``json_object``, falling back to an empty string.

    Args:
        json_object: Container supporting ``in`` and ``[]`` (e.g. a dict
            produced by ``json.loads``).
        key: The key to look up.

    Returns:
        The stored value when the key is present (returned as-is, even if it
        is ``None``); otherwise ``""``.
    """
    return json_object[key] if key in json_object else ""

In [3]:
import json


def _get_or_create_by_name(node_cls, name):
    """Return the first ``node_cls`` node with this ``name``, saving a new one if none exists."""
    node = node_cls.nodes.first_or_none(name=name)
    if node is None:
        node = node_cls(name=name).save()
    return node


# add entities to the graph
for _, row in entities_df.iterrows():
    document_name = row["document_name"]
    entities = json.loads(row["entities"])

    # Accidents are looked up by their source document; rows whose accident
    # already exists are skipped entirely, including the location links.
    if Accident.nodes.first_or_none(document_name=document_name) is not None:
        continue

    # Store document_name on the node itself. Passing it under a keyword the
    # model does not declare would leave document_name unset, so the lookup
    # above could never match and every re-run would create duplicates.
    accident = Accident(
        document_name=document_name,
        accident_number=get_json_key(entities, "accident_number"),
        aircraft_make=get_json_key(entities, "aircraft_make"),
        aircraft_damage=get_json_key(entities, "aircraft_damage"),
        phase_of_operation=get_json_key(entities, "phase_of_operation"),
    ).save()

    # Link accident -> city -> state -> country, reusing existing nodes.
    # NOTE(review): cities and states are matched by name only, so equally
    # named cities in different states share one node — confirm this is
    # acceptable for the data set.
    city = _get_or_create_by_name(City, get_json_key(entities, "city"))
    accident.city.connect(city)

    state = _get_or_create_by_name(State, get_json_key(entities, "state"))
    city.state.connect(state)

    country = _get_or_create_by_name(Country, get_json_key(entities, "country"))
    state.country.connect(country)

In [None]:
# Attach to the existing "vector" index used for the embedding queries
# (from_existing_index connects to an index; it does not create one).
# A full connection URI is passed here. When NEO4J_URI holds only a host name,
# add the same scheme and port that the neomodel connection URL in this file
# uses; a value that already contains a scheme is passed through unchanged.
_vector_store_url = NEO4J_URI
if NEO4J_URI and "://" not in NEO4J_URI:
    _vector_store_host = NEO4J_URI if ":" in NEO4J_URI else f"{NEO4J_URI}:7687"
    _vector_store_url = f"neo4j+s://{_vector_store_host}"

neo4j_vector = Neo4jVector.from_existing_index(
    embeddings,
    url=_vector_store_url,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name="vector",
)

def query_neo4j_vector(query):
    """Return the page content of the first similarity-search hit for ``query``.

    Args:
        query: Text handed to ``neo4j_vector.similarity_search``.

    Returns:
        ``page_content`` of the first returned document, or ``None`` when the
        search fails or returns no documents (a message is printed in both
        cases).
    """
    try:
        results = neo4j_vector.similarity_search(query)
    except Exception as e:
        # Best-effort lookup: report the failure and let the caller see None.
        print(f"[ERROR] {e}")
        return None

    # The shared driver is intentionally left open: `neo4j_vector` is a
    # module-level object, and closing its driver here would make every call
    # after the first one operate on a closed connection.
    if not results:
        print("[ERROR] no results returned for query")
        return None
    return results[0].page_content
