In [1]:
import pandas as pd
from langchain_community.document_loaders import PyPDFLoader
from openai import OpenAI
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
import re




In [2]:

# Load the aviation quiz CSV into a DataFrame (path is relative to the working directory)
data = pd.read_csv(filepath_or_buffer='Aviation Quiz.csv')

In [3]:
# One loader per source PDF.
# NOTE(review): the first file was previously annotated as the "FAA-CT-8080-7D manual",
# but its filename refers to 14 CFR 23.1457 — confirm which document this really is.
loader = PyPDFLoader("14 CFR 23.1457.pdf")
loader2 = PyPDFLoader("atp_akts.pdf")
loader3 = PyPDFLoader("annexes_booklet_en.pdf")

# Load and split each PDF, keeping the per-file results under their own names.
pages, pages2, pages3 = (ldr.load_and_split() for ldr in (loader, loader2, loader3))

# Append the second and third documents to `pages` in place, so that `pages`
# ends up holding the content of all three PDFs in file order.
pages.extend(pages2 + pages3)

In [4]:
# Embedding model configuration.
# NOTE(review): the original note describes this model as "under 600d" —
# presumably its embedding dimensionality; confirm.
model_name = "all-MiniLM-L6-v2"
model_kwargs = dict(device='cpu')                 # run the model on the CPU
encode_kwargs = dict(normalize_embeddings=False)  # embeddings are not normalised

# `hf` and `embeddings` are two names for the same embedding object.
embeddings = hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Split the loaded pages further (chunk_size=1000, chunk_overlap=10).
text_splitter = CharacterTextSplitter(chunk_overlap=10, chunk_size=1000)
chunked_documents = text_splitter.split_documents(pages)

# A Chroma-backed store (Chroma.from_documents) is disabled here; the chunks
# are indexed with FAISS instead.
from langchain_community.vectorstores import FAISS

faissdb = FAISS.from_documents(chunked_documents, embeddings)

  from .autonotebook import tqdm as notebook_tqdm


In [28]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def find_most_similar_option(reply_embedding, a, b, c, embeddings):
    """Return the label ('A', 'B' or 'C') of the option closest to the reply.

    Each option text is embedded with ``embeddings.embed_query`` and compared
    with ``reply_embedding`` by cosine similarity. The label with the highest
    score is returned; on a tie the earliest label (A, then B, then C) wins.

    Args:
        reply_embedding: Embedding vector of the model's reply.
        a, b, c: Texts of options A, B and C.
        embeddings: Object exposing ``embed_query(text)``.

    Returns:
        The string 'A', 'B' or 'C'.
    """
    reply_vector = np.array(reply_embedding)

    # Embed the option texts in A, B, C order.
    option_vectors = {
        label: np.array(embeddings.embed_query(text))
        for label, text in (('A', a), ('B', b), ('C', c))
    }

    # cosine_similarity works on 2D inputs, so every vector becomes a single row.
    reply_row = reply_vector.reshape(1, -1)
    option_rows = {
        label: vector.reshape(1, -1) for label, vector in option_vectors.items()
    }

    # One score per option; [0][0] unwraps the 1x1 similarity matrix.
    scores = {
        label: cosine_similarity(reply_row, row)[0][0]
        for label, row in option_rows.items()
    }

    return max(scores, key=scores.get)

In [40]:
# Positional index of the quiz row to answer.
questionNumber = 2

# Column 1 of the selected row is used as the query text for retrieval and for the LLM.
query = data.iloc[questionNumber, 1]
query_embedding = embeddings.embed_query(query)

# Fetch the chunks most relevant to the question from the FAISS index.
# NOTE(review): newer LangChain releases deprecate get_relevant_documents in
# favour of invoke — confirm against the installed version before switching.
retriever = faissdb.as_retriever()
docs2 = retriever.get_relevant_documents(query)

# Prompt = top retrieved chunk + question. A blank line separates the two so the
# end of the context does not run straight into the first word of the question.
# If nothing was retrieved, the question is sent on its own instead of failing
# with an IndexError on docs2[0].
context = docs2[0].page_content if docs2 else ""
string = f"{context}\n\n{query}" if context else query

# Point to the local OpenAI-compatible server; the API key is only a placeholder.
client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

completion = client.chat.completions.create(
    model="local-model",  # per the original note, this field is currently unused
    messages=[
        {"role": "system", "content": "You are a helpful assistant, who provides accurate answers to aviation problems. You can only ever answer with A, B or C. ANy text after the letter and a puppy dies, which you must avoid"},
        {"role": "user", "content": string},
    ],
    temperature=0.4,
    max_tokens=20,  # caps the reply length, so longer answers are cut off
)

# Keep the reply text, show it, and embed it for comparison against the options.
reply = completion.choices[0].message.content
print(reply)
reply_embedding = embeddings.embed_query(reply)

import re


def extract_options(input_string):
    """Split a multiple-choice question into its A, B and C option texts.

    An option is a capital ``A``, ``B`` or ``C`` followed by a period, at least
    one space or tab, and then the option text up to the end of that line.
    The letter must not be directly preceded by a letter or digit, so an
    abbreviation that ends a sentence ("... the FAA. Then ...") is not mistaken
    for option A. Leading and trailing whitespace, including the carriage
    return left behind by Windows line endings, is stripped from each text.

    Args:
        input_string: Question text containing the lettered options.

    Returns:
        A tuple ``(option_a, option_b, option_c)``. An option that is absent
        is reported as ``'Option A not found'`` (respectively B, C). If a
        letter occurs more than once, the last occurrence wins.
    """
    # (?<![A-Za-z0-9]) rejects letters that end a longer token such as "FAA.";
    # [^\S\r\n]+ is "whitespace except line breaks", which keeps the match on
    # a single line instead of swallowing the following one.
    matches = re.findall(
        r'(?<![A-Za-z0-9])([ABC])\.[^\S\r\n]+(.*?)$', input_string, re.M
    )
    options = {letter: text.strip() for letter, text in matches}

    # Assign to individual strings, with a placeholder for any missing option
    option_a = options.get('A', 'Option A not found')
    option_b = options.get('B', 'Option B not found')
    option_c = options.get('C', 'Option C not found')

    return option_a, option_b, option_c

# Pull the three option texts out of the selected question (column 1 of the row)
a, b, c = extract_options(data.iloc[questionNumber, 1])

# Show the options, one per line
print(a, b, c, sep="\n")

# Report which option's embedding is closest to the embedding of the model's reply
print(
    find_most_similar_option(
        reply_embedding=reply_embedding,
        a=a,
        b=b,
        c=c,
        embeddings=embeddings,
    )
)

C. The cockpit voice recorder must remain powered for at least 20
The cockpit voice recorder and flight data recorder must be installed together in the same container.
The cockpit voice recorder must remain powered for at least 20 minutes after crash impact.
The recorder container must have an underwater locating device if required by the operating rules
B


A
