In [1]:
import pandas as pd
from langchain_community.document_loaders import PyPDFLoader
from openai import OpenAI
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
import re




In [2]:

# Read the data from the CSV file
data = pd.read_csv('Aviation Quiz.csv')

In [3]:
from langchain_community.document_loaders import PyMuPDFLoader


loader = PyMuPDFLoader("14 CFR 23.1457.pdf") # FAA-CT-8080-7D manual
pages = loader.load()

loader2 = PyPDFLoader("atp_akts.pdf") 
pages2 = loader2.load()

loader3 = PyPDFLoader("annexes_booklet_en.pdf")
pages3 = loader3.load()

pages.extend(pages2) # combine the two lists    

pages.extend(pages3) # combine the three lists

In [12]:
from langchain_community.embeddings import GPT4AllEmbeddings

hf = GPT4AllEmbeddings()

"""model_name = "all-MiniLM-L6-v2" #this is under 600d
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)"""

embeddings = hf

text_splitter = CharacterTextSplitter(chunk_size=1500, chunk_overlap=100)

chunked_documents = text_splitter.split_documents(pages)


#vectordb = Chroma.from_documents(
        #documents=chunked_documents,
       # 
   # )

from langchain_community.vectorstores import FAISS
faissdb = FAISS.from_documents(chunked_documents, embeddings)

bert_load_from_file: gguf version     = 2
bert_load_from_file: gguf alignment   = 32
bert_load_from_file: gguf data offset = 695552
bert_load_from_file: model name           = BERT
bert_load_from_file: model architecture   = bert
bert_load_from_file: model file type      = 1
bert_load_from_file: bert tokenizer vocab = 30522


In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def find_most_similar_option(reply_embedding, a, b, c, embeddings):
    # Convert embeddings to NumPy arrays
    reply_embedding_array = np.array(reply_embedding)
    option_a_embedding_array = np.array(embeddings.embed_query(a))
    option_b_embedding_array = np.array(embeddings.embed_query(b))
    option_c_embedding_array = np.array(embeddings.embed_query(c))

    # Ensure embeddings are 2D arrays for cosine_similarity
    reply_embedding_2d = reply_embedding_array.reshape(1, -1)
    option_a_embedding_2d = option_a_embedding_array.reshape(1, -1)
    option_b_embedding_2d = option_b_embedding_array.reshape(1, -1)
    option_c_embedding_2d = option_c_embedding_array.reshape(1, -1)

    # Calculate similarity scores
    similarity_a = cosine_similarity(reply_embedding_2d, option_a_embedding_2d)[0][0]
    similarity_b = cosine_similarity(reply_embedding_2d, option_b_embedding_2d)[0][0]
    similarity_c = cosine_similarity(reply_embedding_2d, option_c_embedding_2d)[0][0]

    # Determine the most similar option
    similarities = {'A': similarity_a, 'B': similarity_b, 'C': similarity_c}
    most_similar_option = max(similarities, key=similarities.get)

    return most_similar_option

In [10]:
questionNumber = 2

query = data.iloc[questionNumber, 1]
query_embedding = embeddings.embed_query(query)

retriever = faissdb.as_retriever()
docs2 = retriever.get_relevant_documents(query)

string = docs2[0].page_content +  docs2[1].page_content + query

# Point to the local server
client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

completion = client.chat.completions.create(
  model="local-model", # this field is currently unused
  messages=[
    {"role": "system", "content": "You are a helpful assistant, who provides accurate answers to aviation problems. You can only ever answer with A, B or C. ANy text after the letter and a puppy dies, which you must avoid"},
    {"role": "user", "content": string},
  ],
  temperature=0.33,
  max_tokens = 40,
)

print(completion.choices[0].message.content)

reply = completion.choices[0].message.content
reply_embedding = embeddings.embed_query(reply)

import re


def extract_options(input_string):
    # Regular expression to find the options
    matches = re.findall(r'([ABC])\.\s(.*?)$', input_string, re.M)
    options = {letter: text for letter, text in matches}
    
    # Assign to individual strings
    option_a = options.get('A', 'Option A not found')
    option_b = options.get('B', 'Option B not found')
    option_c = options.get('C', 'Option C not found')
    
    return option_a, option_b, option_c

a, b, c = extract_options(data.iloc[questionNumber, 1])

print(a)
print(b)
print(c)

print(find_most_similar_option(reply_embedding, a, b, c, embeddings))

The correct answer is (B). According to 14 CFR 23.1457(e)(2), the cockpit voice recorder must remain powered for
The cockpit voice recorder and flight data recorder must be installed together in the same container.
The cockpit voice recorder must remain powered for at least 20 minutes after crash impact.
The recorder container must have an underwater locating device if required by the operating rules
B


In [8]:
print(data.iloc[questionNumber, 1])

Which statement is true according to 14 CFR 23.1457?
Options:
A. The cockpit voice recorder and flight data recorder must be installed together in the same container.
B. The cockpit voice recorder must remain powered for at least 20 minutes after crash impact.
C. The recorder container must have an underwater locating device if required by the operating rules


In [11]:
print(docs2[2].page_content)

Title 14 —Aeronautics and Space
Chapter I —Federal Aviation Administration, Department of Transportation
Subchapter C —Aircraft
Part 23 —Airworthiness Standards: Normal Category Airplanes
Authority: 49 U.S.C. 106(f), 106(g), 40113, 44701–44702, 44704, Pub. L. 113–53, 127 Stat. 584 (49 U.S.C. 44704) note.
Source: Doc. No. FAA–2015–1621, Amdt. 23–64, 81 FR 96689, Dec. 30, 2016, unless otherwise noted.
§ 23.1457 Cockpit voice recorders.
This content is from the eCFR and is authoritative but unofficial.
(a)
Each cockpit voice recorder required by the operating rules of this chapter must be approved and must be
installed so that it will record the following:
(1)
Voice communications transmitted from or received in the airplane by radio.
(2)
Voice communications of flightcrew members on the flight deck.
(3)
Voice communications of flightcrew members on the flight deck, using the airplane's interphone
system.
(4)
Voice or audio signals identifying navigation or approach aids introduced into a

In [16]:
#for index, row in data.iterrows():

for index, row in data.iterrows():
    questionNumber = index

    query = data.iloc[questionNumber, 1]
    query_embedding = embeddings.embed_query(query)

    retriever = faissdb.as_retriever()
    docs2 = retriever.get_relevant_documents(query)

    string = docs2[0].page_content +  docs2[1].page_content + query

    # Point to the local server
    client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

    completion = client.chat.completions.create(
    model="local-model", # this field is currently unused
    messages=[
      {"role": "system", "content": "You are a helpful assistant, who provides accurate answers to aviation problems. You can only ever answer with A, B or C. ANy text after the letter and a puppy dies, which you must avoid"},
      {"role": "user", "content": string},
    ],
    temperature=0.33,
    max_tokens = 40,
    )

    print(completion.choices[0].message.content)

    reply = completion.choices[0].message.content
    reply_embedding = embeddings.embed_query(reply)

    import re


    def extract_options(input_string):
      # Regular expression to find the options
      matches = re.findall(r'([ABC])\.\s(.*?)$', input_string, re.M)
      options = {letter: text for letter, text in matches}
    
      # Assign to individual strings
      option_a = options.get('A', 'Option A not found')
      option_b = options.get('B', 'Option B not found')
      option_c = options.get('C', 'Option C not found')
    
      return option_a, option_b, option_c

    a, b, c = extract_options(data.iloc[questionNumber, 1]) 
    
    most_similar_option = find_most_similar_option(reply_embedding, a, b, c, embeddings)

    # Append the result to the DataFrame
    data.at[index, 'y'] = most_similar_option

    data.to_csv('Aviation_Quiz_with_Answers.csv', index=False)

The correct answer is B. Area microphone. According to 14 CFR 23.1457(a)(2), each cockpit voice recorder must be


  data.at[index, 'y'] = most_similar_option


The required color of the recorder container is bright orange or bright yellow, as specified in 14 CFR 23.1457(g)(1). This is to
The correct answer is (B). According to 14 CFR 23.1457(e)(2), the cockpit voice recorder must remain powered for
C. All of the above. According to 14 CFR 23.1457, each cockpit voice recorder must be installed so that the part of the
The answer is B. 72 hours, commencing within 10 days after date of injury. According to the National Transportation Safety Board (NTSB), a serious injury is
A. 14 CFR part 91.
The Federal Aviation Administration (FAA) publishes the Aeronautical Information Publication (AIP) which includes amend
The maximum distance that a departure alternate airport may be located from the departure airport is not more than 2 hours at normal cruise speed in still air with one engine inoperative,
