In [2]:
import numpy as np
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [3]:
loader = PyPDFDirectoryLoader('./docs')
docs_before_split = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=700,
    chunk_overlap=50,
)
docs_after_split = text_splitter.split_documents(docs_before_split)

In [4]:
len(docs_after_split)

7141

In [5]:
print(docs_after_split[500].page_content)

ensure limitations for extending those devices are not exceeded. Practicing this maneuver in other configurations, such as a clean or 
takeoff configuration, is also good training and may be evaluated on the practical test. 
[Figure 5-7] The elevator control is less responsive and larger control movements are necessary to retain control of the airplane. In 
propeller-driven airplanes, torque, slipstream effect, and P-factor may produce a strong left yaw, which requires right rudder input to


In [6]:
hf_embeddings = HuggingFaceBgeEmbeddings(
    model_name='sentence-transformers/all-MiniLM-l6-v2',
    model_kwargs={'device': 'cuda'},
    encode_kwargs={'normalize_embeddings': True}
)

  hf_embeddings = HuggingFaceBgeEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


In [7]:
sample_embedding = np.array(hf_embeddings.embed_query(docs_after_split[0].page_content))
sample_embedding.shape

(384,)

In [8]:
index = FAISS.from_documents(docs_after_split, hf_embeddings)
index.index.ntotal

7141

In [25]:
retriever = index.as_retriever(
    search_type='mmr', 
    search_kwargs={'k': 3}
)

In [None]:
hf = HuggingFacePipeline.from_model_id(
    model_id='google/gemma-2-2b',
    task='text-generation',
    device=0,
    pipeline_kwargs={'max_new_tokens': 1000}
)

Downloading shards: 100%|██████████| 3/3 [06:31<00:00, 130.50s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00,  7.62it/s]


In [None]:
prompt_template = """Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum. Do not repeat yourself.

Context: 
{context}

Question: 
{question}

Helpful Answer:
"""

PROMPT = PromptTemplate(
    template=prompt_template, 
    input_variables=['context', 'question']
)

In [39]:
qa = RetrievalQA.from_chain_type(
    llm=hf,
    chain_type='stuff',
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={'prompt': PROMPT}
)

In [73]:
query = "What is a VOR?"

In [74]:
result = qa.invoke({'query': query})
result

{'query': 'What is a VOR?',
 'result': 'Use the following pieces of context to answer the question at the end. Please follow the following rules:\n1. If you don\'t know the answer, don\'t try to make up an answer. Just say "I can\'t find the final answer but you may want to check the following links".\n2. If you find the answer, write the answer in a concise way with five sentences maximum.\n\nContext: \nVORTAC. By itself it is known as a VOR, and it provides \nmagnetic bearing information to and from the station. When \nDME is also installed with a VOR, the NAVAID is referred \nto as a VOR/DME. When military tactical air navigation \n(TACAN) equipment is installed with a VOR, the NAVAID \nis known as a VORTAC. DME is always an integral part of \na VORTAC. Regardless of the type of NAVAID utilized \n(VOR, VOR/DME, or VORTAC), the VOR indicator \nbehaves the same. Unless otherwise noted in this section, \nVOR, VOR/DME, and VORTAC NAVAIDs are all referred \nto hereafter as VORs.\nThe pre

In [75]:
print(result['result'])

Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

Context: 
VORTAC. By itself it is known as a VOR, and it provides 
magnetic bearing information to and from the station. When 
DME is also installed with a VOR, the NAVAID is referred 
to as a VOR/DME. When military tactical air navigation 
(TACAN) equipment is installed with a VOR, the NAVAID 
is known as a VORTAC. DME is always an integral part of 
a VORTAC. Regardless of the type of NAVAID utilized 
(VOR, VOR/DME, or VORTAC), the VOR indicator 
behaves the same. Unless otherwise noted in this section, 
VOR, VOR/DME, and VORTAC NAVAIDs are all referred 
to hereafter as VORs.
The prefix “omni-” means all, and an omnidirectional range 
is a V

In [76]:
for i, document in enumerate(result['source_documents']):
    print(f"{i + 1}. {document.metadata['source']}, pg. {document.metadata['page']}")

1. docs/Pilot's Handbook of Aeronautical Knowledge.pdf, pg. 408
2. docs/Pilot's Handbook of Aeronautical Knowledge.pdf, pg. 361
3. docs/Pilot's Handbook of Aeronautical Knowledge.pdf, pg. 500
