# Vector Stores and Retrieval

In [1]:
import os
from dotenv import load_dotenv

# Load settings from a .env file into the process environment (python-dotenv).
# NOTE(review): none of the visible cells read an environment variable, so this
# is presumably needed by later cells — confirm or drop.
load_dotenv()

True

### Load documents

In [2]:
from langchain_community.document_loaders import PyMuPDFLoader

In [6]:
# Collect the path of every PDF below the dataset folder, descending into
# sub-folders. os.walk yields entries in an arbitrary, filesystem-dependent
# order, so both the traversal and the file names are sorted here: the order of
# pages, chunks and vector-store entries built from this list is then the same
# on every machine and every run.
pdfs = []
for root, folders, files in os.walk(".././data/rag-dataset"):
  folders.sort()  # sorting in place steers the order in which os.walk descends
  for file in sorted(files):
    # Case-insensitive match so a file named "REPORT.PDF" is not silently skipped.
    if file.lower().endswith(".pdf"):
      pdfs.append(os.path.join(root, file))

pdfs

['.././data/rag-dataset\\gym supplements\\1. Analysis of Actual Fitness Supplement.pdf',
 '.././data/rag-dataset\\gym supplements\\2. High Prevalence of Supplement Intake.pdf',
 '.././data/rag-dataset\\health supplements\\1. Dietary supplements.pdf',
 '.././data/rag-dataset\\health supplements\\2. Nutraceuticals research.pdf',
 '.././data/rag-dataset\\health supplements\\3. Health Supplements Side Effect.pdf']

In [7]:
# Load each PDF with PyMuPDFLoader and pool everything it returns into one flat
# list; the printed total treats each returned item as one page.
docs = []
for pdf in pdfs:
  docs += PyMuPDFLoader(pdf).load()

print(f"Total pages from all docs: {len(docs)}")

Total pages from all docs: 64


### Chunking

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [9]:
# Break the loaded pages into overlapping pieces that can be embedded one by one.
# NOTE(review): sizes are in characters, not tokens, assuming the splitter's
# default length function (len) — confirm against the splitter docs.
text_splitter = RecursiveCharacterTextSplitter(
  chunk_size=1000,
  chunk_overlap=100,
)
chunks = text_splitter.split_documents(docs)

In [10]:
import tiktoken

In [12]:
# Spot-check how many tokens the first few chunks occupy under the tokenizer
# tiktoken associates with "gpt-4o-mini".
encoder = tiktoken.encoding_for_model("gpt-4o-mini")
# Iterate over a slice instead of range(5): a slice simply yields fewer items
# when there are fewer than five chunks, where indexing would raise IndexError.
for i, chunk in enumerate(chunks[:5], start=1):
  tokens = encoder.encode(chunk.page_content)
  print(f"Token length chunk {i}:", len(tokens))

Token length chunk 1: 294
Token length chunk 2: 219
Token length chunk 3: 162
Token length chunk 4: 183
Token length chunk 5: 169


In [19]:
# Report how many chunks the splitter produced in total.
print(f"Total chunks: {len(chunks)}")

Total chunks: 321


### Vector Embedding

In [24]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_ollama import OllamaEmbeddings

In [25]:
# Where the Ollama server listens (localhost) and which model it should use
# to produce embeddings.
base_url = "http://localhost:11434"
embedding_model = "nomic-embed-text"

embeddings = OllamaEmbeddings(
  base_url=base_url,
  model=embedding_model,
)

In [26]:
# Embed a throwaway sentence to find out how long the model's vectors are;
# the FAISS index created afterwards is sized from this length.
vector = embeddings.embed_query("Hello world!")
print(f"Embedding size: {len(vector)}")

Embedding size: 768


In [27]:
# Create an empty FAISS IndexFlatL2 whose dimensionality equals the embedding
# length, then display (stored vector count, dimensionality) as a sanity check.
embedding_dim = len(vector)
index = faiss.IndexFlatL2(embedding_dim)
(index.ntotal, index.d)

(0, 768)

In [28]:
# Assemble the LangChain FAISS store from its parts: the raw FAISS index, the
# embedding model used for documents and queries, and an initially empty
# in-memory docstore with an empty index-position -> docstore-id mapping.
vector_store = FAISS(
  index=index,
  embedding_function=embeddings,
  index_to_docstore_id={},
  docstore=InMemoryDocstore(),
)

In [29]:
# Embed every chunk and add it to the vector store; the call returns the ids
# assigned to the stored documents.
# NOTE(review): this presumably appends on every run, so re-executing the cell
# without rebuilding `vector_store` would store each chunk twice — confirm.
ids = vector_store.add_documents(documents=chunks)

In [30]:
# Sanity check: number of ids returned by add_documents next to the number of
# vectors held by the FAISS index; the two are expected to match.
len(ids), vector_store.index.ntotal

(321, 321)

In [32]:
# Ask the store for the five chunks most similar to the question
question = "Is creatine useful to increase muscle mass?"
relevant_chunks = vector_store.search(
  query=question,
  search_type="similarity",
  k=5,
)
relevant_chunks

[Document(metadata={'source': '.././data/rag-dataset\\gym supplements\\1. Analysis of Actual Fitness Supplement.pdf', 'file_path': '.././data/rag-dataset\\gym supplements\\1. Analysis of Actual Fitness Supplement.pdf', 'page': 0, 'total_pages': 15, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'iLovePDF', 'creationDate': '', 'modDate': 'D:20241021113850Z', 'trapped': ''}, page_content='acids than traditional protein sources. Its numerous benefits have made it a popular choice\nfor snacks and drinks among consumers [3]. Another widely embraced supplement is\ncaffeine, which is found in many sports and food supplements. Caffeine reduces perceived\neffort, minimizes fatigue and pain, and proves to be effective for endurance and high-\nintensity activities, which is the choice of consumers [4].\nCreatine monohydrate is another well-known supplement used to gain muscle mass\nand support performance and recovery. It is known not to 

In [34]:
# Write the populated vector store to a folder on disk
db_name = "./../store/db_supplements"
vector_store.save_local(db_name)