In [36]:
# pip install -r requirements.txt # --force-reinstall

In [47]:
import pandas as pd
import numpy as np
import os
import fitz
import json
import chromadb
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import TokenTextSplitter
import torch
import tiktoken
import hashlib
from dotenv import load_dotenv

In [16]:
np.__version__

'1.26.4'

In [46]:
load_dotenv()

docs_path = os.getenv('DOCS_PATH')
index_db = os.getenv('DB_NAME')

## Load PDFs from docs folder

In [90]:
def load_fpaths(docs_path):
    return [os.path.join(docs_path, f) for f in os.listdir(docs_path)]

In [6]:
def load_file(file_path):

    with fitz.open(file_path) as doc:
        return([page.get_text() for page in doc][0])

In [61]:
def make_id(file_path: str, chunk_id: int) -> str:
    
    raw = f"{file_path}_{chunk_id}"
    return hashlib.md5(raw.encode()).hexdigest()

In [88]:

def generate_index_entries(file_path):

    tokenizer = TokenTextSplitter(chunk_size=200, chunk_overlap=20)
    embed_model = SentenceTransformer("all-MiniLM-L6-v2")

    file_text = load_file(file_path)

    token_lst = tokenizer.split_text(file_text)
    embeddings = embed_model.encode(token_lst, batch_size=32, show_progress_bar=True)
    ids = [make_id(file_path, i+1) for i, _ in enumerate(token_lst)]
    metadata = [{'source': file_path, 'chunk': i+1, 'preview': token[:100]} for i, token in enumerate(token_lst)]

    return ids, token_lst, embeddings, metadata

In [89]:

def push_to_index(file_path, index_db):

    client = chromadb.Client()
    collection = client.get_or_create_collection(index_db)

    index_entries = generate_index_entries(file_path)
    
    collection.add(
        ids=index_entries[0],
        documents=index_entries[1],
        embeddings=index_entries[2],
        metadatas=index_entries[3]
    )
    
    return collection

In [79]:
push_to_index('docs/doc3.pdf', index_db)

Batches: 100%|██████████| 1/1 [00:01<00:00,  1.37s/it]


Collection(name=doc_index)

In [87]:
query = model.encode(["how do dogs communicate with other dogs?"]).tolist()
results = collection.query(query_embeddings=query, n_results=2)
print('Top matches:\n')
print(results["documents"][0][0])

Top matches:

 are critical in communication with other dogs.
Every dog’s nose print is unique, similar to human fingerprints. In fact, some kennel clubs and 
organizations have experimented with nose-printing as a form of identification. The wet surface 
of a dog’s nose also plays an important role in scent detection. The moisture helps capture scent 
particles from the air, enhancing the dog’s ability to detect and analyze odors. When a dog licks 
its nose, it’s essentially "resetting" this system, making it easier to pick up the next scent.



In [80]:
client = chromadb.Client()
print(client.list_collections())
print("Number of items:", collection.count())
results = collection.get()
print("IDs:", results["ids"][:5])
print("Documents:", results["documents"][:2])   # show first 2 docs
print("Metadata:", results["metadatas"][:2])

[Collection(name=doc_index)]
Number of items: 10
IDs: ['159f62dcac4000f0db94e4cfaa7abdda', 'c9d0461272109d46f02e360cbac06fd1', '30624931fb9ff1a193bda505652f1bf2', 'f7cb9c9588495a45788a63f9ff491dd1', '5937f02a29431fb4f24be9126f8654ed']
Documents: ['The Remarkable Sense of Smell in Dogs\nDogs are often called "noses with legs," and for good reason. Their olfactory system is one of the \nmost advanced in the animal kingdom. While humans have about 5 million scent receptors, dogs \ncan have anywhere from 125 million to 300 million depending on the breed. This difference \ngives them an extraordinary ability to detect faint odors, sometimes at concentrations as low as \nparts per trillion. It’s not just the number of receptors, though—the part of a dog’s brain devoted \nto analyzing smells is 40 times larger than that of a human, relative to brain size.\nCertain breeds, like the Bloodhound, German Shepherd, and Belgian Malinois, are trained for \nscent detection work in law enforcement and 