In [36]:
# To install the dependencies into this kernel, run: %pip install -r requirements.txt  (append --force-reinstall to rebuild the environment)

In [1]:
import pandas as pd
import numpy as np
import os
import fitz
import json
import chromadb
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import TokenTextSplitter
import torch
import tiktoken
import hashlib
from ollama import chat
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
load_dotenv()

# Fail fast on missing settings. Left unset, os.getenv() returns None, and
# os.listdir(None) would silently list the current working directory instead
# of the intended docs folder.
_required_env = ('DOCS_PATH', 'DB_NAME', 'MODEL_NAME')
_missing_env = [name for name in _required_env if not os.getenv(name)]
if _missing_env:
    raise EnvironmentError(
        f"Missing required environment variable(s): {', '.join(_missing_env)}. "
        "Define them in .env or in the shell before running this notebook."
    )

docs_path = os.getenv('DOCS_PATH')    # folder scanned for documents to index
index_db = os.getenv('DB_NAME')       # name of the Chroma collection
model_name = os.getenv('MODEL_NAME')  # model identifier passed to ollama's chat()

In [10]:
# Echo the resolved settings so a misconfigured .env is visible right away.
config_values = (docs_path, index_db, model_name)
print(*config_values)

docs doc_index gemma3:1b


## Load PDFs from docs folder

In [11]:
def load_fpaths(docs_path, extensions=('.pdf',)):
    """Return the sorted paths of the documents directly inside ``docs_path``.

    Sub-directories and files with other suffixes (for example ``.DS_Store``)
    are skipped so that every returned path is a candidate document.

    Args:
        docs_path: Directory to scan. The scan is not recursive.
        extensions: File suffix, or iterable of suffixes, to keep; matching is
            case-insensitive. Pass ``None`` (or an empty iterable) to keep
            every regular file.

    Returns:
        list[str]: ``docs_path`` joined with each matching file name, sorted
        by file name so the result does not depend on directory order.
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions) if extensions else None

    paths = []
    for name in sorted(os.listdir(docs_path)):
        full_path = os.path.join(docs_path, name)
        if not os.path.isfile(full_path):
            continue
        if suffixes is not None and not name.lower().endswith(suffixes):
            continue
        paths.append(full_path)
    return paths

In [12]:
def load_file(file_path):
    """Extract the plain text of every page of the document at ``file_path``.

    Args:
        file_path: Path to a document that ``fitz.open`` can read.

    Returns:
        str: The text of all pages, in page order, joined with newlines.
        A document without pages yields an empty string.
    """
    with fitz.open(file_path) as doc:
        # Read every page, not just the first, so multi-page documents are
        # indexed in full.
        return "\n".join(page.get_text() for page in doc)

In [13]:
def make_id(file_path: str, chunk_id: int) -> str:
    """Build a stable identifier for one chunk of one file.

    The identifier is the hexadecimal MD5 digest of ``"<file_path>_<chunk_id>"``,
    so the same (path, chunk number) pair always maps to the same ID.
    """
    digest = hashlib.md5()
    digest.update("{}_{}".format(file_path, chunk_id).encode())
    return digest.hexdigest()

In [14]:
# SentenceTransformer instances keyed by model name, so that indexing many
# files loads each model only once.
_EMBED_MODELS = {}


def _get_embed_model(name):
    """Return the SentenceTransformer called ``name``, loading it on first use."""
    if name not in _EMBED_MODELS:
        _EMBED_MODELS[name] = SentenceTransformer(name)
    return _EMBED_MODELS[name]


def generate_index_entries(file_path, chunk_size=200, chunk_overlap=20,
                           embed_model_name="all-MiniLM-L6-v2"):
    """Split a document into chunks and build everything needed to index them.

    Args:
        file_path: Path of the document; its text is read with ``load_file``.
        chunk_size: ``chunk_size`` passed to ``TokenTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to ``TokenTextSplitter``.
        embed_model_name: Name of the SentenceTransformer model used to embed
            the chunks. Queries must be embedded with the same model.

    Returns:
        tuple: ``(ids, chunks, embeddings, metadata)`` where
            * ``ids`` holds one ``make_id(file_path, n)`` per chunk, with
              chunks numbered from 1,
            * ``chunks`` is the list of chunk texts,
            * ``embeddings`` is whatever ``SentenceTransformer.encode``
              returns for ``chunks``,
            * ``metadata`` holds one dict per chunk with the keys
              ``source``, ``chunk`` and ``preview`` (first 100 characters).
    """
    splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_text(load_file(file_path))

    embed_model = _get_embed_model(embed_model_name)
    embeddings = embed_model.encode(chunks, batch_size=32, show_progress_bar=True)

    ids, metadata = [], []
    for chunk_no, chunk in enumerate(chunks, start=1):
        ids.append(make_id(file_path, chunk_no))
        metadata.append({'source': file_path, 'chunk': chunk_no, 'preview': chunk[:100]})

    return ids, chunks, embeddings, metadata

In [15]:
def push_to_index(file_path, index_db):
    """Chunk, embed and store one document in the Chroma collection ``index_db``.

    Args:
        file_path: Path of the document to index.
        index_db: Name of the Chroma collection; it is created if missing.

    Returns:
        The Chroma collection the entries were written to.
    """
    client = chromadb.Client()
    collection = client.get_or_create_collection(index_db)

    ids, documents, embeddings, metadatas = generate_index_entries(file_path)

    # A document that produced no text chunks has nothing to store; skip the
    # write rather than sending Chroma an empty batch.
    if len(ids) == 0:
        return collection

    # IDs are derived from (file path, chunk number), so re-running this for
    # the same file produces the same IDs. upsert refreshes those records
    # instead of colliding with the ones written earlier.
    collection.upsert(
        ids=ids,
        documents=documents,
        embeddings=embeddings,
        metadatas=metadatas,
    )

    return collection

In [17]:
# Index every document found in the configured docs folder.
doc_paths = load_fpaths(docs_path)
for file_ in doc_paths:
    push_to_index(file_, index_db)

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.00it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.01it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.76it/s]


In [19]:
# Sanity check: confirm the collection exists and look at a few stored records.
client = chromadb.Client()
collection = client.get_or_create_collection(index_db)
print(client.list_collections())
print("Number of items:", collection.count())

# Fetch only the handful of records displayed below rather than pulling the
# entire collection into memory.
PREVIEW_LIMIT = 5
results = collection.get(limit=PREVIEW_LIMIT)
print("IDs:", results["ids"][:5])
print("Documents:", results["documents"][:2])   # show first 2 docs
print("Metadata:", results["metadatas"][:2])

[Collection(name=doc_index)]
Number of items: 10
IDs: ['cb24c98c67335c77a4462ca592af1370', '5e5aa3c9be20cd9d061f1a3b6b520085', '7f967126d546912dbcc484c33194ead0', 'dc792cf25e06482987a7cc6947895c3b', 'f7cb9c9588495a45788a63f9ff491dd1']
Documents: ['The Essentials of Dog Care and Companionship\nOwning a dog is a long-term commitment that goes far beyond food and shelter. Proper care \ninvolves a combination of physical health management, mental stimulation, and emotional \nsupport. Dogs require regular veterinary visits for vaccinations, parasite prevention, and overall \nhealth check-ups. Preventive care helps detect issues early and can greatly extend a dog’s \nlifespan. Additionally, dental health is often overlooked but is vital; untreated dental disease can \nlead to infections and even organ damage.\nExercise is one of the most important components of dog care. The amount and type of exercise \nrequired varies by breed, age, and health. High-energy breeds like Border Collies, Huski

In [26]:
# Query-time encoder. It has to be the same model that embedded the indexed
# chunks, otherwise query and document vectors are not comparable.
query_encoder_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(query_encoder_name)

In [32]:
query = 'What can happen if a dog is left alone too long?'

# Embed the question and fetch the two closest chunks from the collection.
embed_query = model.encode(query).tolist()
results = collection.query(query_embeddings=embed_query, n_results=2)

# Separate the retrieved chunks with a blank line; joining them with an empty
# string would fuse the end of one chunk with the start of the next.
retrieved_context = '\n\n'.join(results['documents'][0])

# Ask the chat model to answer strictly from the retrieved context.
stream = chat(
    model=model_name,
    messages=[
      {'role': 'system',
        'content': 'You are a helpful assistant. Use only the provided context to answer the question. If you are not sure about the answer, reply with "From the provided context, I do not know the answer to your question."'},

      {'role': 'user',
        'content': f'Context:\n{retrieved_context}\nQuestion:\n{query}'}
      ],
    stream=True,
)

# Print the answer piece by piece as it is streamed back.
for chunk in stream:
    print(chunk['message']['content'], end='', flush=True)

Leaving a dog alone for extended periods can lead to separation anxiety and stress-related behaviors, ultimately impacting their well-being.

In [31]:
# Question to answer from the indexed documents.
query = 'What can happen if a dog is left alone too long?'

# Encode the question as a normalized tensor, then convert it to a plain list
# because that is the form handed to Chroma.
query_tensor = model.encode(query, convert_to_tensor=True, normalize_embeddings=True)
embed_query = query_tensor.tolist()

# Retrieve the two closest chunks and separate them with blank lines.
results = collection.query(query_embeddings=embed_query, n_results=2)
top_chunks = results['documents'][0]
retrieved_context = "\n\n".join(top_chunks)

# Prompts: the system message restricts the model to the retrieved context,
# the user message carries that context followed by the question.
system_prompt = (
    "You are a helpful assistant. Use only the provided context to answer. "
    "If unsure, reply: 'From the provided context, I do not know.'"
)
user_prompt = f"Context:\n{retrieved_context}\n\nQuestion:\n{query}"

stream = chat(
    model=model_name,
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ],
    stream=True,
)

# Print the answer piece by piece as it is streamed back.
for chunk in stream:
    print(chunk['message']['content'], end='', flush=True)

If a dog is left alone for extended periods, it can lead to separation anxiety and stress-related behaviors.