In [None]:
import sys
import os
import hashlib
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from IPython.display import HTML

# ------------------------------------------------------------
# STEP 1: Configuration
# ------------------------------------------------------------
# We define the paths where our documents and vector database will live.
# Make modules under ../src importable. The membership check keeps this cell
# idempotent: re-running it does not append the same entry to sys.path again.
src_path = os.path.abspath("../src")
if src_path not in sys.path:
    sys.path.append(src_path)
markdown_dir = "../data/external"      # Folder with markdown files
persist_dir = "../data/chroma_db/01"   # Where the Chroma database will be stored

In [2]:
# ------------------------------------------------------------
# STEP 2: Load documents
# ------------------------------------------------------------
# We go through all markdown files and load them using TextLoader.
# Each document gets two metadata fields: "source" (its filename) and
# "hash" (the SHA256 hex digest of its UTF-8 encoded content).
raw_documents = []
# os.listdir() returns entries in arbitrary order; sorting keeps the document
# order (and therefore the chunk order and previews below) reproducible.
for filename in sorted(os.listdir(markdown_dir)):
    if filename.endswith(".md"):
        filepath = os.path.join(markdown_dir, filename)
        loader = TextLoader(filepath, encoding="utf-8")
        docs = loader.load()
        for doc in docs:
            # Overwrite the loader-provided source with the bare filename.
            doc.metadata["source"] = filename
            doc.metadata["hash"] = hashlib.sha256(doc.page_content.encode("utf-8")).hexdigest()
            raw_documents.append(doc)

In [3]:
# ------------------------------------------------------------
# STEP 3: Split documents into chunks
# ------------------------------------------------------------
# Long documents are cut into small, slightly overlapping fragments (chunks)
# so that retrieval can return a focused passage instead of a whole file.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=75,
)
split_documents = text_splitter.split_documents(raw_documents)

print(f"📄 Total number of chunks: {len(split_documents)}\n")

# Quick look at the first five chunks: up to 120 characters each, newlines
# flattened to spaces, with "..." appended when the chunk was truncated.
for i, doc in enumerate(split_documents[:5], start=1):
    text = doc.page_content
    preview = text[:120].replace("\n", " ")
    if len(text) > 120:
        preview += "..."
    source = doc.metadata.get('source', 'unknown')
    print(f"Chunk {i}: {preview} (Source: {source})")

📄 Total number of chunks: 104

Chunk 1: # Varonil - Signature Tailoring  **Motto:** "Style is not bought, it is cultivated. Quality is not negotiated, it is dem... (Source: Varonil.md)
Chunk 2: We believe in a world where one buys less, but chooses better. Each of our pieces is designed to last for generations, n... (Source: Varonil.md)
Chunk 3: - **Vicuña Wool from the Puna:** Considered the finest and rarest animal fiber in the world. It is obtained by harmlessl... (Source: Varonil.md)
Chunk 4: - **Pima Cotton from the Calchaquí Valleys:** Hand-harvested in Salta, this extra-long staple cotton ensures exceptional... (Source: Varonil.md)
Chunk 5: ## 3. The "Estirpe" Permanent Collection  We do not produce seasonal collections. We offer a set of iconic pieces, perfe... (Source: Varonil.md)


In [4]:
# ------------------------------------------------------------
# Create embeddings
# ------------------------------------------------------------
# An embedding model turns a text chunk into a numerical vector.
# This cell only sets up the model (a small MiniLM variant from HuggingFace);
# the chunks themselves are embedded when they are added to the vector store.
embedding_model_name = "sentence-transformers/paraphrase-MiniLM-L3-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)


In [5]:
# ------------------------------------------------------------
# Create or load the Chroma vector database
# ------------------------------------------------------------
# A non-empty persist directory is treated as an existing database and is
# loaded as-is; an empty one triggers a fresh build from the split documents.
# Note: a loaded store is not refreshed here, so changes to the markdown files
# only take effect after the persist directory has been emptied.
os.makedirs(persist_dir, exist_ok=True)
store_already_exists = bool(os.listdir(persist_dir))

if store_already_exists:
    print("✅ Loading existing vector store...")
    vectorstore = Chroma(
        embedding_function=embeddings,
        persist_directory=persist_dir,
    )
else:
    print("⚡ Creating new vector store...")
    vectorstore = Chroma.from_documents(
        split_documents,
        embedding=embeddings,
        persist_directory=persist_dir,
    )

# Report the current size of the store. `_collection` is the underlying
# Chroma collection (a private attribute of the wrapper).
stored_count = vectorstore._collection.count()
print("📦 Number of documents in the store:", stored_count)

⚡ Creating new vector store...
📦 Number of documents in the store: 104


In [None]:
# ------------------------------------------------------------
# Preview stored data and embeddings
# ------------------------------------------------------------
# We can retrieve documents and embeddings to see how text has been transformed into numbers.
# Only a handful of records are needed for the preview, so we ask the store for
# just those (`limit`) instead of fetching every document and embedding and
# slicing afterwards.
preview_count = 3
sample_data = vectorstore.get(
    limit=preview_count,
    include=["embeddings", "documents", "metadatas"],
)

print("\n🔍 Preview of stored data (text + embedding numbers):\n")
for i, (doc, emb, meta) in enumerate(
    zip(sample_data["documents"], sample_data["embeddings"], sample_data["metadatas"]),
    start=1
):
    # Show a snippet of the text (first 100 characters, newlines flattened)
    preview_text = doc[:100].replace("\n", " ") + ("..." if len(doc) > 100 else "")
    # Show the first 10 numbers of the embedding vector
    preview_embedding = ", ".join(f"{x:.4f}" for x in emb[:10])

    print(f"--- Document {i} ---")
    print(f"Text preview: {preview_text}")
    print(f"Embedding (first 10 values): [{preview_embedding}]")
    print(f"(Source: {meta.get('source', 'unknown')})\n")


🔍 Preview of stored data (text + embedding numbers):

--- Document 1 ---
Text preview: # Varonil - Signature Tailoring  **Motto:** "Style is not bought, it is cultivated. Quality is not n...
Embedding (first 10 values): [-0.0507, -0.0285, -0.2047, -0.0649, -0.1940, 0.1361, 0.1589, 0.1468, -0.0748, -0.1889]
(Source: Varonil.md)

--- Document 2 ---
Text preview: We believe in a world where one buys less, but chooses better. Each of our pieces is designed to las...
Embedding (first 10 values): [-0.0294, -0.0250, -0.0426, -0.1478, 0.0665, -0.2179, -0.0942, 0.0094, -0.0485, -0.0820]
(Source: Varonil.md)

--- Document 3 ---
Text preview: - **Vicuña Wool from the Puna:** Considered the finest and rarest animal fiber in the world. It is o...
Embedding (first 10 values): [0.0258, 0.1227, 0.0576, -0.1084, 0.0754, -0.2187, 0.1298, -0.0948, -0.1254, -0.0063]
(Source: Varonil.md)



In [7]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_ollama import OllamaLLM

# ------------------------------------------------------------
# Define a prompt template
# ------------------------------------------------------------
# The template has two placeholders: {context} receives the retrieved chunks
# and {question} receives the user's question.
template = """
You are an assistant that answers questions based on the provided context.

Context:
{context}

Question:
{question}

Answer in a clear and concise way:
"""
prompt = PromptTemplate(input_variables=["context", "question"], template=template)

In [8]:
# ------------------------------------------------------------
# Load a local LLM with Ollama
# ------------------------------------------------------------
# We use a lightweight model (phi3:mini) through Ollama.
# NOTE(review): presumably the model must already be available to the local
# Ollama instance (e.g. `ollama pull phi3:mini`) — confirm before running.
llm = OllamaLLM(model="phi3:mini")


In [9]:
# ------------------------------------------------------------
# Build a RetrievalQA chain
# ------------------------------------------------------------
# The retriever looks up the single most relevant chunk (k=1); the "stuff"
# chain type places the retrieved text into our prompt and hands it to the LLM.
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt},
)

In [None]:
# ------------------------------------------------------------
# Ask a question and inspect the prompt
# ------------------------------------------------------------
query = "What is the price of The Catalyst Blazer?"

# Manually retrieve the relevant documents (chunks).
# `invoke` replaces the deprecated `get_relevant_documents` retriever method.
# Note: this lookup uses k=4, whereas `qa_chain` was built with k=1, so the
# prompt shown here contains more context than the one the chain would build.
docs = vectorstore.as_retriever(search_kwargs={"k": 4}).invoke(query)

# Build the context from the retrieved chunks
context = "\n".join([doc.page_content for doc in docs])

# Fill the prompt template with actual context and question
filled_prompt = prompt.format(context=context, question=query)

# Show the exact prompt sent to the LLM
print("\n🧾 Prompt sent to the model:")
print(filled_prompt)

# Directly invoke the model with the composed prompt (bypasses qa_chain)
response = llm.invoke(filled_prompt)

# Display the model's response
print("\n🤖 Model response:\n")
print(response)



🧾 Prompt sent to the model:

You are an assistant that answers questions based on the provided context.

Context:
## 4. The Capsule Collection

### 4.1. The Catalyst Blazer

The cornerstone of a power wardrobe. Structured, but as comfortable as your favorite cardigan.
- **Material:** Signature **Flex-Form™** Twill.
- **Features:** Action-Back Construction, Secure-Zip Pockets (2 external, 1 internal), single-button closure.
- **Fit:** Tailored, with a slightly nipped-in waist.
- **Colors:** Onyx Black, Midnight Navy, Slate Gray, Ivory.
- **Price:** $295 USD.

### 4.2. The Pivot Pant
## 5. Frequently Asked Questions (FAQ)

- **Q: Why is the "Legado" Blazer so expensive?**
  - A: The price reflects the use of vicuña wool, one of the rarest and most costly fibers in the world, and the 80 hours of artisanal labor each piece requires. It is an investment in a functional work of art.
## 6. Frequently Asked Questions (FAQ)

- **Q: Are the pockets on the Catalyst Blazer actually real and funct