In [1]:
# Install requirements
!wget https://github.com/eur-nl/bongaerts-10k-rag/raw/refs/heads/main/chroma.sqlite3
!pip install langchain-community
!pip install langchain-chroma
!pip install langchain-huggingface
!pip install langchain-ollama

--2025-07-01 11:16:21--  https://github.com/eur-nl/bongaerts-10k-rag/raw/refs/heads/main/chroma.sqlite3
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://media.githubusercontent.com/media/eur-nl/bongaerts-10k-rag/refs/heads/main/chroma.sqlite3 [following]
--2025-07-01 11:16:21--  https://media.githubusercontent.com/media/eur-nl/bongaerts-10k-rag/refs/heads/main/chroma.sqlite3
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1350488064 (1.3G) [application/octet-stream]
Saving to: ‘chroma.sqlite3.4’


2025-07-01 11:16:41 (248 MB/s) - ‘chroma.sqlite3.4’ saved [1350488064/1350488064]



In [2]:
# Import requirements
import os
import subprocess

from chromadb.config import Settings
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import ChatOllama

In [12]:
# Which segment (collection) of the vector database to use for RAG.
# Accepted values:
#   "500"       - static chunks with 500 characters
#   "1000"      - static chunks with 1000 characters
#   "2500"      - static chunks with 2500 characters
#   "optimized" - dynamic optimized chunks
db_segment = "2500"

In [4]:
# Create the HuggingFace embedding model (handed to Chroma as its embedding function)
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
)

In [13]:
# Initialize the vector database
db_segments = ["500", "1000", "2500", "optimized"]

if db_segment not in db_segments:
    # Keep the fallback, but report it: a typo in db_segment would otherwise
    # silently query a different collection than the one that was requested.
    print(
        f"Unknown db_segment {db_segment!r}; expected one of {db_segments}. "
        "Falling back to 'optimized'."
    )
    db_segment = "optimized"

vector_db = Chroma(
    # chroma.sqlite3 is downloaded into the current working directory
    # (/content on Colab), so read the database from there.
    persist_directory=os.getcwd(),
    embedding_function=embeddings,
    collection_name=f"10k_{db_segment}"
)

ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


<langchain_chroma.vectorstores.Chroma object at 0x7d6ad03422d0>


In [22]:
# Install and run ollama
os.environ.update({'OLLAMA_HOST': '0.0.0.0'})
!curl https://ollama.ai/install.sh | sh
!nohup ollama serve > ollama.log 2>&1 &
!ollama pull llama3

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 13281    0 13281    0     0  63755      0 --:--:-- --:--:-- --:--:-- 63850
>>> Cleaning up old version at /usr/local/lib/ollama
>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l


In [24]:
# Chat model served by the local Ollama instance
llm = ChatOllama(model="llama3.2:1b")

# Prompt that asks the LLM to rephrase the user question several ways; the
# multi-query retriever runs each rephrasing against the vector store.
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

retriever = MultiQueryRetriever.from_llm(
    vector_db.as_retriever(),
    llm,
    prompt=QUERY_PROMPT
)

# Answer prompt: the model is told to rely only on the retrieved context
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)


def format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Without this step the prompt receives the repr of a list of Document
    objects (metadata, escaped newlines) instead of the plain chunk text.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# The incoming question string is fanned out: it is sent to the retriever
# (whose results are flattened to text) and passed through unchanged.
chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
)

In [25]:
# Run the RAG chain on a sample question and show the answer
query = "Which products are mentioned?"
answer = chain.invoke(query)
print(answer)

The products mentioned in the document are:

- Sundar Pichai (Chief Executive Officer and Director)
- Ruth M. Porat (President and Chief Investment Officer; Chief Financial Officer (Principal Financial Officer))
- Amie Thuener O'Toole (Vice President, Corporate Controller and Principal Accounting Officer)
- Frances H. Arnold (Director)
- Sergey Brin (Co-Founder and Director)
- R. Martin Chávez (Director)
- L. John Doerr (Director)
- Roger W. Ferguson Jr. (Director)
- John L. Hennessy (Chairman, but no product mentioned directly; Director)
- Larry Page (Co-Founder and Director)
- K. Ram Shriram (Director)
- Robin L. Washington (Director)

Note that the document mentions "Sundar Pichai" in a table of contents section, indicating his role as CEO and director, but does not explicitly mention any specific product.
