In [None]:
from pathlib import Path
from openai import OpenAI
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from chromadb import PersistentClient
from tqdm import tqdm
from litellm import completion
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go


# Load settings from a local .env file; with override=True values from the
# file replace variables that are already set in the environment.
load_dotenv(override=True)

# Chat model used for chunking, reranking, query rewriting and answering.
MODEL = "gpt-4.1-nano"

# Chroma persistence directory, collection name and embedding model.
DB_NAME = "preprocessed_db"
collection_name = "docs"
embedding_model = "text-embedding-3-large"
#KNOWLEDGE_BASE_PATH = Path("knowledge-base")

# Google Docs that make up the knowledge base.
GOOGLE_DOC_URLS = [
    "https://docs.google.com/document/d/1DKs6IcvDa3xeE-E0DTPUwAIAq3VHBZm7vv4b-rNe1uk/edit?usp=sharing",
    "https://docs.google.com/document/d/1TIser-RhtD_sbyOfltpzfQmnYpJV2wpew53R2Ydyd5s/edit?usp=sharing",
    "https://docs.google.com/document/d/1QEr8KK0y91MI5PVZ5kUk824yLzV6vzb23TJqHptxPzQ/edit?usp=sharing",
    "https://docs.google.com/document/d/18Sp6koXBKHbo9QA0utYfbDv7TZ-MdkCRV1txNxikt5Y/edit?usp=sharing",
    "https://docs.google.com/document/d/1x7Nj6Bt99PMIyJKCDw5b5Vw--WK_rhT4cJE0aXTR4PM/edit?usp=sharing",
]

# Target chunk size in characters; make_prompt divides the document length by
# this value to suggest a chunk count to the model.
AVERAGE_CHUNK_SIZE = 500

# NOTE: the client instance is bound to the name `openai`, which is the name
# the rest of the notebook uses for embedding calls.
openai = OpenAI()

In [2]:

# A piece of retrievable text plus its metadata. Instances are created both
# from freshly generated chunks and from rows read back from the vector store.
class Result(BaseModel):
    page_content: str  # the text that is embedded and later placed in prompts
    metadata: dict  # free-form metadata, e.g. "source", "type", "distance"

In [3]:


# One chunk as produced by the chunking model. The Field descriptions are
# runtime strings attached to the model's schema, so they are kept verbatim.
class Chunk(BaseModel):
    headline: str = Field(description="A brief heading for this chunk, typically a few words, that is most likely to be surfaced in a query")
    summary: str = Field(description="A few sentences summarizing the content of this chunk to answer common questions")
    original_text: str = Field(description="The original text of this chunk from the provided document, exactly as is, not changed in any way")

    def as_result(self, document):
        """Turn this chunk into a `Result` linked to the document it came from.

        Args:
            document: Mapping with a "source" key and an optional "type" key
                (falls back to "google_doc").

        Returns:
            A `Result` whose text is headline, summary and original text
            separated by blank lines.
        """
        sections = (self.headline, self.summary, self.original_text)
        return Result(
            page_content="\n\n".join(sections),
            metadata={
                "source": document["source"],
                "type": document.get("type", "google_doc"),
            },
        )


# Top-level structured-output schema for the chunking call: a list of Chunk
# objects. Passed as `response_format` in process_document.
class Chunks(BaseModel):
    chunks: list[Chunk]

In [4]:
import requests

def fetch_google_doc_text(doc_url: str, timeout: float = 30.0) -> str:
    """Download a Google Doc as plain text through its export endpoint.

    Args:
        doc_url: A Google Docs URL containing ``/d/<document id>``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The exported document text.

    Raises:
        ValueError: If no document id can be extracted from ``doc_url``.
        requests.HTTPError: If the export request returns an error status.
    """
    # The document id is whatever sits between "/d/" and the next "/".
    _, marker, remainder = doc_url.partition("/d/")
    doc_id = remainder.split("/", 1)[0]
    if not marker or not doc_id:
        raise ValueError(f"Not a Google Docs document URL: {doc_url!r}")

    export_url = f"https://docs.google.com/document/d/{doc_id}/export?format=txt"
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(export_url, timeout=timeout)
    response.raise_for_status()
    return response.text

In [5]:
def fetch_documents():
    """Download every URL in GOOGLE_DOC_URLS and wrap each as a document record.

    Returns:
        A list of dicts with the keys "type", "source" (the URL) and "text".
    """
    documents = [
        {"type": "google_doc", "source": url, "text": fetch_google_doc_text(url)}
        for url in GOOGLE_DOC_URLS
    ]
    print(f"Loaded {len(documents)} documents")
    return documents

In [6]:
# Download all source documents once; the chunking cells below read this list.
documents = fetch_documents()

Loaded 5 documents


In [7]:
def make_prompt(document):
    """Build the chunking prompt for a single document.

    The suggested chunk count is the text length in characters integer-divided
    by AVERAGE_CHUNK_SIZE, plus one.

    Args:
        document: Mapping with a "text" key; "type" and "source" are optional
            and fall back to "google_doc" and "unknown" respectively.

    Returns:
        The prompt string: instructions followed by the full document text.
    """
    how_many = (len(document["text"]) // AVERAGE_CHUNK_SIZE) + 1
    return f"""
You take a document and split it into overlapping chunks for a personal programming knowledge base.

The document type: {document.get("type", "google_doc")}
The document source: {document.get("source", "unknown")}

These chunks will be used by a chatbot to answer the user's programming questions (Python, LLMs, APIs, tooling, debugging, etc.).
Split the document in a way that supports retrieval: keep topics coherent, preserve code blocks, and avoid breaking explanations mid-thought.
Make sure the entire document is covered by the chunks — do not omit anything.
This document should probably be split into {how_many} chunks, but you can have more or less as appropriate.
Include overlap between chunks (typically ~25% overlap or ~50 words), so key context appears in multiple chunks.

For each chunk, provide:
- headline: a short, query-friendly title (a few words)
- summary: a few sentences summarizing what this chunk helps with (focus on likely questions)
- original_text: the exact original text of the chunk (do not rewrite or alter it)

Return your answer as JSON in this exact format:
{{"chunks":[{{"headline":"...","summary":"...","original_text":"..."}}, ...]}}

Here is the document:

{document["text"]}
"""

In [8]:
# Sanity check: show the prompt generated for the first document.
print(make_prompt(documents[0]))


You take a document and split it into overlapping chunks for a personal programming knowledge base.

The document type: google_doc
The document source: https://docs.google.com/document/d/1DKs6IcvDa3xeE-E0DTPUwAIAq3VHBZm7vv4b-rNe1uk/edit?usp=sharing

These chunks will be used by a chatbot to answer the user's programming questions (Python, LLMs, APIs, tooling, debugging, etc.).
Split the document in a way that supports retrieval: keep topics coherent, preserve code blocks, and avoid breaking explanations mid-thought.
Make sure the entire document is covered by the chunks — do not omit anything.
This document should probably be split into 71 chunks, but you can have more or less as appropriate.
Include overlap between chunks (typically ~25% overlap or ~50 words), so key context appears in multiple chunks.

For each chunk, provide:
- headline: a short, query-friendly title (a few words)
- summary: a few sentences summarizing what this chunk helps with (focus on likely questions)
- origin

In [9]:
def make_messages(document):
    """Wrap the chunking prompt for `document` in a single-turn chat payload."""
    user_message = {"role": "user", "content": make_prompt(document)}
    return [user_message]

In [10]:
# Sanity check: inspect the chat payload built for the first document.
make_messages(documents[0])

[{'role': 'user',
  'content': '\nYou take a document and split it into overlapping chunks for a personal programming knowledge base.\n\nThe document type: google_doc\nThe document source: https://docs.google.com/document/d/1DKs6IcvDa3xeE-E0DTPUwAIAq3VHBZm7vv4b-rNe1uk/edit?usp=sharing\n\nThese chunks will be used by a chatbot to answer the user\'s programming questions (Python, LLMs, APIs, tooling, debugging, etc.).\nSplit the document in a way that supports retrieval: keep topics coherent, preserve code blocks, and avoid breaking explanations mid-thought.\nMake sure the entire document is covered by the chunks — do not omit anything.\nThis document should probably be split into 71 chunks, but you can have more or less as appropriate.\nInclude overlap between chunks (typically ~25% overlap or ~50 words), so key context appears in multiple chunks.\n\nFor each chunk, provide:\n- headline: a short, query-friendly title (a few words)\n- summary: a few sentences summarizing what this chunk 

In [11]:
def process_document(document):
    """Ask the chat model to split one document into chunks.

    Args:
        document: Document record as produced by fetch_documents.

    Returns:
        A list of `Result` objects, one per chunk the model returned.
    """
    # NOTE(review): in the recorded run the first document came back as a
    # single chunk although its prompt suggested 71, so full coverage of the
    # source text is not guaranteed by this call.
    response = completion(
        model=MODEL,
        messages=make_messages(document),
        response_format=Chunks,
    )
    raw_json = response.choices[0].message.content
    parsed = Chunks.model_validate_json(raw_json)
    return [piece.as_result(document) for piece in parsed.chunks]

In [12]:
# Trial run on the first document only (this issues a chat-completion request).
process_document(documents[0])

[Result(page_content='Materials and Resources\n\nProvides links to materials, communities, and resources for learning programming with Python and related tools.\n\nKarta – 1\nMateriały\nhttps://discord.com/invite/FYCHf64Xuc\nhttps://zerotomastery.io/\nhttps://www.youtube.com/@ZeroToMastery\nhttps://www.udemy.com/course/complete-python-developer-zero-to-mastery/learn/lecture/16337512#overview\n\nZadania z Pythona: https://github.com/darkprinx/break-the-ice-with-python?tab=readme-ov-file', metadata={'source': 'https://docs.google.com/document/d/1DKs6IcvDa3xeE-E0DTPUwAIAq3VHBZm7vv4b-rNe1uk/edit?usp=sharing', 'type': 'google_doc'})]

In [13]:
def create_chunks(documents):
    """Chunk every document in turn and return all results as one flat list.

    Progress over the documents is reported with tqdm.
    """
    return [
        result
        for doc in tqdm(documents)
        for result in process_document(doc)
    ]

In [14]:
# Chunk all fetched documents; one chat-completion request per document.
chunks = create_chunks(documents)

100%|██████████| 5/5 [00:51<00:00, 10.24s/it]


In [15]:
# Total number of chunks produced across all documents.
print(len(chunks))

17


In [16]:
def create_embeddings(chunks, batch_size=100):
    """Embed all chunks and rebuild the Chroma collection from scratch.

    Args:
        chunks: `Result` objects; their `page_content` is embedded and stored
            together with their `metadata`.
        batch_size: Maximum number of texts sent in one embeddings request.

    Returns:
        None. Prints the number of stored entries, or a notice when there was
        nothing to store.
    """
    if not chunks:
        # Nothing to store: keep whatever collection already exists.
        print("No chunks to embed; vectorstore left unchanged")
        return

    texts = [chunk.page_content for chunk in chunks]

    # Embed BEFORE touching the store, so a failed API call cannot leave the
    # old collection deleted with nothing to replace it.
    vectors = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        response = openai.embeddings.create(model=embedding_model, input=batch)
        vectors.extend(item.embedding for item in response.data)

    chroma = PersistentClient(path=DB_NAME)
    if collection_name in [c.name for c in chroma.list_collections()]:
        chroma.delete_collection(collection_name)
    collection = chroma.get_or_create_collection(collection_name)

    # Ids are the chunk positions rendered as strings.
    ids = [str(i) for i in range(len(chunks))]
    metas = [chunk.metadata for chunk in chunks]

    collection.add(ids=ids, embeddings=vectors, documents=texts, metadatas=metas)
    print(f"Vectorstore created with {collection.count()} documents")

In [17]:
# Embed the chunks and (re)build the persisted collection.
create_embeddings(chunks)

Vectorstore created with 17 documents


In [18]:
# Re-open the persisted store and read everything back for inspection.
# NOTE(review): `documents` is rebound here from the list of fetched document
# dicts to the list of stored chunk texts, so re-running the earlier chunking
# cells after this one would operate on the wrong data.
chroma = PersistentClient(path=DB_NAME)
collection = chroma.get_or_create_collection(collection_name)

result = collection.get(include=["embeddings", "documents", "metadatas"])

metadatas = result["metadatas"]
documents = result["documents"]
vectors = np.array(result["embeddings"])

# One colour per source document.
sources = [meta.get("source", "unknown") for meta in metadatas]
unique_sources = sorted(set(sources))

palette = [
    "blue", "green", "red", "orange", "purple",
    "brown", "pink", "gray", "olive", "cyan",
]

# Colours wrap around when there are more sources than palette entries.
SOURCE_TO_COLOR = {
    name: palette[position % len(palette)]
    for position, name in enumerate(unique_sources)
}

colors = [SOURCE_TO_COLOR[name] for name in sources]

In [20]:
# Perplexity must be < n_samples; cap it at 30 for larger stores.
# NOTE(review): the lower bound of 2 is not below n_samples when n <= 2.
n = vectors.shape[0]
perplexity = max(2, min(30, n - 1))

# Project the embeddings to 2D with a fixed seed.
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
reduced_vectors = tsne.fit_transform(vectors)

# One marker per stored chunk, coloured by source; hover shows the source and
# the first 200 characters of the chunk text.
fig = go.Figure(
    data=[
        go.Scatter(
            x=reduced_vectors[:, 0],
            y=reduced_vectors[:, 1],
            mode="markers",
            marker=dict(size=5, color=colors, opacity=0.8),
            text=[f"Source: {s}<br>Text: {d[:200]}..." for s, d in zip(sources, documents)],
            hoverinfo="text",
        )
    ]
)

fig.update_layout(
    title="2D Chroma Vector Store Visualization",
    xaxis_title="x",
    yaxis_title="y",
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40),
)

fig.show()

In [21]:
# Structured-output schema for the reranking call: the chunk ids in the
# model's preferred order. Passed as `response_format` in rerank.
class RankOrder(BaseModel):
    # 1-based chunk ids as numbered in the rerank prompt.
    order: list[int] = Field(
        description="The order of relevance of chunks, from most relevant to least relevant, by chunk id number"
    )

In [57]:
# Number of nearest neighbours requested from the vector store. Read at call
# time, so rebinding it later changes the behaviour of subsequent queries.
RETRIEVAL_K = 10

def fetch_context_unranked(question):
    """Return the nearest stored chunks for `question`, in retrieval order.

    Args:
        question: Query text to embed and search with.

    Returns:
        A list of `Result` objects (at most RETRIEVAL_K). Each result's
        metadata is a copy of the stored metadata plus a "distance" entry.
        Returns an empty list when the collection is empty.
    """
    # Never ask for more results than the collection holds. An empty
    # collection has nothing to return, so skip the embedding request too.
    k = min(RETRIEVAL_K, collection.count())
    if k <= 0:
        return []

    query_vec = openai.embeddings.create(
        model=embedding_model,
        input=[question]
    ).data[0].embedding

    results = collection.query(
        query_embeddings=[query_vec],
        n_results=k,
        include=["documents", "metadatas", "distances"]
    )

    docs = (results.get("documents") or [[]])[0]
    # Pad missing metadata/distances to the length of docs; otherwise zip()
    # below would silently drop every document.
    metas = (results.get("metadatas") or [[None] * len(docs)])[0]
    dists = (results.get("distances") or [[None] * len(docs)])[0]

    chunks = []
    for doc, meta, dist in zip(docs, metas, dists):
        if not doc:  # skip None / empty string
            continue
        meta = dict(meta or {})  # copy so the query result is not mutated
        meta["distance"] = dist
        chunks.append(Result(page_content=doc, metadata=meta))

    return chunks

In [58]:
def _normalize_order(order, n):
    """Return a permutation of 1..n: valid ids from `order` first, then the rest."""
    seen = set()
    cleaned = []
    for chunk_id in order:
        # Keep only in-range integer ids, first occurrence wins.
        if isinstance(chunk_id, int) and 1 <= chunk_id <= n and chunk_id not in seen:
            seen.add(chunk_id)
            cleaned.append(chunk_id)
    # Append any ids the model left out, in their original order.
    cleaned.extend(i for i in range(1, n + 1) if i not in seen)
    return cleaned


def rerank(question, chunks, verbose=True):
    """Reorder retrieved chunks by relevance using the chat model.

    Args:
        question: The query the chunks were retrieved for.
        chunks: Retrieved `Result` objects in retrieval order.
        verbose: When True, print the chunk count and the final order.

    Returns:
        A new list containing every input chunk exactly once, most relevant
        first. An empty input returns an empty list without calling the model.
    """
    n = len(chunks)
    if n == 0:
        # Nothing to rank; avoid a pointless model call.
        return []

    system_prompt = """
You are a document re-ranker.
You are provided with a question and a list of relevant chunks of text from a query of a knowledge base.
The chunks are provided in the order they were retrieved; this should be approximately ordered by relevance, but you may be able to improve on that.
You must rank order the provided chunks by relevance to the question, with the most relevant chunk first.
Reply only as JSON in this exact format: {"order":[...]}.
Include all the chunk ids you are provided with, reranked.
"""

    user_prompt = (
        f"The user has asked the following question:\n\n{question}\n\n"
        "Order all the chunks of text by relevance to the question, from most relevant to least relevant. "
        "Include all the chunk ids you are provided with, reranked.\n\n"
        f"Valid chunk ids are integers from 1 to {n}.\n\n"
        "Here are the chunks:\n\n"
    )

    # Chunk ids in the prompt are 1-based.
    for index, chunk in enumerate(chunks):
        user_prompt += f"# CHUNK ID: {index + 1}:\n\n{chunk.page_content}\n\n"

    user_prompt += 'Reply only as JSON in this exact format: {"order":[...]}'

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    response = completion(model=MODEL, messages=messages, response_format=RankOrder)
    reply = response.choices[0].message.content

    # The model may return invalid, duplicate or missing ids; repair the list
    # so that it is always a full permutation of 1..n.
    order = _normalize_order(RankOrder.model_validate_json(reply).order, n)

    if verbose:
        print("len(chunks) =", n)
        print("order =", order)
        print("min/max =", min(order), max(order))

    return [chunks[i - 1] for i in order]

In [59]:
# Retrieve candidates for a sample question (Polish: "What are dunder methods").
# NOTE(review): this rebinds `chunks`, which previously held the chunks
# produced by create_chunks.
question = "Co to są dunder methods"
chunks = fetch_context_unranked(question)

In [60]:
# Show the first 15 characters of each retrieved chunk, in retrieval order.
for chunk in chunks:
    preview = chunk.page_content[:15]
    print(preview + "...")

Dekorator (Deco...
Singleton

Prze...
Obserwator

Opi...
Builder

Wzorze...
Komendy podstaw...
Developer Envir...
Pyłek (Flyweigh...
Setting Up Envi...
Adapter (Adapte...
Model Size Expl...


In [61]:
# Rerank the retrieved chunks with the chat model.
reranked = rerank(question, chunks)

len(chunks) = 10
order = [9, 1, 4, 7, 3, 2, 5, 6, 8, 10]
min/max = 1 10


In [62]:
# Show the first 15 characters of each chunk after reranking.
for chunk in reranked:
    preview = chunk.page_content[:15]
    print(preview + "...")

Adapter (Adapte...
Dekorator (Deco...
Builder

Wzorze...
Pyłek (Flyweigh...
Obserwator

Opi...
Singleton

Prze...
Komendy podstaw...
Developer Envir...
Setting Up Envi...
Model Size Expl...


In [63]:
# Second sample question (Polish: "What does a void function do?").
# NOTE(review): rebinding RETRIEVAL_K changes the global read by every later
# call to fetch_context_unranked, including calls made via answer_question.
question = "Co robi funkcja void?"
RETRIEVAL_K = 20
chunks = fetch_context_unranked(question)

# Print the positions of retrieved chunks that mention "void".
for index, c in enumerate(chunks):
    if "void" not in c.page_content.lower():
        continue
    print(index)

0
5
6
7
8


In [64]:
# Rerank the candidates for the "void" question.
reranked = rerank(question, chunks)

len(chunks) = 17
order = [7, 9, 1, 2, 3, 4, 5, 6, 8, 10, 11, 12, 13, 14, 15, 16, 17]
min/max = 1 17


In [65]:
# Print the positions of chunks that mention "void" after reranking.
for index, c in enumerate(reranked):
    if "void" not in c.page_content.lower():
        continue
    print(index)

0
1
2
7
8


In [66]:
# Inspect the text of the chunk the reranker placed first.
reranked[0].page_content

'Pyłek (Flyweight)\n\nWzorzec Pyłek optymalizuje pamięć, współdzieląc wspólne części obiektów (np. kolory), zwłaszcza przy dużej liczbie podobnych instancji, poprzez odróżnianie danych wewnętrznych i zewnętrznych.\n\nTo wzorzec projektowy z grupy strukturalnych, który służy do oszczędzania pamięci poprzez współdzielenie wspólnych części obiektów zamiast tworzenia ich kopii.\n________________\n\n📌 Idea:\n                                    * Jeśli aplikacja potrzebuje wielu podobnych obiektów, zamiast powielać te same dane w każdym obiekcie, można je wydzielić do współdzielonego obiektu (intrinsic state).\n\n                                    * Dane unikalne (extrinsic state) przekazuje się tylko w momencie użycia.\n\n________________\n\nElementy:\n                                       * Flyweight (Pyłek) – wspólny interfejs dla obiektów lekkich.\n\n                                       * ConcreteFlyweight – implementacja, która przechowuje stan wspólny (dzielony).\n\n               

In [67]:
def fetch_context(question):
    """Retrieve candidate chunks for `question` and return them reranked."""
    return rerank(question, fetch_context_unranked(question))

In [68]:
# System prompt template for the answering call. The single `{context}`
# placeholder is filled by make_rag_messages via str.format.
SYSTEM_PROMPT = """
You are a knowledgeable, helpful programming assistant.
You are chatting with a user who is asking questions about programming concepts, languages, tools, and software development.

Your answers will be evaluated for accuracy, relevance, and completeness.
Answer the user's question using ONLY the information provided in the context below.
Do not invent information or rely on outside knowledge.
If the context does not contain enough information to answer the question, say so clearly.

For context, here are specific extracts from the personal programming knowledge base that may be relevant to the user's question:
{context}

Using only this context, please answer the user's question.
Be accurate, relevant, and complete.
"""

In [69]:

def make_rag_messages(question, history, chunks):
    """Assemble the chat messages for the answering call.

    Args:
        question: The user's current question.
        history: Earlier chat messages (role/content dicts); None is treated
            as an empty history.
        chunks: Retrieved `Result` objects to expose as context.

    Returns:
        A list of messages: the system prompt with the context filled in,
        the history, then the user's question.
    """
    # Metadata built from query results is not guaranteed to carry a
    # "source" key, so fall back to "unknown" instead of raising KeyError.
    context = "\n\n".join(
        f"Extract from {chunk.metadata.get('source', 'unknown')}:\n{chunk.page_content}"
        for chunk in chunks
    )
    system_prompt = SYSTEM_PROMPT.format(context=context)
    return [
        {"role": "system", "content": system_prompt},
        *(history or []),
        {"role": "user", "content": question},
    ]

In [70]:
def rewrite_query(question, history=None):
    """Rewrite the user's question into a short, specific search query.

    Args:
        question: The user's current question.
        history: Earlier chat messages; interpolated into the prompt as-is.
            None is treated as an empty history.

    Returns:
        The model's rewritten query with surrounding whitespace removed, or
        the original question when the model returns no usable text.
    """
    if history is None:
        history = []

    message = f"""
You are assisting a user by rewriting their question into a concise search query
that will be used to look up information in a personal programming knowledge base.

This is the history of the conversation so far:
{history}

This is the user's current question:
{question}

Respond only with a single, refined, very short query that is most likely
to surface relevant programming-related content (concepts, APIs, syntax, behavior).
Do not include explanations or extra text.
IMPORTANT: Respond ONLY with the search query, nothing else.
"""

    response = completion(
        model=MODEL,
        messages=[{"role": "system", "content": message}]
    )
    # The message content can be None or blank; searching with an empty query
    # is useless, so fall back to the question the user actually asked.
    rewritten = (response.choices[0].message.content or "").strip()
    return rewritten or question

In [71]:
# Trial run of the query rewriter (Polish: "What does a void function do?").
rewrite_query("Co robi funkcja void?", [])

'void function purpose'

In [72]:
from typing import Optional

def answer_question(question: str, history: Optional[list[dict]] = None) -> tuple[str, list]:
    """Answer a question with retrieval-augmented generation.

    Args:
        question: The user's question.
        history: Earlier chat messages; None means no history.

    Returns:
        A tuple of (answer text, the reranked chunks used as context).
    """
    history = [] if history is None else history

    # Retrieval and reranking use the rewritten query; the answering prompt
    # keeps the user's original wording.
    query = rewrite_query(question, history)
    print(query)
    chunks = fetch_context(query)

    response = completion(
        model=MODEL,
        messages=make_rag_messages(question, history, chunks),
    )
    answer = response.choices[0].message.content
    return answer, chunks

In [73]:
# End-to-end run (Polish: "What does a void function do?").
# NOTE(review): the recorded answer states that the context lacks the
# information and then answers anyway, despite the "ONLY the context" prompt.
answer_question("Co robi funkcja void?", [])

void functions in programming
len(chunks) = 17
order = [7, 9, 14, 8, 10, 13, 15, 16, 17, 1, 2, 3, 4, 5, 6, 11, 12]
min/max = 1 17


('W przedstawionym kontekście nie ma bezpośrednio informacji na temat funkcji void, ale w języku C++ funkcja zadeklarowana jako void oznacza, że nie zwraca żadnej wartości. Taka funkcja wykonuje określone działania, ale po jej wywołaniu nie przekazuje żadnego wyniku z powrotem do miejsca, z którego została wywołana. W kodzie funkcje void często służą do wykonywania operacji, takich jak modyfikacja danych, wyświetlanie informacji lub inne działania, które nie wymagają zwracania wartości do dalszego użycia.',
 [Result(page_content='HTML text formatting tags\n\nThis chunk discusses various HTML tags used for emphasizing, bolding, underlining, and structuring text, along with best practices for their usage.\n\n-emfaza:\nSpecyficznym wyróżnieniem tekstu są znaczniki <em></em>, definiujące tzw. emfazę (ang. emphasis – nacisk w wymowie, emfaza) Kiedy kładziemy na jakieś słowa emfazę wypowiadając się, to znaczy to, iż celowo wypowiadamy te słowa ze zmienioną intonacją, gestykulacją, czy natęże

In [56]:
# End-to-end run (Polish: "What are dictionaries in Python?").
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours, so the recorded output may come from an earlier kernel state.
answer_question("Co to są dictionary w pythonie?", [])

Python dictionaries
len(chunks) = 17
order = [1, 2, 4, 5, 7, 8, 9, 11, 6, 10, 12, 13, 14, 15, 16, 17, 3]
min/max = 1 17


('W podanym kontekście nie ma bezpośredniej definicji słowa "dictionary" w Pythonie. Jednak na podstawie ogólnej znajomości programowania i tego, co zostało zawarte w materiałach, można wyjaśnić, że:\n\nDictionary w Pythonie to struktura danych, która przechowuje pary klucz-wartość. Umożliwia szybkie wyszukiwanie wartości na podstawie klucza. Dictionary jest podobny do słownika w języku naturalnym, gdzie korzystasz z słowa jako klucza i otrzymujesz definicję (wartość).\n\nPrzykład w Pythonie:\n```python\nmy_dict = {\n    "imie": "Jan",\n    "wiek": 30,\n    "miasto": "Warszawa"\n}\n```\n\nW tym przypadku:\n- Klucze to stringi: "imie", "wiek", "miasto"\n- Wartości to odpowiednio: "Jan", 30, "Warszawa"\n\nDictionary pozwala na dostęp do wartości po kluczu, np.:\n```python\nprint(my_dict["imie"])  # Wyświetli "Jan"\n```\n\nPodsumowując, w Pythonie dictionary to elastyczna, szybka struktura danych do przechowywania powiązań między kluczami a wartościami.',
 [Result(page_content='Configurin

In [74]:
# End-to-end run (Polish: "In which document will I find information about
# HTML forms").
answer_question("W którym dokumencie znajdę informacje na temat formularzy html", [])

html form documentation
len(chunks) = 17
order = [1, 2, 3, 16, 17, 13, 12, 11, 9, 10, 8, 7, 6, 5, 4, 14, 15]
min/max = 1 17


('Na podstawie dostępnego kontekstu nie ma bezpośredniej wzmianki o dokumentach zawierających informacje na temat formularzy HTML. Poniżej przedstawiono, czego można się dowiedzieć:\n\n- Dokument opisuje podstawy HTML, struktury tags, znaczniki nagłówków, paragrafów, tekstów, wzorców projektowych, takich jak Singleton, Adapter, Obserwator, Dekorator, Builder, Factory, Flyweight, a także zagadnienia związane z ustawieniem środowiska i API, oraz techniki ingestowania i dzielenia tekstów na fragmenty.\n\n- Nie ma w nim szczegółowych informacji o formularzach HTML (np. <form>, <input>, <select>).\n\nJeśli szukasz informacji na temat formularzy HTML, w podanym kontekście ich bezpośrednio nie znajdziesz.',
 [Result(page_content='Introduction to HTML files\n\nThis chunk explains the role of different HTML tags and elements, such as headings, paragraphs, and semantic tags, essential for structuring web content.\n\nHTML\nWprowadzenie\n\n  * HyperText to łącza, odnośniki do innych stron\n* Marku