In [52]:
# Get Data from documents

import requests
from openai.types.responses import ResponseFunctionToolCall

# Download the course FAQ export and flatten it into one list of FAQ entries.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of failing later
# while trying to parse an error page as JSON.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Every raw record carries a 'course' name and a list under 'documents'.
# Each entry is copied with its course attached, so `documents_raw` stays
# untouched and re-running this cell always yields the same result.
documents = [
    {**element, 'course': record['course']}
    for record in documents_raw
    for element in record['documents']
]

In [53]:
# Create an appendable index (its advantage is that documents can still be added after the initial fit)
from minsearch import AppendableIndex


# text fields are tokenized and put in an inverted index.
# An overview of how the index works is given below.
"""
🧩Text Fields and  Inverted Index — Complete Side-by-Side Diagram
(for text_fields=["question", "text", "section"])

Example Documents:

Doc 0:
    question: "What is deep learning?"
    text:     "Deep learning is a subset of machine learning."
    section:  "AI Basics"

Doc 1:
    question: "How to use Python?"
    text:     "Python is widely used for data analysis."
    section:  "Programming"

Doc 2:
    question: "What is machine learning?"
    text:     "Machine learning involves algorithms that learn from data."
    section:  "AI Basics"

-------------------------------------------------------------------------------
Tokenization (stop words removed, lowercase)

Doc 0 tokens:
    question → ["deep", "learning"]
    text     → ["deep", "learning", "subset", "machine", "learning"]
    section  → ["ai", "basics"]

Doc 1 tokens:
    question → ["python", "use"]
    text     → ["python", "widely", "used", "data", "analysis"]
    section  → ["programming"]

Doc 2 tokens:
    question → ["machine", "learning"]
    text     → ["machine", "learning", "involves", "algorithms", "learn", "data"]
    section  → ["ai", "basics"]

-------------------------------------------------------------------------------
Inverted Index — Side-by-Side View

+----------------+---------------------------+------------------+
| question field | text field                | section field    |
+----------------+---------------------------+------------------+
| deep     → [0] | deep       → [0]          | ai          → [0,2] |
| learning → [0,2]| learning   → [0,2]       | basics      → [0,2] |
| python   → [1] | subset     → [0]          | programming → [1]   |
| use      → [1] | machine    → [0,2]        |                  |
| machine  → [2] | python     → [1]          |                  |
|                | widely     → [1]          |                  |
|                | used       → [1]          |                  |
|                | data       → [1,2]        |                  |
|                | analysis   → [1]          |                  |
|                | involves   → [2]          |                  |
|                | algorithms → [2]          |                  |
|                | learn      → [2]          |                  |
+----------------+---------------------------+------------------+

-------------------------------------------------------------------------------
Example Search:

Query: "deep learning in AI" → ["deep", "learning", "ai"]

Lookup in inverted index:

question field:
    "deep" → [0]
    "learning" → [0,2]
    "ai" → []

text field:
    "deep" → [0]
    "learning" → [0,2]
    "ai" → []

section field:
    "deep" → []
    "learning" → []
    "ai" → [0,2]

Candidate documents = union of all matches → [0,2]

Rank using TF-IDF across fields.

-------------------------------------------------------------------------------
Summary:

This diagram shows:

1. Original documents
2. Tokens extracted per text field
3. Inverted index mapping tokens → document IDs

It demonstrates how `AppendableIndex` efficiently finds candidate documents
without scanning all documents.

"""
"""
🧩 Keyword Fields — Side-by-Side Diagram
(for keyword_fields=["category", "author", "year"])

Example Documents:

Doc 0:
    category: "AI"
    author:   "Andrew Ng"
    year:     2023

Doc 1:
    category: "Programming"
    author:   "Guido van Rossum"
    year:     2022

Doc 2:
    category: "AI"
    author:   "Geoff Hinton"
    year:     2023

-------------------------------------------------------------------------------
Keyword Field Mapping (Value → Document IDs)

+--------------+---------------------+------------+
| category     | author              | year       |
+--------------+---------------------+------------+
| AI           → [0,2]   | Andrew Ng        → [0] | 2023 → [0,2] |
| Programming  → [1]     | Guido van Rossum  → [1] | 2022 → [1]   |
|              | Geoff Hinton      → [2] |            |
+--------------+---------------------+------------+

-------------------------------------------------------------------------------
Example Search:

Query: "learning" with filters {"category": "AI", "year": 2023}

1. Text search may match Docs 0, 1, 2 based on TF-IDF.
2. Apply keyword filters:
    - category = "AI" → keep Docs [0,2]
    - year = 2023      → keep Docs [0,2]
3. Result after filtering → [Doc 0, Doc 2]
4. Rank using TF-IDF across text fields.

-------------------------------------------------------------------------------
Summary:

- Keyword fields map exact values → document IDs.
- Enable filtering, faceted search, and grouping.
- Complement text fields, which are scored for relevance.
"""


# Build the index: question/text/section are registered as text fields,
# while "course" is registered as a keyword field (used later in filter_dict).
index = AppendableIndex(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

# Index the flattened FAQ documents. This is the last expression of the cell,
# so the value returned by fit() is what the notebook displays.
index.fit(documents)

# Further documents can be added later with index.append(doc):
# index.append(XXXX)


<minsearch.append.AppendableIndex at 0x1135c5c40>

In [54]:
# Create the search function that queries the search index created above
"""
🧩 TF-IDF + Boost Explanation in AppendableIndex

TF-IDF = Term Frequency × Inverse Document Frequency
It measures how important a word is in a document relative to the entire corpus.

-------------------------------------------------------------------------------
1️⃣ Term Frequency (TF)
- Counts how often a term appears in a document.
- Sublinear scaling: TF = 1 + log(count), if count > 0
- Example:
    Doc 0, text: "Deep learning learning learning"
    Count of "learning" = 3
    TF("learning") = 1 + log(3) ≈ 2.10

2️⃣ Inverse Document Frequency (IDF)
- Measures how rare a term is across all documents.
- Formula: IDF = log((N + 1) / (DF + 1)) + 1
    - N = total number of documents
    - DF = number of documents containing the term
- Rare terms get higher IDF.

Example:
    Corpus of 3 documents, 2 of which contain "learning" in the text field:
        N = 3, DF("learning") = 2
        IDF("learning") = log((3+1)/(2+1)) + 1 ≈ 1.29

3️⃣ TF-IDF
- Multiply TF × IDF for each token in a document.
- L2 normalize vectors to compare similarity with cosine similarity.

Example Calculation for Doc 0:
    - Token "learning": TF = 2.10, IDF = 1.29
    - TF-IDF("learning") = 2.10 * 1.29 ≈ 2.71

-------------------------------------------------------------------------------
4️⃣ Field Boosts

- `boost_dict` in search allows weighting specific text fields.
- Each field score is multiplied by its boost before combining.

Example:

Text fields: ["question", "text", "section"]
Boosts: {"question": 2.0, "text": 1.0}  # section uses default 1.0

Doc 0 raw TF-IDF scores for a query:
+-----------+-----------+----------------+
| Field     | Raw Score | Boosted Score  |
+-----------+-----------+----------------+
| question  | 0.5       | 0.5 * 2.0 = 1.0|
| text      | 0.3       | 0.3 * 1.0 = 0.3|
| section   | 0.2       | 0.2 * 1.0 = 0.2|
+-----------+-----------+----------------+

Total Score = 1.0 + 0.3 + 0.2 = 1.5
- Boosting "question" doubles its impact on ranking.

-------------------------------------------------------------------------------
5️⃣ How Search Works with TF-IDF + Boost

1. Tokenize query and documents.
2. Compute TF-IDF vectors for query and each document (per text field).
3. Apply L2 normalization.
4. Calculate cosine similarity between query vector and document vectors.
5. Multiply each field score by its boost (from `boost_dict`).
6. Combine scores across fields for final ranking.
7. Optionally, apply keyword filters to remove non-matching documents.

-------------------------------------------------------------------------------
Summary:

- TF-IDF ranks documents based on query relevance.
- Rare and query-specific terms get higher scores.
- Field boosts allow tuning importance of specific fields.
- Keyword fields filter results without affecting TF-IDF scoring.
"""

def search(query):
    """
    Query the module-level ``index`` for FAQ entries matching ``query``.

    The lookup is restricted to the 'data-engineering-zoomcamp' course through
    the ``course`` keyword field, passes a boost of 3.0 for ``question`` and
    0.5 for ``section``, and asks the index for 15 results.

    Args:
        query: Free-text search query.

    Returns:
        Whatever ``index.search`` returns for these settings.
    """
    return index.search(
        query=query,
        # 'course' can be filtered on because it was registered as a keyword field.
        filter_dict=dict(course='data-engineering-zoomcamp'),
        boost_dict=dict(question=3.0, section=0.5),
        num_results=15,
    )

In [37]:
# Sanity check: count the results for a sample query (search() asks the index for num_results=15).
len(search("how do I install kafka"))

15

In [55]:
# Define the schema for the search tool.
# The LLM relies on the description and the parameter properties below when it
# decides whether to call a tool and which arguments to pass.
search_tool = {
    "type": "function",
    "name": "search",
    "description": "Search the FAQ database",
    "parameters": {
        "type": "object",
        "properties": {
            "query": {  # the single argument; same name as the parameter of search()
                "type": "string",
                "description": "Search query text to look up in the course FAQ."
            }
        },
        "required": ["query"],
        "additionalProperties": False
    }
}


In [56]:
!uv add toyaikit

[2mResolved [1m155 packages[0m [2min 11ms[0m[0m
[2mAudited [1m136 packages[0m [2min 0.29ms[0m[0m


In [57]:
from toyaikit.llm import OpenAIClient
from toyaikit.chat import IPythonChatInterface
from toyaikit.chat.runners import OpenAIResponsesRunner
from toyaikit.chat.runners import DisplayingRunnerCallback
from toyaikit.tools import Tools

In [41]:
# System prompt for the agent; it is handed to the runner as `developer_prompt`.
# .strip() removes the leading/trailing newlines introduced by the triple-quoted layout.
instructions = """
You're a course teaching assistant.
You're given a question from a course student and your task is to answer it.

If you want to look up the answer, explain why before making the call.
Display the responses returned by the tool
""".strip()

In [42]:
# Use the toyaikit Tools helper to pair a tool function with its schema.
agent_tools = Tools()
# Register search() together with the hand-written search_tool schema.
agent_tools.add_tool(search,search_tool)

In [43]:
# Create a chat

# Interface used to show chat messages, function calls and reasoning in the notebook.
chat_interface = IPythonChatInterface()

# A runner exposes two loops: the agentic loop (`loop`) and the interactive chat loop (`run`).

runner = OpenAIResponsesRunner(
    tools=agent_tools,
    developer_prompt=instructions,
    chat_interface=chat_interface,
    llm_client=OpenAIClient()
)

In [44]:
# Callback bound to the same chat interface that the runner uses.
callback = DisplayingRunnerCallback(chat_interface)

question = 'how to fix Docker error with kafka'
# Run only the agentic loop for a single prompt (no interactive chat) to test it in isolation.
loop_result = runner.loop(prompt=question, callback=callback)

In [45]:
import json

def prettyprint_json(obj):
    """
    Pretty-print a nested object (like a LoopResult) as indented JSON.

    Containers are walked recursively, and any other object exposing
    ``__dict__`` is converted to a dict of its attributes. Leaves that ``json``
    still cannot encode are rendered with ``str()`` instead of raising
    ``TypeError``, so one odd attribute does not abort the whole print.

    Args:
        obj: The object to print.

    Returns:
        None. The JSON text is written to stdout.
    """
    def to_serializable(o):
        # Containers are checked first: dict/list subclasses also expose
        # ``__dict__``, and going through vars() would drop their contents.
        if isinstance(o, dict):
            return {k: to_serializable(v) for k, v in o.items()}
        # Tuples and sets are walked too so that objects nested inside them
        # get converted; JSON has no tuple/set type, so they become arrays.
        if isinstance(o, (list, tuple, set, frozenset)):
            return [to_serializable(i) for i in o]
        # Dataclass-like or custom objects: use their attribute dict.
        if hasattr(o, "__dict__"):
            return {k: to_serializable(v) for k, v in vars(o).items()}
        return o

    # default=str is only consulted for values json cannot encode natively.
    print(json.dumps(to_serializable(obj), indent=2, ensure_ascii=False, default=str))

In [46]:
import json
# Dump the result of the single agentic-loop run as indented JSON.
prettyprint_json(loop_result)

{
  "new_messages": [
    {
      "role": "developer",
      "content": "You're a course teaching assistant.\nYou're given a question from a course student and your task is to answer it.\n\nIf you want to look up the answer, explain why before making the call.\nDisplay the responses returned by the tool"
    },
    {
      "role": "user",
      "content": "how to fix Docker error with kafka"
    },
    {
      "id": "msg_0fc17b85f430f4eb0068fe36ace560819597b8ed9c18245b74",
      "content": [
        {
          "annotations": [],
          "text": "To address your issue effectively, I'll look up common solutions for Docker errors related to Apache Kafka. Issues can vary widely, so it's important to identify possible causes such as configuration errors, connection issues, or compatibility problems.\n\nLet's find relevant solutions in the FAQ database.",
          "type": "output_text",
          "logprobs": []
        }
      ],
      "role": "assistant",
      "status": "completed",
  

In [47]:
# run() starts the interactive chat loop.
# The recorded output shows the loop ending with a KeyboardInterrupt from the user.

runner.run()

KeyboardInterrupt: Interrupted by user

In [31]:
# Add a new tool that appends a question/answer pair to the appendable index

#
def add_entry(question, answer):
    """
    Append a question/answer pair to the module-level ``index``.

    The record is stored with the section 'user added' and the course
    'data-engineering-zoomcamp'.

    Args:
        question: Text stored under the 'question' key.
        answer: Text stored under the 'text' key.

    Returns:
        None.
    """
    index.append(
        dict(
            question=question,
            text=answer,
            section='user added',
            course='data-engineering-zoomcamp',
        )
    )


# Schema for the add_entry tool. The property names match the parameters of
# add_entry(question, answer), and both are listed as required.
add_entry_tool = {
    "type": "function",
    "name": "add_entry",
    "description": "Add an entry to the FAQ database",
    "parameters": {
        "type": "object",
        "properties": {
            "question": {
                "type": "string",
                "description": "The question to be added to the FAQ database",
            },
            "answer": {
                "type": "string",
                "description": "The answer to the question",
            }
        },
        "required": ["question", "answer"],
        "additionalProperties": False
    }
}



In [32]:
# Register add_entry with its schema on the existing tool registry.
agent_tools.add_tool(add_entry, add_entry_tool)

In [33]:
# Start the chat loop again. `runner` was built with this `agent_tools` object,
# so presumably it now also offers add_entry (confirm against toyaikit).
runner.run()

KeyboardInterrupt: Interrupted by user

In [50]:

from typing import List, Dict, Any


class SearchTools:
    # NOTE(review): instances are registered with Tools.add_tools() without an
    # explicit schema, so the method docstrings and type hints below are
    # presumably what the tool schema is derived from. They are kept verbatim;
    # confirm with toyaikit before rewording them.

    def __init__(self, index):
        # Index object that both tools delegate to.
        self.index = index

    def search(self, query: str) -> List[Dict[str, Any]]:
        """
        Search the FAQ database for entries matching the given query.

        Args:
            query (str): Search query text to look up in the course FAQ.

        Returns:
            List[Dict[str, Any]]: A list of search result entries, each containing relevant metadata.
        """
        # Requests 5 results (the standalone search() asks for 15) and
        # additionally passes output_ids=True.
        return self.index.search(
            query=query,
            filter_dict=dict(course='data-engineering-zoomcamp'),
            boost_dict=dict(question=3.0, section=0.5),
            num_results=5,
            output_ids=True,
        )

    def add_entry(self, question: str, answer: str) -> None:
        """
        Add a new entry to the FAQ database.

        Args:
            question (str): The question to be added to the FAQ database.
            answer (str): The corresponding answer to the question.
        """
        # Stored in the same record shape as the loaded FAQ documents, tagged
        # with the section 'user added'.
        self.index.append(
            dict(
                question=question,
                text=answer,
                section='user added',
                course='data-engineering-zoomcamp',
            )
        )


In [58]:
# Wrap the index in a SearchTools instance so that its methods can serve as tools.
search_tools = SearchTools(index)

# Start from a fresh registry. add_tools() receives the whole object rather than
# a function plus a hand-written schema (presumably it registers the object's
# public methods; confirm against toyaikit).
agent_tools = Tools()
agent_tools.add_tools(search_tools)


# Rebuild the runner around the new registry, reusing the prompt and chat interface defined earlier.
runner = OpenAIResponsesRunner(
    tools=agent_tools,
    developer_prompt=instructions,
    chat_interface=chat_interface,
    llm_client=OpenAIClient()
)

# The trailing semicolon suppresses the cell's displayed return value.
runner.run();

Chat ended.
