In [132]:
# Get Data from documents

import requests
from openai.types.responses import ResponseFunctionToolCall

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()


documents = []

for record in documents_raw:
    course_name = record['course'] #data-engineering-zoomcamp#

    for element in record['documents']: #documents[]
        element['course'] = course_name
        documents.append(element)

In [133]:
# Create an Appendable Index ( Advantage is that we can keep adding to an appendable index)
from minsearch import AppendableIndex


# text fields are tokenized and put in an inverted index.
# Overview of how the index workds below
"""
🧩Text Fields and  Inverted Index — Complete Side-by-Side Diagram
(for text_fields=["question", "text", "section"])

Example Documents:

Doc 0:
    question: "What is deep learning?"
    text:     "Deep learning is a subset of machine learning."
    section:  "AI Basics"

Doc 1:
    question: "How to use Python?"
    text:     "Python is widely used for data analysis."
    section:  "Programming"

Doc 2:
    question: "What is machine learning?"
    text:     "Machine learning involves algorithms that learn from data."
    section:  "AI Basics"

-------------------------------------------------------------------------------
Tokenization (stop words removed, lowercase)

Doc 0 tokens:
    question → ["deep", "learning"]
    text     → ["deep", "learning", "subset", "machine", "learning"]
    section  → ["ai", "basics"]

Doc 1 tokens:
    question → ["python", "use"]
    text     → ["python", "widely", "used", "data", "analysis"]
    section  → ["programming"]

Doc 2 tokens:
    question → ["machine", "learning"]
    text     → ["machine", "learning", "involves", "algorithms", "learn", "data"]
    section  → ["ai", "basics"]

-------------------------------------------------------------------------------
Inverted Index — Side-by-Side View

+----------------+---------------------------+------------------+
| question field | text field                | section field    |
+----------------+---------------------------+------------------+
| deep     → [0] | deep       → [0]          | ai          → [0,2] |
| learning → [0,2]| learning   → [0,2]       | basics      → [0,2] |
| python   → [1] | subset     → [0]          | programming → [1]   |
| use      → [1] | machine    → [0,2]        |                  |
| machine  → [2] | python     → [1]          |                  |
|                | widely     → [1]          |                  |
|                | used       → [1]          |                  |
|                | data       → [1,2]        |                  |
|                | analysis   → [1]          |                  |
|                | involves   → [2]          |                  |
|                | algorithms → [2]          |                  |
|                | learn      → [2]          |                  |
+----------------+---------------------------+------------------+

-------------------------------------------------------------------------------
Example Search:

Query: "deep learning in AI" → ["deep", "learning", "ai"]

Lookup in inverted index:

question field:
    "deep" → [0]
    "learning" → [0,2]
    "ai" → []

text field:
    "deep" → [0]
    "learning" → [0,2]
    "ai" → []

section field:
    "deep" → []
    "learning" → []
    "ai" → [0,2]

Candidate documents = union of all matches → [0,2]

Rank using TF-IDF across fields.

-------------------------------------------------------------------------------
Summary:

This diagram shows:

1. Original documents
2. Tokens extracted per text field
3. Inverted index mapping tokens → document IDs

It demonstrates how `AppendableIndex` efficiently finds candidate documents
without scanning all documents.

"""
"""
🧩 Keyword Fields — Side-by-Side Diagram
(for keyword_fields=["category", "author", "year"])

Example Documents:

Doc 0:
    category: "AI"
    author:   "Andrew Ng"
    year:     2023

Doc 1:
    category: "Programming"
    author:   "Guido van Rossum"
    year:     2022

Doc 2:
    category: "AI"
    author:   "Geoff Hinton"
    year:     2023

-------------------------------------------------------------------------------
Keyword Field Mapping (Value → Document IDs)

+--------------+---------------------+------------+
| category     | author              | year       |
+--------------+---------------------+------------+
| AI           → [0,2]   | Andrew Ng        → [0] | 2023 → [0,2] |
| Programming  → [1]     | Guido van Rossum  → [1] | 2022 → [1]   |
|              | Geoff Hinton      → [2] |            |
+--------------+---------------------+------------+

-------------------------------------------------------------------------------
Example Search:

Query: "learning" with filters {"category": "AI", "year": 2023}

1. Text search may match Docs 0, 1, 2 based on TF-IDF.
2. Apply keyword filters:
    - category = "AI" → keep Docs [0,2]
    - year = 2023      → keep Docs [0,2]
3. Result after filtering → [Doc 0, Doc 2]
4. Rank using TF-IDF across text fields.

-------------------------------------------------------------------------------
Summary:

- Keyword fields map exact values → document IDs.
- Enable filtering, faceted search, and grouping.
- Complement text fields, which are scored for relevance.
"""


# can
index = AppendableIndex(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

# to append , we can use the following function
#index.append(XXXX);


<minsearch.append.AppendableIndex at 0x15054fe60>

In [134]:
# Create the search function to get results from Search Index create above
"""
🧩 TF-IDF + Boost Explanation in AppendableIndex

TF-IDF = Term Frequency × Inverse Document Frequency
It measures how important a word is in a document relative to the entire corpus.

-------------------------------------------------------------------------------
1️⃣ Term Frequency (TF)
- Counts how often a term appears in a document.
- Sublinear scaling: TF = 1 + log(count), if count > 0
- Example:
    Doc 0, text: "Deep learning learning learning"
    Count of "learning" = 3
    TF("learning") = 1 + log(3) ≈ 2.10

2️⃣ Inverse Document Frequency (IDF)
- Measures how rare a term is across all documents.
- Formula: IDF = log((N + 1) / (DF + 1)) + 1
    - N = total number of documents
    - DF = number of documents containing the term
- Rare terms get higher IDF.

Example:
    3 documents contain "learning" in text field:
        N = 3, DF("learning") = 2
        IDF("learning") = log((3+1)/(2+1)) + 1 ≈ 1.29

3️⃣ TF-IDF
- Multiply TF × IDF for each token in a document.
- L2 normalize vectors to compare similarity with cosine similarity.

Example Calculation for Doc 0:
    - Token "learning": TF = 2.10, IDF = 1.29
    - TF-IDF("learning") = 2.10 * 1.29 ≈ 2.71

-------------------------------------------------------------------------------
4️⃣ Field Boosts

- `boost_dict` in search allows weighting specific text fields.
- Each field score is multiplied by its boost before combining.

Example:

Text fields: ["question", "text", "section"]
Boosts: {"question": 2.0, "text": 1.0}  # section uses default 1.0

Doc 0 raw TF-IDF scores for a query:
+-----------+-----------+----------------+
| Field     | Raw Score | Boosted Score  |
+-----------+-----------+----------------+
| question  | 0.5       | 0.5 * 2.0 = 1.0|
| text      | 0.3       | 0.3 * 1.0 = 0.3|
| section   | 0.2       | 0.2 * 1.0 = 0.2|
+-----------+-----------+----------------+

Total Score = 1.0 + 0.3 + 0.2 = 1.5
- Boosting "question" doubles its impact on ranking.

-------------------------------------------------------------------------------
5️⃣ How Search Works with TF-IDF + Boost

1. Tokenize query and documents.
2. Compute TF-IDF vectors for query and each document (per text field).
3. Apply L2 normalization.
4. Calculate cosine similarity between query vector and document vectors.
5. Multiply each field score by its boost (from `boost_dict`).
6. Combine scores across fields for final ranking.
7. Optionally, apply keyword filters to remove non-matching documents.

-------------------------------------------------------------------------------
Summary:

- TF-IDF ranks documents based on query relevance.
- Rare and query-specific terms get higher scores.
- Field boosts allow tuning importance of specific fields.
- Keyword fields filter results without affecting TF-IDF scoring.
"""

def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'}, #This is used because we added it as a keyword field above
        boost_dict=boost,
        num_results=5,
    )

    return results

In [135]:
# Define the search tool
# This is needed because when the LLM is deciding to pick a "tool" it use the description and the properties to call the tool.
search_tool = {
    "type": "function",
    "name": "search",
    "description": "Search the FAQ database",
    "parameters": {
        "type": "object",
        "properties": {
            "query": { #parameters
                "type": "string",
                "description": "Search query text to look up in the course FAQ."
            }
        },
        "required": ["query"],
        "additionalProperties": False
    }
}


In [136]:
# Set System prompt
instructions = """
You're a course teaching assistant.
You're given a question from a course student and your task is to answer it.
""".strip()

In [137]:
# Define the tools. in our case we are including search tool. More tools can be provided
tools = [search_tool]

In [138]:
from pprint import pprint
import json

def print_response_readable(response):
    """
    Pretty-print key parts of an OpenAI Response object in Jupyter.
    Includes tool calls, function arguments, and model text outputs.
    """
    print("📝 Response Summary")
    print("-" * 60)
    print(f"Response ID: {getattr(response, 'id', None)}")
    print(f"Model: {getattr(response, 'model', None)}")
    print(f"Status: {getattr(response, 'status', None)}")
    print(f"Tool Choice: {getattr(response, 'tool_choice', None)}")
    print(f"Parallel Tool Calls: {getattr(response, 'parallel_tool_calls', None)}")
    print("-" * 60)

    # Print tool outputs
    if hasattr(response, 'output') and response.output:
        for i, item in enumerate(response.output):
            item_type = getattr(item, 'type', None)

            if item_type == 'function_call':
                print(f"🔧 Tool Call [{i}]: {getattr(item, 'name', None)}")
                print(f"    Call ID: {getattr(item, 'call_id', None)}")
                print(f"    Status: {getattr(item, 'status', None)}")
                # Arguments as pretty JSON
                args = getattr(item, 'arguments', None)
                if args:
                    try:
                        args_json = json.loads(args)
                        print("    Arguments:")
                        pprint(args_json, indent=4)
                    except Exception:
                        print(f"    Arguments: {args}")

            elif item_type == 'text':
                text_content = getattr(item, 'content', None)
                print(f"💬 Model Text Output [{i}]:")
                if text_content:
                    if isinstance(text_content, list):
                        # Sometimes output content is a list of dicts with 'text'
                        for part in text_content:
                            pprint(part.get('text', ''), indent=4)
                    else:
                        pprint(text_content, indent=4)
                else:
                    print("    <No text content>")
            else:
                # Fallback for unknown output types
                print(f"[{i}] Output item (type: {item_type}):")
                pprint(item)
    else:
        print("No outputs found.")

    print("-" * 60)


In [139]:
#Create Open AI Client
from openai import OpenAI
openai_client = OpenAI()

question = 'I just discovered the course. Can I still join it?'
chat_messages = [
    {"role": "developer", "content": instructions}, #intermediary
    {"role": "user", "content": question} #from users
]

# Send the tools defines along with the system prompt and user question
response = openai_client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)


In [140]:
import json
#Checkout the response. This will have instructions asking to call the search tool to get the data.
print_response_readable(response)

📝 Response Summary
------------------------------------------------------------
Response ID: resp_063994e02160e3740068fd760392fc819495f10f9b784e1582
Model: gpt-4o-mini-2024-07-18
Status: completed
Tool Choice: auto
Parallel Tool Calls: True
------------------------------------------------------------
🔧 Tool Call [0]: search
    Call ID: call_LtdrG9M2jCpft5vKf8VvuxEr
    Status: completed
    Arguments:
{'query': 'join course'}
------------------------------------------------------------


In [141]:
# write a function that takes in the call object and returns back a json formatted value
# that can be passed onto the Open AI call with chat messages

def call_search_tool(call):
    arguments = json.loads(call.arguments)
    query = arguments['query']
    search_results = json.dumps(search(query))
    return {
        "type": "function_call_output",
        "call_id": call.call_id,
        "output": json.dumps(search_results)
    }

In [142]:
import json
# let's put the response in an object called callParameters
# The key here is
## name : This is the name of the tool
## type : How we expect the tool to work. In our case it's a function_call
## id : An id that will be used by OpenAI to match the response of the function call
## arguments : The arguments that should be provided to the tool

call = response.output[0]
chat_messages.append(call)

In [143]:
# append the result to chat messages and send it back to OpenAI
search_tool_response = call_search_tool(call)
chat_messages.append(search_tool_response)
print(len(chat_messages))

4


In [144]:
# Send the tools defines along with the system prompt and user question and tool output

response_with_tool_output = openai_client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)

In [145]:
print_response_readable(response_with_tool_output)
#print(response_with_tool_output)

📝 Response Summary
------------------------------------------------------------
Response ID: resp_063994e02160e3740068fd760479b48194b4ab7ab3b28e752e
Model: gpt-4o-mini-2024-07-18
Status: completed
Tool Choice: auto
Parallel Tool Calls: True
------------------------------------------------------------
[0] Output item (type: message):
ResponseOutputMessage(id='msg_063994e02160e3740068fd760544288194a90cc27a428d33cf', content=[ResponseOutputText(annotations=[], text='Yes, you can still join the course even after its start date. You are eligible to submit homework assignments, but keep in mind that there are deadlines for turning in the final projects, so it’s best not to leave everything to the last minute.', type='output_text', logprobs=[])], role='assistant', status='completed', type='message')
------------------------------------------------------------


In [146]:
"""
🧩 OpenAI Function/Tool Calling Flow (ASCII Diagram)
https://platform.openai.com/docs/guides/function-calling#the-tool-calling-flow

User Input
   │
   ▼
+-----------------+
| System Message  |
| (instructions)  |
+-----------------+
   │
   ▼
+-----------------+
| Model Receives  |
| User Prompt     |
+-----------------+
   │
   ▼
+-----------------+
| Model Decides   |
| Whether to Call |
| a Tool/Function |
+-----------------+
   │
   ▼
┌───────────────┐
│ Function Call │
│ (name + args) │
└───────────────┘
   │
   ▼
+-----------------+
| Function Executes|
| with Provided   |
| Arguments       |
+-----------------+
   │
   ▼
+-----------------+
| Function Output |
| returned to     |
| Model           |
+-----------------+
   │
   ▼
+-----------------+
| Model Generates |
| Final Response  |
| for User        |
+-----------------+

Notes:
- Tools/functions are defined with names, descriptions, and parameters.
- Model may call multiple tools in parallel if enabled.
- Tool outputs can be used by the model to generate more accurate final responses.
"""

"""
If this goes in a loop where the LLM asks us to call tools and we send the responses and the LLM asks us to call the tools again till we find the answer. It's an LLM with Loops i.e. it is showing "Agency" in deciding which tools to use, take the output and make more decisions to see if other tools are called before providing an answer. This interaction is called an Agentic Loop which is the heart of an Agent
"""

'\nIf this goes in a loop where the LLM asks us to call tools and we send the responses and the LLM asks us to call the tools again till we find the answer. It\'s an LLM with Loops i.e. it is showing "Agency" in deciding which tools to use, take the output and make more decisions to see if other tools are called before providing an answer. This interaction is called an Agentic Loop which is the heart of an Agent\n'

In [147]:
"""
We will first write a function to automate the tool calling.
Remember that the LLM has already provided you the tool name it wants to be called and the arguments
All you need to do is use them to call the right function
"""
def make_call(call_request_from_llm):
    f_name = call_request_from_llm.name # Coming from LLM
    arguments = json.loads(call_request_from_llm.arguments) # Coming from LLM

    if f_name == 'search':
        results = search(**arguments)
    # if you add another function, we can put it here
    # elif f_name == 'add_entry':
    #    results = add_entry(**arguments)
    else:
        raise ValueError(f'unknown function {f_name}')

    json_results = json.dumps(results)

    return {
        "type": "function_call_output",
        "call_id": call_request_from_llm.call_id, # This is for the llm to know which tool request this response is for.
        "output": json_results,
    }



In [148]:
question = 'I just discovered the course. Can I still join it?'

chat_messages = [
    {"role": "developer", "content": instructions},
    {"role": "user", "content": question}
]

while True: #agent loop
    response = openai_client.responses.create(
        model='gpt-4o-mini',
        input=chat_messages,
        tools=tools
    )

    has_function_calls = False

    # Add response to chat history for LLM's "memory"
    chat_messages.extend(response.output) #we keep adding all the tool call requests along with it's responses.

    for entry in response.output:
        if entry.type == "function_call":
            print('Function call:')
            print(entry)
            result = make_call(entry)
            print('   ', 'Output:')
            print('   ', result['output'])
            chat_messages.append(result) # We add the result and we also add the call_id. see the make_call function.
            has_function_calls = True
            print()

        elif entry.type == "message":
            print('Assistant:')
            print(entry.content[0].text)
            print()

    if not has_function_calls:
        break

Function call:
ResponseFunctionToolCall(arguments='{"query":"join course late enrollment"}', call_id='call_Bvw4LnXuwpsB02L2ODj7aFGC', name='search', type='function_call', id='fc_017fd55038a01de30068fd7607d3908195b5289763b647c512', status='completed')
    Output:
    [{"text": "No, late submissions are not allowed. But if the form is still not closed and it\u2019s after the due date, you can still submit the homework. confirm your submission by the date-timestamp on the Course page.y\nOlder news:[source1] [source2]", "section": "General course-related questions", "question": "Homework - Are late submissions of homework allowed?", "course": "data-engineering-zoomcamp"}, {"text": "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.", "section": "General course-related questions", "question": "Course - Can I still join the course aft