### Imports

In [1]:
%load_ext autoreload
%autoreload 2
# Standard libraries
import io
import os
import re
import zipfile
import json

# Third-party libraries
import requests
import frontmatter
from dotenv import load_dotenv
from tqdm.notebook import tqdm
from IPython.display import Markdown

# Google Gemini API
import google.generativeai as genai
# OpenAI API
import openai


# Day 1: Download and extract the zip file

In [2]:
url = 'https://codeload.github.com/DataTalksClub/faq/zip/refs/heads/main'
resp = requests.get(url)

In [3]:
repository_data =[]

# Zipfile object from downloaded content
zf = zipfile.ZipFile(io.BytesIO(resp.content))

for file_info in zf.infolist():
    filename = file_info.filename.lower()
    # Get md files only
    if not (filename.endswith('.md') or filename.endswith('.mdx')):
        continue
    # Read and parse each file
    with zf.open(file_info) as f_in:
        content = f_in.read()
        post = frontmatter.loads(content)
        data = post.to_dict()
        data['filename'] = filename
        repository_data.append(data)

zf.close()

In [4]:
print(repository_data[1])

{'id': '9e508f2212', 'question': 'Course: When does the course start?', 'sort_order': 1, 'content': "The next cohort starts January 13th, 2025. More info at [DTC](https://datatalks.club/blog/guide-to-free-online-courses-at-datatalks-club.html).\n\n- Register before the course starts using this [link](https://airtable.com/shr6oVXeQvSI5HuWD).\n- Join the [course Telegram channel with announcements](https://t.me/dezoomcamp).\n- Don’t forget to register in DataTalks.Club's Slack and join the channel.", 'filename': 'faq-main/_questions/data-engineering-zoomcamp/general/001_9e508f2212_course-when-does-the-course-start.md'}


In [5]:
from read import read_repo_data

In [6]:
prefix = 'https://codeload.github.com'
dtc_faq = read_repo_data('DataTalksClub', 'faq', prefix=prefix)
evidently_docs = read_repo_data('evidentlyai', 'docs', prefix=prefix)

In [7]:
print(f"FAQ documents: {len(dtc_faq)}")
print(f"Evidently documents: {len(evidently_docs)}")

FAQ documents: 1219
Evidently documents: 95


In [8]:
evidently_docs[45]['content']



# Day 2: Chunking and Intelligent Processing for Data

## 1. Chunking by sliding window

In [9]:
def sliding_window_chunking(seq, size, step):
    """Chunk a text sequence using a sliding window approach.

    Args:
        seq (str): text sequence to chunk
        size (int): size of each chunk
        step (int): overlap step between chunks

    Raises:
        ValueError: size and step must be positive.

    Returns:
     list: list of dict with 'start' and 'chunk' keys
    """
    if size <= 0 or step <= 0:
        raise ValueError("Size and step must be positive.")

    result = []
    # Sliding window up to the end of the sequence
    for i in range(0, len(seq), step):
        chunk = seq[i:i + size]
        result.append({'start': i, 'end': i + size, 'chunk': chunk})
        # If the chunk is smaller than size, we reached the end
        if i+size >= len(seq):
            break
    return result

In [10]:
sliding_window_chunking(evidently_docs[45]['content'], 2000, 1000)

[{'start': 0,
  'end': 2000,
  'chunk': "In this tutorial, you will learn how to perform regression testing for LLM outputs.\n\nYou can compare new and old responses after changing a prompt, model, or anything else in your system. By re-running the same inputs with new parameters, you can spot any significant changes. This helps you push updates with confidence or identify issues to fix.\n\n<Info>\n  **This example uses Evidently Cloud.** You'll run evals in Python and upload them. You can also skip the upload and view Reports locally. For self-hosted, replace `CloudWorkspace` with `Workspace`.\n</Info>\n\n# Tutorial scope\n\nHere's what we'll do:\n\n* **Create a toy dataset**. Build a small Q&A dataset with answers and reference responses.\n\n* **Get new answers**. Imitate generating new answers to the same question.\n\n* **Create and run a Report with Tests**. Compare the answers using LLM-as-a-judge to evaluate length, correctness and style consistency.\n\n* **Build a monitoring Das

In [11]:
evidently_chunks = []
for doc in evidently_docs:
    doc_copy = doc.copy()
    # Remove content and keep metadata
    doc_content = doc_copy.pop('content')
    chunks = sliding_window_chunking(doc_content, 2000, 1000)
    # Add metadata to each chunk
    for chunk in chunks:
        chunk.update(doc_copy) # add metadata by updating the chunk dict
        evidently_chunks.append(chunk)

In [12]:
evidently_chunks[5]

{'start': 4000,
 'end': 6000,
 'chunk': '2-17" description="Evidently v0.6.4">\n  ## **Evidently 0.6.4**\n\n  Full release notes on [Github](https://github.com/evidentlyai/evidently/releases/tag/v0.6.4).\n</Update>\n\n<Update label="2025-02-12" description="Evidently v0.6.3">\n  ## **Evidently 0.6.3**\n\n  Full release notes on [Github](https://github.com/evidentlyai/evidently/releases/tag/v0.6.3). Added new RAG descriptors: see [tutorial](/examples/LLM_rag_evals) and [release blog](https://www.evidentlyai.com/blog/open-source-rag-evaluation-tool).\n</Update>\n\n<Update label="2025-02-07" description="Evidently v0.6.2">\n  ## **Evidently 0.6.2**\n\n  Full release notes on [Github](https://github.com/evidentlyai/evidently/releases/tag/v0.6.2). We extended support for `litellm` , so you can easily use different providers like Gemini, Anthropic, etc. for LLM-based evaluations.\n</Update>\n\n<Update label="2025-01-31" description="Evidently v0.6.1">\n  ## **Evidently 0.6.1**\n\n  Full rele

## 2. Chunking by Paragraphs and sections

In [13]:
text = evidently_docs[45]['content']
paragraphs = re.split(r"\n\s*\n", text.strip())
paragraphs

['In this tutorial, you will learn how to perform regression testing for LLM outputs.',
 'You can compare new and old responses after changing a prompt, model, or anything else in your system. By re-running the same inputs with new parameters, you can spot any significant changes. This helps you push updates with confidence or identify issues to fix.',
 "<Info>\n  **This example uses Evidently Cloud.** You'll run evals in Python and upload them. You can also skip the upload and view Reports locally. For self-hosted, replace `CloudWorkspace` with `Workspace`.\n</Info>",
 '# Tutorial scope',
 "Here's what we'll do:",
 '* **Create a toy dataset**. Build a small Q&A dataset with answers and reference responses.',
 '* **Get new answers**. Imitate generating new answers to the same question.',
 '* **Create and run a Report with Tests**. Compare the answers using LLM-as-a-judge to evaluate length, correctness and style consistency.',
 '* **Build a monitoring Dashboard**. Get plots to track th

In [14]:
def split_markdown_by_level(text, level=2):
    """Split markdown text into sections based on header levels.

    Args:
        text (str): Markdown text to split.
        level (int): Header level to split by (e.g., 2 for '##').

    Returns:
        list: List of sections as strings.
    """
    # Create a regex pattern to match headers of the specified level
    header_pattern = r'^(#{' + str(level) + r'} )(.+)$'
    pattern = re.compile(header_pattern, re.MULTILINE)
    # Split the text into parts based on the header pattern
    parts = pattern.split(text)
    # Reconstruct sections with headers
    sections = []
    for i in range(1, len(parts), 3):
        # We step by 3 because regex.split() with
        # capturing groups returns:
        # [before_match, group1, group2, after_match, ...]
        # here group1 is "## ", group2 is the header text
        header = parts[i] + parts[i+1]  # include the '## ' part
        header = header.strip()
        # get the content after the header
        if i+2 < len(parts): # check if there's content after the header
            content = parts[i+2].strip()
        if content:
            section = f"{header}\n\n{content}"
        else:
            section = header
        sections.append(section)
    return sections

**Note**: This code may not work perfectly if we want to split by level 1 headings and have Python code with # comments. But in general, this is not a big problem for documentation.

In [15]:
split_markdown_by_level(evidently_docs[45]['content'], level=2)

['## 1. Installation and Imports\n\nInstall Evidently:\n\n```python\npip install evidently[llm] \n```\n\nImport the required modules:\n\n```python\nimport pandas as pd\nfrom evidently.future.datasets import Dataset\nfrom evidently.future.datasets import DataDefinition\nfrom evidently.future.datasets import Descriptor\nfrom evidently.future.descriptors import *\nfrom evidently.future.report import Report\nfrom evidently.future.presets import TextEvals\nfrom evidently.future.metrics import *\nfrom evidently.future.tests import *\n\nfrom evidently.features.llm_judge import BinaryClassificationPromptTemplate\n```\n\nTo connect to Evidently Cloud:\n\n```python\nfrom evidently.ui.workspace.cloud import CloudWorkspace\n```\n\n**Optional.** To create monitoring panels as code:\n\n```python\nfrom evidently.ui.dashboards import DashboardPanelPlot\nfrom evidently.ui.dashboards import DashboardPanelTestSuite\nfrom evidently.ui.dashboards import DashboardPanelTestSuiteCounter\nfrom evidently.ui.das

In [16]:
evidently_chunks = []

for doc in evidently_docs:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    sections = split_markdown_by_level(doc_content, level=2)
    for section in sections:
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        evidently_chunks.append(section_doc)

In [17]:
evidently_chunks[2]

{'title': 'Data definition',
 'description': 'How to map the input data.',
 'filename': 'docs-main/docs/library/data_definition.mdx',
 'section': '## Basic flow\n\n**Step 1. Imports.** Import the following modules:\n\n```python\nfrom evidently import Dataset\nfrom evidently import DataDefinition\n```\n\n**Step 2. Prepare your data.** Use a pandas.DataFrame.\n\n<Info>\n  Your data can have [flexible structure](/docs/library/overview#dataset) with any mix of categorical, numerical or text columns. Check the [Reference table](/metrics/all_metrics) for data requirements in specific evaluations.\n</Info>\n\n**Step 3. Create a Dataset object**. Use `Dataset.from_pandas` with `data_definition`:\n\n```python\neval_data = Dataset.from_pandas(\n    source_df,\n    data_definition=DataDefinition()\n)\n```\n\nTo map columns automatically, pass an empty `DataDefinition()` . Evidently will map columns:\n\n- By type (numerical, categorical).\n- By matching column names to roles (e.g., a column "targe

## 3. Intelligent Chunking with LLM

In [None]:
# Load environment variables from a .env file
load_dotenv()
# Get the API key from environment variables
API_KEY = os.getenv('GEMINI_API_KEY')
# Check if the API key was found
if not API_KEY:
    raise ValueError("API key not found. Please set the GEMINI_API_KEY environment variable.")
else:
    print("API key loaded successfully.")

In [None]:
# trying with the gemini api
genai.configure(api_key=API_KEY)

def llm(prompt: str, model: str = "gemini-2.5-flash-lite") -> str:
    """
    Call Gemini with a text prompt and return the output text.

    Args:
        prompt (str): The input prompt for the LLM.
        model (str): Gemini model name (default: gemini-1.5-flash).

    Returns:
        str: The generated text.
    """
    try:
        model_obj = genai.GenerativeModel(model)
        response = model_obj.generate_content(prompt)

        if not response or not hasattr(response, "text"):
            raise ValueError("LLM returned no text.")

        return response.text

    except Exception as e:
        # Debug report
        print("❌ Error during LLM call")
        print(f"Model: {model}")
        print(f"Prompt (truncated): {prompt[:200]}{'...' if len(prompt) > 200 else ''}")
        print(f"Error: {e}")
        raise


In [None]:
prompt_template = """
Split the provided document into logical sections
that make sense for a Q&A system.

Each section should be self-contained and cover
a specific topic or concept.

<DOCUMENT>
{document}
</DOCUMENT>

Use this format:

## Section Name

Section content with all relevant details

---

## Another Section Name

Another section content

---
""".strip()

Considerations to improve prompt:

- Unbounded length: the model might produce very large sections if the input doc is long (could exceed embedding limits).

- Ambiguous instructions: “logical sections” might be interpreted differently by the model (especially across varied docs).

- No output constraints: doesn’t say “keep each section < N tokens” or “max 5 sections” → could be inconsistent.

In [None]:
def intelligent_chunking(text):
    prompt = prompt_template.format(document=text)
    response = llm(prompt)
    sections = response.split('---')
    sections = [s.strip() for s in sections if s.strip()]
    return sections

In [None]:
evidently_docs[5:6]

In [None]:
# Test with example
evidently_chunks = []

for doc in tqdm(evidently_docs[5:6]):
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')

    sections = intelligent_chunking(doc_content)
    for section in sections:
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        evidently_chunks.append(section_doc)

In [None]:
evidently_chunks

# Day 3 - Search — lexical + semantic + hybrid


## 1. Lexical search

In [18]:
# Index evidently chunks using minsearch
from minsearch import Index

# instantiate index object
index = Index(
    text_fields=['chunk', 'title', 'description', 'filename'],
    keyword_fields=[]
)

# fit to the evidently chunks
index.fit(evidently_chunks)

<minsearch.minsearch.Index at 0x78f3af1fd360>

In [19]:
# Test a query
query = 'What should be in a test dataset for AI evaluation?'
results = index.search(query)
results[1]

{'title': 'LLM Evaluation',
 'description': 'Evaluate text outputs in under 5 minutes',
 'filename': 'docs-main/quickstart_llm.mdx',
 'section': '## 1. Set up your environment\n\nFor a fully local flow, skip steps 1.1 and 1.3.\n\n### 1.1. Set up Evidently Cloud\n\n<CloudSignup />\n\n### 1.2. Installation and imports\n\nInstall the Evidently Python library:\n\n```python\n!pip install evidently\n```\n\nComponents to run the evals:\n\n```python\nimport pandas as pd\nfrom evidently import Dataset\nfrom evidently import DataDefinition\nfrom evidently import Report\nfrom evidently.presets import TextEvals\nfrom evidently.tests import lte, gte, eq\nfrom evidently.descriptors import LLMEval, TestSummary, DeclineLLMEval, Sentiment, TextLength, IncludesWords\nfrom evidently.llm.templates import BinaryClassificationPromptTemplate\n```\n\nComponents to connect with Evidently Cloud:\n\n```python\nfrom evidently.ui.workspace import CloudWorkspace\n```\n\n### 1.3. Create a Project\n\n<CreateProject /

In [20]:
# get datatalks faq  and filter files  with data engineering
dtc_faq = read_repo_data('DataTalksClub', 'faq', prefix=prefix)
de_dtc_faq = [doc for doc in dtc_faq if 'data-engineering' in doc['filename']]
faq_index = Index(
    text_fields=['question', 'content'],
    keyword_fields=[]
)
faq_index.fit(de_dtc_faq)

<minsearch.minsearch.Index at 0x78f3af225030>

In [21]:
query = 'Can I join the course after it starts?'
results = faq_index.search(query)
results

[{'id': '068529125b',
  'question': 'Course - Can I follow the course after it finishes?',
  'sort_order': 8,
  'content': 'Yes, we will keep all the materials available, so you can follow the course at your own pace after it finishes.\n\nYou can also continue reviewing the homeworks and prepare for the next cohort. You can also start working on your final capstone project.',
  'filename': 'faq-main/_questions/data-engineering-zoomcamp/general/008_068529125b_course-can-i-follow-the-course-after-it-finishes.md'},
 {'id': '9e508f2212',
  'question': 'Course: When does the course start?',
  'sort_order': 1,
  'content': "The next cohort starts January 13th, 2025. More info at [DTC](https://datatalks.club/blog/guide-to-free-online-courses-at-datatalks-club.html).\n\n- Register before the course starts using this [link](https://airtable.com/shr6oVXeQvSI5HuWD).\n- Join the [course Telegram channel with announcements](https://t.me/dezoomcamp).\n- Don’t forget to register in DataTalks.Club's S

In [22]:
query1 = "I just discovered the program, can I still enroll?"
query2 = "I just found out about the course, can I still join?"

results1 = faq_index.search(query1)
results2 = faq_index.search(query2)
print(results2)


[{'id': '3f1424af17', 'question': 'Course: Can I still join the course after the start date?', 'sort_order': 3, 'content': "Yes, even if you don't register, you're still eligible to submit the homework.\n\nBe aware, however, that there will be deadlines for turning in homeworks and the final projects. So don't leave everything for the last minute.", 'filename': 'faq-main/_questions/data-engineering-zoomcamp/general/003_3f1424af17_course-can-i-still-join-the-course-after-the-start.md'}, {'id': '9e508f2212', 'question': 'Course: When does the course start?', 'sort_order': 1, 'content': "The next cohort starts January 13th, 2025. More info at [DTC](https://datatalks.club/blog/guide-to-free-online-courses-at-datatalks-club.html).\n\n- Register before the course starts using this [link](https://airtable.com/shr6oVXeQvSI5HuWD).\n- Join the [course Telegram channel with announcements](https://t.me/dezoomcamp).\n- Don’t forget to register in DataTalks.Club's Slack and join the channel.", 'file

## 2. Vector search

The ```multi-qa-distilbert-cos-v1 model``` is trained explicitly for question-answering tasks. It creates embeddings optimized for finding answers to questions.  
Other popular models include:  
- all-MiniLM-L6-v2 - General-purpose, fast, and efficient
- all-mpnet-base-v2 - Higher quality, slower  
Check Sentence Transformers documentation for more options.
https://www.sbert.net/docs/pretrained_models.html

In [23]:
# Import and select embedding model
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

In [24]:
# Create embedding for a document
record = de_dtc_faq[2]
text = record['question'] + ' ' + record['content'] # Concatenate question and content
v_doc = embedding_model.encode(text)

In [25]:
# Create embeding for a query
query = 'I just found out about the course. Can I enroll now?'
v_query = embedding_model.encode(query)

In [26]:
# Calculate similarity - normalized embeddings where dot product equals cosine similarity
similarity = v_query.dot(v_doc)
similarity

np.float32(0.51909333)

In [27]:
import numpy as np

faq_embeddings = []

for d in tqdm(de_dtc_faq):
    text = d['question'] + ' ' + d['content']
    v = embedding_model.encode(text)
    faq_embeddings.append(v)

faq_embeddings = np.array(faq_embeddings)
faq_embeddings.shape

  0%|          | 0/449 [00:00<?, ?it/s]

(449, 768)

In [28]:
from minsearch import VectorSearch
# Create vector search index
faq_vindex = VectorSearch()
faq_vindex.fit(faq_embeddings, de_dtc_faq)

<minsearch.vector.VectorSearch at 0x78f3a3aa8760>

In [29]:
#### Query vector index
query = 'Can I join the course now?'
# embed query
q = embedding_model.encode(query)
# search for the most similar document based on query embedding
results = faq_vindex.search(q)
results

[{'id': '3f1424af17',
  'question': 'Course: Can I still join the course after the start date?',
  'sort_order': 3,
  'content': "Yes, even if you don't register, you're still eligible to submit the homework.\n\nBe aware, however, that there will be deadlines for turning in homeworks and the final projects. So don't leave everything for the last minute.",
  'filename': 'faq-main/_questions/data-engineering-zoomcamp/general/003_3f1424af17_course-can-i-still-join-the-course-after-the-start.md'},
 {'id': '068529125b',
  'question': 'Course - Can I follow the course after it finishes?',
  'sort_order': 8,
  'content': 'Yes, we will keep all the materials available, so you can follow the course at your own pace after it finishes.\n\nYou can also continue reviewing the homeworks and prepare for the next cohort. You can also start working on your final capstone project.',
  'filename': 'faq-main/_questions/data-engineering-zoomcamp/general/008_068529125b_course-can-i-follow-the-course-after-i

In [30]:
### 2.1 Embed evidently chunks
evidently_embeddings = []
# Create embeddings for each chunk
for d in tqdm(evidently_chunks):
    text = d['chunk'] # Use the chunk text
    v = embedding_model.encode(text)
    evidently_embeddings.append(v)
# Convert to numpy array
evidently_embeddings = np.array(evidently_embeddings)

# Create vector search index for evidently chunks
evidently_vindex = VectorSearch()
evidently_vindex.fit(evidently_embeddings, evidently_chunks)

  0%|          | 0/262 [00:00<?, ?it/s]

KeyError: 'chunk'

## 3. Hybrid search

In [34]:
#### Join lexical and vector search results
query = "Can I join the course now?"

# Lexical search
text_results = faq_index.search(query, num_results=5)

# Embed query and search vector index
q = embedding_model.encode(query)
vector_results = faq_vindex.search(q, num_results=5)
# Combine results (here we just concatenate, but you could interleave or rank them)
final_results = text_results + vector_results

In [35]:
final_results

[{'id': '3f1424af17',
  'question': 'Course: Can I still join the course after the start date?',
  'sort_order': 3,
  'content': "Yes, even if you don't register, you're still eligible to submit the homework.\n\nBe aware, however, that there will be deadlines for turning in homeworks and the final projects. So don't leave everything for the last minute.",
  'filename': 'faq-main/_questions/data-engineering-zoomcamp/general/003_3f1424af17_course-can-i-still-join-the-course-after-the-start.md'},
 {'id': '9e508f2212',
  'question': 'Course: When does the course start?',
  'sort_order': 1,
  'content': "The next cohort starts January 13th, 2025. More info at [DTC](https://datatalks.club/blog/guide-to-free-online-courses-at-datatalks-club.html).\n\n- Register before the course starts using this [link](https://airtable.com/shr6oVXeQvSI5HuWD).\n- Join the [course Telegram channel with announcements](https://t.me/dezoomcamp).\n- Don’t forget to register in DataTalks.Club's Slack and join the c

In [36]:
from typing import List, Dict, Any


def text_search(query: str, num_results: int = 5) -> List[Dict[str, Any]]:
    """
    Perform a lexical (keyword-based) search on the document index.

    Args:
        query (str): The search query string.
        num_results (int, optional): Maximum number of results to return. Defaults to 5.

    Returns:
        List[Dict[str, Any]]: A list of search results containing metadata
        such as filename, chunk, and score.
    """
    return faq_index.search(query, num_results=num_results)


def vector_search(query: str, num_results: int = 5) -> List[Dict[str, Any]]:
    """
    Perform a semantic (vector-based) search on the vector index.

    Args:
        query (str): The search query string.
        num_results (int, optional): Maximum number of results to return. Defaults to 5.

    Returns:
        List[Dict[str, Any]]: A list of search results containing metadata
        such as filename, chunk, and score.
    """
    q = embedding_model.encode(query)
    return faq_vindex.search(q, num_results=num_results)


def hybrid_search(query: str, num_results: int = 5) -> List[Dict[str, Any]]:
    """
    Perform a hybrid search combining text and vector results.

    Results from both searches are merged and deduplicated by filename.

    Args:
        query (str): The search query string.
        num_results (int, optional): Maximum number of results to return. Defaults to 5.

    Returns:
        List[Dict[str, Any]]: A deduplicated list of search results with metadata.
    """
    text_results = text_search(query, num_results)
    vector_results = vector_search(query, num_results)

    seen_ids = set()
    combined_results = []

    for result in text_results + vector_results:
        if result["filename"] not in seen_ids:
            seen_ids.add(result["filename"])
            combined_results.append(result)

    return combined_results


## Day 4 Agents and tools

In [38]:
load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')
openai_client = openai.OpenAI(api_key=openai_api_key)

In [55]:
# Generic question and answer
user_prompt = "I just discovered the course, can I join now?"

chat_messages = [
    {"role": "user", "content": user_prompt}
]

response = openai_client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
)

print(response.output_text)

To determine if you can still join the course, it would be best to check the course's official website or contact the course instructor or administrator directly. They can provide information on enrollment deadlines, prerequisites, and any other relevant details. Let me know if you need help with anything specific!


In [56]:
# create a text search tool in openAI descriptio format
text_search_tool = {
    "type": "function",
    "name": "text_search",
    "description": "Search the FAQ database",
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "Search the query text to look up in the course FAQ.",
            },
        },
        "required": ["query"],
        "additionalProperties": False
    },
}

In [None]:
# New question with a tool for text search

system_prompt = """
You are a helpful assistant for a course.
"""

question = "I just discovered the course, can I join now?"

chat_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": question}
]

response = openai_client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=[text_search_tool]
)


In [58]:
print(response.output)

[ResponseFunctionToolCall(arguments='{"query":"join course late"}', call_id='call_dolGroei1rhNbwwX2aUl4YJS', name='text_search', type='function_call', id='fc_68d9a563c3f08190b666bed211f701840e63869e45076428', status='completed')]


In [59]:
# Load the output query and send to the text serch tool the save the output
call = response.output[0]

arguments = json.loads(call.arguments)
result = text_search(**arguments)

call_output = {
    "type": "function_call_output",
    "call_id": call.call_id,
    "output": json.dumps(result),
}


LLMs are stateless. When we make one call to the OpenAI API and then shortly afterwards make another, it doesn't know anything about the first call. So if we only send it call_output, it would have no idea how to respond to it.  
This is why we need to send it the entire conversation history. It needs to know everything that happened so far:  
- The system prompt (so it knows what the initial instructions are) - system_prompt  
- The user prompt (so it knows what task it needs to perform) - question  
The decision to invoke the text_search tool (so it knows what function was called) - that's our call  
- The output of the function (so it knows what the function returned) - that's our call_output  

In [None]:
# Extend chat history with call and call output to give full context
chat_messages.append(call)
chat_messages.append(call_output)

response = openai_client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=[text_search_tool]
)

print(response.output_text)

Yes, you can still join the course even if it's started. You don't need to register beforehand to participate. However, keep in mind that there are deadlines for submitting homework and final projects, so be sure to manage your time effectively.

For more detailed information, you can check the course materials and join the related channels for updates. If you have any specific questions about getting started, feel free to ask!


In [39]:
# Extend system prompt with more details and instructions
system_prompt = """
You are a helpful assistant for a course.

Use the search tool to find relevant information from the course materials before answering questions.

If you can find specific information through search, use it to provide accurate answers.
If the search doesn't return relevant results, let the user know and provide general guidance.
"""

In [65]:
# Allow for multiple searches
system_prompt= """
You are a helpful assistant for a course.

Always search for relevant information before answering.
If the first search doesn't give you enough information, try different search terms.

Make multiple searches if needed to provide comprehensive answers
"""

In [40]:
from pydantic_ai import Agent

agent = Agent(
    name="faq_agent",
    instructions=system_prompt,
    tools=[text_search],
    model='gpt-4o-mini'
)

In [41]:
# Assync call to the agent
question = "I just discovered the course, can I join now?"
result = await agent.run(user_prompt=question)

In [42]:
Markdown(result.output)

Yes, you can still join the course even after it has started. If you don't register, you're still eligible to submit the homework. However, be mindful that there will be deadlines for submitting homework and final projects, so it's advisable not to leave everything until the last minute.

For more details about registration and the upcoming cohorts, you can check the course [registration link](https://airtable.com/shr6oVXeQvSI5HuWD) and the course start date is January 13th, 2025.

In [43]:
result.new_messages()

[ModelRequest(parts=[UserPromptPart(content='I just discovered the course, can I join now?', timestamp=datetime.datetime(2025, 9, 29, 11, 35, 2, 933422, tzinfo=datetime.timezone.utc))], instructions="You are a helpful assistant for a course.\n\nUse the search tool to find relevant information from the course materials before answering questions.\n\nIf you can find specific information through search, use it to provide accurate answers.\nIf the search doesn't return relevant results, let the user know and provide general guidance."),
 ModelResponse(parts=[ToolCallPart(tool_name='text_search', args='{"query":"course enrollment","num_results":5}', tool_call_id='call_iHg52Da3LSCLuOWP8RzlTduH')], usage=RequestUsage(input_tokens=194, output_tokens=20, details={'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}), model_name='gpt-4o-mini-2024-07-18', timestamp=datetime.datetime(2025, 9, 29, 11, 35, 3, tzinfo=TzInfo(UTC)), provider_name='

- ModelRequest: Represents a request sent to the model. It includes the user's prompt (UserPromptPart) and the agent's instructions.
- ModelResponse: The model's reply. We see a ToolCallPart with the decision to invoke text_search.
- ModelRequest: Contains ToolReturnPart - the results returned by the tool (search results from the FAQ index).
- ModelResponse: The final answer generated by the model in TextPart.

# Day 5 - Offline Evaluation and Testing

In [None]:
# Convert messages from pydantic to dict for logging
from pydantic_ai.messages import ModelMessagesTypeAdapter


def log_entry(agent, messages, source="user"):
    tools = []
    # Get all tools used in the agent
    for ts in agent.toolsets:
        tools.extend(ts.tools.keys())
    # Convert messages from pydantic to dict (serialize)
    dict_messages = ModelMessagesTypeAdapter.dump_python(messages)

    return {
        "agent_name": agent.name,
        "system_prompt": agent._instructions,
        "provider": agent.model.system,
        "model": agent.model.model_name,
        "tools": tools,
        "messages": dict_messages,
        "source": source
    }

In [None]:
"""
Creates a logs directory (if not created previously).
Generates unique filenames with timestamp and random hex.
Saves complete interaction logs as JSON files.
Handles datetime serialization (using the serializer function).
"""

import json
import secrets
from pathlib import Path
from datetime import datetime


# ✅ Create a folder `logs/` in the project root (if not already there).
LOG_DIR = Path('logs')
LOG_DIR.mkdir(exist_ok=True)


def serializer(obj):
    """
    Custom serializer for objects that aren't natively JSON serializable.
    Currently only supports datetime -> ISO 8601 string.
    """
    if isinstance(obj, datetime):
        return obj.isoformat()
    raise TypeError(f"Type {type(obj)} not serializable")


def log_interaction_to_file(agent, messages, source='user'):
    """
    Save an agent interaction log to a uniquely named JSON file.

    Args:
        agent: The agent object (must have .name attribute).
        messages: A list of message dicts (each must include 'timestamp').
        source: String describing who triggered the interaction (default: 'user').

    Returns:
        Path: Path to the saved JSON log file.
    """

    # Convert
    entry = log_entry(agent, messages, source)

    # Build unique filename based on last message timestamp + random hex
    ts = entry['messages'][-1]['timestamp']
    ts_str = ts.strftime("%Y%m%d_%H%M%S")
    rand_hex = secrets.token_hex(3)  # adds randomness to avoid collisions

    filename = f"{agent.name}_{ts_str}_{rand_hex}.json"
    filepath = LOG_DIR / filename

    # Write JSON file with pretty formatting and datetime serialization
    with filepath.open("w", encoding="utf-8") as f_out:
        json.dump(entry, f_out, indent=2, default=serializer)

    return filepath


In [48]:
questions =["how do I use docker on windows?",
"can I join late and get a certificate?",
"what do I need to do for the certificate?"
]

In [None]:
for question in questions:
    result = await agent.run(user_prompt=question)
    print(result.output)
    log_interaction_to_file(agent, result.new_messages())

In [None]:
# Add references to the system prompt, also correct faq_main issue
system_prompt = """
You are a helpful assistant for a course.

Use the search tool to find relevant information from the course materials before answering questions.

If you can find specific information through search, use it to provide accurate answers.

Always include references by citing the filename of the source material you used.
When citing the reference, replace "faq-main" by the full path to the GitHub repository: "https://github.com/DataTalksClub/faq/blob/main/"
Format: [LINK TITLE](FULL_GITHUB_LINK)

If the search doesn't return relevant results, let the user know and provide general guidance.
""".strip()

# Create another version of agent, let's call it faq_agent_v2
agent = Agent(
    name="faq_agent_v2",
    instructions=system_prompt,
    tools=[text_search],
    model='gpt-4o-mini'
)

Note that I added this to the prompt:  
When citing the reference, replace "faq-main" by the full path to the GitHub repository: "https://github.com/DataTalksClub/faq/blob/main/"  
When analyzing the results, I noticed that we should have stripped "faq-main" from the filename on Day 1 when we were preparing the data. We should come back to it and adjust the ingestion process, but I won't do it here now.

In [58]:
for question in questions:
    result = await agent.run(user_prompt=question)
    log_interaction_to_file(agent, result.new_messages())
    display(Markdown(result.output))

To use Docker on Windows, follow these steps according to your Windows version (Pro or Home):

### For Windows 10 Pro / 11 Pro Users:
1. **Install Docker Desktop**: 
   - Ensure you are using the latest version of Docker for Windows. Download it from [Docker's official site](https://docs.docker.com/desktop/install/windows-install/).
   
2. **Enable Hyper-V**: 
   - Enable Hyper-V as this is necessary for Docker to use as a backend. You can follow this [tutorial to enable Hyper-V](https://www.c-sharpcorner.com/article/install-and-configured-docker-desktop-in-windows-10/).

3. **Running Docker**: 
   - After installation, launch Docker Desktop and follow any setup instructions.

### For Windows 10 Home / 11 Home Users:
1. **Install Docker Desktop**: 
   - Similarly, install Docker Desktop from [Docker's official site](https://docs.docker.com/desktop/install/windows-install/).

2. **Use WSL2 (Windows Subsystem for Linux)**: 
   - Since Home versions do not support Hyper-V, you will need to use WSL2. Follow this detailed guide to [install WSL on Windows 11](https://pureinfotech.com/install-wsl-windows-11/).

3. **Setting up WSL2**: 
   - Make sure your WSL2 Linux kernel is updated. You can reference the guidelines at [GitHub: WSL Issue 5393](https://github.com/microsoft/WSL/issues/5393).

### Common Issues:
- If Docker doesn't start or is stuck, try switching between Windows and Linux containers by right-clicking the Docker icon in the system tray.
- For permission issues, make sure to run Docker with elevated privileges.

For a comprehensive process on troubleshooting and using Docker on Windows, you can refer to the following references:
- [Docker won't start or is stuck in settings](https://github.com/DataTalksClub/faq/blob/main/faq-main/_questions/data-engineering-zoomcamp/module-1/015_docker-docker-wont-start-or.md)
- [Error during connect on Windows](https://github.com/DataTalksClub/faq/blob/main/faq-main/_questions/data-engineering-zoomcamp/module-1/011_46dbe4810d_docker-error-during-connect-in-the-default-daemon.md)

Feel free to ask if you have more specific questions!

You can join the course late, but to receive a certificate, you must complete the course with a "live" cohort. Certificates are not awarded for self-paced mode participation. Hence, if you join late, ensure that you still fulfill the requirements, particularly the peer-reviewed capstone projects, in order to obtain the certificate.

For more detailed information, you can refer to the following resources:

- [Do I need to do the homeworks to get the certificate?](https://github.com/DataTalksClub/faq/blob/main/faq-main/_questions/data-engineering-zoomcamp/general/014_3774a79c13_certificate-do-i-need-to-do-the-homeworks-to-get-t.md)
- [Can I follow the course in a self-paced mode and get a certificate?](https://github.com/DataTalksClub/faq/blob/main/faq-main/_questions/data-engineering-zoomcamp/general/015_900f60fd25_certificate-can-i-follow-the-course-in-a-self-pace.md)

To obtain your certificate for the course, you need to complete the following steps:

1. **Complete the Peer-Reviewed Capstone Projects**: You must finish all the required capstone projects on time. It is important to note that you do not need to complete homework assignments if you joined the course late.

2. **Participate in a Live Cohort**: You can only receive a certificate by completing the course with a "live" cohort. Certificates are not awarded for those following the course in a self-paced manner, as peer reviews for capstone projects can only occur while the course is ongoing.

3. **Certificate Issuance**: When the grading is completed, announcements will be made in the Telegram group and the course channel. You'll need to check that your full name is displayed correctly in your course profile, which can be accessed at this link: [Course Profile](https://courses.datatalks.club/de-zoomcamp-2025/enrollment). After receiving notifications, follow the instructions provided in the [certificates.md](https://github.com/DataTalksClub/data-engineering-zoomcamp/blob/main/certificates.md) for generating your certificate document.

For more details, please refer to the following sources:
- [Do I need to do the homeworks to get the certificate?](https://github.com/DataTalksClub/faq/blob/main/faq-main/_questions/data-engineering-zoomcamp/general/014_3774a79c13_certificate-do-i-need-to-do-the-homeworks-to-get-t.md)
- [Can I follow the course in a self-paced mode and get a certificate?](https://github.com/DataTalksClub/faq/blob/main/faq-main/_questions/data-engineering-zoomcamp/general/015_900f60fd25_certificate-can-i-follow-the-course-in-a-self-pace.md)
- [How do I get my certificate?](https://github.com/DataTalksClub/faq/blob/main/faq-main/_questions/data-engineering-zoomcamp/general/046_6314bc3029_how-do-i-get-my-certificate.md)

## LLM as a judge

So, in our case, we can have the following checks:
- Does the agent follow the instructions?
- Given the question, does the answer make sense?
- Does it include references?
- Did the agent use the available tools?


In [None]:
# Evaluation prompt for LLM as a judge
evaluation_prompt = """
Use this checklist to evaluate the quality of an AI agent's answer (<ANSWER>) to a user question (<QUESTION>).
We also include the entire log (<LOG>) for analysis.

For each item, check if the condition is met.

Checklist:

- instructions_follow: The agent followed the user's instructions (in <INSTRUCTIONS>)
- instructions_avoid: The agent avoided doing things it was told not to do
- answer_relevant: The response directly addresses the user's question
- answer_clear: The answer is clear and correct
- answer_citations: The response includes proper citations or sources when required
- completeness: The response is complete and covers all key aspects of the request
- tool_call_search: Is the search tool invoked?

Output true/false for each check and provide a short explanation for your judgment.
""".strip()

Since we expect a very well defined structure of the response, we can use [structured output](https://platform.openai.com/docs/guides/structured-outputs).

We can define a Pydantic class with the expected response structure, and the LLM will produce output that matches this schema exactly.

In [69]:
# Structured output class
from pydantic import BaseModel

class EvaluationCheck(BaseModel):
    check_name: str
    justification: str
    check_pass: bool

class EvaluationChecklist(BaseModel):
    checklist: list[EvaluationCheck]
    summary: str

In [70]:
# Instantiate judge agent with eval prompt and structured output class
eval_agent = Agent(
    name='eval_agent',
    model='gpt-5-nano',
    instructions=evaluation_prompt,
    output_type=EvaluationChecklist
)

In [71]:
# input template prompt with xml formating
user_prompt_format = """
<INSTRUCTIONS>
{instructions}
</INSTRUCTIONS>
<QUESTION>
{question}
</QUESTION>
<LOG>
{log}
</LOG>
<ANSWER>
{answer}
</ANSWER>
""".strip()

In [72]:
# Helper function to load og file
def load_log_file(log_file):
    with open(log_file, 'r') as f_in:
        log_data = json.load(f_in)
        log_data['log_file'] = log_file
        return log_data

In [73]:
# load log file format user prompt
log_record = load_log_file('./logs/faq_agent_v2_20250929_115644_8f2153.json')

instructions = log_record['system_prompt']
question = log_record['messages'][0]['parts'][0]['content']
answer = log_record['messages'][-1]['parts'][0]['content']
log = json.dumps(log_record['messages'])

user_prompt = user_prompt_format.format(
    instructions=instructions,
    question=question,
    answer=answer,
    log=log
)

In [74]:
user_prompt

'<INSTRUCTIONS>\nYou are a helpful assistant for a course.  \n\nUse the search tool to find relevant information from the course materials before answering questions.  \n\nIf you can find specific information through search, use it to provide accurate answers.\n\nAlways include references by citing the filename of the source material you used.  \nWhen citing the reference, replace "faq-main" by the full path to the GitHub repository: "https://github.com/DataTalksClub/faq/blob/main/"\nFormat: [LINK TITLE](FULL_GITHUB_LINK)\n\nIf the search doesn\'t return relevant results, let the user know and provide general guidance.\n</INSTRUCTIONS>\n<QUESTION>\nhow do I use docker on windows?\n</QUESTION>\n<LOG>\n[{"parts": [{"content": "how do I use docker on windows?", "timestamp": "2025-09-29T11:56:44.168061+00:00", "part_kind": "user-prompt"}], "instructions": "You are a helpful assistant for a course.  \\n\\nUse the search tool to find relevant information from the course materials before answ

In [75]:
result = await eval_agent.run(user_prompt, output_type=EvaluationChecklist)

checklist = result.output
print(checklist.summary)

for check in checklist.checklist:
    print(check)

Overall, the answer was helpful and accurate but the citations formatting did not fully comply with the required format; tool usage was appropriate.
check_name='instructions_follow' justification='Used course materials as sources and provided references to relevant Docker Windows guidance; followed instruction to cite sources, albeit citation format not perfectly aligned.' check_pass=True
check_name='instructions_avoid' justification='No disallowed action observed; no harmful content.' check_pass=True
check_name='answer_relevant' justification='Answer directly addresses using Docker on Windows with steps for Pro and Home and mentions common issues and references.' check_pass=True
check_name='answer_clear' justification='Clear step-by-step guidance with sections for Pro and Home; includes bullet points and follow-ups.' check_pass=True
check_name='answer_citations' justification='Cited sources but not in required format; URLs provided but not [TITLE](URL) with full path; thus not fully c

In [None]:
# Simplify log for prompt
def simplify_log_messages(messages):
    log_simplified = []

    for m in messages:
        parts = []

        for original_part in m['parts']:
            part = original_part.copy()
            kind = part['part_kind']
            # Remove unnecessary fields
            if kind == 'user-prompt':
                del part['timestamp']
            if kind == 'tool-call':
                del part['tool_call_id']
            if kind == 'tool-return':
                del part['tool_call_id']
                del part['metadata']
                del part['timestamp']
                # Replace actual search results with placeholder to save tokens
                part['content'] = 'RETURN_RESULTS_REDACTED'
            if kind == 'text':
                del part['id']

            parts.append(part)

        message = {
            'kind': m['kind'],
            'parts': parts
        }

        log_simplified.append(message)
    return log_simplified


In [None]:
async def evaluate_log_record(eval_agent, log_record):
    messages = log_record['messages']
    # Extract relevant fields
    instructions = log_record['system_prompt']
    question = messages[0]['parts'][0]['content']
    answer = messages[-1]['parts'][0]['content']
    # Simplify log
    log_simplified = simplify_log_messages(messages)
    log = json.dumps(log_simplified)
    # Build prompt
    user_prompt = user_prompt_format.format(
        instructions=instructions,
        question=question,
        answer=answer,
        log=log
    )
    # Run evaluation prompt and return result
    result = await eval_agent.run(user_prompt, output_type=EvaluationChecklist)
    return result.output


log_record = load_log_file('logs/faq_agent_v2_20250929_115655_2b3ad1.json')
eval1 = await evaluate_log_record(eval_agent, log_record)

In [80]:
print(eval1.summary)

for check in eval1.checklist:
    print(check)

All evaluation criteria met. The answer effectively uses course materials, cites sources, and provides clear guidance for late joiners seeking a certificate.
check_name='instructions_follow' justification='Used course search results and provided citations per user instruction.' check_pass=True
check_name='instructions_avoid' justification='No disallowed content or actions detected in the answer.' check_pass=True
check_name='answer_relevant' justification='Directly answered whether late joining is possible and certificate eligibility.' check_pass=True
check_name='answer_clear' justification='Clear policy: live cohort needed for certificate; self-paced not eligible; mentions requirements.' check_pass=True
check_name='answer_citations' justification='Cited two FAQ sources with full GitHub links as required.' check_pass=True
check_name='completeness' justification='Addresses main question and provides further reading resources; mentions capstone projects.' check_pass=True
check_name='tool_

## Question generation

In [81]:
question_generation_prompt = """
You are helping to create test questions for an AI agent that answers questions about a data engineering course.

Based on the provided FAQ content, generate realistic questions that students might ask.

The questions should:

- Be natural and varied in style
- Range from simple to complex
- Include both specific technical questions and general course questions

Generate one question for each record.
""".strip()

class QuestionsList(BaseModel):
    questions: list[str]

question_generator = Agent(
    name="question_generator",
    instructions=question_generation_prompt,
    model='gpt-4o-mini',
    output_type=QuestionsList
)


In [82]:
import random

sample = random.sample(de_dtc_faq, 10)
prompt_docs = [d['content'] for d in sample]
prompt = json.dumps(prompt_docs)

result = await question_generator.run(prompt)
questions = result.output.questions


In [83]:
questions

['Why does Python 3.11 cause issues with Spark 3.0.3, and how can I resolve it?',
 'Can you explain the differences in demo interfaces between dbt cloud Developer and Team licenses?',
 'Do I need to register to start learning and submitting homework for the course?',
 'What should I do to define the column format when converting CSV to Parquet?',
 'How do I fix SIGILL errors in the Java Runtime Environment on MacOS M4 when using Kestra?',
 'How can I determine which Spark session I am observing if multiple are active?',
 'What should I do if my credit or debit card is rejected by Google for course-related payments?',
 'Can I use pandas 2.0.1 with PySpark 3.5.1, and how do I set it up?',
 "Is it possible to get a course certificate if I join late and don't complete all the homework?",
 'How do I fix the broken `dbt_utils.surrogate_key` function in my SQL code?']

In [84]:
# Iterate over questions
for q in tqdm(questions):
    print(q)

    result = await agent.run(user_prompt=q)
    print(result.output)

    log_interaction_to_file(
        agent,
        result.new_messages(),
        source='ai-generated'
    )

    print()

  0%|          | 0/10 [00:00<?, ?it/s]

Why does Python 3.11 cause issues with Spark 3.0.3, and how can I resolve it?
Python 3.11 causes issues with Spark 3.0.3 due to certain inconsistencies between the new Python version and the older Spark framework. One of the known errors is the `TypeError: code() argument 13 must be str, not int`, which commonly occurs when attempting to import PySpark.

### Solutions:

1. **Downgrade Python Version:**  
   Switch to Python 3.9, which is compatible with Spark 3.0.3. You can create a conda environment to manage different Python versions:
   ```bash
   conda create -n pyspark_env python=3.9
   conda activate pyspark_env
   ```

2. **Upgrade PySpark Version:**  
   If you prefer to keep Python 3.11, you can upgrade your PySpark version to 3.5.1 or above, which offers compatibility with Python 3.11:
   ```bash
   pip install pyspark==3.5.1
   ```

Make sure to set up your environment correctly to avoid version mismatches. This approach should help you resolve the issues between Python 3.11

In [None]:
eval_set = []

for log_file in LOG_DIR.glob('*.json'):
    if 'faq_agent_v2' not in log_file.name:
        continue
    # Only use ai-generated question logs
    log_record = load_log_file(log_file)
    if log_record['source'] != 'ai-generated':
        continue

    eval_set.append(log_record)

In [87]:
len(eval_set)

10

In [88]:
eval_results = []

for log_record in tqdm(eval_set):
    eval_result = await evaluate_log_record(eval_agent, log_record)
    eval_results.append((log_record, eval_result))

  0%|          | 0/10 [00:00<?, ?it/s]

## Evaluation results analysis

In [89]:
# Create dataframe by defining rows per file and adding checks
rows = []

for log_record, eval_result in eval_results:
    messages = log_record['messages']

    row = {
        'file': log_record['log_file'].name,
        'question': messages[0]['parts'][0]['content'],
        'answer': messages[-1]['parts'][0]['content'],
    }
    # Add checks to row dict
    checks = {c.check_name: c.check_pass for c in eval_result.checklist}
    row.update(checks)

    rows.append(row)

In [None]:
import pandas as pd

df_evals = pd.DataFrame(rows)
df_evals

Unnamed: 0,file,question,answer,instructions_follow,instructions_avoid,answer_relevant,answer_clear,answer_citations,completeness,tool_call_search,dummy
0,faq_agent_v2_20250929_193631_130d0c.json,Do I need to register to start learning and su...,To start learning and submitting homework for ...,True,True,True,True,True,True,True,
1,faq_agent_v2_20250929_193714_0b5362.json,What should I do if my credit or debit card is...,If your credit or debit card is rejected by Go...,True,True,True,True,True,True,True,
2,faq_agent_v2_20250929_193704_975cd4.json,How can I determine which Spark session I am o...,To determine which Spark session you are obser...,True,True,True,True,True,True,True,
3,faq_agent_v2_20250929_193750_3c6d10.json,How do I fix the broken `dbt_utils.surrogate_k...,To fix the broken `dbt_utils.surrogate_key` fu...,True,True,True,True,True,True,False,
4,faq_agent_v2_20250929_193638_7e1ea6.json,What should I do to define the column format w...,To define the column format when converting CS...,True,True,True,True,True,True,True,
5,faq_agent_v2_20250929_193722_0c915d.json,"Can I use pandas 2.0.1 with PySpark 3.5.1, and...","Yes, you can use Pandas 2.0.1 with PySpark 3.5...",,,,,,,,
6,faq_agent_v2_20250929_193744_65cd06.json,Is it possible to get a course certificate if ...,"Yes, it is possible to receive a course certif...",,,,,,,,True
7,faq_agent_v2_20250929_193655_ae792a.json,How do I fix SIGILL errors in the Java Runtime...,To fix SIGILL errors in the Java Runtime Envir...,True,True,True,True,True,True,True,
8,faq_agent_v2_20250929_193624_9036b7.json,Can you explain the differences in demo interf...,The differences in demo interfaces between dbt...,True,True,True,True,False,True,True,
9,faq_agent_v2_20250929_193614_108abd.json,Why does Python 3.11 cause issues with Spark 3...,Python 3.11 causes issues with Spark 3.0.3 due...,,,,,,,True,


In [101]:
df_evals[[col for col in df_evals.columns if "_" in col]]

Unnamed: 0,instructions_follow,instructions_avoid,answer_relevant,answer_clear,answer_citations,tool_call_search
0,True,True,True,True,True,True
1,True,True,True,True,True,True
2,True,True,True,True,True,True
3,True,True,True,True,True,False
4,True,True,True,True,True,True
5,,,,,,
6,,,,,,
7,True,True,True,True,True,True
8,True,True,True,True,False,True
9,,,,,,True


In [102]:
for col in df_evals.columns:
    if "_" in col:
        df_evals[col] = df_evals[col].astype(bool).astype(int)

In [103]:
df_evals

Unnamed: 0,file,question,answer,instructions_follow,instructions_avoid,answer_relevant,answer_clear,answer_citations,completeness,tool_call_search,dummy
0,faq_agent_v2_20250929_193631_130d0c.json,Do I need to register to start learning and su...,To start learning and submitting homework for ...,1,1,1,1,1,True,1,
1,faq_agent_v2_20250929_193714_0b5362.json,What should I do if my credit or debit card is...,If your credit or debit card is rejected by Go...,1,1,1,1,1,True,1,
2,faq_agent_v2_20250929_193704_975cd4.json,How can I determine which Spark session I am o...,To determine which Spark session you are obser...,1,1,1,1,1,True,1,
3,faq_agent_v2_20250929_193750_3c6d10.json,How do I fix the broken `dbt_utils.surrogate_k...,To fix the broken `dbt_utils.surrogate_key` fu...,1,1,1,1,1,True,0,
4,faq_agent_v2_20250929_193638_7e1ea6.json,What should I do to define the column format w...,To define the column format when converting CS...,1,1,1,1,1,True,1,
5,faq_agent_v2_20250929_193722_0c915d.json,"Can I use pandas 2.0.1 with PySpark 3.5.1, and...","Yes, you can use Pandas 2.0.1 with PySpark 3.5...",1,1,1,1,1,,1,
6,faq_agent_v2_20250929_193744_65cd06.json,Is it possible to get a course certificate if ...,"Yes, it is possible to receive a course certif...",1,1,1,1,1,,1,True
7,faq_agent_v2_20250929_193655_ae792a.json,How do I fix SIGILL errors in the Java Runtime...,To fix SIGILL errors in the Java Runtime Envir...,1,1,1,1,1,True,1,
8,faq_agent_v2_20250929_193624_9036b7.json,Can you explain the differences in demo interf...,The differences in demo interfaces between dbt...,1,1,1,1,0,True,1,
9,faq_agent_v2_20250929_193614_108abd.json,Why does Python 3.11 cause issues with Spark 3...,Python 3.11 causes issues with Spark 3.0.3 due...,1,1,1,1,1,,1,


In [105]:
df_evals.describe()

Unnamed: 0,instructions_follow,instructions_avoid,answer_relevant,answer_clear,answer_citations,tool_call_search
count,10.0,10.0,10.0,10.0,10.0,10.0
mean,1.0,1.0,1.0,1.0,0.9,0.9
std,0.0,0.0,0.0,0.0,0.316228,0.316228
min,1.0,1.0,1.0,1.0,0.0,0.0
25%,1.0,1.0,1.0,1.0,1.0,1.0
50%,1.0,1.0,1.0,1.0,1.0,1.0
75%,1.0,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0


## Evaluate search quality

- Precision and Recall: How many relevant results were retrieved vs. how many relevant results were missed
- Hit Rate: Percentage of queries that return at least one relevant result
- MRR (Mean Reciprocal Rank): Reflects the position of the first relevant result in the ranking


In [None]:
def evaluate_search_quality(search_function, test_queries):
    results = []

    for query, expected_docs in test_queries:
        search_results = search_function(query, num_results=5)

        # Calculate hit rate
        relevant_found = any(doc['filename'] in expected_docs for doc in search_results)

        # Calculate MRR
        for i, doc in enumerate(search_results):
            if doc['filename'] in expected_docs:
                mrr = 1 / (i + 1)
                break
        else:
            mrr = 0

        results.append({
            'query': query,
            'hit': relevant_found,
            'mrr': mrr
        })
    return results