### Imports

In [1]:
%load_ext autoreload
%autoreload 2
# Standard libraries
import io
import os
import re
import zipfile

# Third-party libraries
import requests
import frontmatter
from dotenv import load_dotenv
from tqdm.notebook import tqdm

# Google Gemini API
import google.generativeai as genai


# Day 1: Download and extract the zip file

In [2]:
# Download the repository archive. The timeout keeps the cell from hanging
# forever, and raise_for_status() stops here on an HTTP error instead of
# passing an error page to zipfile in the next cell.
url = 'https://codeload.github.com/DataTalksClub/faq/zip/refs/heads/main'
resp = requests.get(url, timeout=60)
resp.raise_for_status()

In [3]:
repository_data = []

# Open the downloaded bytes as an in-memory zip archive. The context manager
# closes the archive even if reading or parsing one of the files raises.
with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
    for file_info in zf.infolist():
        # Lower-cased so the extension check is case-insensitive; this
        # lower-cased path is also what is stored on each record.
        filename = file_info.filename.lower()
        # Keep markdown files only
        if not filename.endswith(('.md', '.mdx')):
            continue
        # Read the raw file bytes from the archive
        with zf.open(file_info) as f_in:
            content = f_in.read()
        # Parse the file and flatten the result into a plain dict
        post = frontmatter.loads(content)
        data = post.to_dict()
        data['filename'] = filename
        repository_data.append(data)

In [4]:
print(repository_data[1])

{'id': '9e508f2212', 'question': 'Course: When does the course start?', 'sort_order': 1, 'content': "The next cohort starts January 13th, 2025. More info at [DTC](https://datatalks.club/blog/guide-to-free-online-courses-at-datatalks-club.html).\n\n- Register before the course starts using this [link](https://airtable.com/shr6oVXeQvSI5HuWD).\n- Join the [course Telegram channel with announcements](https://t.me/dezoomcamp).\n- Don’t forget to register in DataTalks.Club's Slack and join the channel.", 'filename': 'faq-main/_questions/data-engineering-zoomcamp/general/001_9e508f2212_course-when-does-the-course-start.md'}


In [5]:
from read import read_repo_data

In [6]:
prefix = 'https://codeload.github.com'
dtc_faq = read_repo_data('DataTalksClub', 'faq', prefix=prefix)
evidently_docs = read_repo_data('evidentlyai', 'docs', prefix=prefix)

In [7]:
print(f"FAQ documents: {len(dtc_faq)}")
print(f"Evidently documents: {len(evidently_docs)}")

FAQ documents: 1219
Evidently documents: 95


In [8]:
evidently_docs[45]['content']



# Day 2: Chunking and Intelligent Processing for Data

## 1. Chunking by sliding window

In [10]:
def sliding_window_chunking(seq, size, step):
    """Chunk a text sequence using a sliding window approach.

    A window of at most ``size`` characters is taken every ``step``
    characters, so consecutive chunks overlap by ``size - step`` characters
    whenever ``step < size``. Chunking stops with the first window that
    reaches the end of the sequence.

    Args:
        seq (str): text sequence to chunk
        size (int): maximum length of each chunk
        step (int): distance between the starts of consecutive chunks

    Raises:
        ValueError: size and step must be positive.

    Returns:
        list: list of dict with 'start', 'end' and 'chunk' keys. 'end' is
            exclusive and never exceeds ``len(seq)``, so
            ``seq[start:end] == chunk`` for every entry. An empty ``seq``
            yields an empty list.
    """
    if size <= 0 or step <= 0:
        raise ValueError("Size and step must be positive.")

    total = len(seq)
    result = []
    # Slide the window start across the sequence in increments of `step`
    for start in range(0, total, step):
        # Clamp the end offset so the final (shorter) chunk reports its
        # real boundary instead of a position past the end of the text.
        end = min(start + size, total)
        result.append({'start': start, 'end': end, 'chunk': seq[start:end]})
        # This window already covers the tail of the sequence: stop here
        if end >= total:
            break
    return result

In [11]:
sliding_window_chunking(evidently_docs[45]['content'], 2000, 1000)

[{'start': 0,
  'end': 2000,
  'chunk': "In this tutorial, you will learn how to perform regression testing for LLM outputs.\n\nYou can compare new and old responses after changing a prompt, model, or anything else in your system. By re-running the same inputs with new parameters, you can spot any significant changes. This helps you push updates with confidence or identify issues to fix.\n\n<Info>\n  **This example uses Evidently Cloud.** You'll run evals in Python and upload them. You can also skip the upload and view Reports locally. For self-hosted, replace `CloudWorkspace` with `Workspace`.\n</Info>\n\n# Tutorial scope\n\nHere's what we'll do:\n\n* **Create a toy dataset**. Build a small Q&A dataset with answers and reference responses.\n\n* **Get new answers**. Imitate generating new answers to the same question.\n\n* **Create and run a Report with Tests**. Compare the answers using LLM-as-a-judge to evaluate length, correctness and style consistency.\n\n* **Build a monitoring Das

In [17]:
evidently_chunks = []
for document in evidently_docs:
    # Separate the body text from the remaining metadata fields
    metadata = document.copy()
    body = metadata.pop('content')
    # Cut the body into 2000-character windows that start every 1000 characters
    for window in sliding_window_chunking(body, 2000, 1000):
        # Attach the document metadata to the chunk record itself
        window.update(metadata)
        evidently_chunks.append(window)

In [21]:
evidently_chunks[5]

{'start': 4000,
 'end': 6000,
 'chunk': '2-17" description="Evidently v0.6.4">\n  ## **Evidently 0.6.4**\n\n  Full release notes on [Github](https://github.com/evidentlyai/evidently/releases/tag/v0.6.4).\n</Update>\n\n<Update label="2025-02-12" description="Evidently v0.6.3">\n  ## **Evidently 0.6.3**\n\n  Full release notes on [Github](https://github.com/evidentlyai/evidently/releases/tag/v0.6.3). Added new RAG descriptors: see [tutorial](/examples/LLM_rag_evals) and [release blog](https://www.evidentlyai.com/blog/open-source-rag-evaluation-tool).\n</Update>\n\n<Update label="2025-02-07" description="Evidently v0.6.2">\n  ## **Evidently 0.6.2**\n\n  Full release notes on [Github](https://github.com/evidentlyai/evidently/releases/tag/v0.6.2). We extended support for `litellm` , so you can easily use different providers like Gemini, Anthropic, etc. for LLM-based evaluations.\n</Update>\n\n<Update label="2025-01-31" description="Evidently v0.6.1">\n  ## **Evidently 0.6.1**\n\n  Full rele

## 2. Chunking by Paragraphs and sections

In [None]:
text = evidently_docs[45]['content']
paragraphs = re.split(r"\n\s*\n", text.strip())
paragraphs

In [None]:
def split_markdown_by_level(text, level=2):
    """Split markdown text into sections based on header levels.

    A section starts at a line beginning with exactly ``level`` '#'
    characters followed by a space, and runs until the next such line.
    Text before the first matching header is not returned, and a text
    with no matching header yields an empty list.

    Args:
        text (str): Markdown text to split.
        level (int): Header level to split by (e.g., 2 for '##').

    Returns:
        list: List of sections as strings. Each section is the header line,
            followed by a blank line and the stripped section body when the
            body is non-empty.
    """
    # Create a regex pattern to match headers of the specified level
    header_pattern = r'^(#{' + str(level) + r'} )(.+)$'
    pattern = re.compile(header_pattern, re.MULTILINE)
    # With two capturing groups, split() returns
    # [before_first_header, marker, title, body, marker, title, body, ...]
    parts = pattern.split(text)

    sections = []
    # Start at 1 to skip the preamble and walk the (marker, title, body) triples
    for i in range(1, len(parts), 3):
        # Rebuild the header line, e.g. "## " + "Title"
        header = (parts[i] + parts[i + 1]).strip()
        # Always bind `content` for this iteration so a missing body can
        # neither raise NameError nor reuse the previous section's text.
        content = parts[i + 2].strip() if i + 2 < len(parts) else ''
        sections.append(f"{header}\n\n{content}" if content else header)
    return sections

**Note**: The split looks only at how each line starts, so with `level=1` a line such as a Python `# comment` inside a code example is also treated as a heading and the example gets cut apart. In general, this is not a big problem for documentation.

In [None]:
split_markdown_by_level(evidently_docs[45]['content'], level=2)

In [None]:
evidently_chunks = []

for doc in evidently_docs:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    sections = split_markdown_by_level(doc_content, level=2)
    for section in sections:
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        evidently_chunks.append(section_doc)

In [None]:
evidently_chunks[2]

## 3. Intelligent Chunking with LLM

In [None]:
# Read variables defined in a .env file
load_dotenv()
# Look up the Gemini key (None when the variable is not set)
API_KEY = os.environ.get('GEMINI_API_KEY')
# Stop right away when the key is missing or empty
if not API_KEY:
    raise ValueError("API key not found. Please set the GEMINI_API_KEY environment variable.")
print("API key loaded successfully.")

In [None]:
# trying with the gemini api
genai.configure(api_key=API_KEY)

def llm(prompt: str, model: str = "gemini-2.5-flash-lite") -> str:
    """
    Call Gemini with a text prompt and return the output text.

    Args:
        prompt (str): The input prompt for the LLM.
        model (str): Gemini model name (default: gemini-2.5-flash-lite).

    Returns:
        str: The generated text.

    Raises:
        ValueError: If the response is empty or exposes no ``text`` attribute.
        Exception: Any error raised while calling the model is reported
            (model name, truncated prompt, error message) and then
            re-raised unchanged.
    """
    try:
        model_obj = genai.GenerativeModel(model)
        response = model_obj.generate_content(prompt)

        if not response or not hasattr(response, "text"):
            raise ValueError("LLM returned no text.")

        return response.text

    except Exception as e:
        # Debug report: show enough context to reproduce the failing call,
        # then re-raise so the caller still sees the original error.
        print("❌ Error during LLM call")
        print(f"Model: {model}")
        print(f"Prompt (truncated): {prompt[:200]}{'...' if len(prompt) > 200 else ''}")
        print(f"Error: {e}")
        raise


In [None]:
prompt_template = """
Split the provided document into logical sections
that make sense for a Q&A system.

Each section should be self-contained and cover
a specific topic or concept.

<DOCUMENT>
{document}
</DOCUMENT>

Use this format:

## Section Name

Section content with all relevant details

---

## Another Section Name

Another section content

---
""".strip()

Considerations to improve prompt:

- Unbounded length: the model might produce very large sections if the input doc is long (could exceed embedding limits).

- Ambiguous instructions: “logical sections” might be interpreted differently by the model (especially across varied docs).

- No output constraints: doesn’t say “keep each section < N tokens” or “max 5 sections” → could be inconsistent.

In [None]:
def intelligent_chunking(text):
    """Split a document into sections using the LLM.

    The document is inserted into ``prompt_template``, sent to ``llm``, and
    the response is cut at the separator lines requested by the prompt.

    Args:
        text (str): Document text to split.

    Returns:
        list: Non-empty sections as stripped strings, in response order.
    """
    prompt = prompt_template.format(document=text)
    response = llm(prompt)
    # Split only on lines made up solely of dashes (the '---' separator the
    # prompt asks for). A plain substring split would also cut through any
    # '---' inside section content, e.g. markdown table rows like '|---|---|'.
    sections = re.split(r'^[ \t]*-{3,}[ \t]*\r?$', response, flags=re.MULTILINE)
    return [s.strip() for s in sections if s.strip()]

In [None]:
evidently_docs[5:6]

In [None]:
# Test with example
evidently_chunks = []

for doc in tqdm(evidently_docs[5:6]):
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')

    sections = intelligent_chunking(doc_content)
    for section in sections:
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        evidently_chunks.append(section_doc)

In [None]:
evidently_chunks

# Day 3 - Search — lexical + semantic + hybrid


## 1. Lexical search

In [22]:
# Index evidently chunks using minsearch
from minsearch import Index

# instantiate index object
index = Index(
    text_fields=['chunk', 'title', 'description', 'filename'],
    keyword_fields=[]
)

# fit to the evidently chunks
index.fit(evidently_chunks)

<minsearch.minsearch.Index at 0x759b872b4af0>

In [24]:
# Test a query
query = 'What should be in a test dataset for AI evaluation?'
results = index.search(query)
results[1]

{'start': 3000,
 'end': 5000,
 'chunk': ' Inputs, context, and outputs (for RAG evaluation)\n</Info>\n\n<Info>\n  **Collecting live data**. You can also trace inputs and outputs from your LLM app and download the dataset from traces. See the [Tracing Quickstart](/quickstart_tracing)\n</Info>\n\n## 3. Run evaluations\n\nWe\'ll evaluate the answers for:\n\n- **Sentiment:** from -1 (negative) to 1 (positive)\n- **Text length:** character count\n- **Denials:** refusals to answer. This uses an LLM-as-a-judge with built-in prompt.\n\nEach evaluation is a `descriptor`. It adds a new score or label to each row in your dataset.\n\nFor LLM-as-a-judge, we\'ll use OpenAI GPT-4o mini. Set OpenAI key as an environment variable:\n\n```python\n## import os\n## os.environ["OPENAI_API_KEY"] = "YOUR KEY"\n```\n\n<Info>\n  If you don\'t have an OpenAI key, you can use a keyword-based check `IncludesWords` instead.\n</Info>\n\nTo run evals, pass the dataset and specify the list of descriptors to add:\n\n``

In [None]:
# Keep only the data-engineering entries of the DataTalks FAQ. `dtc_faq` was
# already loaded in Day 1, so the repository is not downloaded a second time.
de_dtc_faq = [doc for doc in dtc_faq if 'data-engineering' in doc['filename']]
# Lexical index over the question and answer text
faq_index = Index(
    text_fields=['question', 'content'],
    keyword_fields=[]
)
faq_index.fit(de_dtc_faq)

<minsearch.minsearch.Index at 0x759beab8d210>

In [27]:
query = 'Can I join the course after it starts?'
results = faq_index.search(query)
results

[{'id': '068529125b',
  'question': 'Course - Can I follow the course after it finishes?',
  'sort_order': 8,
  'content': 'Yes, we will keep all the materials available, so you can follow the course at your own pace after it finishes.\n\nYou can also continue reviewing the homeworks and prepare for the next cohort. You can also start working on your final capstone project.',
  'filename': 'faq-main/_questions/data-engineering-zoomcamp/general/008_068529125b_course-can-i-follow-the-course-after-it-finishes.md'},
 {'id': '9e508f2212',
  'question': 'Course: When does the course start?',
  'sort_order': 1,
  'content': "The next cohort starts January 13th, 2025. More info at [DTC](https://datatalks.club/blog/guide-to-free-online-courses-at-datatalks-club.html).\n\n- Register before the course starts using this [link](https://airtable.com/shr6oVXeQvSI5HuWD).\n- Join the [course Telegram channel with announcements](https://t.me/dezoomcamp).\n- Don’t forget to register in DataTalks.Club's S

In [32]:
query1 = "I just discovered the program, can I still enroll?"
query2 = "I just found out about the course, can I still join?"

results1 = faq_index.search(query1)
results2 = faq_index.search(query2)
print(results2)


[{'id': '3f1424af17', 'question': 'Course: Can I still join the course after the start date?', 'sort_order': 3, 'content': "Yes, even if you don't register, you're still eligible to submit the homework.\n\nBe aware, however, that there will be deadlines for turning in homeworks and the final projects. So don't leave everything for the last minute.", 'filename': 'faq-main/_questions/data-engineering-zoomcamp/general/003_3f1424af17_course-can-i-still-join-the-course-after-the-start.md'}, {'id': '9e508f2212', 'question': 'Course: When does the course start?', 'sort_order': 1, 'content': "The next cohort starts January 13th, 2025. More info at [DTC](https://datatalks.club/blog/guide-to-free-online-courses-at-datatalks-club.html).\n\n- Register before the course starts using this [link](https://airtable.com/shr6oVXeQvSI5HuWD).\n- Join the [course Telegram channel with announcements](https://t.me/dezoomcamp).\n- Don’t forget to register in DataTalks.Club's Slack and join the channel.", 'file

## 2. Vector search

The `multi-qa-distilbert-cos-v1` model is trained explicitly for question-answering tasks. It creates embeddings optimized for finding answers to questions.  
Other popular models include:  
- all-MiniLM-L6-v2 - General-purpose, fast, and efficient
- all-mpnet-base-v2 - Higher quality, slower  

Check the [Sentence Transformers documentation](https://www.sbert.net/docs/pretrained_models.html) for more options.

In [34]:
# Import and select embedding model
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/523 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Create embedding for a document
record = de_dtc_faq[2]
text = record['question'] + ' ' + record['content'] # Concatenate question and content
v_doc = embedding_model.encode(text)

In [None]:
# Create embedding for a query
query = 'I just found out about the course. Can I enroll now?'
v_query = embedding_model.encode(query)

In [39]:
# Calculate similarity - normalized embeddings where dot product equals cosine similarity
similarity = v_query.dot(v_doc)
similarity

np.float32(0.51909333)

In [None]:
import numpy as np

# One embedding per FAQ entry, computed from question and answer together
faq_vectors = []

for d in tqdm(de_dtc_faq):
    text = d['question'] + ' ' + d['content']
    v = embedding_model.encode(text)
    faq_vectors.append(v)

# Stack the per-document vectors into a single 2D array
faq_embeddings = np.array(faq_vectors)
# Display (number of documents, embedding dimension)
faq_embeddings.shape

  0%|          | 0/449 [00:00<?, ?it/s]

(449, 768)

In [41]:
from minsearch import VectorSearch
# Create vector search index
faq_vindex = VectorSearch()
faq_vindex.fit(faq_embeddings, de_dtc_faq)

<minsearch.vector.VectorSearch at 0x759a7b6d31c0>

In [None]:
#### Query vector index
query = 'Can I join the course now?'
# embed query
q = embedding_model.encode(query)
# search for the most similar document based on query embedding
results = faq_vindex.search(q)
results

[{'id': '3f1424af17',
  'question': 'Course: Can I still join the course after the start date?',
  'sort_order': 3,
  'content': "Yes, even if you don't register, you're still eligible to submit the homework.\n\nBe aware, however, that there will be deadlines for turning in homeworks and the final projects. So don't leave everything for the last minute.",
  'filename': 'faq-main/_questions/data-engineering-zoomcamp/general/003_3f1424af17_course-can-i-still-join-the-course-after-the-start.md'},
 {'id': '068529125b',
  'question': 'Course - Can I follow the course after it finishes?',
  'sort_order': 8,
  'content': 'Yes, we will keep all the materials available, so you can follow the course at your own pace after it finishes.\n\nYou can also continue reviewing the homeworks and prepare for the next cohort. You can also start working on your final capstone project.',
  'filename': 'faq-main/_questions/data-engineering-zoomcamp/general/008_068529125b_course-can-i-follow-the-course-after-i

In [43]:
### 2.1 Embed evidently chunks
# One embedding per chunk, computed from the chunk text
chunk_vectors = [
    embedding_model.encode(item['chunk'])
    for item in tqdm(evidently_chunks)
]
# Stack the vectors into a numpy array
evidently_embeddings = np.array(chunk_vectors)

# Vector search index pairing each embedding with its chunk record
evidently_vindex = VectorSearch()
evidently_vindex.fit(evidently_embeddings, evidently_chunks)

  0%|          | 0/575 [00:00<?, ?it/s]

<minsearch.vector.VectorSearch at 0x759a7b374610>

## 3. Hybrid search

In [44]:
#### Join lexical and vector search results
query = "Can I join the course now?"

# Lexical search
text_results = faq_index.search(query, num_results=5)

# Embed query and search vector index
q = embedding_model.encode(query)
vector_results = faq_vindex.search(q, num_results=5)

# Combine results: lexical hits first, then vector hits. A document found by
# both searches is kept only once (keyed by its filename), so the combined
# list has no duplicates. Interleaving or re-ranking is still possible later.
final_results = []
seen_filenames = set()
for result in text_results + vector_results:
    if result['filename'] in seen_filenames:
        continue
    seen_filenames.add(result['filename'])
    final_results.append(result)

In [45]:
final_results

[{'id': '3f1424af17',
  'question': 'Course: Can I still join the course after the start date?',
  'sort_order': 3,
  'content': "Yes, even if you don't register, you're still eligible to submit the homework.\n\nBe aware, however, that there will be deadlines for turning in homeworks and the final projects. So don't leave everything for the last minute.",
  'filename': 'faq-main/_questions/data-engineering-zoomcamp/general/003_3f1424af17_course-can-i-still-join-the-course-after-the-start.md'},
 {'id': '9e508f2212',
  'question': 'Course: When does the course start?',
  'sort_order': 1,
  'content': "The next cohort starts January 13th, 2025. More info at [DTC](https://datatalks.club/blog/guide-to-free-online-courses-at-datatalks-club.html).\n\n- Register before the course starts using this [link](https://airtable.com/shr6oVXeQvSI5HuWD).\n- Join the [course Telegram channel with announcements](https://t.me/dezoomcamp).\n- Don’t forget to register in DataTalks.Club's Slack and join the c