### Imports

In [1]:
%load_ext autoreload
%autoreload 2
# Standard libraries
import io
import os
import re
import zipfile

# Third-party libraries
import requests
import frontmatter
from dotenv import load_dotenv
from tqdm.notebook import tqdm

# Google Gemini API
import google.generativeai as genai


# Day 1: Download and extract the zip file

In [2]:
# Download the repository archive as a zip file.
url = 'https://codeload.github.com/DataTalksClub/faq/zip/refs/heads/main'
# A timeout keeps the cell from hanging forever on a stalled connection.
resp = requests.get(url, timeout=60)
# Fail here on an HTTP error instead of later, when the error page
# would be handed to zipfile as if it were archive data.
resp.raise_for_status()

In [3]:
repository_data = []

# Open the downloaded archive from memory. The context manager closes it
# even if reading or parsing one of the files raises.
with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
    for file_info in zf.infolist():
        # The lower-cased archive path is used both for the extension
        # check and as the stored 'filename' value.
        filename = file_info.filename.lower()
        # Get md / mdx files only
        if not filename.endswith(('.md', '.mdx')):
            continue
        # Read the raw bytes of the archive member
        with zf.open(file_info) as f_in:
            content = f_in.read()
        # Parse the frontmatter and keep the result as a plain dict
        post = frontmatter.loads(content)
        data = post.to_dict()
        data['filename'] = filename
        repository_data.append(data)

In [4]:
# Inspect one parsed record (index 1) to check the extracted fields
print(repository_data[1])

{'id': '9e508f2212', 'question': 'Course: When does the course start?', 'sort_order': 1, 'content': "The next cohort starts January 13th, 2025. More info at [DTC](https://datatalks.club/blog/guide-to-free-online-courses-at-datatalks-club.html).\n\n- Register before the course starts using this [link](https://airtable.com/shr6oVXeQvSI5HuWD).\n- Join the [course Telegram channel with announcements](https://t.me/dezoomcamp).\n- Don’t forget to register in DataTalks.Club's Slack and join the channel.", 'filename': 'faq-main/_questions/data-engineering-zoomcamp/general/001_9e508f2212_course-when-does-the-course-start.md'}


In [5]:
from read import read_repo_data

In [6]:
# Load both repositories through read_repo_data (imported from the `read` module).
# NOTE(review): presumably it returns a list of parsed document dicts like
# repository_data above — confirm against read.py.
prefix = 'https://codeload.github.com'
dtc_faq = read_repo_data('DataTalksClub', 'faq', prefix=prefix)
evidently_docs = read_repo_data('evidentlyai', 'docs', prefix=prefix)

In [7]:
# Report how many documents were loaded from each repository
print(f"FAQ documents: {len(dtc_faq)}")
print(f"Evidently documents: {len(evidently_docs)}")


FAQ documents: 1217
Evidently documents: 95


In [8]:
# Display the 'content' field of one Evidently document (index 45)
evidently_docs[45]['content']



# Day 2: Chunking and Intelligent Processing for Data

## 1. Chunking by sliding window

In [9]:
def sliding_window_chunking(seq, size, step):
    """Chunk a text sequence using a sliding window approach.

    Windows start at 0, step, 2*step, ... and each covers up to ``size``
    characters, so consecutive chunks overlap by ``size - step`` characters
    when ``step < size``. Chunking stops with the first window that reaches
    the end of the sequence.

    Args:
        seq (str): text sequence to chunk
        size (int): maximum size of each chunk
        step (int): distance between the starts of consecutive chunks

    Raises:
        ValueError: size and step must be positive.

    Returns:
        list: list of dict with 'start', 'end' and 'chunk' keys, where
            ``seq[start:end] == chunk``. Empty when ``seq`` is empty.
    """
    if size <= 0 or step <= 0:
        raise ValueError("Size and step must be positive.")

    seq_len = len(seq)
    result = []
    for start in range(0, seq_len, step):
        # Clamp 'end' so the final (shorter) chunk does not report an end
        # position past the sequence.
        end = min(start + size, seq_len)
        result.append({'start': start, 'end': end, 'chunk': seq[start:end]})
        # This window reached the end of the sequence: nothing left to cover
        if end == seq_len:
            break
    return result

In [10]:
# Try the chunker on one document: 2000-character windows starting every 1000 characters
sliding_window_chunking(evidently_docs[45]['content'], 2000, 1000)

[{'start': 0,
  'end': 2000,
  'chunk': "In this tutorial, you will learn how to perform regression testing for LLM outputs.\n\nYou can compare new and old responses after changing a prompt, model, or anything else in your system. By re-running the same inputs with new parameters, you can spot any significant changes. This helps you push updates with confidence or identify issues to fix.\n\n<Info>\n  **This example uses Evidently Cloud.** You'll run evals in Python and upload them. You can also skip the upload and view Reports locally. For self-hosted, replace `CloudWorkspace` with `Workspace`.\n</Info>\n\n# Tutorial scope\n\nHere's what we'll do:\n\n* **Create a toy dataset**. Build a small Q&A dataset with answers and reference responses.\n\n* **Get new answers**. Imitate generating new answers to the same question.\n\n* **Create and run a Report with Tests**. Compare the answers using LLM-as-a-judge to evaluate length, correctness and style consistency.\n\n* **Build a monitoring Das

In [11]:
evidently_chunks = []
for doc in evidently_docs:
    # Separate the text body from the remaining (metadata) fields
    doc_content = doc['content']
    doc_copy = {key: value for key, value in doc.items() if key != 'content'}
    chunks = sliding_window_chunking(doc_content, 2000, 1000)
    # Attach the document metadata to every chunk dict
    for chunk in chunks:
        chunk.update(doc_copy)
    evidently_chunks.extend(chunks)

In [12]:
# Inspect one of the sliding-window chunks (index 45)
evidently_chunks[45]

{'start': 13000,
 'end': 15000,
 'chunk': '                                            |\n| ----------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- |\n| **RecSysPreset()**                  | <ul><li>Larget Preset. </li><li>Includes a range of recommendation system metrics.</li><li>Metric result: all metrics.</li><li>See [Preset page](/metrics/preset_recsys).</li></ul> | None.                                                                                                                                                          | As in

## 2. Chunking by Paragraphs and sections

In [13]:
# Split one document into paragraphs: a boundary is a newline, optional
# whitespace, then another newline (i.e. a blank or whitespace-only line)
text = evidently_docs[45]['content']
paragraph_break = re.compile(r"\n\s*\n")
paragraphs = paragraph_break.split(text.strip())
paragraphs

['In this tutorial, you will learn how to perform regression testing for LLM outputs.',
 'You can compare new and old responses after changing a prompt, model, or anything else in your system. By re-running the same inputs with new parameters, you can spot any significant changes. This helps you push updates with confidence or identify issues to fix.',
 "<Info>\n  **This example uses Evidently Cloud.** You'll run evals in Python and upload them. You can also skip the upload and view Reports locally. For self-hosted, replace `CloudWorkspace` with `Workspace`.\n</Info>",
 '# Tutorial scope',
 "Here's what we'll do:",
 '* **Create a toy dataset**. Build a small Q&A dataset with answers and reference responses.',
 '* **Get new answers**. Imitate generating new answers to the same question.',
 '* **Create and run a Report with Tests**. Compare the answers using LLM-as-a-judge to evaluate length, correctness and style consistency.',
 '* **Build a monitoring Dashboard**. Get plots to track th

In [14]:
def split_markdown_by_level(text, level=2):
    """Split markdown text into sections based on header levels.

    A header is any line that starts with exactly ``level`` '#' characters
    followed by a space. Text before the first matching header is not
    included in the result, so a text without such headers yields an
    empty list.

    Args:
        text (str): Markdown text to split.
        level (int): Header level to split by (e.g., 2 for '##').

    Returns:
        list: List of sections as strings, each starting with its header
            line followed (when the section has a body) by a blank line
            and the stripped section content.
    """
    # Create a regex pattern to match headers of the specified level
    header_pattern = r'^(#{' + str(level) + r'} )(.+)$'
    pattern = re.compile(header_pattern, re.MULTILINE)
    # With two capturing groups, split() returns
    # [before_first_header, group1, group2, after, group1, group2, after, ...]
    # where group1 is the '#' prefix and group2 is the header text.
    parts = pattern.split(text)

    sections = []
    # Step by 3 to land on each group1. Every (group1, group2) pair is
    # always followed by an "after" element (possibly empty), so
    # parts[i + 2] exists for every i visited here and no bounds guard
    # is needed.
    for i in range(1, len(parts), 3):
        header = (parts[i] + parts[i + 1]).strip()  # include the '## ' part
        content = parts[i + 2].strip()
        sections.append(f"{header}\n\n{content}" if content else header)
    return sections

**Note**: This function does not distinguish real headers from header-like lines inside fenced code blocks. For example, splitting by level 1 headings will also split on Python `# comments` in code samples. In general, this is not a big problem for documentation.

In [15]:
# Try the splitter on one document, cutting at level-2 ('## ') headers
split_markdown_by_level(evidently_docs[45]['content'], level=2)

['## 1. Installation and Imports\n\nInstall Evidently:\n\n```python\npip install evidently[llm] \n```\n\nImport the required modules:\n\n```python\nimport pandas as pd\nfrom evidently.future.datasets import Dataset\nfrom evidently.future.datasets import DataDefinition\nfrom evidently.future.datasets import Descriptor\nfrom evidently.future.descriptors import *\nfrom evidently.future.report import Report\nfrom evidently.future.presets import TextEvals\nfrom evidently.future.metrics import *\nfrom evidently.future.tests import *\n\nfrom evidently.features.llm_judge import BinaryClassificationPromptTemplate\n```\n\nTo connect to Evidently Cloud:\n\n```python\nfrom evidently.ui.workspace.cloud import CloudWorkspace\n```\n\n**Optional.** To create monitoring panels as code:\n\n```python\nfrom evidently.ui.dashboards import DashboardPanelPlot\nfrom evidently.ui.dashboards import DashboardPanelTestSuite\nfrom evidently.ui.dashboards import DashboardPanelTestSuiteCounter\nfrom evidently.ui.das

In [16]:
evidently_chunks = []

for doc in evidently_docs:
    # Separate the text body from the remaining (metadata) fields
    doc_content = doc['content']
    doc_copy = {key: value for key, value in doc.items() if key != 'content'}
    sections = split_markdown_by_level(doc_content, level=2)
    # One record per section: the document metadata plus the section text
    evidently_chunks.extend({**doc_copy, 'section': section} for section in sections)

In [17]:
# Inspect one of the section-based chunks (index 2)
evidently_chunks[2]

{'title': 'Data definition',
 'description': 'How to map the input data.',
 'filename': 'docs-main/docs/library/data_definition.mdx',
 'section': '## Basic flow\n\n**Step 1. Imports.** Import the following modules:\n\n```python\nfrom evidently import Dataset\nfrom evidently import DataDefinition\n```\n\n**Step 2. Prepare your data.** Use a pandas.DataFrame.\n\n<Info>\n  Your data can have [flexible structure](/docs/library/overview#dataset) with any mix of categorical, numerical or text columns. Check the [Reference table](/metrics/all_metrics) for data requirements in specific evaluations.\n</Info>\n\n**Step 3. Create a Dataset object**. Use `Dataset.from_pandas` with `data_definition`:\n\n```python\neval_data = Dataset.from_pandas(\n    source_df,\n    data_definition=DataDefinition()\n)\n```\n\nTo map columns automatically, pass an empty `DataDefinition()` . Evidently will map columns:\n\n- By type (numerical, categorical).\n- By matching column names to roles (e.g., a column "targe

## 3. Intelligent Chunking with LLM

In [None]:
# Load environment variables from a .env file
# Pull variables from a .env file into the process environment
load_dotenv()
# Read the Gemini key and stop right away when it is missing
API_KEY = os.getenv('GEMINI_API_KEY')
if not API_KEY:
    raise ValueError("API key not found. Please set the GEMINI_API_KEY environment variable.")
print("API key loaded successfully.")

In [23]:
# Configure the Gemini client with the API key read from the environment
genai.configure(api_key=API_KEY)

def llm(prompt: str, model: str = "gemini-2.5-flash-lite") -> str:
    """
    Call Gemini with a text prompt and return the output text.

    Args:
        prompt (str): The input prompt for the LLM.
        model (str): Gemini model name (default: gemini-2.5-flash-lite).

    Returns:
        str: The generated text.

    Raises:
        ValueError: If the response is falsy or has no ``text`` attribute.
        Exception: Any error raised during the call is printed together
            with the model name and a truncated prompt, then re-raised.
    """
    try:
        model_obj = genai.GenerativeModel(model)
        response = model_obj.generate_content(prompt)

        if not response or not hasattr(response, "text"):
            raise ValueError("LLM returned no text.")

        return response.text

    except Exception as e:
        # Debug report: print context for the failure, then re-raise so
        # the caller still sees the original exception
        print("❌ Error during LLM call")
        print(f"Model: {model}")
        # Only the first 200 characters of the prompt are printed
        print(f"Prompt (truncated): {prompt[:200]}{'...' if len(prompt) > 200 else ''}")
        print(f"Error: {e}")
        raise


In [24]:
# Prompt for LLM-based chunking. It asks for sections titled with '## '
# and separated by '---' lines; the {document} placeholder is filled in
# with str.format.
prompt_template = """
Split the provided document into logical sections
that make sense for a Q&A system.

Each section should be self-contained and cover
a specific topic or concept.

<DOCUMENT>
{document}
</DOCUMENT>

Use this format:

## Section Name

Section content with all relevant details

---

## Another Section Name

Another section content

---
""".strip()

Considerations to improve prompt:

- Unbounded length: the model might produce very large sections if the input doc is long (could exceed embedding limits).

- Ambiguous instructions: “logical sections” might be interpreted differently by the model (especially across varied docs).

- No output constraints: doesn’t say “keep each section < N tokens” or “max 5 sections” → could be inconsistent.

In [25]:
def intelligent_chunking(text):
    """Split a document into sections using the LLM.

    Fills ``prompt_template`` with the document, sends the prompt through
    ``llm`` and splits the response on the ``---`` separator lines that the
    prompt asks the model to emit.

    Args:
        text (str): Document content to split.

    Returns:
        list: Non-empty section strings with surrounding whitespace removed.
    """
    prompt = prompt_template.format(document=text)
    response = llm(prompt)
    # Split only on lines that consist solely of '---'. A plain
    # response.split('---') would also cut inside markdown table separator
    # rows (e.g. '| ---- | ---- |') or any other text containing three
    # consecutive dashes.
    sections = re.split(r'^[ \t]*---[ \t]*\r?$', response, flags=re.MULTILINE)
    return [s.strip() for s in sections if s.strip()]

In [None]:
# Show the single document (index 5) used for the LLM chunking test
evidently_docs[5:6]

[{'title': 'Data definition',
  'description': 'How to map the input data.',
  'content': 'To run evaluations, you must create a `Dataset` object with a `DataDefinition`, which maps:\n\n- **Column types** (e.g., categorical, numerical, text).\n- **Column roles** (e.g., id, prediction, target).\n\nThis allows Evidently to process the data correctly. Some evaluations need specific columns and will fail if they\'re missing. You can define the mapping using the Python API or by assigning columns visually when uploading data to the Evidently platform.\n\n## Basic flow\n\n**Step 1. Imports.** Import the following modules:\n\n```python\nfrom evidently import Dataset\nfrom evidently import DataDefinition\n```\n\n**Step 2. Prepare your data.** Use a pandas.DataFrame.\n\n<Info>\n  Your data can have [flexible structure](/docs/library/overview#dataset) with any mix of categorical, numerical or text columns. Check the [Reference table](/metrics/all_metrics) for data requirements in specific evalua

In [32]:
# Test with example
evidently_chunks = []

for doc in tqdm(evidently_docs[5:6]):
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')

    sections = intelligent_chunking(doc_content)
    for section in sections:
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        evidently_chunks.append(section_doc)

  0%|          | 0/1 [00:00<?, ?it/s]

In [33]:
# Inspect the sections produced by the LLM for the test document
evidently_chunks

[{'title': 'Data definition',
  'description': 'How to map the input data.',
  'filename': 'docs-main/docs/library/data_definition.mdx',
  'section': '## Introduction to Data Definition in Evidently\n\nTo run evaluations with Evidently, you need to create a `Dataset` object. This object uses a `DataDefinition` to understand how your data is structured, including column types (categorical, numerical, text) and column roles (id, prediction, target). This mapping is crucial for Evidently to process your data correctly, and some evaluations will fail if required columns are missing. You can define this mapping using the Python API or visually through the Evidently platform.'},
 {'title': 'Data definition',
  'description': 'How to map the input data.',
  'filename': 'docs-main/docs/library/data_definition.mdx',
  'section': '## Basic Data Preparation and Dataset Creation\n\nThis section outlines the fundamental steps to prepare your data and create an Evidently `Dataset` object for evaluat