## Step 1: Import Required Libraries


In [1]:
# Browser-facing ZIP URL format: https://github.com/<owner>/<repository>/archive/refs/heads/<branch_name>.zip
# (GitHub redirects it to the codeload.github.com URL used in Step 2)
import io
import zipfile
import requests
import frontmatter

## Step 2: Download the Repository
- GitHub's ZIP download URL format:
  `https://codeload.github.com/{owner}/{repo}/zip/refs/heads/{branch}`


In [2]:
url = 'https://codeload.github.com/DataTalksClub/faq/zip/refs/heads/main'
# A timeout keeps the cell from hanging forever if GitHub is unreachable.
resp = requests.get(url, timeout=60)
# Fail here on an HTTP error instead of later with a confusing BadZipFile
# when the error page is handed to zipfile.
resp.raise_for_status()
resp

<Response [200]>

## Step 3: Process the ZIP File in Memory


In [3]:
repository_data = []

# Wrap the downloaded bytes in a file-like object so the archive is read
# entirely in memory. The `with` block closes the archive even when parsing
# one of the files raises.
with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
    for file_info in zf.infolist():
        # Lower-cased so the extension check is case-insensitive. Note that
        # this lower-cased path is also what gets stored under 'filename'.
        filename = file_info.filename.lower()

        # Only process markdown files
        if not filename.endswith('.md'):
            continue

        # Read the raw bytes of this archive member
        with zf.open(file_info) as f_in:
            content = f_in.read()

        # Separate the frontmatter metadata from the markdown body and
        # flatten both into one dict (body ends up under 'content')
        post = frontmatter.loads(content)
        data = post.to_dict()
        data['filename'] = filename
        repository_data.append(data)

In [4]:
# Number of markdown files parsed from the archive; this counts every .md
# file, not only the FAQ entries.
print(f"Total documents extracted: {len(repository_data)}")

Total documents extracted: 1232


In [5]:
# Walk the first five parsed documents, printing their keys, and stop at the
# first one whose frontmatter defines a 'question' field.
for i, doc in enumerate(repository_data[:5]):
    print(f"\n--- Document {i} ---")
    print(f"Filename: {doc.get('filename')}")
    print(f"Keys: {list(doc.keys())}")
    if 'question' not in doc:
        # No FAQ frontmatter here (e.g. a README): keep looking
        continue
    print(f"Question: {doc.get('question')}")
    break


--- Document 0 ---
Filename: faq-main/contributing.md
Keys: ['content', 'filename']

--- Document 1 ---
Filename: faq-main/readme.md
Keys: ['content', 'filename']

--- Document 2 ---
Filename: faq-main/_questions/data-engineering-zoomcamp/general/001_9e508f2212_course-when-does-the-course-start.md
Keys: ['id', 'question', 'sort_order', 'content', 'filename']
Question: Course: When does the course start?


In [None]:
# Find the first document that carries a 'question' field. Searching for it
# (instead of hard-coding its position) keeps the cell correct if the order
# of files in the archive changes. Yields None when no such document exists.
faq_doc = next((doc for doc in repository_data if 'question' in doc), None)
print(faq_doc)

In [None]:
# Filter out README and other docs, keep only FAQs.
# A document counts as an FAQ when its parsed frontmatter has a 'question' key.
faq_documents = [doc for doc in repository_data if 'question' in doc]

# Compare the sizes before and after filtering
print(f"Total files: {len(repository_data)}")
print(f"FAQ documents: {len(faq_documents)}")

# Look at the first few FAQs
for i, faq in enumerate(faq_documents[:3]):
    # i is zero-based, so show a one-based counter to the reader
    print(f"\n--- FAQ {i+1} ---")
    print(f"ID: {faq.get('id')}")
    print(f"Question: {faq.get('question')}")
    print(f"Sort Order: {faq.get('sort_order')}")
    print(f"Filename: {faq.get('filename')}")
    # Only the first 100 characters of the body, to keep the output short
    print(f"Content preview: {faq.get('content')[:100]}...")

In [None]:
# Group FAQs by course
from collections import defaultdict

faqs_by_course = defaultdict(list)

for faq in faq_documents:
    # The course slug is the path component right after '_questions', e.g.
    # 'faq-main/_questions/data-engineering-zoomcamp/...'
    parts = faq['filename'].split('/')
    if len(parts) < 3 or parts[1] != '_questions':
        # Not stored under '<root>/_questions/<course>/': leave it ungrouped
        continue
    course = parts[2]
    faqs_by_course[course].append(faq)

# Show how many FAQs each course has
print("FAQs by course:")
for course, faqs in faqs_by_course.items():
    print(f"  {course}: {len(faqs)} FAQs")

In [None]:
# Create a clean list of FAQ entries
clean_faqs = []

for faq in faq_documents:
    # Extract course and section from the file path.
    # Expected layout: '<root>/_questions/<course>/<section>/<file>.md'
    parts = faq['filename'].split('/')

    # Look at directory components only. Dropping the trailing file name
    # guarantees it can never be mistaken for a course or a section when a
    # file sits higher up in the tree than expected.
    dirs = parts[:-1]

    course = dirs[2] if len(dirs) >= 3 else 'unknown'
    section = dirs[3] if len(dirs) >= 4 else 'general'

    clean_faq = {
        'id': faq.get('id', ''),
        'course': course,
        'section': section,
        'question': faq.get('question', ''),
        'answer': faq.get('content', ''),
        # Entries without an explicit order fall back to 999
        'sort_order': faq.get('sort_order', 999),
        'filename': faq.get('filename', '')
    }
    clean_faqs.append(clean_faq)

# Display first few
for faq in clean_faqs[:3]:
    print(f"\n{'='*60}")
    print(f"Course: {faq['course']}")
    print(f"Section: {faq['section']}")
    print(f"Question: {faq['question']}")
    print(f"Answer: {faq['answer'][:150]}...")

In [22]:
import pandas as pd

# Convert to DataFrame: one row per cleaned FAQ entry, one column per key
# of the clean_faq dictionaries.
df = pd.DataFrame(clean_faqs)

# Quick sanity checks: first rows, dimensions, column names and the
# distinct course values.
print(df.head())
print(f"\nDataFrame shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nCourses: {df['course'].unique()}")

           id                     course  section  \
0  9e508f2212  data-engineering-zoomcamp  general   
1  bfafa427b3  data-engineering-zoomcamp  general   
2  3f1424af17  data-engineering-zoomcamp  general   
3  52217fc51b  data-engineering-zoomcamp  general   
4  33fc260cd8  data-engineering-zoomcamp  general   

                                            question  \
0                Course: When does the course start?   
1  Course: What are the prerequisites for this co...   
2  Course: Can I still join the course after the ...   
3  Course: I have registered for the Data Enginee...   
4    Course: What can I do before the course starts?   

                                              answer  sort_order  \
0  The next cohort starts January 13th, 2025. Mor...           1   
1  To get the most out of this course, you should...           2   
2  Yes, even if you don't register, you're still ...           3   
3  You don't need a confirmation email. You're ac...           4   
4  S

In [23]:
#Search for FAQs containing specific keywords
def search_faqs(faq_list, keyword):
    """
    Return the FAQs whose question or answer contains a keyword.

    The match is a plain, case-insensitive substring test; the order of
    ``faq_list`` is preserved in the result.

    Args:
        faq_list: Iterable of dicts with string 'question' and 'answer' keys
        keyword: Text to look for

    Returns:
        List of the matching FAQ dicts (the same objects, not copies)
    """
    # Lower-case the keyword once instead of twice per FAQ
    needle = keyword.lower()
    return [
        faq
        for faq in faq_list
        if needle in faq['question'].lower() or needle in faq['answer'].lower()
    ]

# Example search: case-insensitive match against both question and answer
results = search_faqs(clean_faqs, 'python')
print(f"Found {len(results)} FAQs about 'python'")

# Show only the first three hits to keep the output short
for r in results[:3]:
    print(f"\nQ: {r['question']}")
    print(f"Course: {r['course']}")

Found 357 FAQs about 'python'

Q: Course: What are the prerequisites for this course?
Course: data-engineering-zoomcamp

Q: Course: What can I do before the course starts?
Course: data-engineering-zoomcamp

Q: Environment: Is Python 3.9 still the recommended version to use in 2024?
Course: data-engineering-zoomcamp


## Step 5: Support Multiple Markdown Types
- To also include `.mdx` files (MDX: Markdown with embedded JSX components):



In [25]:
# Snippet: only the extension filter changes compared with Step 3; the
# read-and-parse part of the loop body stays the same.
for file_info in zf.infolist():
    filename = file_info.filename.lower()

    # str.endswith accepts a tuple, so one call covers every extension
    if not filename.endswith(('.md', '.mdx')):
        continue


## Step 6: Complete Reusable Function
- Here's the production-ready version with error handling:

In [26]:
import io
import zipfile
import requests
import frontmatter

def read_repo_data(repo_owner, repo_name, branch='main', timeout=60):
    """
    Download and parse all markdown files from a GitHub repository.

    The branch is fetched as a ZIP archive and processed in memory. Every
    ``.md`` / ``.mdx`` file (extension matched case-insensitively) is parsed
    with ``frontmatter``; files that fail to parse are reported and skipped.

    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
        branch: Branch to download (defaults to 'main')
        timeout: Seconds to wait for GitHub before giving up

    Returns:
        List of dictionaries containing file content and metadata. Each one
        holds the frontmatter fields, the body under 'content' and the
        archive path (original casing) under 'filename'.

    Raises:
        RuntimeError: If GitHub answers with a status other than 200
    """
    prefix = 'https://codeload.github.com'
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/{branch}'
    # Without a timeout an unreachable host would block the call forever
    resp = requests.get(url, timeout=timeout)

    if resp.status_code != 200:
        raise RuntimeError(f"Failed to download repository: {resp.status_code}")

    markdown_suffixes = ('.md', '.mdx')
    repository_data = []

    # The context manager closes the archive even if something unexpected
    # is raised while iterating over it
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filename = file_info.filename

            if not filename.lower().endswith(markdown_suffixes):
                continue

            # Best effort: one unreadable file must not abort the whole repo
            try:
                with zf.open(file_info) as f_in:
                    # Undecodable bytes are dropped rather than raising
                    content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                repository_data.append(data)
            except Exception as e:
                print(f"Error processing {filename}: {e}")
                continue

    return repository_data

## Step 7: Use the Function

In [48]:
# Download and process different repositories
dtc_faq = read_repo_data('DataTalksClub', 'faq')
evidently_docs = read_repo_data('evidentlyai', 'docs')
task_manager_app = read_repo_data('fsamura01', 'taREDACTED_OPENAI_KEY-app')
print(evidently_docs[45])

print(f"FAQ documents: {len(dtc_faq)}")
print(f"Evidently documents: {len(evidently_docs)}")

FAQ documents: 1232
Evidently documents: 95


## Step 8: Inspect the Data

In [46]:
# Look at the document at index 2 (the third one); in the recorded run this
# is the first entry that carries FAQ frontmatter ('id', 'question', 'sort_order').
print(dtc_faq[2])

{'id': '9e508f2212', 'question': 'Course: When does the course start?', 'sort_order': 1, 'content': "The next cohort starts January 13th, 2025. More info at [DTC](https://datatalks.club/blog/guide-to-free-online-courses-at-datatalks-club.html).\n\n- Register before the course starts using this [link](https://airtable.com/shr6oVXeQvSI5HuWD).\n- Join the [course Telegram channel with announcements](https://t.me/dezoomcamp).\n- Don’t forget to register in DataTalks.Club's Slack and join the channel.", 'filename': 'faq-main/_questions/data-engineering-zoomcamp/general/001_9e508f2212_course-when-does-the-course-start.md'}


## Today’s Tasks (Day 2)

### 1. Simple Chunking

In [49]:
def sliding_window(seq, size, step):
    """
    Cut a sequence into overlapping windows.

    Windows start at offsets 0, step, 2*step, ... and each one holds up to
    ``size`` items. Iteration stops with the first window that reaches the
    end of the sequence, so the last window may be shorter than ``size``.

    :param seq: Any sliceable sequence with a length (string, list, ...)
    :param size: Maximum number of items per window, must be positive
    :param step: Distance between consecutive window starts, must be positive
    :return: List of dicts with the window offset under 'start' and the
             slice itself under 'chunk'
    :raises ValueError: If size or step is not positive
    """
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    total = len(seq)
    windows = []
    offset = 0
    while offset < total:
        windows.append({'start': offset, 'chunk': seq[offset:offset + size]})
        # This window already covers the tail of the sequence: we are done
        if offset + size >= total:
            break
        offset += step

    return windows


In [50]:
# Window length and stride in characters. A stride of half the window makes
# consecutive chunks overlap by 50%.
CHUNK_SIZE = 2000
CHUNK_STEP = 1000

evidently_chunks = []

for doc in evidently_docs:
    # Work on a copy so the original document keeps its 'content'
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    chunks = sliding_window(doc_content, CHUNK_SIZE, CHUNK_STEP)
    for chunk in chunks:
        # Attach the document metadata (frontmatter fields, filename) to
        # every chunk so each one can be traced back to its source
        chunk.update(doc_copy)
    evidently_chunks.extend(chunks)

In [None]:
# Preview a few chunks instead of dumping the whole list into the notebook
evidently_chunks[:3]

### 2. Splitting by Paragraphs and Sections

In [53]:
import re
# Example document used by the splitting demos in this section
text = evidently_docs[45]['content']
# A paragraph boundary is a blank line: a newline, optional whitespace,
# then another newline
paragraphs = re.split(r"\n\s*\n", text.strip())
paragraphs

['In this tutorial, you will learn how to perform regression testing for LLM outputs.',
 'You can compare new and old responses after changing a prompt, model, or anything else in your system. By re-running the same inputs with new parameters, you can spot any significant changes. This helps you push updates with confidence or identify issues to fix.',
 "<Info>\n  **This example uses Evidently Cloud.** You'll run evals in Python and upload them. You can also skip the upload and view Reports locally. For self-hosted, replace `CloudWorkspace` with `Workspace`.\n</Info>",
 '# Tutorial scope',
 "Here's what we'll do:",
 '* **Create a toy dataset**. Build a small Q&A dataset with answers and reference responses.',
 '* **Get new answers**. Imitate generating new answers to the same question.',
 '* **Create and run a Report with Tests**. Compare the answers using LLM-as-a-judge to evaluate length, correctness and style consistency.',
 '* **Build a monitoring Dashboard**. Get plots to track th

### Section Splitting

In [54]:
import re

def split_markdown_by_level(text, level=2, include_preamble=False):
    """
    Split markdown text by a specific header level.

    Each returned section starts with its header line. Headers of other
    levels do not start a new section and stay inside the section they
    appear in.

    :param text: Markdown text as a string
    :param level: Header level to split on
    :param include_preamble: When True, text that precedes the first matching
        header (or the whole text if no header matches) is returned as the
        first section instead of being discarded. Defaults to False, which
        keeps the previous behaviour.
    :return: List of sections as strings
    """
    # This regex matches markdown headers of exactly the requested level.
    # For level 2, it matches lines starting with "## " ("### " does not
    # match because the character after the hashes must be a space).
    header_pattern = r'^(#{' + str(level) + r'} )(.+)$'
    pattern = re.compile(header_pattern, re.MULTILINE)

    # Split and keep the headers. With two capturing groups the result is
    # [before_first_header, group1, group2, body, group1, group2, body, ...]
    # where group1 is "## " and group2 is the header text.
    parts = pattern.split(text)

    sections = []

    preamble = parts[0].strip()
    if include_preamble and preamble:
        sections.append(preamble)

    # Step by 3: every match contributes (marker, title, body)
    for i in range(1, len(parts), 3):
        header = (parts[i] + parts[i + 1]).strip()  # "## " + "Title"
        content = parts[i + 2].strip()

        # A header with no body is still returned, on its own
        sections.append(f'{header}\n\n{content}' if content else header)

    return sections


In [55]:
# Split the example document loaded for the paragraph demo on its level-2 ("## ") headers
sections = split_markdown_by_level(text, level=2)

In [56]:
evidently_chunks = []

for doc in evidently_docs:
    # Work on a copy so the original document keeps its 'content'
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    sections = split_markdown_by_level(doc_content, level=2)
    if not sections and doc_content.strip():
        # The document has no level-2 headers. Keep it as a single chunk so
        # it does not silently vanish from the chunk list.
        sections = [doc_content.strip()]
    for section in sections:
        # Every chunk carries the document metadata plus its own text
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        evidently_chunks.append(section_doc)


In [None]:
!pip install groq

In [None]:
import os

# Read the keys from the environment so real secrets never get saved inside
# the notebook. The "api-key" placeholder remains the fallback value.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "api-key")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "api-key")

In [None]:
from groq import Groq

# The client variable keeps its `openai_client` name, but it is a Groq client.
# It reuses the key defined in the configuration cell above instead of a
# second hard-coded literal.
openai_client = Groq(api_key=GROQ_API_KEY)

def llm(prompt, model='llama-3.1-8b-instant'): # Updated to a currently supported Groq model
    """
    Send a single-turn prompt to the chat completions endpoint.

    Args:
        prompt: Text sent as the only user message
        model: Model identifier passed to the API

    Returns:
        The message content of the first returned choice
    """
    messages = [
        {"role": "user", "content": prompt}
    ]

    response = openai_client.chat.completions.create(
        model=model,
        messages=messages
    )

    return response.choices[0].message.content

In [92]:
# Prompt for LLM-based chunking. `{document}` is filled in with str.format,
# and the `---` separator lines requested in the format section are what
# intelligent_chunking() splits the model response on.
prompt_template = """
Split the provided document into logical sections
that make sense for a Q&A system.

Each section should be self-contained and cover
a specific topic or concept.

<DOCUMENT>
{document}
</DOCUMENT>

Use this format:

## Section Name

Section content with all relevant details

---

## Another Section Name

Another section content

---
""".strip()


In [93]:
def intelligent_chunking(text):
    """
    Ask the LLM to split a document into self-contained sections.

    The document is inserted into ``prompt_template`` and the response is
    cut on separator lines, i.e. lines that consist only of three or more
    dashes. Dashes inside a line (markdown table rules such as
    ``| --- | --- |``, or inline ``---``) do not split a section.

    :param text: Document text to chunk
    :return: List of non-empty section strings, stripped of surrounding
             whitespace; empty when the model returns no text
    """
    import re

    prompt = prompt_template.format(document=text)
    # Treat a missing response body as empty text instead of crashing
    response = llm(prompt) or ''
    # [ \t]* rather than \s* so the pattern never spans several lines
    sections = re.split(r'^[ \t]*-{3,}[ \t]*$', response, flags=re.MULTILINE)
    return [s.strip() for s in sections if s.strip()]

In [97]:
from tqdm.auto import tqdm

evidently_chunks = []

# Upper bound, in characters, on the document text sent in one LLM call.
# It is a heuristic guard against 'Payload Too Large' errors (HTTP 413):
# 20000 characters of document content is a conservative estimate meant to
# keep the full prompt (document + instructions) within the API limits.
MAX_DOC_CHARS_FOR_LLM = 20000

for doc in tqdm(evidently_docs):
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')

    # Nothing to chunk: an empty prompt would only waste an API call
    if not doc_content.strip():
        continue

    if len(doc_content) <= MAX_DOC_CHARS_FOR_LLM:
        # Small enough for a single call: let the LLM propose the sections
        sections = intelligent_chunking(doc_content)
        for section in sections:
            section_doc = doc_copy.copy()
            section_doc['section'] = section
            evidently_chunks.append(section_doc)
        continue

    # Oversized documents are reported and left out of the chunk list
    print(f"Skipping intelligent chunking for '{doc_copy.get('filename', 'Unknown')}' "
          f"because its content is too large ({len(doc_content)} chars > {MAX_DOC_CHARS_FOR_LLM} chars). "
          "Consider pre-chunking large documents with a simpler method before LLM processing.")

  0%|          | 0/95 [00:00<?, ?it/s]

Skipping intelligent chunking for 'docs-main/docs/library/leftover_content.mdx' because its content is too large (28655 chars > 20000 chars). Consider pre-chunking large documents with a simpler method before LLM processing.
Skipping intelligent chunking for 'docs-main/docs/library/overview.mdx' because its content is too large (22081 chars > 20000 chars). Consider pre-chunking large documents with a simpler method before LLM processing.
Skipping intelligent chunking for 'docs-main/docs/platform/dashboard_panel_types.mdx' because its content is too large (31538 chars > 20000 chars). Consider pre-chunking large documents with a simpler method before LLM processing.
Skipping intelligent chunking for 'docs-main/examples/LLM_judge.mdx' because its content is too large (21834 chars > 20000 chars). Consider pre-chunking large documents with a simpler method before LLM processing.
Skipping intelligent chunking for 'docs-main/examples/LLM_regression_testing.mdx' because its content is too larg