In [None]:
import yaml
from pathlib import Path

from minsearch import Index

In [123]:
import json

In [18]:
def parse_metadata(content):
    """Deserialize YAML text with ``yaml.safe_load`` and return the result."""
    parsed = yaml.safe_load(content)
    return parsed

def parse_frontmatter(content):
    """Split a markdown document into its YAML frontmatter and its body.

    The frontmatter block opens with ``---`` at the very start of the text
    and is closed by the next line that begins with ``---``.

    Args:
        content: Full text of the markdown file.

    Returns:
        A ``(frontmatter, body)`` tuple. ``frontmatter`` is the parsed
        mapping (``{}`` when the block is empty) and ``body`` is the text
        after the closing delimiter with surrounding whitespace stripped.
        When there is no usable frontmatter (no opening or closing
        delimiter, invalid YAML, or YAML that is not a mapping) the result
        is ``({}, content)`` with the text left untouched.
    """
    if not content.startswith('---'):
        return {}, content

    # Only accept the closing delimiter at the start of a line, so a literal
    # '---' inside a YAML value (e.g. in a question title) does not cut the
    # frontmatter short.
    parts = content[3:].split('\n---', 1)
    if len(parts) < 2:
        return {}, content

    try:
        frontmatter = yaml.safe_load(parts[0])
    except yaml.YAMLError:
        return {}, content

    markdown_content = parts[1].strip()

    if not frontmatter:
        # Empty frontmatter block: nothing to report, but the body is valid.
        return {}, markdown_content

    if not isinstance(frontmatter, dict):
        # A scalar or list is not frontmatter; callers index the result by key.
        return {}, content

    return frontmatter, markdown_content


In [315]:
def write_frontmatter(question_file, frontmatter_data, content):
    """Write a markdown file made of a YAML frontmatter block followed by ``content``.

    The file is opened in write mode (existing content is replaced) and
    encoded as UTF-8. Layout: ``---``, the YAML dump of ``frontmatter_data``
    in block style, ``---``, a blank line, then ``content`` formatted as text.
    """
    with open(question_file, mode='w', encoding='utf-8') as handle:
        handle.write('---\n')
        yaml_block = yaml.dump(
            frontmatter_data,
            default_flow_style=False,
            allow_unicode=True,
        )
        handle.write(yaml_block)
        handle.write('---\n\n')

        handle.write(format(content, ''))

In [79]:
def read_metadata(course_dir):
    """Read ``_metadata.yaml`` from ``course_dir`` and return its parsed content."""
    raw_yaml = (course_dir / '_metadata.yaml').read_text(encoding='utf8')
    return parse_metadata(raw_yaml)

In [101]:
def read_questions(course_dir: Path) -> list[dict]:
    """Load every FAQ markdown file of a course as a search document.

    Files are looked up as ``<course_dir>/<section>/<file>.md``. Section
    display names come from the ``sections`` list in the course metadata;
    a section directory not listed there falls back to its directory name.

    Args:
        course_dir: Directory of one course; its name is used as the course id.

    Returns:
        One dict per file with the keys ``course``, ``section``,
        ``section_id``, ``question``, ``answer``, ``document_id`` and
        ``sort_order``, ordered by file path.

    Raises:
        KeyError: If a file's frontmatter lacks ``question``, ``id`` or
            ``sort_order``; the message names the offending file.
    """
    course_id = course_dir.name

    metadata = read_metadata(course_dir)
    course_sections = {d['id']: d['name'] for d in metadata['sections']}

    documents = []

    # glob() yields paths in arbitrary order; sorting keeps the document list
    # (and anything built from it) reproducible between runs.
    for question_file in sorted(course_dir.glob('*/*.md')):
        content = question_file.read_text(encoding='utf8')
        fm, answer = parse_frontmatter(content)

        missing = [key for key in ('question', 'id', 'sort_order') if key not in fm]
        if missing:
            raise KeyError(f'{question_file}: frontmatter is missing {missing}')

        # The glob pattern guarantees the file sits directly in a section
        # directory of this course, so the course id never changes in the loop.
        section_id = question_file.parent.name
        section_name = course_sections.get(section_id, section_id)

        documents.append({
            'course': course_id,
            'section': section_name,
            'section_id': section_id,
            'question': fm['question'],
            'answer': answer,
            'document_id': fm['id'],
            'sort_order': fm['sort_order'],
        })

    return documents

In [257]:
def find_question_files(course_dir: Path) -> dict[str, Path]:
    """Map document ids to the markdown files found one level below ``course_dir``.

    The id is the second underscore-separated piece of the file name.
    """
    return {
        md_path.name.split('_', maxsplit=3)[1]: md_path
        for md_path in course_dir.glob('*/*.md')
    }

In [204]:
def keep_relevant(results):
    """Return copies of the search results without the 'course' and 'section' keys.

    The input records are left unmodified. A record lacking either key
    raises ``KeyError``.
    """
    def _without_course_fields(record):
        slim = record.copy()
        slim.pop('course')
        slim.pop('section')
        return slim

    return [_without_course_fields(record) for record in results]

In [454]:
def find_largest_sort_order(section_dir):
    """Return the next free sort order for a section directory.

    Despite the name, the result is the largest numeric prefix found among
    the directory entries plus one, i.e. the value to give a file appended
    at the end of the section.

    Entries are expected to be named ``<sort_order>_<rest>``. Prefixes are
    compared as integers, so ``1000_...`` ranks above ``101_...``. Entries
    without an underscore or with a non-numeric prefix are ignored.

    Args:
        section_dir: ``pathlib.Path`` of the section directory.

    Returns:
        ``max(prefix) + 1``, or ``1`` when no entry has a numeric prefix
        (including an empty directory).
    """
    largest = 0

    for entry in section_dir.iterdir():
        prefix, separator, _ = entry.name.partition('_')
        if not separator:
            continue
        try:
            value = int(prefix)
        except ValueError:
            # Not a question file (e.g. notes or hidden files); skip it.
            continue
        largest = max(largest, value)

    return largest + 1

In [180]:
import hashlib

def generate_document_id(question, answer, existing_ids):
    """Derive a 10-character hex id from the question and answer text.

    The id is the first 10 hex digits of the MD5 of ``question + ' ' + answer``.
    If that id is already in ``existing_ids``, ``_1``, ``_2``, ... is appended
    to the hashed text until an unused id is found.
    """
    seed_text = question + ' ' + answer

    def _short_digest(text):
        return hashlib.md5(text.encode()).hexdigest()[:10]

    candidate = _short_digest(seed_text)
    attempt = 0
    while candidate in existing_ids:
        attempt += 1
        candidate = _short_digest(f"{seed_text}_{attempt}")

    return candidate


In [431]:
# User-message template. It is filled with str.format using three named
# placeholders: `entry` (the proposed FAQ text), `results` (JSON of the
# retrieved FAQ records) and `sections` (JSON of the course sections).
# .strip() drops the newlines that the triple-quote layout adds at both ends.
prompt_template = """
<ENTRY>
{entry}
</ENTRY>
<SEARCH_RESULTS>
{results}
</SEARCH_RESULTS>
<SECTIONS>
{sections}
</SECTIONS>
""".strip()

# System message for the triage request.
# NOTE(review): the "Given" list names ENTRY and SEARCH_RESULTS only, while
# the user template also sends a SECTIONS block. The actions are also written
# as `UPDATE:<document_id>` / `DUPLICATE:<document_id>`, whereas FAQDecision
# carries `action` and `document_id` as separate fields. Consider aligning
# the wording with the schema.
system_prompt = """
You are an assistant that helps maintain a student FAQ repository.

Given:
1. A new proposal in ENTRY
2. A set of top similar existing FAQs in SEARCH_RESULTS

You must decide one of:
- `NEW`: create a new FAQ file
- `UPDATE:<document_id>`: the proposal adds meaningful info to an existing FAQ
- `DUPLICATE:<document_id>`: the proposal is already fully covered, no need to update or add

Rules
- NEW if the question is not covered in FAQ
- UPDATE if the existing FAQ is about the same issue but missing context or details
- DUPLICATE if the existing FAQ already answers the question fully
- Do not invent unrelated content, base decisions strictly on the provided proposal and FAQ excerpts
- When UPDATE, merge old and new answers into one, making the updated answer complete and containing all the information from both
- When UPDATE, make sure the new question is reflective of the both new and old records
- Carefully analyse existing sections to decide where it goes. Generic questions that don't fit any other section should go to "general"

Example reasoning
- If two FAQs are semantically the same but wording differs slightly → DUPLICATE.
- If an FAQ exists but lacks troubleshooting steps the student provided → UPDATE.
- If the topic is not covered in existing FAQs → NEW.
""".strip()

from typing import List, Optional, Literal
from pydantic import BaseModel, Field, model_validator

from openai import OpenAI
openai_client = OpenAI()


# Structured-output schema: this class is passed as `text_format` to
# `openai_client.responses.parse`, and the parsed reply is an instance of it.
# NOTE(review): the class docstring and the Field descriptions are presumably
# forwarded to the model as part of the schema, so treat them as prompt text
# when editing — confirm against the pydantic / OpenAI SDK documentation.
class FAQDecision(BaseModel):
    """
    Unified decision object returned by your triage agent.
    Contains placement (module/order/title) and action-specific payload.
    """

    # Which of the three outcomes applies; the notebook branches on this value.
    action: Literal["NEW", "UPDATE", "DUPLICATE"] = Field(
        ...,
        description=(
            "Decision:\n"
            "- NEW: create a new FAQ file.\n"
            "- UPDATE: merge the proposal into an existing FAQ.\n"
            "- DUPLICATE: proposal is already covered by an existing FAQ."
        ),
    )
    # Short free-text justification for `action`.
    rationale: str = Field(..., description="1-2 sentences explaining the decision.")
    # For UPDATE this id is used as the lookup key into `doc_index`. The NEW
    # branch of the notebook does not use it: it derives the id of a new file
    # with generate_document_id instead.
    document_id: str = Field(
        ...,
        description=(
            "ID to act on:\n"
            "- NEW → document_id to use for the new file.\n"
            "- UPDATE/DUPLICATE → document_id of the existing FAQ."
        ),
    )

    # Placement: the section directory the FAQ belongs to, and why.
    section_rationale: str = Field(..., description="1-2 sentences explaining why this section was chosen")
    section_id: str = Field(..., description="Section for this FAQ (e.g 'module-1').")
    
    # Sort-order prefix of the file name; -1 is the "append at the end" marker
    # that the NEW branch checks for.
    order: int = Field(..., description="Integer controlling sort order within the module. Set to number if it should be placed near existing FAQ records, set to -1 it it should go at the end of the section")

    # Written to the `question` key of the file's frontmatter.
    question: str = Field(..., description="FAQ question title displayed to users (plain-text question).")
    
    
    # Action-specific payload. Both fields default to None, so code consuming
    # a decision cannot assume they are set.
    proposed_content: Optional[str] = Field(
        None,
        description="Only for NEW and UPDATE: markdown file with the answer. The question is not included. No headers.",
    )

    # Last component of the generated file name: <order>_<id>_<slug>.md
    filename_slug: Optional[str] = Field(
        None,
        description="Only for NEW: file-system friendly slug with hyphens, up to 50 characters",
    )

    # Free-form notes from the model; an empty list when none are given.
    warnings: List[str] = Field(
        default_factory=list,
        description="Optional warnings (e.g., sort order collision, module mismatch).",
    )


In [432]:
# documents = []

# for course_dir in questions_root.iterdir():
#     course_documents = read_questions(course_dir)
#     documents.extend(course_documents)

# len(documents)

In [260]:
# Folder that contains the course directories (relative path, resolved
# against the notebook's working directory).
questions_root = Path('../_questions/')

# Name of the course directory to work on under questions_root.
course_name = 'machine-learning-zoomcamp'

In [261]:
course_dir = questions_root / course_name
# One dict per FAQ markdown file found under course_dir.
documents = read_questions(course_dir)

# Parsed _metadata.yaml of the course; its 'sections' entry is later
# serialized into the prompt.
metadata = read_metadata(course_dir)

In [262]:
doc_index = find_question_files(course_dir)

In [264]:
len(doc_index), len(documents)

(389, 389)

In [265]:
# Search index over the FAQ documents. The field names listed here are keys
# of the dicts built by read_questions.
index = Index(
    text_fields=['section', 'question', 'answer'],
    keyword_fields=['course', 'section_id'],
)

In [266]:
index.fit(documents)

<minsearch.minsearch.Index at 0x1dbedb76ad0>

In [391]:
questions_from_launch = """
## Do I need prior machine learning experience?

No. The course starts from first principles. We do an introduction to ML, a gentle math refresher (only what you need), and the basics of linear algebra taught via code. You can begin with zero ML background and ramp up through hands-on exercises.

---

## Do I need to know Python already?
Not strictly, but you must be comfortable programming. If you know another major language (Java, JavaScript, Rust, etc.), you’ll pick up the small subset of Python we use. Expect a lot of coding and “code-along” sessions.

---

## What other prerequisites should I have?
Be ready to use the terminal. You’ll use Git, the command line, Docker, and basic OS tasks. These are demonstrated in the course, but you should be comfortable running commands and reading errors.

---

## How is the course structured overall?
It’s project-first and hands-on. You learn just enough theory to solve practical problems. The core path is: intro → regression → classification → evaluation → deployment → tree-based models → midterm project → neural nets + deployment (Lambda/ONNX) → Kubernetes. There’s an optional, older KServe module.

---

## What’s new in the 2025 edition?
- Deployment module updated to **FastAPI** (replacing Flask) and new tools.  
- Neural networks taught with **PyTorch** (theory videos in Keras are kept; an additional PyTorch implementation video is provided).  
- Deep learning deployment uses **ONNX Runtime** on AWS Lambda (replacing TensorFlow Lite).

---

## Where do I find all course materials and links?
Everything lives in the main **ML Zoomcamp GitHub repository**. Modules are folders with a README/markdown per unit, videos, and notes. Cohort-specific items (like homework and dates) are under `cohorts/2025/`. The playlist “Machine Learning Zoomcamp” on YouTube contains all lessons.
https://github.com/DataTalksClub/machine-learning-zoomcamp/
---

## Should I star the GitHub repo?
Yes, please. Starring helps it trend so others can discover the course. The instructor explicitly asks attendees to star it to boost visibility.
https://github.com/DataTalksClub/machine-learning-zoomcamp/

---

## How do I submit homework?
Each module has a homework form with a deadline. Do the tasks locally, **publish your code** (e.g., in your own GitHub repo), then submit your answers via the form and include the URL to your code. Answers are revealed after the deadline closes.
Homeworks are in the cohorts folder, e.g. for 2025 it's cohorts/2025 (https://github.com/DataTalksClub/machine-learning-zoomcamp/tree/master/cohorts/2025)
the forms for submitting the homework are in the course management platform: https://courses.datatalks.club/

---

## Are homeworks required to get the certificate?
No. Homeworks are optional for certification, but strongly recommended to check understanding. Certification is based on projects, not homework scores.

---

## What do I need to earn the certificate?
You must **pass 2 out of 3 projects**:  
- Midterm + Capstone 1, **or**  
- Capstone 1 + Capstone 2  
Certificates show pass/fail only—no percentage or rank.

---

## Can I finish early and get the certificate in under four months?
No. Project timelines structure the cohort. The earliest certificates typically land around **January** (after Capstone 1/2 windows and reviews).

---

## Can I skip topics I already know?
Yes. All lesson content is optional; only the **projects** are mandatory. Move ahead at your own pace (you don’t need to wait for a “module start” date).

---

## How are projects run and graded?
Each project spans ~3 weeks: ~2 weeks building + **1 week peer review**. You must complete **three peer reviews** to pass. Rubrics focus on practical ML engineering and deployment.

---

## Are lessons live or recorded?
Core lessons are **pre-recorded** and already available. Occasional live streams (like the launch/Q&A) happen but there are no weekly office hours now—use recorded office hours from prior years and the FAQ.

---

## How do I keep track of lessons and modules?
Use the main YouTube playlist “Machine Learning Zoomcamp” for all units. Year-specific playlists (2021–2025) contain cohort-specific streams and extras.

---

## How do I get answers to common questions?
Start with the **FAQ document** (general + per-module Q&A). There’s also a **Zoomcamp Q&A bot** in Slack—use it thoughtfully; often the FAQ or recent messages already contain your answer.

---

## What are the Slack “house rules”?
- Ask course questions in **#course-ml-zoomcamp** (not #general).  
- **Use threads** to reply.  
- **Paste text/code** instead of screenshots or phone photos.  
- Don’t tag instructors; many peers can help and instructors see messages anyway.  
- Keep the channel tidy and on-topic.

---

## How do announcements work?
Announcements are posted in the **Telegram channel** and mirrored into Slack. It’s best to join Telegram for clean, broadcast-only updates; otherwise you’ll still see mirrored posts in Slack.

---

## Which tools do we use for environments: uv or Anaconda?
- Install **Python** easily with **Anaconda** (especially on Windows).  
- Use **uv** for **virtual environments and package installs** (recommended over `conda` for this course).  
The uv + FastAPI workshop is included in Module 5.

---

## Which language and framework are used for deployment?
**Python** with **FastAPI** (updated from Flask). You’ll also package/serve models, containerize, and deploy to cloud/Kubernetes.

---

## Which cloud provider is used?
Examples use **AWS** (you should have or create an AWS account). Concepts transfer to other clouds (GCP/Azure) with minor adjustments.

---

## Will we cover Spark or big-data tooling?
No Spark in this course. Focus is on core ML engineering and deployment patterns.

---

## Is system design included?
No dedicated system-design module. The emphasis is end-to-end ML projects and deployments (web services, Lambda, Kubernetes).

---

## What datasets are acceptable for projects?
Choose something non-toy (e.g., **100+ rows** minimum) that lets you demonstrate the pipeline: data prep → modeling → evaluation → **deployment**. You may also collect your own data.

---

## Can I use pre-trained models and focus on deployment?
Yes, but include **some training element** (fine-tuning, retraining, or comparable) or you’ll lose points in training-related criteria (you can still pass overall if other parts are strong).

---

## Is there any advantage to high homework/leaderboard scores?
Indirectly. The **leaderboard** highlights active learners (correct answers + learning-in-public links). That visibility helps with networking and recruiter attention, even though the certificate itself is pass/fail.

---

## What’s the recent pass/completion rate?
In the last cohort’s dashboard snapshot, roughly **~85%** of project attempts resulted in a pass.

---

## How do I get help if I’m stuck?
- Ask in Slack (paste code/errors; use threads).  
- Check the FAQ and past office hours.  
- Use AI assistants for quick guidance.  
- Peers and instructors (time permitting) will chime in.

---

## How do I join Slack if the invite email didn’t arrive?
Go to DataTalks.Club, request a Slack invite, or use the manual request form (processed daily). After joining, browse channels and join **#course-ml-zoomcamp**.

---

## Are Jupyter Notebooks used?
Yes. You’ll work extensively with notebooks alongside standard Python files and CLI tools.

---

## Should I use MLflow?
It can be useful, especially for experiment tracking, but it’s covered in more depth in the separate **MLOps** course. Here, the focus is on end-to-end ML engineering and deployment.

---

## I’m a data engineer (Spark/Python/SQL). Will this help me move into DS/ML engineering?
Yes. The project-first flow, deployment modules (FastAPI/Lambda/Kubernetes), and evaluation practices map well to ML engineer roles.

---

## I’m struggling to show “business impact.” Will the course teach that?
You’ll learn to build and deploy real ML services. True “business impact” depends on domain context and is best learned **on the job**. Use the course to build a visible, high-quality portfolio you can discuss with employers.

---

## What strategy do you recommend for an unemployed SWE pivoting to ML/AI soon?
Ship projects and **learn in public** daily (posts, blogs, code). Publish everything on GitHub, write about your approach/results, engage on Slack/LinkedIn, and build a network. Use the course projects and optional activities (Kaggle, articles) as portfolio centerpieces.

---

## Can I start Module 1 now and move ahead faster?
Yes. Lessons are already available. You can proceed to later modules without waiting for “official” start dates.

---

## How often are live sessions/office hours?
Aside from the launch and occasional special streams, **no recurring live sessions**. Use recorded office hours from previous years and the FAQ/bot.

---

## Do I need to “enroll” somewhere before submitting?
No. Just log into the homework form when it opens. The Airtable “registration” is for announcements only; submissions happen on the course platform forms and your GitHub.

---

## Will the repo already have all FastAPI/uv/PyTorch updates?
Some updates are live; others are added as the cohort progresses. For theory, older videos remain; for implementation, updated workshops/videos are linked (e.g., FastAPI+uv, PyTorch add-on).

---

## What exactly is taught in the evaluation module?
You’ll learn metrics and diagnostics (e.g., precision, recall, ROC/PR curves), proper evaluation mindsets, and pitfalls. It’s conceptual and abstract but crucial for real-world ML work.

---

## What projects/models do we build?
- **Regression** (predicting car prices; hand-implement linear regression once, then use libraries).  
- **Binary classification** (telecom churn; metrics & evaluation).  
- **Tree-based models** (credit default risk).  
- **Neural nets** (image classification; PyTorch add-on).  
- **Deployments**: FastAPI service, **AWS Lambda** with **ONNX Runtime**, and **Kubernetes**.

---

## Why ONNX Runtime for DL deployment?
ONNX lets you **train in TensorFlow or PyTorch** and deploy a common format efficiently with ONNX Runtime, simplifying Lambda/K8s serving compared to framework-specific runtimes.

---

## Is the KServe module required?
No. It’s **optional and outdated**. If time-constrained, prioritize Kubernetes.

---

## How do donations/sponsorships work?
The course is free. Sponsors may be added; voluntary donations help sustain free content. If you have a company training budget, you can request an invoice to support the course.

---

## Why learn traditional ML if LLMs can “do it for me”?
Assistants are great accelerators, but you still need **conceptual understanding** to debug, adapt, and own your systems when AI tools make mistakes or hit limits. The course teaches you to build, reason about, and deploy ML systems you control.
""".strip()

In [433]:
# Proposals in questions_from_launch are separated by '---'; each piece is
# trimmed of surrounding whitespace.
questions = [chunk.strip() for chunk in questions_from_launch.split('---')]

In [434]:
# Pick one proposal to run through the triage flow below
# (index 11 is the twelfth entry of the list) and show it.
q = questions[11]
print(q)

## Can I skip topics I already know?
Yes. All lesson content is optional; only the **projects** are mandatory. Move ahead at your own pace (you don’t need to wait for a “module start” date).


In [435]:
# Retrieve the five closest existing FAQ records for the proposal and drop
# the 'course' / 'section' fields before they go into the prompt.
results = keep_relevant(index.search(q, num_results=5))

prompt = prompt_template.format(
    entry=q,
    results=json.dumps(results),
    sections=json.dumps(metadata['sections']),
)

messages = [
    dict(role="system", content=system_prompt),
    dict(role="user", content=prompt),
]

# Ask for a reply that is parsed into the FAQDecision schema.
response = openai_client.responses.parse(
    text_format=FAQDecision,
    input=messages,
    model="gpt-5-nano",
)

# The output list can hold several item types; the decision is the parsed
# first content part of the first item of type 'message'.
message = next(item for item in response.output if item.type == 'message')
faq_decision = message.content[0].parsed

In [436]:
faq_decision



In [438]:
# Print the decision next to the original proposal so the two can be
# compared by eye before anything is written to disk.
print('action:', faq_decision.action)
print()
print('rationale:', faq_decision.rationale)
print()
print('question:')
print(faq_decision.question)
print()
print('answer:')
print(faq_decision.proposed_content)
print()
print('new proposal:')
print(q)
print()
# Placement: section id, the reason for it, and the requested sort order.
print('section:', faq_decision.section_id)
print(faq_decision.section_rationale)
print(faq_decision.order)

action: NEW

rationale: No existing general FAQ covers the ability to skip topics; the proposal introduces a general policy that lesson content is optional and only projects are mandatory.

question:
Can I skip topics I already know?

answer:
Yes. All lesson content is optional; only the projects are mandatory. Move ahead at your own pace (you don’t need to wait for a “module start” date).

new proposal:
## Can I skip topics I already know?
Yes. All lesson content is optional; only the **projects** are mandatory. Move ahead at your own pace (you don’t need to wait for a “module start” date).

section: general
This is a general course policy about optional content and pacing, suitable for the General section.
-1


In [447]:
# Scratch cell: take the entry of a section directory that sorts last by path.
# The same expression is the first step of find_largest_sort_order.
# NOTE(review): `section_dir` is not assigned at notebook scope in any visible
# cell (it only appears as a local inside functions), so this line seems to
# rely on leftover kernel state — confirm or define it explicitly.
last = sorted(section_dir.iterdir())[-1]

In [453]:
# Scratch cell: keep the part of the file name before the first underscore
# (the sort-order prefix, still a string here); the remainder is discarded.
sort_order, _ = last.name.split('_', maxsplit=1)

101

In [419]:
# Apply a NEW decision: create a new FAQ file in the chosen section and
# register it in doc_index.
if faq_decision.action == 'NEW':
    # The schema leaves both fields optional. Without them the id cannot be
    # computed and the file name would contain the text 'None', so fail early
    # with a clear message.
    if not faq_decision.proposed_content or not faq_decision.filename_slug:
        raise ValueError('NEW decision is missing proposed_content or filename_slug')

    doc_id = generate_document_id(
        faq_decision.question,
        faq_decision.proposed_content,
        doc_index
    )

    sort_order = faq_decision.order
    doc_slug = faq_decision.filename_slug
    faq_section = faq_decision.section_id
    section_dir = course_dir / faq_section

    if sort_order == -1:
        # -1 is the schema's marker for "place at the end of the section":
        # use the next free sort order of that section directory.
        sort_order = find_largest_sort_order(section_dir)

    fm = {
        'id': doc_id,
        'question': faq_decision.question,
        'sort_order': sort_order,
    }

    # File name layout: <sort_order, zero-padded to 3 digits>_<id>_<slug>.md
    filename = f'{sort_order:03d}_{doc_id}_{doc_slug}.md'
    f_out = section_dir / filename
    write_frontmatter(f_out, fm, faq_decision.proposed_content)
    # Register the new file so later id generation and updates can see it.
    doc_index[doc_id] = f_out
    print(f_out)

..\_questions\machine-learning-zoomcamp\general\001_7c7ec892e4_certificate-timeline.md


In [414]:
# Apply an UPDATE decision: rewrite the existing FAQ file with the merged
# question and answer while keeping the rest of its frontmatter.
if faq_decision.action == 'UPDATE':
    doc_id = faq_decision.document_id
    if doc_id not in doc_index:
        # The id comes from the model; report an unknown one explicitly.
        raise KeyError(f'UPDATE refers to unknown document_id {doc_id!r}')
    f_out = doc_index[doc_id]

    content = f_out.read_text(encoding='utf8')
    fm, existing_answer = parse_frontmatter(content)

    fm['question'] = faq_decision.question

    # proposed_content is optional in the schema. Writing None would replace
    # the stored answer with the literal text 'None', so keep the current
    # answer in that case.
    updated_answer = faq_decision.proposed_content
    if updated_answer is None:
        updated_answer = existing_answer

    write_frontmatter(f_out, fm, updated_answer)
    print(f_out)

..\_questions\machine-learning-zoomcamp\projects\006_8ea1e7e31f_what-if-i-submitted-only-two-projects-and-failed-t.md


WindowsPath('../_questions/machine-learning-zoomcamp/general/001_226a4baf2f_whats-new-2025-edition.md')