In [None]:
import yaml
from pathlib import Path

from minsearch import Index

In [18]:
def parse_metadata(content):
    """Parse the text of a metadata file as YAML.

    Args:
        content: Raw YAML text.

    Returns:
        The object produced by ``yaml.safe_load`` for ``content``.
    """
    parsed = yaml.safe_load(content)
    return parsed

def parse_frontmatter(content):
    """Split a markdown document into its YAML frontmatter and its body.

    Frontmatter is recognised only when the first line is exactly ``---``
    and some later line is again exactly ``---`` (trailing whitespace and
    ``\\r`` are tolerated on both). Delimiters are matched as whole lines so
    that a ``---`` appearing inside a frontmatter value is not mistaken for
    the closing delimiter.

    Args:
        content: Full text of the markdown file.

    Returns:
        A ``(frontmatter, body)`` tuple.

        * With valid frontmatter: ``frontmatter`` is the parsed mapping
          (``{}`` when the block is empty or does not parse to a mapping)
          and ``body`` is the text after the closing delimiter, stripped of
          surrounding whitespace.
        * Without frontmatter, without a closing delimiter, or when the YAML
          is malformed: ``({}, content)`` with ``content`` returned untouched.
    """
    if not content.startswith('---'):
        return {}, content

    lines = content.split('\n')

    # The opening delimiter must stand alone on the first line.
    if lines[0].rstrip() != '---':
        return {}, content

    # First later line that consists solely of the delimiter.
    closing = next(
        (pos for pos in range(1, len(lines)) if lines[pos].rstrip() == '---'),
        None,
    )
    if closing is None:
        return {}, content

    try:
        frontmatter = yaml.safe_load('\n'.join(lines[1:closing]))
    except yaml.YAMLError:
        return {}, content

    # safe_load gives None for an empty block and may give a scalar or list;
    # callers index the result by key, so always hand back a dict.
    if not isinstance(frontmatter, dict):
        frontmatter = {}

    markdown_content = '\n'.join(lines[closing + 1:]).strip()
    return frontmatter, markdown_content


In [79]:
def read_metadata(course_dir):
    """Load and parse the ``_metadata.yaml`` file of a course directory.

    Args:
        course_dir: Path of the directory that contains ``_metadata.yaml``.

    Returns:
        The result of ``parse_metadata`` applied to the file's UTF-8 text.
    """
    raw_yaml = (course_dir / '_metadata.yaml').read_text(encoding='utf8')
    return parse_metadata(raw_yaml)

In [80]:
def read_questions(course_dir: Path) -> list[dict]:
    """Collect every question document stored under one course directory.

    Reads ``course_dir/_metadata.yaml`` for the section id -> name mapping,
    then turns every ``course_dir/<section_id>/*.md`` file into a document.

    Args:
        course_dir: Directory of a single course; its name is used as the
            course id.

    Returns:
        One dict per markdown file with the keys ``course``, ``section``,
        ``section_id``, ``question``, ``answer`` and ``document_id``.
        ``section`` falls back to the section id when the metadata does not
        name that section.

    Raises:
        KeyError: If a question file's frontmatter has no ``question`` or
            ``id`` field; the message includes the offending file path.
    """
    course_id = course_dir.name

    metadata = read_metadata(course_dir)
    # An empty metadata file parses to None and a course may declare no
    # sections; in both cases every section falls back to its id below.
    section_entries = (metadata or {}).get('sections') or []
    course_sections = {d['id']: d['name'] for d in section_entries}

    documents = []

    for question_file in course_dir.glob('*/*.md'):
        content = question_file.read_text(encoding='utf8')
        fm, answer = parse_frontmatter(content)

        # The glob pattern only matches files exactly one level below
        # course_dir, so the parent directory is always the section.
        section_id = question_file.parent.name
        section_name = course_sections.get(section_id, section_id)

        try:
            question = fm['question']
            document_id = fm['id']
        except KeyError as exc:
            # Same exception type as before, but say which file is broken.
            raise KeyError(
                f'{question_file}: frontmatter is missing required field {exc}'
            ) from exc

        documents.append({
            'course': course_id,
            'section': section_name,
            'section_id': section_id,
            'question': question,
            'answer': answer,
            'document_id': document_id,
        })

    return documents

In [82]:
# Folder whose entries are treated as course directories; the path is
# relative to the current working directory.
questions_root = Path('..') / '_questions'

# NOTE(review): not referenced by any cell visible in this part of the notebook.
course_name = 'data-engineering-zoomcamp'


In [92]:
# Gather the question documents of every course under questions_root.
documents = []

for course_dir in questions_root.iterdir():
    # iterdir() yields plain files as well as directories; a stray file here
    # would make read_questions fail while looking for <file>/_metadata.yaml.
    if not course_dir.is_dir():
        continue
    course_documents = read_questions(course_dir)
    documents.extend(course_documents)

In [95]:
# Sanity check: total number of question documents collected across all courses.
len(documents)

1177

In [93]:
# Search index over the question documents.
# NOTE(review): presumably text fields are searched as free text and keyword
# fields are used for exact-match filtering — confirm against the minsearch docs.
index = Index(
    keyword_fields=['course', 'section_id'],
    text_fields=['section', 'question', 'answer'],
)

In [94]:
# Fit the index on the collected documents; the displayed value is the Index object itself.
index.fit(documents)

<minsearch.minsearch.Index at 0x1dbe82ee850>

In [96]:
# Smoke test with an informal query; each hit is a document dict with the
# same keys that read_questions builds.
index.search('flask dont start')

[{'course': 'machine-learning-zoomcamp',
  'section': 'Module 5. Deploying Machine Learning Models',
  'section_id': 'module-5',
  'question': 'How do I resolve the "No module named flask" error?',
  'answer': 'I initially installed Flask with pipenv, but I received a "No module named \'flask\'" error. I then reinstalled Flask using pip, and after that, I was able to import Flask successfully.',
  'document_id': 'ec6919a46b'},
 {'course': 'machine-learning-zoomcamp',
  'section': 'Module 10. Kubernetes and TensorFlow Serving',
  'section_id': 'module-10',
  'question': "TypeError: __init__() got an unexpected keyword argument 'unbound_message' while importing Flask",
  'answer': "In video 10.3, while testing a Flask service, the following error occurred:\n\n```\nTypeError: __init__() got an unexpected keyword argument 'unbound_message'\n```\n\nThis error was encountered when running `docker run ...` in one terminal and then executing `python gateway.py` in another terminal.\n\n\n\nThis