In [4]:
import io
import zipfile
import requests

In [45]:
import yaml

def parse_metadata(content):
    """Parse the text of a metadata file as YAML.

    Args:
        content: YAML source text.

    Returns:
        Whatever ``yaml.safe_load`` produces for ``content``.
    """
    parsed = yaml.safe_load(content)
    return parsed

def parse_frontmatter(content):
    """Split a markdown document into its YAML front matter and its body.

    Front matter is the text between an opening ``---`` at the very start of
    ``content`` and the next line that consists only of ``---``.

    Args:
        content: Full text of the markdown file.

    Returns:
        A ``(frontmatter, body)`` tuple. ``frontmatter`` is the parsed mapping
        (``{}`` when the block is empty) and ``body`` is the remaining text
        with surrounding whitespace stripped. When there is no front matter,
        no closing delimiter, the YAML is invalid, or it does not parse to a
        mapping, the result is ``({}, content)`` with ``content`` unchanged.
    """
    if not content.startswith('---'):
        return {}, content

    lines = content.split('\n')

    # The closing delimiter has to sit on a line of its own. Splitting on every
    # '---' substring would cut the block short whenever a value (for example
    # a question title) happens to contain three dashes.
    closing = None
    for index in range(1, len(lines)):
        # rstrip() also drops the '\r' left over from CRLF line endings
        if lines[index].rstrip() == '---':
            closing = index
            break
    if closing is None:
        return {}, content

    # Text following the opening dashes on the first line still counts as YAML.
    raw_frontmatter = '\n'.join([lines[0][3:]] + lines[1:closing])
    markdown_content = '\n'.join(lines[closing + 1:]).strip()

    try:
        frontmatter = yaml.safe_load(raw_frontmatter)
    except yaml.YAMLError:
        return {}, content

    if not frontmatter:
        # Empty front matter block: nothing to report, but the body is valid.
        return {}, markdown_content
    if not isinstance(frontmatter, dict):
        # A list or scalar is not front matter; callers index the result by key.
        return {}, content
    return frontmatter, markdown_content


In [46]:
def read_content(fi, archive=None):
    """Return one archive member decoded as UTF-8 text.

    Args:
        fi: Member to read; passed straight to ``ZipFile.open``, so either a
            ``zipfile.ZipInfo`` or a member name works.
        archive: Open archive to read from. Defaults to the notebook-level
            ``zf`` so existing one-argument calls keep working.

    Returns:
        The member's content as ``str``.
    """
    # Only touch the global when no archive was supplied, so the function can
    # be used (and tested) before the ``zf`` cell has run.
    source = zf if archive is None else archive
    with source.open(fi) as f_in:
        return f_in.read().decode("utf-8")

In [47]:
# Download the FAQ repository's main branch as a single zip archive.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of handing an error
# page to zipfile later on.
resp = requests.get(
    'https://codeload.github.com/DataTalksClub/faq/zip/refs/heads/main',
    timeout=60,
)
resp.raise_for_status()

In [48]:
# Wrap the downloaded bytes in a file-like object and open them as a zip
# archive, entirely in memory.
archive_bytes = io.BytesIO(resp.content)
zf = zipfile.ZipFile(archive_bytes)

In [90]:
# Walk the archive once, collecting the question files and, per course, the
# section id -> section name mapping from that course's metadata file.
question_fis = []
course_sections = {}

for file_info in zf.infolist():
    # Pattern checks run on a lowercased copy of the path.
    filename = file_info.filename.lower()

    if '_questions' not in filename:
        continue

    if '_metadata.yaml' in filename:
        content = read_content(file_info)
        # An empty YAML file parses to None; treat it as "no sections".
        metadata = parse_metadata(content) or {}

        # Take the course id from the original-case path: the documents loop
        # derives its lookup key from the un-lowered file name, so the keys
        # stored here have to be spelled the same way.
        course_id = file_info.filename.split('/')[2]
        sections = {d['id']: d['name'] for d in metadata.get('sections') or []}

        course_sections[course_id] = sections
        continue

    if not filename.endswith('.md'):
        continue

    question_fis.append(file_info)

In [106]:
# Turn every question file into one flat record.
documents = []

for qfi in question_fis:
    content = read_content(qfi)
    fm, answer = parse_frontmatter(content)

    # The member path must have exactly five '/'-separated parts; the third
    # and fourth are used as course id and section id.
    _, _, course_id, section_id, name = qfi.filename.split('/')

    # A course without a metadata file has no entry in course_sections. Fall
    # back to the raw section id, the same way unknown sections already do,
    # instead of failing with a KeyError.
    section_name = course_sections.get(course_id, {}).get(section_id, section_id)
    document = {
        'course': course_id,
        'section': section_name,
        'question': fm['question'],
        'answer': answer,
        'document_id': fm['id']
    }

    documents.append(document)

In [108]:
# Close the zip archive held in `zf`.
zf.close()

In [107]:
# Number of records collected in `documents`.
len(documents)

1177