In [1]:
import docx
import json
import io
import requests

In [2]:
def clean_text(t):
    r"""
    Clean text
    - Remove leading and trailing whitespaces
    - Remove leading and trailing \uFEFF (byte order mark) characters

    Whitespace and \uFEFF characters may be interleaved at either end
    (e.g. "\uFEFF  text"), so both are stripped repeatedly until the text
    stops changing. Characters in the middle of the text are left untouched.

    Parameters
    ----------
    t : str
        Raw text.

    Returns
    -------
    str
        The text without leading/trailing whitespace or \uFEFF characters.
    """
    previous = None
    # str.strip() does not treat U+FEFF as whitespace, so a single
    # strip().strip("\uFEFF") pass can expose new whitespace at the edges.
    while t != previous:
        previous = t
        t = t.strip().strip("\uFEFF")
    return t


def read_faq(file_id, timeout=30):
    """
    Read and parse FAQ document by file ID.

    The document is downloaded from Google Docs in .docx format and its
    paragraphs are grouped into question records:

    - a "Heading 1" paragraph starts a new section,
    - a "Heading 2" paragraph starts a new question,
    - any other non-empty paragraph is added to the answer of the question
      that is currently open.

    Text that appears while no question is open (before the first question,
    or between a section heading and its first question) is skipped.

    Parameters
    ----------
    file_id : str
        ID of the Google Docs document, inserted into the export URL.
    timeout : float or tuple, optional
        Timeout in seconds passed to ``requests.get`` so that a stalled
        connection cannot block forever. Defaults to 30.

    Returns
    -------
    list of dict
        One ``{"section", "question", "answer"}`` dict per question, in
        document order. Questions with an empty section or an empty answer
        are left out.

    Raises
    ------
    requests.HTTPError
        Raised by ``response.raise_for_status()`` when the download returns
        an error status.
    """
    url = f"https://docs.google.com/document/d/{file_id}/export?format=docx"

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    with io.BytesIO(response.content) as f:
        document = docx.Document(f)

    return _parse_faq_paragraphs(document.paragraphs)


def _parse_faq_paragraphs(paragraphs):
    """
    Group paragraphs into section/question/answer records.

    Each paragraph must expose ``.style.name`` and ``.text``.
    """
    questions = []
    section = ""
    question = ""
    answer_lines = []

    def close_question():
        """Store the open question if it is complete, then always reset it."""
        nonlocal question
        answer = "\n".join(answer_lines).strip()
        if section != "" and question != "" and answer != "":
            questions.append({
                "section": section,
                "question": question,
                "answer": answer
            })
        # Reset unconditionally so text can never leak into a later question.
        question = ""
        answer_lines.clear()

    for paragraph in paragraphs:
        style = paragraph.style.name.lower()
        content = clean_text(paragraph.text)

        if len(content) == 0:
            continue

        if style == "heading 1":
            # Close the pending question BEFORE switching sections, otherwise
            # the last question of a section is filed under the next section.
            close_question()
            section = content
            continue

        if style == "heading 2":
            close_question()
            question = content
            continue

        # Skip content that does not belong to any question
        if question != "":
            answer_lines.append(content)

    # The last question has no following heading to trigger its storage.
    close_question()

    return questions

In [3]:
# Course name -> Google Docs file ID of that course's FAQ document.
# The order of the entries is the order of the courses in the output.
FAQ_DOCUMENTS = {
    "data-engineering-zoomcamp": "19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw",
    "machine-learning-zoomcamp": "1LpPanc33QJJ6BSsyxVg-pWNMplal84TdZtq10naIhD8",
    "mlops-zoomcamp": "12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0",
}

In [4]:
# Download and parse every FAQ document: one {"course", "documents"} entry
# per course, in the order the courses are listed in FAQ_DOCUMENTS.
faq_documents = [
    {"course": course_name, "documents": read_faq(doc_id)}
    for course_name, doc_id in FAQ_DOCUMENTS.items()
]

In [5]:
# Save all parsed FAQ documents as one indented JSON file.
with open("data/documents.json", "w") as output_file:
    output_file.write(json.dumps(faq_documents, indent=4))

In [6]:
!head data/documents.json

[
    {
        "course": "data-engineering-zoomcamp",
        "documents": [
            {
                "section": "General course-related questions",
                "question": "Course - When will the course start\uff1f",
                "answer": "The next cohort starts January 13th 2025. More info at DTC Article.\nRegister before the course starts using this link.\nJoint the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel."
            },
            {
