In [15]:
import sys
sys.path.append('..')

In [16]:
from openai import OpenAI
from pathlib import Path

In [57]:
from tqdm.auto import tqdm

In [17]:
import generate_website
import process_faq

In [2]:
# Directory holding the FAQ question files, relative to the working directory.
root = Path('../_questions/')
# '**' only recurses when it is a whole path component, hence '**/*.md'
# (the same pattern the md_files cell uses) rather than '**.md'.
list(root.glob('**/*.md'))

In [36]:
# Client shared by process_file for all requests. No arguments are passed, so
# configuration (e.g. the API key) presumably comes from the environment — TODO confirm.
openai_client = OpenAI()

In [25]:
from pydantic import BaseModel

# Structured-output schema: passed as text_format to responses.parse in
# process_file, which then reads .question and .answer from the parsed result.
# NOTE(review): documented with comments rather than a docstring — a docstring
# would presumably end up in the schema pydantic generates; verify before adding one.
class FAQEntry(BaseModel):
    question: str  # question text returned by the model; stored in frontmatter['question']
    answer: str  # answer text returned by the model; written as the file content

In [63]:
# System message sent with every request in process_file: the formatting rules
# the model must follow when rewriting a raw FAQ entry. Surrounding blank lines
# from the triple-quoted literal are removed by .strip().
system_prompt = """
Convert a raw FAQ entry into clean, professional Markdown documentation using these formatting rules:

- The question and the answer should follow directly below using clear, concise text suitable for documentation
- If the question contains a description of the error, or stacktrace, preserve it - it's important for search
- Output only the final Markdown-formatted result; no explanation, no extra comments
- Format any commands or error messages within code fences (triple backticks) and specify the language (e.g., ```bash)
- If there are multiple solutions, steps, or instructions, present them as a bulleted or numbered list
- Maintain a style consistent with technical documentation or course FAQ standards
- Remove names of people who submitted the answer, like (Tatyana Mardvilko) or (Muhammad Awon) - usually at the end
- Don't include links to Slack (like https://datatalks-club.slack.com/archives/C01FABYF2RG/p1706846846359379), but keep others
- If a question starts with a category, like "Homework" or "Docker", leave it. Use ":" to separate category with the question
- There could be image placeholders, like <{IMAGE:image_id}>. Leave them untouched.

Carefully process the raw FAQ entry step-by-step, ensuring each formatting requirement is properly addressed before finalizing your answer
""".strip()

# User-message template: the raw question and answer are wrapped in tags and
# filled in through str.format(question=..., answer=...).
prompt_template = "\n".join([
    "<QUESTION>",
    "{question}",
    "</QUESTION>",
    "<ANSWER>",
    "{answer}",
    "</ANSWER>",
])

In [87]:
# Every Markdown file under root at any depth, sorted so the processing
# order is the same on each run.
md_files = sorted(root.rglob("*.md"))

In [85]:

def process_file(question_file, model="gpt-4o"):
    """Send one FAQ file through the model and write the rewritten entry back.

    The file is read as UTF-8 and split into frontmatter and Markdown body with
    generate_website.parse_frontmatter. The frontmatter's 'question' and the
    body are sent to the model together with system_prompt, and the structured
    FAQEntry reply replaces both: the question inside the frontmatter, the
    answer as the content handed to process_faq.write_frontmatter (which
    presumably overwrites question_file — confirm in process_faq).

    Args:
        question_file: pathlib.Path of the Markdown file to process.
        model: Name of the OpenAI model to call; defaults to "gpt-4o".

    Raises:
        KeyError: If the frontmatter has no 'question' entry.
        ValueError: If the response carries no parsed FAQEntry (for example
            when the model refuses); nothing is written in that case.
    """
    content = question_file.read_text(encoding='utf-8')

    frontmatter, markdown_content = generate_website.parse_frontmatter(content)

    prompt = prompt_template.format(
        question=frontmatter['question'],
        answer=markdown_content,
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    response = openai_client.responses.parse(
        model=model,
        input=messages,
        text_format=FAQEntry,
    )

    # output_parsed searches the output items for the parsed message instead of
    # assuming it sits at output[0].content[0]; it is None when there is none.
    faq_entry = response.output_parsed
    if faq_entry is None:
        raise ValueError(f"No parsed FAQ entry returned for {question_file}")

    frontmatter['question'] = faq_entry.question

    process_faq.write_frontmatter(
        question_file=question_file,
        frontmatter_data=frontmatter,
        content=faq_entry.answer,
    )

In [None]:
# Files already handled by the processing loop, which skips anything in this
# set. Re-running this cell rebinds it to an empty set, discarding that record.
processed = set()

In [89]:
# Walk every FAQ file with a progress bar. A file is recorded in `processed`
# only after process_file returns, so re-running this cell after an
# interruption picks up at the first file that was not completed.
for question_file in tqdm(md_files):
    if question_file not in processed:
        process_file(question_file)
        processed.add(question_file)

  0%|          | 0/1211 [00:00<?, ?it/s]