In [None]:
import os
from openai import OpenAI
from dotenv import load_dotenv

# Initialize client (uses OPENAI_API_KEY from environment)

# Load variables from a .env file before constructing the client, so the key
# is present in the environment when OpenAI() runs. override=True lets values
# from .env replace variables that are already set in the process environment.
load_dotenv(override=True)
# Module-level client used by clean_markdown_with_openai below.
client = OpenAI()


def clean_markdown_with_openai(md_text: str, *, model: str = "gpt-4.1-mini") -> str:
    """
    Send markdown text to OpenAI and return a 'cleaned' markdown version.

    The system prompt is designed so that NO information is removed: the model
    may only repair markdown syntax, normalize headings, and drop clearly
    repeated boilerplate.

    Args:
        md_text: Raw markdown to clean.
        model: Chat-completions model name (e.g. "gpt-4.1-mini" or "gpt-4.1").

    Returns:
        The cleaned markdown. Empty or whitespace-only input is returned
        unchanged without calling the API.

    Raises:
        ValueError: If the API response carries no text content.

    Warns:
        RuntimeWarning: If the response stopped at the output-token limit
            (finish_reason == "length"), i.e. the returned markdown is cut
            off and therefore incomplete.
    """
    import warnings

    # Nothing to clean: skip the API call instead of asking the model to
    # "clean" an empty document (which invites invented content).
    if not md_text.strip():
        return md_text

    system_prompt = """You are a meticulous markdown cleaner.

Your job:
- Keep ALL information and ALL text content. Do NOT delete, summarize, or shorten anything.
- Preserve headings, lists, tables, links, inline code, code blocks, and formatting where possible.
- You MAY:
  - Fix broken markdown syntax (unclosed lists, headings, code fences, etc.).
  - Normalize heading levels (e.g., make them consistent).
  - Remove obvious boilerplate like duplicate "skip to content" navigation OR cookie banners ONLY if this is clearly repeated and not part of the main content.
- You MUST NOT:
  - Remove any unique information.
  - Paraphrase in a way that changes meaning.
  - Condense paragraphs or skip sections.

Output:
- Return only valid markdown.
- Do not add commentary or explanations; only output the cleaned markdown.
"""

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": (
                    "Here is the markdown file content. "
                    "Please return a cleaned markdown version following the rules.\n\n"
                    + md_text
                ),
            },
        ],
        temperature=0.0,  # deterministic, better for "no information loss"
    )

    choice = response.choices[0]
    cleaned = choice.message.content

    # The content field is optional in the API response; fail here with a
    # clear message rather than handing None to the caller's file write.
    if cleaned is None:
        raise ValueError(
            "OpenAI response contained no text content "
            f"(finish_reason={choice.finish_reason!r})"
        )

    # A "length" finish means the output hit the token limit, so the tail of
    # the document is missing -- exactly the information loss this function
    # is meant to avoid. Surface it instead of returning silently.
    if choice.finish_reason == "length":
        warnings.warn(
            "OpenAI response was truncated at the output token limit; "
            "the cleaned markdown is incomplete.",
            RuntimeWarning,
            stacklevel=2,
        )

    return cleaned


def main(input_path: str, output_path: str):
    """Clean a single markdown file and write the result to disk.

    Reads ``input_path`` as UTF-8, passes the text through
    ``clean_markdown_with_openai``, writes the returned text to
    ``output_path`` as UTF-8 (replacing any existing file), and prints a
    confirmation line.
    """
    with open(input_path, mode="r", encoding="utf-8") as source:
        raw_text = source.read()

    # The output file is opened only after the API call has returned, so a
    # failed request never truncates an existing output file.
    result_text = clean_markdown_with_openai(raw_text)

    with open(output_path, mode="w", encoding="utf-8") as sink:
        sink.write(result_text)

    confirmation = f"Cleaned markdown saved to: {output_path}"
    print(confirmation)


if __name__ == "__main__":
    input_file = "../transformed_data_raw_md/pdf_to_md/msc-datascience_faq.md"

    # Fail fast: validate the input before touching the filesystem, so a bad
    # path does not leave a freshly created output directory behind.
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file not found: {input_file}")

    base_name = os.path.basename(input_file)
    name_without_ext, ext = os.path.splitext(base_name)

    # Set output dir (created automatically if it does not exist yet)
    output_dir = "../transformed_data_llm_cleaned"
    os.makedirs(output_dir, exist_ok=True)

    # Output keeps the input's base name and extension, with "_cleaned"
    # inserted before the extension.
    output_file = os.path.join(output_dir, f"{name_without_ext}_cleaned{ext}")

    main(input_file, output_file)
    print(f"Saved cleaned file to: {output_file}")


Cleaned markdown saved to: ../transformed_data_llm_cleaned\msc-datascience_faq_cleaned.md
Saved cleaned file to: ../transformed_data_llm_cleaned\msc-datascience_faq_cleaned.md
