# Markdown Cleaning with GPT-4o

(Auto Scrape Iteration 2 Part 2)

With the Assistants feature on OpenAI, we call the API and give the LLM a set of instructions to parse and clean the Markdown files converted by Pandoc in the previous notebook. While GPT is able to fix some issues such as inconsistent formatting, the cleaning is not very successful, especially for very large documents. Nevertheless, this was a good opportunity to get to know OpenAI's Assistants, in particular its File Search capability.

For this notebook, you will need to replace each `os.environ.get("KEYNAME")` call with the actual value (your OpenAI key, and likewise the assistant and vector store IDs) directly in the notebook, since Jupyter Notebook is unable to read from the environment variables file even within the same root/project folder.

This notebook is a documented copy of the original _batch\_md\_clean.py_ and _add\_metadata.py_ files, which are now deprecated and removed from the repo.

Import libraries as below

In [None]:
import os
import re
from openai import OpenAI
from typing_extensions import override
from openai import AssistantEventHandler

Initialise the OpenAI client and retrieve the assistant and vector store. Uncomment the setup if needed.

In [None]:
# Create the OpenAI client shared by the cells below. The API key is read from the
# URA_OPENAI_API_KEY environment variable (os.environ.get returns None when it is unset).
client = OpenAI(api_key=os.environ.get("URA_OPENAI_API_KEY"))

In [None]:
# prompt = ""
# with open('../data/working_assistant_prompt.txt', 'r') as f:
#     for line in f.readlines():
#         prompt += line.strip("\n")

In [None]:
# assistant = client.beta.assistants.create(
#   name="URA Assistant",
#   instructions=prompt,
#   model="gpt-4o",
#   tools=[{"type": "file_search"}],
# )

# vector_store = client.beta.vector_stores.create(name="URA_Test")

Retrieve and update the assistant's model here

In [None]:
# Look up the existing assistant and vector store by the IDs held in environment variables
assistant = client.beta.assistants.retrieve(
    assistant_id=os.environ.get("URA_MARKDOWN_ASSISTANT_ID"),
)
vector_store = client.beta.vector_stores.retrieve(
    vector_store_id=os.environ.get("URA_MARKDOWN_CLEANER_VECTOR_STORE_ID"),
)

# Attach the vector store to the assistant's file_search tool and set its model and temperature
client.beta.assistants.update(
    assistant_id=assistant.id,
    model="gpt-4o",
    temperature=0.05,
    tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
)

Helper functions to find the metadata (title, link, last updated date) are as below

In [None]:
def extract_date(content):
    """
    Extracts the 'Last updated on ...' date from the content and removes the line holding it.

    The date is the text that follows 'Last updated on', up to and including the last run of
    four digits (the year) on the same line. Markdown emphasis asterisks are dropped from it.

    Parameters:
        content (str): Main content of the webpage

    Returns:
        date_string (str): Last updated date as extracted, or 'No date found' if there is none
        content (str): Main content of the webpage with every 'Last updated on' line blanked
            out to avoid duplication (unchanged when no date was found)
    """
    pattern = r".*Last updated on(.*\d{4}).*"
    match = re.search(pattern, content)
    if match is None:
        # Nothing to extract and nothing to remove
        return 'No date found', content

    date_string = match.group(1).replace('*', '')
    # '.' does not cross newlines, so only the matching line's text is removed
    content = re.sub(pattern, '', content)
    return date_string, content

In [None]:
def format_link(file_path):
    """
    Formats the link based on the file path.

    Everything after the last 'DC-cleaned-md' in the path is treated as the page's location
    on the website; backslashes are converted to forward slashes and a trailing '.md'
    extension is dropped.

    Parameters:
        file_path (str): Full path to the specified file

    Returns:
        (str): Full URL of webpage whose contents are being processed. This works because the directory structure replicates the website structure.
    """
    base_url = 'https://www.ura.gov.sg/Corporate/Guidelines/Development-Control'
    relative_path = file_path.split('DC-cleaned-md')[-1]
    relative_path = relative_path.replace('\\', '/').lstrip('/')
    # Strip only the file extension; a '.md' occurring elsewhere in the path must survive
    if relative_path.endswith('.md'):
        relative_path = relative_path[:-len('.md')]
    return f'{base_url}/{relative_path}'

In [None]:
def add_yaml_metadata(content, file_path):
    """
    Builds the YAML metadata header (title, link, last updated date) for a cleaned page.

    This function does not read or write any file and does not include the page content
    in its result; the caller is expected to write the header followed by the content.

    Parameters:
        content (str): Main content of the webpage
        file_path (str): Full path to the specified file

    Returns:
        (str): Metadata in YAML format

    """
    # Assumes the first non-blank line is the title. Falling back to '' keeps empty
    # content from raising IndexError.
    first_line = next(
        (line for line in content.splitlines() if line.strip()), '')
    title = first_line.strip().lstrip('#').strip()
    link = format_link(file_path)
    # Only the date is needed here; the date-stripped content is discarded
    date, _ = extract_date(content)
    date = date.strip()

    # Create YAML metadata
    return f"---\n\ntitle: {title}\n\nlink: {link}\n\ndate: {date}\n\n---\n\n"

Helper function to create the destination directories for the cleaned Markdown files

In [None]:
def copy_directory_structure_only(src, dest):
    """
    Copies only the directory structure from src to dest.

    Directories that already exist are left alone, so the function can be re-run safely.
    No files are copied.

    Parameters:
        src (str): source directory to be recursively crawled through and duplicated
        dest (str): destination directory where the cleaned Markdown files will go

    Returns:
        None

    Raises:
        FileExistsError: if a destination path already exists but is not a directory
    """
    # Ensure the base destination directory exists
    os.makedirs(dest, exist_ok=True)

    for root, _dirs, _files in os.walk(src):
        # Mirror the current directory's position relative to src underneath dest
        rel_path = os.path.relpath(root, src)
        os.makedirs(os.path.join(dest, rel_path), exist_ok=True)

Now we can clean each file. We upload the file, create a thread for processing, add the metadata to the response, and then destroy the message history and the uploaded file to prevent the agent from referencing other documents when processing a particular file.

In [None]:
def process_markdown_file(old_file_path, new_file_path):
    """
    Process a single markdown file by sending its contents to the OpenAI API, followed by writing it to a Markdown file and destroying the message and file history.

    The file written to new_file_path consists of the YAML metadata header followed by the
    assistant's response with the 'Last updated on' line removed (the date already appears
    in the header).

    Parameters:
        old_file_path (str): File path of the original uncleaned markdown file
        new_file_path (str): Destination path of the final cleaned markdown file

    Returns:
        1 if new_file_path already exists and the API call was skipped, otherwise None
    """
    # Skip files that were already cleaned in a previous run
    if os.path.exists(new_file_path):
        return 1

    # Upload the markdown file to be accessible by the API
    with open(old_file_path, "rb") as source:
        uploaded = client.files.create(file=source, purpose="assistants")

    class EventHandler(AssistantEventHandler):
        # Wait for GPT to complete its response before taking all its output
        @override
        def on_message_done(self, message) -> None:
            # The text block wraps the actual string in `.value`
            cleaned = message.content[0].text.value
            metadata = add_yaml_metadata(cleaned, new_file_path)
            # Drop the 'Last updated on' line from the body; it is in the metadata
            _, body = extract_date(cleaned)
            # Open the output only once everything is computed, so a failure above
            # does not leave an empty file that a later run would skip
            with open(new_file_path, 'w', encoding='utf-8') as f:
                f.write(f"{metadata}{body}\n")

    try:
        # Create a thread for processing
        thread = client.beta.threads.create()
        thread_message = client.beta.threads.messages.create(
            thread_id=thread.id,
            role="user",
            content="Return the Markdown and links in-line. Take note of any rowspan or colspan in tables. Do not give any welcome or ending message, only the Markdown output. Parse the whole document, including the contents of the tables. Return the Markdown and links in-line. Take note of any rowspan or colspan in tables. Do not give any welcome or ending message, only the Markdown output. Parse the whole document, including the contents of the tables. Do your best. Do your best. ",
            attachments=[{"file_id": uploaded.id,
                          "tools": [{"type": "file_search"}]}]
        )
        try:
            # Listen to the thread and process events
            with client.beta.threads.runs.stream(
                thread_id=thread.id,
                assistant_id=assistant.id,
                event_handler=EventHandler(),
            ) as stream:
                stream.until_done()
        finally:
            # Clean up the message even if the run or the handler failed
            client.beta.threads.messages.delete(
                message_id=thread_message.id, thread_id=thread.id)
    finally:
        # Always delete the uploaded file so failed runs do not leak uploads
        client.files.delete(uploaded.id)

Now you can run the cleaner

In [None]:
# Build the paths with os.path.join so the notebook also runs outside Windows
# (on Windows this yields the same backslash-separated strings as before)
original_dir = os.path.join('..', 'data', 'Development-Control-md')
new_dir = os.path.join('..', 'data', 'DC-cleaned-md')

# Copy directory structure without files
copy_directory_structure_only(original_dir, new_dir)

# Process each Markdown file
for root, dirs, files in os.walk(original_dir):
    for name in files:
        if not name.endswith('.md'):
            continue
        old_md_path = os.path.join(root, name)
        # Same relative location, but under the cleaned-output directory
        new_md_path = os.path.join(
            new_dir, os.path.relpath(old_md_path, original_dir))
        try:
            task = process_markdown_file(old_md_path, new_md_path)
        except Exception as e:
            # Top-level boundary: report the failure and carry on with the next file.
            # Python 3 exceptions have no `.message` attribute, so print the exception itself.
            print(f"Failed to clean to {new_md_path}")
            print(f"{type(e).__name__}: {e}", e.args)
        else:
            if task == 1:
                print(f"Skipped API call: {new_md_path}")
            else:
                print(f"Successfully cleaned to {new_md_path}")