## Preprocessing data

In [5]:
import os
import re
from typing import Tuple, List, Dict, Any, Optional
from PIL import Image
import requests
import torch
from datasets import load_dataset, Dataset

  from .autonotebook import tqdm as notebook_tqdm


## IMAGE PROCESSING UTILITIES

In [6]:
def url_to_image(url: str, timeout: int = 10) -> Optional[Image.Image]:
    """
    Download an image over HTTP and decode it into an RGB PIL image.

    Args:
        url: Image URL
        timeout: Request timeout in seconds

    Returns:
        PIL Image object converted to RGB, or None if the request or the
        decoding failed (the failure is printed rather than raised)
    """
    # Imported here because the visible import cell does not import ``io``.
    # Without it, ``io.BytesIO`` below raised NameError, which the ``except``
    # clause does not catch, so the error escaped to the caller.
    import io

    try:
        response = requests.get(url, timeout=timeout)
        # Turn HTTP 4xx/5xx responses into an exception handled below.
        response.raise_for_status()
        # Decode the downloaded bytes from memory and normalise to RGB.
        return Image.open(io.BytesIO(response.content)).convert("RGB")
    except (requests.exceptions.RequestException, IOError) as e:
        print(f"Failed to load image from {url}: {e}")
        return None

def extract_image_urls_from_markdown(text: str) -> Tuple[str, List[str]]:
    """
    Extract image URLs from markdown text and remove the image markup.

    Both ``![alt](url)`` and ``![alt](url "title")`` are recognised. When an
    optional quoted title is present, only the URL part is returned so that
    it can be fetched directly. Targets without a quoted title are returned
    exactly as written.

    Args:
        text: Markdown text containing image links

    Returns:
        Tuple of (cleaned_text, list_of_image_urls). In cleaned_text every
        image is replaced by a single space and the result is stripped of
        leading/trailing whitespace. URLs are listed in order of appearance.
    """
    # Pattern for markdown images: ![alt](target)
    image_pattern = r"!\[.*?\]\((.*?)\)"
    # A target of the form: url "title"  or  url 'title'
    titled_target = re.compile(r"""\s*(\S+)\s+(?:"[^"]*"|'[^']*')\s*""")

    image_urls = []
    for target in re.findall(image_pattern, text):
        titled = titled_target.fullmatch(target)
        # Only rewrite the target when a quoted title was actually found.
        image_urls.append(titled.group(1) if titled else target)

    # Remove image markdown syntax
    cleaned_text = re.sub(image_pattern, " ", text).strip()

    return cleaned_text, image_urls

def process_markdown_for_model(text: str) -> Tuple[str, List[Image.Image]]:
    """
    Split markdown into plain text and the images it links to.

    The image markup is removed from the text and each referenced URL is
    downloaded. URLs that cannot be loaded are reported and skipped, so the
    returned list may be shorter than the number of links in the input.

    Args:
        text: Input markdown text

    Returns:
        Tuple of (processed_text, list_of_pil_images)
    """
    cleaned_text, image_urls = extract_image_urls_from_markdown(text)

    images = []
    for url in image_urls:
        image = url_to_image(url)
        if not image:
            # url_to_image signals failure with None; skip this link.
            print(f"Warning: Failed to load image from {url}")
            continue
        images.append(image)

    return cleaned_text, images

## DATASET PROCESSING

In [8]:
def create_conversation_content(text: str, images: List[Image.Image]) -> List[Dict[str, Any]]:
    """
    Build the content list for one conversation turn.

    Args:
        text: Text content
        images: List of PIL images

    Returns:
        List of content dictionaries: one ``{"type": "text", ...}`` entry
        followed by one ``{"type": "image", ...}`` entry per image, in the
        order the images were given
    """
    text_part = {"type": "text", "text": text}
    image_parts = [{"type": "image", "image": picture} for picture in images]
    return [text_part, *image_parts]

def process_math_sample(sample: Dict[str, str]) -> Dict[str, List[Dict[str, Any]]]:
    """
    Convert one math problem into a two-turn conversation.

    The question becomes the user turn and the solution becomes the
    assistant turn. Both fields are run through the same markdown
    processing, so images linked from either one are included.

    Args:
        sample: Dataset sample with 'question' and 'solution' keys

    Returns:
        Dictionary with 'conversations' key containing the formatted conversation
    """
    conversations = []
    # Question first, then solution, each handled the same way.
    for role, field in (("user", "question"), ("assistant", "solution")):
        turn_text, turn_images = process_markdown_for_model(sample[field])
        conversations.append(
            {
                "role": role,
                "content": create_conversation_content(turn_text, turn_images),
            }
        )

    return {"conversations": conversations}

def prepare_dataset(dataset_name: str, split: str) -> Dataset:
    """
    Load a dataset split and convert every sample to conversation format.

    Samples that raise during processing are reported and left out of the
    result; the remaining samples keep their original order.

    Args:
        dataset_name: HuggingFace dataset name
        split: Dataset split to load

    Returns:
        Processed Dataset object
    """
    print(f"Loading dataset: {dataset_name}, split: {split}")
    raw_dataset = load_dataset(dataset_name, split=split)
    print(f"Processing {len(raw_dataset)} samples...")

    processed_data = []
    for i, sample in enumerate(raw_dataset):
        try:
            processed_data.append(process_math_sample(sample))
            # Progress report after every 100th sample (1-based count).
            if not (i + 1) % 100:
                print(f"Processed {i + 1}/{len(raw_dataset)} samples")
        except Exception as e:
            # Best effort: report the failing sample and carry on.
            print(f"Error processing sample {i}: {e}")

    print(f"Successfully processed {len(processed_data)} samples")
    return Dataset.from_list(processed_data)

In [7]:
# HuggingFace dataset name passed to prepare_dataset below.
dataset_name = "ngohongthai/exam-sixth_grade-instruct-dataset"

In [None]:
# Load the "train" split and convert it to conversation format.
train_dataset = prepare_dataset(dataset_name=dataset_name, split="train")