# Dataset generation

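Running the cell below generates four files in the `dataset` folder (`addition.txt`, `subtraction.txt`, `multiplication.txt` and `division.txt`), one per operation, each containing 50,000 examples of Roman numeral arithmetic by default.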

In [1]:
import random
import os
from roman import toRoman, fromRoman

def generate_roman_arithmetic_dataset(operation, num_examples=50000):
    """Build a list of random Roman-numeral arithmetic examples.

    Each example is a string of the form ``"<a> <op> <b> = <result>"`` in
    which both operands and the result are rendered with ``toRoman``.

    Args:
        operation: One of ``"+"``, ``"-"``, ``"*"`` or ``"/"``.
        num_examples: Number of examples to generate.

    Returns:
        A list of ``num_examples`` formatted strings (empty when
        ``num_examples`` is zero or negative).

    Raises:
        ValueError: If ``operation`` is not one of the supported operators.
    """
    # Fail fast with a clear message; otherwise an unknown operator would
    # leave `result` unassigned and surface as an UnboundLocalError below.
    if operation not in ("+", "-", "*", "/"):
        raise ValueError(f"Unsupported operation: {operation!r}")

    dataset = []
    while len(dataset) < num_examples:
        if operation == "/":
            # Pick the divisor and the quotient first and derive the dividend,
            # so every division is exact. With quotient <= 39 and
            # divisor <= 100 the dividend never exceeds 3900.
            b = random.randint(1, 100)
            result = random.randint(1, 39)
            a = result * b
        else:
            a = random.randint(1, 3999)
            b = random.randint(1, 3999)

            if operation == "+":
                result = a + b
            elif operation == "-":
                # Put the larger operand first so the difference is never
                # negative.
                # NOTE(review): a == b gives a result of 0; confirm that the
                # installed `roman` package's toRoman accepts 0.
                if a < b:
                    a, b = b, a
                result = a - b
            else:  # operation == "*"
                result = a * b

        # Operands are at most 3999; discard the draw and try again when the
        # result would go past that same limit.
        if result > 3999:
            continue

        roman_a = toRoman(a)
        roman_b = toRoman(b)
        roman_result = toRoman(result)

        dataset.append(f"{roman_a} {operation} {roman_b} = {roman_result}")

    return dataset

# Make sure the output folder is present before any file is written.
os.makedirs("dataset", exist_ok=True)

# Operator symbols and, in matching order, the file stem used for each one.
operations = ["+", "-", "*", "/"]
operation_names = ["addition", "subtraction", "multiplication", "division"]

for op, name in zip(operations, operation_names):
    filename = f"dataset/{name}.txt"
    dataset = generate_roman_arithmetic_dataset(op)

    # One example per line, with no newline after the final example.
    with open(filename, "w") as f:
        print(*dataset, sep="\n", end="", file=f)

    print(f"Generated {name} dataset: {filename}")

print("All datasets generated successfully.")

Generated addition dataset: dataset/addition.txt
Generated subtraction dataset: dataset/subtraction.txt
Generated multiplication dataset: dataset/multiplication.txt
Generated division dataset: dataset/division.txt
All datasets generated successfully.
