# Dataset generation

This will generate 8 files in the `dataset` folder (a `train` and a `test` file for each of the four operations), each containing 50000 examples of Roman numeral arithmetic operations.

In [1]:
import random
import os
from roman import toRoman, fromRoman

def generate_roman_arithmetic_dataset(operation, num_examples=50000):
    """Generate random Roman numeral arithmetic examples for one operation.

    Each example is a string of the form ``"<a> <op> <b> = <result>"`` in
    which all three numbers are rendered with ``toRoman``. Candidates whose
    result falls outside 1..3999 are discarded and re-drawn.

    Args:
        operation: One of ``"+"``, ``"-"``, ``"*"`` or ``"/"``.
        num_examples: Number of examples to return.

    Returns:
        A list of ``num_examples`` formatted example strings.

    Raises:
        ValueError: If ``operation`` is not one of the supported symbols.
    """
    # Fail early with a clear message; otherwise an unknown symbol would
    # leave ``result`` unassigned and surface as an UnboundLocalError.
    if operation not in ("+", "-", "*", "/"):
        raise ValueError(f"Unsupported operation: {operation!r}")

    dataset = []
    while len(dataset) < num_examples:
        if operation == "/":
            # Build the dividend from the quotient so the division is exact;
            # 39 * 100 keeps the dividend at or below 3900.
            b = random.randint(1, 100)
            result = random.randint(1, 39)
            a = result * b
        else:
            a = random.randint(1, 3999)
            b = random.randint(1, 3999)

            if operation == "+":
                result = a + b
            elif operation == "-":
                # Put the larger operand first so the difference is never
                # negative.
                a, b = max(a, b), min(a, b)
                result = a - b
            else:
                # Most random pairs overflow 3999, so this branch retries
                # many times before a candidate is accepted.
                result = a * b

        # Retry on results outside 1..3999. Zero (from a == b in subtraction)
        # has no standard Roman numeral; depending on the installed library
        # version toRoman may reject it or emit a non-standard symbol.
        if not 1 <= result <= 3999:
            continue

        roman_a = toRoman(a)
        roman_b = toRoman(b)
        roman_result = toRoman(result)

        dataset.append(f"{roman_a} {operation} {roman_b} = {roman_result}")

    return dataset

# Make sure the output folder is present before anything is written into it
os.makedirs("dataset", exist_ok=True)

# Operator symbols and, position by position, the names used in file names
operations = ["+", "-", "*", "/"]
operation_names = ["addition", "subtraction", "multiplication", "division"]

# One file per (split, operation) pair: dataset/<mode>_<name>.txt
for mode in ("train", "test"):
    for index, op in enumerate(operations):
        name = operation_names[index]
        filename = f"dataset/{mode}_{name}.txt"

        # Each call draws a fresh random sample, so the train and test files
        # are generated independently of each other.
        dataset = generate_roman_arithmetic_dataset(op)
        contents = "\n".join(dataset)

        with open(filename, "w") as f:
            f.write(contents)

        print(f"Generated {name} dataset: {filename}")

print("All datasets generated successfully.")

Generated addition dataset: dataset/addition.txt
Generated subtraction dataset: dataset/subtraction.txt
Generated multiplication dataset: dataset/multiplication.txt
Generated division dataset: dataset/division.txt
All datasets generated successfully.


# Fine-tuning on Roman Numeral Dataset

In [5]:
import torch
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TextDataset,
    DataCollatorForLanguageModeling,
)
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, concatenate_datasets

In [None]:
def load_dataset(file_path, tokenizer, block_size=128):
    """Build a ``TextDataset`` over a plain-text file.

    NOTE: this definition shadows the ``load_dataset`` name imported from
    ``datasets`` at the top of the file; after this cell runs, the name refers
    to this helper.

    Args:
        file_path: Path of the text file to read.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Forwarded to ``TextDataset`` as ``block_size``. Defaults
            to 128, the value that was previously hard-coded.

    Returns:
        The constructed ``TextDataset``.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

# Load pre-trained model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Add padding token to tokenizer
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

# Prepare datasets. The generation cell writes "dataset/{mode}_{name}.txt",
# so the split prefix has to be part of each path; the un-prefixed
# "dataset/addition.txt" style names are never created by that cell.
# NOTE(review): training uses addition while evaluation uses subtraction, as
# before; confirm that evaluating on a different operation is intended.
train_dataset = load_dataset("dataset/train_addition.txt", tokenizer)
eval_dataset = load_dataset("dataset/test_subtraction.txt", tokenizer)

# Data collator (mlm=False selects causal language modelling rather than
# masked language modelling)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Single location for checkpoints and for the final saved model/tokenizer
output_dir = "./gpt2-roman-math"

# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # save_steps is kept a multiple of eval_steps, which
    # load_best_model_at_end requires when both run on a step schedule.
    eval_steps=400,
    save_steps=800,
    warmup_steps=500,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    evaluation_strategy="steps",
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model together with its tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)