In [None]:
import os
import re
import pdfplumber
from datasets import Dataset
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, TrainingArguments, Trainer

# Function to extract and split text from specific PDF pages
def extract_and_split_text(pdf_path, rules_pages=(4, 241), glossary_pages=(241, 291)):
    """Extract the rules and glossary sections of a PDF and split them into entries.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.
        rules_pages: ``(start, stop)`` zero-based, half-open slice bounds of the
            pages that hold the rules. The default selects pages 5 to 241 when
            counting from 1.
        glossary_pages: ``(start, stop)`` slice bounds of the glossary pages.
            The default selects pages 242 to 291 when counting from 1.

    Returns:
        A ``(rules, glossary_entries)`` tuple. Both items are lists of
        stripped, non-empty strings. Rules are split in front of every
        number of the form ``<digits>.<digits><optional lowercase letters>.``;
        glossary entries are split at every line that starts with an
        uppercase ASCII letter.
    """
    def page_texts(pages):
        # Extract each page exactly once and skip pages that yield no text.
        texts = (page.extract_text() for page in pages)
        return [text for text in texts if text]

    with pdfplumber.open(pdf_path) as pdf:
        rules_text = " ".join(page_texts(pdf.pages[slice(*rules_pages)]))
        # Join glossary pages with a newline so that an entry starting at the
        # top of a page can still be detected by the newline-based split below.
        glossary_text = "\n".join(page_texts(pdf.pages[slice(*glossary_pages)]))

    # The lookahead is zero-width, so without the (?<!\d) guard it would also
    # match in the middle of a multi-digit number ("100.1." -> "1", "0",
    # "0.1.") and re.split would cut the rule number into fragments.
    rules = re.split(r'(?<!\d)(?=\d+\.\d+[a-z]*\.)', rules_text)
    glossary_entries = re.split(r'\n(?=[A-Z])', glossary_text)

    return (
        [rule.strip() for rule in rules if rule.strip()],
        [entry.strip() for entry in glossary_entries if entry.strip()],
    )

# Pull the individual rules and glossary entries out of the rules PDF
rules, glossary = extract_and_split_text('data/raw/mtg_comp_rules.pdf')

# Keep only the first line of every glossary entry
processed_glossary = [entry.partition('\n')[0] for entry in glossary]

# Build the training texts: one prompt per rule, then one per glossary line
data = {
    "text": [
        *(f"Explain the rule: {rule}" for rule in rules),
        *(f"Define the term: {term}" for term in processed_glossary),
    ]
}
dataset = Dataset.from_dict(data)

# Checkpoint name shared by the tokenizer here and the model loaded further down
model_name = "EleutherAI/gpt-neo-2.7B"

# Load the tokenizer (the model itself is loaded later) and make sure a
# padding token exists, because tokenize_function pads every example.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    # Fall back to the end-of-sequence token. Padding positions then carry the
    # same token as EOS, so only the attention mask tells them apart.
    tokenizer.pad_token = tokenizer.eos_token  # Use eos_token if pad_token is not set

# Tokenize the dataset and include labels
def tokenize_function(examples):
    """Tokenize ``examples["text"]`` and attach causal-LM labels.

    Every example is truncated or padded to exactly 512 tokens. The labels
    repeat the input ids, except that padding positions (attention mask 0)
    are set to -100 so they are left out of the loss. Without the mask the
    loss would also be computed on the padding, which uses the EOS token
    whenever the tokenizer has no dedicated pad token.

    Args:
        examples: Mapping with a ``"text"`` entry holding either a list of
            strings (batched ``Dataset.map``) or a single string.

    Returns:
        The tokenizer output with an added ``"labels"`` entry shaped like
        ``"input_ids"``.
    """
    model_inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
    input_ids = model_inputs["input_ids"]
    attention_mask = model_inputs["attention_mask"]

    def mask_padding(ids, mask):
        # -100 is the label value that the loss computation skips.
        return [token if keep else -100 for token, keep in zip(ids, mask)]

    if input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched call: one id list per example.
        labels = [mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)]
    else:
        # Single example (or empty batch): a flat id list.
        labels = mask_padding(input_ids, attention_mask)

    model_inputs["labels"] = labels
    return model_inputs

# Run the tokenizer over the whole dataset in batches
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Instantiate GPT-Neo from the same checkpoint as the tokenizer
model = GPTNeoForCausalLM.from_pretrained(model_name)

# Hyperparameters and bookkeeping options for the Trainer
training_args = TrainingArguments(
    # Where checkpoints and logs are written; no external reporting
    output_dir="./results",
    logging_dir='./logs',
    report_to="none",
    # Evaluate and checkpoint once per epoch
    eval_strategy="epoch",  # formerly named 'evaluation_strategy'
    save_strategy="epoch",
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2
)

# Hold out 10% of the examples for validation. training_args requests an
# evaluation after every epoch, and that cannot run without an eval_dataset.
# The fixed seed keeps the split reproducible between runs.
split_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

# Set up the trainer with separate training and validation sets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_datasets["train"],
    eval_dataset=split_datasets["test"],
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Save the fine-tuned model weights and configuration
# NOTE(review): the tokenizer is not written to this directory; confirm that
# whoever loads the model gets the tokenizer from the base checkpoint instead.
model.save_pretrained("models/fine_tuned_model")

Map: 100%|████████████████████████████████████████████████████████████████| 5482/5482 [00:02<00:00, 2020.06 examples/s]


Epoch,Training Loss,Validation Loss
