In [1]:
from transformers import AutoTokenizer, AutoModel
from datasets import load_from_disk, DatasetDict, concatenate_datasets, Dataset
import transformers
import torch

In [2]:
# Record the runtime environment (PyTorch build, CUDA toolkit version,
# GPU availability, transformers release) so results can be tied to it.
environment_report = (
    torch.__version__,
    torch.version.cuda,
    torch.cuda.is_available(),
    transformers.__version__,
)
for entry in environment_report:
    print(entry)

2.9.1+cu130
13.0
True
4.57.3


In [3]:
# Load the dataset stored in the local "multiauthor_dataset" directory (path is
# relative to the working directory). Later cells index it by split name,
# e.g. dataset["train_easy"].
dataset = load_from_disk(r"multiauthor_dataset")

Flatten the dataset for tokenization: expand each document into one row per sentence, paired with its change label.

In [4]:
def flatten_dataset(dataset):
    """Expand document-level records into one row per sentence.

    Each record in ``dataset`` is read through the keys ``"id"``, ``"text"``,
    ``"changes"``, ``"difficulty"`` and ``"authors"``. Entries of ``"text"``
    and ``"changes"`` are paired positionally; every pair becomes one output
    row that also carries the parent record's id, difficulty and authors.

    Note: pairing uses ``zip``, so it stops at the shorter of the two
    sequences. If ``"changes"`` has fewer entries than ``"text"``, the
    trailing sentences produce no rows.

    Args:
        dataset: Iterable of mapping-like records with the keys listed above.

    Returns:
        A ``Dataset`` with the columns ``problem_id``, ``sentence``, ``label``,
        ``difficulty`` and ``authors``.
    """
    rows = [
        {
            "problem_id": record["id"],
            "sentence": sentence,
            "label": change_flag,
            "difficulty": record["difficulty"],
            "authors": record["authors"],
        }
        for record in dataset
        for sentence, change_flag in zip(record["text"], record["changes"])
    ]
    return Dataset.from_list(rows)


Tokenize the flattened dataset so that it is ready for model training.

In [5]:
# Tokenizer for the mmBERT-base checkpoint on the Hugging Face Hub.
# `trust_remote_code=True` was removed: it permits Python code stored in the
# Hub repository to execute locally, and should not be needed here because
# mmBERT uses the ModernBERT architecture that is built into recent
# transformers releases (this notebook runs 4.57.3).
# NOTE(review): if loading ever fails asking for remote code, review the
# repository's code and pin a `revision` before re-enabling the flag.
tokenizer = AutoTokenizer.from_pretrained("jhu-clsp/mmBERT-base")

def tokenize_fn(example):
    """Tokenize the ``"sentence"`` field of an example or batch.

    The input is truncated and padded to a fixed length of 128 tokens, so
    every encoded sequence has the same length.

    Args:
        example: Mapping with a ``"sentence"`` entry. With
            ``Dataset.map(..., batched=True)`` this is a batch whose
            ``"sentence"`` value is a list of strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that input.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(example["sentence"], **encode_options)

# Names of the partitions to process; each must be a key of `dataset`.
splits = [
    "train_easy",
    "train_medium",
    "train_hard",
    "valid_easy",
    "valid_medium",
    "valid_hard",
]

# For every partition: flatten to sentence-level rows, then tokenize in batches.
tokenized_splits = {}
for split_name in splits:
    flat = flatten_dataset(dataset[split_name])
    tokenized = flat.map(tokenize_fn, batched=True)
    tokenized_splits.update({split_name: tokenized})

# Bundle the per-partition results so they can be handled as one object.
tokenized_dataset = DatasetDict(tokenized_splits)

Map:   0%|          | 0/48402 [00:00<?, ? examples/s]

Map:   0%|          | 0/58817 [00:00<?, ? examples/s]

Map:   0%|          | 0/51061 [00:00<?, ? examples/s]

Map:   0%|          | 0/10247 [00:00<?, ? examples/s]

Map:   0%|          | 0/12759 [00:00<?, ? examples/s]

Map:   0%|          | 0/10648 [00:00<?, ? examples/s]

In [6]:
# Show the DatasetDict summary (split names, columns, row counts) as the cell output.
tokenized_dataset

DatasetDict({
    train_easy: Dataset({
        features: ['problem_id', 'sentence', 'label', 'difficulty', 'authors', 'input_ids', 'attention_mask'],
        num_rows: 48402
    })
    train_medium: Dataset({
        features: ['problem_id', 'sentence', 'label', 'difficulty', 'authors', 'input_ids', 'attention_mask'],
        num_rows: 58817
    })
    train_hard: Dataset({
        features: ['problem_id', 'sentence', 'label', 'difficulty', 'authors', 'input_ids', 'attention_mask'],
        num_rows: 51061
    })
    valid_easy: Dataset({
        features: ['problem_id', 'sentence', 'label', 'difficulty', 'authors', 'input_ids', 'attention_mask'],
        num_rows: 10247
    })
    valid_medium: Dataset({
        features: ['problem_id', 'sentence', 'label', 'difficulty', 'authors', 'input_ids', 'attention_mask'],
        num_rows: 12759
    })
    valid_hard: Dataset({
        features: ['problem_id', 'sentence', 'label', 'difficulty', 'authors', 'input_ids', 'attention_mask'],
     

Save the tokenized dataset to local disk so it can be reloaded later without re-tokenizing.

In [7]:
# Write every tokenized split to the "mmbert_tokenized_dataset" directory
# (relative to the working directory); it can be read back with load_from_disk.
tokenized_dataset.save_to_disk("mmbert_tokenized_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/48402 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/58817 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/51061 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10247 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12759 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10648 [00:00<?, ? examples/s]