In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from utils import TrainingConfig, Config

# Tokenizer: the pretrained "gpt2" checkpoint (swap in a custom one here).
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Training settings are built with their defaults; the model config takes its
# vocabulary size from the tokenizer so the two stay in sync.
training_config = TrainingConfig()
config = Config(
    vocab_size=tokenizer.vocab_size,
    d_model=768,
    num_heads=12,
    ffn_dim=3072,
    num_layers=12,
)

# 1. Load the raw text (the "train" split of openwebtext).
# NOTE(review): trust_remote_code=True allows the dataset's own loading code
# to run locally -- confirm that is acceptable in this environment.
ds = load_dataset("openwebtext", split="train", trust_remote_code=True)

Loading dataset shards:   0%|          | 0/80 [00:00<?, ?it/s]

In [2]:
# Work on the first NUM_DOCS documents only (presumably to keep experiments
# fast -- raise the limit for a real run), then peek at the first record.
NUM_DOCS = 1000
ds = ds.select(range(NUM_DOCS))
ds[0]

{'text': 'Port-au-Prince, Haiti (CNN) -- Earthquake victims, writhing in pain and grasping at life, watched doctors and nurses walk away from a field hospital Friday night after a Belgian medical team evacuated the area, saying it was concerned about security.\n\nThe decision left CNN Chief Medical Correspondent Sanjay Gupta as the only doctor at the hospital to get the patients through the night.\n\nCNN initially reported, based on conversations with some of the doctors, that the United Nations ordered the Belgian First Aid and Support Team to evacuate. However, Belgian Chief Coordinator Geert Gijs, a doctor who was at the hospital with 60 Belgian medical personnel, said it was his decision to pull the team out for the night. Gijs said he requested U.N. security personnel to staff the hospital overnight, but was told that peacekeepers would only be able to evacuate the team.\n\nHe said it was a "tough decision" but that he accepted the U.N. offer to evacuate after a Canadian medical t

In [3]:
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably required because the "gpt2" tokenizer has no pad
# token of its own -- confirm. Be aware that anything which masks padding by
# token id will then also treat genuine end-of-sequence tokens as padding.
tokenizer.pad_token = tokenizer.eos_token

def tokenize(batch):
    """Tokenize the ``"text"`` entries of a batch.

    The module-level ``tokenizer`` is called with its default options (no
    extra arguments are passed), and its result is returned unchanged.

    Args:
        batch: Mapping with a ``"text"`` entry holding the documents to encode.

    Returns:
        Whatever ``tokenizer`` returns for those documents.
    """
    texts = batch["text"]
    encoded = tokenizer(texts)
    return encoded

# Tokenize every document in batches, replacing the raw "text" column, then
# drop the attention mask so that only "input_ids" is left.
tokenized = (
    ds.map(tokenize, batched=True, remove_columns=["text"])
    .remove_columns(["attention_mask"])
)
tokenized

Dataset({
    features: ['input_ids'],
    num_rows: 1000
})

In [4]:
def group(batch, block_size=None):
    """Pack a batch of tokenized documents into fixed-length LM examples.

    All ``input_ids`` sequences in ``batch`` are concatenated into a single
    token stream, which is cut into consecutive blocks of ``block_size``
    tokens. The labels of a block are the same window moved one token to the
    right, i.e. the next-token targets for every position of the block.

    Args:
        batch: Mapping with an ``"input_ids"`` entry holding a list of
            token-id lists (one list per document).
        block_size: Number of tokens per example. Defaults to
            ``config.max_seq_len`` when omitted.

    Returns:
        Dict with ``"input_ids"`` and ``"labels"``: two lists with the same
        number of blocks, every block holding exactly ``block_size`` tokens.
        Trailing tokens that cannot form a block with a full set of labels
        are dropped.
    """
    if block_size is None:
        block_size = config.max_seq_len

    # Flatten the per-document token lists into one stream. A nested
    # comprehension is linear, whereas sum(lists, []) re-copies the growing
    # result for every document.
    stream = [token for ids in batch["input_ids"] for token in ids]

    # A block starting at i needs tokens i .. i + block_size (inclusive) so
    # that its last position still has a next-token label. Counting blocks
    # over len(stream) - 1 guarantees that; without the -1 a stream whose
    # length is an exact multiple of block_size yields a final label block
    # that is one token short.
    num_blocks = max(len(stream) - 1, 0) // block_size
    starts = range(0, num_blocks * block_size, block_size)

    return {
        "input_ids": [stream[i:i + block_size] for i in starts],
        "labels": [stream[i + 1:i + block_size + 1] for i in starts],
    }


# Pack the tokenized documents into fixed-length examples. `group` receives up
# to GROUP_BATCH_SIZE documents per call and re-chunks them together.
GROUP_BATCH_SIZE = 10000
lm_ds = tokenized.map(group, batched=True, batch_size=GROUP_BATCH_SIZE)
lm_ds

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 1099
})

In [None]:
from roformer import RoFormerEncoder, RoFormerForCausalLM
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch

# Build the model: the encoder is handed to the causal-LM wrapper together
# with the shared config.
model_base = RoFormerEncoder(config)
model = RoFormerForCausalLM(model_base, config)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(f"Using device: {device}")
model = model.to(device)

# NOTE(review): with mlm=False this collator is documented to rebuild "labels"
# from "input_ids" and to set pad-token positions to -100. That would discard
# the shifted "labels" column of lm_ds and, because pad == eos here, ignore
# every end-of-sequence token in the loss. Confirm whether RoFormerForCausalLM
# shifts labels itself; if it expects pre-shifted labels a collator that keeps
# the dataset's "labels" is needed instead.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

training_kwargs = dict(
    # Where checkpoints and logs go.
    output_dir="roformer-base",
    logging_dir="logs",
    report_to="tensorboard",
    # Optimisation schedule.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    warmup_steps=10,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    # Logging / checkpoint cadence.
    logging_steps=10,
    save_strategy="steps",
    save_steps=10,
    save_total_limit=5,
    # The model has tensors that share memory (e.g. embeddings and lm_head),
    # which safetensors refuses to serialise; presumably that is why
    # safetensors saving is turned off here.
    save_safetensors=False,
)
args = TrainingArguments(**training_kwargs)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=lm_ds,
    data_collator=data_collator,
)

Using device: cuda


In [6]:
# Toggle for the training run: set to False to skip training when
# re-executing the notebook.
RUN_TRAINING = True
if RUN_TRAINING:
    trainer.train()


Step,Training Loss


RuntimeError: 
            Some tensors share memory, this will lead to duplicate memory on disk and potential differences when loading them again: [{'backbone.embeddings.weight', 'lm_head.weight'}, {'backbone.layers.0.self_attn.W_O.weight', 'backbone.layers.0.W_O.weight'}, {'backbone.layers.0.self_attn.W_O.bias', 'backbone.layers.0.W_O.bias'}, {'backbone.layers.1.W_O.weight', 'backbone.layers.1.self_attn.W_O.weight'}, {'backbone.layers.1.self_attn.W_O.bias', 'backbone.layers.1.W_O.bias'}, {'backbone.layers.2.self_attn.W_O.weight', 'backbone.layers.2.W_O.weight'}, {'backbone.layers.2.W_O.bias', 'backbone.layers.2.self_attn.W_O.bias'}, {'backbone.layers.3.W_O.weight', 'backbone.layers.3.self_attn.W_O.weight'}, {'backbone.layers.3.self_attn.W_O.bias', 'backbone.layers.3.W_O.bias'}, {'backbone.layers.4.W_O.weight', 'backbone.layers.4.self_attn.W_O.weight'}, {'backbone.layers.4.self_attn.W_O.bias', 'backbone.layers.4.W_O.bias'}, {'backbone.layers.5.W_O.weight', 'backbone.layers.5.self_attn.W_O.weight'}, {'backbone.layers.5.self_attn.W_O.bias', 'backbone.layers.5.W_O.bias'}, {'backbone.layers.6.self_attn.W_O.weight', 'backbone.layers.6.W_O.weight'}, {'backbone.layers.6.self_attn.W_O.bias', 'backbone.layers.6.W_O.bias'}, {'backbone.layers.7.W_O.weight', 'backbone.layers.7.self_attn.W_O.weight'}, {'backbone.layers.7.self_attn.W_O.bias', 'backbone.layers.7.W_O.bias'}, {'backbone.layers.8.W_O.weight', 'backbone.layers.8.self_attn.W_O.weight'}, {'backbone.layers.8.self_attn.W_O.bias', 'backbone.layers.8.W_O.bias'}, {'backbone.layers.9.self_attn.W_O.weight', 'backbone.layers.9.W_O.weight'}, {'backbone.layers.9.self_attn.W_O.bias', 'backbone.layers.9.W_O.bias'}, {'backbone.layers.10.self_attn.W_O.weight', 'backbone.layers.10.W_O.weight'}, {'backbone.layers.10.self_attn.W_O.bias', 'backbone.layers.10.W_O.bias'}, {'backbone.layers.11.W_O.weight', 'backbone.layers.11.self_attn.W_O.weight'}, {'backbone.layers.11.self_attn.W_O.bias', 'backbone.layers.11.W_O.bias'}].
            A potential way to correctly save your model is to use `save_model`.
            More information at https://huggingface.co/docs/safetensors/torch_shared_tensors
            