In [1]:
model_name = "gpt2"

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from torch.optim import SGD
import torch 

# Load the dataset
ds = load_dataset("wikipedia", "20220301.simple")

# Calculate the size of the subsample (1% of the 'train' split)
subsample_size = int(0.1 * len(ds['train']))

# Create a random subsample of the dataset
subsample = ds['train'].shuffle(seed=42).select(range(subsample_size))

# Load tokenizer for your model
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Tokenize the subsample
def tokenize_function(examples):
    # Truncation and padding are typically handled here if necessary
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

tokenized_docs = subsample.map(tokenize_function, batched=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [3]:
example = tokenized_docs[0]
print(example.keys())  # Should include 'input_ids' and potentially 'attention_mask'
print(example['input_ids'][:10])  # Print first 10 token IDs to inspect

dict_keys(['id', 'url', 'title', 'text', 'input_ids', 'attention_mask'])
[36949, 16698, 46082, 1081, 354, 357, 17543, 1478, 11, 41435]


In [4]:
from torch.utils.data import DataLoader

def select_model_inputs(batch):
    return {
        "input_ids": batch["input_ids"],
        "attention_mask": batch["attention_mask"]
    }

# Apply the function to filter out only the necessary fields
model_inputs = tokenized_docs.map(select_model_inputs, batched=True)

# Manually collate a batch
def manual_collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]
    return {
        'input_ids': torch.tensor(input_ids, dtype=torch.long),
        'attention_mask': torch.tensor(attention_mask, dtype=torch.long)
    }

dataloader = DataLoader(model_inputs, batch_size=16, collate_fn=manual_collate_fn)

In [5]:
config = GPT2Config(vocab_size=len(tokenizer), n_positions=512)
model = GPT2LMHeadModel(config)
model.to(torch.device("cuda"))
optimizer = SGD(model.parameters(), lr=0.001)

In [None]:
# Training loop
num_epochs = 1
model.train()

for epoch in range(num_epochs):
    for batch in dataloader:
        optimizer.zero_grad()
        outputs = model(input_ids=batch["input_ids"], labels=batch["input_ids"])
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        print(f"Loss: {loss.item()}")


Loss: 10.225703239440918
Loss: 6.9129319190979
Loss: 4.700535297393799
Loss: 4.12257719039917
Loss: 4.641554355621338
Loss: 4.440639972686768
Loss: 3.6101434230804443
Loss: 1.9917831420898438
Loss: 3.634214162826538
Loss: 3.321385383605957
Loss: 3.134232997894287
Loss: 4.1511406898498535
Loss: 3.767796277999878
Loss: 3.223653793334961
Loss: 2.725511312484741
Loss: 4.277787208557129
