In [None]:
# !pip install accelerate
# !pip install accelerate -U
# !pip install transformers[torch]



In [None]:
import glob

data_dir = 'datasets/babylm_10M/*.train'     # change if needed


def concatenate_text_files(paths, out_path):
    """Concatenate text files into a single UTF-8 text file.

    Args:
        paths: Iterable of input file paths; written in the order given.
        out_path: Destination file; created or overwritten.

    Returns:
        The number of input files written.

    A newline is appended after any non-empty input whose last line lacks
    one, so the final line of one file is never fused with the first line
    of the next file.
    """
    n_files = 0
    with open(out_path, "w", encoding="utf-8") as outfile:
        for path in paths:
            last_line = ""
            with open(path, "r", encoding="utf-8") as infile:
                # Stream line by line instead of holding a whole file in memory.
                for last_line in infile:
                    outfile.write(last_line)
            if last_line and not last_line.endswith("\n"):
                outfile.write("\n")
            n_files += 1
    return n_files


# Use glob to get all .train files in the directory.
# Sorted because glob does not guarantee an order, and the corpus order
# should be the same on every run.
file_paths = sorted(glob.glob(data_dir))
if not file_paths:
    print(f"WARNING: no files matched {data_dir!r}; combined_dataset.txt will be empty")

# Concatenate all text files into one big text file
n_combined_files = concatenate_text_files(file_paths, "combined_dataset.txt")


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset
import torch

# Get text data from the concatenated text file, one example per line.
# Blank lines are dropped: they carry no text and would only contribute
# empty training examples.
with open("combined_dataset.txt", "r", encoding="utf-8") as file:
    text_data = [line for line in file.read().splitlines() if line.strip()]

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Set the padding token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the text data.
# No padding and no tensor conversion here: padding the whole corpus to its
# longest line (up to 512 tokens) materialises a huge, mostly-padding tensor.
# The examples stay as variable-length lists of token ids and the
# DataCollatorForLanguageModeling used for training pads each batch to the
# longest example in that batch.
inputs = tokenizer(text_data, truncation=True, max_length=512)

# Create a PyTorch dataset
class TextDataset(Dataset):
    """Map-style dataset over tokenizer output.

    `encodings` is a mapping (for example the object returned by the
    tokenizer) from field name to a per-example sequence. It must contain
    an "input_ids" entry, which determines the dataset length. Values may be
    tensors (already padded) or lists of token ids (variable length).
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example `idx` as a dict mapping field name to a tensor."""
        return {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Number of examples, taken from the "input_ids" field."""
        # Key lookup (not attribute access) so a plain dict works as well.
        return len(self.encodings["input_ids"])

    @staticmethod
    def _as_tensor(row):
        """Return an independent tensor holding the values of `row`."""
        if torch.is_tensor(row):
            # torch.tensor(existing_tensor) emits a copy-construct UserWarning
            # on every item; clone().detach() is the recommended way to copy.
            return row.clone().detach()
        return torch.tensor(row)

dataset = TextDataset(inputs)

# Initialize the model
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Resize the token embeddings only when the tokenizer's vocabulary size
# actually differs from the model's; no tokens are added above, so this is
# normally skipped.
if model.get_input_embeddings().num_embeddings != len(tokenizer):
    model.resize_token_embeddings(len(tokenizer))

# Initialize the data collator (mlm=False -> causal language modeling)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Initialize the training arguments.
# Training length is set by max_steps alone: when max_steps is given it
# overrides num_train_epochs, so no epoch count is passed.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    max_steps=100000,                # Limit the total number of training steps to 100000
    per_device_train_batch_size=8,   # batch size per device during training
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # Log every X updates steps
    log_level='info',                # Set the logger to the 'info' level
    log_level_replica='info',        # Set the logger of the replicas to the 'info' level
    save_total_limit=2,              # Keep only the newest checkpoints so ./results cannot fill the disk
)

# Initialize the trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=dataset,               # training dataset
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the final model and tokenizer
model.save_pretrained('gpt2_dir/')
tokenizer.save_pretrained('gpt2_dir/')

loading file vocab.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/vocab.json
loading file merges.txt from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/tokenizer_config.json
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/tokenizer.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],


Step,Training Loss
10,4.7421
20,4.6183
30,4.8155
40,4.8899
50,4.7847
60,4.6842
70,4.6164
80,4.436
90,4.5965
100,4.5189


Saving model checkpoint to ./results/tmp-checkpoint-500
Configuration saved in ./results/tmp-checkpoint-500/config.json
Configuration saved in ./results/tmp-checkpoint-500/generation_config.json
Model weights saved in ./results/tmp-checkpoint-500/model.safetensors
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to ./results/tmp-checkpoint-1000
Configuration saved in ./results/tmp-checkpoint-1000/config.json
Configuration saved in ./results/tmp-checkpoint-1000/generation_config.json
Model weights saved in ./results/tmp-checkpoint-1000/model.safetensors
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to ./results/tmp-checkpoint-1500
Configuration saved in ./results/tmp-checkpoint-1500/config.json
Configuration saved in ./results/tmp-checkpoint-1500/generation_config.json
Model weights saved in ./results/tmp-checkpoint-1500/model.safetensors
  return {key: torch.tensor(val[idx]) for

SafetensorError: Error while serializing: IoError(Os { code: 28, kind: StorageFull, message: "No space left on device" })

In [None]:
# %%shell
# cd results
# find . -type d -name 'checkpoint-*' ! -name 'checkpoint-61500' -exec rm -r {} +



In [None]:
# Repeat the imports so this cell can resume training on a fresh kernel
# without first re-running the full training cell.
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset
import torch

# Initialize the model from a checkpoint
model = GPT2LMHeadModel.from_pretrained('results/checkpoint-61500')

# Initialize the tokenizer from the base 'gpt2' files; the vocabulary is never
# extended in this notebook.
# NOTE(review): the checkpoint log lines list only config and weight files, so
# the checkpoint directory presumably holds no tokenizer files - confirm
# before loading the tokenizer from 'results/checkpoint-61500' instead.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Set the padding token
tokenizer.pad_token = tokenizer.eos_token

# Get text data from the concatenated text file, one example per line.
# Blank lines are dropped: they carry no text and would only contribute
# empty training examples.
# NOTE(review): if the checkpoint was trained with blank lines included, the
# dataset length differs and the resumed run will presumably not replay the
# exact same batch order - confirm this is acceptable.
with open("combined_dataset.txt", "r", encoding="utf-8") as file:
    text_data = [line for line in file.read().splitlines() if line.strip()]

# Tokenize the text data.
# No padding and no tensor conversion here: padding the whole corpus to its
# longest line (up to 512 tokens) materialises a huge, mostly-padding tensor.
# The examples stay as variable-length lists of token ids and the
# DataCollatorForLanguageModeling used for training pads each batch to the
# longest example in that batch.
inputs = tokenizer(text_data, truncation=True, max_length=512)

# Create a PyTorch dataset
class TextDataset(Dataset):
    """Map-style dataset over tokenizer output.

    `encodings` is a mapping (for example the object returned by the
    tokenizer) from field name to a per-example sequence. It must contain
    an "input_ids" entry, which determines the dataset length. Values may be
    tensors (already padded) or lists of token ids (variable length).
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example `idx` as a dict mapping field name to a tensor."""
        return {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Number of examples, taken from the "input_ids" field."""
        # Key lookup (not attribute access) so a plain dict works as well.
        return len(self.encodings["input_ids"])

    @staticmethod
    def _as_tensor(row):
        """Return an independent tensor holding the values of `row`."""
        if torch.is_tensor(row):
            # torch.tensor(existing_tensor) emits a copy-construct UserWarning
            # on every item; clone().detach() is the recommended way to copy.
            return row.clone().detach()
        return torch.tensor(row)

dataset = TextDataset(inputs)

# Resize the token embeddings only when the tokenizer's vocabulary size
# actually differs from the model's; resizing to the same size does nothing
# useful and just logs a `pad_to_multiple_of` warning.
if model.get_input_embeddings().num_embeddings != len(tokenizer):
    model.resize_token_embeddings(len(tokenizer))

# Initialize the data collator (mlm=False -> causal language modeling)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Initialize the training arguments.
# Training length is set by max_steps alone: when max_steps is given it
# overrides num_train_epochs, so no epoch count is passed.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    max_steps=100000,                # Limit the total number of training steps to 100000
    per_device_train_batch_size=8,   # batch size per device during training
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # Log every X updates steps
    log_level='info',                # Set the logger to the 'info' level
    log_level_replica='info',        # Set the logger of the replicas to the 'info' level
    save_steps=1000,                 # Save a checkpoint every 1000 steps
    # NOTE(review): the captured output of this cell shows checkpoints at
    # 62000, 62500 and 63000, i.e. still every 500 steps, so save_steps alone
    # did not reduce disk usage. save_total_limit bounds it regardless.
    save_total_limit=2,              # Keep only the newest checkpoints so ./results cannot fill the disk
)

# Initialize the trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=dataset,               # training dataset
    data_collator=data_collator,
)

# Train the model from a checkpoint.
# With save_total_limit set, older checkpoints (eventually including this
# one) are removed as new ones are written; update the path before
# re-running this cell after a later interruption.
trainer.train(resume_from_checkpoint='results/checkpoint-61500')

# Save the final model and tokenizer
model.save_pretrained('model_dir/')
tokenizer.save_pretrained('model_dir/')

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 50257. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
max_steps is given, it will override any value given in num_train_epochs
Loading model from results/checkpoint-61500.
The

Step,Training Loss
61510,4.1568
61520,3.9302
61530,4.0074
61540,3.8715
61550,3.9083
61560,4.0889
61570,4.0552
61580,3.7556
61590,4.1005
61600,3.6772


Saving model checkpoint to ./results/tmp-checkpoint-62000
Configuration saved in ./results/tmp-checkpoint-62000/config.json
Configuration saved in ./results/tmp-checkpoint-62000/generation_config.json
Model weights saved in ./results/tmp-checkpoint-62000/model.safetensors
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to ./results/tmp-checkpoint-62500
Configuration saved in ./results/tmp-checkpoint-62500/config.json
Configuration saved in ./results/tmp-checkpoint-62500/generation_config.json
Model weights saved in ./results/tmp-checkpoint-62500/model.safetensors
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to ./results/tmp-checkpoint-63000
Configuration saved in ./results/tmp-checkpoint-63000/config.json
Configuration saved in ./results/tmp-checkpoint-63000/generation_config.json
Model weights saved in ./results/tmp-checkpoint-63000/model.safetensors
  return {key: torch.tens

('model_dir/tokenizer_config.json',
 'model_dir/special_tokens_map.json',
 'model_dir/vocab.json',
 'model_dir/merges.txt',
 'model_dir/added_tokens.json')