## Train Model

In [1]:
from transformers import GPT2TokenizerFast, GPT2Config, GPT2LMHeadModel, DataCollatorForLanguageModeling, TrainingArguments, Trainer
import torch
from datasets import load_from_disk

In [2]:
print("CUDA available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0))

CUDA available: True
GPU name: NVIDIA GeForce RTX 4060


In [3]:
# Load Tokenizer

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [4]:
# Load Tokenized Dataset
# Data has 1,213,568 samples so reducing to 100k for time efficency

full_dataset = load_from_disk("tokenized_data/cooper_tokenized_dataset")
subset = full_dataset.select(range(100_000))  

subset.save_to_disk("tokenized_data/cooper_subset_100k")

dataset = load_from_disk("tokenized_data/cooper_subset_100k")

Saving the dataset (0/1 shards):   0%|          | 0/100000 [00:00<?, ? examples/s]

In [5]:
# Define Model Config

config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=256,
    n_ctx=256,
    n_embd=1024,
    n_layer=24,
    n_head=16,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

model = GPT2LMHeadModel(config)
print(f"Model size: {sum(p.numel() for p in model.parameters()) / 1e6:.2f}M parameters")

Model size: 354.04M parameters


In [6]:
# Define Data Collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [7]:
# Training Arguments

training_args = TrainingArguments(
    output_dir="cooper_model_checkpoints",
    overwrite_output_dir=True,
    eval_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    logging_steps=100,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    num_train_epochs=1,
    learning_rate=5e-4,
    warmup_steps=100,
    fp16=True,
    save_total_limit=2,
    report_to="none",
    logging_dir="logs",
)


In [8]:
# Initialize Trainer
dataset_split = dataset.train_test_split(test_size=0.1)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_split["train"],
    eval_dataset=dataset_split["test"],
    data_collator=data_collator,
)

In [9]:
# Train

trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
500,6.7098,6.661433
1000,6.35,6.327674
1500,6.1518,6.112556
2000,6.0163,5.996645
2500,5.9239,5.90353
3000,5.8538,5.845378
3500,5.8297,5.796227
4000,5.751,5.750268
4500,5.7228,5.704636
5000,5.6988,5.656058


TrainOutput(global_step=5625, training_loss=6.03898296983507, metrics={'train_runtime': 52533.0982, 'train_samples_per_second': 1.713, 'train_steps_per_second': 0.107, 'total_flos': 4.179153125376e+16, 'train_loss': 6.03898296983507, 'epoch': 1.0})

In [10]:
# Save Model

model.save_pretrained("CooperLM-354M")
tokenizer.save_pretrained("CooperLM-354M")
print("Model saved to CooperLM-354M/")

Model saved to CooperLM-354M/


In [11]:
print(TrainingArguments.__module__)


transformers.training_args


## Quick Test

In [14]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import torch

def test_cooper_model(model_path="CooperLM-354M", prompt="Hello, my name is", max_length=50):
    # Load model and tokenizer
    tokenizer = GPT2TokenizerFast.from_pretrained(model_path)
    model = GPT2LMHeadModel.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu")
    
    # Tokenize prompt
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    
    # Generate
    output = model.generate(
        input_ids,
        max_length=max_length,
        do_sample=True,
        temperature=0.9,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id
    )
    
    # Decode and print result
    generated = tokenizer.decode(output[0], skip_special_tokens=True)
    print("=== Prompt ===")
    print(prompt)
    print("\n=== Completion ===")
    print(generated)

test_cooper_model()

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


=== Prompt ===
Hello, my name is

=== Completion ===
Hello, my name is the most of the city of Europe. This is also the largest city is only a total of the west-mordered by most important elevation of the country's second in the region of the city by the south. The area


In [15]:
test_cooper_model(prompt="In the distant future,")

=== Prompt ===
In the distant future,

=== Completion ===
In the distant future, he went down his "father" to the book's work as a character at the death and, when he used that he knew, it will be a new book. He was "unach and the last of the most
