## Train Model

In [1]:
from pathlib import Path

import torch
from datasets import load_from_disk
from transformers import GPT2TokenizerFast, GPT2Config, GPT2LMHeadModel, DataCollatorForLanguageModeling, TrainingArguments, Trainer

In [2]:
# Report which accelerator (if any) is visible to PyTorch.
print("CUDA available:", torch.cuda.is_available())
# get_device_name(0) raises when no CUDA device is present, so only query it
# when one exists; otherwise say so explicitly instead of crashing the cell.
if torch.cuda.is_available():
    print("GPU name:", torch.cuda.get_device_name(0))
else:
    print("GPU name:", "n/a (no CUDA device detected)")

CUDA available: True
GPU name: NVIDIA GeForce RTX 4060


In [3]:
# Load Tokenizer
# Start from the pretrained "gpt2" tokenizer and reuse its end-of-sequence
# token as the padding token, so the data collator below has a pad token to use.

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [4]:
# Load Tokenized Dataset
# The full dataset has 1,213,568 samples; only the first 200k are used for time efficiency.
#
# The subset is written to disk once and reused on later runs, so re-running
# this cell does not rewrite 200k examples. Delete the subset directory to
# force it to be regenerated (e.g. after re-tokenizing the full dataset).

subset_path = Path("tokenized_data/cooper_subset_200k")

full_dataset = load_from_disk("tokenized_data/cooper_tokenized_dataset")
# Clamp the requested size so a smaller-than-expected dataset does not make
# select() fail on out-of-range indices.
subset = full_dataset.select(range(min(200_000, len(full_dataset))))

if not subset_path.exists():
    subset.save_to_disk(str(subset_path))

# Reload from disk so downstream cells work on the saved copy of the subset.
dataset = load_from_disk(str(subset_path))

Saving the dataset (0/1 shards):   0%|          | 0/200000 [00:00<?, ? examples/s]

In [5]:
# Define Model Config
# The model is built directly from a config (not from_pretrained), so its
# weights are freshly initialised rather than loaded from a checkpoint.

config = GPT2Config(
    # Vocabulary size and special-token ids are taken from the tokenizer.
    vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    # Context length: 256 positions.
    n_positions=256,
    n_ctx=256,
    # Network size: 24 layers, 16 attention heads, 1024-wide embeddings.
    n_layer=24,
    n_head=16,
    n_embd=1024,
)
model = GPT2LMHeadModel(config)

# Report the total parameter count in millions.
print(f"Model size: {sum(p.numel() for p in model.parameters()) / 1e6:.2f}M parameters")

Model size: 354.04M parameters


In [6]:
# Define Data Collator
# mlm=False turns off masked-language-model masking in the collator.
# NOTE(review): the pad token was set to the EOS token above; the collator
# presumably excludes pad-token positions from the loss, which would also
# exclude genuine EOS tokens — confirm this is acceptable for this model.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [7]:
# Training Arguments
# NOTE: a SyntaxError recorded in this cell's saved output ("keyword argument
# repeated: save_steps") does not match the code below, where save_steps
# appears once; re-run the cell to refresh the output.

training_args = TrainingArguments(
    # --- Checkpoint output ---
    output_dir="cooper_model_checkpoints",
    overwrite_output_dir=True,
    save_steps=1000,
    save_total_limit=2,
    # --- Evaluation and logging cadence (in steps) ---
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    logging_dir="logs",
    report_to="none",
    # --- Batch size: 1 sample x 16 accumulation steps = 16 samples per update (per device) ---
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    # --- Optimisation schedule ---
    num_train_epochs=1,
    learning_rate=5e-4,
    warmup_steps=100,
    # --- Mixed precision ---
    fp16=True,
)


SyntaxError: keyword argument repeated: save_steps (3617627601.py, line 10)

In [None]:
# Initialize Trainer
# Hold out 10% of the subset for evaluation. The split is seeded with the same
# seed the Trainer uses, so the train/eval partition is identical across kernel
# restarts and eval losses from different runs stay comparable.
dataset_split = dataset.train_test_split(test_size=0.1, seed=training_args.seed)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_split["train"],
    eval_dataset=dataset_split["test"],
    data_collator=data_collator,
)

In [None]:
# Train
# Runs training with the Trainer built above; as the last expression in the
# cell, the value returned by train() is shown as the cell output.

trainer.train()

In [None]:
# Save Model
# Write the model and the tokenizer into the same directory. The directory
# name is defined once so the two save calls and the confirmation message
# cannot drift apart.

MODEL_DIR = "CooperLM-354M"

model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)
print(f"Model saved to {MODEL_DIR}/")

In [None]:
# Diagnostic: print the name of the module that defines TrainingArguments.
# NOTE(review): looks like a leftover debugging cell — consider removing it
# from the finished notebook.
print(TrainingArguments.__module__)
