In [1]:
!pip install -q transformers datasets accelerate torch

In [2]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

# Run on the GPU when PyTorch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Name of the pretrained checkpoint used for both the tokenizer and the model.
model_name = "gpt2"

# Tokenizer first. GPT-2 ships without a pad token, so the end-of-sequence
# token is reused as the pad token.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained weights, then move the model onto the selected device.
model = GPT2LMHeadModel.from_pretrained(model_name)
model = model.to(device)

print("Model and Tokenizer loaded.")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]



tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/148 [00:00<?, ?it/s]

GPT2LMHeadModel LOAD REPORT from: gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Model and Tokenizer loaded.


In [4]:
# Fetch the TinyStories training split and keep only its first 500 rows so the
# demo runs quickly (select a larger range for better results).
dataset = load_dataset("roneneldan/TinyStories", split="train").select(range(500))

def tokenize_function(examples):
    """Tokenize a batch of stories, truncating each one to at most 128 tokens.

    Args:
        examples: Batch mapping supplied by ``Dataset.map(batched=True)``;
            its ``"text"`` entry holds the stories to encode.

    Returns:
        The tokenizer's output for the batch.
    """
    # No padding is applied here: each sequence keeps its own length and the
    # data collator pads per batch at training time, instead of every example
    # being stored padded out to the full 128 tokens.
    return tokenizer(examples["text"], truncation=True, max_length=128)

# Run the tokenizer over the whole subset, one batch of rows at a time.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
)

# Expose only the model inputs, returned as PyTorch tensors.
tokenized_datasets.set_format(
    type="torch",
    columns=["input_ids", "attention_mask"],
)

print("Dataset prepared.")

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00004-2d5a1467fff108(…):   0%|          | 0.00/249M [00:00<?, ?B/s]

data/train-00001-of-00004-5852b56a2bd28f(…):   0%|          | 0.00/248M [00:00<?, ?B/s]

data/train-00002-of-00004-a26307300439e9(…):   0%|          | 0.00/246M [00:00<?, ?B/s]

data/train-00003-of-00004-d243063613e5a0(…):   0%|          | 0.00/248M [00:00<?, ?B/s]

data/validation-00000-of-00001-869c898b5(…):   0%|          | 0.00/9.99M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2119719 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/21990 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset prepared.


In [5]:
# Settings for the fine-tuning run, grouped by purpose.
training_args = TrainingArguments(
    # Optimisation
    learning_rate=5e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    # Checkpointing
    output_dir="./gpt2-story-writer",
    save_steps=500,
    save_total_limit=2,
    # Logging and data handling
    logging_steps=50,
    prediction_loss_only=True,
    remove_unused_columns=True,
)

# Collator that assembles training batches; mlm=False selects causal
# (next-token) language modelling rather than masked language modelling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

print("Training arguments set.")

Training arguments set.


In [6]:
# Cell 6: fine-tune the model, then persist it
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
)

print("Starting training...")
trainer.train()
print("Training complete!")

# Store the fine-tuned model and the tokenizer side by side in one directory
# so both can later be reloaded from the same path.
trainer.save_model("./gpt2-story-writer")
tokenizer.save_pretrained("./gpt2-story-writer")
print("Model saved to ./gpt2-story-writer")

Starting training...


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,2.324295
100,2.064646
150,1.972933


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Training complete!


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Model saved to ./gpt2-story-writer


In [7]:
# Reload the saved tokenizer and model from the output directory, then move
# the model onto the selected device.
finetuned_tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-story-writer")
finetuned_model = GPT2LMHeadModel.from_pretrained("./gpt2-story-writer")
finetuned_model = finetuned_model.to(device)

def generate_story(prompt, max_length=200, seed=None):
    """Sample a story that continues ``prompt`` using the fine-tuned model.

    Args:
        prompt: Text the story should start with.
        max_length: Forwarded to ``generate`` as ``max_length``. Per the
            transformers generation docs this limit counts the prompt tokens
            as well as the newly generated ones.
        seed: Optional integer. When given, PyTorch's random number generator
            is seeded before sampling so the run starts from a known state.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        stripped.
    """
    if seed is not None:
        torch.manual_seed(seed)

    inputs = finetuned_tokenizer(prompt, return_tensors="pt").to(device)

    outputs = finetuned_model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.8,        # Creativity (higher = more random)
        top_k=50,               # Sample only from the 50 most likely tokens
        top_p=0.95,             # Nucleus sampling threshold
        do_sample=True,         # Sample instead of decoding greedily
        repetition_penalty=1.2, # Discourage repeating earlier tokens
        # Pass the pad id explicitly; otherwise generate() falls back to the
        # EOS id itself and logs a warning on every call.
        pad_token_id=finetuned_tokenizer.eos_token_id,
    )

    return finetuned_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Quick demo: continue a sample opening line and print the result.
prompt = "Once upon a time, a little robot found a blue flower."
story = generate_story(prompt)
print(f"--- Generated Story ---\n{story}")

Loading weights:   0%|          | 0/148 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


--- Generated Story ---
Once upon a time, a little robot found a blue flower. She loved it so much she put down the flowers and started to explore around her garden!
One day when exploring inside her house, one of her favorite things was finding some old books in their yard that had been lost forever - all while searching for them at night... 
  While wandering through this area, however you peeked out from under your blanket covering into an empty lot filled with colorful leaves on top. The plants were growing fast too as they continued walking closer each morning until finally spotted something shiny shining brightly beneath its cover .
