In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
import torch
import requests
import sys

# Step 3: Fetch Shakespeare Dataset

def fetch_dataset(url, filename="shakespeare.txt", timeout=30):
    """Download a text resource over HTTP and save it locally as UTF-8.

    Args:
        url: Address of the text resource to download.
        filename: Path the downloaded text is written to.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The ``filename`` that was written, ready to hand to a dataset loader.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly rather than silently saving an HTML error page as training data.
    response.raise_for_status()
    with open(filename, "w", encoding="utf-8") as f:
        f.write(response.text)
    return filename

# Plain-text edition of Shakespeare's Complete Works on Project Gutenberg.
data_url = "https://www.gutenberg.org/cache/epub/100/pg100.txt"
# Download it; the returned local path is what the dataset loader reads later.
dataset_file = fetch_dataset(url=data_url)

In [None]:
# Step 4: Load and Tokenize Dataset
# Load the pre-trained GPT-2 tokenizer first; the tokenization function defined
# below relies on it. The deprecated `use_auth_token` argument is omitted: it was
# passed as None (a no-op) and the public "gpt2" checkpoint needs no authentication.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", model_max_length=1024)
# Reuse the end-of-text token for padding so that fixed-length batches can be built.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize a batch of text lines into fixed-length model inputs.

    Intended to be passed to ``Dataset.map(..., batched=True)``.

    Args:
        examples: Mapping with a ``"text"`` entry holding the strings to encode.

    Returns:
        The tokenizer's encoding for the batch, with every sequence truncated
        or padded to exactly 256 tokens.
    """
    # No ``return_tensors`` here: ``Dataset.map`` stores its results as Arrow
    # lists, so torch tensors built at this point would only be converted back.
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=256,
    )

# Load the downloaded file as a text dataset (one example per line).
raw_dataset = load_dataset("text", data_files={"train": dataset_file})["train"]
# Drop blank lines: they tokenize to nothing but padding, and the language-model
# collator masks every padding label, so a batch made only of blank lines has no
# targets at all and can produce a NaN loss.
nonblank_dataset = raw_dataset.filter(lambda example: example["text"].strip() != "")
# Keep the run short: 10% of the lines for training and another 10% for evaluation.
# The fixed seed makes the split reproducible across kernel restarts.
dataset = nonblank_dataset.train_test_split(train_size=0.1, test_size=0.1, seed=42)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Step 5: Load the pre-trained model
# Start from the published GPT-2 checkpoint with its language-modelling head.
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Tokenize every split in batches; the raw "text" column is dropped so that only
# the tokenizer's outputs remain.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)


Map:   0%|          | 0/19639 [00:00<?, ? examples/s]

Map:   0%|          | 0/19639 [00:00<?, ? examples/s]

In [None]:
# Step 6: Configure training parameters
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation settings.
    num_train_epochs=1,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Use half precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    # Evaluate and checkpoint once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Do not let the Trainer drop dataset columns on its own.
    remove_unused_columns=False,
    # Keep everything local: no Hub upload, no external experiment trackers.
    push_to_hub=False,
    report_to="none",
)

# mlm=False selects causal (next-token) language modelling instead of masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [None]:
# Step 7: Train the model
train_split = tokenized_datasets["train"]
eval_split = tokenized_datasets["test"]

trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

# Left as the final expression so the notebook displays the returned TrainOutput.
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,3.8017,


TrainOutput(global_step=4910, training_loss=3.9290704109518204, metrics={'train_runtime': 1092.2542, 'train_samples_per_second': 17.98, 'train_steps_per_second': 4.495, 'total_flos': 2565757108224000.0, 'train_loss': 3.9290704109518204, 'epoch': 1.0})

In [None]:
# Step 8: Generate a New Shakespearean Work
def generate_text_Shake(
    prompt="To be, or not to be",
    max_length=500,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.9,
    no_repeat_ngram_size=3,
):
    """Generate a continuation of ``prompt`` with the fine-tuned model.

    Plain greedy decoding tends to collapse into a loop that repeats the same
    phrase, so by default tokens are sampled and repeated n-grams are blocked.

    Args:
        prompt: Text the generated passage starts with.
        max_length: Maximum total length in tokens, prompt included.
        do_sample: Sample from the predicted distribution instead of always
            taking the most likely token. Set to False for greedy decoding.
        top_k: When sampling, consider only the ``top_k`` most likely tokens.
        top_p: When sampling, consider only the smallest set of tokens whose
            cumulative probability reaches ``top_p``.
        temperature: When sampling, softmax temperature (lower = more conservative).
        no_repeat_ngram_size: Forbid any n-gram of this size from occurring
            twice; 0 disables the constraint.

    Returns:
        The decoded text (prompt plus continuation) with special tokens removed.
    """
    # Training can leave dropout active; switch to inference mode before generating.
    model.eval()
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    generation_kwargs = {
        "max_length": max_length,
        "do_sample": do_sample,
        "no_repeat_ngram_size": no_repeat_ngram_size,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        # Sampling-only knobs are passed only when sampling is actually enabled.
        generation_kwargs.update(top_k=top_k, top_p=top_p, temperature=temperature)

    with torch.no_grad():
        output = model.generate(**inputs, **generation_kwargs)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Generate one passage from the default prompt and show it as plain text.
generated_passage = generate_text_Shake()
print(generated_passage)

To be, or not to be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I will not be. I wi