In [None]:
!pip install datasets

from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
import torch



In [None]:
# Path to the plain-text training corpus; passed to load_dataset in the dataset-loading cell
file = "custom_text.txt"

In [None]:
# Step 1: Load the dataset and define the tokenization function
def tokenize_function(examples, max_length=128):
    """Tokenize a batch of raw text into fixed-length sequences.

    Uses the notebook-level ``tokenizer``, which is looked up when the
    function is called, so it must be defined before this function is used.

    Args:
        examples: Batch mapping whose "text" entry holds the strings to
            encode (the shape ``Dataset.map(..., batched=True)`` passes in).
        max_length: Length every sequence is truncated or padded to.
            Defaults to 128.

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

# Load the text file as a dataset (the "text" loader yields one example per line)
dataset = load_dataset("text", data_files={"train": file})["train"]
# Drop blank lines: they would be tokenized into padding only and add nothing to training
dataset = dataset.filter(lambda example: example["text"].strip() != "")
# Hold out 10% for evaluation; the fixed seed keeps the split identical across runs
dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
# Step 2: Load the pre-trained GPT-2 weights and the matching tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", model_max_length=512)
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize every split in batches and drop the raw "text" column afterwards
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

Map:   0%|          | 0/38 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [None]:
# Step 3: Configure training parameters
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Updated from evaluation_strategy
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    # Log the training loss once per epoch. The step-based default interval is
    # longer than this short run, which is why the results table showed "No log".
    logging_strategy="epoch",
    logging_dir="./logs",
    remove_unused_columns=False,
    fp16=torch.cuda.is_available(),  # Enable mixed precision training if GPU is available
    push_to_hub=False,  # Prevent auto-pushing to Hugging Face Hub
    report_to="none",  # Disable Weights & Biases logging
)

# mlm=False selects causal (next-token) language modeling instead of masked LM
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Step 4: Fine-tune the model on the tokenized splits
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    processing_class=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# Left as the last expression of the cell so the training summary is displayed
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,3.614144
2,No log,3.412002
3,No log,3.411233


TrainOutput(global_step=30, training_loss=3.132391611735026, metrics={'train_runtime': 47.7753, 'train_samples_per_second': 2.386, 'train_steps_per_second': 0.628, 'total_flos': 7446822912000.0, 'train_loss': 3.132391611735026, 'epoch': 3.0})

In [None]:
# Step 5: Generate sample text with the fine-tuned model
def generate_text(prompt, max_length=100, no_repeat_ngram_size=3, **generate_kwargs):
    """Generate a continuation of ``prompt`` with the notebook-level model.

    Uses the notebook-level ``model`` and ``tokenizer``.

    Args:
        prompt: Text to continue.
        max_length: Passed to ``model.generate`` as ``max_length``.
        no_repeat_ngram_size: Passed to ``model.generate``; a positive value
            stops plain greedy decoding from looping on the same sentence.
            Pass 0 to disable the restriction.
        **generate_kwargs: Extra keyword arguments forwarded unchanged to
            ``model.generate`` (for example ``do_sample=True``).

    Returns:
        The first generated sequence decoded to a string, with special
        tokens removed.
    """
    # Make sure dropout is off no matter what ran on the model before this call
    model.eval()
    # Only fill in the pad token id when the caller did not pass one explicitly
    generate_kwargs.setdefault("pad_token_id", tokenizer.eos_token_id)

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_length=max_length,
            no_repeat_ngram_size=no_repeat_ngram_size,
            **generate_kwargs,
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Print a sample continuation of a short prompt (print keeps the line breaks readable)
print(generate_text("Once upon a time"))

Once upon a time, the world was filled with stories of magic and wonder. But now, the world is filled with monsters and monsters.

The world is filled with monsters and monsters.

The world is filled with monsters and monsters.

The world is filled with monsters and monsters.

The world is filled with monsters and monsters.

The world is filled with monsters and monsters.

The world is filled with monsters and monsters.

The world is filled
