In [1]:
import torch
import transformers
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset




In [2]:
# Load tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [3]:
def load_txt_dataset(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    return Dataset.from_dict({"text": [line.strip() for line in lines if line.strip()]})

In [4]:
dataset = load_txt_dataset("your_dataset.txt")

In [5]:
sample_data = [
    "Once upon a time in a distant land, a young warrior set out on a journey.",
    "The stars shone brightly over the enchanted forest as the wizard chanted his spell.",
    "In a small village by the sea, an old fisherman shared stories of the deep ocean."
]
with open("your_dataset.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(sample_data))

In [6]:
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

In [7]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [8]:
# Split dataset into train and eval
dataset_split = tokenized_datasets.train_test_split(test_size=0.1, seed=42)
dataset_train = dataset_split["train"]
dataset_eval = dataset_split["test"]

In [9]:
# Prepare dataset for training
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [10]:
# Load model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

In [11]:
training_args = TrainingArguments(
    output_dir="./gpt2-story-model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2,
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=5e-5,
    report_to="none"
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./gpt2-story-model")
tokenizer.save_pretrained("./gpt2-story-model")

  trainer = Trainer(
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss
1,No log,3.589314
2,No log,3.589119
3,No log,3.588696


('./gpt2-story-model\\tokenizer_config.json',
 './gpt2-story-model\\special_tokens_map.json',
 './gpt2-story-model\\vocab.json',
 './gpt2-story-model\\merges.txt',
 './gpt2-story-model\\added_tokens.json')

In [12]:
def generate_story(prompt, model, tokenizer, max_length=100):
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        output = model.generate(**inputs, max_length=max_length, pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(output[0], skip_special_tokens=True)


In [13]:
model = GPT2LMHeadModel.from_pretrained("./gpt2-story-model")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-story-model")

In [14]:
prompt = "Once upon a time, in a faraway kingdom,"
print(generate_story(prompt, model, tokenizer))

Once upon a time, in a faraway kingdom, the king of the world, the king of the world, the king of the world, the king of the world, the king of the world, the king of the world, the king of the world, the king of the world, the king of the world, the king of the world, the king of the world, the king of the world, the king of the world, the king of the world, the king of the world
