## Required Libraries

In [1]:
%pip install transformers datasets torch


Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
Downloading multiprocess-0.70.16-py312-none-any.whl (146 kB)
Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl (30 kB)
Installing collected packages: xxhash, dill, multiprocess, datasets

   ---------- ----------------------------- 1/4 [dill]
   ---------- ----------------------------- 1/4 [dill]
   ---------- ----------------------------- 1/4 [dill]
   ---------- ----------------------------- 1/4 [dill]
   ---------- ----------------------------- 1/4 [dill]
   -------------------

# Dataset PreProcessing

In [1]:
from datasets import Dataset
from transformers import AutoTokenizer


with open(r"D:\Projects\Poetry_Generation_using_GEN_AI\poems\all_poems.txt", "r", encoding="utf-8") as f:
    poems = [p.strip() for p in f.read().split("\n\n") if len(p.strip().split()) > 10]

dataset = Dataset.from_dict({"text": poems})
dataset = dataset.train_test_split(test_size=0.1)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(example):
    tokens = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens


tokenized_datasets = dataset.map(tokenize_function, batched=True)



Map:   0%|          | 0/3388 [00:00<?, ? examples/s]

Map:   0%|          | 0/377 [00:00<?, ? examples/s]

# Fine Tuning GPT2 Model

In [2]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

model = AutoModelForCausalLM.from_pretrained("gpt2")

training_args = TrainingArguments(
    output_dir="./poetry-gpt2-finetuned",
    num_train_epochs=1,                  
    per_device_train_batch_size=10,      
    save_steps=1000,
    save_total_limit=1,
    logging_steps=100,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,                         
    max_steps=1500,                       
    dataloader_num_workers=4,            
    gradient_accumulation_steps=1,       
    warmup_steps=50,                     
    logging_dir='./logs',
    report_to=None                      
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

trainer.train()
model.save_pretrained("./poetry-gpt2-finetuned")
tokenizer.save_pretrained("./poetry-gpt2-finetuned")





max_steps is given, it will override any value given in num_train_epochs


  0%|          | 0/1500 [00:00<?, ?it/s]

{'loss': 2.7661, 'grad_norm': 4.690566539764404, 'learning_rate': 1.931034482758621e-05, 'epoch': 0.29}
{'loss': 1.7722, 'grad_norm': 4.084377288818359, 'learning_rate': 1.7931034482758623e-05, 'epoch': 0.59}
{'loss': 1.7437, 'grad_norm': 4.512091636657715, 'learning_rate': 1.6551724137931037e-05, 'epoch': 0.88}
{'loss': 1.7117, 'grad_norm': 3.3973727226257324, 'learning_rate': 1.5172413793103448e-05, 'epoch': 1.18}
{'loss': 1.6756, 'grad_norm': 3.878058671951294, 'learning_rate': 1.3793103448275863e-05, 'epoch': 1.47}
{'loss': 1.6344, 'grad_norm': 4.15669584274292, 'learning_rate': 1.2413793103448277e-05, 'epoch': 1.77}
{'loss': 1.6002, 'grad_norm': 4.616940498352051, 'learning_rate': 1.103448275862069e-05, 'epoch': 2.06}
{'loss': 1.5723, 'grad_norm': 5.313350677490234, 'learning_rate': 9.655172413793105e-06, 'epoch': 2.36}
{'loss': 1.5963, 'grad_norm': 4.188971042633057, 'learning_rate': 8.275862068965518e-06, 'epoch': 2.65}
{'loss': 1.6273, 'grad_norm': 4.924607753753662, 'learning_

('./poetry-gpt2-finetuned\\tokenizer_config.json',
 './poetry-gpt2-finetuned\\special_tokens_map.json',
 './poetry-gpt2-finetuned\\vocab.json',
 './poetry-gpt2-finetuned\\merges.txt',
 './poetry-gpt2-finetuned\\added_tokens.json',
 './poetry-gpt2-finetuned\\tokenizer.json')