In [23]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset
import json
import torch

In [None]:
with open('data.json', 'r') as file:
    data = json.load(file)

def format_data(data):
    """Combine prompt and response into a single text input-output for training."""
    formatted_data = []
    for entry in data:
        combined = f"Question: {entry['prompt']}\nAnswer: {entry['response']}\n"
        formatted_data.append({"text": combined})
    return formatted_data

# Calling this func to format the dataset
formatted_data = format_data(data)

In [None]:
# Convert data into a Hugging Face Dataset object
dataset = Dataset.from_list(formatted_data)

# Tokenizer and Model
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

# tokenising the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Split Dataset
dataset = tokenized_dataset.train_test_split(test_size=0.1)

# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=20,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_dir="./logs",
    logging_steps=50,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    save_strategy="epoch",
    fp16=torch.cuda.is_available()
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

Map: 100%|██████████| 15/15 [00:00<00:00, 911.83 examples/s]
  trainer = Trainer(


In [None]:
# Train the Model
trainer.train()
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

print("Model training and saving complete!")

Step,Training Loss,Validation Loss


Model training and saving complete!
