# 03 - Fine-Tuning Pipeline

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import json

# Load the question/answer records produced earlier in the pipeline.
# JSON is UTF-8 by specification; state the encoding explicitly so the file
# is not decoded with the platform's locale codec (e.g. cp1252 on Windows),
# which would corrupt or reject non-ASCII text.
with open('../qa_pairs/qa_dataset.json', encoding='utf-8') as f:
    qa_data = json.load(f)

# Each record is expected to provide 'question' and 'answer' keys; they are
# flattened into a single prompt/response string for causal-LM training.
qa_pairs = [{'text': f"Q: {x['question']}\nA: {x['answer']}"} for x in qa_data]
dataset = Dataset.from_list(qa_pairs)

tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
model = AutoModelForCausalLM.from_pretrained('distilgpt2')

# GPT-2 family tokenizers ship without a padding token, so any call that
# requests padding raises a ValueError. Reuse the end-of-sequence token as
# the pad token and mirror the id onto the model config.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

def tokenize(example):
    """Tokenize one example and attach causal-LM training labels.

    Args:
        example: A dataset row with a 'text' field holding the formatted
            "Q: ...\\nA: ..." string.

    Returns:
        The tokenizer output (fixed length of 128 tokens, padded/truncated)
        with an added 'labels' list. Labels copy 'input_ids', except that
        padded positions are set to -100 so they are ignored by the loss.
    """
    encoded = tokenizer(example['text'], padding='max_length', truncation=True, max_length=128)
    # Without 'labels' the model returns no loss and Trainer cannot train.
    # Mask via attention_mask rather than by comparing token ids, since the
    # pad token may share its id with the end-of-sequence token.
    encoded['labels'] = [
        token_id if is_real_token else -100
        for token_id, is_real_token in zip(encoded['input_ids'], encoded['attention_mask'])
    ]
    return encoded

tokenized_dataset = dataset.map(tokenize)

# Training configuration. Intermediate checkpoints are disabled
# (save_strategy='no'), so the final model is written out explicitly once
# training has finished.
training_args = TrainingArguments(
    output_dir='../models/fine_tuned_model',
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_dir='./logs',
    save_strategy='no'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

trainer.train()

# Persist the fine-tuned weights together with the tokenizer; otherwise the
# result of the run is lost when the process ends, because no checkpoint is
# ever saved.
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)