# Domain-Specific Fine-Tuning for Question Answering

Goal: Train a chatbot model that responds specifically within your domain

see https://medium.com/@rupaak/how-to-fine-tune-gpt-2-for-a-domain-specific-chatbot-46e9ca64bc86

Supervised fine-tuning of GPT-2 using Hugging Face tools

In [1]:
import math
import json

from datasets import load_dataset, Dataset

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
#from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling


## Loading the SQuAD dataset

In [2]:
# Load the "squad" dataset through the `datasets` library.
dataset = load_dataset("squad")
# Evaluate the object as the last cell expression so the notebook displays its splits.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [3]:
## Build a custom dataset file from the loaded SQuAD dataset
# Destination of the combined export (one JSON record per line).
path = "./data.jsonl"

def _build_dataset(dataset, file_path=None):
    """Write the train and validation splits of *dataset* to one JSONL file.

    Every record is serialized as one JSON object per line and tagged with a
    boolean ``'train'`` key: ``True`` for records taken from
    ``dataset['train']`` and ``False`` for records taken from
    ``dataset['validation']``.

    Args:
        dataset: Mapping with ``'train'`` and ``'validation'`` entries, each
            an iterable of JSON-serializable dicts.
        file_path: Destination file. When omitted, the module-level ``path``
            is used.

    Raises:
        KeyError: If either split is missing from *dataset*.
    """
    target = path if file_path is None else file_path

    # Resolve both splits before touching the file so a missing split does
    # not leave a half-written export behind.
    splits = ((dataset['train'], True), (dataset['validation'], False))

    # Stream record by record instead of collecting every serialized line in
    # memory first.
    with open(target, "w", encoding="utf-8") as f:
        for records, is_train in splits:
            for item in records:
                # Tag a copy so the caller's record is not modified.
                f.write(json.dumps({**item, 'train': is_train}))
                f.write("\n")

# Export both SQuAD splits to the JSONL file at `path`.
_build_dataset(dataset)

In [4]:
## Load the custom dataset
# Same JSONL file that the export cell above writes.
path = "./data.jsonl"

def _load_dataset(file_path):
    """Read a JSONL file of SQuAD-style records and build a text dataset.

    Each non-blank line must be a JSON object with ``'question'``,
    ``'context'`` and ``'answers'`` keys, where ``'answers'`` holds a
    ``'text'`` list. A record is rendered as three lines: the question, the
    context, and ``Answer:`` followed by the first answer text. The answer is
    part of the training text so the model sees a completion for every
    prompt; records without any answer text keep the bare prompt.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A ``Dataset`` with ``'text'`` and ``'labels'`` columns. Both columns
        hold the same strings, which is what causal language-model training
        expects (the target sequence is the input sequence).

    Raises:
        KeyError: If a record lacks one of the required keys.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    texts = []
    # Iterate the file lazily rather than loading every line up front.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines, e.g. a trailing newline at end of file.
            if not line.strip():
                continue
            item = json.loads(line)

            prompt = (
                f"Question: {item['question']}\n"
                f"Context: {item['context']}\n"
                "Answer:"
            )
            answer_texts = item['answers']['text']
            if answer_texts:
                texts.append(f"{prompt} {answer_texts[0]}")
            else:
                texts.append(prompt)

    # Independent list for the second column so the two are not aliased.
    return Dataset.from_dict({'text': texts, 'labels': list(texts)})


# Rebind `dataset` to the formatted text dataset read back from the JSONL file.
dataset = _load_dataset(path)  # Our data file name is 'data.jsonl'

In [5]:
# Load the pretrained 'gpt2' tokenizer and causal language model through the Auto* classes.
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')
# Alternative using the explicit GPT-2 classes (disabled):
#tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
#model = GPT2LMHeadModel.from_pretrained('gpt2')

In [6]:
## Tokenize the dataset
# GPT-2 has no padding token. Reuse the end-of-sequence token for padding
# rather than adding a new '[PAD]' entry: a newly added token receives an id
# outside the model's embedding table unless the embeddings are resized with
# model.resize_token_embeddings(len(tokenizer)).
tokenizer.pad_token = tokenizer.eos_token

#def tokenize_function(examples):
#    #return tokenizer(examples['text'], truncation=True, padding=True, max_length=128)
#    return tokenizer(examples['text'], padding="max_length", truncation=True)

def tokenize(examples):
    """Tokenize the ``'text'`` and ``'labels'`` columns for causal LM training.

    Both columns are padded to the maximum length and truncated. Label
    positions that correspond to padding are replaced by -100 so they are
    ignored by the loss instead of training the model to predict padding.

    Args:
        examples: Either a batch (dict of column lists, as passed by
            ``Dataset.map(..., batched=True)``) or a single example (dict of
            strings) with ``'text'`` and ``'labels'`` entries.

    Returns:
        The tokenizer output for ``'text'`` with an added ``'labels'`` entry
        holding the masked label token ids.
    """
    result = tokenizer(examples['text'], padding="max_length", truncation=True)
    labels = tokenizer(examples['labels'], padding="max_length", truncation=True)

    def mask_padding(token_ids, attention_mask):
        # Keep real tokens, blank out padded positions.
        return [
            token if keep else -100
            for token, keep in zip(token_ids, attention_mask)
        ]

    if isinstance(examples['labels'], str):
        # Single example: the tokenizer returns flat lists.
        result['labels'] = mask_padding(labels['input_ids'], labels['attention_mask'])
    else:
        # Batch: one list of ids per example.
        result['labels'] = [
            mask_padding(token_ids, attention_mask)
            for token_ids, attention_mask in zip(
                labels['input_ids'], labels['attention_mask']
            )
        ]
    return result



# Apply `tokenize` to the dataset in batches.
tokenized_datasets = dataset.map(tokenize, batched=True)


Map:   0%|          | 0/98169 [00:00<?, ? examples/s]

In [14]:
# Shuffle and split the tokenized data 70/30 into train/test; the fixed seed keeps the split reproducible.
# NOTE(review): the boolean 'train' flag written during export is not carried
# into this dataset, so the original SQuAD train/validation split is not reused here.
splitted = tokenized_datasets.train_test_split(test_size=0.3, shuffle=True, seed=42)
# Display the resulting splits in the notebook.
splitted

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 68718
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 29451
    })
})

In [9]:
## Use a data collator for preparing a batch
## see https://huggingface.co/docs/transformers/main_classes/data_collator#transformers.DataCollatorForLanguageModeling
#data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
#data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [7]:
#!pip install accelerate
#!pip list

In [16]:
## Configure and build the Huggingface Trainer

# Run settings, grouped by concern. The per-device batch sizes are not set
# here (the overrides are disabled), so the library defaults apply.
training_args = TrainingArguments(
    # Optimisation
    num_train_epochs=1,  # 3
    learning_rate=2e-5,
    weight_decay=0.01,
    #per_device_train_batch_size=4,
    #per_device_eval_batch_size=4,
    # Evaluation and logging
    eval_strategy="epoch",
    logging_steps=200,
    # Checkpointing
    output_dir='./results',
    save_steps=10_000,
    save_total_limit=2,
)

# Train on the 'train' split and evaluate on the 'test' split.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=splitted['test'],
    train_dataset=splitted['train'],
    #data_collator=data_collator,
)

In [19]:
# Peek at the first training example: print its keys and a short prefix of each field.
item = next(iter(splitted['train']), None)
if item is not None:
    print(item.keys())
    previews = (
        ("text  :        ", 'text', 20),
        ("labels:        ", 'labels', 10),
        ("input_ids:     ", 'input_ids', 10),
        ("attention_mask:", 'attention_mask', 10),
    )
    for caption, field, width in previews:
        print(caption, item[field][0:width], "...")

# label, text, input_ids, attention_mask

dict_keys(['text', 'labels', 'input_ids', 'attention_mask'])
text  :         Question: What was t ...
labels:         [24361, 25, 1867, 373, 262, 1438, 286, 262, 1628, 5495] ...
input_ids:      [24361, 25, 1867, 373, 262, 1438, 286, 262, 1628, 5495] ...
attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] ...


In [21]:
# Start fine-tuning with the Trainer configured above.
trainer.train()