# Supervised Fine-Tuning of GPT using Huggingface Tools

## Domain Adaption

In [1]:
import json
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorWithPadding
from torch.utils.data import Dataset

## 1. Prepare and load traing and evaluation data

Use the 'context' in data.jsonl for domain adaption

In [2]:
input_file = "./data.jsonl"
train_output_file = "./train_data.txt"

with open(input_file, 'r') as file:
    lines = file.readlines()

with open(train_output_file, "w") as outfile:
    for line in lines:
        data = json.loads(line)
        if data['train']:
            context = data['context']
            context = context.strip()
            outfile.write(context)

eval_output_file = "./eval_data.txt"
with open(eval_output_file, "w") as outfile:
    for line in lines:
        data = json.loads(line)
        if not data['train']:
            context = data['context']
            context = context.strip()
            outfile.write(context)

In [26]:
## Load the pre_trained GPT2 model
         # "gpt2"       #: 124 million parameters 
model_id = "gpt2-medium"#: 345 million parameters 
         # "gpt2-Large" #: 774 million parameters
         # "gpt2-xl"    #: 1.5 billion parameters 


tokenizer = GPT2Tokenizer.from_pretrained(model_id, clean_up_tokenization_spaces=True)
model = GPT2LMHeadModel.from_pretrained(model_id)

In [27]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token # tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.pad_token_id = tokenizer.eos_token_id

# tokenizer.pad_token = tokenizer.eos_token  
# or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`

In [28]:
###### Build a custom PyTorch Dataset
class CustomDataset(Dataset): # from torch.utils.data
    def __init__(self, tokenizer, file_path, block_size):
        self.tokenizer = tokenizer
        with open(file_path, "r") as f:
            self.text = f.read().splitlines()
        self.block_size = block_size

    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, idx):
        tokenized_inputs = self.tokenizer(self.text[idx], 
                                          truncation=True, 
                                          padding="max_length", 
                                          max_length=self.block_size, 
                                          return_tensors="pt")
        
        tokenized_inputs['labels'] = tokenized_inputs['input_ids']
        return tokenized_inputs
        

train_dataset = CustomDataset(tokenizer, train_output_file, 128)
#eval_dataset = CustomDataset(tokenizer, eval_output_file, 128)

In [29]:
# set device to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Usinig devivs:", device)

Usinig devivs: mps


In [30]:
#create a data collator that dynamically pad the sequences 
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [31]:
## Use Huggingface Trainer

training_args = TrainingArguments(
    #per_device_train_batch_size=2,
    output_dir='./results_2',
    logging_dir='./logs_2',
    num_train_epochs=3, #large dataset -> small num epochs, small dataset --> more epochs
    learning_rate=1e-4,
    logging_steps=10,
    load_best_model_at_end=False,
    eval_strategy="no",    
    remove_unused_columns=False,
    push_to_hub=False,
    #per_device_eval_batch_size=4,
    #weight_decay=0.01,
    #save_steps=10_000,
    #save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=None,
)

In [32]:
trainer.train()

Step,Training Loss
10,2.4709
20,1.4228
30,0.9357
40,1.04
50,0.8046
60,0.5657
70,0.4319
80,0.6199
90,0.5226
100,0.3273


TrainOutput(global_step=150, training_loss=0.7042135095596314, metrics={'train_runtime': 251.5318, 'train_samples_per_second': 4.735, 'train_steps_per_second': 0.596, 'total_flos': 276520631795712.0, 'train_loss': 0.7042135095596314, 'epoch': 3.0})

In [75]:
orig_model = GPT2LMHeadModel.from_pretrained(model_id)
prompt = "What is the 'Super Bowl 50'?"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
attention_mask = tokenizer(prompt, return_tensors="pt").attention_mask

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)


orig_model.eval()
orig_model.to(device)
output = orig_model.generate(input_ids=input_ids, 
                        attention_mask=attention_mask,
                        pad_token_id=tokenizer.pad_token_id,
                        max_length= 100,
                        num_beams=5,
                        temperature=1.5,
                        top_k=50,
                        do_sample=True)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

What is the 'Super Bowl 50'?

The 50th Super Bowl will be played on Sunday, February 5, 2016 at the Mercedes-Benz Superdome in New Orleans, Louisiana. The game will be broadcast live on NBC, ABC, FOX, CBS, ESPN and the NFL Network. The game will be broadcast live online on NBC Sports Live Extra starting at 11:00 p.m. ET on Sunday, February 5, 2016.

What is the Super Bowl 50 broadcast schedule


In [74]:
prompt = "What is the 'Super Bowl 50'?"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
attention_mask = tokenizer(prompt, return_tensors="pt").attention_mask

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)


model.eval()
model.to(device)
output = model.generate(input_ids=input_ids, 
                        attention_mask=attention_mask,
                        pad_token_id=tokenizer.pad_token_id,
                        max_length= 100,
                        num_beams=5,
                        temperature=1.3,
                        top_k=50,
                        do_sample=True)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

What is the 'Super Bowl 50'?' It is the 50th anniversary of the Super Bowl XLI between the Atlanta Falcons and New England Patriots. The game was played on December 12, 2000 at Mercedes-Benz Stadium in Atlanta, Georgia. The home team won 27-14. The victory was the most lopsided in Super Bowl history with the score being 27-14 in favor of the home team. The victory also marked the beginning of the end for the infamous "Deflategate


In [None]:
###################### Backup ################
# integrate model into chatbot
# see https://medium.com/@rupaak/use-fine-tuned-gpt-2-for-chatbot-5fdf4908fbca


In [76]:
input_file = "./data.jsonl"
j = 0
with open(input_file, 'r') as file:
    lines = file.readlines()
    for line in lines:
        data = json.loads(line)
        context = data['context']
        if "Super Bowl 50" in context:
            j += 1
            print(f"----- {j} ------")
            print(context)

