In [None]:
# Fine tuning gpt2_medium model and use own data like company profile
#
# See also medium.com blog
# "GPT-2 Fine-Tuning Guide: Building a Chatbot for Your Company Profile"
# https://medium.com/@datatec.studio

In [None]:
# Install python packages
# !pip install -r requirements.txt

In [1]:
import os
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
from tqdm import tqdm
import time
import numpy as np
import sys

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Define environment variable, path of data, model name and device
# os.environ["HF_HOME"] = "/content/huggingface"  # Replace with your desired directory
# print("Please replace it with your hf access token:")
# os.environ["HF_HOME_TOKEN"] = "Please_replace_it_with_your_hf_access_token"

result_dir = 'resources/'
data_file_name = 'Winfrey'
data_file_path = f'../data/prompt_response/{data_file_name}.txt'

model_name = "gpt2" # gpt2-medium
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(device)

cuda


In [3]:
# Write a python file to google driver
# Sample of json datasets
# You can also directly upload this code to your google driver
# The code write here in this way is for better understanding of whole project
# %%writefile chat_data.py

from torch.utils.data import Dataset
import json

class ChatData(Dataset):
    def __init__(self, path: str, tokenizer):
        with open(path, encoding="utf-8") as f:
            self.data = f.readlines()#json.load(open(path, "r"))

        self.X = []
        for pair in self.data:
            pair = eval(pair)          
            self.X.append(f"<start> {pair['prompt']} <response>: {pair['response']} <end>")
        
        total_samples = len(self.X)  # Calculate the total number of samples
        print("total_samples", total_samples)
        # define samples amount
#         self.X = self.X[:500]
        print("Check the preprocessing for self.X[0]:")
        print(self.X[0])

        self.X_encoded = tokenizer(self.X, return_tensors="pt", max_length=30, padding="max_length", truncation=True)
        self.input_ids = self.X_encoded['input_ids']
        self.attention_mask = self.X_encoded['attention_mask']

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attention_mask[idx]


In [5]:
# Download model, save model and tokernize to harddisk
## prepare tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

print(f'Number of tokens in tokenizer before adding our specific tokens: {len(tokenizer.get_vocab())}')

tokenizer.add_special_tokens({"pad_token": "<pad>",
                              "bos_token": "<start>",
                              "eos_token": "<end>"})

tokenizer.add_tokens(["<response>:"])
with open(f'../data/prompt_response/groupchat.txt', encoding="utf-8") as f:
    data = f.readlines()#json.load(open(path, "r"))

for pair in data:
    pair = eval(pair)
    tokenizer.add_tokens(pair['prompt'].split() + pair['response'].split())
    
print(f'Number of tokens in tokenizer after adding our specific tokens: {len(tokenizer.get_vocab())}')

## prepare model
### Specify the desired embedding size (must be a multiple of 8)
desired_embedding_size = 50264  # Change this to the desired size
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
### Resize the embedding layer to the desired size
model.resize_token_embeddings(len(tokenizer), desired_embedding_size)
model = model.to(device)

model.config.pad_token_id = tokenizer.eos_token_id + 1


## save tokenizer and model to harddisk
# tokenizer.save_pretrained(result_dir)
# model.save_pretrained(result_dir)

Number of tokens in tokenizer before adding our specific tokens: 50257
Number of tokens in tokenizer after adding our specific tokens: 52874


In [11]:
# ## load model and tokenizer from harddisk
# ### Load the GPT-2 tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained(result_dir)

# ### Load the GPT-2 model from the local folder
# model = GPT2LMHeadModel.from_pretrained(result_dir)
# model.to(device)

In [12]:
# Define infer and train function
def infer(inp_raw):
    inp_appended = "<start> " + inp_raw + " <response>: "
    inp = tokenizer(inp_appended, return_tensors="pt")
    X = inp["input_ids"].to(device)  # Use .to(device) method to move the tensor to the specified device
    a = inp["attention_mask"].to(device)  # Use .to(device) method here as well

    output = model.generate(X, attention_mask=a, max_length=100, num_return_sequences=1)
    output = tokenizer.decode(output[0]).replace('<pad>', '')
    output = output[output.find('<response>: ')+len('<response>: '):].replace('<response>:', '')
    if '<end>' in output:
        output = output[:output.find('<end>')]
    output = ' '.join(output.split())
    
    prompt_response = {
        'prompt': inp_raw,
        'response': output
    }

    return prompt_response


def train(chatData, model, optim):
    
    batches = len(chatData)
    total_loss = 0

    for i, (X, a) in tqdm(enumerate(chatData), total=len(chatData), desc="Training"):
        X = X.to(device)
        a = a.to(device)
        optim.zero_grad()
        loss = model(input_ids=X, attention_mask=a, labels=X).loss
        total_loss += loss.item()
        loss.backward()
        optim.step()

    print(infer("Any progress on the PA?"))
    print('========================================================')
    
    return total_loss

In [13]:
# from chat_data import ChatData

#Load ChatData, train model and optimizer
chatData = ChatData(data_file_path, tokenizer)
chatData = DataLoader(chatData, batch_size=1) # batch_size=64

model.train()

optim = Adam(model.parameters(), lr=1e-5)

total_samples 570
Check the preprocessing for self.X[0]:
<start> You named the group CSE 151B PA2. <response>: What should our group name be 😎 Ester and I used transformers for PA1 <end>


In [None]:
epochs = 100  # You can adjust the number of epochs as needed
training_loss = []
for epoch in range(epochs):
    start = time.time()
    print(f"Epoch {epoch} started")
    total_loss = train(chatData, model, optim)
    training_loss += [total_loss]
    end = time.time()
    print(f"Epoch {epoch} finished in {np.round((end - start) / 60, 2)} minutes | total_loss = {total_loss}")

Epoch 0 started


Training: 100%|██████████| 570/570 [00:22<00:00, 25.82it/s]


{'prompt': 'Any progress on the PA?', 'response': "I’ll keep testing it until test IoU is done, Then will be a composite for the txt files to be over the entire Once this is done, if we haven't we don’t we can embrace the txt files will be large so"}
Epoch 0 finished in 0.38 minutes | total_loss = 188.19168958067894
Epoch 1 started


Training: 100%|██████████| 570/570 [00:22<00:00, 25.11it/s]


{'prompt': 'Any progress on the PA?', 'response': 'I’ll be able to build on the top of ester1; (5c) at least first val I’ll will get more epochs, val acc, in epoch I can build on top of ester1; at 5 epochs I’ll will get results, the val'}
Epoch 1 finished in 0.39 minutes | total_loss = 187.73083892464638
Epoch 2 started


Training: 100%|██████████| 570/570 [00:21<00:00, 26.05it/s]


{'prompt': 'Any progress on the PA?', 'response': 'Or any progress on the programming Or what’s the iou from the doc is summarize interesting'}
Epoch 2 finished in 0.38 minutes | total_loss = 187.6108034849167
Epoch 3 started


Training:  40%|████      | 228/570 [00:08<00:13, 25.62it/s]

In [15]:
len(chatData)

570

In [16]:
training_loss[-1] / len(chatData)

0.33749242090342335

In [None]:
# inp = ""
while True:
    inp = input("Enter your input (press Enter when done): " + " " * 20)
    print(infer(inp))

In [17]:
# Save the model's state dictionary after training is complete
torch.save(model.state_dict(), f"models/{data_file_name}.pt")

# Simulate a Chat

In [None]:
ester = GPT2LMHeadModel.from_pretrained(model_name).to(device)
ester.resize_token_embeddings(len(tokenizer), desired_embedding_size)
ester.load_state_dict(torch.load("models/Ester.pt"))
ester.config.pad_token_id = tokenizer.eos_token_id + 1

winfrey = GPT2LMHeadModel.from_pretrained(model_name).to(device)
winfrey.resize_token_embeddings(len(tokenizer), desired_embedding_size)
winfrey.load_state_dict(torch.load("models/Winfrey.pt"))
winfrey.config.pad_token_id = tokenizer.eos_token_id + 1

jeremy = GPT2LMHeadModel.from_pretrained(model_name).to(device)
jeremy.resize_token_embeddings(len(tokenizer), desired_embedding_size)
jeremy.load_state_dict(torch.load("models/Jeremy.pt"))
jeremy.config.pad_token_id = tokenizer.eos_token_id + 1

jonathan = GPT2LMHeadModel.from_pretrained(model_name).to(device)
jonathan.resize_token_embeddings(len(tokenizer), desired_embedding_size)
jonathan.load_state_dict(torch.load("models/Jonathan.pt"))
jonathan.config.pad_token_id = tokenizer.eos_token_id + 1

samuel = GPT2LMHeadModel.from_pretrained(model_name).to(device)
samuel.resize_token_embeddings(len(tokenizer), desired_embedding_size)
samuel.load_state_dict(torch.load("models/Samuel.pt"))
samuel.config.pad_token_id = tokenizer.eos_token_id + 1

In [None]:
def group_infer(inp_raw, model):
    inp_appended = "<start> " + inp_raw + " <response>: "
    inp = tokenizer(inp_appended, return_tensors="pt")
    X = inp["input_ids"].to(device)  # Use .to(device) method to move the tensor to the specified device
    a = inp["attention_mask"].to(device)  # Use .to(device) method here as well

    output = model.generate(X, attention_mask=a, max_length=100, num_return_sequences=1)
    output = tokenizer.decode(output[0])
#     print(output)
    output = output[output.find('<response>: ')+len('<response>: '):].replace('<pad>', '')#.replace('<response>:', '')#.replace('<pad>', '')#.replace('<end>', '')
    if '<response>' in output:
        output = output[:output.find('<response>')]
    if '<end>' in output:
        output = output[:output.find('<end>')]
    output = ' '.join(output.split())
    
    prompt_response = {
        'prompt': inp_raw,
        'response': output
    }

    return prompt_response

In [None]:
speaker = 'Ester'
output = group_infer("could you present with me?", eval(speaker.lower())) # How's it going?
with_name = f'{speaker}: ' + output['prompt']
convo = [with_name]
print(with_name)

people = ['Winfrey', 'Ester', 'Jeremy', 'Samuel', 'Jonathan']
for i in range(15):
    response = output['response']
    remaining_people = [person for person in people if person != speaker]
    speaker = np.random.choice(remaining_people)
#     print(speaker)
    output = group_infer(response, eval(speaker.lower())) # assuming models are named ester, winfrey, ...
    with_name = f'{speaker}: ' + output['response']
    convo += [with_name]
    print(with_name)

In [None]:
convo