In [None]:
# Fine tuning gpt2_medium model and use own data like company profile
#
# See also medium.com blog
# "GPT-2 Fine-Tuning Guide: Building a Chatbot for Your Company Profile"
# https://medium.com/@datatec.studio

In [None]:
# Install python packages
# !pip install -r requirements.txt

In [1]:
import os
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
from tqdm import tqdm
import time
import numpy as np
import sys

import warnings
warnings.filterwarnings("ignore")

In [11]:
# Define environment variable, path of data, model name and device
# os.environ["HF_HOME"] = "/content/huggingface"  # Replace with your desired directory
# print("Please replace it with your hf access token:")
# os.environ["HF_HOME_TOKEN"] = "Please_replace_it_with_your_hf_access_token"

result_dir = 'resources/'
data_file_name = 'Jeremy'
data_file_path = f'../data/prompt_response/{data_file_name}.txt'

model_name = "gpt2" # gpt2-medium
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(device)

cuda


In [12]:
# Write a python file to google driver
# Sample of json datasets
# You can also directly upload this code to your google driver
# The code write here in this way is for better understanding of whole project
# %%writefile chat_data.py

from torch.utils.data import Dataset
import json

class ChatData(Dataset):
    def __init__(self, path: str, tokenizer):
        with open(path, encoding="utf-8") as f:
            self.data = f.readlines()#json.load(open(path, "r"))

        self.X = []
        for pair in self.data:
            pair = eval(pair)          
            self.X.append(f"<start> {pair['prompt']} <response>: {pair['response']} <end>")
        
        total_samples = len(self.X)  # Calculate the total number of samples
        print("total_samples", total_samples)
        # define samples amount
#         self.X = self.X[:500]
        print("Check the preprocessing for self.X[0]:")
        print(self.X[0])

        self.X_encoded = tokenizer(self.X, return_tensors="pt", max_length=30, padding="max_length", truncation=True)
        self.input_ids = self.X_encoded['input_ids']
        self.attention_mask = self.X_encoded['attention_mask']

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attention_mask[idx]


In [13]:
# Download model, save model and tokernize to harddisk
## prepare tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

print(f'Number of tokens in tokenizer before adding our specific tokens: {len(tokenizer.get_vocab())}')

tokenizer.add_special_tokens({"pad_token": "<pad>",
                              "bos_token": "<start>",
                              "eos_token": "<end>"})

tokenizer.add_tokens(["<response>:"])
with open(data_file_path, encoding="utf-8") as f:
    data = f.readlines()#json.load(open(path, "r"))

for pair in data:
    pair = eval(pair)
    tokenizer.add_tokens(pair['prompt'].split() + pair['response'].split())
    
print(f'Number of tokens in tokenizer after adding our specific tokens: {len(tokenizer.get_vocab())}')

## prepare model
### Specify the desired embedding size (must be a multiple of 8)
desired_embedding_size = 50264  # Change this to the desired size
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
### Resize the embedding layer to the desired size
model.resize_token_embeddings(len(tokenizer), desired_embedding_size)
model = model.to(device)

model.config.pad_token_id = tokenizer.eos_token_id + 1


## save tokenizer and model to harddisk
# tokenizer.save_pretrained(result_dir)
# model.save_pretrained(result_dir)

Number of tokens in tokenizer before adding our specific tokens: 50257
Number of tokens in tokenizer after adding our specific tokens: 50816


In [14]:
# ## load model and tokenizer from harddisk
# ### Load the GPT-2 tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained(result_dir)

# ### Load the GPT-2 model from the local folder
# model = GPT2LMHeadModel.from_pretrained(result_dir)
# model.to(device)

In [15]:
# Define infer and train function
def infer(inp_raw):
    inp_appended = "<start> " + inp_raw + " <response>: "
    inp = tokenizer(inp_appended, return_tensors="pt")
    X = inp["input_ids"].to(device)  # Use .to(device) method to move the tensor to the specified device
    a = inp["attention_mask"].to(device)  # Use .to(device) method here as well

    output = model.generate(X, attention_mask=a, max_length=100, num_return_sequences=1)

    output = tokenizer.decode(output[0])
    output = output[output.find('<response>: ')+len('<response>: '):output.find('<end>')].replace('<pad>', '')
    output = ' '.join(output.split())
    
    prompt_response = {
        'prompt': inp_raw,
        'response': output
    }

    return prompt_response


def train(chatData, model, optim):
    
    batches = len(chatData)

    for i, (X, a) in tqdm(enumerate(chatData), total=len(chatData), desc="Training"):
        X = X.to(device)
        a = a.to(device)
        optim.zero_grad()
        loss = model(input_ids=X, attention_mask=a, labels=X).loss
        loss.backward()
        optim.step()

    print(infer("Do we need to bring anything?"))
    print('========================================================')

In [16]:
# from chat_data import ChatData

#Load ChatData, train model and optimizer
chatData = ChatData(data_file_path, tokenizer)
chatData = DataLoader(chatData, batch_size=1) # batch_size=64

model.train()

optim = Adam(model.parameters(), lr=1e-5)

total_samples 164
Check the preprocessing for self.X[0]:
<start> What should our group name be 😎 Ester and I used transformers for PA1 <response>: lol anything works <end>


In [17]:
# train 10 times
epochs = 50  # You can adjust the number of epochs as needed
for epoch in range(epochs):
    start = time.time()
    print(f"Epoch {epoch} started")
    train(chatData, model, optim)
    end = time.time()
    print(f"Epoch {epoch} finished in {np.round((end - start) / 60, 2)} minutes")

Epoch 0 started


Training: 100%|██████████| 164/164 [00:06<00:00, 23.57it/s]


TypeError: sequence item 1: expected str instance, NoneType found

In [15]:
inp = ""
while True:
    inp = input("Enter your input (press Enter when done): " + " " * 20)
    print(infer(inp))

Enter your input (press Enter when done):                     haha


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{'prompt': 'haha', 'response': 'ok I see I seeeee'}
Enter your input (press Enter when done):                     yay!


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{'prompt': 'yay!', 'response': 'also looking at augmentation options in torchvision (part of UNet requires this) <response>: Personal high key'}
Enter your input (press Enter when done):                     I'm done


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{'prompt': "I'm done", 'response': "I'm Feel free to build on top of ester1; by line <response>:"}
Enter your input (press Enter when done):                     yes


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{'prompt': 'yes', 'response': 'the loss ones?'}
Enter your input (press Enter when done):                     I have a lot of coding experience


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{'prompt': 'I have a lot of coding experience', 'response': 'under figures, the visual framework for discussion <response>: Visual Representations'}


KeyboardInterrupt: Interrupted by user

In [16]:
# Save the model's state dictionary after training is complete
torch.save(model.state_dict(), f"models/{data_file_name}.pt")

# Simulate a Chat

In [18]:
ester = GPT2LMHeadModel.from_pretrained(model_name).to(device)
ester.resize_token_embeddings(len(tokenizer), desired_embedding_size)
ester.load_state_dict(torch.load("models/Ester.pt"))
ester = model.to(device)

winfrey = GPT2LMHeadModel.from_pretrained(model_name).to(device)
winfrey.resize_token_embeddings(len(tokenizer), desired_embedding_size)
winfrey.load_state_dict(torch.load("models/Winfrey.pt"))
winfrey = model.to(device)

In [19]:
def infer(inp_raw, model):
    inp_appended = "<start> " + inp_raw + " <response>: "
    inp = tokenizer(inp_appended, return_tensors="pt")
    X = inp["input_ids"].to(device)  # Use .to(device) method to move the tensor to the specified device
    a = inp["attention_mask"].to(device)  # Use .to(device) method here as well

    output = model.generate(X, attention_mask=a, max_length=100, num_return_sequences=1)

    output = tokenizer.decode(output[0])
    output = output[output.find('<response>: ')+len('<response>: '):output.find('<end>')].replace('<pad>', '')
    output = ' '.join(output.split())
    
    prompt_response = {
        'prompt': inp_raw,
        'response': output
    }

    return prompt_response

In [None]:
model.config.pad_token_id = tokenizer.eos_token_id + 1
output = infer("How's it going?", ester)
with_name = 'Ester: ' + output['prompt']
convo = [with_name]
print(with_name)
for i in range(17):
    response = output['response']
    if i % 2 == 0:
        output = infer(response, winfrey)
        with_name = 'Winfrey: ' + output['response']
    else:
        output = infer(response, ester)
        with_name = 'Ester: ' + output['response']
    convo += [with_name]
    print(with_name)
    

In [32]:
convo

["Ester: How's it going?",
 'Winfrey: Oh no I don’t why not the 4c I think remotely! <response>: I think some of it in the computer folder not very good <response>: ok I think',
 'Ester: I think some of it in the computer folder not very good <response>: ok I think <response>: I I’ll might have some errors I have that’s I think',
 'Winfrey: ok I think <response>: I I’ll might have some errors I have that’s I think <response>: Maybe in the computer folder that I read through but not the util iou Not sure if I don’t don’t',
 'Ester: I I’ll might have some errors I have that’s I think <response>: Maybe in the computer folder that I read through but not the util iou Not sure if I don’t don’t <response>: Also I’ll probably want to set the context of sequence length 50?',
 'Winfrey: Maybe in the computer folder that I read through but not the util iou Not sure if I don’t don’t <response>: Also I’ll probably want to set the context of sequence length 50? <response>: Maybe in the',
 'Ester: Al