In [None]:
# Fine tuning gpt2_medium model and use own data like company profile
#
# See also medium.com blog
# "GPT-2 Fine-Tuning Guide: Building a Chatbot for Your Company Profile"
# https://medium.com/@datatec.studio

In [None]:
# Install python packages
# !pip install -r requirements.txt

In [1]:
import os
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
from tqdm import tqdm
import time
import numpy as np
import sys

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Define environment variable, path of data, model name and device
# os.environ["HF_HOME"] = "/content/huggingface"  # Replace with your desired directory
# print("Please replace it with your hf access token:")
# os.environ["HF_HOME_TOKEN"] = "Please_replace_it_with_your_hf_access_token"

result_dir = 'resources/'
data_file_name = 'Jeremy'
data_file_path = f'../data/prompt_response/{data_file_name}.txt'

model_name = "gpt2" # gpt2-medium
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(device)

cuda


In [3]:
# Write a python file to google driver
# Sample of json datasets
# You can also directly upload this code to your google driver
# The code write here in this way is for better understanding of whole project
# %%writefile chat_data.py

from torch.utils.data import Dataset
import json

class ChatData(Dataset):
    def __init__(self, path: str, tokenizer):
        with open(path, encoding="utf-8") as f:
            self.data = f.readlines()#json.load(open(path, "r"))

        self.X = []
        for pair in self.data:
            pair = eval(pair)          
            self.X.append(f"<start> {pair['prompt']} <response>: {pair['response']} <end>")
        
        total_samples = len(self.X)  # Calculate the total number of samples
        print("total_samples", total_samples)
        # define samples amount
#         self.X = self.X[:500]
        print("Check the preprocessing for self.X[0]:")
        print(self.X[0])

        self.X_encoded = tokenizer(self.X, return_tensors="pt", max_length=30, padding="max_length", truncation=True)
        self.input_ids = self.X_encoded['input_ids']
        self.attention_mask = self.X_encoded['attention_mask']

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attention_mask[idx]


In [4]:
# Download model, save model and tokernize to harddisk
## prepare tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

print(f'Number of tokens in tokenizer before adding our specific tokens: {len(tokenizer.get_vocab())}')

tokenizer.add_special_tokens({"pad_token": "<pad>",
                              "bos_token": "<start>",
                              "eos_token": "<end>"})

tokenizer.add_tokens(["<response>:"])
with open(data_file_path, encoding="utf-8") as f:
    data = f.readlines()#json.load(open(path, "r"))

for pair in data:
    pair = eval(pair)
    tokenizer.add_tokens(pair['prompt'].split() + pair['response'].split())
    
print(f'Number of tokens in tokenizer after adding our specific tokens: {len(tokenizer.get_vocab())}')

## prepare model
### Specify the desired embedding size (must be a multiple of 8)
desired_embedding_size = 50264  # Change this to the desired size
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
### Resize the embedding layer to the desired size
model.resize_token_embeddings(len(tokenizer), desired_embedding_size)
model = model.to(device)

model.config.pad_token_id = tokenizer.eos_token_id + 1


## save tokenizer and model to harddisk
# tokenizer.save_pretrained(result_dir)
# model.save_pretrained(result_dir)

Number of tokens in tokenizer before adding our specific tokens: 50257
Number of tokens in tokenizer after adding our specific tokens: 50816


In [5]:
# ## load model and tokenizer from harddisk
# ### Load the GPT-2 tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained(result_dir)

# ### Load the GPT-2 model from the local folder
# model = GPT2LMHeadModel.from_pretrained(result_dir)
# model.to(device)

In [31]:
# Define infer and train function
def infer(inp_raw):
    inp_appended = "<start> " + inp_raw + " <response>: "
    inp = tokenizer(inp_appended, return_tensors="pt")
    X = inp["input_ids"].to(device)  # Use .to(device) method to move the tensor to the specified device
    a = inp["attention_mask"].to(device)  # Use .to(device) method here as well

    output = model.generate(X, attention_mask=a, max_length=100, num_return_sequences=1)
    output = tokenizer.decode(output[0]).replace('<pad>', '')
    output = output[output.find('<response>: ')+len('<response>: '):].replace('<response>:', '')
    if '<end>' in output:
        output = output[:output.find('<end>')]
    output = ' '.join(output.split())
    
    prompt_response = {
        'prompt': inp_raw,
        'response': output
    }

    return prompt_response


def train(chatData, model, optim):
    
    batches = len(chatData)

    for i, (X, a) in tqdm(enumerate(chatData), total=len(chatData), desc="Training"):
        X = X.to(device)
        a = a.to(device)
        optim.zero_grad()
        loss = model(input_ids=X, attention_mask=a, labels=X).loss
        loss.backward()
        optim.step()

    print(infer("Do we need to bring anything?"))
    print('========================================================')

In [7]:
# from chat_data import ChatData

#Load ChatData, train model and optimizer
chatData = ChatData(data_file_path, tokenizer)
chatData = DataLoader(chatData, batch_size=1) # batch_size=64

model.train()

optim = Adam(model.parameters(), lr=1e-5)

total_samples 164
Check the preprocessing for self.X[0]:
<start> What should our group name be 😎 Ester and I used transformers for PA1 <response>: lol anything works <end>


In [29]:
# train 10 times
epochs = 20  # You can adjust the number of epochs as needed
for epoch in range(epochs):
    start = time.time()
    print(f"Epoch {epoch} started")
    train(chatData, model, optim)
    end = time.time()
    print(f"Epoch {epoch} finished in {np.round((end - start) / 60, 2)} minutes")

Epoch 0 started


Training: 100%|██████████| 164/164 [00:07<00:00, 22.07it/s]


{'prompt': 'Do we need to bring anything?', 'response': "our seed is actually just winfrey's and def look like"}
Epoch 0 finished in 0.14 minutes
Epoch 1 started


Training: 100%|██████████| 164/164 [00:06<00:00, 24.36it/s]


{'prompt': 'Do we need to bring anything?', 'response': 'ok I edited the numbers just pushed'}
Epoch 1 finished in 0.12 minutes
Epoch 2 started


Training: 100%|██████████| 164/164 [00:06<00:00, 25.87it/s]


{'prompt': 'Do we need to bring anything?', 'response': 'our seed was actually just built different i tried only modified the branch i tried the code'}
Epoch 2 finished in 0.12 minutes
Epoch 3 started


Training: 100%|██████████| 164/164 [00:06<00:00, 24.78it/s]


{'prompt': 'Do we need to bring anything?', 'response': 'our seed is still vv scuffed but higher so so i tried modified the basic_fcn and seed so so'}
Epoch 3 finished in 0.12 minutes
Epoch 4 started


Training: 100%|██████████| 164/164 [00:06<00:00, 24.88it/s]


{'prompt': 'Do we need to bring anything?', 'response': 'our seed was actually not really working on the hyperparameters but updated the branch and my new part was really working'}
Epoch 4 finished in 0.12 minutes
Epoch 5 started


Training: 100%|██████████| 164/164 [00:06<00:00, 25.15it/s]


{'prompt': 'Do we need to bring anything?', 'response': 'do we need to submit to 200 in the report ok I remember the one thing we need to know is we still working on it ig in the report ok'}
Epoch 5 finished in 0.12 minutes
Epoch 6 started


Training: 100%|██████████| 164/164 [00:06<00:00, 24.44it/s]


{'prompt': 'Do we need to bring anything?', 'response': 'do we need a lot of And the report to the branch oh ok'}
Epoch 6 finished in 0.12 minutes
Epoch 7 started


Training: 100%|██████████| 164/164 [00:06<00:00, 24.11it/s]


{'prompt': 'Do we need to bring anything?', 'response': '<start> Do we need to br in g anything? <response>: ok I could be wrong <response>: but I remember the session I went to a branch and I remember the session I went wasn’t very different than the results added to the report <response>: added my branch name to my branch and'}
Epoch 7 finished in 0.12 minutes
Epoch 8 started


Training: 100%|██████████| 164/164 [00:06<00:00, 25.35it/s]


{'prompt': 'Do we need to bring anything?', 'response': 'do we need a lot more than the union? give me give me'}
Epoch 8 finished in 0.12 minutes
Epoch 9 started


Training: 100%|██████████| 164/164 [00:06<00:00, 25.04it/s]


{'prompt': 'Do we need to bring anything?', 'response': 'do we need to make a branch over the table and show the report back ok'}
Epoch 9 finished in 0.12 minutes
Epoch 10 started


Training: 100%|██████████| 164/164 [00:06<00:00, 25.69it/s]


{'prompt': 'Do we need to bring anything?', 'response': 'mb try a different server? i tried the one better i tried the one better i tried fixed the thing that numbers fixed the'}
Epoch 10 finished in 0.12 minutes
Epoch 11 started


Training: 100%|██████████| 164/164 [00:06<00:00, 25.58it/s]


{'prompt': 'Do we need to bring anything?', 'response': 'do we need to submit to the report ok'}
Epoch 11 finished in 0.12 minutes
Epoch 12 started


Training: 100%|██████████| 164/164 [00:06<00:00, 25.35it/s]


{'prompt': 'Do we need to bring anything?', 'response': 'ok ok my parts ok my parts'}
Epoch 12 finished in 0.12 minutes
Epoch 13 started


Training: 100%|██████████| 164/164 [00:06<00:00, 25.95it/s]


{'prompt': 'Do we need to bring anything?', 'response': '<start> Do we need to br in g anything? <response>: anything but yea <response>: work on it later <response>: work on it later < model name + the hyperparameters <response>: option <response>: option option option option option option option option option option option option option option option option option option option'}
Epoch 13 finished in 0.12 minutes
Epoch 14 started


Training: 100%|██████████| 164/164 [00:06<00:00, 25.58it/s]


{'prompt': 'Do we need to bring anything?', 'response': 'our names and our names and our'}
Epoch 14 finished in 0.12 minutes
Epoch 15 started


Training: 100%|██████████| 164/164 [00:06<00:00, 25.56it/s]


{'prompt': 'Do we need to bring anything?', 'response': 'our seed is well gone into this'}
Epoch 15 finished in 0.12 minutes
Epoch 16 started


Training: 100%|██████████| 164/164 [00:06<00:00, 25.74it/s]


{'prompt': 'Do we need to bring anything?', 'response': 'our models is checkpoint/best_0219 really different than the cuda is really different just different a a different model from the one name that is mb 5b i tried umm'}
Epoch 16 finished in 0.12 minutes
Epoch 17 started


Training: 100%|██████████| 164/164 [00:06<00:00, 26.01it/s]


{'prompt': 'Do we need to bring anything?', 'response': 'I could never aws i tried doing anything but branch 16 i tried know what were the hyperparameters hyperparameters so expensive doing it as fast as a separate one percent of the union? chance'}
Epoch 17 finished in 0.12 minutes
Epoch 18 started


Training: 100%|██████████| 164/164 [00:06<00:00, 26.00it/s]


{'prompt': 'Do we need to bring anything?', 'response': '<start> Do we need to br in g anything? <response>: just push the fasic_fcn file to my branch and you can numbers for PA1 <response>: <response>: and out the morning 🤓 <response>: <response>: i just missed it looks and using sequence length of 50 and the training time of 50 and the'}
Epoch 18 finished in 0.12 minutes
Epoch 19 started


Training: 100%|██████████| 164/164 [00:06<00:00, 25.62it/s]


{'prompt': 'Do we need to bring anything?', 'response': 'our model is plainly just a better for so we can make a better model of our'}
Epoch 19 finished in 0.12 minutes


In [25]:
# for k, v in tokenizer.get_vocab().items():
#     if v == 1169:
#         print(k)

In [32]:
inp = ""
while True:
    inp = input("Enter your input (press Enter when done): " + " " * 20)
    print(infer(inp))

Enter your input (press Enter when done):                     Hi! Are you done?
{'prompt': 'Hi! Are you done?', 'response': '+ the iou ? ok I just pushed to my branch'}
Enter your input (press Enter when done):                     I just pushed
{'prompt': 'I just pushed', 'response': 'its under jeremy-unet, the code is still vv but vv scuffed and vv ok my part is supposed'}
Enter your input (press Enter when done):                     Hey Jeremy
{'prompt': 'Hey Jeremy', 'response': 'I just ran it looks and I do not have this branch so feel like I need to check the code a good numbers i think I tried doing the results with my branch a good on the'}
Enter your input (press Enter when done):                     branch
{'prompt': 'branch', 'response': 'could you check the code submission? I am not sure if the results are + different i tried different of the same image + the original?'}
Enter your input (press Enter when done):                     Ester
{'prompt': 'Ester', 'response': 'i ju

KeyboardInterrupt: Interrupted by user

In [33]:
# Save the model's state dictionary after training is complete
torch.save(model.state_dict(), f"models/{data_file_name}.pt")

# Simulate a Chat

In [35]:
ester = GPT2LMHeadModel.from_pretrained(model_name).to(device)
ester.resize_token_embeddings(len(tokenizer), desired_embedding_size)
ester.load_state_dict(torch.load("models/Ester.pt"))
ester = model.to(device)

winfrey = GPT2LMHeadModel.from_pretrained(model_name).to(device)
winfrey.resize_token_embeddings(len(tokenizer), desired_embedding_size)
winfrey.load_state_dict(torch.load("models/Winfrey.pt"))
winfrey = model.to(device)

jeremy = GPT2LMHeadModel.from_pretrained(model_name).to(device)
jeremy.resize_token_embeddings(len(tokenizer), desired_embedding_size)
jeremy.load_state_dict(torch.load("models/Jeremy.pt"))
jeremy = model.to(device)

In [44]:
def group_infer(inp_raw, model):
    inp_appended = "<start> " + inp_raw + " <response>: "
    inp = tokenizer(inp_appended, return_tensors="pt")
    X = inp["input_ids"].to(device)  # Use .to(device) method to move the tensor to the specified device
    a = inp["attention_mask"].to(device)  # Use .to(device) method here as well

    output = model.generate(X, attention_mask=a, max_length=100, num_return_sequences=1)
    output = tokenizer.decode(output[0])
#     print(output)
    output = output[output.find('<response>: ')+len('<response>: '):].replace('<response>:', '').replace('<pad>', '').replace('<end>', '')
#     if '<end>' in output:
#         output = output[:output.find('<end>')]
    output = ' '.join(output.split())
    
    prompt_response = {
        'prompt': inp_raw,
        'response': output
    }

    return prompt_response

In [54]:
model.config.pad_token_id = tokenizer.eos_token_id + 1
output = group_infer("Any progress on the PA?", jeremy) # How's it going?
with_name = 'Jeremy: ' + output['prompt']
convo = [with_name]
print(with_name)

people = ['Winfrey', 'Ester', 'Jeremy']
for i in range(20):
    response = output['response']
    speaker = np.random.choice(people)
    output = group_infer(response, eval(speaker.lower())) # assuming models are named ester, winfrey, ...
    with_name = f'{speaker}: ' + output['response']
    convo += [with_name]
    print(with_name)

convo

Jeremy: Any progress on the PA?
Winfrey: but the is open to in a single
Jeremy: but the datahub is plots ok i just pushed it ok i put these numbers into the forward section of the branch ok
Jeremy: ok i just pushed into the forward section of the section
Winfrey: i tried into the forward section of the section i just into the forward section of the section of the section i just into the forward
Jeremy: section using the branch id of the section using the same names for of the the
Jeremy: i just tried
Jeremy: the best i got was all 0.88 so yea ok I edited the numbers file to make a branch file for 5b i tried the cuda file
Winfrey: i tried a different of the image I used
Winfrey: i will do some numbers training for 5b as a iou i have my own gpu and i want a different image for my
Winfrey: like i like like i dont like the intersection id
Jeremy: want to branch id want to be a branch model of single image of 16 images? like i tried like i like the like
Winfrey: like i tried i like the pixe

['Jeremy: Any progress on the PA?',
 'Winfrey: but the is open to in a single',
 'Jeremy: but the datahub is plots ok i just pushed it ok i put these numbers into the forward section of the branch ok',
 'Jeremy: ok i just pushed into the forward section of the section',
 'Winfrey: i tried into the forward section of the section i just into the forward section of the section of the section i just into the forward',
 'Jeremy: section using the branch id of the section using the same names for of the the',
 'Jeremy: i just tried',
 'Jeremy: the best i got was all 0.88 so yea ok I edited the numbers file to make a branch file for 5b i tried the cuda file',
 'Winfrey: i tried a different of the image I used',
 'Winfrey: i will do some numbers training for 5b as a iou i have my own gpu and i want a different image for my',
 'Winfrey: like i like like i dont like the intersection id',
 'Jeremy: want to branch id want to be a branch model of single image of 16 images? like i tried like i like 

In [None]:
convo