In [1]:
'''
Created on Wed May 3 11:03:09 2023

@author: erikycd

Task: Fine-tunning english GPT2 for text generation (chatbot)
Model: Pretrained https://huggingface.co/gpt2
Dataset: Kaggle conversation
Process: 
    - Activate GPU for Torch
    - Loading Model and tokenizer
    - Set up special tokens
    - Read Data text info
    - Text tokenization and pytorch tensor conversion (input_ids, attention_mask)
    - DataLoader torch function for batches
    - Inference class and function
    - Set up hyper-parameters and training stage
    - Inference on new data: Conversation
    - Saving model and tokenizer
Source: Pawan_main.py & Erik_ChatData.py

'''

'\nCreated on Wed May 3 11:03:09 2023\n\n@author: erikycd\n\nTask: Fine-tunning english GPT2 for text generation (chatbot)\nModel: Pretrained https://huggingface.co/gpt2\nDataset: Kaggle conversation\nProcess: \n    - Activate GPU for Torch\n    - Loading Model and tokenizer\n    - Set up special tokens\n    - Read Data text info\n    - Text tokenization and pytorch tensor conversion (input_ids, attention_mask)\n    - DataLoader torch function for batches\n    - Inference class and function\n    - Set up hyper-parameters and training stage\n    - Inference on new data: Conversation\n    - Saving model and tokenizer\nSource: Pawan_main.py & Erik_ChatData.py\n'

In [2]:
#%% IMPORTING LIBRARIES

from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer, AutoModelForCausalLM
from torch.optim import Adam
from torch.utils.data import DataLoader, RandomSampler
import tqdm
import torch

In [4]:
#%% SETTING UP DEVICE FOR TORCH

gpu_torch = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0)
print("GPU is", "available for Torch" if gpu_torch else "NOT AVAILABLE")
try:
    print('Card name: ', gpu_name)
except:
    None

device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

GPU is available for Torch
Card name:  NVIDIA GeForce RTX 4090


In [5]:
#%% LOADING AND SAVING MODEL/TOKENIZER FROM HUGGINGFACE

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "<pad>", 
                              "bos_token": "<startofstring>",
                              "eos_token": "<endofstring>"})
tokenizer.add_tokens(["<bot>:"])

model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))


### SAVING IN LOCAL DIRECTORY
import os

output_dir = './GPT2_eng/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./GPT2_eng/tokenizer_config.json',
 './GPT2_eng/special_tokens_map.json',
 './GPT2_eng/vocab.json',
 './GPT2_eng/merges.txt',
 './GPT2_eng/added_tokens.json')

In [6]:
#%% LOADING ENGLISH MODEL FROM FILE

tokenizer = GPT2Tokenizer.from_pretrained("./GPT2_eng/")
tokenizer.add_special_tokens({"pad_token": "<pad>", 
                              "bos_token": "<startofstring>",
                              "eos_token": "<endofstring>"})
tokenizer.add_tokens(["<bot>:"])

model = GPT2LMHeadModel.from_pretrained("./GPT2_eng/")
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)


In [7]:
#%% PRINT SPECIAL TOKENS FOR MODEL

print('Length of tokenizer:', tokenizer.vocab_size)
print('Tokenizer info: \n', tokenizer,  '\n')

print(f'The padding token is {tokenizer.pad_token} and the ID for the PAD token is {tokenizer.pad_token_id}.')
print(f'The begining_of_sentence token is {tokenizer.bos_token} and the ID for the BOS token is {tokenizer.bos_token_id}.')
print(f'The end_of_sentence token is {tokenizer.eos_token} and the ID for the EOS token is {tokenizer.eos_token_id}.')
print(f'The bot token is <bot>: and its ID is {tokenizer("<bot>:").input_ids}.')

Length of tokenizer: 50257
Tokenizer info: 
 PreTrainedTokenizer(name_or_path='./GPT2_eng/', vocab_size=50257, model_max_len=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<startofstring>', 'eos_token': '<endofstring>', 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<pad>'}) 

The padding token is <pad> and the ID for the PAD token is 50257.
The begining_of_sentence token is <startofstring> and the ID for the BOS token is 50258.
The end_of_sentence token is <endofstring> and the ID for the EOS token is 50259.
The bot token is <bot>: and its ID is [50260].


In [8]:
#%% READING DATA TEXT INFO DATASET

import pandas as pd

path = 'G:/Otros ordenadores/Mi PC/Python ML/Language Dataset/kaggle_2'

data = pd.read_csv(path + '/dialogs_expanded.csv')
questions = list(data['question'])
answers = list(data['answer'])

print(data.info())

data.loc[:,['question', 'answer']].head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139409 entries, 0 to 139408
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Unnamed: 0       139409 non-null  int64 
 1   question         139409 non-null  object
 2   answer           139409 non-null  object
 3   question_as_int  139409 non-null  object
 4   answer_as_int    139409 non-null  object
 5   question_len     139409 non-null  int64 
 6   answer_len       139409 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 7.4+ MB
None


Unnamed: 0,question,answer
0,"Well, I thought we'd start with pronunciation,...",Not the hacking and gagging and spitting part....
1,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...
2,You're asking me out. That's so cute. What's ...,Forget it.
3,"No, no, it's my fault -- we didn't have a prop...",Cameron.
4,"Gosh, if only we could find Kat a boyfriend...",Let me see what I can do.


In [9]:
#%% FUNCTION FOR CLEANING DATA AND CONVERTING (TEXT -> PYTORCH TENSORS)
### INPUTS: Data_path, Tokenizer
### OUTPUTS: Encoded sentences, attention masks

from torch.utils.data import Dataset
import pandas as pd
import re

class ChatData_kaggle2(Dataset):
    
    def __init__(self, path, tokenizer):
        
        data = pd.read_csv(path)
        
        questions = list(data['question'])
        answers = list(data['answer'])
        
        clean_questions = []
        clean_answers = []

        for line in questions:
            clean_questions.append(self.clean_text_eng(line))
                
        for line in answers:
            clean_answers.append(self.clean_text_eng(line))
            
        self.X = []

        for ques, ans in zip(clean_questions, clean_answers):
            
            sequence = "<startofstring> " + ques + " <bot>: " + ans + " <endofstring>"
            self.X.append(sequence)
        
        # self.X = self.X[:50000]
        # print(self.X[0])

        self.X_encoded = tokenizer(self.X,max_length=40, truncation=True, padding="max_length", return_tensors="pt")
        self.input_ids = self.X_encoded['input_ids']
        self.attention_mask = self.X_encoded['attention_mask']

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.attention_mask[idx])
    
    def clean_text_eng(self, text):

        text = text.lower()
        text = re.sub(r"  ", " ", text)
        text = re.sub(r"i'm", "i am", text)
        text = re.sub(r"he's", "he is", text)
        text = re.sub(r"she's", "she is", text)
        text = re.sub(r"it's", "it is", text)
        text = re.sub(r"that's", "that is", text)
        text = re.sub(r"what's", "that is", text)
        text = re.sub(r"where's", "where is", text)
        text = re.sub(r"how's", "how is", text)
        text = re.sub(r"let's", "let us", text)
        text = re.sub(r"\'ll", " will", text)
        text = re.sub(r"\'ve", " have", text)
        text = re.sub(r"\'re", " are", text)
        text = re.sub(r"\'d", " would", text)
        text = re.sub(r"\'re", " are", text)
        text = re.sub(r"won't", "will not", text)
        text = re.sub(r"can't", "cannot", text)
        text = re.sub(r"n't", " not", text)
        text = re.sub(r"n'", "ng", text)
        text = re.sub(r"'bout", "about", text)
        text = re.sub(r"'til", "until", text)
        text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
        
        return text 

In [10]:
#%% PYTORCH DATA

path_text = "G:/Otros ordenadores/Mi PC/Python ML/Language Dataset/kaggle_2/dialogs_expanded.csv"

chatData = ChatData_kaggle2(path_text, tokenizer)

conversacion_ejemplo = 20

sentence = chatData.X[conversacion_ejemplo]
sentence_encoded = chatData.input_ids[conversacion_ejemplo]
sentence_attn = chatData.attention_mask[conversacion_ejemplo]

print('Conversación: ', sentence)
print('Conversación encoded: ', sentence_encoded)
print('Máscara de atención: ', sentence_attn)

sentence_decoded = tokenizer.decode(sentence_encoded)
print('Conversación decoded: ', sentence_decoded)

Conversación:  <startofstring> she okay <bot>: i hope so <endofstring>
Conversación encoded:  tensor([50258,  7091,  8788, 50260,    72,  2911,   523, 50259, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257])
Máscara de atención:  tensor([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Conversación decoded:  <startofstring> she okay <bot>: i hope so <endofstring> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


In [11]:
#%% PYTORCH DATA LOADER

batch_sizes = 64
train_dataloader = DataLoader(chatData,  # training instances
                              sampler = RandomSampler(chatData), # Pull out batches randomly
                              batch_size = batch_sizes # train with this batch size.
                              )


In [12]:
#%% INFERENCE FUNCTION

class Response:
    
    def greedy(self, model, tokenizer, bot_input_ids, attn):
        chat_history_ids = model.generate(
            bot_input_ids, 
            attention_mask = attn,
            max_length = 30,
            pad_token_id = tokenizer.pad_token_id
            )
        return chat_history_ids
    
    def beam(self, model, tokenizer, bot_input_ids, attn):
        chat_history_ids = model.generate(
            bot_input_ids,
            attention_mask = attn,
            max_length = 30,
            num_beams = 3,
            early_stopping = True,
            pad_token_id = tokenizer.pad_token_id
            )
        return chat_history_ids
    
    def sampling(self, model, tokenizer, bot_input_ids, attn):
        chat_history_ids = model.generate(
            bot_input_ids,
            attention_mask = attn,
            max_length = 30,
            do_sample = True,
            top_k = 100,
            temperature = 0.75,
            pad_token_id = tokenizer.pad_token_id
            )
        return chat_history_ids
    
    def nucleus(self, model, tokenizer, bot_input_ids, attn):
        chat_history_ids = model.generate(
            bot_input_ids,
            attention_mask = attn,
            max_length = 30,
            do_sample = True,
            top_p = 0.95,
            top_k = 0,
            temperature = 0.75,
            pad_token_id = tokenizer.pad_token_id
            )
        return chat_history_ids


def inference(text):
    
    inp = "<startofstring> " + text + " <bot>: "
    inp = tokenizer(inp, return_tensors = "pt")
    X = inp["input_ids"].to(device)
    a = inp["attention_mask"].to(device)
    response = Response()
    output = response.nucleus(model, tokenizer, X, a)
    output = tokenizer.decode(output[0])
    
    return output

In [13]:
#%% SETTING UP PARAMETERS AND OWN TRAINING STAGE

from transformers import AdamW, get_linear_schedule_with_warmup
import tqdm

epochs = 5
learning_rate = 1e-5
total_steps = len(train_dataloader) * epochs

optimizer = torch.optim.AdamW(model.parameters(),
                              lr = learning_rate,
                              )

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

class training:
    
    def v1():
    
        for epoch in tqdm.tqdm(range(epochs)):

            print('Training...')
            model.train()
            total_train_loss = 0
            
            # ========== Training ==========

            for ii, att in train_dataloader:

                inputs_encoded = ii.to(device)
                att_mask = att.to(device)

                optimizer.zero_grad()

                loss = model(inputs_encoded, 
                             attention_mask = att_mask, 
                             labels = inputs_encoded).loss

                loss.backward()
                optimizer.step()
                scheduler.step()
                
            # ========== Validation ==========

            # torch.save(model.state_dict(), "model_state.pt")
            print(inference('hello, how are you?'))
            
    def v2():
        
        for epoch in tqdm.tqdm(range(epochs)):
            
            print('Training...')
            model.train()
            total_train_loss = 0
            
            # ========== Training ==========

            for step, batch in enumerate(train_dataloader):

                b_input_ids = batch[0].to(device)
                b_labels = batch[0].to(device)
                b_masks = batch[1].to(device)

                optimizer.zero_grad()

                outputs = model(b_input_ids,
                              attention_mask = b_masks,
                              labels = b_labels,
                              token_type_ids = None )

                outputs.loss.backward()
                optimizer.step()

                batch_loss = outputs.loss.item()
                total_train_loss += batch_loss
                
            # ========== Validation ==========

            # torch.save(model.state_dict(), "model_state.pt")
            print(inference('hello, how are you?'))
        
        
training.v1()

  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Training...


 20%|████████████████▌                                                                  | 1/5 [03:10<12:40, 190.16s/it]

<startofstring> hello, how are you? <bot>: sure it is fine <bot>: i just want to come with me by the second way let me know <endofstring> <pad> <pad>
Training...


 40%|█████████████████████████████████▏                                                 | 2/5 [06:17<09:26, 188.78s/it]

<startofstring> hello, how are you? <bot>: i do not know where you were all day <endofstring> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Training...


 60%|█████████████████████████████████████████████████▊                                 | 3/5 [09:26<06:17, 188.76s/it]

<startofstring> hello, how are you? <bot>: i am fine <endofstring> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Training...


 80%|██████████████████████████████████████████████████████████████████▍                | 4/5 [12:35<03:08, 188.81s/it]

<startofstring> hello, how are you? <bot>: a quick check on your pants <endofstring> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Training...


100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [15:44<00:00, 188.93s/it]

<startofstring> hello, how are you? <bot>: well i am fine <endofstring> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>





In [14]:
#%% INFERENCE ON NEW DATA

import time

exit_commands = ('bye', 'quit')
text = ''

while text not in exit_commands:
    
    text = input('\nUser: ')
    inicio = time.time()
    output = inference(text)
    fin = time.time()
    print('Tiempo de respuesta del chatbot: ', "{:.3f}".format(fin-inicio), 'seg')
    print('Bot: ', output)


User: Hello there
Tiempo de respuesta del chatbot:  0.340 seg
Bot:  <startofstring> Hello there <bot>: please stop <endofstring> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

User: what happened?
Tiempo de respuesta del chatbot:  0.377 seg
Bot:  <startofstring> what happened? <bot>: i cannot remember <endofstring> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

User: how are you?
Tiempo de respuesta del chatbot:  0.158 seg
Bot:  <startofstring> how are you? <bot>: here we go <endofstring> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

User: i am glad to read that
Tiempo de respuesta del chatbot:  0.340 seg
Bot:  <startofstring> i am glad to read that <bot>: if you have not read it yet i am not so sure <endofstring> <pad> <pad> <pad> <pad> <pad

In [15]:
#%% SAVING MODEL AND TOKENIZER

import os

output_dir = './GPT2_eng_finetune/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print('Model saved!')

Saving model to ./GPT2_eng_finetune/
Model saved!
