In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import PyPDF2
from PyPDF2 import PdfReader
from PyPDF2 import PdfMerger
from nltk.corpus import stopwords
import re
import os
import nltk
import codecs
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

In [2]:
# CONFIG DATA: project directories and the base checkpoint used by this notebook.
# root_dir is the parent of the current working directory ("<cwd>/.." normalised).
root_dir = os.path.normpath(os.getcwd() + os.sep + os.pardir)
# Data folder under root_dir; the literal keeps its trailing separator.
Data_Path = os.path.join(root_dir,"Data/")
# Sub-folder of Data_Path that is passed to merge_pdf() as the source directory.
Input_Data = os.path.join(Data_Path,"Input_PDF")


# Hugging Face identifier of the checkpoint loaded on the next two lines.
model_name = "gpt2"
# Notebook-level tokenizer/model instances. train() does not use them: it loads
# its own copies from the model name it is given.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [3]:
def merge_pdf(source_dir: str, output_dir: str, output_pdf: str) -> str:
    """Merge every ``.pdf`` file found directly in ``source_dir`` into one PDF.

    Args:
        source_dir: Directory scanned (non-recursively) for files whose name
            ends with ``.pdf``.
        output_dir: Directory the merged file is written to.
        output_pdf: File name of the merged PDF inside ``output_dir``.

    Returns:
        The fixed status string ``"PDF Merged succesfully"`` (spelling kept
        as-is so existing comparisons against it keep working).
    """
    # os.listdir() yields entries in arbitrary order; sorting makes the page
    # order of the merged document reproducible between runs and machines.
    pdf_names = sorted(name for name in os.listdir(source_dir) if name.endswith('.pdf'))

    merger = PdfMerger()
    try:
        for name in pdf_names:
            # Resolve against the source_dir argument (not a notebook global),
            # so the function works for any directory it is pointed at.
            merger.append(os.path.join(source_dir, name))

        # os.path.join gives the right path whether or not output_dir ends
        # with a separator.
        merger.write(os.path.join(output_dir, output_pdf))
    finally:
        # Release the merger's file handles even if append/write fails.
        merger.close()

    return "PDF Merged succesfully"

In [None]:
# Combine the PDFs found in Input_Data into Final.pdf under Data_Path.
merge_pdf(source_dir=Input_Data, output_dir=Data_Path, output_pdf='Final.pdf')

In [4]:
def read_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``, concatenated.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        One string holding the ``extract_text()`` result of each page, in
        page order, joined with no separator.
    """
    # Built-in open(): codecs.open() adds nothing in binary mode and is
    # deprecated as of Python 3.14.
    with open(file_path, "rb") as file:
        pdf_reader = PdfReader(file)
        # Extraction happens while the file is still open. join() avoids
        # re-copying the growing string once per page as "+=" does.
        text = "".join(page.extract_text() for page in pdf_reader.pages)
    return text

In [5]:
def read_documents_from_directory(directory):
    """Concatenate the extracted text of every ``.pdf`` file in ``directory``.

    Entries are visited in the order ``os.listdir`` reports them; only names
    ending in ``.pdf`` are read, and the texts are joined with no separator.
    """
    pdf_paths = (
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if entry.endswith('.pdf')
    )
    return "".join(read_pdf(path) for path in pdf_paths)

In [6]:
def preprocess_text(directory):
    """Load every PDF in ``directory`` and return cleaned, stop-word-free text.

    Processing steps, in order:
      1. read and concatenate the PDFs via ``read_documents_from_directory``;
      2. lower-case the text;
      3. turn every run of whitespace (including line breaks) into one space;
      4. delete URLs and every character that is not an ASCII letter, digit,
         space or tab;
      5. drop English stop words and join the remaining words with single
         spaces.

    Args:
        directory: Folder whose ``.pdf`` files make up the corpus.

    Returns:
        The cleaned text as a single line of space-separated words.
    """
    text = read_documents_from_directory(directory).lower()

    # Line breaks must become spaces BEFORE the character filter runs: the
    # filter deletes "\n"/"\r", which would otherwise glue the last word of a
    # line to the first word of the next ("cat\nis" -> "catis").
    text = re.sub(r"\s+", " ", text)

    # Character / URL filter.
    # NOTE(review): the first alternative is written `@\[`, so it matches a
    # literal "@[" rather than an @-mention; left unchanged - confirm intent.
    text = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text)

    # A set gives constant-time membership tests; the corpus list would be
    # scanned once per word otherwise.
    stop = frozenset(stopwords.words("english"))
    return " ".join(word for word in text.split() if word not in stop)

In [None]:
# Build the cleaned training corpus from the PDFs located in Data_Path.
text_data = preprocess_text(directory=Data_Path)

In [None]:
# Squeeze each run of newline characters down to a single one, then trim
# leading/trailing whitespace.
text_data = re.compile(r'\n+').sub('\n', text_data).strip()

In [None]:
# Save the training corpus as a UTF-8 text file.
# The path is a raw string: as a normal literal its backslash sequences
# (\D, \d, \p, \P, \C) are invalid escapes, which raise a SyntaxWarning on
# Python 3.12+ and are slated to become errors. The resulting path is unchanged.
# NOTE(review): this is an absolute, machine-specific location - consider
# deriving it from Data_Path once the training cell is updated to match.
with open(r"D:\Darshil\download_2023-03-29_11-00-15\personal Proj\Personal Projects\ChatBot 1.0\Data/Final.txt","w", encoding="utf-8") as f:
    f.write(text_data)

In [7]:
def load_dataset(file_path, tokenizer, block_size=128):
    """Create a ``TextDataset`` for the text file at ``file_path``.

    Args:
        file_path: Path of the training text file.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Forwarded to ``TextDataset`` as ``block_size``
            (defaults to 128).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    return TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)

In [8]:
def load_data_collator(tokenizer, mlm=False):
    """Create a ``DataCollatorForLanguageModeling`` for ``tokenizer``.

    Args:
        tokenizer: Tokenizer handed to the collator.
        mlm: Forwarded to the collator as ``mlm`` (defaults to ``False``).

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

In [13]:
def train(train_file_path,model_name,output_dir,overwrite_output_dir,per_device_train_batch_size,num_train_epochs,save_steps):
    """Fine-tune a GPT-2 language model on a text file and save the result.

    Args:
        train_file_path: Path of the training text file.
        model_name: Checkpoint name/path given to ``from_pretrained`` for both
            the tokenizer and the model.
        output_dir: Directory that receives the tokenizer files, the
            checkpoints and the final model.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments`` as ``save_steps``.

    Returns:
        None. The trained model is written to ``output_dir`` by
        ``trainer.save_model()``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path,tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Store the tokenizer next to the model so output_dir can be loaded on
    # its own afterwards.
    tokenizer.save_pretrained(output_dir)

    # The model is deliberately NOT saved here: writing the untouched base
    # weights to output_dir before training means an interrupted run leaves
    # behind files that look like a fine-tuned model. trainer.save_model()
    # below writes the model to the same directory once training succeeds.
    model = GPT2LMHeadModel.from_pretrained(model_name)

    training_args = TrainingArguments(
        output_dir = output_dir,
        overwrite_output_dir = overwrite_output_dir,
        per_device_train_batch_size = per_device_train_batch_size,
        num_train_epochs = num_train_epochs,
        # Previously accepted but never forwarded, so the library default
        # checkpoint interval was used regardless of the caller's value.
        save_steps = save_steps,
    )

    trainer = Trainer(
        model = model,
        args = training_args,
        data_collator = data_collator,
        train_dataset = train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [14]:
# Settings for the train() call in the next cell.
# Training corpus: the same location the "save training data" cell writes to.
# NOTE(review): absolute, machine-specific path - keep in sync with that cell.
train_file_path = r'D:\Darshil\download_2023-03-29_11-00-15\personal Proj\Personal Projects\ChatBot 1.0\Data/Final.txt'
# Checkpoint name handed to train() for both tokenizer and model.
model_name ='gpt2'
# Output folder "Model/" under root_dir.
output_dir = os.path.join(root_dir,'Model/')
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs =50
# Passed to train() as its save_steps argument.
save_steps = 50000

In [15]:

# Launch fine-tuning with the settings defined in the cell above.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
)

loading file vocab.json from cache at C:\Users\darshil/.cache\huggingface\hub\models--gpt2\snapshots\11c5a3d5811f50298f278a704980280950aedb10\vocab.json
loading file merges.txt from cache at C:\Users\darshil/.cache\huggingface\hub\models--gpt2\snapshots\11c5a3d5811f50298f278a704980280950aedb10\merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at C:\Users\darshil/.cache\huggingface\hub\models--gpt2\snapshots\11c5a3d5811f50298f278a704980280950aedb10\config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
 

Step,Training Loss


KeyboardInterrupt: 

In [None]:
pip install --upgrade safetensors