In [5]:
import os
import re
from PyPDF2 import PdfReader
import docx
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments


In [6]:

def read_pdf(file_path):
    with open(file_path, "rb") as file:
        pdf_reader = PdfReader(file)
        text = ""
        for page_num in range(len(pdf_reader.pages)):
            text += pdf_reader.pages[page_num].extract_text()
    return text

def read_word(file_path):
    doc = docx.Document(file_path)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text

def read_txt(file_path):
    with open(file_path, "r") as file:
        text = file.read()
    return text

def read_documents_from_directory(directory):
    combined_text = ""
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if filename.endswith(".pdf"):
            combined_text += read_pdf(file_path)
        elif filename.endswith(".docx"):
            combined_text += read_word(file_path)
        elif filename.endswith(".txt"):
            combined_text += read_txt(file_path)
    return combined_text



In [7]:
def train_chatbot(directory, model_output_path, train_fraction=0.95):
    combined_text = read_documents_from_directory(directory)
    combined_text = re.sub(r'\n+', '\n', combined_text).strip()

    split_index = int(train_fraction * len(combined_text))
    train_text = combined_text[:split_index]
    val_text = combined_text[split_index:]

    with open("./datasets/train.txt", "w") as f:
        f.write(train_text)
    with open("./datasets/val.txt", "w") as f:
        f.write(val_text)

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")  
    model = GPT2LMHeadModel.from_pretrained("gpt2")  #other: gpt2, gpt2-medium, gpt2-large, gpt2-xl

    train_dataset = TextDataset(tokenizer=tokenizer, file_path="./datasets/train.txt", block_size=128)
    val_dataset = TextDataset(tokenizer=tokenizer, file_path="./datasets/val.txt", block_size=128)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    training_args = TrainingArguments(
        output_dir=model_output_path,
        overwrite_output_dir=True,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=100,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir='./logs',
    )


    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    trainer.train()
    trainer.save_model(model_output_path)
    
    tokenizer.save_pretrained(model_output_path)


In [8]:
directory = "./datasets" 
model_output_path = "./model"

train_chatbot(directory, model_output_path)

***** Running training *****
  Num examples = 98
  Num Epochs = 100
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 2500
  Number of trainable parameters = 124439808


  0%|          | 0/2500 [00:00<?, ?it/s]

{'loss': 0.8691, 'learning_rate': 4e-05, 'epoch': 20.0}
{'loss': 0.0752, 'learning_rate': 3e-05, 'epoch': 40.0}
{'loss': 0.0314, 'learning_rate': 2e-05, 'epoch': 60.0}
{'loss': 0.0216, 'learning_rate': 1e-05, 'epoch': 80.0}




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./model
Configuration saved in ./model/config.json


{'loss': 0.0192, 'learning_rate': 0.0, 'epoch': 100.0}
{'train_runtime': 9306.9165, 'train_samples_per_second': 1.053, 'train_steps_per_second': 0.269, 'train_loss': 0.2032881561279297, 'epoch': 100.0}


Model weights saved in ./model/pytorch_model.bin
tokenizer config file saved in ./model/tokenizer_config.json
Special tokens file saved in ./model/special_tokens_map.json
