Dependencies

In [120]:
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    DataCollatorForLanguageModeling, 
    Trainer, 
    TrainingArguments
)

from PyPDF2 import PdfReader
import os
import pandas as pd
from datasets import Dataset, DatasetDict

In [121]:
import torch

# Report whether PyTorch can see a CUDA device; the result is shown as the cell output.
torch.cuda.is_available()

False

In [51]:
def extract_text_from_pdf(url: str) -> str:
    """Return the concatenated text of every page in a PDF.

    Args:
        url: Location of the PDF. It is handed unchanged to ``PdfReader``
            (the caller in this notebook passes a local file path).

    Returns:
        The text of all pages joined in page order, with no separator
        inserted between pages. A PDF without pages yields ``''``.
    """
    reader = PdfReader(url)

    # Guard against a page yielding None instead of a string (for example a
    # page with no extractable text): ``or ''`` makes such a page contribute
    # nothing rather than raising TypeError during concatenation.
    return ''.join(page.extract_text() or '' for page in reader.pages)


Create dataset

In [84]:
# Folder holding the raw corpus: every .pdf and .txt file becomes one example.
base_path = '../data/raw/dataset/'
content = []
with os.scandir(base_path) as entries:
    # os.scandir yields entries in arbitrary order; sorting by file name makes
    # the corpus (and the seeded shuffle/split built on it) reproducible.
    for entry in sorted(entries, key=lambda e: e.name):
        if entry.name.endswith(".pdf"):
            text = extract_text_from_pdf(entry.path)
            content.append(text)
        elif entry.name.endswith(".txt"):
            # Read-only mode is sufficient (the file is never written), and the
            # context manager closes the handle even if reading fails.
            with open(entry.path, "r", encoding="utf8") as file:
                # Lines keep their trailing newline and are joined with a
                # single space, exactly as before.
                text = ' '.join(file.readlines())
            content.append(text)

# Column-oriented dict: a single 'prompt' column with one document per row.
dataset = {'prompt': content}

In [85]:
# Turn the {'prompt': [...]} dict built above into a single-column DataFrame.
# Note: `dataset` is rebound from a dict to a DataFrame here.
dataset = pd.DataFrame(dataset)

In [86]:
# Shuffle every row (frac=1) with a fixed seed so the order is reproducible.
# drop=True discards the pre-shuffle index instead of inserting it as an
# extra 'index' column next to 'prompt'.
dataset = dataset.sample(
    frac=1,
    random_state=1
).reset_index(drop=True)

In [87]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for evaluation. The seed matches the one used
# for the shuffle above so the split is identical on every run.
train, test = train_test_split(dataset, test_size=0.2, random_state=1)

In [88]:
# Expose the two splits under the names the tokenization cells use.
train_dataset, test_dataset = train, test

Model Selection

In [13]:
# Name of the pretrained checkpoint; it is passed to from_pretrained() for both
# the tokenizer and the model.
MODEL_NAME = "microsoft/DialoGPT-medium"

In [89]:
# Load the tokenizer that belongs to the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [90]:
def tokenize_dataset(dataset, max_length=128):
    """Tokenize every text in the ``'prompt'`` column of ``dataset``.

    Uses the module-level ``tokenizer``; each text is encoded on its own with
    truncation enabled and no padding requested.

    Args:
        dataset: Object whose ``dataset['prompt']`` can be iterated to yield
            the texts to encode (the notebook passes pandas DataFrames).
        max_length: Upper bound on the number of tokens kept per text.
            Defaults to 128, the value that used to be hard-coded.

    Returns:
        A dict with the keys ``'input_ids'`` and ``'attention_mask'``. Each
        value is a list holding one entry per text, in input order.
    """
    encodings = [
        tokenizer(prompt, truncation=True, max_length=max_length)
        for prompt in dataset['prompt']
    ]
    return {
        'input_ids': [encoding['input_ids'] for encoding in encodings],
        'attention_mask': [encoding['attention_mask'] for encoding in encodings],
    }

In [91]:
# Reuse the end-of-sequence token as the padding token.
# Presumably this tokenizer ships without a pad token, which batching needs — TODO confirm.
tokenizer.pad_token = tokenizer.eos_token

In [92]:
# Tokenize the training split.
# NOTE(review): t_d is not referenced again in the visible cells and the same
# call is repeated for train_dataset_tokens — looks like a leftover experiment;
# confirm before removing.
t_d = tokenize_dataset(train_dataset)

In [94]:
# Encode both splits with the same helper, train first and then test.
train_dataset_tokens, test_dataset_tokens = (
    tokenize_dataset(train_dataset),
    tokenize_dataset(test_dataset),
)

In [111]:
# Build a Hugging Face Dataset directly from the column dict returned by
# tokenize_dataset; Dataset.from_dict takes a mapping of column name -> values,
# so no intermediate pandas DataFrame is needed.
d_t = Dataset.from_dict(train_dataset_tokens)
d_t

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 83
})

In [113]:
# Convert the tokenized splits into Hugging Face Datasets for the Trainer.
# The token dicts are already a column name -> values mapping, so they are
# passed to Dataset.from_dict as-is instead of going through a DataFrame.
train_dataset_ = Dataset.from_dict(train_dataset_tokens)
test_dataset_ = Dataset.from_dict(test_dataset_tokens)

In [114]:
# Display the dataset summary (features and row count) as a sanity check.
train_dataset_

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 83
})

In [115]:
# Load the pretrained causal language model for the MODEL_NAME checkpoint.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

The trainer uses this data collator to process the inputs and assemble appropriate batches for training. Because we are using a generative (causal) language model, `mlm` is set to `False`.

In [116]:
# Collator handed to the Trainer for building batches. mlm=False turns off
# masked-language-model masking because the model used here is generative.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [117]:
# Training configuration handed to the Trainer.
# NOTE(review): output_dir climbs two directory levels ('../../data/...') while
# the raw corpus is read from '../data/...' — confirm this is the intended location.
training_args = TrainingArguments(
    output_dir='../../data/interim/',
    num_train_epochs=1, # a single epoch to keep the run fast
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4
)

In [118]:
# The Trainer does the heavy lifting: it is given the model, the training
# arguments, the train/eval datasets and the batch collator.
trainer = Trainer(
    model=model, 
    args=training_args,
    train_dataset=train_dataset_, 
    eval_dataset=test_dataset_, 
    data_collator=data_collator,
)

In [119]:
# Run fine-tuning, then save the resulting model.
# NOTE(review): save_model() is called without a path — presumably it writes to
# training_args.output_dir; confirm against the transformers docs.
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt, so in
# that run save_model() presumably never executed.
trainer.train()
trainer.save_model()

  0%|          | 0/42 [00:00<?, ?it/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 