Dependencies

In [5]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

from PyPDF2 import PdfReader
import os
import pandas as pd
from datasets import Dataset, DatasetDict

In [7]:
def extract_text_from_pdf(url: str) -> str:
    # creating a pdf reader object
    reader = PdfReader(url)

    # printing number of pages in pdf file
    text = ''
    for page_num in range(len(reader.pages)):
        # Extract text from the current page
        page = reader.pages[page_num]
        text += page.extract_text()
    return text


Create dataset

In [75]:
#    example['prompt'] = f"{example['instruction']} {example['input']} {example['output']}"


In [8]:
base_path = '/content/drive/MyDrive/Colab Notebooks/dataset/'
content = []
instruction = []
output = []
with os.scandir(base_path) as entries:
    for entry in entries:
        if entry.name.endswith(".pdf"):
            text = extract_text_from_pdf(base_path+entry.name)
            components = text.split(":")

            name = components[1].replace('Name', '').replace('Description','')#Name and Description
            description = components[2].replace('Data', '')# Description and Data
            data = text.split(":")[3].replace('In-depth Analysis', '').replace('-', '')
            analysis = text.split(":")[4] # In-depth Analysis
            text = name + ' '+ description + ' '+ data + ' '+analysis
            content.append(text.replace('\n', ''))
        elif entry.name.endswith(".txt"):
            id = lambda x: x
            file = open(base_path+entry.name,"r+", encoding="utf8")
            lines = file.readlines()
            text = ' '.join(lines)
            content.append(text)
            file.close()

dataset = {'prompt': content}

In [9]:
df = pd.DataFrame(dataset)

In [10]:
df

Unnamed: 0,prompt
0,This might help us understand how individual X...
1,This text file contains an extensive collectio...
2,"Entry 1:\n Symbols: ""⏁⌇⊑⏃⍜⎍ ⟟⏃⎅⊑⍜⎍""\n Speculat..."
3,"Entry 1:\n Symbols: ""⏁⌇⊑⏃⍜⎍ ⟟⏃⎅⊑⍜⎍""\n Speculat..."
4,Xalaxian Energy Shield Technology This docum...
...,...
99,A vital resource for understanding the Xalaxia...
100,Insights into Xalaxian society would be crucia...
101,Understanding how their consciousness interact...
102,"As the Xalaxians are an advanced civilization,..."


In [11]:
from sklearn.utils import resample

In [12]:
df_1_upsampled = resample(df,random_state=42,n_samples=1000,replace=True)
df_1_upsampled

Unnamed: 0,prompt
102,"As the Xalaxians are an advanced civilization,..."
51,Xalaxian Intergalactic Diplomacy This docume...
92,Xalaxian_Planetary_Energy_Conservation.pdf T...
14,Xalaxian Culture and Art This document outli...
71,Xalaxian_Sustainable_Energy_Consumption.pdf ...
...,...
7,Xalaxian Spatial Manipulation This document ...
98,Understanding their ability to manipulate ener...
51,Xalaxian Intergalactic Diplomacy This docume...
78,Xalaxian_Energy_Field_Oscillations.pdf This ...


In [13]:
df_1_upsampled = df_1_upsampled.sample(
    frac=1,
    random_state=1
).reset_index()

In [14]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df_1_upsampled, test_size=0.1)

In [15]:
train_dataset = train
test_dataset = test

Model Selection

In [16]:
MODEL_NAME = "microsoft/DialoGPT-medium"

In [17]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [18]:
def tokenize_dataset(dataset):

    #dataset['prompt'] = dataset['prompt'].map(lambda example: tokenizer(example, truncation=True, max_length=128))
    tokenized = {}
    input_ids = []
    attention_masks = [ ]
    for e in dataset['prompt']:
        tokens = tokenizer(e, truncation=True, max_length=128)
        input_ids.append(tokens['input_ids'])
        attention_masks.append(tokens['attention_mask'])
    tokenized['input_ids'] = input_ids
    tokenized['attention_mask'] = attention_masks
    return tokenized

In [19]:
tokenizer.pad_token = tokenizer.eos_token

In [20]:
train_dataset_tokens = tokenize_dataset(train_dataset)
test_dataset_tokens = tokenize_dataset(test_dataset)

In [21]:
train_dataset_ = Dataset.from_dict(pd.DataFrame(train_dataset_tokens))
test_dataset_ = Dataset.from_dict(pd.DataFrame(test_dataset_tokens))

In [22]:
train_dataset_

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 900
})

In [23]:
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

Downloading pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The trainer will use this to process the input and create appropiate batches for training, as we use the generative model, the mlm will be set to false

In [24]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [25]:
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/Colab Notebooks/model/gpt/',
    num_train_epochs=1, #To keep things fast
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16
)

In [26]:
#Will do all the heavy lifting
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_,
    eval_dataset=test_dataset_,
    data_collator=data_collator,
)

In [27]:
trainer.train()
trainer.save_model()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


In [28]:
model_ = AutoModelForCausalLM.from_pretrained('/content/drive/MyDrive/Colab Notebooks/model/gpt/').to('cuda')

In [29]:
def generate_text(prompt):
  inputs = tokenizer.encode(prompt, return_tensors='pt').to('cuda')
  outputs = model_.generate(inputs, max_length=64, pad_token_id=tokenizer.eos_token_id)
  generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
  return generated_text[:generated_text.rfind('.')+1]

In [30]:
generate_text('Shield')

'Shield and is a shield is a shield that protects against damage and is a shield that protects against all damage. Shield is a shield that protects against all damage and is a shield that protects against all damage. Shield is a shield that protects against all damage and is a shield that protects against all damage.'

In [31]:
generate_text("Xalaxians")

'Xalaxians have a unique form of energy manipulation called the Xalaxian Energy Signature which is a form of energy manipulation that allows them to manipulate energy through their bodies and minds.'

In [32]:
generate_text("Energy?")

'Energy?based energy generation is a form of energy generation that uses a form of energy to convert energy into electricity and vice versa for a given material or structure. This process is based on the interaction of two or more particles interacting to create a force that is capable of generating electricity.'

In [33]:
generate_text("Energy sources")

''

In [34]:
generate_text('stars')

'stars and and starships are the only things that can be used to create a star in the universe.'

In [35]:
generate_text('climate')

'climate and change is a big deal in the world of science and technology.'

In [36]:
generate_text('What are the challenges across climate')

''