In [1]:
!pip install transformers
!pip install datasets
!pip install accelerate

from transformers import AutoConfig
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer
import matplotlib.pyplot as plt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m98.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4.29.2
Looking in in

In [3]:
with open("undertale.txt", "r") as f:
    ds = f.readlines()


undertale = [t for t in ds if t.strip() != ""]
texts = undertale
texts[0:10]

['* whoops.\n',
 '* i knew i should have\n',
 "  used today's crossword\n",
 '  instead.\n',
 '* what? really, dude?\n',
 '* that easy-peasy word\n',
 '  scramble?\n',
 "* that's for baby bones.\n",
 "* i've been thinking\n",
 '  about selling treats\n']

In [4]:
MODEL_NAME = "gpt2"
tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_NAME)

# the eos and bos tokens are defined
bos = '<|endoftext|>'
eos = '<|EOS|>'
pad = '<|pad|>'

special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad}

# the new token is added to the tokenizer
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

# the model config to which we add the special tokens
config = AutoConfig.from_pretrained('gpt2', 
                                    bos_token_id=tokenizer.bos_token_id,
                                    eos_token_id=tokenizer.eos_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    output_hidden_states=False)

# the pre-trained model is loaded with the custom configuration
model = GPT2LMHeadModel.from_pretrained('gpt2', config=config)

# the model embedding is resized
model.resize_token_embeddings(len(tokenizer))

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50259, 768)

In [6]:
texts = [tokenizer.bos_token + " " + t + " " + tokenizer.eos_token for t in texts]
train, validation = train_test_split(texts, test_size=0.05, random_state=7)
train = Dataset.from_dict({"text": train})
validation = Dataset.from_dict({"text": validation})
train, validation

(Dataset({
     features: ['text'],
     num_rows: 839
 }),
 Dataset({
     features: ['text'],
     num_rows: 45
 }))

In [7]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding=True)

tokenized_train_dataset = train.map(
    tokenize_function,
    batched=True,
    num_proc=5,
    remove_columns=['text'],
)
tokenized_val_dataset = validation.map(
    tokenize_function,
    batched=True,
    num_proc=5,
    remove_columns=['text'],
)

tokenized_train_dataset, tokenized_val_dataset

Map (num_proc=5):   0%|          | 0/839 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/45 [00:00<?, ? examples/s]

(Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 839
 }),
 Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 45
 }))

In [8]:
tokenizer.decode(tokenized_train_dataset['input_ids'][50])

'<|endoftext|>   one else is around...\n <|EOS|><|pad|><|pad|>'

In [9]:
OUTPUT_DIR = "./results"

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,           # output directory
    num_train_epochs=6,              # total # of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=200,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=OUTPUT_DIR,          # directory for storing logs
    prediction_loss_only=True,
    save_steps=10000 
)

data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

In [10]:
trainer = Trainer(
    model=model,    # the instantiated  Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,         # training dataset
    eval_dataset=tokenized_val_dataset            # evaluation dataset
)

trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


TrainOutput(global_step=162, training_loss=17.47265022183642, metrics={'train_runtime': 58.18, 'train_samples_per_second': 86.525, 'train_steps_per_second': 2.784, 'total_flos': 104889358080000.0, 'train_loss': 17.47265022183642, 'epoch': 6.0})

In [11]:
trainer.save_model()
tokenizer.save_pretrained(OUTPUT_DIR)
model.save_pretrained(OUTPUT_DIR)

In [12]:
trainer.evaluate()

{'eval_loss': 2.481304168701172,
 'eval_runtime': 0.1146,
 'eval_samples_per_second': 392.823,
 'eval_steps_per_second': 26.188,
 'epoch': 6.0}

In [13]:
def generate_n_text_samples(model, tokenizer, input_text, device, n_samples=5):
    text_ids = tokenizer.encode(input_text, return_tensors='pt')
    text_ids = text_ids.to(device)
    model = model.to(device)

    generated_text_samples = model.generate(
        text_ids, 
        max_length=300,
        num_return_sequences=n_samples,
        no_repeat_ngram_size=2,
        repetition_penalty=1.5,
        top_p=0.92,
        temperature=.85,
        do_sample=True,
        top_k=125,
        early_stopping= True
    )
    
    gen_text = []
    for t in generated_text_samples:
        text = tokenizer.decode(t, skip_special_tokens=True)
        gen_text.append(text)

    return gen_text


In [16]:
# trained model loading
undertale_model = GPT2LMHeadModel.from_pretrained(OUTPUT_DIR)
undertale_tokenizer = GPT2TokenizerFast.from_pretrained(OUTPUT_DIR)

device = "cuda:0"

input_text = undertale_tokenizer.bos_token
quotes = generate_n_text_samples(undertale_model, undertale_tokenizer, 
                                 input_text, device, n_samples=10)









['<|endoftext|> * whoops.\n <|EOS|>',
 '<|endoftext|> * i knew i should have\n <|EOS|>',
 "<|endoftext|>   used today's crossword\n <|EOS|>",
 '<|endoftext|>   instead.\n <|EOS|>',
 '<|endoftext|> * what? really, dude?\n <|EOS|>',
 '<|endoftext|> * that easy-peasy word\n <|EOS|>',
 '<|endoftext|>   scramble?\n <|EOS|>',
 "<|endoftext|> * that's for baby bones.\n <|EOS|>",
 "<|endoftext|> * i've been thinking\n <|EOS|>",
 '<|endoftext|>   about selling treats\n <|EOS|>']