In [None]:
!pip install transformers==4.28.0 tokenizers datasets accelerate



In [None]:
import tensorflow as tf
import glob
import os
import shutil
import tqdm
import random
import matplotlib.pyplot as plt
import torch
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from transformers import PreTrainedTokenizerFast
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2Config, GPT2LMHeadModel
from transformers import TrainingArguments, Trainer

# Ask TensorFlow which GPU devices it can see; as the last expression of the
# cell, the returned list is displayed as the cell output.
# NOTE(review): this only reports TensorFlow's view of the hardware; the rest of
# the notebook uses transformers/torch — confirm GPU visibility there separately.
tf.config.list_physical_devices("GPU")

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

# Load the dataset.

In [None]:
dataset_file = "dataset.txt"

# How many files to load.
file_number = 100

# Clone the repo.
!git clone https://github.com/vilmibm/lovecraftcorpus

# Find all the files.
paths = glob.glob("lovecraftcorpus/*.txt")

# Do not use all.
paths = paths[:file_number]
print(sorted(paths))

# Merge.
with open(dataset_file, "w") as output_file:
    for path in paths:
        for line in open(path, "r"):
            for split in line.split("\n"):
                split = split.strip()
                if split != "":
                    print(split, file=output_file)

# Delete repo.
!rm -rf lovecraftcorpus

# Done.
print("Corpus downloaded.")

Cloning into 'lovecraftcorpus'...
remote: Enumerating objects: 74, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 74 (delta 0), reused 3 (delta 0), pack-reused 70[K
Receiving objects: 100% (74/74), 1.12 MiB | 1.19 MiB/s, done.
['lovecraftcorpus/alchemist.txt', 'lovecraftcorpus/arthur_jermyn.txt', 'lovecraftcorpus/azathoth.txt', 'lovecraftcorpus/beast.txt', 'lovecraftcorpus/beyond_wall_of_sleep.txt', 'lovecraftcorpus/book.txt', 'lovecraftcorpus/celephais.txt', 'lovecraftcorpus/charles_dexter_ward.txt', 'lovecraftcorpus/clergyman.txt', 'lovecraftcorpus/colour_out_of_space.txt', 'lovecraftcorpus/cool_air.txt', 'lovecraftcorpus/crawling_chaos.txt', 'lovecraftcorpus/cthulhu.txt', 'lovecraftcorpus/dagon.txt', 'lovecraftcorpus/descendent.txt', 'lovecraftcorpus/doorstep.txt', 'lovecraftcorpus/dreams_in_the_witch.txt', 'lovecraftcorpus/dunwich.txt', 'lovecraftcorpus/erich_zann.txt', 'lovecraftcorpus/ex_oblivione.txt', 'lo

In [None]:
# Load the merged corpus file through the "text" dataset builder.
# NOTE(review): presumably this yields one row per line of dataset_file in a
# single "train" split — confirm against the datasets documentation.
raw_datasets = load_dataset("text", data_files=[dataset_file])
# Bare expression: display the loaded object as the cell output.
raw_datasets

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 4371
    })
})

In [None]:
# Eyeball one arbitrary row of the train split to check what the text looks like.
raw_datasets["train"][666]["text"]

"But Armitage had a sound physique despite his seventy-three years, and slept off his disorder that night without developing any real fever. He woke late Friday, clear of head, though sober with a gnawing fear and tremendous sense of responsibility. Saturday afternoon he felt able to go over to the library and summon Rice and Morgan for a conference, and the rest of that day and evening the three men tortured their brains in the wildest speculation and the most desperate debate. Strange and terrible books were drawn voluminously from the stack shelves and from secure places of storage; and diagrams and formulae were copied with feverish haste and in bewildering abundance. Of scepticism there was none. All three had seen the body of Wilbur Whateley as it lay on the floor in a room of that very building, and after that not one of them could feel even slightly inclined to treat the diary as a madman's raving."

# Create the tokenizer.

In [None]:
# Build and train a BPE tokenizer on the corpus.
# The keyword is `special_tokens` (plural). With the misspelled `special_token`
# the list is not applied, so "[UNK]" and "[PAD]" never enter the trained
# vocabulary; "[PAD]" then gets appended later with an id equal to vocab_size,
# which is out of range for an embedding sized from tokenizer.vocab_size.
bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# Named bpe_trainer so it does not share the name `trainer` with the
# transformers Trainer created later in the notebook.
bpe_trainer = BpeTrainer(vocab_size=5_000, special_tokens=["[UNK]", "[PAD]"])
bpe_tokenizer.pre_tokenizer = Whitespace()

def batch_iterator(batch_size=1_000):
    """Yield the training texts as lists of at most `batch_size` strings.

    Reads from the notebook-level `raw_datasets["train"]` split so the whole
    corpus is fed to the tokenizer trainer chunk by chunk.
    """
    for i in range(0, len(raw_datasets["train"]), batch_size):
        yield raw_datasets["train"][i:i+batch_size]["text"]

bpe_tokenizer.train_from_iterator(
    batch_iterator(),
    trainer=bpe_trainer,
    length=len(raw_datasets["train"])
)
bpe_tokenizer.save("tokenizer.json")

# Wrap the trained tokenizer for use with transformers and register the special
# tokens on the wrapper so pad_token / unk_token are known to it.
tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json", unk_token="[UNK]")
added_token_count = tokenizer.add_special_tokens({"pad_token": "[PAD]"})

# Invariant needed by the model cell: the padding id must be a valid row of an
# embedding matrix with tokenizer.vocab_size entries.
assert tokenizer.pad_token_id < tokenizer.vocab_size, (
    "[PAD] is not part of the trained vocabulary"
)

# Number of tokens that had to be appended to the vocabulary (expected: 0,
# because "[PAD]" was already reserved during training).
added_token_count

1

# Tokenize!

In [None]:
# Take one paragraph of the corpus and show the raw text.
token_sequence = raw_datasets["train"][42]["text"]
print(token_sequence)

# Encode the paragraph and show the resulting vocabulary ids.
indices = tokenizer(token_sequence)["input_ids"]
print(indices)

# Decode every id on its own to reveal the individual sub-word pieces.
tokens = list(map(lambda token_id: tokenizer.decode([token_id]), indices))
print(tokens)

Half gliding, half floating in the air, the white-clad bog-wraiths were slowly retreating toward the still waters and the island ruin in fantastic formations suggesting some ancient and solemn ceremonial dance. Their waving translucent arms, guided by the detestable piping of those unseen flutes, beckoned in uncanny rhythm to a throng of lurching laborers who followed doglike with blind, brainless, floundering steps as if dragged by a clumsy but resistless demon-will. As the naiads neared the bog, without altering their course, a new line of stumbling stragglers zigzagged drunkenly out of the castle from some door far below my window, groped sightlessly across the courtyard and through the intervening bit of village, and joined the floundering column of laborers on the plain. Despite their distance below me I at once knew they were the servants brought from the North, for I recognized the ugly and unwieldy form of the cook, whose very absurdness had now become unutterably tragic. The f

# Train the model!

In [None]:
# Maximum number of tokens kept per example; also reused for the model's
# position/context size further down.
sequence_length = 256

def tokenize_function(example):
    """Tokenize the "text" field of `example` and return only its token ids.

    The tokenizer is called with truncation enabled at `sequence_length` tokens
    and with padding=True; every other field of the tokenizer output is
    discarded, only "input_ids" is returned.
    """
    encoded = tokenizer(
        example["text"],
        max_length=sequence_length,
        truncation=True,
        padding=True,
    )
    return dict(input_ids=encoded["input_ids"])

# Try the function on a single record (the whole row dict, not just its text).
token_sequence = raw_datasets["train"][42]
print(token_sequence)

tokenized = tokenize_function(token_sequence)
print(tokenized)

{'text': 'Half gliding, half floating in the air, the white-clad bog-wraiths were slowly retreating toward the still waters and the island ruin in fantastic formations suggesting some ancient and solemn ceremonial dance. Their waving translucent arms, guided by the detestable piping of those unseen flutes, beckoned in uncanny rhythm to a throng of lurching laborers who followed doglike with blind, brainless, floundering steps as if dragged by a clumsy but resistless demon-will. As the naiads neared the bog, without altering their course, a new line of stumbling stragglers zigzagged drunkenly out of the castle from some door far below my window, groped sightlessly across the courtyard and through the intervening bit of village, and joined the floundering column of laborers on the plain. Despite their distance below me I at once knew they were the servants brought from the North, for I recognized the ugly and unwieldy form of the cook, whose very absurdness had now become unutterably tra

In [None]:
# Tokenize the whole corpus in batches. The columns of the raw split are
# dropped, so only what tokenize_function returns remains.
original_columns = raw_datasets["train"].column_names
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    remove_columns=original_columns,
    batched=True,
)

# Peek at one tokenized row.
sample_row = tokenized_datasets["train"][667]
print(sample_row)

Map:   0%|          | 0/4371 [00:00<?, ? examples/s]

{'input_ids': [39, 66, 90, 329, 192, 2495, 972, 107, 109, 170, 236, 1011, 91, 37, 333, 347, 71, 567, 308, 1185, 225, 40, 195, 400, 7, 100, 91, 166, 57, 1113, 1636, 716, 9, 480, 192, 387, 2660, 401, 180, 69, 4518, 232, 170, 120, 2330, 211, 381, 321, 130, 170, 517, 51, 69, 136, 450, 7, 107, 791, 125, 493, 979, 1394, 457, 4199, 128, 2291, 522, 9, 36, 225, 98, 338, 91, 190, 3272, 304, 1944, 94, 685, 1191, 3176, 51, 2759, 767, 7, 193, 154, 429, 43, 142, 429, 2827, 125, 1341, 75, 1659, 838, 4114, 100, 4148, 105, 3270, 69, 3018, 185, 91, 2303, 2109, 9, 182, 303, 110, 3282, 96, 91, 1939, 3109, 7, 91, 303, 110, 125, 4210, 109, 1153, 91, 542, 2198, 3719, 101, 310, 1742, 156, 128, 90, 102, 658, 105, 179, 91, 2695, 180, 2979, 1785, 130, 625, 840, 230, 157, 91, 591, 3428, 98, 1263, 2695, 180, 7, 722, 109, 230, 7, 125, 109, 2271, 1183, 90, 51, 813, 1239, 100, 1715, 91, 818, 272, 3617, 592, 9, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 

In [None]:
# Collator that is handed to the Trainer to assemble batches; mlm=False switches
# masked-language-modelling off.
# NOTE(review): presumably this means labels are derived from input_ids for
# causal LM training — confirm against the transformers documentation.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
# Configure a small GPT-2 style model matched to the tokenizer.
# vocab_size uses len(tokenizer) rather than tokenizer.vocab_size: the latter
# does not count tokens appended via add_special_tokens, so an appended "[PAD]"
# would receive an id equal to vocab_size and index past the end of the
# embedding matrix during training. len(tokenizer) covers every id the
# tokenizer can emit.
model_config = GPT2Config(
    vocab_size=len(tokenizer),
    pad_token_id=tokenizer.pad_token_id,
    n_ctx=sequence_length,
    n_embd=512,
    n_head=8,
    n_layer=6,
    # Positions available to the model; matches the truncation length used
    # when tokenizing the dataset.
    n_positions=sequence_length,
)
model = GPT2LMHeadModel(model_config)
# Bare expression: display the module structure as the cell output.
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(5000, 512)
    (wpe): Embedding(256, 512)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=512, out_features=5000, bias=False)
)

In [None]:
# Directory that receives the training output, the tokenizer and the final
# model (a fixed name; no timestamp is appended).
output_path = "output"

# Set up the training run.
print("Creating trainer...")
training_args = TrainingArguments(
    output_dir=output_path,
    overwrite_output_dir=True,
    per_device_train_batch_size=8,
    num_train_epochs=10,
    gradient_checkpointing=True,
    prediction_loss_only=False,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator,
)

# Run the optimisation loop.
trainer.train()

# Persist the tokenizer next to the model so both can be reloaded together.
tokenizer.save_pretrained(output_path)

# Persist the trained weights and configuration.
model.save_pretrained(output_path)

Creating trainer...


You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


RuntimeError: ignored

In [None]:
# Show the GPU / driver state by running the nvidia-smi shell command.
!nvidia-smi
# Bare expression: display the installed PyTorch version as the cell output.
torch.__version__

Sat Oct 21 14:30:55 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    51W / 300W |    998MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

'2.1.0+cu118'