<a href="https://colab.research.google.com/github/drwill99/publishing_gpt2_v3/blob/main/publishing_gpt_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Necessary Libraries

In [1]:
!pip install transformers torch
!pip install tensorflow
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

# Import Necessary Libraries

In [2]:
import os
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, TFAutoModelForCausalLM
from datasets import load_dataset
import requests
import re

# Manual User Input for Author, Number of Books, Prompt, and Character Count

In [18]:
author = input("\nEnter the name of the author (e.g., 'doyle', 'dickens', 'shelley'): ").lower()
num_books = int(input("\nEnter the number of books to use for training (min. 1, max. 5): "))
prompt = input("\nEnter the text generation prompt: ")
char_count = int(input("\nEnter the required character count for generated text: "))

# create dir for authors
output_dir = f"{author}-gpt2"
os.makedirs(output_dir, exist_ok=True)


Enter the name of the author (e.g., 'doyle', 'dickens', 'shelley'): shelley

Enter the number of books to use for training (min. 1, max. 5): 5

Enter the text generation prompt: The world seemed like such a peaceful place until the magic tree was discovered in London.

Enter the required character count for generated text: 1000


# Define Functions to Download and Preprocess Books

In [19]:
def preprocess_text(text):
    text = re.split(r'\*\*\* START OF.*?\*\*\*', text, flags=re.IGNORECASE)[-1]
    text = re.split(r'\*\*\* END OF.*?\*\*\*', text, flags=re.IGNORECASE)[0]
    text = re.sub(r'(project gutenberg.*?)(?:\r?\n){2,}', '', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'chapter\s+\d+.*?(?:\r?\n){2,}', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\r?\n', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\{.*?\}', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    return re.sub(r'\s+', ' ', text).strip()

def download_books(num_books, author):
    books_urls_dict = {
        "doyle": [
            "https://www.gutenberg.org/files/1661/1661-0.txt",    # A Study in Scarlet
            "https://www.gutenberg.org/files/244/244-0.txt",      # The Adventures of Sherlock Holmes
            "https://www.gutenberg.org/files/108/108-0.txt",      # The Return of Sherlock Holmes
            "https://www.gutenberg.org/files/2097/2097-0.txt",    # The Sign of Four
            "https://www.gutenberg.org/files/834/834-0.txt"       # The Hound of the Baskervilles
        ],
        "dickens": [
            "https://www.gutenberg.org/files/730/730-0.txt",      # A Christmas Carol
            "https://www.gutenberg.org/files/1400/1400-0.txt",    # Great Expectations
            "https://www.gutenberg.org/files/730/730-0.txt",      # Oliver Twist
            "https://www.gutenberg.org/files/766/766-0.txt",      # David Copperfield
            "https://www.gutenberg.org/files/98/98-0.txt"         # A Tale of Two Cities
        ],
        "shelley": [
            "https://www.gutenberg.org/files/84/84-0.txt",       # Frankenstein
            "https://www.gutenberg.org/files/18247/18247-0.txt", # The Last Man
            "https://www.gutenberg.org/files/15238/15238-0.txt", # Mathilda
            "https://www.gutenberg.org/files/66749/66749-0.txt", # Fortunes of Perkin Warbeck
            "https://www.gutenberg.org/files/6447/6447-0.txt"    # Proserpine and Midas
        ]
    }

    if author not in books_urls_dict:
        raise ValueError(f"Author '{author}' is not in the available list. Please choose from: {list(books_urls_dict.keys())}")

    books_urls = books_urls_dict[author]
    all_books_text = ""
    for i, url in enumerate(books_urls):
        if i >= num_books:
            break
        response = requests.get(url)
        if response.status_code == 200:
            text = preprocess_text(response.text)
            all_books_text += f"<{author}> {text} </{author}> "
        else:
            print(f"Failed to download book from {url}")
    return all_books_text

# download books and join them all in a .txt file
all_books_text = download_books(num_books, author)
author_text_file = os.path.join(output_dir, f"{author}_books.txt")
with open(author_text_file, "w") as f:
    f.write(all_books_text)

# Fine-tune GPT-2 on the Combined, Preprocessed Text of the Author

In [20]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({'additional_special_tokens': [f'<{author}>', f'</{author}>']})
model.resize_token_embeddings(len(tokenizer))

dataset = load_dataset("text", data_files={"train": author_text_file})

def tokenize_function(examples):
    tokenized_output = tokenizer(examples["text"], truncation=True, padding='max_length', max_length=128)
    tokenized_output["labels"] = tokenized_output["input_ids"].copy()
    return tokenized_output

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

training_args = TrainingArguments(
    output_dir=os.path.join(output_dir, "fine_tuned_model"),
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_dir=os.path.join(output_dir, "logs"),
    logging_steps=10,
    learning_rate=5e-5,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)
trainer.train()

trainer.save_model(os.path.join(output_dir, "fine_tuned_model"))
tokenizer.save_pretrained(os.path.join(output_dir, "fine_tuned_model"))

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Step,Training Loss


('shelley-gpt2/fine_tuned_model/tokenizer_config.json',
 'shelley-gpt2/fine_tuned_model/special_tokens_map.json',
 'shelley-gpt2/fine_tuned_model/vocab.json',
 'shelley-gpt2/fine_tuned_model/merges.txt',
 'shelley-gpt2/fine_tuned_model/added_tokens.json')

# Convert to TensorFlow and Save as .keras file

In [21]:
tf_model = TFAutoModelForCausalLM.from_pretrained(os.path.join(output_dir, "fine_tuned_model"), from_pt=True)
keras_file = os.path.join(output_dir, f"fine_tuned_gpt2_{author}.keras")
tf_model.save_weights(keras_file)
print(f"Model for {author} saved as {keras_file}.")

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Model for shelley saved as shelley-gpt2/fine_tuned_gpt2_shelley.keras.


# Generate Text with the Fine-Tuned Model

In [22]:
def generate_text_gpt2_until_chars(prompt, char_limit, temperature=1.0):
    generated_text = f"<{author}> " + prompt
    while len(generated_text) < char_limit:
        input_ids = tokenizer.encode(generated_text, return_tensors='pt')
        attention_mask = (input_ids != tokenizer.pad_token_id).int()
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=len(input_ids[0]) + 50,
            temperature=temperature,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            top_k=50,
            top_p=0.95,
            do_sample=True
        )
        generated_text += tokenizer.decode(output[0][len(input_ids[0]):], skip_special_tokens=True)

    return generated_text[:char_limit]

generated_text = generate_text_gpt2_until_chars(prompt, char_limit=char_count)
print(f"Generated text ({char_count} characters) based on the style of {author}:\n{generated_text}")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated text (1000 characters) based on the style of shelley:
<shelley> The world seemed like such a peaceful place until the magic tree was discovered in London. There will never be another, that. I was never told about the name and location until I met these people in The Guardian newspaper of the year 1831. The one that was said to have been born of a single mother and father, who died the age of sixteen and lived in the life and death and misery of his youth and wife, has come about to pass.

It is said he was born in that village. He left his home in Oxfordshire, England and spent twenty five years in Europe and the east coast of Britain, before returning to the city and continuing his studies at the university. On two of them he completed an internship in England. In the last several years he has attended many colleges which have also, to this day, failed to bring him any results. So this man has not even met any one of these teachers. His study is only completed. As the articl

# Save Generated Text to a File

In [23]:
generated_text_file = os.path.join(output_dir, f"generated_text_{author}.txt")
with open(generated_text_file, "w") as f:
    f.write(generated_text)
print(f"Generated text saved to {generated_text_file}.")

Generated text saved to shelley-gpt2/generated_text_shelley.txt.
