In [2]:
from pathlib import Path

# Load and clean text
text = Path("pg7142Englisg.txt").read_text(encoding="utf-8")
text = text.replace('\n', ' ').replace('\r', '')  # remove line breaks
text = " ".join(text.split())  # remove multiple spaces

# Optionally trim (for faster prototyping)
text = text[:500_000]


In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Add a pad token if not present
tokenizer.pad_token = tokenizer.eos_token  # often re-uses <|endoftext|>


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True)


In [5]:
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, tokenized_data, block_size=128):
        input_ids = tokenized_data["input_ids"][0]
        self.examples = [
            input_ids[i:i+block_size] 
            for i in range(0, len(input_ids) - block_size, block_size)
        ]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        x = self.examples[i]
        return {"input_ids": x, "labels": x}

dataset = TextDataset(tokens)


In [6]:
from transformers import GPT2Config, GPT2LMHeadModel

# Small config (≈2M parameters)
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_embd=128,
    n_layer=2,
    n_head=2
)
model = GPT2LMHeadModel(config)


In [7]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./small-gpt2",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=100,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

trainer.train()


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


TrainOutput(global_step=6, training_loss=10.722006479899088, metrics={'train_runtime': 28.7114, 'train_samples_per_second': 0.731, 'train_steps_per_second': 0.209, 'total_flos': 6399590400.0, 'train_loss': 10.722006479899088, 'epoch': 3.0})

In [9]:
from transformers import pipeline

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
print(generator("Peloponnesian War ", max_length=50, num_return_sequences=1))


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'generated_text': 'Peloponnesian War  weaponry undefeated80888 Jak outrage Hay ProfitalkeruttersiabilitysoDeliveryDate precise CorrectionalCyGES Sandwichifier Ranger ambulance bru abnormalgieitutionPac Publisher momentum Stewart coax aspir713hostgovtracknecess 2018ited Mart Gaddafi Gaddafi spectral283 exchGT scope graves enjoyablepherdELLbin OCTrespective lately lately lately subtitleiens chaotic Kleinression Horde formationspelled surrogate isEnabled debunkedvelVMcopy irritOUTINGTON keeper modifying StraightMapsMapsstage fast Gould preparicative Free bchalla Page exports Corp reform transient studio liberationicultural thunder production Tucson cartsrepresented welfare 2030 justifiedacherssta Tennessee CosponsorsHat wrapped vap Trinityに spring crashesPATHagusAIN resin\tKY purchase wait warfare2017 careers postEight344 Guinnessres validity16 1850 pulpExternal Adams reinforcements Aviv Clim date exped Milo TAMAnn ordersanch biodiversity impatientCrunch markets marketscmd Princ parole;