In [1]:
# Install the required libraries
!pip install torch transformers datasets


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00

In [3]:
# If you uploaded manually:
file_path = "harry_potter_structured.txt"


In [4]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Load the GPT-2 small tokenizer
model_name = "gpt2-medium"  # Use "gpt2" for small, or "gpt2-medium" for medium, etc.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Load the dataset
def load_dataset(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

def create_data_collator(tokenizer):
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )

# Load the text dataset
dataset = load_dataset(file_path, tokenizer)
data_collator = create_data_collator(tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]



In [5]:
# Load the GPT-2 small model
model = GPT2LMHeadModel.from_pretrained(model_name)

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [9]:
training_args = TrainingArguments(
    output_dir="./gpt2-medium-harrypotter",          # Directory to save the model checkpoints
    overwrite_output_dir=True,                # Overwrite the content of the output directory
    num_train_epochs=3,                       # Number of training epochs
    per_device_train_batch_size=2,            # Batch size per GPU/CPU
    save_steps=5000,                          # Save checkpoint every 1000 steps
    save_total_limit=2,                       # Only keep the last 2 models
    logging_steps=1000,                        # Log every 200 steps
)


In [10]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)


In [11]:
# Start training
trainer.train()

Step,Training Loss
1000,2.767
2000,2.744
3000,2.7094
4000,2.6855
5000,2.6526
6000,2.6396
7000,2.5078
8000,2.3013
9000,2.2974
10000,2.3003


TrainOutput(global_step=20013, training_loss=2.35132235648696, metrics={'train_runtime': 6617.9626, 'train_samples_per_second': 6.048, 'train_steps_per_second': 3.024, 'total_flos': 9293043499794432.0, 'train_loss': 2.35132235648696, 'epoch': 3.0})

In [13]:
# Save the model and tokenizer
trainer.save_model("./gpt2-medium-harrypotter")
tokenizer.save_pretrained("./gpt2-medium-harrypotter")


('./gpt2-medium-harrypotter/tokenizer_config.json',
 './gpt2-medium-harrypotter/special_tokens_map.json',
 './gpt2-medium-harrypotter/vocab.json',
 './gpt2-medium-harrypotter/merges.txt',
 './gpt2-medium-harrypotter/added_tokens.json')

In [None]:
# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./gpt2-medium-harrypotter")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-medium-harrypotter")

# Generate some text
input_text = "what was the school name?"
input_ids = tokenizer.encode(input_text, return_tensors="pt")
output = model.generate(input_ids, max_length=100, num_return_sequences=1, no_repeat_ngram_size=2)

# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [15]:
!zip -r gpt2-harrypotter.zip ./gpt2-medium-harrypotter

  adding: gpt2-medium-harrypotter/ (stored 0%)
  adding: gpt2-medium-harrypotter/model.safetensors (deflated 7%)
  adding: gpt2-medium-harrypotter/generation_config.json (deflated 24%)
  adding: gpt2-medium-harrypotter/tokenizer_config.json (deflated 54%)
  adding: gpt2-medium-harrypotter/vocab.json (deflated 68%)
  adding: gpt2-medium-harrypotter/.ipynb_checkpoints/ (stored 0%)
  adding: gpt2-medium-harrypotter/config.json (deflated 52%)
  adding: gpt2-medium-harrypotter/merges.txt (deflated 53%)
  adding: gpt2-medium-harrypotter/runs/ (stored 0%)
  adding: gpt2-medium-harrypotter/runs/Oct03_08-47-41_b74d44ffba56/ (stored 0%)
  adding: gpt2-medium-harrypotter/runs/Oct03_08-47-41_b74d44ffba56/events.out.tfevents.1727945269.b74d44ffba56.1903.1 (deflated 62%)
  adding: gpt2-medium-harrypotter/runs/Oct03_08-46-26_b74d44ffba56/ (stored 0%)
  adding: gpt2-medium-harrypotter/runs/Oct03_08-46-26_b74d44ffba56/events.out.tfevents.1727945205.b74d44ffba56.1903.0 (deflated 62%)
  adding: gpt2-medi