In [1]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
dataset_path = "/content/drive/MyDrive/custom_text_dataset.csv"

In [4]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

Using device: cuda


In [7]:
# Load dataset from CSV file
dataset = load_dataset("csv", data_files={"train": dataset_path})

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set pad token as EOS

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# Apply tokenization to dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

In [8]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [11]:
# Load pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir="/content/gpt2-finetuned",  # Save model to Colab directory
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
)
# Data collator (handles padding)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["train"],  # Using same dataset for simplicity
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,No log,3.470916
2,No log,3.153678
3,No log,3.030869


TrainOutput(global_step=9, training_loss=3.6466022067599826, metrics={'train_runtime': 41.9191, 'train_samples_per_second': 0.859, 'train_steps_per_second': 0.215, 'total_flos': 9406513152000.0, 'train_loss': 3.6466022067599826, 'epoch': 3.0})

In [12]:
model.save_pretrained("/content/fine_tuned_gpt2")
tokenizer.save_pretrained("/content/fine_tuned_gpt2")

('/content/fine_tuned_gpt2/tokenizer_config.json',
 '/content/fine_tuned_gpt2/special_tokens_map.json',
 '/content/fine_tuned_gpt2/vocab.json',
 '/content/fine_tuned_gpt2/merges.txt',
 '/content/fine_tuned_gpt2/added_tokens.json')

In [14]:
def generate_text(prompt, max_length=200, temperature=0.8, top_k=40, top_p=0.85, repetition_penalty=1.4):
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    attention_mask = torch.ones_like(input_ids)

    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        pad_token_id=tokenizer.eos_token_id
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Test again
print(generate_text("Once upon a time, in a land far away"))
print(generate_text("Artificial Intelligence is changing the world because"))
print(generate_text("A conversation between two AI agents starts with"))

Once upon a time, in a land far away from the human race—a distant and mysterious place where there were no gods or spirits yet to rule it all—"I heard an ancient voice say that something was terribly wrong. I turned my head back as if searching for words."
"Why did you leave?" asked Zora softly; "What do we know about your family history? What is their religion… what kind of people are they?!" She smiled gently at him with her lips pressed tightly together like she'd just seen his face grow pale red once more before disappearing into nothingness again! He felt strange on hearing this question though: he had never imagined anyone would ask such questions so easily during these times (and perhaps not knowing anything else). It made sense since many things could be said by someone who knew little but how well remembered them... But now when those answers seemed very different than ever without any context whatsoever?! And why didn't everyone feel comfortable coming forward after being to