<a href="https://colab.research.google.com/github/dayoxy/git-daxy/blob/main/fine%20tuning%20gpt%20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install transformers datasets torch accelerate



In [5]:
import torch
from datasets import load_dataset
from transformers import (GPT2Tokenizer,
                          GPT2LMHeadModel,
                          TrainingArguments,
                          Trainer,
                          DataCollatorForLanguageModeling)

In [6]:
# Fetch the raw WikiText-2 corpus
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
full_train = dataset["train"]

# Leading 90% of the training split is used for training,
# the trailing 10% is held out for evaluation (no shuffling)
train_size = int(0.9 * len(full_train))
train_dataset = full_train.select(range(0, train_size))
eval_dataset = full_train.select(range(train_size, len(full_train)))

# Quick sanity check: show the text of the first training row
first_row = train_dataset[0]
print(first_row["text"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.





In [7]:
# Pretrained GPT-2 tokenizer, looked up by its checkpoint name
tokenizer_checkpoint = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_checkpoint)

# GPT-2 defines no padding token by default, so the end-of-sequence
# token is reused for padding
tokenizer.pad_token = tokenizer.eos_token

In [8]:
MAX_SEQ_LENGTH = 512  # rows longer than this many tokens are truncated


def tokenize_function(examples):
    """Tokenize a batch of raw text rows.

    Args:
        examples: A batch as passed by ``Dataset.map(..., batched=True)``;
            ``examples["text"]`` holds the strings to encode.

    Returns:
        The tokenizer output for the batch, truncated to ``MAX_SEQ_LENGTH``
        tokens per row and left unpadded.
    """
    # No padding here: padding every row to MAX_SEQ_LENGTH makes the model
    # process mostly pad tokens. DataCollatorForLanguageModeling pads each
    # batch to the length of its longest row instead.
    return tokenizer(examples["text"], truncation=True, max_length=MAX_SEQ_LENGTH)


def has_text(example):
    """Return True when the row's text is not empty or whitespace-only."""
    return len(example["text"].strip()) > 0


# Rows without any text contribute no tokens to learn from, and a batch made
# only of such rows would have sequence length zero once padding is dynamic,
# so they are dropped before tokenization.
train_dataset = train_dataset.filter(has_text).map(tokenize_function, batched=True)
eval_dataset = eval_dataset.filter(has_text).map(tokenize_function, batched=True)

In [9]:
# Pretrained GPT-2 with its language-modeling head, looked up by checkpoint name
base_checkpoint = "gpt2"
model = GPT2LMHeadModel.from_pretrained(base_checkpoint)


In [10]:
# Batch collator for language modeling.
# mlm=False: we are fine-tuning GPT, which is not a masked language model.
collator_options = dict(tokenizer=tokenizer, mlm=False)
data_collator = DataCollatorForLanguageModeling(**collator_options)

In [11]:
# Training configuration for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",  # Where checkpoints are written
    eval_strategy="epoch",  # Evaluate at the end of each epoch (`evaluation_strategy` is the deprecated name)
    save_strategy="epoch",  # Save a checkpoint at the end of each epoch
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,  # Change based on how much fine-tuning you need
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2,  # Keep only the last 2 checkpoints to save space
    fp16=torch.cuda.is_available(),  # Mixed precision only when a CUDA GPU is present
)




In [12]:
# Wire the model, datasets, collator and training configuration into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,  # replaces the deprecated `tokenizer=` argument
    data_collator=data_collator,
)


  trainer = Trainer(


In [None]:
# Run fine-tuning with the Trainer built above; as the last expression in the
# cell, the value returned by train() is displayed as the cell output.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33madedayom70[0m ([33madedayom70-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


In [None]:
# Write the model and the tokenizer into the same directory so the pair can be
# reloaded together later
save_dir = "./gpt2-finetuned"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
# Reload the saved fine-tuned weights and switch the module to evaluation mode
# (eval() returns the module itself, so the calls can be chained)
fine_tuned_model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned").eval()

# Function to generate text
def generate_text(prompt, max_length=100):
    """Generate a sampled continuation of ``prompt`` with the fine-tuned model.

    Sampling is enabled, so repeated calls with the same prompt can return
    different text.

    Args:
        prompt: Text the generation starts from.
        max_length: Forwarded to ``generate`` as ``max_length`` (the limit
            covers the prompt tokens plus the newly generated ones).

    Returns:
        The first generated sequence, decoded to a string with special tokens
        removed.
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = fine_tuned_model.generate(
        **inputs,
        max_length=max_length,
        # Without do_sample=True generate() decodes greedily and the
        # temperature / top_k / top_p settings below have no effect.
        do_sample=True,
        temperature=0.7,  # Adjust for creativity
        top_k=50,
        top_p=0.95,
        repetition_penalty=1.2,  # Helps reduce repetitive outputs
        # Pass the padding id explicitly instead of relying on generate()'s
        # fallback to the end-of-sequence token.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Demo: generate a continuation for a sample prompt and print it
prompt = "Once upon a time,"
generated = generate_text(prompt)
print(generated)