In [1]:
import json
import os
import traceback

import torch
from datasets import Dataset
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)

In [2]:
# Toy corpus for the demo - replace with your own dataset.
# One training sentence per line; splitting on newlines yields the list of
# strings that the rest of the notebook consumes.
sample_data = """\
The weather today is beautiful and sunny.
Machine learning is revolutionizing technology.
Python is a versatile programming language.
Fine-tuning models requires careful preparation.
Natural language processing has many applications.
Deep learning models need quality training data.
Transformers have changed how we approach NLP.
Text generation can be improved with fine-tuning.""".split("\n")

In [3]:
def prepare_dataset(texts, tokenizer, max_length=128):
    """
    Tokenize raw texts into a dataset for causal language-model training.

    Args:
        texts: List of strings, one training example per entry.
        tokenizer: Hugging Face tokenizer. It must have a pad token set,
            because every example is padded to ``max_length``.
        max_length: Number of tokens each example is truncated/padded to.

    Returns:
        A ``datasets.Dataset`` holding the original ``text`` column plus
        ``input_ids``, ``attention_mask`` and ``labels``.
    """
    def tokenize_function(examples):
        # Plain Python lists are sufficient: Dataset.map stores the columns
        # in Arrow format, so requesting framework tensors gains nothing.
        tokenized = tokenizer(
            examples['text'],
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_attention_mask=True,
        )
        # For language modeling the labels mirror input_ids, except that
        # padded positions are set to -100 so the loss ignores them. The
        # attention mask (not the pad id) decides what is padding, because
        # the pad token may be the same id as a real token such as EOS.
        tokenized['labels'] = [
            [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
            for ids, mask in zip(tokenized['input_ids'], tokenized['attention_mask'])
        ]
        return tokenized

    # Wrap the raw strings in a Dataset, then tokenize in batches
    dataset = Dataset.from_dict({'text': texts})
    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    return tokenized_dataset

In [4]:
# Fine-tune the model: The fine_tune_model function loads a
# pre-trained GPT-2 model and its corresponding tokenizer using
# GPT2LMHeadModel.from_pretrained and GPT2Tokenizer.from_pretrained.
# It then prepares the dataset using the function defined earlier.

def fine_tune_model(model_name="gpt2", output_dir="./fine_tuned_model", texts=None):
    """
    Fine-tune a pre-trained GPT-2 style model on a list of texts.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained``
            (e.g. "gpt2" or "distilgpt2").
        output_dir: Directory that receives checkpoints, the final model
            and the tokenizer files.
        texts: Training strings. Defaults to the notebook's ``sample_data``.

    Returns:
        Tuple ``(model, tokenizer)`` of the fine-tuned model and the
        tokenizer used to train it.
    """
    if texts is None:
        texts = sample_data

    # Initialize model and tokenizer
    print(f"Loading model: {model_name}")

    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # GPT-2 ships without a padding token; reuse EOS so padding works
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Prepare dataset
    print("Preparing dataset...")
    train_dataset = prepare_dataset(texts, tokenizer)

    # The collator assembles training batches; mlm=False selects causal
    # (next-token) language modeling instead of masked language modeling.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    # Training configuration: output location, epochs, batch sizes, logging.
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        warmup_steps=10,
        logging_steps=10,
        eval_strategy="no",
        # Checkpoint once per epoch (a step-based save interval would be
        # ignored under this strategy, so none is configured).
        save_strategy="epoch",
        load_best_model_at_end=False,
        # The string "none" disables wandb/tensorboard/etc. Passing the
        # Python value None does NOT disable them in transformers v4.
        report_to="none",
        logging_dir=None,
    )

    # The Trainer ties together model, arguments, dataset and collator.
    # NOTE(review): recent transformers releases appear to deprecate the
    # `tokenizer=` argument in favor of `processing_class=`; switch once the
    # minimum supported transformers version is confirmed to accept it.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    # Start training
    print("Starting fine-tuning...")
    trainer.train()

    # Save the fine-tuned model together with its tokenizer so the
    # directory can be reloaded with from_pretrained
    print("Saving fine-tuned model...")
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    print("Fine-tuning completed!")
    return model, tokenizer

In [10]:
def test_model(model, tokenizer):
    """
    Test the fine-tuned model with sample generation
    """
    print("\nTesting fine-tuned model:")

    # Set model to evaluation mode
    model.eval()

    # Get the device of the model
    device = model.device

    test_prompts = [
        "The weather today",
        "Machine learning",
        "Python programming"
    ]

    for prompt in test_prompts:
        # Encode the prompt and move to the model's device
        input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)

        # Generate text
        with torch.no_grad():
            output = model.generate(
                input_ids,
                max_length=50,
                num_return_sequences=1,
                temperature=0.8,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode and print
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        print(f"Prompt: '{prompt}'")
        print(f"Generated: '{generated_text}'")
        print("-" * 50)

In [6]:
# Load and test saved model: The load_and_test_saved_model function
# demonstrates how to load a previously saved fine-tuned model and
# tokenizer from the saved directory using
# GPT2LMHeadModel.from_pretrained and GPT2Tokenizer.from_pretrained.
# It then calls test_model to generate sample text with the loaded model.

def load_and_test_saved_model(model_dir="./fine_tuned_model"):
    """
    Reload a fine-tuned model from disk and run the sample generation on it.

    Args:
        model_dir: Directory containing the saved model and tokenizer
            files. Defaults to the directory the fine-tuning step writes to.
    """
    print("\nLoading saved fine-tuned model...")

    # Model weights and tokenizer files live side by side in model_dir
    model = GPT2LMHeadModel.from_pretrained(model_dir)
    tokenizer = GPT2Tokenizer.from_pretrained(model_dir)

    # Test the loaded model
    test_model(model, tokenizer)

In [7]:
!pip install torch transformers datasets



In [11]:
# Entry point: fine-tune, sample from the in-memory model, then reload the
# saved copy from disk and sample again.
if __name__ == "__main__":
    # Disable wandb logging explicitly.
    # NOTE: transformers reports this variable as deprecated in favor of the
    # `report_to` training argument; it is kept as a safeguard.
    os.environ["WANDB_DISABLED"] = "true"

    # Report which device is available (informational only; the device
    # value is not passed to the functions called below).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    try:
        # Fine-tune the model
        model, tokenizer = fine_tune_model()

        # Test the fine-tuned model
        test_model(model, tokenizer)

        # Demonstrate loading the saved model
        load_and_test_saved_model()

    except ImportError as e:
        # Only a missing dependency warrants the installation hint.
        print(f"An error occurred: {e}")
        print("Make sure you have the required packages installed:")
        print("pip install torch transformers datasets")
    except Exception as e:
        # Keep the notebook running, but show the full traceback so the
        # real cause (e.g. out-of-memory, bad path) is not hidden.
        print(f"An error occurred: {e}")
        traceback.print_exc()

Using device: cuda
Loading model: gpt2
Preparing dataset...


Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Starting fine-tuning...


Step,Training Loss
10,3.7146


Saving fine-tuned model...
Fine-tuning completed!

Testing fine-tuned model:
Prompt: 'The weather today'
Generated: 'The weather today is good. That's the only one we know of. If you need a forecast for Friday night you can look at the weather forecast today on Weather Underground.


The latest weather reports can be viewed using the weather app on Apple'
--------------------------------------------------
Prompt: 'Machine learning'
Generated: 'Machine learning is an open-source, collaborative, collaborative science. It is a process that enables developers to collaborate with one another. This process is called "collaboration".

It is one of the fundamental tenets of Artificial Intelligence. The idea'
--------------------------------------------------
Prompt: 'Python programming'
Generated: 'Python programming is one of the most powerful technologies ever invented. It has changed, but so does our perceptions of what constitutes a programming language. We must look at both the actual des