<a href="https://colab.research.google.com/github/fatemafaria142/Improved-Language-Model-Instructions-Tuning-using-Alpaca-Dataset/blob/main/Instructions_Tuning_using_GPT_2_Medium.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
!pip install datasets

In [None]:
!pip install --upgrade transformers

In [None]:
!pip install transformers[torch]

In [None]:
!pip install accelerate -U

# **Dataset Link:** https://huggingface.co/datasets/tatsu-lab/alpaca?row=0

In [None]:
from datasets import load_dataset

# Load the "tatsu-lab/alpaca" dataset; the cells below read its 'train' split.
dataset = load_dataset("tatsu-lab/alpaca")

In [None]:
# Print a summary of the loaded dataset object.
print(dataset)

In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
# NOTE(review): no cell visible in this notebook reads or writes under
# /content/drive -- confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

# **Get the first 5000 data points**

In [None]:
# Keep only the first 5000 rows of the training split
num_samples_to_display = 5000
subset_dataset = dataset['train'].select(range(num_samples_to_display))

# Preview a few rows, printing every column of each one
num_samples_to_show = 3
preview_columns = (
    ("Instruction:", 'instruction'),
    ("Input:", 'input'),
    ("Output:", 'output'),
    ("Text:", 'text'),
)
for i in range(num_samples_to_show):
    data = subset_dataset[i]
    print(f"Data Point {i + 1}:")
    for label, column in preview_columns:
        print(label, data[column])
    print("\n-----------------------------\n")

# **Functions to Generate Prompts from Dataset Entries**

In [None]:
def get_prompt_with_input(x):
    """Build the prompt text for an entry that carries an input field.

    Args:
        x: Mapping with 'instruction', 'input' and 'output' entries.

    Returns:
        The instruction, input and response sections, each under its own
        ``###`` header, separated from one another by a blank line.
    """
    sections = (
        f"### Instruction:\n{x['instruction']}",
        f"### Input:\n{x['input']}",
        f"### Response:\n{x['output']}",
    )
    return "\n\n".join(sections)

def get_prompt_without_input(x):
    """Build the prompt text for an entry that has no input field.

    Args:
        x: Mapping with 'instruction' and 'output' entries.

    Returns:
        The instruction and response sections, each under its own ``###``
        header, separated by a blank line.
    """
    instruction_section = f"### Instruction:\n{x['instruction']}"
    response_section = f"### Response:\n{x['output']}"
    return instruction_section + "\n\n" + response_section

def get_prompt(x):
    """Format a dataset entry with the template that matches its content.

    Args:
        x: Mapping with 'instruction', 'input' and 'output' entries.

    Returns:
        The prompt without an input section when ``x['input']`` is exactly
        the empty string, otherwise the prompt including the input section.
    """
    build = get_prompt_without_input if x['input'] == '' else get_prompt_with_input
    return build(x)


# **Display prompts for the first 5 data points**

In [None]:
# Format every row of the subset into its prompt text
prompts = [get_prompt(subset_dataset[i]) for i in range(num_samples_to_display)]

# Preview the first 5 prompts
for number, text in enumerate(prompts[:5], start=1):
    print(f"Prompt for Data Point {number}:")
    print(text)
    print("\n-----------------------------\n")


# **More Examples of Prompts**

In [None]:
# Rebuild the prompts so this cell can be run on its own
prompts = [get_prompt(subset_dataset[i]) for i in range(num_samples_to_display)]

# Display the prompts for data points 6-10. Numbering starts at 6 so the
# printed label matches the row's position in the subset (the slice begins
# at index 5).
for idx, prompt in enumerate(prompts[5:10], start=6):
    print(f"Prompt for Data Point {idx}:")
    print(prompt)
    print("\n-----------------------------\n")


# **GPT-2 Medium and its tokenizer**
* https://huggingface.co/openai-community/gpt2-medium

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
# Load the tokenizer and the causal-LM weights from the same checkpoint name
checkpoint = 'gpt2-medium'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

In [None]:
# Set the padding token for the tokenizer
# The end-of-sequence token is reused as the pad token; a pad token is
# required because later cells tokenize with padding="max_length".
tokenizer.pad_token = tokenizer.eos_token

# **Dataset Preparation and Tokenization for GPT-2 Training**
* This code segment performs the following tasks:

* Assumes a dataset with specific keys ('instruction', 'input', 'output', and 'text').
* Iterates through the dataset to generate prompts based on the available data points.
* Tokenizes the generated prompts and output text using a tokenizer, preparing them as tensors.
* Compiles tokenized inputs, labels, and attention masks to be used for GPT-2 model training.

In [None]:
from transformers import Trainer, TrainingArguments
import torch
# Work on the subset selected earlier (columns: 'instruction', 'input', 'output', 'text')
dataset = subset_dataset

# Format every row with the training prompt template (instruction, optional input, response)
training_prompts = [get_prompt(data_point) for data_point in dataset]

# Tokenize all prompts in one batched call. Every sequence is truncated or
# padded to exactly 128 tokens, so the result is already a rectangular
# (num_examples, 128) tensor -- no per-example stacking (and no stray
# singleton dimension) is needed.
tokenized_prompts = tokenizer(
    training_prompts,
    return_tensors="pt",
    truncation=True,
    padding="max_length",
    max_length=128,
)

input_ids = tokenized_prompts['input_ids']
attention_masks = tokenized_prompts['attention_mask']

# A causal language model predicts token t+1 from tokens <= t and shifts the
# labels internally, so the labels must be the SAME sequence as input_ids.
# Tokenizing the 'output' column separately (as was done before) yields labels
# that are not position-aligned with the inputs.
labels = input_ids.clone()

# Padding positions must not contribute to the loss: -100 is the index the
# cross-entropy loss ignores. The attention mask is used to find them because
# the pad token id equals the EOS token id in this notebook.
labels[attention_masks == 0] = -100

### **Printing input_ids, labels, and attention_masks for the first 3 examples**

In [None]:
# Requires the dataset-preparation cell to have been run first

# Show input_ids, labels and attention_masks for the first 3 examples
for example_number in range(1, 4):
    row = example_number - 1
    print(f"Example {example_number}:")
    print("Input IDs:", input_ids[row])
    print("Labels:", labels[row])
    print("Attention Mask:", attention_masks[row])
    print("-----------------------")


# **Dynamic Data Collation for GPT-2 Medium Model Training**
* This code segment encompasses a class, GPT2DataCollator, which is designed to handle the collation of input features for GPT-2 model training. It dynamically checks the type of input features (whether they are dictionaries or tuples) and appropriately extracts input IDs, attention masks, and labels for padding and preparing the data to the same length. This data collation process is crucial for ensuring the uniformity and compatibility of the input features during the training of the GPT-2 Medium model.

In [None]:
from transformers import Trainer, TrainingArguments
import torch


class GPT2DataCollator:
    """Collate per-example tensors into one padded batch for causal-LM training.

    Each feature is either a dict with 'input_ids', 'attention_mask' and
    'labels' entries, or a tuple/sequence ordered as
    (input_ids, attention_mask, labels).

    Args:
        pad_token_id: Value used to pad ``input_ids``. When ``None`` (the
            default) the module-level ``tokenizer.pad_token_id`` is looked up
            at call time, which is what ``GPT2DataCollator()`` relied on.
        label_pad_token_id: Value used to pad ``labels``. Defaults to -100,
            the index ignored by the cross-entropy loss, so padded positions
            do not contribute to the training loss.
    """

    def __init__(self, pad_token_id=None, label_pad_token_id=-100):
        self.pad_token_id = pad_token_id
        self.label_pad_token_id = label_pad_token_id

    def __call__(self, features):
        """Pad the features to a common length and return a model-ready batch.

        Args:
            features: Non-empty list of dicts or tuples as described on the class.

        Returns:
            Dict with 'input_ids', 'attention_mask' and 'labels' tensors, each
            padded along the first dimension of the individual tensors and
            batched first.
        """
        # Features arrive as dicts (e.g. from a mapping-style dataset) or as
        # positional tuples (e.g. from a TensorDataset).
        if isinstance(features[0], dict):
            input_ids = [feature['input_ids'] for feature in features]
            attention_masks = [feature['attention_mask'] for feature in features]
            labels = [feature['labels'] for feature in features]
        else:
            input_ids = [feature[0] for feature in features]
            attention_masks = [feature[1] for feature in features]
            labels = [feature[2] for feature in features]

        # Fall back to the notebook's global tokenizer only when no explicit
        # pad id was supplied.
        pad_id = self.pad_token_id if self.pad_token_id is not None else tokenizer.pad_token_id

        pad = torch.nn.utils.rnn.pad_sequence
        return {
            'input_ids': pad(input_ids, batch_first=True, padding_value=pad_id),
            # Padded positions are masked out of attention.
            'attention_mask': pad(attention_masks, batch_first=True, padding_value=0),
            # Labels are padded with the ignore index, NOT the pad token id:
            # the pad id is a real vocabulary id (EOS here) and would be
            # scored by the loss.
            'labels': pad(labels, batch_first=True, padding_value=self.label_pad_token_id),
        }


# **Define the Training Arguments and Trainer**

In [None]:
# Hyper-parameters and bookkeeping options for the fine-tuning run
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./gpt2-finetuned-version',  # where checkpoints and the final model are written
    logging_dir='./logs',                   # where training logs are written
    num_train_epochs=10,                    # number of passes over the training data
    per_device_train_batch_size=4,          # examples per device per training step
    warmup_steps=500,                       # warmup steps for the learning-rate scheduler
    logging_steps=500,                      # log training metrics every 500 steps
    save_steps=1000,                        # save a checkpoint every 1000 steps
    evaluation_strategy="epoch",            # run evaluation at the end of every epoch
    prediction_loss_only=True,              # evaluation reports only the loss
    report_to="none",                       # do not report metrics to external trackers
)

# The same tensors serve as both training and evaluation data, so the
# evaluation loss measures fit to the training set, not generalization.
finetune_data = torch.utils.data.TensorDataset(input_ids, attention_masks, labels)

# Trainer wired to the custom collator, which turns the dataset's tuples into
# the keyword batch the model expects
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=finetune_data,
    eval_dataset=finetune_data,
    data_collator=GPT2DataCollator(),
)



In [None]:
# Start training
# Runs the fine-tuning loop with the Trainer configured in the previous cell.
trainer.train()

In [None]:

# Save the fine-tuned model. No path is passed here; the generation cells later
# load it from './gpt2-finetuned-version', the Trainer's output_dir.
trainer.save_model()


In [None]:
import math

# Evaluate on the same tensors that were used for training (this notebook has
# no held-out split), so the number below reflects fit to the training data.
eval_result = trainer.evaluate(eval_dataset=torch.utils.data.TensorDataset(input_ids, attention_masks, labels))

# Perplexity is the exponential of the mean cross-entropy loss; printing
# eval_loss itself under the "Perplexity" label would be misleading.
print("Perplexity:", math.exp(eval_result['eval_loss']))

# **Load the saved model and continue training it here...**

In [None]:
# Disabled cell: everything below is wrapped in a triple-quoted string, so it
# is not executed. Remove the surrounding quotes to reload the saved model and
# set up a further training run.
'''
from transformers import Trainer, TrainingArguments
# Load the previously trained model
model_path = './gpt2-finetuned-version'  # Replace this with the path to your saved model
model = AutoModelForCausalLM.from_pretrained(model_path)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./gpt2-finetuned-version',    # Directory to save the model and checkpoints
    num_train_epochs=5,               # Number of training epochs
    per_device_train_batch_size=4,    # Batch size per device during training
    save_steps=1000,                  # Save checkpoint every X steps
    logging_dir='./logs',             # Directory for storing logs
    logging_steps=500,                # Log training metrics every X steps
    evaluation_strategy="epoch",      # Evaluation strategy to adopt during training
    report_to="none",                 # Disable evaluation during training
    prediction_loss_only=True,        # Compute only the prediction loss
    warmup_steps=500# number of warmup steps for learning rate scheduler
    # Add any additional arguments as needed
)

# Initialize the Trainer with the custom data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=torch.utils.data.TensorDataset(input_ids, attention_masks, labels),
    eval_dataset=torch.utils.data.TensorDataset(input_ids, attention_masks, labels),
    data_collator=GPT2DataCollator()  # Use the custom data collator
)
'''

In [None]:
# Disabled cell: the training call is inside a triple-quoted string and is not executed.
'''
# Start training
trainer.train()
'''

In [None]:
# Disabled cell: the save call is inside a triple-quoted string and is not executed.
'''
#Save the fine-tuned model here
trainer.save_model()
'''

# **Text Generation Using Fine-Tuned GPT-2 Model Pipeline**

In [None]:
from transformers import pipeline

# Build a text-generation pipeline from the fine-tuned model directory
generator = pipeline('text-generation', model='./gpt2-finetuned-version', tokenizer='gpt2')

# Example prompt laid out exactly like the training template for entries
# without an input: the instruction on its own line, a blank line, then the
# "### Response:" header followed by a newline. A prompt that deviates from
# the layout seen during fine-tuning (e.g. instruction on the header line, no
# response header) gives the model no cue to start answering.
prompt = "### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n"

# Generate one continuation; max_length counts prompt tokens plus generated tokens
generated_text = generator(prompt, max_length=100, num_return_sequences=1)

# Print the prompt together with the generated continuation
print(generated_text[0]['generated_text'])


In [None]:
from datasets import load_dataset

# Reload the full dataset: `dataset` was rebound to the 5000-row subset in the
# tokenization cell, and the next cell indexes dataset['train'] again.
dataset = load_dataset("tatsu-lab/alpaca")

In [None]:
# Keep only the first 10 rows of the training split
num_samples_to_display = 10
subset_dataset = dataset['train'].select(range(num_samples_to_display))

# Print every column of all 10 selected rows
num_samples_to_show = 10
preview_columns = (
    ("Instruction:", 'instruction'),
    ("Input:", 'input'),
    ("Output:", 'output'),
    ("Text:", 'text'),
)
for i in range(num_samples_to_show):
    data = subset_dataset[i]
    print(f"Data Point {i + 1}:")
    for label, column in preview_columns:
        print(label, data[column])
    print("\n-----------------------------\n")

In [None]:
from transformers import pipeline

# Build a text-generation pipeline from the fine-tuned model directory; the pad
# token id is passed explicitly because it was set on the tokenizer by hand
generator = pipeline('text-generation', model='./gpt2-finetuned-version', tokenizer='gpt2', pad_token_id=tokenizer.pad_token_id)

# Generate a response for every row of the subset
for example in subset_dataset:
    # Reuse the training template via get_prompt with the output blanked, so the
    # prompt is exactly the prefix the model saw during fine-tuning: the same
    # "\n" line breaks, the "### Input:" section when the row has an input, and
    # a trailing "### Response:\n". A hand-written prompt with a preamble and
    # "\r\n" breaks never occurred in the training data.
    test_prompt = get_prompt({**example, 'output': ''})

    # max_length counts prompt tokens plus generated tokens
    generated_text = generator(test_prompt, max_length=100, num_return_sequences=1)

    # Print the prompt together with the generated continuation
    print(generated_text[0]['generated_text'])
    print("------------------------------------------")
