<a href="https://colab.research.google.com/github/fatemafaria142/Exploration-of-Different-Prompting-Techniques-for-Automatic-Realistic-Story-Generation/blob/main/story_generation_zero_shot_prompt_using_GPT2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
!pip install datasets

In [None]:
!pip install --upgrade transformers

In [None]:
!pip install transformers[torch]

In [None]:
!pip install accelerate -U

### **Dataset Link:** https://huggingface.co/datasets/AtlasUnified/atlas-storyteller?row=42

In [None]:
from datasets import load_dataset

dataset = load_dataset("AtlasUnified/atlas-storyteller")

In [None]:
print(dataset)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# **Get the first 5000 data points**

In [None]:
# Keep only the first 5000 records of the training split. Later cells read
# both `subset_dataset` and `num_samples_to_display`.
num_samples_to_display = 5000
train_split = dataset['train']
subset_dataset = train_split.select(range(num_samples_to_display))

# Preview the ID and story text of the first few records as a sanity check.
num_samples_to_show = 3
separator = "\n-----------------------------\n"
for position in range(1, num_samples_to_show + 1):
    record = subset_dataset[position - 1]
    print(f"Data Point {position}:")
    print("ID:", record['id'])
    print("Input:", record['Story'])
    print(separator)

# **Zero-Shot Story Generation Prompt**

In [None]:
def get_zero_shot_prompt(x):
    """Build the zero-shot instruction prompt for one dataset record.

    Args:
        x: A mapping with a 'Story' entry holding the story text.

    Returns:
        The instruction header, a blank line, and then the story text.
    """
    instruction_header = "### Instruction:\nGenerate a story based on the following prompt:"
    story_text = f"{x['Story']}"
    return instruction_header + "\n\n" + story_text

# **Display prompts for the first 5 data points**

In [None]:
# Build one zero-shot prompt per record of the subset dataset.
prompts = [
    get_zero_shot_prompt(subset_dataset[index])
    for index in range(num_samples_to_display)
]

# Preview the prompts generated for the first 5 data points.
preview_prompts = prompts[:5]
for number, prompt in enumerate(preview_prompts, start=1):
    print(f"Prompt for Data Point {number}:")
    print(prompt)
    print("\n-----------------------------\n")


# **More Examples of Prompts**

In [None]:
# Rebuild the prompt list so this cell does not depend on the previous
# preview cell having been run (it only needs `subset_dataset`,
# `num_samples_to_display` and `get_zero_shot_prompt`).
prompts = [
    get_zero_shot_prompt(subset_dataset[i])
    for i in range(num_samples_to_display)
]

# Show data points 6-10. The counter starts at `first_shown + 1` so the
# printed label matches the record's real position in the subset instead of
# restarting at 1.
first_shown, last_shown = 5, 10
for idx, prompt in enumerate(prompts[first_shown:last_shown], start=first_shown + 1):
    print(f"Prompt for Data Point {idx}:")
    print(prompt)
    print("\n-----------------------------\n")


# **GPT2 and its tokenizer**
* https://huggingface.co/docs/transformers/model_doc/gpt2

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')

In [None]:
# Set the padding token for the tokenizer: reuse the end-of-sequence token so
# that the padding="max_length" tokenizer calls in the dataset-preparation
# cell have a pad token to fill with.
# NOTE(review): presumably needed because the stock GPT-2 tokenizer defines no
# pad token of its own — confirm.
tokenizer.pad_token = tokenizer.eos_token

# **Dataset Preparation and Tokenization for GPT-2 Training**
* This code segment performs the following tasks:

* Assumes a dataset whose records have the keys 'id' and 'Story'.
* Iterates through the dataset and generates a zero-shot prompt for each data point.
* Tokenizes the generated prompts with the tokenizer and prepares the inputs and labels as PyTorch tensors.
* Compiles the tokenized inputs, labels, and attention masks into tensors to be used for GPT-2 model training.

In [None]:
from transformers import Trainer, TrainingArguments
import torch
# `subset_dataset` holds records with 'id' and 'Story' fields.
dataset = subset_dataset

# Every example is padded/truncated to this many tokens.
max_sequence_length = 128

# Per-example tensors, stacked into (num_examples, max_sequence_length) below.
input_ids = []
labels = []
attention_masks = []

# Tokenize and prepare the dataset
for data_point in dataset:
    # Full training text: instruction header followed by the story.
    prompt = get_zero_shot_prompt(data_point)

    encoded_prompt = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_sequence_length,
    )

    # return_tensors="pt" yields a leading batch dimension of size 1; drop it
    # so each example is a flat sequence and stacking gives a 2-D tensor.
    prompt_ids = encoded_prompt['input_ids'].squeeze(0)
    prompt_mask = encoded_prompt['attention_mask'].squeeze(0)

    # Causal-LM labels must line up token-for-token with input_ids (the model
    # applies the one-step shift itself). Tokenizing the bare story separately
    # would offset every label by the length of the instruction header.
    # Padding positions are set to -100, the index the loss ignores. The
    # attention mask is used rather than the token id because the pad token
    # is the same id as the end-of-sequence token.
    prompt_labels = prompt_ids.clone()
    prompt_labels[prompt_mask == 0] = -100

    input_ids.append(prompt_ids)
    labels.append(prompt_labels)
    attention_masks.append(prompt_mask)

# Convert lists to tensors
input_ids = torch.stack(input_ids)
labels = torch.stack(labels)
attention_masks = torch.stack(attention_masks)

### **Printing input_ids, labels, and attention_masks for the first 3 examples**

In [None]:
# Relies on `input_ids`, `labels` and `attention_masks` built by the
# dataset-preparation cell.

# Print the input IDs, labels and attention mask of the first 3 examples.
examples_to_print = 3
for example_number in range(1, examples_to_print + 1):
    row = example_number - 1
    print(f"Example {example_number}:")
    print("Input IDs:", input_ids[row])
    print("Labels:", labels[row])
    print("Attention Mask:", attention_masks[row])
    print("-----------------------")


# **Dynamic Data Collation for GPT-2 Model Training**
* This code segment encompasses a class, GPT2DataCollator, which is designed to handle the collation of input features for GPT-2 model training. It dynamically checks the type of input features (whether they are dictionaries or tuples) and appropriately extracts input IDs, attention masks, and labels for padding and preparing the data to the same length. This data collation process is crucial for ensuring the uniformity and compatibility of the input features during the training of the GPT-2 model.

In [None]:
from transformers import Trainer, TrainingArguments
import torch


class GPT2DataCollator:
    """Collate per-example tensors into one padded batch for GPT-2 training.

    Args:
        pad_token_id: Value used to pad `input_ids`. When None (the default),
            the pad token id of the module-level `tokenizer` is looked up at
            call time, so `GPT2DataCollator()` keeps working as before.
        label_pad_token_id: Value used to pad `labels`. Defaults to -100, the
            index ignored by the cross-entropy loss, so padded label positions
            do not contribute to the training loss.
    """

    def __init__(self, pad_token_id=None, label_pad_token_id=-100):
        self.pad_token_id = pad_token_id
        self.label_pad_token_id = label_pad_token_id

    def __call__(self, features):
        """Pad a list of examples to a common length and batch them.

        Args:
            features: A non-empty list of examples. Each example is either a
                dict with 'input_ids', 'attention_mask' and 'labels' tensors,
                or a tuple ordered as (input_ids, attention_mask, labels).

        Returns:
            A dict with batched 'input_ids', 'attention_mask' and 'labels'
            tensors (batch dimension first).
        """
        # Check if the features are dictionaries or tuples
        if isinstance(features[0], dict):
            input_ids = [feature['input_ids'] for feature in features]
            attention_masks = [feature['attention_mask'] for feature in features]
            labels = [feature['labels'] for feature in features]
        else:  # Assuming features are tuples
            input_ids = [feature[0] for feature in features]
            attention_masks = [feature[1] for feature in features]
            labels = [feature[2] for feature in features]

        # Fall back to the notebook-level tokenizer only when no explicit pad
        # id was supplied.
        pad_id = self.pad_token_id
        if pad_id is None:
            pad_id = tokenizer.pad_token_id

        pad_sequence = torch.nn.utils.rnn.pad_sequence

        # Pad inputs and labels to the same length. Padded mask positions are
        # 0 (not attended) and padded label positions use the ignored index.
        return {
            'input_ids': pad_sequence(input_ids, batch_first=True, padding_value=pad_id),
            'attention_mask': pad_sequence(attention_masks, batch_first=True, padding_value=0),
            'labels': pad_sequence(labels, batch_first=True, padding_value=self.label_pad_token_id),
        }


# **Define the Training Arguments and Trainer**

In [None]:
'''
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./gpt2-finetuned-version',    # Directory to save the model and checkpoints
    num_train_epochs=5,               # Number of training epochs
    per_device_train_batch_size=4,    # Batch size per device during training
    save_steps=5000,                  # Save checkpoint every X steps
    logging_dir='./logs',             # Directory for storing logs
    logging_steps=500,                # Log training metrics every X steps
    evaluation_strategy="epoch",      # Evaluation strategy to adopt during training
    report_to="none",                 # Disable evaluation during training
    prediction_loss_only=True,        # Compute only the prediction loss
    warmup_steps=500# number of warmup steps for learning rate scheduler
    # Add any additional arguments as needed
)

# Initialize the Trainer with the custom data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=torch.utils.data.TensorDataset(input_ids, attention_masks, labels),
    eval_dataset=torch.utils.data.TensorDataset(input_ids, attention_masks, labels),
    data_collator=GPT2DataCollator()  # Use the custom data collator
)
'''

In [None]:
'''
# Start training
trainer.train()
'''

In [None]:
'''
trainer.save_model()
'''

# **Load the saved model and continue training it here**

In [None]:
from transformers import Trainer, TrainingArguments
# Resume from the directory that the first (now disabled) fine-tuning run
# used as its output_dir.
model_path = './gpt2-finetuned-version'  # Replace this with the path to your saved model
model = AutoModelForCausalLM.from_pretrained(model_path)

# Settings for this second round of fine-tuning.
training_args = TrainingArguments(
    output_dir='./gpt2-final-version',  # where the model and checkpoints are saved
    logging_dir='./logs',               # where logs are stored
    num_train_epochs=5,                 # number of passes over the training data
    per_device_train_batch_size=4,      # batch size per device during training
    warmup_steps=500,                   # warmup steps for the learning-rate scheduler
    save_steps=1000,                    # save a checkpoint every 1000 steps
    logging_steps=500,                  # log training metrics every 500 steps
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",        # run evaluation once per epoch
    report_to="none",                   # do not report to any logging integration
    prediction_loss_only=True,          # compute only the prediction loss
)

# The same tensors serve as both the training and the evaluation set, so a
# single TensorDataset is shared between the two roles.
tensor_dataset = torch.utils.data.TensorDataset(input_ids, attention_masks, labels)

# Trainer wired to the custom data collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tensor_dataset,
    eval_dataset=tensor_dataset,
    data_collator=GPT2DataCollator(),
)

In [None]:
# Start training
trainer.train()

In [None]:
#Save the fine-tuned model here
trainer.save_model()

In [None]:
# Evaluate the model and report its perplexity.
# NOTE: this reuses the training tensors, so the score is not a held-out
# (test-set) estimate.
import math

eval_result = trainer.evaluate(eval_dataset=torch.utils.data.TensorDataset(input_ids, attention_masks, labels))

# `eval_loss` is the mean cross-entropy; perplexity is its exponential.
eval_loss = eval_result['eval_loss']
try:
    perplexity = math.exp(eval_loss)
except OverflowError:
    # A very large loss overflows exp(); report it as infinite perplexity.
    perplexity = float("inf")

print("Eval loss:", eval_loss)
print("Perplexity:", perplexity)

# **Text Generation Using Fine-Tuned GPT-2 Model Pipeline**

In [None]:
from transformers import pipeline

# Text-generation pipeline: fine-tuned weights from ./gpt2-final-version
# paired with the stock 'gpt2' tokenizer.
generator = pipeline(
    'text-generation',
    model='./gpt2-final-version',
    tokenizer='gpt2',
    pad_token_id=tokenizer.pad_token_id,
)

# Test prompt: the training-time instruction header followed by a story seed.
test_prompt = (
    "### Instruction:\n"
    "Generate a story based on the following prompt:\n\n"
    "Amidst the inky blackness of the night "
)

# Ask for a single sequence of at most 128 tokens.
generation_options = {"max_length": 128, "num_return_sequences": 1}
generated_test_text = generator(test_prompt, **generation_options)

# Show the text of the only returned sequence.
first_sequence = generated_test_text[0]
print(first_sequence['generated_text'])



# **More Text Generation Examples**

In [None]:
from datasets import load_dataset

dataset = load_dataset("AtlasUnified/atlas-storyteller")

In [None]:
# Work with just the first 10 records of the training split.
num_samples_to_display = 10
subset_dataset = dataset['train'].select(range(num_samples_to_display))

# Fetch the records to show, then collect their 'Story' values in dataset
# order as the reference texts.
num_samples_to_show = 10
shown_records = [subset_dataset[i] for i in range(num_samples_to_show)]
story_list = [record['Story'] for record in shown_records]

# Print the ID and story of each selected record.
for number, record in enumerate(shown_records, start=1):
    print(f"Data Point {number}:")
    print("ID:", record['id'])
    print("Story:", record['Story'])
    print("\n-----------------------------\n")

In [None]:
from transformers import pipeline


# Text-generation pipeline backed by the fine-tuned model and the 'gpt2' tokenizer.
generator = pipeline(
    'text-generation',
    model='./gpt2-final-version',
    tokenizer='gpt2',
    pad_token_id=tokenizer.pad_token_id,
)

# Instruction header placed in front of every seed; the same text is used
# afterwards to cut the header off the model output.
instruction_header = "### Instruction:\nGenerate a story based on the following prompt:\n\n"

# One generated story per record of the subset, in dataset order.
generated_stories = []

for example in subset_dataset:
    # Seed the model with the first 7 whitespace-separated words of the story.
    opening_words = example['Story'].split()[:7]
    first_7_words = " ".join(opening_words)
    test_prompt = f"{instruction_header}{first_7_words} "

    # Request a single sequence of at most 128 tokens.
    generated_text = generator(test_prompt, max_length=128, num_return_sequences=1)
    full_text = generated_text[0]['generated_text']

    # Keep what follows the header (seed words plus continuation), trimmed.
    generated_story = full_text.split(instruction_header)[1].strip()
    generated_stories.append(generated_story)

# Print every generated story with a 1-based counter.
for number, story in enumerate(generated_stories, start=1):
    print(f"Generated Story {number}:\n{story}")
    print("------------------------------------------")


# **BERTScore**
* https://huggingface.co/spaces/evaluate-metric/bertscore

In [None]:
!pip install evaluate

In [None]:
!pip install bert_score

In [None]:
!pip install sacrebleu

In [None]:
print(story_list)

In [None]:
print(generated_stories)

# **Partial match with the distilbert-base-uncased model:**

In [None]:
from evaluate import load
from bert_score import score as bert_score
from sacrebleu import corpus_bleu

# Generated stories are scored against the original stories they were seeded from.
predictions = generated_stories
references = story_list

# BERTScore, computed with the distilbert-base-uncased model.
bertscore = load("bertscore")
bert_results = bertscore.compute(
    predictions=predictions,
    references=references,
    model_type="distilbert-base-uncased",
)

print("BERTScore Results:")
print(bert_results)

# BLEU (Bilingual Evaluation Understudy) over the same prediction/reference pairs.
bleu_metric = load("bleu")
results_bleu = bleu_metric.compute(predictions=predictions, references=references)
print(f"\nBLEU Score: {results_bleu}\n")