In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments, BartForConditionalGeneration, BartTokenizer, GenerationConfig
import pandas as pd
import torch
import random
from datasets import Dataset

# Select the compute device: prefer Apple-silicon MPS (the original target),
# then CUDA, and finally fall back to CPU so the notebook still runs on
# machines without an accelerator.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [2]:
def prompts(sentence: str, method: str) -> str:
    """
    Build the text prompt that wraps one masked sentence.

    Args:
        sentence (str): Masked sentence to insert into the prompt.
        method (str): "zero" for the instruction-only prompt, "few" for the
            instruction followed by two worked Input/Output examples.

    Returns:
        str: The prompt text for the requested method.

    Raises:
        ValueError: If `method` is neither "zero" nor "few".
    """
    # Zero-shot: a single instruction followed by the sentence
    if method == "zero":
        return f"Replace [MASK] to create a metaphor sentence: {sentence}"

    if method != "few":
        raise ValueError("Invalid method. Choose 'zero' or 'few'.")

    # Few-shot: (masked input, metaphorical output) demonstrations
    demonstrations = (
        ("The task is [MASK] challenging.", "The task is an uphill battle."),
        ("She is [MASK] sad.", "She is drowning in sorrow."),
    )
    shots = "".join(
        f"Input: {masked}\nOutput: {metaphor}\n\n"
        for masked, metaphor in demonstrations
    )

    return (
        "Replace [MASK] to create metaphorical sentences:\n\n"
        "Examples:\n"
        f"{shots}"
        "Return only Output:\n"
        f"Input: {sentence}\nOutput:"
    )



In [3]:

def generate_metaphors(  # type: ignore
    model,
    tokenizer,
    sentences: list[str],
    prompt_type: str = "few",
    view_output: bool = False,
    seed: "int | None" = None,
) -> list[str]:
    """
    Generates metaphor sentences from masked sentences.

    Each sentence is wrapped in a prompt by `prompts`, tokenized on its own,
    passed to `model.generate`, and the first returned sequence is decoded.

    Args:
        model: pre-trained model; it is moved to the module-level `device`
        tokenizer: model tokenizer
        sentences (list): list of sentences to transform
        prompt_type (str): "zero" for zero shot "few" for few shot
        view_output (bool): set true to print masked sentences and generated metaphor
        seed (int | None): if given, seeds PyTorch's global RNG before
            generating so the sampled outputs can be reproduced; None
            (the default) leaves the RNG state untouched

    Returns:
        A list of transformed sentences, one per input sentence, in input order

    Raises:
        ValueError: raised by `prompts` when `prompt_type` is not "zero" or "few"
    """
    if seed is not None:
        # Generation below samples (do_sample=True), so results vary run to
        # run unless the RNG is seeded.
        torch.manual_seed(seed)

    model = model.to(device)
    generated = []

    for sentence in sentences:
        prompt = prompts(sentence, prompt_type)

        # Tokenize and move the encoded tensors to the device
        encoded = tokenizer(prompt, return_tensors="pt", padding=True).to(device)

        output_ids = model.generate(
            inputs=encoded["input_ids"],
            # Pass the tokenizer's mask explicitly instead of leaving
            # generate() to infer it from the pad token.
            attention_mask=encoded["attention_mask"],
            max_length=150,        # upper bound on the generated length
            num_beams=10,          # with do_sample=True: beam search with sampling
            do_sample=True,
            temperature=0.5,       # below the neutral 1.0, i.e. more conservative sampling
            top_p=0.95,            # nucleus sampling threshold
            early_stopping=False,
        )

        # Keep only the first returned sequence
        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        generated.append(output_text)

        if view_output:
            print(f"Input: {sentence}\nOutput: {output_text}\n")

    return generated

In [5]:
# Loaders for the fine-tuned checkpoints: each returns (model, tokenizer) with the model moved to `device`
def load_model_t5(model_path):
    """
    Load a T5 checkpoint together with its tokenizer.

    Args:
        model_path: value forwarded to `from_pretrained` for both the model
            and the tokenizer

    Returns:
        (model, tokenizer), with the model moved to the module-level `device`
    """
    t5_model = T5ForConditionalGeneration.from_pretrained(model_path)
    t5_model = t5_model.to(device)

    tokenizer_options = {"legacy": False, "clean_up_tokenization_spaces": True}
    t5_tokenizer = T5Tokenizer.from_pretrained(model_path, **tokenizer_options)

    return t5_model, t5_tokenizer

def load_model_bart(model_path):
    """
    Load a BART checkpoint together with its tokenizer.

    Args:
        model_path: value forwarded to `from_pretrained` for both the model
            and the tokenizer

    Returns:
        (model, tokenizer), with the model moved to the module-level `device`
    """
    bart_model = BartForConditionalGeneration.from_pretrained(model_path)
    bart_model = bart_model.to(device)

    bart_tokenizer = BartTokenizer.from_pretrained(
        model_path,
        clean_up_tokenization_spaces=True,
    )

    return bart_model, bart_tokenizer

In [22]:
# Two test sets: one file per CSV, both read into their own frame
sentences_with_the = pd.read_csv("data/test_sentences_begin_the.csv")
sentences_without_the = pd.read_csv("data/test_sentences_no_the.csv")

# Stack the two frames (first file first) and keep only the sentence column
# as a plain Python list, which is what generate_metaphors is called with.
sentences = (
    pd.concat(
        [sentences_with_the, sentences_without_the],
        ignore_index=True,
    )["synthetic_sentences"]
    .tolist()
)

In [None]:
t5_model_paths = ["fine_tuned_t5_moh_x", "fine_tuned_t5_vua", "fine_tuned_t5_trofi", "fine_tuned_t5_combined_df"]
bart_model_paths = ["fine_tuned_bart_moh_x", "fine_tuned_bart_vua", "fine_tuned_bart_trofi", "fine_tuned_bart_combined_df"]

# Pair every checkpoint with the loader for its architecture so a single loop
# handles both model families (all T5 checkpoints first, then all BART ones).
checkpoints = [(load_model_t5, path) for path in t5_model_paths]
checkpoints += [(load_model_bart, path) for path in bart_model_paths]

# One record per (model, input sentence) pair
results = []

for load_model, model_path in checkpoints:
    model, tokenizer = load_model(model_path)

    # Generate metaphors with the zero-shot prompt
    outputs = generate_metaphors(
        model=model,
        tokenizer=tokenizer,
        sentences=sentences,
        prompt_type="zero",
        view_output=False,
    )

    # The checkpoint path doubles as the model name in the results table
    results.extend(
        {
            "model_name": model_path,
            "input_sentence": sentence,
            "generated_output": output,
        }
        for sentence, output in zip(sentences, outputs)
    )

# Convert results to a DataFrame
combined_df = pd.DataFrame(results)

In [25]:
# Write the collected generations to disk; index=False leaves the row index out of the CSV.
combined_df.to_csv("metaphor_outputs.csv", index=False)