In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments, BartForConditionalGeneration, BartTokenizer, GenerationConfig
import pandas as pd
import torch
import random
from datasets import Dataset

# set GPU device
device = "mps"

In [2]:
# Load Data
combined_df = pd.read_csv("data/combined_dataframe.csv")

dataset_names = combined_df["data_set"].unique()

df_names = []
for name in dataset_names:

    df_name = f"{name}_df"

    globals()[f"{df_name}"] = (
        combined_df[combined_df["data_set"] == name]
        .reset_index(drop=True)
    )

    df_names.append(df_name)


print("New DataFrame variables created:")
for df_name in df_names:
    print(df_name)

New DataFrame variables created:
vua_df
trofi_df
moh_x_df


In [3]:
vua_df = vua_df # type: ignore
trofi_df = trofi_df # type: ignore
moh_x_df = moh_x_df # type: ignore

In [4]:
#data set to train on

dataset = Dataset.from_pandas(combined_df)

In [19]:
# Load model and tokenizer
model_name = "google/flan-t5-small" 
model_t5 = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
tokenizer_t5 = T5Tokenizer.from_pretrained(model_name, legacy=False, clean_up_tokenization_spaces=True)

# Add [MASK] token if not already present
if "[MASK]" not in tokenizer_t5.get_vocab():
    tokenizer_t5.add_tokens("[MASK]")
    model_t5.resize_token_embeddings(len(tokenizer_t5))  # Resize embeddings to match the updated vocab size


def tokenize_function(examples):
    prompt = "Replace [MASK] to create a metaphor sentence: {masked_sentence}"

    inputs = [
        prompt.format(masked_sentence=sentence)
        for sentence in examples['masked_sentence']
    ]
    
    targets = examples['sentence']
    
    model_inputs = tokenizer_t5(inputs, max_length=200, truncation=True, padding="max_length")
    labels = tokenizer_t5(targets, max_length=200, truncation=True, padding="max_length")["input_ids"]
    labels = [[-100 if token == tokenizer_t5.pad_token_id else token for token in label] for label in labels]
    
    model_inputs["labels"] = labels

    return model_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# dataset split to verify model works
dataset_split = tokenized_dataset.train_test_split(test_size=0.05, seed=42)
eval_dataset = dataset_split['test']

training_args = TrainingArguments(
    output_dir="./fine_tuned_flan_t5",
    eval_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=32,
    num_train_epochs=1,
    logging_steps=100,
)

trainer = Trainer(
    model=model_t5,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

# Train the model
trainer.train()


Map:   0%|          | 0/22812 [00:00<?, ? examples/s]

  0%|          | 0/713 [00:00<?, ?it/s]

{'loss': 0.2131, 'grad_norm': 0.689591646194458, 'learning_rate': 0.0002579242636746143, 'epoch': 0.14}
{'loss': 0.1454, 'grad_norm': 0.855769693851471, 'learning_rate': 0.0002158485273492286, 'epoch': 0.28}
{'loss': 0.133, 'grad_norm': 0.3644113838672638, 'learning_rate': 0.0001737727910238429, 'epoch': 0.42}
{'loss': 0.1169, 'grad_norm': 0.41035717725753784, 'learning_rate': 0.00013169705469845723, 'epoch': 0.56}
{'loss': 0.1118, 'grad_norm': 0.31623443961143494, 'learning_rate': 8.962131837307153e-05, 'epoch': 0.7}
{'loss': 0.1062, 'grad_norm': 0.3386119604110718, 'learning_rate': 4.754558204768583e-05, 'epoch': 0.84}
{'loss': 0.0997, 'grad_norm': 0.3960822820663452, 'learning_rate': 5.4698457223001395e-06, 'epoch': 0.98}


  0%|          | 0/2852 [00:00<?, ?it/s]

{'eval_loss': 0.07635725289583206, 'eval_runtime': 237.1034, 'eval_samples_per_second': 96.211, 'eval_steps_per_second': 12.029, 'epoch': 1.0}
{'train_runtime': 933.2198, 'train_samples_per_second': 24.444, 'train_steps_per_second': 0.764, 'train_loss': 0.13169578566290385, 'epoch': 1.0}


TrainOutput(global_step=713, training_loss=0.13169578566290385, metrics={'train_runtime': 933.2198, 'train_samples_per_second': 24.444, 'train_steps_per_second': 0.764, 'total_flos': 1656080245555200.0, 'train_loss': 0.13169578566290385, 'epoch': 1.0})

In [16]:
# Save the model
output_dir = "./fine_tuned_t5_combined_df" #change for each dataset trained on
model_t5.save_pretrained(output_dir)

# Save the tokenizer
tokenizer_t5.save_pretrained(output_dir)

('./fine_tuned_t5_combined_df/tokenizer_config.json',
 './fine_tuned_t5_combined_df/special_tokens_map.json',
 './fine_tuned_t5_combined_df/spiece.model',
 './fine_tuned_t5_combined_df/added_tokens.json')

In [21]:
#data set to train on

dataset = Dataset.from_pandas(combined_df)

In [22]:
# Load model and tokenizer
model_name = "facebook/bart-base"  # Use BART model
model_bart = BartForConditionalGeneration.from_pretrained(model_name).to(device)
tokenizer_bart = BartTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)

# Add [MASK] token if not already present (optional for BART)
if "[MASK]" not in tokenizer_bart.get_vocab():
    print("updating mask token")
    tokenizer_bart.add_tokens("[MASK]")
    tokenizer_bart.mask_token = "[MASK]"
    model_bart.resize_token_embeddings(len(tokenizer_bart))  # Resize embeddings to match updated vocab size

# Tokenization function
def tokenize_function(examples):
    prompt = "Replace [MASK] to create a metaphor sentence: {masked_sentence}"
    
    # Prepare inputs and targets
    inputs = [
        prompt.format(masked_sentence=sentence)
        for sentence in examples['masked_sentence']
    ]
    targets = examples['sentence']
    
    # Tokenize inputs and targets
    model_inputs = tokenizer_bart(inputs, max_length=150, truncation=True, padding="max_length")
    labels = tokenizer_bart(targets, max_length=150, truncation=True, padding="max_length")["input_ids"]
    labels = [[-100 if token == tokenizer_bart.pad_token_id else token for token in label] for label in labels]
    
    model_inputs["labels"] = labels
    return model_inputs

# tokenize dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Eval set for confirming model fit
dataset_split = tokenized_dataset.train_test_split(test_size=0.05, seed=42)
eval_dataset = dataset_split['test']

# Training arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_bart",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,  
    num_train_epochs=1,
    logging_steps=100,
)

trainer = Trainer(
    model=model_bart,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()


updating mask token


Map:   0%|          | 0/22812 [00:00<?, ? examples/s]

  0%|          | 0/713 [00:00<?, ?it/s]

{'loss': 0.2081, 'grad_norm': 1.1025300025939941, 'learning_rate': 1.7194950911640955e-05, 'epoch': 0.14}
{'loss': 0.1012, 'grad_norm': 4.234947681427002, 'learning_rate': 1.4389901823281908e-05, 'epoch': 0.28}
{'loss': 0.0887, 'grad_norm': 0.7784596085548401, 'learning_rate': 1.1584852734922862e-05, 'epoch': 0.42}
{'loss': 0.0774, 'grad_norm': 1.3098950386047363, 'learning_rate': 8.779803646563817e-06, 'epoch': 0.56}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.0739, 'grad_norm': 1.0961717367172241, 'learning_rate': 5.97475455820477e-06, 'epoch': 0.7}
{'loss': 0.069, 'grad_norm': 0.7941123247146606, 'learning_rate': 3.1697054698457223e-06, 'epoch': 0.84}
{'loss': 0.0662, 'grad_norm': 0.9455534219741821, 'learning_rate': 3.6465638148667605e-07, 'epoch': 0.98}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


  0%|          | 0/143 [00:00<?, ?it/s]

{'eval_loss': 0.039785053580999374, 'eval_runtime': 13.0881, 'eval_samples_per_second': 87.178, 'eval_steps_per_second': 10.926, 'epoch': 1.0}
{'train_runtime': 1007.1997, 'train_samples_per_second': 22.649, 'train_steps_per_second': 0.708, 'train_loss': 0.09710398295484117, 'epoch': 1.0}


TrainOutput(global_step=713, training_loss=0.09710398295484117, metrics={'train_runtime': 1007.1997, 'train_samples_per_second': 22.649, 'train_steps_per_second': 0.708, 'total_flos': 2037496301568000.0, 'train_loss': 0.09710398295484117, 'epoch': 1.0})

In [7]:
# Save the model
output_dir = "./fine_tuned_bart_combined_df" #change for each dataset trained on
model_bart.save_pretrained(output_dir)

# Save the tokenizer
tokenizer_bart.save_pretrained(output_dir)

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('./fine_tuned_bart_combined_df/tokenizer_config.json',
 './fine_tuned_bart_combined_df/special_tokens_map.json',
 './fine_tuned_bart_combined_df/vocab.json',
 './fine_tuned_bart_combined_df/merges.txt',
 './fine_tuned_bart_combined_df/added_tokens.json')

In [8]:
sentences_with_the = pd.read_csv("data/test_sentences_begin_the.csv")
sentences_without_the = pd.read_csv("data/test_sentences_no_the.csv")

sentences_with_the.head()

Unnamed: 0,synthetic_sentences
0,"The ocean [MASK] against the cliffs, roaring i..."
1,The garden [MASK] with colors as the flowers g...
2,"The airplane [MASK] through the sky, leaving a..."
3,"The dog [MASK] through the field, a blur of fu..."
4,The sky [MASK] a mosaic of pink and orange as ...


In [9]:
def prompts(sentence: str, method: str) -> str:
    """
    Creates a formatted prompt for input to a generation method.

    Args:
        sentence (str): Masked sentence.
        method (str): "zero" for zero-shot, "few" for few-shot.

    Returns:
        str: Formatted prompt for the specified method.
    """

    # Few-shot example
    few_shot_examples = (
        "Replace [MASK] to create metaphorical sentences:\n\n"
        "Examples:\n"
        "Input: The task is [MASK] challenging.\n"
        "Output: The task is an uphill battle.\n\n"
        "Input: She is [MASK] sad.\n"
        "Output: She is drowning in sorrow.\n\n"
        "Return only Output:\n"
    )

    # Zero-shot example
    zero_shot_example = "Replace [MASK] to create a metaphor sentence:"

    # Create the appropriate prompt
    if method == "zero":
        prompt = f"{zero_shot_example} {sentence}"
    elif method == "few":
        prompt = f"{few_shot_examples}Input: {sentence}\nOutput:"
    else:
        raise ValueError("Invalid method. Choose 'zero' or 'few'.")

    return prompt



In [10]:
print(prompts("Testing prompt and sentence structure", "few"))

Replace [MASK] to create metaphorical sentences:

Examples:
Input: The task is [MASK] challenging.
Output: The task is an uphill battle.

Input: She is [MASK] sad.
Output: She is drowning in sorrow.

Return only Output:
Input: Testing prompt and sentence structure
Output:


In [26]:

def generate_metaphors(model, tokenizer, sentences: list[str], prompt_type: str = "few", view_output: bool = False, ) -> list[str]: # type: ignore
    
    """
    Generates metaphor sentences from masked sentences

    Args:
        model: pre-trained model
        tokenizer: model tokenizer
        sentences (list): list of sentences to transform
        prompt_type (str): "zero" for zero shot "few" for few shot
        view_output (bool): set true to print masked sentences and generated metaphor
        
    Returns:
        A list of transformed sentences
    """
    
    model = model.to(device)
    output = []
    
    for sentence in sentences:

        prompt = prompts(sentence, prompt_type)

        # Tokenize and move to the device
        input_ids = tokenizer(prompt, return_tensors="pt", padding=True).to(device)

        # Generate the output
        output_ids = model.generate(
            inputs=input_ids["input_ids"],  
            max_length=150,                # Allow for longer outputs
            num_beams=10,                  
            do_sample=True,
            temperature=0.5,               # Slightly higher temperature for creativity
            #top_k=50,                      # Use top-k sampling to encourage variation
            top_p=0.95,                     # Use nucleus sampling for creativity
            early_stopping=False
        )

        # Decode and print the output
        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        output.append(output_text)

        if view_output is True:
            print(f"Input: {sentence}\nOutput: {output_text}\n")
            
    return output

In [27]:
sentences = sentences_with_the["synthetic_sentences"].tolist()[:5]

In [None]:
t5_output = generate_metaphors(model=model_t5,tokenizer=tokenizer_t5, sentences=sentences, prompt_type="zero", view_output=False)
t5_output

Input: The ocean [MASK] against the cliffs, roaring its eternal song.
Output: The ocean melted against the cliffs, roaring its eternal song.

Input: The garden [MASK] with colors as the flowers greeted the sun.
Output: The garden filled with colors as the flowers greeted the sun.

Input: The airplane [MASK] through the sky, leaving a trail of whispers behind.
Output: The airplane rolled through the sky, leaving a trail of whispers behind.

Input: The dog [MASK] through the field, a blur of fur and joy.
Output: The dog stumbled through the field, a blur of fur and joy.

Input: The sky [MASK] a mosaic of pink and orange as the day turned to night.
Output: The sky filled a mosaic of pink and orange as the day turned to night.



['The ocean melted against the cliffs, roaring its eternal song.',
 'The garden filled with colors as the flowers greeted the sun.',
 'The airplane rolled through the sky, leaving a trail of whispers behind.',
 'The dog stumbled through the field, a blur of fur and joy.',
 'The sky filled a mosaic of pink and orange as the day turned to night.']

In [30]:
bart_output = generate_metaphors(model=model_bart,tokenizer=tokenizer_bart, sentences=sentences, prompt_type="zero", view_output=True)
bart_output

Input: The ocean [MASK] against the cliffs, roaring its eternal song.
Output: The ocean rained against the cliffs, roaring its eternal song.

Input: The garden [MASK] with colors as the flowers greeted the sun.
Output: The garden filled with colors as the flowers greeted the sun.

Input: The airplane [MASK] through the sky, leaving a trail of whispers behind.
Output: The airplane drags through the sky, leaving a trail of whispers behind.

Input: The dog [MASK] through the field, a blur of fur and joy.
Output: The dog drags through the field, a blur of fur and joy.

Input: The sky [MASK] a mosaic of pink and orange as the day turned to night.
Output: The sky turned a mosaic of pink and orange as the day turned to night.



['The ocean rained against the cliffs, roaring its eternal song.',
 'The garden filled with colors as the flowers greeted the sun.',
 'The airplane drags through the sky, leaving a trail of whispers behind.',
 'The dog drags through the field, a blur of fur and joy.',
 'The sky turned a mosaic of pink and orange as the day turned to night.']