In [19]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments, BartForConditionalGeneration, BartTokenizer
import pandas as pd
import torch
import random
from datasets import Dataset

# set GPU device
device = "mps"

In [None]:
# Load Data
combined_df = pd.read_csv("data/combined_dataframe.csv")

dataset_names = combined_df["data_set"].unique()

df_names = []
for name in dataset_names:

    df_name = f"{name}_df"

    globals()[f"{df_name}"] = (
        combined_df[combined_df["data_set"] == name]
        .reset_index(drop=True)
    )

    df_names.append(df_name)


print("New DataFrame variables created:")
for df_name in df_names:
    print(df_name)

New DataFrame variables created:
vua_df
trofi_df
moh_x_df


In [20]:
vua_df = vua_df # type: ignore
trofi_df = trofi_df # type: ignore
moh_x_df = moh_x_df # type: ignore

In [None]:
#data set to train on

dataset = Dataset.from_pandas(vua_df)

In [None]:
# Load model and tokenizer
model_name = "google/flan-t5-small" 
model_t5 = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
tokenizer_t5 = T5Tokenizer.from_pretrained(model_name, legacy=False, clean_up_tokenization_spaces=True)

# Add [MASK] token if not already present
if "[MASK]" not in tokenizer_t5.get_vocab():
    tokenizer_t5.add_tokens("[MASK]")
    model_t5.resize_token_embeddings(len(tokenizer_t5))  # Resize embeddings to match the updated vocab size


def tokenize_function(examples):
    prompt = "Replace [MASK] to create metaphor sentence: {masked_sentence}"

    inputs = [
        prompt.format(masked_sentence=sentence)
        for sentence in examples['masked_sentence']
    ]
    
    targets = examples['sentence']
    
    model_inputs = tokenizer_t5(inputs, max_length=200, truncation=True, padding="max_length")
    labels = tokenizer_t5(targets, max_length=200, truncation=True, padding="max_length")["input_ids"]
    labels = [[-100 if token == tokenizer_t5.pad_token_id else token for token in label] for label in labels]
    
    model_inputs["labels"] = labels

    return model_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# dataset split to verify model works
dataset_split = tokenized_dataset.train_test_split(test_size=0.05, seed=42)
eval_dataset = dataset_split['test']


training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=32,
    num_train_epochs=5,
    logging_steps=100,
)

trainer = Trainer(
    model=model_t5,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=eval_dataset,
)

# Train the model
trainer.train()


Map:   0%|          | 0/6227 [00:00<?, ? examples/s]

  0%|          | 0/195 [00:00<?, ?it/s]

{'loss': 0.2693, 'grad_norm': 0.3655458092689514, 'learning_rate': 0.0014615384615384616, 'epoch': 0.51}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 0.14250056445598602, 'eval_runtime': 3.4331, 'eval_samples_per_second': 90.879, 'eval_steps_per_second': 11.36, 'epoch': 1.0}
{'train_runtime': 198.1592, 'train_samples_per_second': 31.424, 'train_steps_per_second': 0.984, 'train_loss': 0.2311469933925531, 'epoch': 1.0}


TrainOutput(global_step=195, training_loss=0.2311469933925531, metrics={'train_runtime': 198.1592, 'train_samples_per_second': 31.424, 'train_steps_per_second': 0.984, 'total_flos': 452060831539200.0, 'train_loss': 0.2311469933925531, 'epoch': 1.0})

In [None]:
#data set to train on

dataset = Dataset.from_pandas(vua_df)

In [None]:
# Load model and tokenizer
model_name = "facebook/bart-base"  # Use BART model
model_bart = BartForConditionalGeneration.from_pretrained(model_name).to(device)
tokenizer_bart = BartTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)

# Add [MASK] token if not already present (optional for BART)
if "[MASK]" not in tokenizer_bart.get_vocab():
    tokenizer_bart.add_tokens("[MASK]")
    model_bart.resize_token_embeddings(len(tokenizer_bart))  # Resize embeddings to match updated vocab size

# Tokenization function
def tokenize_function(examples):
    prompt = "Replace [MASK] to create metaphor sentence: {masked_sentence}"
    
    # Prepare inputs and targets
    inputs = [
        prompt.format(masked_sentence=sentence)
        for sentence in examples['masked_sentence']
    ]
    targets = examples['sentence']
    
    # Tokenize inputs and targets
    model_inputs = tokenizer_bart(inputs, max_length=200, truncation=True, padding="max_length")
    labels = tokenizer_bart(targets, max_length=200, truncation=True, padding="max_length")["input_ids"]
    labels = [[-100 if token == tokenizer_bart.pad_token_id else token for token in label] for label in labels]
    
    model_inputs["labels"] = labels
    return model_inputs

# tokenize dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Eval set for confirming model fit
dataset_split = tokenized_dataset.train_test_split(test_size=0.05, seed=42)
eval_dataset = dataset_split['test']

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=32,  # Adjust batch size for memory constraints
    num_train_epochs=5,
    logging_steps=100,
)

trainer = Trainer(
    model=model_bart,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Map:   0%|          | 0/6227 [00:00<?, ? examples/s]

  0%|          | 0/195 [00:00<?, ?it/s]

{'loss': 0.3051, 'grad_norm': 0.733235776424408, 'learning_rate': 0.00014615384615384615, 'epoch': 0.51}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 0.09412288665771484, 'eval_runtime': 5.6577, 'eval_samples_per_second': 55.146, 'eval_steps_per_second': 6.893, 'epoch': 1.0}
{'train_runtime': 390.8523, 'train_samples_per_second': 15.932, 'train_steps_per_second': 0.499, 'train_loss': 0.2376271712474334, 'epoch': 1.0}


TrainOutput(global_step=195, training_loss=0.2376271712474334, metrics={'train_runtime': 390.8523, 'train_samples_per_second': 15.932, 'train_steps_per_second': 0.499, 'total_flos': 741568149504000.0, 'train_loss': 0.2376271712474334, 'epoch': 1.0})

In [None]:
'''
To be updated
'''

synthetic_sentences = [
    "The sun [MASK] across the sky, painting the horizon in gold.",
    "Her voice [MASK] like a gentle breeze on a summer day.",
    "The old car [MASK] down the road, coughing with every mile.",
    "The tree [MASK] its arms to the heavens, yearning for light.",
    "The river [MASK] its way through the valley, carving stories in the earth.",
    "The city [MASK] with life as the morning light crept in.",
    "His words [MASK] into her heart like arrows from a bow.",
    "The night sky [MASK] with a blanket of shimmering stars.",
    "The storm [MASK] its fury across the land, leaving chaos in its wake.",
    "The clock [MASK] away the seconds, a steady heartbeat in the silence.",
    "The fire [MASK] in the hearth, whispering tales of warmth and comfort.",
    "The child [MASK] through the field, chasing butterflies of her imagination.",
    "The mountain [MASK] in silent majesty, watching over the valley.",
    "The wind [MASK] its way through the trees, singing a mournful tune.",
    "His laughter [MASK] the room, lighting up every corner.",
    "The ink [MASK] across the paper, leaving trails of forgotten thoughts.",
    "The skyscrapers [MASK] into the sky, defying the pull of the earth.",
    "Her shadow [MASK] behind her, clinging like an unspoken regret.",
    "The book [MASK] stories into the minds of those who dared to read.",
    "The horizon [MASK] the sea and sky in an eternal embrace.",
    "The storm clouds [MASK] a curtain of darkness over the town.",
    "The clock [MASK] its relentless march, indifferent to human desire.",
    "His ambition [MASK] like wildfire, consuming everything in its path.",
    "The melody [MASK] through the air, weaving an invisible tapestry of sound.",
    "The bridge [MASK] the chasm, uniting what was once divided.",
    "The city lights [MASK] the darkness, a sea of artificial stars.",
    "Her tears [MASK] rivers down her cheeks, carving paths of sorrow.",
    "The wind [MASK] secrets through the cracks of the ancient walls.",
    "The mountain's peak [MASK] the heavens, shrouded in a veil of clouds.",
    "The forest [MASK] a labyrinth of shadows, hiding untold mysteries.",
    "The stars above [MASK] in patterns that whispered ancient secrets to the dreamers below.",
    "The artist's brush [MASK] across the canvas, leaving behind strokes of vibrant emotion.",
    "As the river flowed, its surface [MASK] the sky in a shimmering dance of light.",
    "Her laughter [MASK] through the quiet room, breaking the stillness with its melody.",
    "The old house, forgotten by time, [MASK] stories of those who once lived within its walls.",
    "The forest's shadows [MASK] over the path, creating a maze of darkness and light.",
    "The city streets [MASK] with echoes of footsteps as the night grew older.",
    "The candle's flame [MASK] against the cold air, a fragile beacon in the dark.",
    "His words, though soft, [MASK] deep into the hearts of those who listened.",
    "The distant mountains [MASK] a hazy blue in the fading light of dusk.",
    "The clock on the wall [MASK] a steady rhythm, marking the passage of time.",
    "As the wind passed, it [MASK] the leaves into a delicate, rustling symphony.",
    "The clouds [MASK] their shadows over the fields, shifting as the sun moved.",
    "The silence of the library [MASK] with the weight of unwritten stories and untold knowledge.",
    "The waves [MASK] the shore, pulling back only to return with renewed vigor.",
    "The room [MASK] with a faint scent of lavender as she walked in.",
    "He [MASK] the page, his eyes scanning for something he couldn’t quite define.",
    "The train [MASK] the station at precisely 9:00 AM, as it always did.",
    "Her voice [MASK] through the conversation, quiet but firm.",
    "The door [MASK] softly as it closed behind him, leaving the room in silence.",
    "The floorboards [MASK] under his weight, a reminder of the house's age.",
    "The coffee [MASK] on the counter, untouched and growing cold.",
    "He [MASK] the glass, staring out at the rain without seeing it.",
    "The email [MASK] in her inbox, waiting for her reply.",
    "The crowd [MASK] as the speaker walked to the podium.",
    "The light from the window [MASK] across the desk, illuminating the open book.",
    "She [MASK] the scarf tighter around her neck, bracing against the cold.",
    "The papers [MASK] across the table, some slipping to the floor.",
    "The clock [MASK] on the wall, its ticking filling the quiet room.",
    "The car [MASK] in the driveway, engine running but going nowhere."
]

In [13]:
def prompts(sentence: str, method: str ) -> str:
    """
    Creates prompt for input to generate method

    Args:
        sentence (str): Masked sentence
        method (str): "zero" for zero shot "few" for few shot

    Returns:
        Prompt for input to generate method
    """

    # Few-shot example
    few_shot_examples = """
    Transform the following literal sentences into metaphorical ones by replacing [MASK]:

    Input: The task is [MASK] challenging.
    Output: The task is an uphill battle.

    Input: She is [MASK] sad.
    Output: She is drowning in sorrow.

    Input: He is [MASK] angry.
    Output: He is a volcano about to erupt.
    """

    # Zero-shot example
    zero_shot_example = """
    Transform the following literal sentence into metaphorical ones by replacing [MASK]:
    """

    if method == "zero":
        prompt = zero_shot_example + f" {sentence}"
    if method == "few":
        prompt = few_shot_examples + f"\nInput: {sentence}\nOutput:"

    return prompt


In [14]:

def generate_metaphors(model, tokenizer, sentences: list[str], prompt_type: str = "few", view_output: bool = False, ) -> list[str]: # type: ignore
    
    """
    Generates metaphor sentences from masked sentences

    Args:
        model: pre-trained model
        tokenizer: model tokenizer
        sentences (list): list of sentences to transform
        prompt_type (str): "zero" for zero shot "few" for few shot
        view_output (bool): set true to print masked sentences and generated metaphor
        
    Returns:
        A list of transformed sentences
    """
    
    model = model.to(device)
    output = []
    
    for sentence in sentences:

        prompt = prompts(sentence, prompt_type)

        # Tokenize and move to the device
        input_ids = tokenizer(prompt, return_tensors="pt", padding=True).to(device)

        # Generate the output
        output_ids = model.generate(
            inputs=input_ids["input_ids"],  
            max_length=150,                # Allow for longer outputs
            num_beams=5,                  
            do_sample=True,
            temperature=1.2,               # Slightly higher temperature for creativity
            top_k=50,                      # Use top-k sampling to encourage variation
            top_p=0.8,                     # Use nucleus sampling for creativity
            early_stopping=False
        )

        # Decode and print the output
        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        output.append(output_text)

        if view_output is True:
            print(f"Input: {sentence}\nOutput: {output_text}\n")
            
    return output

In [22]:
t5_output = generate_metaphors(model=model_t5,tokenizer=tokenizer_t5, sentences=synthetic_sentences[:5], prompt_type="zero", view_output=True)
t5_output

Input: The sun [MASK] across the sky, painting the horizon in gold.
Output: The sun swept across the sky, painting the horizon in gold.

Input: Her voice [MASK] like a gentle breeze on a summer day.
Output: Her voice rose like a gentle breeze on a summer day.

Input: The old car [MASK] down the road, coughing with every mile.
Output: The old car ran down the road, coughing with every mile.

Input: The tree [MASK] its arms to the heavens, yearning for light.
Output: The tree made its arms to the heavens, yearning for light.

Input: The river [MASK] its way through the valley, carving stories in the earth.
Output: The river took its way through the valley, carving stories in the earth.



['The sun swept across the sky, painting the horizon in gold.',
 'Her voice rose like a gentle breeze on a summer day.',
 'The old car ran down the road, coughing with every mile.',
 'The tree made its arms to the heavens, yearning for light.',
 'The river took its way through the valley, carving stories in the earth.']

In [23]:
bart_output = generate_metaphors(model=model_bart,tokenizer=tokenizer_bart, sentences=synthetic_sentences[:5], prompt_type="zero", view_output=True)
bart_output

Input: The sun [MASK] across the sky, painting the horizon in gold.
Output: Pursuing Transform the following literal sentence into metaphorical ones by replacing left-over faded across the sky, painting the horizon in gold.

Input: Her voice [MASK] like a gentle breeze on a summer day.
Output: Hang like a gentle breeze on a summer day.

Input: The old car [MASK] down the road, coughing with every mile.
Output: Turning the following literal sentence into metaphorical ones by replacing the old car running down the road, coughing with every mile.

Input: The tree [MASK] its arms to the heavens, yearning for light.
Output: Turning its arms to the heavens, yearning for light.

Input: The river [MASK] its way through the valley, carving stories in the earth.
Output: pursuing its way through the valley, carving stories in the earth.



['Pursuing Transform the following literal sentence into metaphorical ones by replacing left-over faded across the sky, painting the horizon in gold.',
 'Hang like a gentle breeze on a summer day.',
 'Turning the following literal sentence into metaphorical ones by replacing the old car running down the road, coughing with every mile.',
 'Turning its arms to the heavens, yearning for light.',
 'pursuing its way through the valley, carving stories in the earth.']