In [1]:
import numpy as np
from scipy.special import softmax
import pdb
import pandas as pd
import math
from typing import List
import random
import argparse
import torch


def sent_scoring(model_tokenizer, text, cuda, score_type="loss", output_attentions=False, length_normalize=False):
    model = model_tokenizer[0]
    tokenizer = model_tokenizer[1]
    assert model is not None
    assert tokenizer is not None
    encoded_text = tokenizer.encode(text)
    input_ids = torch.tensor(encoded_text).unsqueeze(0)
    if cuda:
        input_ids = input_ids.to('cuda')
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids, output_attentions=output_attentions)
    loss, logits = outputs[:2]

    sentence_prob = loss.item()
    if score_type == "prob":
        if length_normalize:
            mult = 2
        else:
            mult = len(encoded_text)

        sentence_prob = math.exp(-1.0 * loss * (mult - 1))

    if output_attentions:
        attn = outputs["attentions"]
        return sentence_prob, attn, input_ids

    return sentence_prob

def confusion_matrix(P_forward_1, P_forward_2, P_backward_1, P_backward_2):
    correct_forward = len(np.where(np.array(P_forward_1) >= 0.5)[0]) + len(np.where(np.array(P_forward_2) >=0.5)[0])
    wrong_forward = len(P_forward_1) + len(P_forward_2) - correct_forward

    correct_backward = len(np.where(np.array(P_backward_1) >= 0.5)[0]) + len(np.where(np.array(P_backward_2) >=0.5)[0])
    wrong_backward = len(P_backward_1) + len(P_backward_2) - correct_backward

    print("correct forward", correct_forward, "wrong forward", wrong_forward, "correct backward", correct_backward, "wrong_backward", wrong_backward)

    results = {
        "correct_forward": correct_forward,
        "wrong_forward": wrong_forward,
        "correct_backward": correct_backward,
        "wrong_backward": wrong_backward
    }

    return results

from tqdm import tqdm

def evaluate_model(model, tokenizer, test_set, middle_phrase="", use_prefix=0, verbose=True, score_type="prob", use_cuda=False, return_acc=False, total = 1094) -> tuple:
    preds = []
    labels = []
    x_1 = []
    x_2 = []
    y_1 = []
    y_2 = []
    P_x_1 = []
    P_x_2 = []
    P_y_1 = []
    P_y_2 = []
    P_x_1_y_1 = []
    P_x_1_y_2 = []
    P_x_2_y_1 = []
    P_x_2_y_2 = []
    P_x_1_correct = []
    P_x_2_correct = []
    P_y_1_correct = []
    P_y_2_correct = []
    correct = 0

    for i, metaphor_data in tqdm(enumerate(test_set), total = total):
        ctx, p1, p2 = metaphor_data["startphrase"], metaphor_data["ending1"], metaphor_data["ending2"]
        labels.append(int(metaphor_data["labels"]))
        if use_prefix > 0:
            prefix_prompt = select_prefix_prompts(prompt_file, use_prefix) if use_prefix else ""
        else:
            prefix_prompt = ""

        sent1 = prefix_prompt + ctx + ". " + middle_phrase + p1 + "."
        sent2 = prefix_prompt + ctx + ". " + middle_phrase + p2 + "."

        score1 = sent_scoring((model, tokenizer), sent1, use_cuda, score_type=score_type)
        score2 = sent_scoring((model, tokenizer), sent2, use_cuda, score_type=score_type)

        if score_type == "loss":
            pred = 0 if score1 < score2 else 1
        else:
            pred = 1 if score1 < score2 else 0

        pred_sent = sent1 if pred == 0 else sent2

        if i % 2 == 0:
            x_1.append(ctx)
            x_1_score = sent_scoring((model, tokenizer), ctx + ".", use_cuda, score_type=score_type)
            P_x_1.append(x_1_score)
            y_1.append(p1)
            y_2.append(p2)
            y1_score = sent_scoring((model, tokenizer), p1 + ".", use_cuda, score_type=score_type)
            y2_score = sent_scoring((model, tokenizer), p2 + ".", use_cuda, score_type=score_type)
            P_y_1.append(y1_score)
            P_y_2.append(y2_score)

            P_x_1_y_1.append(score1)
            P_x_1_y_2.append(score2)
            P_x_1_correct.append(score1/(score1 + score2))

        else:
            x_2.append(ctx)
            x_2_score = sent_scoring((model, tokenizer), ctx + ".", use_cuda, score_type=score_type)
            P_x_2.append(x_2_score)
            P_x_2_y_1.append(score1)
            P_x_2_y_2.append(score2)
            P_x_2_correct.append(score2/(score1 + score2))

            P_y_1_correct.append(P_x_1_y_1[-1]/(P_x_1_y_1[-1] + score1))
            P_y_2_correct.append(score2/(P_x_1_y_2[-1] + score2))

        if verbose:
            print(f"Q: {ctx}: 1. {p1} 2. {p2}")
            print(f"model says '{pred_sent}' is more likely")
            print("\n")
        if pred == metaphor_data["labels"]:
            correct += 1
        preds.append(pred)

    cols = {"x_1": x_1, "x_2": x_2, "y_1": y_1, "y_2": y_2, "P(x_1)": P_x_1, "P(x_2)": P_x_2, "P(y_1)": P_y_1, "P(y_2)": P_y_2,
        "P(x_1, y_1)": P_x_1_y_1, "P(x_1, y_2)": P_x_1_y_2, "P(x_2, y_1)": P_x_2_y_1, "P(x_2, y_2)": P_x_2_y_2,
        "P(y_1|x_1)": P_x_1_correct, "P(y_2|x_2)": P_x_2_correct, "P(x_1|y_1)": P_y_1_correct, "P(x_2|y_2)": P_y_2_correct}
    out_df = pd.DataFrame(cols)

    if return_acc:
        return correct/len(preds), out_df, preds, labels

    return out_df, preds, labels

def compute_stats(total_df: pd.DataFrame, all_preds: List, all_labels: List) -> None:
    print("overall accuracy: ")
    accuracyy = len(np.where(np.array(all_preds) == np.array(all_labels))[0])/len(all_labels)
    print(accuracyy)
    print("confusion matrix: ")
    matrix_dic = confusion_matrix(list(total_df["P(y_1|x_1)"]), list(total_df["P(y_2|x_2)"]), list(total_df["P(x_1|y_1)"]), list(total_df["P(x_2|y_2)"]))

    return accuracyy, matrix_dic




In [2]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments

# Load T5 tokenizer and model
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [4]:
!git clone https://github.com/nightingal3/Fig-QA.git

Cloning into 'Fig-QA'...
remote: Enumerating objects: 639, done.[K
remote: Counting objects: 100% (208/208), done.[K
remote: Compressing objects: 100% (119/119), done.[K
remote: Total 639 (delta 130), reused 139 (delta 88), pack-reused 431[K
Receiving objects: 100% (639/639), 2.81 MiB | 12.47 MiB/s, done.
Resolving deltas: 100% (353/353), done.


In [5]:
import pandas as pd

# Define paths
train_small_path = "/kaggle/working/Fig-QA/data/filtered/train_s.csv"
train_path = "/kaggle/working/Fig-QA/data/filtered/train.csv"
mturk_path = "/kaggle/working/Fig-QA/data/filtered/mturk_processed - combined.csv"
test_path = "/kaggle/working/Fig-QA/data/filtered/test.csv"
train_xl_path = "/kaggle/working/Fig-QA/data/filtered/train_xl.csv"
original_data_path = "/kaggle/working/Fig-QA/data/filtered/original_data.csv"
dev_path = "/kaggle/working/Fig-QA/data/filtered/dev.csv"

# Load CSV files into dataframes
train_small_df = pd.read_csv(train_small_path)
train_df = pd.read_csv(train_path)
mturk_df = pd.read_csv(mturk_path)
test_df = pd.read_csv(test_path)
train_xl_df = pd.read_csv(train_xl_path)
original_data_df = pd.read_csv(original_data_path)
dev_df = pd.read_csv(dev_path)


In [6]:
from datasets import Dataset

train_small_dataset = Dataset.from_pandas(train_small_df)
train_dataset = Dataset.from_pandas(train_df)
mturk_dataset = Dataset.from_pandas(mturk_df)
test_dataset = Dataset.from_pandas(test_df)
train_xl_dataset = Dataset.from_pandas(train_xl_df)
original_data_dataset = Dataset.from_pandas(original_data_df)
dev_dataset = Dataset.from_pandas(dev_df)

datasets = {
    "train_small": train_small_dataset,
    "train": train_dataset,
    "mturk": mturk_dataset,
    "test": test_dataset,
    "train_xl": train_xl_dataset,
    "original_data": original_data_dataset,
    "dev": dev_dataset,
}


In [7]:
dev_dataset[0]
subset_test_dataset = dev_dataset.select(range(30))

In [8]:
subset_test_dataset = dev_dataset.select(range(30))
out_df, preds, labels = evaluate_model(model, tokenizer, subset_test_dataset, verbose = False, total = 30)
zero_shot_accuracy, conf_matrix_zero_shot =  compute_stats(out_df, preds, labels)

100%|██████████| 30/30 [00:35<00:00,  1.19s/it]

overall accuracy: 
0.5666666666666667
confusion matrix: 
correct forward 17 wrong forward 13 correct backward 14 wrong_backward 16





In [9]:
def map_concatenation_and_tokenization(samples):
    concatenated_phrases = []
    input_ids_list = []
    attention_mask_list = []

    for i in range(len(samples['startphrase'])):
        # Choose the ending based on the labels value for each sample in the batch
        ending = samples['ending1'][i] if samples['labels'][i] == 0 else samples['ending2'][i]
        concatenated_phrase = samples['startphrase'][i] + ' -> ' + ending
        concatenated_phrases.append(concatenated_phrase)

        # Tokenize the concatenated_phrase
        tokens = tokenizer(concatenated_phrase, truncation=True, max_length=512, return_tensors='pt')
        input_ids_list.append(tokens['input_ids'][0].tolist())
        attention_mask_list.append(tokens['attention_mask'][0].tolist())

    return {
        'concatenated_phrase': concatenated_phrases,
        'input_ids': input_ids_list,
        'attention_mask': attention_mask_list
    }

# def map_concatenation_and_tokenization(samples):
#     concatenated_phrases = []
#     input_ids_list = []
#     attention_mask_list = []

#     for i in range(len(samples['startphrase'])):
#         # Decide which ending is the correct and which is the wrong based on the 'labels' value
#         correct_ending = samples['ending1'][i] if samples['labels'][i] == 0 else samples['ending2'][i]
#         wrong_ending = samples['ending2'][i] if samples['labels'][i] == 0 else samples['ending1'][i]

#         concatenated_phrase = samples['startphrase'][i] + " That means that: " + correct_ending + " It is antonymous with: " + wrong_ending
#         concatenated_phrases.append(concatenated_phrase)

#         # Tokenize the concatenated_phrase
#         tokens = tokenizer(concatenated_phrase, truncation=True, max_length=512, return_tensors='pt')
#         input_ids_list.append(tokens['input_ids'][0].tolist())
#         attention_mask_list.append(tokens['attention_mask'][0].tolist())

#     return {
#         'concatenated_phrase': concatenated_phrases,
#         'input_ids': input_ids_list,
#         'attention_mask': attention_mask_list }

# Apply the mapping function

data = train_xl_dataset.map(map_concatenation_and_tokenization, batched=True)
data_val = dev_dataset.map(map_concatenation_and_tokenization, batched=True)


  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [10]:
data

Dataset({
    features: ['startphrase', 'ending1', 'ending2', 'labels', 'valid', 'qid', 'concatenated_phrase', 'input_ids', 'attention_mask'],
    num_rows: 8016
})

In [11]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data['input_ids'])

    def __getitem__(self, idx):
        return {
            'input_ids': self.data['input_ids'][idx],
            'attention_mask': self.data['attention_mask'][idx],
            'labels': self.data['input_ids'][idx]  # In T5, labels are the same as input_ids for the decoder.
        }

train_dataset = CustomDataset(data)
val_dataset = CustomDataset(data_val)


In [12]:
def custom_data_collator(batch):
    # Find the maximum length of sequences in the batch for padding
    max_length = max([len(item['input_ids']) for item in batch])
    
    # Pad each sequence to the max_length
    input_ids = [item['input_ids'] + [tokenizer.pad_token_id] * (max_length - len(item['input_ids'])) for item in batch]
    attention_mask = [item['attention_mask'] + [0] * (max_length - len(item['attention_mask'])) for item in batch]
    labels = [item['labels'] + [tokenizer.pad_token_id] * (max_length - len(item['labels'])) for item in batch]
    
    return {
        'input_ids': torch.tensor(input_ids),
        'attention_mask': torch.tensor(attention_mask),
        'labels': torch.tensor(labels)
    }

In [13]:
training_args = TrainingArguments(
        per_device_train_batch_size=20,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps= 30,
        #num_train_epochs=2,
        eval_steps= 1,
        learning_rate= 0.0001,
        logging_steps=1,
        output_dir="outputs",
        #evaluation_strategy="steps"
    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator= custom_data_collator
)

trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Step,Training Loss
1,7.8924
2,7.2511
3,7.4789
4,6.2925
5,4.553
6,3.7588
7,3.3335
8,2.9664
9,2.0097
10,1.8307


TrainOutput(global_step=30, training_loss=1.8671572382251422, metrics={'train_runtime': 1334.3516, 'train_samples_per_second': 3.597, 'train_steps_per_second': 0.022, 'total_flos': 209234125209600.0, 'train_loss': 1.8671572382251422, 'epoch': 0.6})

In [14]:
model.to('cpu')
out_df, preds, labels = evaluate_model(model, tokenizer, subset_test_dataset, verbose = False, total = 30)
zero_shot_accuracy, conf_matrix_zero_shot =  compute_stats(out_df, preds, labels)

100%|██████████| 30/30 [00:36<00:00,  1.21s/it]

overall accuracy: 
0.36666666666666664
confusion matrix: 
correct forward 11 wrong forward 19 correct backward 10 wrong_backward 20



