In [2]:
import pandas as pd
from src.pycocoevalcap.meteor.meteor import Meteor
from src.pycocoevalcap.rouge.rouge import Rouge
from src.pycocoevalcap.bleu.bleu import Bleu
from sentence_transformers import SentenceTransformer, util

# Module-level scorer instances, created once when this cell runs and reused
# by the evaluation function below.
# NOTE(review): meteor_obj is not referenced by the code visible in this cell —
# confirm it is still needed before keeping the construction cost.
meteor_obj = Meteor()
rouge_obj = Rouge()
# test_response_prediction unpacks four values from this scorer's result
# (presumably BLEU-1 through BLEU-4 for the argument 4 — confirm against Bleu).
bleu_obj = Bleu(4)
from datasets import load_dataset
import random

def white_space_fix(text):
    """Collapse each run of whitespace in ``text`` to one space and trim both ends."""
    tokens = text.split()
    return " ".join(tokens)


def process_narrative_row(row):
    """Turn one NarrativeQA row into an ``article`` / ``answer`` text pair.

    One entry of ``row["answers"]`` is picked with ``random.choice``, so
    repeated calls on the same row can return different ``answer`` values.
    The ``article`` string is the question followed by the document summary.
    Both returned strings end with the ``</s>`` marker and are
    whitespace-normalized with ``white_space_fix``.
    """
    picked = random.choice(row["answers"])
    reference = picked["text"]

    query = row["question"]["text"]
    summary = row["document"]["summary"]["text"]

    # Model input layout: "question: <q> context: <summary> </s>".
    model_input = "question: " + query + " context: " + summary + " </s>"
    model_target = reference + " </s>"

    article = white_space_fix(model_input)
    answer = white_space_fix(model_target)
    return {"article": article, "answer": answer}


def test_response_prediction(pred_file_name, dataset='squad_v2'):
    df = pd.read_csv(pred_file_name).astype(str)
    predictions = df["predictions_str"].tolist()
    normal_preds = [white_space_fix(pred).removesuffix(' </s>') for pred in predictions]

    if dataset == 'narrativeqa':
        dev_dataset = load_dataset("narrativeqa", split="validation")
        dev_dataset = dev_dataset.map(
            process_narrative_row,
            remove_columns=["document", "answers", "question"],
        )
    
    gold_lines = []
    for row in dev_dataset:
        gold_line = white_space_fix(row["answer"].strip()).removesuffix(' </s>')
        gold_lines.append(gold_line)

    assert len(gold_lines) == len(normal_preds)

    word_target_dict = {}
    word_response_dict = {}

    for i in range(len(gold_lines)):
        word_target_dict[i] = [gold_lines[i]]
        word_response_dict[i] = [normal_preds[i]]

    bleu_score, bleu_scores = bleu_obj.compute_score(
            word_target_dict, word_response_dict)

    bleu1_score, _, _, bleu4_score = bleu_score

    bleu1_scores, _, _, bleu4_scores = bleu_scores

    rouge_score, rouge_scores = rouge_obj.compute_score(
            word_target_dict, word_response_dict) 

    '''
    model = SentenceTransformer('stsb-roberta-large')

    embedding1 = model.encode(gold_lines, convert_to_tensor=True)
    embedding2 = model.encode(normal_preds, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)
    sim = 0.0
    for i in range(len(gold_lines)):
        sim += cosine_scores[i][i].item()

    mean_sim = sim / len(gold_lines)
    '''
    #return {"ROUGE-L": rouge_score, "BLEU-1": bleu1_score, "BLEU-4": bleu4_score, "COS": mean_sim}
    return {"ROUGE-L": rouge_score, "BLEU-1": bleu1_score, "BLEU-4": bleu4_score}

In [None]:
# Directory holding the NarrativeQA response-prediction CSVs.
# This is a machine-specific absolute path; adjust it when running elsewhere.
main_path = "/Users/saeed/Desktop/codes/repos/dreamscape-qa/experiment-data/response_narrative_qa_data/"
# Score the epoch-0 dev predictions; the returned metrics dict is the cell's output.
test_response_prediction(f"{main_path}narrativeqa_dev.epoch0.csv", dataset='narrativeqa')

Using custom data configuration default


Downloading and preparing dataset narrative_qa/default (download: 183.61 MiB, generated: 15.21 GiB, post-processed: Unknown size, total: 15.38 GiB) to /Users/saeed/.cache/huggingface/datasets/narrative_qa/default/0.0.0/daef7ccc51ec258bef464658d11751bb20f033da9b4c219fd84563b3a4af0422...


0 examples [00:00, ? examples/s]