# Compute BERTScores

Compute the similarity of each (sentence, short-text answer) pair using BERTScore (see [BERTScore: Evaluating Text Generation with BERT](https://arxiv.org/abs/1904.09675)).
GPU required.

In [None]:
import pandas as pd
import numpy as np
from bert_score import BERTScorer
import warnings
import logging

# Keep the notebook output readable: hide FutureWarning messages and only let
# the "transformers" logger emit records at ERROR level or above.
warnings.filterwarnings(action="ignore", category=FutureWarning)
logging.getLogger(name="transformers").setLevel(level=logging.ERROR)

# Shared scorer instance, built once and reused for every pair scored below.
scorer = BERTScorer(rescale_with_baseline=True, lang="en")


def bertscore_(prediction: str, reference: str):
    """Score one prediction against one reference with the shared scorer.

    Both strings are wrapped in single-element lists because the scorer is
    called on lists of candidates and references. The scorer's result is
    returned unchanged (presumably a precision/recall/F tuple, matching how
    the result columns are named further down -- confirm against bert_score).
    """
    candidates = [prediction]
    references = [reference]
    return scorer.score(candidates, references)


# Element-wise wrapper so two aligned sequences of strings can be scored in
# one call; each pair still goes through bertscore_ individually.
bertscore = np.vectorize(bertscore_)

In [None]:
# Labels for the five annotators: "A1" through "A5".
annotators = [f"A{number}" for number in range(1, 6)]


def long_to_wide(frame: pd.DataFrame, n: int, col_names=None) -> pd.DataFrame:
    """Reshape long to wide format, transposing groups of n rows into columns.

    Rows are grouped purely by position: rows 0..n-1 form the first group,
    rows n..2n-1 the second, and so on. Within a group, the i-th row is moved
    to the i-th entry of a new innermost column level.

    Args:
        frame: Long-format frame whose rows come in consecutive groups of n.
        n: Number of consecutive rows per group; must be at least 1.
        col_names: Optional sequence of n unique labels for the new column
            level. When omitted, the positions 0..n-1 are used as labels.

    Returns:
        A frame that keeps the original index levels and has a two-level
        column index of (original column, position or label).

    Raises:
        ValueError: If n is smaller than 1, or if col_names does not consist
            of exactly n unique labels.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    base_index = pd.RangeIndex(len(frame)).to_series()
    group_level = base_index.floordiv(n)  # which group a row belongs to
    column_level = base_index.mod(n)  # position of the row inside its group

    if col_names is not None:
        # Explicit check instead of `assert`, which is skipped under `-O`.
        if not (len(col_names) == len(set(col_names)) == n):
            raise ValueError(
                "col_names must be unique and have the same length as n"
            )
        column_level = column_level.map(dict(enumerate(col_names)))

    return (
        # The helper levels are appended after the existing index levels ...
        frame.set_index([group_level, column_level], append=True)
        # ... the innermost one (position/label) is pivoted into the columns ...
        .unstack()
        # ... and the now-redundant group counter is removed from the index.
        .droplevel(-1)
    )


def wide_to_long(frame: pd.DataFrame) -> pd.DataFrame:
    """Reshape the joined wide sentence-answer frame to long format.

    Every column other than "sentence" is stacked into a single "answer"
    column, so each output row pairs the sentence with one of the values
    that sat next to it in the wide frame.

    Args:
        frame: Wide frame with a "sentence" column plus one column per answer.

    Returns:
        A frame with the columns "sentence" and "answer" that keeps the
        original index levels of ``frame``.

    Note:
        How missing answers are treated follows the default behaviour of
        ``DataFrame.stack`` in the installed pandas version.
    """
    stacked = (
        frame.set_index("sentence", append=True)
        .stack()
        .rename("answer")
    )
    # Move "sentence" back into a column by level name rather than by a fixed
    # position, so the input index may have any number of levels; then drop
    # the innermost level, which holds the former column labels.
    return stacked.reset_index(level="sentence").droplevel(-1)


# read both CSV inputs, each indexed by (batch, file)
nyt = pd.read_csv("data/gold_data_mapped.csv", index_col=["batch", "file"])
sta_df = pd.read_csv("data/clean_answers.csv", index_col=["batch", "file"])

# spread every run of len(annotators) answer rows into one column per
# annotator, then attach those columns to the sentence with the same index
sta_wide = long_to_wide(sta_df, n=len(annotators), col_names=annotators)["answer"]
df = nyt.loc[:, ["sentence"]].join(sta_wide)

# back to long format: one row per (sentence, answer) pair
df_long = wide_to_long(df)

# show the resulting frame
df_long

Unnamed: 0_level_0,Unnamed: 1_level_0,sentence,answer
batch,file,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,"In the race for Westchester County executive, ...",The changing of the United States Constitution...
1,1,"In the race for Westchester County executive, ...",Mr Brodsky is criticizing Mr O'Rourke for not ...
1,1,"In the race for Westchester County executive, ...","Aside from New York city, Westchester has the ..."
1,1,"In the race for Westchester County executive, ...",about abortion
1,1,"In the race for Westchester County executive, ...",Abortion rights
...,...,...,...
10,10,The researcher is currently following a large ...,Trans fatty acids being added to some liquid v...
10,10,The researcher is currently following a large ...,Man made fatty acids are used in food to incre...
10,10,The researcher is currently following a large ...,Trans fatty acids are also added to some liqui...
10,10,The researcher is currently following a large ...,The adding of trans fatty acids to liquid vege...


In [None]:
# Score all pairs in one call so BERTScorer can batch them internally instead
# of running the model once per pair (the one-pair-per-call version took about
# 6 minutes with a P100 GPU on kaggle.com).
# Both texts are lower-cased first. Each returned tensor is converted to a
# NumPy array, so `scores` is a 3-tuple of equally long 1-D arrays.
scores = tuple(
    metric.numpy()
    for metric in scorer.score(
        df_long.sentence.str.lower().tolist(),
        df_long.answer.str.lower().tolist(),
    )
)

In [None]:
# inspect results
# Work on an explicit, independent copy: `df_long.loc[:]` only yields a shallow
# copy whose data is still shared with df_long.
scores_long = df_long.copy()
# `scores` holds three aligned sequences; transposing gives one row per pair
# with the three values as columns.
scores_long[["precision", "recall", "fscore"]] = np.array(scores).T
scores_long.head()

In [None]:
# persist the scored pairs (index levels included) next to the input data
scores_long.to_csv(path_or_buf="data/bert_scores_long_uncased.csv")