## Clustering

- Map the short text answers to the source article sentences
- Compute the percentage agreement with the gold data
- Inspect clusters

In [1]:
import pandas as pd
from mapping_util import (
    add_sentence_index,
    long_to_wide,
    get_prediction,
    get_mapped_sent_indexes,
)

# Index levels used to key every table in this notebook.
bf = ["batch", "file"]
annotators = ["A1", "A2", "A3", "A4", "A5"]

# Gold data: the CSV's `sent_index` column is renamed to `par_sent_index`
# before add_sentence_index is applied.
nyt = add_sentence_index(
    pd.read_csv("data/gold_data_mapped.csv", index_col=bf).rename(
        columns={"sent_index": "par_sent_index"}
    )
)
# Per (batch, file) group, idxmax yields the index label of the first row with
# the largest `is_antecedent`; `.str[-1]` keeps the last element of that label
# (presumably the sentence index added above -- confirm in mapping_util).
gold = nyt.is_antecedent.groupby(level=bf).idxmax().str[-1]

# Annotator scores: read in long format, reshape with long_to_wide, then index
# the same way as the gold data.
sta = pd.read_csv("data/bert_scores_long_uncased.csv", index_col=bf)
scores_wide = long_to_wide(sta, len(annotators), annotators)
scores = add_sentence_index(scores_wide)


def evaluate(strategy):
    """Return the share of predictions for ``strategy`` that equal ``gold``.

    ``scores[strategy]`` gets ``nyt.is_sn_sent`` appended as an extra index
    level and is handed to ``get_prediction``.  The result is compared with
    ``gold`` via ``.eq`` and the mean of the boolean matches is returned.
    """
    strategy_scores = scores[strategy]
    # NOTE(review): presumably relies on `scores` and `nyt` sharing the same
    # row order, since the flag column comes from a different frame -- verify.
    flagged_scores = strategy_scores.set_index(nyt.is_sn_sent, append=True)
    predicted = get_prediction(flagged_scores)
    matches = predicted.eq(gold)
    return matches.mean()

#### Compute agreement with gold data

In [2]:
# Score each strategy with evaluate() and show the accuracies in one table,
# indexed by strategy name.
strategies = ["precision", "recall", "fscore"]
pd.DataFrame(
    {"accuracy": [evaluate(name) for name in strategies]},
    index=pd.Index(strategies, name="strategy"),
)

Unnamed: 0_level_0,accuracy
strategy,Unnamed: 1_level_1
precision,0.36
recall,0.73
fscore,0.59


#### Inspect clusters

In [None]:
# Name of the shell-noun column (and, later, index level).
sn = "shellnoun"
# First `shellnoun` value within each (batch, file) group.
shellnouns = nyt[sn].groupby(level=bf).head(1)


def create_cumulative_table(frame, col_name):
    """Cross-tabulate value counts per shell noun and append cumulative totals.

    Parameters
    ----------
    frame : pandas.Series
        One value per item; its index must carry a level named by the
        module-level ``sn``.
    col_name : str
        Name given to the axis that holds the distinct values of ``frame``.

    Returns
    -------
    pandas.DataFrame
        One row per shell noun plus a final ``Cumul`` row, and one column per
        distinct value of ``frame``, largest value first.  ``Cumul`` is the
        running total of the per-value counts (summed over all nouns),
        accumulated from the largest value downwards.
    """
    counts = frame.groupby(level=[sn]).value_counts()
    # fill_value=0: a value that never occurs for some noun is a count of
    # zero, not a missing observation.  Without it the gap becomes NaN and the
    # whole table is upcast to float.
    wide = (
        counts.unstack(level=sn, fill_value=0)
        .rename_axis(index=col_name, columns=None)
        .sort_index(ascending=False)
    )
    return wide.assign(Cumul=wide.sum(axis=1).cumsum()).T


# For each (batch, file) group and each annotator column, take the index label
# of the highest recall score once the batch/file levels are dropped.
recall_by_item = scores["recall"].groupby(level=bf)
best_match = recall_by_item.apply(lambda item: item.droplevel(bf).idxmax())
# NOTE(review): `shellnouns` comes from `nyt`; presumably both frames list the
# (batch, file) groups in the same order -- verify.
clustering = best_match.set_index(shellnouns, append=True)
clustering.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,A1,A2,A3,A4,A5
batch,file,shellnoun,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,issue,34,28,29,36,37
1,2,issue,11,6,6,6,6
1,3,fact,2,20,2,0,2
1,4,issue,0,8,22,22,0
1,5,fact,13,13,33,30,33


In [4]:
# Count, per row, how many annotator columns equal the gold value.
# The gold values are reshaped to a column vector so they broadcast across the
# annotator columns.
# NOTE(review): the comparison is positional, so `gold` and `clustering` must
# list the items in the same order -- verify.
gold_column = gold.to_numpy()[:, None]
matches_gold = clustering.eq(gold_column)
gold_sent_counts = matches_gold.sum(axis=1)
create_cumulative_table(gold_sent_counts, "N")

N,5,4,3,2,1,0
fact,11,14,12,5,6,2
issue,2,8,10,9,10,11
Cumul,13,35,57,71,87,100


In [5]:
# Size of the largest group of annotator columns sharing the same value,
# computed row by row.
largest_clusters = clustering.apply(
    lambda answers: answers.value_counts().max(), axis="columns"
)
create_cumulative_table(largest_clusters, "Size")

Size,5,4,3,2,1
fact,11,15,15,8,1
issue,2,9,16,19,4
Cumul,13,37,68,95,100
