In [1]:
# MultiNLI dev splits (matched and mismatched) read by the cells below.
data_files = [
    "../datasets/mnli/multinli_1.0_dev_matched.jsonl.small.jsonl",
    "../datasets/mnli/multinli_1.0_dev_mismatched.jsonl.small.jsonl",
]

In [2]:
import pandas as pd

def load(datasets):
    """Read several JSON-lines files into one DataFrame.

    Args:
        datasets: iterable of file paths; each file is parsed with
            ``pd.read_json(..., lines=True)``.

    Returns:
        The concatenation of all files, with an extra ``dataset`` column
        holding the file name (the part of the path after the last ``/``).
        Each file keeps its own row index, so index values can repeat
        across files.
    """
    frames = [
        pd.read_json(path, lines=True).assign(dataset=path.rsplit("/", 1)[-1])
        for path in datasets
    ]
    return pd.concat(frames)

# Load both dev splits into one frame and peek at a few random rows.
df = load(data_files)
df.sample(5)

Unnamed: 0,gold_label,annotator_labels,pairID,promptID,dataset
1584,entailment,"[entailment, entailment, entailment, entailmen...",139669e,139669,multinli_1.0_dev_mismatched.jsonl.small.jsonl
7804,entailment,"[entailment, entailment, entailment, entailmen...",116253e,116253,multinli_1.0_dev_matched.jsonl.small.jsonl
7260,entailment,"[entailment, entailment, entailment, neutral, ...",115890e,115890,multinli_1.0_dev_matched.jsonl.small.jsonl
6010,entailment,"[entailment, entailment, entailment, entailmen...",86793e,86793,multinli_1.0_dev_matched.jsonl.small.jsonl
1344,neutral,"[neutral, contradiction, contradiction, neutra...",28337n,28337,multinli_1.0_dev_mismatched.jsonl.small.jsonl


In [3]:
# Summary statistics of the numeric columns (only promptID at this point).
df.describe()

Unnamed: 0,promptID
count,20000.0
mean,73259.8815
std,42218.092195
min,14.0
25%,36751.75
50%,72904.0
75%,110036.0
max,146129.0


In [4]:
from collections import Counter
import random
def add_column_pd(df):
    """Add annotator-agreement columns derived from ``annotator_labels``.

    Two columns are written onto ``df`` itself (the frame is modified in
    place and also returned so the function can be used with ``.pipe``):

    * ``pd``: share of the row's annotators that chose the most common label.
    * ``majority_label``: that most common label.

    Args:
        df: frame with an ``annotator_labels`` column of label lists.

    Returns:
        The same ``df`` object with the two columns added.
    """
    def top_vote(labels):
        # (label, number_of_votes) for the most frequent label of one row.
        return Counter(labels).most_common()[0]

    def agreement(labels):
        return top_vote(labels)[1] / len(labels)

    def winner(labels):
        return top_vote(labels)[0]

    annotations = df["annotator_labels"]
    df["pd"] = annotations.apply(agreement)
    df["majority_label"] = annotations.apply(winner)
    return df

def add_column_anotherhuman(df, num_iterations=5, seed=None):
    """Simulate extra human annotators by sampling from each row's labels.

    For every iteration ``i`` a column ``another_human_{i}`` is written onto
    ``df`` (modified in place), holding one label drawn uniformly at random
    from that row's ``annotator_labels`` list.

    Args:
        df: frame with an ``annotator_labels`` column of non-empty label lists.
        num_iterations: number of ``another_human_{i}`` columns to create.
        seed: optional seed. When given, a private ``random.Random(seed)`` is
            used so repeated runs produce identical columns; when ``None``
            the module-level ``random`` generator is used, as before.

    Returns:
        The same ``df`` object with the sampled columns added.
    """
    # A private generator makes seeded runs repeatable without disturbing the
    # global random state that other cells may rely on.
    rng = random if seed is None else random.Random(seed)
    for i in range(num_iterations):
        df[f"another_human_{i}"] = df["annotator_labels"].apply(
            lambda labels: rng.sample(labels, 1)[0]
        )
    return df


In [5]:
# Add the agreement columns, then the sampled "another human" columns.
df = (
    df
    .pipe(add_column_pd)
    .pipe(add_column_anotherhuman)
)
df.head()

Unnamed: 0,gold_label,annotator_labels,pairID,promptID,dataset,pd,majority_label,another_human_0,another_human_1,another_human_2,another_human_3,another_human_4
0,neutral,"[neutral, entailment, neutral, neutral, neutral]",63735n,63735,multinli_1.0_dev_matched.jsonl.small.jsonl,0.8,neutral,neutral,entailment,neutral,neutral,neutral
1,contradiction,"[contradiction, contradiction, contradiction, ...",91383c,91383,multinli_1.0_dev_matched.jsonl.small.jsonl,1.0,contradiction,contradiction,contradiction,contradiction,contradiction,contradiction
2,entailment,"[entailment, entailment, entailment, entailmen...",755e,755,multinli_1.0_dev_matched.jsonl.small.jsonl,1.0,entailment,entailment,entailment,entailment,entailment,entailment
3,contradiction,"[contradiction, contradiction, contradiction, ...",78013c,78013,multinli_1.0_dev_matched.jsonl.small.jsonl,1.0,contradiction,contradiction,contradiction,contradiction,contradiction,contradiction
4,contradiction,"[contradiction, contradiction, contradiction, ...",96377c,96377,multinli_1.0_dev_matched.jsonl.small.jsonl,1.0,contradiction,contradiction,contradiction,contradiction,contradiction,contradiction


In [6]:
# How many rows fall into each annotator-agreement level.
df["pd"].value_counts()

pd
1.0    11743
0.8     4859
0.6     3045
0.4      353
Name: count, dtype: int64

In [7]:
from sklearn.metrics import accuracy_score
import statistics


def accuracy_by_pd(df,pred_labels, majority_label="majority_label"):
    """Summarise prediction accuracy per dataset and per agreement level.

    For every ``dataset`` value and every ``pd`` value found inside it, the
    accuracy of each column in ``pred_labels`` against ``majority_label`` is
    computed, and the mean and sample standard deviation of those accuracies
    are reported.

    Args:
        df: frame with ``dataset`` and ``pd`` columns plus the label columns.
        pred_labels: names of the prediction columns to score. The first name
            is used as the suffix of the score columns in the result.
        majority_label: name of the reference label column.

    Returns:
        A DataFrame with one row per (dataset, pd) pair and the columns
        ``dataset``, ``pd``, ``S`` (row count of the subset),
        ``Expected_Accuracy`` (equal to ``pd``), ``score_mean_<first>``,
        ``score_std_<first>``, ``delta_expected_<first>`` (``pd`` minus the
        mean score) and ``M`` (the constant ``"-"``). When only one
        prediction column is given the standard deviation is NaN, because a
        sample standard deviation needs at least two values.
    """
    rows = []
    for dataset_name in df["dataset"].unique():
        df_dataset = df[df["dataset"] == dataset_name]
        for p_d in df_dataset["pd"].unique():
            subset_df = df_dataset[df_dataset["pd"] == p_d]
            # Suffix of the score columns; read inside the loop so an empty
            # frame still yields an empty result without touching pred_labels.
            label_suffix = pred_labels[0]

            scores = [
                accuracy_score(subset_df[majority_label], subset_df[pred_label])
                for pred_label in pred_labels
            ]
            score_mean = statistics.mean(scores)
            # statistics.stdev raises StatisticsError for fewer than two
            # values; report NaN instead of failing on a single column.
            score_std = statistics.stdev(scores) if len(scores) > 1 else float("nan")

            rows.append({
                "dataset": dataset_name,
                "pd": p_d,
                "S": len(subset_df),
                "Expected_Accuracy": p_d,
                f"score_mean_{label_suffix}": score_mean,
                f"score_std_{label_suffix}": score_std,
                f"delta_expected_{label_suffix}": p_d - score_mean,
                "M": "-"
            })
    return pd.DataFrame(rows)

# Score every sampled "another human" column against the majority label.
scores_df = accuracy_by_pd(
    df,
    [column for column in df.columns if column.startswith("another_human")],
)
scores_df

Unnamed: 0,dataset,pd,S,Expected_Accuracy,score_mean_another_human_0,score_std_another_human_0,delta_expected_another_human_0,M
0,multinli_1.0_dev_matched.jsonl.small.jsonl,0.8,2457,0.8,0.799919,0.007341,8.1e-05,-
1,multinli_1.0_dev_matched.jsonl.small.jsonl,1.0,5759,1.0,1.0,0.0,0.0,-
2,multinli_1.0_dev_matched.jsonl.small.jsonl,0.6,1599,0.6,0.597749,0.018443,0.002251,-
3,multinli_1.0_dev_matched.jsonl.small.jsonl,0.4,185,0.4,0.419459,0.039091,-0.019459,-
4,multinli_1.0_dev_mismatched.jsonl.small.jsonl,1.0,5984,1.0,1.0,0.0,0.0,-
5,multinli_1.0_dev_mismatched.jsonl.small.jsonl,0.6,1446,0.6,0.607469,0.013072,-0.007469,-
6,multinli_1.0_dev_mismatched.jsonl.small.jsonl,0.8,2402,0.8,0.797669,0.005791,0.002331,-
7,multinli_1.0_dev_mismatched.jsonl.small.jsonl,0.4,168,0.4,0.390476,0.038021,0.009524,-


In [11]:
def format_latex(df):
    df["formatted_score"] = df.apply(lambda x: f"{x['score_mean_another_human_0']:.2f}$\\pm${x['score_std_another_human_0']:.2f}", axis=1)

    df["formatted_pd"] = df["pd"].apply(lambda x: f"{x:.2f}")

    df_new = df.set_index(["dataset", "formatted_pd", "S"])

    return df_new[["Expected_Accuracy", "formatted_score", "delta_expected_another_human_0", "M"]].to_latex(index=True)

# print() so the LaTeX source is shown verbatim rather than as a quoted string repr.
print(format_latex(scores_df))

\begin{tabular}{lllrlrl}
\toprule
 &  &  & Expected_Accuracy & formatted_score & delta_expected_another_human_0 & M \\
dataset & formatted_pd & S &  &  &  &  \\
\midrule
\multirow[t]{4}{*}{multinli_1.0_dev_matched.jsonl.small.jsonl} & 0.80 & 2457 & 0.800000 & 0.80$\pm$0.01 & 0.000081 & - \\
\cline{2-7}
 & 1.00 & 5759 & 1.000000 & 1.00$\pm$0.00 & 0.000000 & - \\
\cline{2-7}
 & 0.60 & 1599 & 0.600000 & 0.60$\pm$0.02 & 0.002251 & - \\
\cline{2-7}
 & 0.40 & 185 & 0.400000 & 0.42$\pm$0.04 & -0.019459 & - \\
\cline{1-7} \cline{2-7}
\multirow[t]{4}{*}{multinli_1.0_dev_mismatched.jsonl.small.jsonl} & 1.00 & 5984 & 1.000000 & 1.00$\pm$0.00 & 0.000000 & - \\
\cline{2-7}
 & 0.60 & 1446 & 0.600000 & 0.61$\pm$0.01 & -0.007469 & - \\
\cline{2-7}
 & 0.80 & 2402 & 0.800000 & 0.80$\pm$0.01 & 0.002331 & - \\
\cline{2-7}
 & 0.40 & 168 & 0.400000 & 0.39$\pm$0.04 & 0.009524 & - \\
\cline{1-7} \cline{2-7}
\bottomrule
\end{tabular}

