# Compare Run Rankings: NIST Assessor Scores vs. Automatic Summary Metrics (ROUGE, BERTScore)

In [1]:
import pandas as pd

In [2]:
# Submission metadata; the `runtag` and `team` columns are read below to
# associate each run with its team.
SUBMISSIONS_CSV = "submissions.csv"
subs_df = pd.read_csv(SUBMISSIONS_CSV)

In [3]:
# Lookup from run tag to the team that submitted it.
# Built column-wise rather than with DataFrame.iterrows(), which creates a
# Series object for every row just to read two fields.
# If a runtag occurs more than once, the last occurrence wins.
run_to_team_map = dict(zip(subs_df["runtag"], subs_df["team"]))

In [4]:
# Per-run automatic-summary scores; the `run`, `nist.f1` and `wiki.f1`
# columns of both files are used by the ranking cells below.
ROUGE_SUMMARY_CSV = "by_run.rouge.summary.csv"
BERTSCORE_SUMMARY_CSV = "by_run.bertscore.summary.csv"

rouge_df = pd.read_csv(ROUGE_SUMMARY_CSV)
bert_df = pd.read_csv(BERTSCORE_SUMMARY_CSV)

In [5]:
# Attach the submitting team to every row of both metric tables.
# dict.get is used for the lookup, so a run that is missing from
# run_to_team_map receives None instead of raising KeyError.
for metric_df in (rouge_df, bert_df):
    metric_df["team"] = metric_df["run"].apply(run_to_team_map.get)

In [6]:
def _rank_runs(metric_df, score_col):
    """Order runs by `score_col`, best first, and number them from 0.

    Parameters
    ----------
    metric_df : pandas.DataFrame
        Table with a `run` column and the score column to rank by.
    score_col : str
        Name of the column whose values decide the order (higher is better).

    Returns
    -------
    list of (int, run) tuples
        `(position, run)` pairs; position 0 is the highest-scoring run.
        Every run gets its own position, so equal scores are NOT given a
        shared rank -- they are separated by whatever order the sort yields.
    """
    runs_best_first = metric_df.sort_values(by=score_col, ascending=False)["run"]
    return list(enumerate(runs_best_first))


# One ranking per (metric, reference-summary) combination.
ranking_rouge_nist = _rank_runs(rouge_df, "nist.f1")
ranking_rouge_wiki = _rank_runs(rouge_df, "wiki.f1")
ranking_bert_nist = _rank_runs(bert_df, "nist.f1")
ranking_bert_wiki = _rank_runs(bert_df, "wiki.f1")


In [7]:
def _ranking_frame(ranking, rank_col):
    """Turn a list of (position, run) pairs into a frame indexed by run tag.

    The single data column is named `rank_col`; the index is named `runtag`.
    """
    return pd.DataFrame(ranking, columns=[rank_col, "runtag"]).set_index("runtag")


# The ROUGE and BERTScore frames deliberately share the column names
# `rank.nist` / `rank.wiki`; they are told apart by suffixes when joined later.
ranking_rouge_nist_df = _ranking_frame(ranking_rouge_nist, "rank.nist")
ranking_rouge_wiki_df = _ranking_frame(ranking_rouge_wiki, "rank.wiki")
ranking_bert_nist_df = _ranking_frame(ranking_bert_nist, "rank.nist")
ranking_bert_wiki_df = _ranking_frame(ranking_bert_wiki, "rank.wiki")

In [8]:
# Put the NIST-based and wiki-based positions of each run side by side,
# matching rows on the shared `runtag` index (left join, the default).
ranking_rouge_df = ranking_rouge_nist_df.join(other=ranking_rouge_wiki_df, how="left")
ranking_bert_df = ranking_bert_nist_df.join(other=ranking_bert_wiki_df, how="left")

In [9]:
# Combine ROUGE and BERTScore rankings into one table. Both sides carry the
# columns `rank.nist` and `rank.wiki`, so the suffixes disambiguate them
# (e.g. `rank.nist.rouge`, `rank.nist.bert`).
auto_sum_rank_df = ranking_rouge_df.join(
    other=ranking_bert_df,
    how="left",
    lsuffix=".rouge",
    rsuffix=".bert",
)

In [10]:
# Pairwise Pearson correlation (the pandas default) between the rank-position
# columns, i.e. how similarly the automatic metrics order the runs.
auto_sum_rank_df.corr(method="pearson")

Unnamed: 0,rank.nist.rouge,rank.wiki.rouge,rank.nist.bert,rank.wiki.bert
rank.nist.rouge,1.0,0.492674,0.664225,0.64591
rank.wiki.rouge,0.492674,1.0,0.62149,0.529304
rank.nist.bert,0.664225,0.62149,1.0,0.702686
rank.wiki.bert,0.64591,0.529304,0.702686,1.0


In [11]:
# Assessor judgments, one row per (request, run). The event id is everything
# before the last "-" in req_id, e.g. "CrisisFACTS-013-r0" -> "CrisisFACTS-013".
assess_df = pd.read_csv(
    "evaluation.output.assessors/all_runs.csv",
    index_col=0,
).assign(
    event_id=lambda frame: frame["req_id"].map(
        lambda request_id: request_id.rpartition("-")[0]
    )
)

In [12]:
# Event ids CrisisFACTS-009 through CrisisFACTS-018 (ten consecutive events),
# generated instead of listed by hand; numbers are zero-padded to three digits.
rel2023_events = [f"CrisisFACTS-{event_number:03d}" for event_number in range(9, 19)]

In [13]:
# Keep only the assessor rows whose event is listed in rel2023_events.
in_rel2023_events = assess_df["event_id"].isin(rel2023_events)
rel_assess_df = assess_df.loc[in_rel2023_events]

Unnamed: 0,req_id,run,redundancy,comprehensiveness,team,event_id
322,CrisisFACTS-013-r0,IRLabIITBHU_BM25_1,1.000000,0.027778,IRLAB_IIT_BHU,CrisisFACTS-013
323,CrisisFACTS-013-r0,IRLabIITBHU_DFReeKLIM_1,1.000000,0.027778,IRLAB_IIT_BHU,CrisisFACTS-013
324,CrisisFACTS-013-r0,Siena.WikiTrigrams1,1.000000,0.027778,SienaCLTeam,CrisisFACTS-013
325,CrisisFACTS-013-r0,baseline.v1,0.500000,0.027778,crisisfacts,CrisisFACTS-013
326,CrisisFACTS-013-r0,baseline.v2,1.000000,0.027778,crisisfacts,CrisisFACTS-013
...,...,...,...,...,...,...
1441,CrisisFACTS-009-r0,V-TorontoMU_USE_4,0.000000,0.000000,V-TorontoMU,CrisisFACTS-009
1442,CrisisFACTS-009-r0,llama,1.000000,0.090909,umd_hcil,CrisisFACTS-009
1443,CrisisFACTS-009-r0,Human_Info_Lab-FM-B,0.142857,0.181818,Human_Info_Lab,CrisisFACTS-009
1444,CrisisFACTS-009-r0,llama_13b_chat,0.000000,0.000000,OHM,CrisisFACTS-009


In [14]:
# Build one score record per run from the assessor judgments.
run_scores = []
for runtag, run_rows in rel_assess_df.groupby("run"):
    # Macro-average: first the mean within each event, then the mean of those
    # per-event means, so every event contributes equally no matter how many
    # request rows it has.
    per_event_means = run_rows.groupby("event_id")[["redundancy", "comprehensiveness"]].mean()
    macro_means = per_event_means.mean()
    redundancy = macro_means["redundancy"]
    comprehensiveness = macro_means["comprehensiveness"]

    # F1 is the harmonic mean of the two averaged scores. When both are zero
    # the formula is 0/0 (NaN plus a RuntimeWarning); use 0.0 there instead.
    # The `== 0` test is False for NaN, so missing scores still yield NaN.
    score_sum = redundancy + comprehensiveness
    if score_sum == 0:
        f1 = 0.0
    else:
        f1 = (2 * redundancy * comprehensiveness) / score_sum

    run_scores.append({
        "runtag": runtag,
        "redundancy": redundancy,
        "comprehensiveness": comprehensiveness,
        "f1": f1,
    })

In [15]:
# Rank runs by assessor-based F1, best first; positions start at 0 and every
# run gets a distinct position.
run_scores_df = pd.DataFrame(run_scores)
runs_best_first = run_scores_df.sort_values(by="f1", ascending=False)["runtag"]
ranking_nist = list(enumerate(runs_best_first))
ranking_nist_df = (
    pd.DataFrame(ranking_nist, columns=["rank.assessor", "runtag"])
    .set_index("runtag")
)


In [16]:
# Correlate the assessor-based ranking with each automatic-metric ranking.
# The left join keeps exactly the runs present in ranking_nist_df; runs with
# no automatic-metric row get NaN and are dropped pairwise by corr().
# NOTE(review): the automatic-metric positions were numbered over every run in
# the metric files and are not re-numbered for the runs kept here, so this is
# a Pearson correlation of positions, not a Spearman correlation over the
# joined subset -- confirm that is intended.
ranking_nist_df.join(other=auto_sum_rank_df, how="left").corr(method="pearson")

Unnamed: 0,rank.assessor,rank.nist.rouge,rank.wiki.rouge,rank.nist.bert,rank.wiki.bert
rank.assessor,1.0,0.695783,0.465999,0.834007,0.592121
rank.nist.rouge,0.695783,1.0,0.61509,0.771227,0.769862
rank.wiki.rouge,0.465999,0.61509,1.0,0.594945,0.368347
rank.nist.bert,0.834007,0.771227,0.594945,1.0,0.708968
rank.wiki.bert,0.592121,0.769862,0.368347,0.708968,1.0
