# Merge Facts from Submitted Runs

For every submitted run with an evaluation priority <= 2 (i.e., the top two most important runs submitted by a team), we take the top `TOP_K=32` facts from that run and add them to a running list of facts generated for a specific event-day pair.

We will use these lists for de-duplication in the next script.

In [None]:
import glob
import gzip
import json
import bert_score
import pandas as pd

In [None]:
from itertools import combinations

In [None]:
from nltk.tokenize import TweetTokenizer

In [None]:
# Load the run-submission table; the cells below read its "team", "priority",
# "filename" and "runtag" columns.
run_data_df = pd.read_csv("submissions.csv")

In [None]:
# Collect the filenames of the runs to merge: for each team, at most two runs
# among those with evaluation priority <= 2.
all_runs_to_include = set()
for team, group in run_data_df.groupby("team"):
    # Print every priority this team submitted, as a quick visual sanity check.
    print(team)
    print("\t", ", ".join(group["priority"].apply(str)))

    # A lower priority number marks a more important run, so sort ascending
    # before truncating. Sorting descending would keep the *least* important
    # runs whenever more than two rows pass the filter.
    eligible = group[group["priority"] <= 2]
    runs_to_include = eligible.sort_values(by="priority", ascending=True).head(2)
    all_runs_to_include.update(runs_to_include["filename"])

In [None]:
# Display the selected run filenames for manual inspection.
all_runs_to_include

In [None]:
# Map each submission filename to its run tag. Built column-wise rather than
# with iterrows(), which constructs a Series per row and yields an index that
# was never used. As before, the last row wins for a duplicated filename.
filename_to_runtag = dict(zip(run_data_df["filename"], run_data_df["runtag"]))

In [None]:
# Directory that receives one "<requestID>.json" file (one JSON record per line).
OUTPUT_DIR = "event-days"
# Number of facts to keep per run, per the notebook description.
# NOTE(review): no code visible in this notebook references TOP_K — confirm
# where (and by which ranking) the cut-off is supposed to be applied.
TOP_K = 32

In [None]:
# Tokenizer used below to count the tokens of each fact's text.
tknzr = TweetTokenizer()

In [None]:
# Merge the facts of every selected run into one JSON-lines file per request ID.
#
# NOTE(review): output files are opened in append mode, so re-running this cell
# without clearing OUTPUT_DIR first will duplicate facts.
# NOTE(review): TOP_K is never applied here, although the notebook description
# says only the top TOP_K facts of each run are kept — confirm the intended
# ranking column before adding a cut-off.
#
# Archives are visited in sorted order so the append order is reproducible.
for submission_file in sorted(glob.glob("*.gz")):
    # .get() so that a stray archive that is absent from submissions.csv is
    # reported and skipped instead of raising KeyError before the skip check.
    print(submission_file, filename_to_runtag.get(submission_file))

    if submission_file not in all_runs_to_include:
        print("\t", "SKIPPING")
        continue

    runtag = filename_to_runtag[submission_file]

    # Each non-blank line of the archive is one JSON-encoded fact.
    with gzip.open(submission_file, "rb") as in_file:
        rows = [json.loads(raw.decode("utf8")) for raw in in_file if raw.strip()]

    if not rows:
        # An empty run yields a frame without a "requestID" column to group on.
        print("\t", "NO FACTS")
        continue

    this_run_df = pd.DataFrame(rows)
    for requestId, group in this_run_df.groupby("requestID"):
        new_group_df = group.sort_values(by="unixTimestamp")

        # Data hygiene to ensure we have non-empty sentences with more than one token
        new_group_df["tokens"] = new_group_df["factText"].apply(lambda s: len(tknzr.tokenize(s)))
        new_group_df = new_group_df[new_group_df["tokens"] > 1]
        new_group_df = new_group_df[new_group_df["factText"].str.len() > 0].copy()

        # Renumber 0..n-1 so the fact IDs are dense within this request and run.
        new_group_df = new_group_df.reset_index(drop=True)
        new_group_df["factID"] = ["%s-%s-%04d" % (requestId, runtag, i) for i in new_group_df.index]
        new_group_df["runtag"] = runtag

        # Append, because several runs contribute facts to the same request file.
        with open("%s/%s.json" % (OUTPUT_DIR, requestId), "a") as out_file:
            for record in new_group_df.to_dict(orient="records"):
                out_file.write("%s\n" % json.dumps(record))
