In [1]:
import json
import pandas as pd
import random

from pathlib import Path

In [2]:
# Pin the state of the `random` module so that every random draw made later
# in this notebook is repeatable across fresh kernel runs.
SHUFFLE_SEED = 42
random.seed(SHUFFLE_SEED)

In [3]:
# Root directory of the saved runs; it has one sub-directory per input
# granularity (document / paragraph / sentence).
data_dir = Path("/scratch/li.mil/hierarchical-mds/instruct_baselines/cochrane/mistral-7b-instruct")
doc_dir, para_dir, sent_dir = (
    data_dir / granularity for granularity in ("document", "paragraph", "sentence")
)

# Used purely as a directory name below, hence a string rather than an int.
seed = "658"

# Location of each run's outputs file: <granularity>/<setting>/<seed>/outputs.json.
# Note that the document run uses the "truncate_all" setting while the
# paragraph and sentence runs use the "naive_granular" setting.
truncate_all_doc = doc_dir.joinpath("retrieved_truncate_all_greedy", seed, "outputs.json")
truncate_all_para = para_dir.joinpath("retrieved_naive_granular_greedy", seed, "outputs.json")
truncate_all_sent = sent_dir.joinpath("retrieved_naive_granular_greedy", seed, "outputs.json")

In [4]:
def _read_json(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file is opened in binary mode and closed as soon as parsing
    finishes, rather than leaving the handle open until it happens to be
    garbage-collected.
    """
    with open(path, "rb") as handle:
        return json.load(handle)


# Load the saved outputs of the document, paragraph and sentence runs.
truncate_all_doc_outputs = _read_json(truncate_all_doc)
truncate_all_para_outputs = _read_json(truncate_all_para)
truncate_all_sent_outputs = _read_json(truncate_all_sent)

In [5]:
# Predictions from each of the three runs; the labels are taken from the
# document run and checked against the other two below.
doc_preds = truncate_all_doc_outputs['preds']
para_preds = truncate_all_para_outputs['preds']
sent_preds = truncate_all_sent_outputs['preds']
labels_list = truncate_all_doc_outputs['labels']

# The three runs are only comparable if they were evaluated against the
# same labels in the same order.
assert truncate_all_doc_outputs['labels'] == truncate_all_para_outputs['labels'], \
    "document and paragraph runs have different labels"
assert truncate_all_para_outputs['labels'] == truncate_all_sent_outputs['labels'], \
    "paragraph and sentence runs have different labels"

# These four sequences are zipped together later; zip() stops at the
# shortest one without complaint, so a missing prediction would otherwise
# silently drop examples instead of failing here.
assert len(doc_preds) == len(para_preds) == len(sent_preds) == len(labels_list), (
    "preds/labels length mismatch: "
    f"doc={len(doc_preds)}, para={len(para_preds)}, "
    f"sent={len(sent_preds)}, labels={len(labels_list)}"
)

In [6]:
# For every example, present the doc/para/sent predictions in a random order.
# `inputs` receives the combined text, `input_order` records which system was
# placed in which slot, and `labels` keeps the matching label.
labels = []
inputs = []
input_order = []

for doc, para, sent, label in zip(doc_preds, para_preds, sent_preds, labels_list):
    labels.append(label)
    text_by_system = {"doc": doc, "para": para, "sent": sent}

    # Draw the systems one at a time without replacement. The draw is kept as
    # three successive random.choice calls (over 3, 2, then 1 candidates) so
    # the seeded random stream, and therefore the resulting order, is exactly
    # the same as before.
    remaining = ["doc", "para", "sent"]
    current_order = []
    while remaining:
        picked = random.choice(remaining)
        remaining.remove(picked)
        current_order.append(picked)

    # Slots are numbered from 1 in the order the systems were drawn.
    numbered = [
        f"Output {position}: {text_by_system[system]}"
        for position, system in enumerate(current_order, start=1)
    ]

    input_order.append(current_order)
    inputs.append("\n\n".join(numbered))



In [7]:
# Column data for the items table: one label and one combined-outputs
# string per example.
output_dict = dict(label=labels, output=inputs)

# Column data for the key table: for each example, the list of system names
# in the order they appear in the corresponding "output" string.
key_dict = dict(order=input_order)

In [8]:
# Turn the two column mappings into tables; each dict key becomes a column
# and row i of both frames refers to the same example.
outputs_df = pd.DataFrame(output_dict)
key_df = pd.DataFrame(key_dict)

In [9]:
# Save the items and their ordering key as two separate CSV files. Both are
# written without the index column, so rows line up by position.
output_dir = Path("/home/li.mil/hierarchical-mds/human_eval/csv")

# to_csv does not create missing directories; make sure the target exists so
# a fresh checkout does not fail on the very last cell.
output_dir.mkdir(parents=True, exist_ok=True)

outputs_df.to_csv(output_dir / "preference_ranking.csv", index=False)
key_df.to_csv(output_dir / "preference_ranking_key.csv", index=False)