# Preprocess human and AI responses

In [54]:
import polars as pl
import numpy as np
import pandas as pd
import random

In [55]:
# Seed Python's `random` module and NumPy's global RNG with the same value.
SEED = 777
random.seed(SEED)
np.random.seed(SEED)

In [56]:
from sklearn.metrics import accuracy_score, confusion_matrix

**PREPROCESSING RULES**
 - Tasks that any AI worker has not answered are removed from the human data (to prevent leaks).
 - An LLM may give an answer other than those listed in the options; in that case the answer is marked as `OTHER` (stored as label `5`).

In [57]:
# Human responses with ground truth; `label` and `gt` are forced to strings.
human = pl.read_csv(
    "ds1task5/human_data_with_gt.csv",
    schema_overrides={"label": str, "gt": str},
)

In [58]:
# GPT-3.5 AI-worker responses; `label` and `gt` are forced to strings.
g100 = pl.read_csv(
    "ds1task5/ai_workers/gpt3.5/gpt3.5_samples_100_transformed.csv",
    schema_overrides={"label": str, "gt": str},
)

In [59]:
# Flan-T5-XL AI-worker responses; `label` and `gt` are forced to strings.
f100 = pl.read_csv(
    "ds1task5/ai_workers/google_flan-t5-xl/google_flan-t5-xl_ds_1_task_5_sample_100_epochs_3_prompt_max_len_4096_batch_size_4_grad_acc_2_transformed.csv",
    schema_overrides={"label": str, "gt": str},
)

In [60]:
# Llama-2-70b-chat AI-worker responses; `label` and `gt` are forced to strings.
l250 = pl.read_csv(
    "ds1task5/ai_workers/Llama-2-70b-chat-hf/Llama-2-70b-chat-hf___ds_1_task_5_sample_250_epochs_3_prompt_max_len_4096_batch_size_4_grad_acc_2_temp_0_05_transformed.csv",
    schema_overrides={"label": str, "gt": str},
)

In [61]:
# All AI-worker frames. The order matters: the export cell pairs these
# positionally with the output names ["g100", "f100", "l250"].
AIs = [
    g100,
    f100,
    l250,
]

In [62]:
# Keep only the human rows whose task appears in every AI frame.
fhuman = human.clone()
for ai in AIs:
    answered_tasks = ai.unique("task")["task"]
    task_was_answered = pl.col("task").is_in(answered_tasks)
    fhuman = fhuman.filter(task_was_answered)

In [63]:
# Label written wherever an AI worker has no usable answer (the 'other' label).
# It is a string because every `label` column is loaded with dtype str; filling
# a string column with the integer 5 would rely on implicit type coercion.
OTHER_LABEL = "5"

fAIs = []
for ai in AIs:
    # The worker id is taken from the first row; this assumes each AI frame
    # holds a single worker — TODO confirm.
    target_worker = ai["worker"][0]

    # Human tasks for which this AI frame has no row at all.
    # NOTE(review): `fhuman` was already restricted to tasks present in every
    # AI frame, so this is expected to be empty when cells run in order.
    missing_rows = fhuman.join(ai, on="task", how="anti")

    # One placeholder row per missing task, in the column order
    # (task, worker, label, gt); `pl.concat` below needs this to match `ai`.
    rows_to_add = missing_rows.select([
        pl.col("task"),
        pl.lit(target_worker).alias("worker"),
        pl.lit(None, dtype=ai["label"].dtype).alias("label"),
        pl.col("gt"),
    ]).unique("task")

    # Append the placeholders, then replace every null label (placeholder or
    # already null in the input) with the 'other' label.
    nai = pl.concat([ai, rows_to_add]).with_columns(
        pl.col("label").fill_null(pl.lit(OTHER_LABEL))
    )
    fAIs.append(nai)

In [64]:
# for homo scenario
# Output file stems, paired positionally with the frames in `fAIs`.
output_names = ["g100", "f100", "l250"]

# Restrict the human frame to tasks present in every processed AI frame.
f_fhuman = fhuman.clone()
for ai in fAIs:
    ai_tasks = ai.unique("task")["task"]
    f_fhuman = f_fhuman.filter(pl.col("task").is_in(ai_tasks))

# Distinct (gt, task) pairs of the filtered human data.
unique_tasks = f_fhuman.select(["gt", "task"]).unique()
f_fhuman.write_csv("../datasets/human.csv")

# Write each processed AI frame to its own CSV.
for name, f_fai in zip(output_names, fAIs):
    f_fai.write_csv(f"../datasets/{name}.csv")