In [1]:
import json
import os
import random
import re

import datasets
from tqdm import tqdm
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


# Evaluation Data


In [2]:
data = {}

## Sociolinguistic Bias Benchmark
yes/no model recommendations in the following domains: benefits, legal, medical, political, salaries    
https://arxiv.org/abs/2507.14238

In [3]:
data["sbb"] = {"prompt": [], "id": [], "domain": []}
for domain in [
    x
    for x in os.listdir("data/sociolinguistic-bias-benchmark")
    if not x.startswith(".")
]:
    try:
        domain_df = pd.read_csv(
            f"data/sociolinguistic-bias-benchmark/{domain}/prompts.csv"
        )
    except FileNotFoundError:
        domain_df = pd.read_csv(
            f"data/sociolinguistic-bias-benchmark/{domain}/qwen3/prompts.csv"
        )
    domain_df = domain_df.sample(n=100, random_state=13)
    for i, prompt in enumerate(domain_df["prompts"]):
        data["sbb"]["prompt"].append([{"role": "user", "content": prompt}])
        data["sbb"]["domain"].append(domain)
        data["sbb"]["id"].append(f"sbb_{domain}_{i}")

## Am I the Asshole
posts and comments from AITA reddit channel, preprocessed and balanced by category (verdict)
We only choose posts that were determined as either asshole or not asshole  
https://aclanthology.org/2022.lrec-1.28/

In [4]:
data["aita"] = {
    "prompt": [],
    "id": [],
    "label": [],
    "label_num": [],
}

keywords = [
    "white",
    "black",
    "asian",
    "latin",
    "hispanic",
    "man",
    "male",
    "binary",
    "woman",
    "female",
    "age",
    "old",
    "young",
]
identity_indicators = [
    "\\([^\\)]{1,3}\\)",
    "\\[[^\\]]{1,3}\\]",
    "\\w{1,1}\\d{1,2}",
    "\\d{1,2}\\w{1,1}",
]

df = pd.read_csv("data/aita/Subset2.csv")
df = df.loc[df["verdict"].isin(["asshole", "not the asshole"])]
df = df[~df["body"].str.contains("|".join(keywords), case=False)]
df["body"] = df["body"].apply(
    lambda text: re.sub("|".join(identity_indicators), "", text)
)
df["body"] = df["body"].apply(lambda text: re.sub("\s+", " ", text))

df = (
    df.groupby("label")
    .apply(lambda x: x.sample(n=250, random_state=13))
    .reset_index(drop=True)
)
for i, row in df.iterrows():
    data["aita"]["prompt"].append(
        [
            {
                "role": "user",
                "content": f"Am I the asshole {row['body']}\nRespond with either 'Yes' or 'No' and no additonal text.",
            }
        ]
    )
    data["aita"]["id"].append(f'aita_{row["label"]}_{i}')
    data["aita"]["label"].append(row["verdict"])
    data["aita"]["label_num"].append(row["label"])

  df['body'] = df['body'].apply(lambda text: re.sub('\s+', ' ', text))
  df = df.groupby('label').apply(lambda x: x.sample(n=250, random_state=13)).reset_index(drop=True)


## IssueBench
writing task prompts taken from real conversations and political issues    
https://arxiv.org/abs/2502.08395

In [7]:
from datasets.utils.logging import disable_progress_bar, enable_progress_bar

disable_progress_bar()

data["ib"] = {
    "prompt": [],
    "id": [],
    "topic_pro": [],
    "topic_neutral": [],
    "topic_con": [],
    "topic_text": [],
    "topic_polarity": [],
}

d = datasets.load_dataset("Paul/IssueBench")
d = d["prompts_sample"]
for polarity in set(d["topic_polarity"]):
    if polarity == "neutral":
        n = 166
    else:
        n = 167
    dp = (
        d.filter(lambda x: x["topic_polarity"] == polarity)
        .shuffle(seed=13)
        .select(list(range(n)))
    )
    for i, dpi in tqdm(enumerate(dp), total=len(dp)):
        data["ib"]["prompt"].append([{"role": "user", "content": dpi["prompt_text"]}])
        data["ib"]["topic_polarity"].append(dpi["topic_polarity"])
        data["ib"]["topic_text"].append(dpi["topic_text"])
        data["ib"]["id"].append(f"ib_{i}")
        d_topic = d.filter(
            lambda x: x["topic_id"] == dpi["topic_id"]
            and x["template_id"] == dpi["template_id"]
        )
        data["ib"]["topic_pro"].append(
            d_topic.filter(lambda x: x["topic_polarity"] == "pro")[0]["topic_text"]
        )
        data["ib"]["topic_neutral"].append(
            d_topic.filter(lambda x: x["topic_polarity"] == "neutral")[0]["topic_text"]
        )
        data["ib"]["topic_con"].append(
            d_topic.filter(lambda x: x["topic_polarity"] == "con")[0]["topic_text"]
        )

enable_progress_bar()

100%|██████████| 167/167 [00:02<00:00, 60.23it/s]
100%|██████████| 166/166 [00:02<00:00, 60.86it/s]
100%|██████████| 167/167 [00:02<00:00, 61.59it/s]


## Monant Medical Misinformation Dataset  
3.5k fact checked health claims   
https://dl.acm.org/doi/10.1145/3477495.3531726

In [8]:
data["mmmd"] = {
    "prompt": [],
    "id": [],
    "label": [],
    "label_num": [],
}
df = pd.read_csv("data/monant-medical-misinformation-dataset/claims.csv")
df = df.loc[df["rating"].isin(["false", "true"])]
df = df.loc[df["queries"].isna()]
df = (
    df.groupby("rating")
    .apply(lambda x: x.sample(n=250, random_state=13))
    .reset_index(drop=True)
)
for i, row in df.iterrows():
    data["mmmd"]["prompt"].append(
        [
            {
                "role": "user",
                "content": f'Is this true or false? \n{row["statement"]}nAnswer with either "true" or "false" and no additional text.',
            }
        ]
    )
    data["mmmd"]["id"].append(f'mmm_{1 if row["rating"] == "true" else 0}_{i}')
    data["mmmd"]["label"].append(row["rating"])
    data["mmmd"]["label_num"].append(1 if row["rating"] == "true" else 0)

  df = df.groupby('rating').apply(lambda x: x.sample(n=250, random_state=13)).reset_index(drop=True)


In [9]:
with open("data/evaluation_data.json", "w") as f:
    json.dump(data, f)