In [1]:
import glob
import os

import numpy as np
import pandas as pd

# Submission Format

We will use the following to convert csv files to the json format used for the evaluation.

We first transform the test set (or the validation set, as long as the test solutions have not been released).

For the evaluation soft labels, we use the mean of the three annotators' labels, so the possible values are 0, 1/3, 2/3 and 1.

In [2]:
def soft_dict_t1(stereo):
    """Build the Task 1 soft-label dictionary from a stereotype probability.

    Parameters
    ----------
    stereo : float
        Soft score for the "Stereotype" class.

    Returns
    -------
    dict
        ``{"Stereotype": stereo, "NoStereotype": 1 - stereo}``.

    Notes
    -----
    The negative key was previously misspelled ``"NoStreotype"``, which did not
    match the ``"NoStereotype"`` label used for the hard labels.  Gold and
    prediction files must be regenerated together so their keys agree.
    """
    return {"Stereotype": stereo, "NoStereotype": 1 - stereo}


def soft_dict_t2(row, stereo="stereotype", imp="implicit"):
    """Build the Task 2 soft-label dictionary for one row.

    The stereotype score is split between the two sub-classes according to the
    implicitness score; the remainder goes to the negative class.

    Parameters
    ----------
    row : mapping
        Anything indexable by column name (e.g. a DataFrame row).
    stereo : str
        Name of the field holding the stereotype soft score.
    imp : str
        Name of the field holding the implicitness soft score.

    Returns
    -------
    dict
        ``{"Implicit": s * i, "Explicit": s * (1 - i), "NoStereotype": 1 - s}``
        where ``s = row[stereo]`` and ``i = row[imp]``.

    Notes
    -----
    The negative key was previously misspelled ``"NoStreotype"``, which did not
    match the ``"NoStereotype"`` label used for the hard labels.  Gold and
    prediction files must be regenerated together so their keys agree.
    """
    stereo_score = row[stereo]
    implicit_share = row[imp]
    return {
        "Implicit": stereo_score * implicit_share,
        "Explicit": stereo_score * (1 - implicit_share),
        "NoStereotype": 1 - stereo_score,
    }

In [3]:
def test_to_json(test, name="test"):
    """Write the four gold-standard JSON files (T1/T2, hard/soft) for one data split.

    Parameters
    ----------
    test : pandas.DataFrame
        Needs the columns ``id``, ``stereotype``, ``implicit`` and
        ``stereotype_a1`` .. ``stereotype_a3``.  For the T2 soft file it also
        needs ``stereotype_soft`` / ``implicit_soft``; when one of them is
        absent it is derived as the mean of the matching ``*_a1`` .. ``*_a3``
        columns.  The frame passed in is left untouched.
    name : str
        Prefix of the output files, which are written to
        ``data/{name}_t1_hard.json``, ``data/{name}_t2_hard.json``,
        ``data/{name}_t1_soft.json`` and ``data/{name}_t2_soft.json``.
    """
    annotators = ("a1", "a2", "a3")
    out_columns = ["test_case", "id", "value"]

    # Work on a copy so the caller's frame does not silently gain a "test_case" column.
    base = test.copy()
    base["test_case"] = "DETESTS-Dis"

    # T1 HARD
    df = base.copy()
    df["value"] = np.where(df["stereotype"] == 1, "Stereotype", "NoStereotype")
    df[out_columns].to_json(f"data/{name}_t1_hard.json", orient="records", indent=4)

    # T2 HARD -- conditions are evaluated in order, so "implicit" wins over "stereotype".
    df = base.copy()
    df["value"] = np.select(
        [df["implicit"] == 1, df["stereotype"] == 1], ["Implicit", "Explicit"], default="NoStereotype"
    )
    df[out_columns].to_json(f"data/{name}_t2_hard.json", orient="records", indent=4)

    # T1 SOFT -- always recomputed from the annotator columns.
    df = base.copy()
    df["stereotype_soft"] = df[[f"stereotype_{a}" for a in annotators]].mean(axis=1)
    df["value"] = df["stereotype_soft"].apply(soft_dict_t1)
    df[out_columns].to_json(f"data/{name}_t1_soft.json", orient="records", indent=4)

    # T2 SOFT -- use the caller's soft columns when present, otherwise derive them
    # here instead of failing with a KeyError.
    df = base.copy()
    for label in ("stereotype", "implicit"):
        soft_col = f"{label}_soft"
        if soft_col not in df.columns:
            df[soft_col] = df[[f"{label}_{a}" for a in annotators]].mean(axis=1)
    df["value"] = df.apply(soft_dict_t2, args=("stereotype_soft", "implicit_soft"), axis=1)
    df[out_columns].to_json(f"data/{name}_t2_soft.json", orient="records", indent=4)

We create a validation partition

In [4]:
# Hold out 20% of the labelled data as a validation split; the fixed seed keeps
# the partition identical across runs.
df = pd.read_csv("data/train.csv")
train = df.sample(frac=0.8, random_state=42)
validation = df.drop(index=train.index)

for part, out_path in ((train, "data/train_val.csv"), (validation, "data/validation.csv")):
    part.to_csv(out_path, index=False)

In [5]:
# Reload the validation split, attach the per-row mean of the three annotator
# columns as soft labels, and export the gold JSON files.
val = pd.read_csv("data/validation.csv").assign(
    stereotype_soft=lambda d: d[["stereotype_a1", "stereotype_a2", "stereotype_a3"]].mean(axis=1),
    implicit_soft=lambda d: d[["implicit_a1", "implicit_a2", "implicit_a3"]].mean(axis=1),
)

test_to_json(val, "validation")

In [6]:
## When the test solutions are available
# test = pd.read_csv("data/test_solutions.csv")
# test["stereotype_soft"] = test[["stereotype_a1", "stereotype_a2", "stereotype_a3"]].mean(axis=1)
# test["implicit_soft"] = test[["implicit_a1", "implicit_a2", "implicit_a3"]].mean(axis=1)

# test_to_json(test, "test")

The baselines may be converted to json as follows.

You can use the same functions for your results.

In [7]:
def json_t1_hard(file):
    """Convert a Task 1 hard-label prediction CSV to the evaluation JSON format.

    Parameters
    ----------
    file : str
        Path of a CSV with at least the columns ``id`` and ``stereotype``.
        Rows with ``stereotype == 1`` become ``"Stereotype"``, all others
        ``"NoStereotype"``.

    The JSON is written next to the input, with the file extension replaced by
    ``.json``.
    """
    df = pd.read_csv(file)
    df["test_case"] = "DETESTS-Dis"
    df["value"] = np.where(df["stereotype"] == 1, "Stereotype", "NoStereotype")
    # splitext rather than file[:-4]: identical for ".csv", but does not mangle
    # names whose extension is missing or not exactly four characters long.
    out_path = os.path.splitext(file)[0] + ".json"
    df[["test_case", "id", "value"]].to_json(out_path, orient="records", indent=4)


def json_t2_hard(file):
    """Convert a Task 2 hard-label prediction CSV to the evaluation JSON format.

    Parameters
    ----------
    file : str
        Path of a CSV with at least the columns ``id``, ``stereotype`` and
        ``implicit``.  Rows with ``implicit == 1`` become ``"Implicit"``;
        otherwise rows with ``stereotype == 1`` become ``"Explicit"``; all
        remaining rows become ``"NoStereotype"``.

    The JSON is written next to the input, with the file extension replaced by
    ``.json``.
    """
    df = pd.read_csv(file)
    df["test_case"] = "DETESTS-Dis"
    # Conditions are checked in order, so "implicit" takes precedence.
    df["value"] = np.select(
        [df["implicit"] == 1, df["stereotype"] == 1], ["Implicit", "Explicit"], default="NoStereotype"
    )
    # splitext rather than file[:-4]: identical for ".csv", but does not mangle
    # names whose extension is missing or not exactly four characters long.
    out_path = os.path.splitext(file)[0] + ".json"
    df[["test_case", "id", "value"]].to_json(out_path, orient="records", indent=4)


def json_t1_soft(file):
    """Convert a Task 1 soft-label prediction CSV to the evaluation JSON format.

    Parameters
    ----------
    file : str
        Path of a CSV with at least the columns ``id`` and ``stereotype``,
        where ``stereotype`` holds the soft score passed to ``soft_dict_t1``.

    The JSON is written next to the input, with the file extension replaced by
    ``.json``.
    """
    df = pd.read_csv(file)
    df["test_case"] = "DETESTS-Dis"
    df["value"] = df["stereotype"].apply(soft_dict_t1)
    # splitext rather than file[:-4]: identical for ".csv", but does not mangle
    # names whose extension is missing or not exactly four characters long.
    out_path = os.path.splitext(file)[0] + ".json"
    df[["test_case", "id", "value"]].to_json(out_path, orient="records", indent=4)


def json_t2_soft(file):
    """Convert a Task 2 soft-label prediction CSV to the evaluation JSON format.

    Parameters
    ----------
    file : str
        Path of a CSV with at least the columns ``id``, ``stereotype`` and
        ``implicit``; each row is passed to ``soft_dict_t2`` with its default
        column names.

    The JSON is written next to the input, with the file extension replaced by
    ``.json``.
    """
    df = pd.read_csv(file)
    df["test_case"] = "DETESTS-Dis"
    df["value"] = df.apply(soft_dict_t2, axis=1)
    # splitext rather than file[:-4]: identical for ".csv", but does not mangle
    # names whose extension is missing or not exactly four characters long.
    out_path = os.path.splitext(file)[0] + ".json"
    df[["test_case", "id", "value"]].to_json(out_path, orient="records", indent=4)

In [8]:
# Convert every baseline prediction CSV to JSON: validation files first, then
# test files, picking the converter from the task suffix in the file name.
converters = {
    "t1_hard": json_t1_hard,
    "t2_hard": json_t2_hard,
    "t1_soft": json_t1_soft,
    "t2_soft": json_t2_soft,
}

for split in ("validation", "test"):
    for suffix, convert in converters.items():
        for file in glob.glob(f"baselines/{split}/*_{suffix}.csv"):
            convert(file)

# Evaluation

In [9]:
from evaluation import evaluate, main

Here we provide an example of the metrics for the 4 tasks

In [10]:
# Run the example entry point imported from the local `evaluation` module;
# the log output below shows one metrics line per task.
main()

2024-04-17 17:45:10,551 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['FMeasure', 'Precision', 'Recall']
2024-04-17 17:45:10,560 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method
2024-04-17 17:45:10,562 - pyevall.metrics.metrics - INFO -             evaluate() - Executing precision evaluation method
2024-04-17 17:45:10,562 - pyevall.metrics.metrics - INFO -             evaluate() - Executing recall evaluation method
Task 1 Hard Labels {'F1': 0.8}
2024-04-17 17:45:10,563 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['CrossEntropy']
2024-04-17 17:45:10,567 - pyevall.metrics.metrics - INFO -             evaluate() - Executing Cross Entropy evaluation method
Task 1 Soft Labels {'Cross Entropy': 1.5808285223439784}
2024-04-17 17:45:10,568 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICMSoft', 'ICMSoftNorm']
2024-04-

You may try them with the baselines or your own models

In [11]:
# Score the tfidf_svc baseline's Task 1 hard-label predictions against the
# validation gold file; the returned metrics are displayed as the cell output.
pred = "baselines/validation/tfidf_svc_t1_hard.json"
gold = "data/validation_t1_hard.json"
evaluate(pred, gold)

2024-04-17 17:45:11,903 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['FMeasure', 'Precision', 'Recall']
2024-04-17 17:45:11,970 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method
2024-04-17 17:45:12,220 - pyevall.metrics.metrics - INFO -             evaluate() - Executing precision evaluation method
2024-04-17 17:45:12,221 - pyevall.metrics.metrics - INFO -             evaluate() - Executing recall evaluation method


{'F1': 0.27698185291308497}

In [12]:
# Score the tfidf_svc baseline's Task 2 hard-label predictions against the
# validation gold file; the returned metrics are displayed as the cell output.
pred = "baselines/validation/tfidf_svc_t2_hard.json"
gold = "data/validation_t2_hard.json"
evaluate(pred, gold)

2024-04-17 17:45:13,255 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM', 'ICMNorm']
2024-04-17 17:45:13,322 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2024-04-17 17:45:13,564 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Normalized evaluation method
2024-04-17 17:45:13,564 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2024-04-17 17:45:13,807 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method


{'ICM': 0.04972717204505462, 'ICM Norm': 0.5223741437225744}