Let $C = \{c_1, c_2, \dots, c_n\}$ be the comments and $X_i = \{x_i^j : j = 1, 2, \dots, m\}$ the annotations of comment $c_i$, where $x_i^j$ is the annotation of annotator $j$ for comment $c_i$.


Let $X_i^{\rho}$ be the annotations for $c_i$ that belong to a partition $\rho \in \Rho$. Here each $\rho$ is a factor (level) of the feature $\Rho$.

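Then we can define $\operatorname{aposteriori}(c_i) = \max_{\rho \in \Rho} \{\operatorname{ndfu}(X_i) - \operatorname{ndfu}(X_i^{\rho})\}$
and $\operatorname{aposteriori}(C) = \operatorname{Wilcoxon}(\{\operatorname{aposteriori}(c_i)\}_{i=1}^{n},\ 0,\ \text{alternative} = \text{"greater"})$, i.e. a one-sided Wilcoxon test of whether the per-comment values are greater than 0.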

In [1]:
%cd ..

/home/dimits/Documents/research/llm_moderation/experiments


In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from tqdm.auto import tqdm

from tasks import preprocessing
from annotation_agreement import aposteriori


# Input directories, given relative to the current working directory.
# CONVERSATION_DIR is passed to `preprocessing.import_and_format_conversations`
# and ANNOTATION_DIR to `preprocessing.import_and_format_annotations` below.
CONVERSATION_DIR = "data/discussions_output"
ANNOTATION_DIR = "data/annotations_output"

In [3]:
# Load the conversations, prefix the speaker attributes with `user_`, and keep
# only the columns used later on. Ethnicity and education are renamed but not
# selected, so they are dropped here.
conv_df = (
    preprocessing.import_and_format_conversations(CONVERSATION_DIR)
    .rename(
        columns={
            "age": "user_age",
            "ethnicity": "user_ethnicity",
            "is_heterosexual": "user_is_heterosexual",
            "profession": "user_profession",
            "gender": "user_gender",
            "education": "user_education",
            "intent": "user_intent",
        }
    )
    .loc[
        :,
        [
            "id",
            "user",
            "moderator_exists",
            "message",
            "user_age",
            "user_is_heterosexual",
            "user_profession",
            "user_gender",
            "user_intent",
        ],
    ]
)
conv_df

Unnamed: 0,id,user,moderator_exists,message,user_age,user_is_heterosexual,user_profession,user_gender,user_intent
0,132b1278-8811-4145-bdfa-bddcc3ef56f9,DarkShadow333,True,I'm so sick of people making excuses for terro...,29.0,False,employed,Man,Troll
0,132b1278-8811-4145-bdfa-bddcc3ef56f9,moderator,True,"@DarkShadow333, I understand your frustration,...",,True,employed,,
0,132b1278-8811-4145-bdfa-bddcc3ef56f9,TechGuru88,True,I'm not sure what kind of world we're living i...,35.0,True,employed,Man,Helpful
0,132b1278-8811-4145-bdfa-bddcc3ef56f9,moderator,True,,,True,employed,,
0,132b1278-8811-4145-bdfa-bddcc3ef56f9,WiseOwl71,True,I couldn't agree more with @TechGuru88. It's a...,71.0,True,employed,Woman,Helpful
...,...,...,...,...,...,...,...,...,...
0,71323709-ef1f-46d1-845e-425f38d44a7c,moderator,True,"@TechGuru88, I understand your frustration and...",,True,employed,,
0,71323709-ef1f-46d1-845e-425f38d44a7c,WiseOwl71,True,I'm glad someone finally spoke up against this...,71.0,True,employed,Woman,Helpful
0,71323709-ef1f-46d1-845e-425f38d44a7c,moderator,True,,,True,employed,,
0,71323709-ef1f-46d1-845e-425f38d44a7c,doing_my_best24,True,"I understand your frustration @WiseOwl71, but ...",24.0,False,employed,Woman,Neutral


In [4]:
# Load the annotations, prefix the annotator attributes with `annotator_`, and
# keep only the columns used later on. Ethnicity and education are renamed but
# not selected, so they are dropped here.
annot_df = (
    preprocessing.import_and_format_annotations(
        ANNOTATION_DIR, round=True, sentinel_value=-1
    )
    .rename(
        columns={
            "age": "annotator_age",
            "ethnicity": "annotator_ethnicity",
            "is_heterosexual": "annotator_is_heterosexual",
            "profession": "annotator_profession",
            "gender": "annotator_gender",
            "education": "annotator_education",
        }
    )
    .loc[
        :,
        [
            "conv_id",
            "message",
            "toxicity",
            "annotator_age",
            "annotator_is_heterosexual",
            "annotator_profession",
            "annotator_gender",
        ],
    ]
)
annot_df

Unnamed: 0,conv_id,message,toxicity,annotator_age,annotator_is_heterosexual,annotator_profession,annotator_gender
0,89ade850-1148-4a7a-a2ea-4b0fad9fc54d,"**Comment:** ""I'm not going to let you guys si...",4,30,True,unemployed,Woman
0,89ade850-1148-4a7a-a2ea-4b0fad9fc54d,"**Comment:** ""I'm not sure what kind of experi...",4,30,True,unemployed,Woman
0,89ade850-1148-4a7a-a2ea-4b0fad9fc54d,"**Comment:** ""I'm not sure what kind of experi...",3,30,True,unemployed,Woman
0,89ade850-1148-4a7a-a2ea-4b0fad9fc54d,"**Comment:** ""I'm not sure what kind of experi...",4,30,True,unemployed,Woman
0,89ade850-1148-4a7a-a2ea-4b0fad9fc54d,"**Comment:** ""Oh, spare me the sanctimonious n...",4,30,True,unemployed,Woman
...,...,...,...,...,...,...,...
0,55a46846-74f8-463a-9dff-f116bb9daa8c,@DarkShadow333 I couldn't agree more with your...,1,30,False,employed,Woman
0,55a46846-74f8-463a-9dff-f116bb9daa8c,@GoldenYears66 I couldn't agree more with your...,1,30,False,employed,Woman
0,55a46846-74f8-463a-9dff-f116bb9daa8c,@TechGuru88 You're so naive. You think the wor...,3,30,False,employed,Woman
0,55a46846-74f8-463a-9dff-f116bb9daa8c,I'm not sure what kind of world @QueenOfChaos ...,2,30,False,employed,Woman


In [5]:
# Attach every annotation to the message it rates: rows are matched on the
# conversation id and the message text (default inner join). `user_age` is cast
# to the nullable integer dtype because some rows have no age, and the redundant
# `conv_id` key is dropped afterwards.
combined_df = (
    conv_df.merge(
        annot_df,
        left_on=["id", "message"],
        right_on=["conv_id", "message"],
    )
    .astype({"user_age": "Int64"})
    .drop(columns="conv_id")
)
combined_df.head()

Unnamed: 0,id,user,moderator_exists,message,user_age,user_is_heterosexual,user_profession,user_gender,user_intent,toxicity,annotator_age,annotator_is_heterosexual,annotator_profession,annotator_gender
0,132b1278-8811-4145-bdfa-bddcc3ef56f9,TechGuru88,True,I'm not sure what kind of world we're living i...,35,True,employed,Man,Helpful,4,30,True,unemployed,Woman
1,132b1278-8811-4145-bdfa-bddcc3ef56f9,WiseOwl71,True,I couldn't agree more with @TechGuru88. It's a...,71,True,employed,Woman,Helpful,4,30,False,employed,Man
2,132b1278-8811-4145-bdfa-bddcc3ef56f9,WiseOwl71,True,I couldn't agree more with @TechGuru88. It's a...,71,True,employed,Woman,Helpful,4,30,True,employed,Man
3,132b1278-8811-4145-bdfa-bddcc3ef56f9,WiseOwl71,True,I couldn't agree more with @TechGuru88. It's a...,71,True,employed,Woman,Helpful,4,70,True,employed,Woman
4,132b1278-8811-4145-bdfa-bddcc3ef56f9,GoldenYears66,True,"@WiseOwl71, I couldn't agree more with your se...",66,True,employed,Man,Supportive,4,30,True,unemployed,Woman


## Annotator analysis

### Gender

In [6]:
# Summarise the merged frame (dtypes and non-null counts) before analysing it.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3072 entries, 0 to 3071
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   id                         3072 non-null   object
 1   user                       3072 non-null   object
 2   moderator_exists           3072 non-null   bool  
 3   message                    3072 non-null   object
 4   user_age                   3059 non-null   Int64 
 5   user_is_heterosexual       3072 non-null   bool  
 6   user_profession            3072 non-null   object
 7   user_gender                3059 non-null   object
 8   user_intent                2364 non-null   object
 9   toxicity                   3072 non-null   int64 
 10  annotator_age              3072 non-null   int64 
 11  annotator_is_heterosexual  3072 non-null   bool  
 12  annotator_profession       3072 non-null   object
 13  annotator_gender           3072 non-null   object
dtypes: Int64

In [7]:
def filter_discussions_with_missing_annotations(
    df: pd.DataFrame, annotation_col: str
) -> pd.DataFrame:
    """
    Keep only the rows whose annotation list is as long as the longest one.

    Rows with fewer annotations than the maximum found in `annotation_col`
    are treated as incomplete and dropped. The original index labels of the
    surviving rows are preserved.

    Args:
        df (pd.DataFrame): The input DataFrame.
        annotation_col (str): Name of the column holding, per row, a sized
            collection (e.g. a list) of annotation values.

    Returns:
        pd.DataFrame: The subset of `df` whose annotation collections have
        the maximum length.
    """
    annotation_counts = df[annotation_col].map(len)
    has_full_set = annotation_counts == annotation_counts.max()
    return df[has_full_set]

In [8]:
from typing import Hashable


def calculate_aposteriori_unimodality_for_dataframe(
    df: pd.DataFrame,
    annotation_col: str,
    grouping_cols: list[str],
    id_col: str
) -> dict[str, dict[Hashable, float]]:
    """
    Calculates the aposteriori unimodality for a list of grouping columns in a DataFrame.

    All column names are validated before any work is done, so a wrong name
    always surfaces as a ValueError and never as a pandas KeyError or after
    part of the computation has already run.

    Args:
        df (pd.DataFrame): The input DataFrame containing annotations and grouping information.
        annotation_col (str): The name of the column containing annotation values as a list or array.
        grouping_cols (list[str]): A list of column names to be used for grouping annotators.
        id_col (str): The name of the column containing unique identifiers for comments.

    Returns:
        dict[str, dict[Hashable, float]]: A dictionary where each key is a grouping column,
        and the value is whatever `aposteriori.aposteriori_unimodality` returns for it
        (documented here as a dictionary mapping group levels to p-values).

    Raises:
        ValueError: If `annotation_col`, `id_col` or any entry of `grouping_cols`
            is not a column of `df`.
    """
    # Validate inputs first: the filtering step below indexes `annotation_col`
    # and would otherwise fail with a KeyError before these checks could run.
    if annotation_col not in df.columns:
        raise ValueError(f"Annotation column '{annotation_col}' not found in the DataFrame.")
    if id_col not in df.columns:
        raise ValueError(f"ID column '{id_col}' not found in the DataFrame.")
    for grouping_col in grouping_cols:
        if grouping_col not in df.columns:
            raise ValueError(f"Grouping column '{grouping_col}' not found in the DataFrame.")

    # Drop rows with fewer annotations than the maximum, so every remaining
    # annotation list has the same length.
    df = filter_discussions_with_missing_annotations(df, annotation_col)

    # The split by id does not depend on the grouping column, so do it once.
    per_id_frames = [group for _, group in df.groupby(id_col)]

    results: dict[str, dict[Hashable, float]] = {}
    for grouping_col in grouping_cols:
        # Fresh arrays are built for every grouping column so that the helper
        # never receives objects already handed to a previous call.
        # NOTE(review): when an id has exactly one row holding a list, each
        # array below has shape (1, k) rather than (k,) -- confirm this is the
        # layout `aposteriori_unimodality` expects.
        annotations = [
            np.array(group[annotation_col].tolist()) for group in per_id_frames
        ]
        annotator_groups = [
            np.array(group[grouping_col].tolist()) for group in per_id_frames
        ]

        # Calculate aposteriori unimodality for the grouping column
        results[grouping_col] = aposteriori.aposteriori_unimodality(
            annotations, annotator_groups
        )

    return results

In [9]:
# Collapse the per-annotation rows into one row per `id`, gathering the toxicity
# scores and the annotator attributes into parallel lists.
# NOTE(review): `id` is the key that was matched against `conv_id` in the merge,
# so each list pools the annotations of every message sharing that id -- confirm
# this is the intended unit, since the method text defines the statistic per comment.
df_grouped = combined_df.groupby("id", as_index=False).agg(
    annotation=("toxicity", list),
    **{
        attribute: (attribute, list)
        for attribute in (
            "annotator_gender",
            "annotator_age",
            "annotator_is_heterosexual",
            "annotator_profession",
        )
    },
)
df_grouped

Unnamed: 0,id,annotation,annotator_gender,annotator_age,annotator_is_heterosexual,annotator_profession
0,02013080-38bb-47b9-b54f-43f876ca1f8c,"[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, ...","[Woman, Woman, Man, Woman, Woman, Man, Man, Wo...","[30, 70, 30, 30, 30, 70, 30, 30, 30, 70, 30, 3...","[False, True, False, False, True, True, False,...","[employed, employed, employed, employed, unemp..."
1,0270a843-05ee-4768-87b7-1ff298a66c0a,"[4, 3, 1, 1, 3, 1, 2, 3, 5, 5, 5, 4, 4, 4, 4, ...","[Woman, Woman, Woman, Man, Woman, Woman, Man, ...","[30, 30, 70, 30, 30, 70, 30, 30, 70, 30, 30, 7...","[True, True, True, False, True, True, False, T...","[unemployed, unemployed, employed, employed, u..."
2,02c51a56-8fe1-49bf-8a26-82ac4d6cc1b1,"[1, 1, 1, 4, 4, 1, 4, 1, 4, 3, 4, 3, 4, 4, 4]","[Man, Man, Man, Man, Man, Man, Man, Man, Man, ...","[70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 7...","[True, True, True, True, True, True, True, Tru...","[employed, employed, employed, employed, emplo..."
3,03024d4f-9a1f-4ad5-9384-f90f3f6b77d0,"[1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 4, 4, 2, 2, 3, 3]","[Woman, Man, Woman, Man, Woman, Man, Woman, Ma...","[70, 30, 70, 30, 70, 30, 70, 30, 70, 70, 30, 7...","[True, True, True, True, True, True, True, Tru...","[employed, employed, employed, employed, emplo..."
4,0379c2e9-3fe4-478b-95c7-39bc9e6e9606,"[3, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 2, 4, 3, 2, ...","[Woman, Woman, Man, Man, Woman, Man, Woman, Wo...","[70, 30, 30, 70, 30, 70, 70, 70, 30, 30, 70, 7...","[True, True, False, True, True, True, True, Tr...","[employed, unemployed, employed, employed, une..."
...,...,...,...,...,...,...
83,ef58c92d-c140-4f93-a02d-61f5e31ef62f,"[4, 4, 4, 2, 2, 4, 4, 4, 4, 4, 4, 3, 4, 3, 4, ...","[Man, Woman, Woman, Woman, Woman, Man, Woman, ...","[30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 3...","[False, True, False, True, False, False, True,...","[employed, unemployed, employed, unemployed, e..."
84,f1326d5c-8184-4aa2-ab80-51629d0a4b74,"[2, 3, 3, 3, 3, 2, 4, 4, 3, 4, 4, 1, 4, 3, 1, ...","[Woman, Woman, Man, Woman, Man, Man, Woman, Ma...","[30, 30, 30, 30, 30, 70, 30, 30, 70, 30, 30, 7...","[False, False, True, False, True, True, False,...","[employed, employed, employed, employed, emplo..."
85,f8411209-283e-4106-ba48-1c2f57c04dc2,"[3, 3, 3, 4, 3, 3, 3, 4, 3, 3, 3, 4, 3, 3, 3, ...","[Woman, Man, Man, Man, Woman, Man, Man, Man, W...","[70, 70, 30, 30, 70, 70, 30, 30, 70, 70, 30, 3...","[True, True, True, False, True, True, True, Fa...","[employed, employed, employed, employed, emplo..."
86,fa1362b8-ee85-4d1f-900f-6963f6b5af61,"[4, 1, 1, 1, 1, 1, 1, 1, 1]","[Woman, Woman, Woman, Woman, Woman, Woman, Wom...","[70, 70, 70, 70, 70, 70, 70, 70, 70]","[True, True, True, True, True, True, True, Tru...","[employed, employed, employed, employed, emplo..."


In [10]:
# Run the aposteriori unimodality computation once per annotator attribute.
# The displayed dictionary is keyed by attribute name; each value is the
# per-level mapping returned by the helper for that attribute.
results = calculate_aposteriori_unimodality_for_dataframe(
    df_grouped,
    annotation_col="annotation",
    grouping_cols=[
        "annotator_gender",
        "annotator_age",
        "annotator_is_heterosexual",
        "annotator_profession",
    ],
    id_col="id",
)
results

{'annotator_gender': {'Man': 0.5, 'Woman': 1.0},
 'annotator_age': {30: 1.0, 70: 0.5},
 'annotator_is_heterosexual': {False: 0.5, True: 1.0},
 'annotator_profession': {'employed': 0.5, 'unemployed': 1.0}}