Let $C = \{c_1, c_2, \ldots, c_n\}$ be the set of comments and $X_i = \{x_i^j : j = 1, 2, \ldots, m\}$ the annotations of comment $c_i$, where $x_i^j$ is the annotation given by annotator $j$ to comment $c_i$.


Let $X_i^{\rho}$ be the annotations of $c_i$ made by the annotators that belong to partition $\rho \in \mathrm{P}$. Here each $\rho$ is one level (factor) of the annotator feature $\mathrm{P}$.

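Then we can define $\operatorname{aposteriori}(c_i) = \max_{\rho} \{\operatorname{ndfu}(X_i) - \operatorname{ndfu}(X_i^{\rho})\}$
and $\operatorname{aposteriori}(C) = \operatorname{Wilcoxon}(\{\operatorname{aposteriori}(c_i)\}_{i=1}^{n},\ 0,\ \text{alternative} = \text{"greater"})$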

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import re

from ndfu import ndfu
import aposteriori


def get_annotations(annot_str: str) -> dict:
    """Extract every numeric ``key=value`` pair from an annotation string.

    The input is coerced to ``str`` and lower-cased, so the keys of the result
    are always lower-case. A value containing a ``.`` is parsed as ``float``,
    any other value as ``int``. If a key occurs more than once, the last
    occurrence wins.

    A token that matches the pattern but is not a valid number (for example
    ``-`` or ``1.2.3``) is skipped on its own, so a single malformed pair does
    not discard the pairs that did parse.

    Args:
        annot_str: Raw annotation text; non-string values are stringified.

    Returns:
        Mapping from lower-cased key to its numeric value. Empty when no
        well-formed pair is found.
    """
    text = str(annot_str).lower()
    parsed = {}
    # Pairs of the form key=value, where value is made of digits, '-' and '.'
    for key, raw_value in re.findall(r"(\w+)=([-\d\.]+)", text):
        try:
            parsed[key] = float(raw_value) if "." in raw_value else int(raw_value)
        except ValueError:
            # The character class is permissive; ignore tokens that are not numbers.
            continue
    return parsed


def get_user_intent(prompt: str) -> str:
    """Map a user prompt to an intent label via case-insensitive keyword search.

    The markers are checked in order and the first one found decides the label:
    ``community`` -> "Community-oriented", ``troll`` -> "Troll",
    ``special_instructions: ,`` -> "Neutral". A prompt containing none of them
    is labelled "Unknown".
    """
    lowered = prompt.lower()

    # Order matters: earlier markers take precedence over later ones.
    markers = (
        ("community", "Community-oriented"),
        ("troll", "Troll"),
        ("special_instructions: ,", "Neutral"),
    )
    for needle, label in markers:
        if needle in lowered:
            return label
    return "Unknown"


def format_dataset(df: pd.DataFrame, min_message_len: int) -> pd.DataFrame:
    """Parse the raw annotations and derive the columns used by the analysis.

    Every input column is first cast to ``str``. The ``annotation`` column is
    then parsed with ``get_annotations`` and each extracted key becomes a
    column of its own. The following columns are added or rewritten:

    * ``message_order``: cast to ``int``.
    * ``is_moderator``: ``True`` where the original text equals ``"True"``.
    * ``intent``: label from ``get_user_intent(user_prompt)``, overridden
      with ``"Moderator"`` for moderator rows.
    * ``polarization``: ``ndfu`` of the ``toxicity`` values sharing the same
      ``(conv_id, message)``.
    * ``not_intervened``: moderator rows whose stripped message is shorter
      than ``min_message_len`` characters.

    Args:
        df: Frame with at least the columns ``annotation``, ``message_order``,
            ``is_moderator``, ``user_prompt``, ``conv_id`` and ``message``.
            The parsed annotations must provide ``toxicity`` and
            ``argumentquality``.
        min_message_len: Length threshold used for ``not_intervened``.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    df = df.astype(str)

    # Expand each parsed annotation dictionary into its own set of columns.
    parsed = df["annotation"].apply(get_annotations)
    annotations_df = pd.json_normalize(parsed.tolist())
    # concat(axis=1) aligns on index labels, so reuse the input index to keep
    # the parsed columns attached to the row they were extracted from.
    annotations_df.index = df.index
    df = pd.concat([df, annotations_df], axis=1)

    # NOTE(review): with `|` a row is dropped only when BOTH scores are -1, and
    # rows whose annotation did not parse (NaN) are kept -- confirm intended.
    has_score = (df["toxicity"] != -1) | (df["argumentquality"] != -1)
    # Copy so the column assignments below never write into a slice of the
    # concatenated frame.
    df = df[has_score].copy()

    df["message_order"] = df["message_order"].astype(int)

    # After the astype(str) above the flag is the text "True"/"False".
    df["is_moderator"] = df["is_moderator"] == "True"
    df["intent"] = df["user_prompt"].apply(get_user_intent).astype(str)
    df["intent"] = np.where(df["is_moderator"], "Moderator", df["intent"]).astype(str)

    df["polarization"] = (
        df.groupby(["conv_id", "message"])["toxicity"].transform(ndfu).astype(float)
    )

    is_short = df["message"].str.strip().str.len() < min_message_len
    df["not_intervened"] = (df["is_moderator"] & is_short).astype(bool)

    return df

In [3]:
# Load dataset.csv and derive the analysis columns. With min_message_len=3,
# moderator messages shorter than three characters (after stripping
# whitespace) are flagged in `not_intervened`.
df = pd.read_csv("dataset.csv")
df = format_dataset(df, min_message_len=3)
df

Unnamed: 0.1,Unnamed: 0,conv_id,timestamp_conv,ctx_length_conv,conv_variant,user,message,model,user_prompt,is_moderator,...,sexual_orientation_annot,demographic_group_annot,current_employment_annot,personality_characteristics_annot,education_level_annot,toxicity,argumentquality,intent,polarization,not_intervened
0,0,b7674ce3-6caf-443a-b076-b6ae35f421a2,25-02-12-14-59,3,vanilla,RebelCoder07,"In other words, he’s not brown enough for ya t...",hardcoded,You are a human participating in an online cha...,False,...,Heterosexual,Middle Eastern,Farmer,"['hardworking', 'practical', 'resilient', 'con...",No formal education,2,3,Neutral,0.833333,False
1,1,b7674ce3-6caf-443a-b076-b6ae35f421a2,25-02-12-14-59,3,vanilla,RebelCoder07,"In other words, he’s not brown enough for ya t...",hardcoded,You are a human participating in an online cha...,False,...,Heterosexual,South Asian,Retired Philosopher,"['wise', 'stoic', 'pragmatic', 'serene']",PhD,2,3,Neutral,0.833333,False
2,2,b7674ce3-6caf-443a-b076-b6ae35f421a2,25-02-12-14-59,3,vanilla,RebelCoder07,"In other words, he’s not brown enough for ya t...",hardcoded,You are a human participating in an online cha...,False,...,Heterosexual,Indigenous,Botanist,"['calm', 'empathetic', 'insightful', 'relaxed']",,3,2,Neutral,0.833333,False
3,3,b7674ce3-6caf-443a-b076-b6ae35f421a2,25-02-12-14-59,3,vanilla,RebelCoder07,"In other words, he’s not brown enough for ya t...",hardcoded,You are a human participating in an online cha...,False,...,Bisexual,Black,Cybersecurity Expert,"['rebellious', 'clever', 'cynical', 'frustrated']",,3,2,Neutral,0.833333,False
4,4,b7674ce3-6caf-443a-b076-b6ae35f421a2,25-02-12-14-59,3,vanilla,RebelCoder07,"In other words, he’s not brown enough for ya t...",hardcoded,You are a human participating in an online cha...,False,...,Asexual,Caucasian,Historian,"['analytical', 'reserved', 'meticulous', 'nost...",PhD,2,3,Neutral,0.833333,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61142,61142,c8c3eda7-92ec-4880-a6cf-4b26bb1f3815,25-02-13-21-12,3,constructive_communications,moderator,"@AdventureSeeker34, thank you for clarifying y...",mistral-nemo-abliterated,You are a human participating in an online cha...,True,...,Heterosexual,Pacific Islander,Travel Blogger,"['spontaneous', 'energetic', 'fearless', 'exci...",Some College,1,3,Moderator,0.000000,False
61143,61143,c8c3eda7-92ec-4880-a6cf-4b26bb1f3815,25-02-13-21-12,3,constructive_communications,moderator,"@AdventureSeeker34, thank you for clarifying y...",mistral-nemo-abliterated,You are a human participating in an online cha...,True,...,Bisexual,Black,Cybersecurity Expert,"['rebellious', 'clever', 'cynical', 'frustrated']",,1,3,Moderator,0.000000,False
61144,61144,c8c3eda7-92ec-4880-a6cf-4b26bb1f3815,25-02-13-21-12,3,constructive_communications,moderator,"@AdventureSeeker34, thank you for clarifying y...",mistral-nemo-abliterated,You are a human participating in an online cha...,True,...,Heterosexual,Latino,Registered Nurse,"['compassionate', 'patient', 'diligent', 'over...",,1,3,Moderator,0.000000,False
61145,61145,c8c3eda7-92ec-4880-a6cf-4b26bb1f3815,25-02-13-21-12,3,constructive_communications,moderator,"@AdventureSeeker34, thank you for clarifying y...",mistral-nemo-abliterated,You are a human participating in an online cha...,True,...,Lesbian,Arab,Poet,"['sensitive', 'imaginative', 'quiet', 'moody']",,1,3,Moderator,0.000000,False


## Annotator analysis

### Gender

In [4]:
# Inspect the column dtypes and non-null counts of the formatted frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61147 entries, 0 to 61146
Data columns (total 39 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Unnamed: 0                         61147 non-null  object 
 1   conv_id                            61147 non-null  object 
 2   timestamp_conv                     61147 non-null  object 
 3   ctx_length_conv                    61147 non-null  object 
 4   conv_variant                       61147 non-null  object 
 5   user                               61147 non-null  object 
 6   message                            61147 non-null  object 
 7   model                              61147 non-null  object 
 8   user_prompt                        61147 non-null  object 
 9   is_moderator                       61147 non-null  bool   
 10  message_id                         61147 non-null  object 
 11  message_order                      61147 non-null  int

In [5]:
def filter_discussions_with_missing_annotations(
    df: pd.DataFrame, annotation_col: str
) -> pd.DataFrame:
    """
    Keeps only the rows whose annotation list is as long as the longest
    annotation list found in the DataFrame; shorter rows are dropped.

    Args:
        df (pd.DataFrame): The input DataFrame.
        annotation_col (str): The name of the column holding sized annotation
            values (anything ``len`` accepts).

    Returns:
        pd.DataFrame: The rows of ``df`` with a maximal-length annotation
        entry, with their original index labels.
    """
    lengths = df[annotation_col].map(len)
    longest = lengths.max()
    is_complete = lengths == longest
    return df[is_complete]

In [6]:
from typing import Hashable


def prepare_annotations_for_aposteriori(
    df: pd.DataFrame, annotation_col: str, id_col: str
) -> list[np.ndarray]:
    """
    Builds one numpy array of annotation values per distinct ``id_col`` value.

    The rows are grouped by ``id_col`` and the ``annotation_col`` entries of
    each group are stacked with ``np.array``. Arrays come out in the key order
    produced by ``DataFrame.groupby``.

    NOTE(review): when the column holds scalars each array is 1D, but when it
    holds equal-length lists each array is 2D (rows in the group x list
    length) -- confirm which shape the consumer expects.

    Args:
        df (pd.DataFrame): The input DataFrame.
        annotation_col (str): The name of the column containing annotation values.
        id_col (str): The name of the column used to group the rows.

    Returns:
        list[np.ndarray]: One array per group.
    """
    arrays = []
    for _, values in df.groupby(id_col)[annotation_col]:
        arrays.append(np.array(values.tolist()))
    return arrays


def calculate_aposteriori_unimodality_for_dataframe(
    df: pd.DataFrame,
    annotation_col: str,
    grouping_cols: list[str],
    id_col: str,
    *,
    sample_ratio: float = 0.5,
    bootstrap_steps: int = 1000,
) -> dict[str, dict[Hashable, float]]:
    """
    Calculates the aposteriori unimodality for a list of grouping columns in a DataFrame.

    All column names are validated before any work is done, so a misspelled
    column is reported as a ``ValueError`` up front instead of surfacing as a
    ``KeyError`` from the filtering step or after earlier grouping columns
    have already been processed.

    Args:
        df (pd.DataFrame): The input DataFrame containing annotations and grouping information.
        annotation_col (str): The name of the column containing annotation values as a list or array.
        grouping_cols (List[str]): A list of column names to be used for grouping annotators.
        id_col (str): The name of the column containing unique identifiers for comments.
        sample_ratio (float): Forwarded to ``aposteriori.aposteriori_unimodality``.
        bootstrap_steps (int): Forwarded to ``aposteriori.aposteriori_unimodality``.

    Returns:
        dict[str, dict[Hashable, float]]: A dictionary where each key is a grouping column,
        and the value is the result returned by ``aposteriori.aposteriori_unimodality``
        for that column (a mapping of group levels to p-values).

    Raises:
        ValueError: If ``annotation_col``, ``id_col`` or any of ``grouping_cols``
            is not a column of ``df``.
    """
    # Validate every requested column before touching the data.
    required = [("Annotation", annotation_col), ("ID", id_col)]
    required.extend(("Grouping", column) for column in grouping_cols)
    for role, column in required:
        if column not in df.columns:
            raise ValueError(f"{role} column '{column}' not found in the DataFrame.")

    # Filter out discussions with missing annotations
    df = filter_discussions_with_missing_annotations(df, annotation_col)

    results = {}
    for grouping_col in grouping_cols:
        # Rebuilt for every column on purpose: the callee is not visible here,
        # so it is not assumed to leave its input arrays untouched.
        annotations = prepare_annotations_for_aposteriori(df, annotation_col, id_col)
        annotator_groups = [
            np.array(values.tolist())
            for _, values in df.groupby(id_col)[grouping_col]
        ]

        results[grouping_col] = aposteriori.aposteriori_unimodality(
            annotations,
            annotator_groups,
            sample_ratio=sample_ratio,
            bootstrap_steps=bootstrap_steps,
        )

    return results

In [7]:
# Collapse the long-format frame to one row per conversation, collecting the
# toxicity scores and the annotators' gender and age into parallel lists.
# NOTE(review): the grouping key is conv_id, so each list spans every message
# of a conversation rather than a single comment -- confirm this is the
# intended unit of analysis.
df_grouped = df.groupby("conv_id").agg(
    annotation=("toxicity", list),
    annotator_gender=("sex_annot", list),
    annotator_age=("age_annot", list)
).reset_index()
df_grouped


Unnamed: 0,conv_id,annotation,annotator_gender,annotator_age
0,028f869d-cf12-444a-aefc-62f8b6294acd,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[male, female, nan, male, male, male, male, fe...","[28, 40, 32, 60, 65, 21, 50, 27, 45, 55, 38, 2..."
1,0594f5fe-3bef-4642-8bb7-9ac356f44acb,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, ...","[female, female, female, male, nan, male, male...","[38, 45, 27, 60, 32, 55, 28, 21, 50, 65, 40, 3..."
2,05d53124-194a-4608-9614-c7075f71a827,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[female, nan, female, male, female, female, ma...","[38, 32, 40, 60, 27, 45, 50, 38, 32, 40, 60, 2..."
3,07bd0dbc-5591-4b1a-ba75-fb7a84c28986,"[2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 1, 1, 1, 1, ...","[male, male, male, male, female, female, femal...","[50, 21, 60, 65, 38, 40, 45, 55, 27, 32, 28, 5..."
4,095f8e2c-4c6b-45d3-983b-c0a836cd1d4e,"[3, 3, 4, 4, 3, 3, 5, 4, 4, 4, 3, 3, 2, 2, 2, ...","[male, female, female, female, male, male, fem...","[21, 45, 27, 38, 28, 55, 40, 65, 32, 60, 50, 2..."
...,...,...,...,...
135,f83f95e7-1200-4dcd-a620-29694858190b,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, ...","[female, female, nan, male, male, male, female...","[40, 38, 32, 55, 28, 50, 27, 60, 45, 65, 21, 4..."
136,fac8059e-390f-421a-bbb8-a4b615da8d35,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, ...","[male, male, male, nan, female, male, male, ma...","[60, 21, 55, 32, 40, 28, 60, 21, 55, 32, 40, 2..."
137,fce13b53-ae50-47b2-b70e-1de66f31a8bf,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[male, male, nan, male, male, female, female, ...","[65, 28, 32, 60, 55, 45, 38, 50, 27, 21, 40, 6..."
138,fdb670f7-dd7e-4222-8f5e-e3c6a8fccff4,"[4, 5, 5, 4, 5, 5, 5, 4, 4, 5, 5, 3, 2, 2, 2, ...","[male, male, male, male, female, female, male,...","[60, 65, 21, 55, 38, 27, 28, 50, 40, 32, 45, 6..."


In [8]:
# Run the aposteriori unimodality test once per annotator attribute (gender
# and age) on the per-conversation lists built above, then display the result.
results = calculate_aposteriori_unimodality_for_dataframe(
    df_grouped, annotation_col="annotation", grouping_cols=["annotator_gender", "annotator_age"], id_col="conv_id"
)
results

{'annotator_gender': {'female': 1.0, 'male': 1.0, 'nan': 1.0},
 'annotator_age': {'21': 1.0,
  '27': 1.0,
  '28': 1.0,
  '32': 1.0,
  '38': 1.0,
  '40': 1.0,
  '45': 1.0,
  '50': 1.0,
  '55': 1.0,
  '60': 1.0,
  '65': 1.0}}