In [3]:
from pathlib import Path

import pandas as pd
import numpy as np

from tasks import preprocessing_util


# Raw UMOD download. The loading step below reads it with sep="\t", i.e. the
# file is parsed as tab-separated text despite its .csv extension.
INPUT_PATH = Path("../downloads/umod/umod.csv")
# Presumably the destination for the processed dataset.
# NOTE(review): not referenced by any visible cell — confirm the export step exists.
OUTPUT_PATH = Path("../datasets/umod.csv")


def combine_comments(df):
    """
    Stack the ``preceding_comment`` and ``reply`` columns into one ``text`` column.

    Every input row yields two output rows: one whose ``text`` comes from
    ``preceding_comment`` and one whose ``text`` comes from ``reply``. All
    other columns are repeated on both rows, and a ``source`` column records
    which of the two original columns the text was taken from.

    Args:
        df (pd.DataFrame): Frame containing ``preceding_comment`` and ``reply``.

    Returns:
        pd.DataFrame: All preceding-comment rows followed by all reply rows,
        with a fresh 0..n-1 index.
    """
    # (column that becomes `text`, sibling column that is discarded)
    column_pairs = (
        ("preceding_comment", "reply"),
        ("reply", "preceding_comment"),
    )

    stacked_parts = []
    for kept, discarded in column_pairs:
        part = df.drop(columns=[discarded]).rename(columns={kept: "text"})
        # Helper column remembering where each text came from.
        part["source"] = kept
        stacked_parts.append(part)

    return pd.concat(stacked_parts, ignore_index=True)


def aggregate_notes(df, exclude_cols):
    """
    Fold every column not listed in ``exclude_cols`` into one ``notes`` column.

    Each row's ``notes`` value is a dict mapping the folded column names to
    that row's values, with missing entries left out. The folded columns are
    removed from the returned frame.

    Args:
        df (pd.DataFrame): Input DataFrame; it is not modified.
        exclude_cols (collection of str): Names of the columns to keep as
            regular columns instead of folding them into ``notes``.

    Returns:
        pd.DataFrame: A new frame holding the columns named in
        ``exclude_cols`` (those present in ``df``) plus the ``notes`` column.
    """
    notes_cols = [col for col in df.columns if col not in exclude_cols]

    if notes_cols:
        notes = df[notes_cols].apply(
            lambda row: row.dropna().to_dict(), axis=1
        )
    else:
        # Nothing to fold: a row-wise apply over zero columns may not yield a
        # dict per row, so build the empty dicts explicitly.
        notes = pd.Series(
            [{} for _ in range(len(df))], index=df.index, dtype=object
        )

    # Drop first, then assign: if the input already has a non-excluded
    # "notes" column it is folded like any other column instead of causing
    # the freshly built one to be dropped.
    result = df.drop(columns=notes_cols)
    result["notes"] = notes

    return result


# Cut-offs used below to flag a reply as a moderator intervention.
MAX_MODERATION_ENTROPY = 0.75
MIN_MODERATION_SOFTLABEL = 0.75

# The raw file is tab-separated despite its .csv extension.
df = pd.read_csv(INPUT_PATH, sep="\t")
# Stack each (preceding_comment, reply) pair into two rows sharing the same id.
df = combine_comments(df)
# Fold every column that is not needed as a regular column into `notes`.
df = aggregate_notes(
    df,
    exclude_cols=[
        "id",
        "entropy_moderation",
        "text",
        "source",
        "softlabel_raw",
    ],
)
# The preceding comment is turn 0 and its reply is turn 1.
df["speaker_turn"] = np.where(df["source"] == "reply", 1, 0)
# Built with vectorised string concatenation instead of a row-wise f-string:
# reusing double quotes inside a double-quoted f-string is a SyntaxError
# before Python 3.12.
df["message_id"] = (
    "umod-" + df["id"].astype(str) + "-" + df["speaker_turn"].astype(str)
)
# A row counts as a moderator turn when it is a reply, its annotation entropy
# is at most MAX_MODERATION_ENTROPY and its raw soft label is at least
# MIN_MODERATION_SOFTLABEL.
# NOTE(review): the previous comment cited 70% / 50% cut-offs, which did not
# match the 0.75 values in the code; the code values are kept — confirm the
# intended thresholds.
df["is_moderator"] = (
    (df["source"] == "reply")
    & (df["entropy_moderation"] <= MAX_MODERATION_ENTROPY)
    & (df["softlabel_raw"] >= MIN_MODERATION_SOFTLABEL)
)
# all users are unique
df["user"] = "user-" + df["message_id"]

df["dataset"] = "umod"
df["reply_to"] = preprocessing_util.assign_reply_to(
    df,
    conv_id_col="id",
    message_id_col="message_id",
    order_col="speaker_turn",
)

df = df.rename(columns={"id": "conv_id"})
df






Unnamed: 0,conv_id,text,entropy_moderation,softlabel_raw,source,notes,speaker_turn,message_id,is_moderator,user,dataset,reply_to
0,cci0pa6,It should be noted that the majority of Americ...,0.503258,0.888889,preceding_comment,"{'softlabel_mace': 0.9784273339949084, 'subjec...",0,umod-cci0pa6-0,False,user-umod-cci0pa6-0,umod,
1,cebi7l2,"As someone already stated below, there is a ma...",0.991076,0.555556,preceding_comment,"{'softlabel_mace': 0.0151780491202524, 'subjec...",0,umod-cebi7l2-0,False,user-umod-cebi7l2-0,umod,
2,cag71xz,I believe he spends too much time playing batt...,0.764205,0.777778,preceding_comment,"{'softlabel_mace': 0.6933489081337547, 'subjec...",0,umod-cag71xz-0,False,user-umod-cag71xz-0,umod,
3,cf2ubvz,This claim is highly disputed. It should stop ...,0.863121,0.714286,preceding_comment,"{'softlabel_mace': 0.2568197780049782, 'subjec...",0,umod-cf2ubvz-0,False,user-umod-cf2ubvz-0,umod,
4,ch0psid,I assume that's what OP meant by combating oth...,0.991076,0.555556,preceding_comment,"{'softlabel_mace': 0.1161456499481049, 'subjec...",0,umod-ch0psid-0,False,user-umod-ch0psid-0,umod,
...,...,...,...,...,...,...,...,...,...,...,...,...
1995,cfpl4ns,People care about what their peers care about....,0.468996,0.900000,reply,"{'softlabel_mace': 0.9999959217141802, 'subjec...",1,umod-cfpl4ns-1,True,user-umod-cfpl4ns-1,umod,umod-cfpl4ns-0
1996,cgtad17,That's your own personal prejudices. I bet a r...,0.881291,0.700000,reply,"{'softlabel_mace': 0.9999755756429916, 'subjec...",1,umod-cgtad17-1,False,user-umod-cgtad17-1,umod,umod-cgtad17-0
1997,ccbjoho,But with the wealth distribution he is talking...,0.721928,0.800000,reply,"{'softlabel_mace': 0.9999957419159248, 'subjec...",1,umod-ccbjoho-1,True,user-umod-ccbjoho-1,umod,umod-ccbjoho-0
1998,cjx4lnd,"&>>Assuming that Pat is not a child, ignorance...",0.881291,0.700000,reply,"{'softlabel_mace': 0.999958658380532, 'subject...",1,umod-cjx4lnd-1,False,user-umod-cjx4lnd-1,umod,umod-cjx4lnd-0


In [2]:
def assign_reply_to(
    df: pd.DataFrame, conv_id_col: str, message_id_col: str, order_col: str
) -> pd.Series:
    """
    For each message, return the id of the message that precedes it.

    Rows are ordered by ``order_col`` within each ``conv_id_col`` group and
    every row receives the ``message_id_col`` value of the row just before it
    in that ordering. The first row of each conversation gets a missing value.

    Args:
        df: Frame with one row per message; it is not modified.
        conv_id_col: Column identifying the conversation.
        message_id_col: Column holding the message identifier.
        order_col: Column giving the order of messages within a conversation.

    Returns:
        A Series carrying ``df``'s index, in ``df``'s row order.
    """
    # Work on a positional 0..n-1 index so the result can be put back in the
    # original row order even when df.index has duplicate labels (label-based
    # reindexing raises in that case).
    positional = df.reset_index(drop=True)
    ordered = positional.sort_values([conv_id_col, order_col])
    # shift comment id by 1
    reply_to = ordered.groupby(conv_id_col)[message_id_col].shift(1)
    # The result follows the sorted order; restore the original row order and
    # hand back the caller's index labels.
    reply_to = reply_to.sort_index()
    reply_to.index = df.index
    return reply_to

# Turn numbers must be integers so they sort numerically within a conversation.
df["SpeakerTurn"] = df["SpeakerTurn"].astype(int)
df["reply_to"] = assign_reply_to(
    df,
    conv_id_col="conv_id",
    message_id_col="message_id",
    order_col="SpeakerTurn",
)
# Show only the columns relevant to the reply chain.
df[["conv_id", "message_id", "SpeakerTurn", "reply_to"]]

Unnamed: 0,conv_id,message_id,SpeakerTurn,reply_to
0,2257,fora-2257-1,1,
1,2257,fora-2257-2,2,fora-2257-1
2,2257,fora-2257-3,3,fora-2257-2
3,2257,fora-2257-4,4,fora-2257-3
4,2257,fora-2257-5,5,fora-2257-4
...,...,...,...,...
39906,2278,fora-2278-317,317,fora-2278-316
39907,2278,fora-2278-318,318,fora-2278-317
39908,2278,fora-2278-319,319,fora-2278-318
39909,2278,fora-2278-320,320,fora-2278-319
