In [1]:
# Step up one directory so that the relative data paths and the `tasks` import
# used by the following cells resolve against the parent folder.
# NOTE(review): not idempotent - re-running this cell climbs one more level, so
# run it exactly once per kernel session.
%cd ..

/home/dimits/Documents/research/llm_moderation/experiments


In [2]:
import pandas as pd
from tasks import preprocessing


# Input locations, given relative to the current working directory.
# Folder handed to preprocessing.import_conversations further down.
CONVERSATION_DIR = "data/generated_discussions_output"
# Folder handed to preprocessing.import_annotations further down.
ANNOTATION_DIR = "data/annotations_output" 

In [3]:
import re


# Closed vocabularies recognised in persona prompts, spelled the way they should
# appear in the extracted output.
_SEXUAL_ORIENTATIONS = ("Heterosexual", "Bisexual", "Homosexual")
_ETHNICITIES = (
    "African American",
    "Hispanic",
    "White",
    "Asian",
    "Native American",
    "Pacific Islander",
    "Other",
)
_EDUCATION_LEVELS = ("High School", "Masters", "PhD")


def _find_choice(text, choices):
    """Return the canonical spelling of the leftmost entry of ``choices`` in ``text``.

    Matching is case-insensitive and anchored on word boundaries, so "Other" is
    not picked up inside "mother". Returns None when no entry is present.
    """
    pattern = r"\b(" + "|".join(re.escape(choice) for choice in choices) + r")\b"
    found = re.search(pattern, text, re.IGNORECASE)
    if found is None:
        return None
    # Map back to the canonical spelling instead of str.capitalize(), which
    # would turn "African American" into "African american" and "PhD" into "Phd".
    canonical = {choice.casefold(): choice for choice in choices}
    return canonical.get(found.group(1).casefold(), found.group(1))


def extract_socio_demographics(text):
    """Parse a free-text persona prompt into a dict of socio-demographic fields.

    Args:
        text: Persona description, e.g. "You are 30 years old, Bisexual, ...".
            Non-string values (None, NaN) yield the all-empty result.

    Returns:
        dict with keys, in this order:
            age (int | None): number preceding "years old".
            sexual_orientation (str | None): one of the known orientations.
            ethnicity (str | None): one of the known ethnicities.
            employment (bool | None): False for "unemployed"/"retired", True
                for any other recognised occupation keyword, None if none found.
            occupation (str | None): the occupation keyword as written in text.
            personality_traits (list[str]): lower-cased single-word traits
                listed right before the gender; empty if none found.
            gender (str | None): "Man" or "Woman".
            education (str | None): one of the known education levels.
    """
    demographics = {
        "age": None,
        "sexual_orientation": None,
        "ethnicity": None,
        "employment": None,
        "occupation": None,
        "personality_traits": [],
        "gender": None,
        "education": None,
    }

    # Missing prompts arrive from pandas as None/NaN; re.search would raise on them.
    if not isinstance(text, str):
        return demographics

    # Extract age
    age_match = re.search(r"(\d+)\s*years old", text)
    if age_match:
        demographics["age"] = int(age_match.group(1))

    # Extract sexual orientation and ethnicity from their closed vocabularies
    demographics["sexual_orientation"] = _find_choice(text, _SEXUAL_ORIENTATIONS)
    demographics["ethnicity"] = _find_choice(text, _ETHNICITIES)

    # Determine employment status and occupation
    employment_match = re.search(
        r"(Unemployed|Retired|Freelance|Part-time|Worker|Illustrator|Engineer|Professor)",
        text,
        re.IGNORECASE,
    )
    if employment_match:
        occupation = employment_match.group(1)
        demographics["occupation"] = occupation
        demographics["employment"] = occupation.lower() not in ("unemployed", "retired")

    # Extract personality traits (assumed to be between commas after occupation
    # and before gender). The trailing \b stops "man" from matching "manager".
    traits_pattern = r"(\w+(?:, \w+)*)\s*,\s*(?:man|woman)\b"
    traits_match = None
    if employment_match:
        # Look only past the occupation so earlier comma-separated fields
        # (orientation, ethnicity, occupation) are not reported as traits.
        traits_match = re.search(
            traits_pattern, text[employment_match.end():], re.IGNORECASE
        )
    if traits_match is None:
        # No occupation, or the gender precedes it: fall back to the whole text.
        traits_match = re.search(traits_pattern, text, re.IGNORECASE)
    if traits_match:
        demographics["personality_traits"] = [
            trait.lower() for trait in traits_match.group(1).split(", ")
        ]

    # Extract gender
    gender_match = re.search(r"\b(man|woman)\b", text, re.IGNORECASE)
    if gender_match:
        demographics["gender"] = gender_match.group(1).capitalize()

    # Extract education level
    demographics["education"] = _find_choice(text, _EDUCATION_LEVELS)

    return demographics

In [4]:
# Load the generated discussions and keep only the columns needed later:
# conversation id, participant prompts, speaker and the comment text.
conv_df = preprocessing.import_conversations(CONVERSATION_DIR)[
    ["id", "user_prompts", "user", "message"]
]
conv_df.head()

Unnamed: 0,id,user_prompts,user,message
0,132b1278-8811-4145-bdfa-bddcc3ef56f9,"[You are WiseOwl71 71 years old, Heterosexual,...",DarkShadow333,I'm so sick of people making excuses for terro...
0,132b1278-8811-4145-bdfa-bddcc3ef56f9,"[You are WiseOwl71 71 years old, Heterosexual,...",moderator,"@DarkShadow333, I understand your frustration,..."
0,132b1278-8811-4145-bdfa-bddcc3ef56f9,"[You are WiseOwl71 71 years old, Heterosexual,...",TechGuru88,I'm not sure what kind of world we're living i...
0,132b1278-8811-4145-bdfa-bddcc3ef56f9,"[You are WiseOwl71 71 years old, Heterosexual,...",moderator,
0,132b1278-8811-4145-bdfa-bddcc3ef56f9,"[You are WiseOwl71 71 years old, Heterosexual,...",WiseOwl71,I couldn't agree more with @TechGuru88. It's a...


In [5]:
# Load the annotations. Rows whose toxicity equals the sentinel (-1) passed to
# the importer are discarded in the same step that selects the columns used
# later - presumably the sentinel marks annotations without a usable score
# (confirm in preprocessing.import_annotations).
annot_df = preprocessing.import_annotations(
    ANNOTATION_DIR, round=True, sentinel_value=-1
)
annot_df = annot_df.loc[
    annot_df["toxicity"] != -1,
    ["conv_id", "annotator_prompt", "message", "toxicity"],
]
annot_df

Unnamed: 0,conv_id,annotator_prompt,message,toxicity
0,e1a8533f-690b-4b4f-a8bc-78ad287f8129,"You are 30 years old, Bisexual, African Ameri...","**Comment:** @QueenOfChaos, you're absolutely...",4
0,e1a8533f-690b-4b4f-a8bc-78ad287f8129,"You are 30 years old, Bisexual, African Ameri...","**Comment:** @GoldenYears66, you're absolutel...",4
0,e1a8533f-690b-4b4f-a8bc-78ad287f8129,"You are 30 years old, Bisexual, African Ameri...","**Comment:** @GoldenYears66, I think your arg...",4
0,e1a8533f-690b-4b4f-a8bc-78ad287f8129,"You are 30 years old, Bisexual, African Ameri...","**Comment:** @TechGuru88, I understand your p...",1
0,e1a8533f-690b-4b4f-a8bc-78ad287f8129,"You are 30 years old, Bisexual, African Ameri...",**Comment:** I'm not sure what's more disturb...,4
...,...,...,...,...
0,55a46846-74f8-463a-9dff-f116bb9daa8c,"You are 30 years old, Bisexual, African Ameri...",@DarkShadow333 I couldn't agree more with your...,1
0,55a46846-74f8-463a-9dff-f116bb9daa8c,"You are 30 years old, Bisexual, African Ameri...",@GoldenYears66 I couldn't agree more with your...,1
0,55a46846-74f8-463a-9dff-f116bb9daa8c,"You are 30 years old, Bisexual, African Ameri...",@TechGuru88 You're so naive. You think the wor...,3
0,55a46846-74f8-463a-9dff-f116bb9daa8c,"You are 30 years old, Bisexual, African Ameri...",I'm not sure what kind of world @QueenOfChaos ...,2


In [6]:
# Turn every annotator persona prompt into one column per extracted field.
demographics_df = (
    annot_df["annotator_prompt"]
    .apply(extract_socio_demographics)
    .apply(pd.Series)
)

# Replace the raw prompt column with the parsed fields, prefix the annotator
# attributes, and keep only the columns used by the rest of the analysis.
# pd.concat is used (not .join) because both frames share annot_df's index,
# which the displayed outputs suggest is non-unique.
annot_df = (
    pd.concat(
        [annot_df.drop(columns=["annotator_prompt"]), demographics_df],
        axis=1,
    )
    .rename(
        columns={
            "age": "annotator_age",
            "ethnicity": "annotator_ethnicity",
            "sexual_orientation": "annotator_sexual_orientation",
            "employment": "annotator_employment",
            "gender": "annotator_gender",
            "education": "annotator_education",
        }
    )
    .loc[
        :,
        [
            "conv_id",
            "message",
            "toxicity",
            "annotator_age",
            "annotator_sexual_orientation",
            "annotator_employment",
            "annotator_gender",
        ],
    ]
)
annot_df

Unnamed: 0,conv_id,message,toxicity,annotator_age,annotator_sexual_orientation,annotator_employment,annotator_gender
0,e1a8533f-690b-4b4f-a8bc-78ad287f8129,"**Comment:** @QueenOfChaos, you're absolutely...",4,30,Bisexual,True,Woman
0,e1a8533f-690b-4b4f-a8bc-78ad287f8129,"**Comment:** @GoldenYears66, you're absolutel...",4,30,Bisexual,True,Woman
0,e1a8533f-690b-4b4f-a8bc-78ad287f8129,"**Comment:** @GoldenYears66, I think your arg...",4,30,Bisexual,True,Woman
0,e1a8533f-690b-4b4f-a8bc-78ad287f8129,"**Comment:** @TechGuru88, I understand your p...",1,30,Bisexual,True,Woman
0,e1a8533f-690b-4b4f-a8bc-78ad287f8129,**Comment:** I'm not sure what's more disturb...,4,30,Bisexual,True,Woman
...,...,...,...,...,...,...,...
0,55a46846-74f8-463a-9dff-f116bb9daa8c,@DarkShadow333 I couldn't agree more with your...,1,30,Bisexual,True,Woman
0,55a46846-74f8-463a-9dff-f116bb9daa8c,@GoldenYears66 I couldn't agree more with your...,1,30,Bisexual,True,Woman
0,55a46846-74f8-463a-9dff-f116bb9daa8c,@TechGuru88 You're so naive. You think the wor...,3,30,Bisexual,True,Woman
0,55a46846-74f8-463a-9dff-f116bb9daa8c,I'm not sure what kind of world @QueenOfChaos ...,2,30,Bisexual,True,Woman


In [7]:
# Attach annotations to the comments they rate. The inner join on
# (conversation id, exact message text) yields one row per comment/annotation
# pair; comments without a textually identical annotated message are dropped.
combined_df = pd.merge(
    conv_df,
    annot_df,
    how="inner",
    left_on=["id", "message"],
    right_on=["conv_id", "message"],
)
combined_df

Unnamed: 0,id,user_prompts,user,message,conv_id,toxicity,annotator_age,annotator_sexual_orientation,annotator_employment,annotator_gender
0,132b1278-8811-4145-bdfa-bddcc3ef56f9,"[You are WiseOwl71 71 years old, Heterosexual,...",TechGuru88,I'm not sure what kind of world we're living i...,132b1278-8811-4145-bdfa-bddcc3ef56f9,4,30,Heterosexual,False,Woman
1,132b1278-8811-4145-bdfa-bddcc3ef56f9,"[You are WiseOwl71 71 years old, Heterosexual,...",WiseOwl71,I couldn't agree more with @TechGuru88. It's a...,132b1278-8811-4145-bdfa-bddcc3ef56f9,4,30,Homosexual,True,Man
2,132b1278-8811-4145-bdfa-bddcc3ef56f9,"[You are WiseOwl71 71 years old, Heterosexual,...",WiseOwl71,I couldn't agree more with @TechGuru88. It's a...,132b1278-8811-4145-bdfa-bddcc3ef56f9,4,30,Heterosexual,True,Man
3,132b1278-8811-4145-bdfa-bddcc3ef56f9,"[You are WiseOwl71 71 years old, Heterosexual,...",WiseOwl71,I couldn't agree more with @TechGuru88. It's a...,132b1278-8811-4145-bdfa-bddcc3ef56f9,4,70,Heterosexual,False,Woman
4,132b1278-8811-4145-bdfa-bddcc3ef56f9,"[You are WiseOwl71 71 years old, Heterosexual,...",GoldenYears66,"@WiseOwl71, I couldn't agree more with your se...",132b1278-8811-4145-bdfa-bddcc3ef56f9,4,30,Heterosexual,False,Woman
...,...,...,...,...,...,...,...,...,...,...
2365,71323709-ef1f-46d1-845e-425f38d44a7c,"[You are QueenOfChaos 32 years old, Heterosexu...",WiseOwl71,I'm glad someone finally spoke up against this...,71323709-ef1f-46d1-845e-425f38d44a7c,4,30,Heterosexual,True,Man
2366,71323709-ef1f-46d1-845e-425f38d44a7c,"[You are QueenOfChaos 32 years old, Heterosexu...",doing_my_best24,"I understand your frustration @WiseOwl71, but ...",71323709-ef1f-46d1-845e-425f38d44a7c,4,70,Heterosexual,True,Man
2367,71323709-ef1f-46d1-845e-425f38d44a7c,"[You are QueenOfChaos 32 years old, Heterosexu...",doing_my_best24,"I understand your frustration @WiseOwl71, but ...",71323709-ef1f-46d1-845e-425f38d44a7c,4,70,Heterosexual,False,Woman
2368,71323709-ef1f-46d1-845e-425f38d44a7c,"[You are QueenOfChaos 32 years old, Heterosexu...",doing_my_best24,"I understand your frustration @WiseOwl71, but ...",71323709-ef1f-46d1-845e-425f38d44a7c,4,30,Homosexual,True,Man
