In [1]:
from pathlib import Path

# Resolve the project root. When this code runs from a file, the root is two
# directories above that file; when __file__ is not defined (the NameError
# case the original guarded against), fall back to the parent of the current
# working directory.
if "__file__" in globals():
    BASE_DIR = Path(__file__).resolve().parent.parent
else:
    BASE_DIR = Path.cwd().parent

# All inputs and outputs live under <BASE_DIR>/data.
DATA_DIR = BASE_DIR.joinpath("data")
RAW_TRAIN = DATA_DIR.joinpath("jigsaw_dataset.csv")  # raw Jigsaw CSV read by main()
OUT_FILE = DATA_DIR.joinpath("jigsaw_gbv.csv")       # processed (comment_text, label) CSV

In [None]:
"""
Prepare Jigsaw Unintended Bias dataset for HEARTS replication.

Goal:
- Build a dataset focused on comments about women/girls.
- Create a binary label for gender-based harmful / misogynistic content.
- Save a compact CSV: (comment_text, label).

label = 1: harmful, GBV-related comments directed at women
label = 0: non-harmful / neutral comments about women
"""

import pandas as pd
import re

def _women_related_mask(df, identity_threshold=0.5):
    """Return a boolean Series marking comments that are about women/girls.

    A row is selected when either
      * its ``female`` identity score is >= ``identity_threshold`` (rows with a
        missing score, or a frame without the column, never match on this
        criterion), or
      * ``comment_text`` contains one of the explicit female nouns as a whole
        word (case-insensitive).

    Expects ``df["comment_text"]`` to already be a string column.
    """
    # A) Identity-based filter (Jigsaw identity column).
    if "female" in df.columns:
        mask_identity_strong = df["female"] >= identity_threshold
    else:
        mask_identity_strong = pd.Series(False, index=df.index, dtype=bool)

    # B) Strong noun-based keywords: explicit references to women/girls.
    female_nouns = [
        "woman", "women", "girl", "girls",
        "lady", "ladies",
        "mother", "mom", "mum",
        "wife", "girlfriend",
        "sister", "daughter",
        "female",
    ]

    # Non-capturing group (?:...): a capturing group makes pandas emit
    # "UserWarning: This pattern ... has match groups" from str.contains.
    noun_pattern = r"\b(?:" + "|".join(re.escape(k) for k in female_nouns) + r")\b"
    mask_nouns = df["comment_text"].str.contains(
        noun_pattern, case=False, regex=True, na=False
    )

    return mask_identity_strong | mask_nouns


def _gbv_label(df, toxic_threshold=0.5, harm_threshold=0.3):
    """Return the binary label as an int Series (1 = harmful, 0 = otherwise).

    A row is labelled 1 when ``target`` >= ``toxic_threshold`` AND at least one
    of the harm sub-scores is >= ``harm_threshold``. Harm columns that are not
    present in ``df`` are treated as contributing nothing.

    Raises:
        KeyError: if ``df`` has no ``target`` column.
    """
    if "target" not in df.columns:
        raise KeyError("Expected 'target' column not found in Jigsaw data.")
    toxic = df["target"] >= toxic_threshold

    # Harm components - GBV-like harmful signals.
    harm_cols = [
        "identity_attack",
        "insult",
        "threat",
        "severe_toxicity",
        "sexual_explicit",
    ]

    harm_component = pd.Series(False, index=df.index, dtype=bool)
    for col in harm_cols:
        if col in df.columns:
            harm_component = harm_component | (df[col] >= harm_threshold)

    return (toxic & harm_component).astype(int)


def main(raw_path=None, out_path=None, *,
         identity_threshold=0.5, toxic_threshold=0.5, harm_threshold=0.3):
    """Build the (comment_text, label) dataset of comments about women.

    Steps: load the raw CSV, keep only women-related comments, derive the
    binary harmful label, drop rows with blank text, and write the result.

    Args:
        raw_path: raw Jigsaw CSV to read. Defaults to the module-level
            ``RAW_TRAIN``.
        out_path: destination CSV. Defaults to the module-level ``OUT_FILE``.
            Missing parent directories are created.
        identity_threshold: minimum ``female`` score for the identity filter.
        toxic_threshold: minimum ``target`` score for a comment to be toxic.
        harm_threshold: minimum score on any harm sub-column.

    Raises:
        KeyError: if the raw data lacks a ``target`` or ``comment_text`` column.
    """
    raw_path = Path(RAW_TRAIN if raw_path is None else raw_path)
    out_path = Path(OUT_FILE if out_path is None else out_path)

    print(f"Loading raw data from: {raw_path}")
    df = pd.read_csv(raw_path)
    print("Original shape:", df.shape)

    # Fail fast: check for the label source before the expensive text filter.
    if "target" not in df.columns:
        raise KeyError("Expected 'target' column not found in Jigsaw data.")

    # Ensure comment_text is string
    df["comment_text"] = df["comment_text"].fillna("").astype(str)

    # 1. FILTER: COMMENTS ABOUT WOMEN / GIRLS
    df = df[_women_related_mask(df, identity_threshold)].copy()
    print("After filtering women-related comments:", df.shape)

    # 2. LABEL: HARMFUL / MISOGYNISTIC CONTENT TOWARDS WOMEN
    # 1 = toxic AND has at least one harm component above threshold
    # 0 = still about women, but non-harmful / mild / neutral
    df["label"] = _gbv_label(df, toxic_threshold, harm_threshold)

    # 3. KEEP ONLY NEEDED COLUMNS
    # Missing text was replaced by "" above, so dropna() cannot remove it;
    # drop blank comments explicitly (they would round-trip through CSV as NaN).
    has_text = df["comment_text"].str.strip() != ""
    out = df.loc[has_text, ["comment_text", "label"]]
    print("Final dataset shape (text, label):", out.shape)
    print("Label counts:\n", out["label"].value_counts())

    # 4. SAVE
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out.to_csv(out_path, index=False)
    print(f"\nSaved processed dataset to: {out_path.resolve()}")


# Run the pipeline only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

Loading raw data from: /Users/macbook/Desktop/CLEAN_PROJECT/data/jigsaw-unintended-bias-in-toxicity-classification/train.csv
Original shape: (1804874, 45)


  mask_nouns = df["comment_text"].str.contains(


After filtering women-related comments: (86719, 45)
Final dataset shape (text, label): (86719, 2)
Label counts:
 label
0    76380
1    10339
Name: count, dtype: int64

Saved processed dataset to: /Users/macbook/Desktop/CLEAN_PROJECT/data/jigsaw_gbv.csv


In [54]:

# Locate the processed dataset relative to the project root: two levels above
# this file when __file__ is defined, otherwise the parent of the current
# working directory.
try:
    BASE_DIR = Path(__file__).resolve().parent.parent
except NameError:
    BASE_DIR = Path.cwd().parent

CSV_PATH = BASE_DIR / "data" / "jigsaw_gbv.csv"

print("Loading from:", CSV_PATH)

df = pd.read_csv(CSV_PATH)

# Harmful comments about women
print("\n=== Sample: Harmful comments about women (label=1) ===")
harmful = df[df["label"] == 1]
# Cap the sample size at the population: DataFrame.sample raises ValueError
# when asked for more rows than exist (without replacement).
sample_1 = harmful.sample(min(5, len(harmful)), random_state=42)

# Iterate over the text column directly; no need to build a row per comment.
for text in sample_1["comment_text"]:
    print("----")
    print(text)


Loading from: /Users/macbook/Desktop/CLEAN_PROJECT/data/jigsaw_gbv.csv

=== Sample: Harmful comments about women (label=1) ===
----
He's a goof, just like pretty boy/girl.
----
Being a woman is not female enough for feminists .
----
i bet that this loony tunes woman spends more on hats than the people she represents make in a year!!
----
Only someone wound way too tight and way too far into the trough for Trudeau would have taken my attempt at droll humour seriously.

Of course there is no evidence Junior was complicit in this stupid, drunken escapade.

Myself, I'm sorry the kid didn't get away with material on NDP political strategy and tactics.  Had he done so, maybe the Liberals would become as messed up campaign wise as the Dippers were in 2015.

For the record, Nixon wasn`t complicit in the Watergate break in either.  A guy (Howard Hunt) four levels down with too big a budget, too much idle time and too little to do hatched this stupid  plot.  John Mitchell who reported to Nixon a

In [55]:
# Non-harmful comments about women
print("\n=== Sample: Non-harmful comments about women (label=0) ===")
non_harmful = df[df["label"] == 0]
# Cap the sample size at the population: DataFrame.sample raises ValueError
# when asked for more rows than exist (without replacement).
sample_0 = non_harmful.sample(min(5, len(non_harmful)), random_state=42)

# Iterate over the text column directly; no need to build a row per comment.
for text in sample_0["comment_text"]:
    print("----")
    print(text)


=== Sample: Non-harmful comments about women (label=0) ===
----
AlaskaPI is correct, stalking isn't always sexual in nature. There are people out there that think they are superior, above the law and are out to prove it. I have to shake my head when they suggest the paths the victims have to get something done about it. There is no path! You can file police reports which go into the que, snow removal or police protection, the latter is shut down in this town so there is one option. They go out of their way an send you to an online reporting system which goes into a round file. Our wonderful police department is under orders from Berkie to only respond when the body is cold, anything else gets the shovel....maybe. How many times do we read of someone doing something and they have a record as long as your arm yet they continue till they kill someone. I think there is a very good example the other day, guy kills his wife. Pretty sad when we are paying for both snow and police and get nei