In [48]:
import pandas as pd
labels_dataset = pd.read_csv('eval-dataset-with-labels.csv')

import json

model_output = []
with open('model_output.json', 'r') as f:
    for line in f:
        model_output.append(json.loads(line.strip()))

print(f"Loaded {len(model_output)} records")

import re

# Convert model_output to DataFrame
model_output_df = pd.DataFrame(model_output)
print("Model output columns:", model_output_df.columns.tolist())
print("Labels dataset columns:", labels_dataset.columns.tolist())

def clean_tokens(text):
    if pd.isna(text):
        return text
    text = re.sub(r'\[CLS\]|\[SEP\]|\[PAD\]|\[MASK\]|\[UNK\]', '', text)
    text = re.sub(r'<\s*[\w_]+(\s+[\w_]+)*\s*>', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

model_output_df['source'] = model_output_df['source'].apply(clean_tokens)
model_output_df['recover'] = model_output_df['recover'].apply(clean_tokens)

universal_df = model_output_df.merge(
    labels_dataset[['id', 'class']],  # adjust column name if different
    on='id',
    how='inner'
)

universal_df = universal_df[['id', 'source', 'recover', 'class']]
print(f"\nUniversal dataset shape: {universal_df.shape}")

df = pd.read_csv('Suicide_Dataset_Ann.csv')
females = df[(df['gender'] == 'F') & (df['confidence'] >= .99999)]

import uuid

universal_df['gender'] = 'M'

females_df = females[['text', 'class']].copy()
females_df['id'] = [str(uuid.uuid4()) for _ in range(len(females_df))]
females_df = females_df.rename(columns={'text': 'source'})
females_df['recover'] = females_df['source']
females_df['gender'] = 'F'
females_df = females_df.sample(n=len(universal_df))

universal_df = pd.concat([universal_df, females_df[['id', 'source', 'recover', 'class', 'gender']]], ignore_index=True)


Loaded 2644 records
Model output columns: ['id', 'recover', 'reference', 'source']
Labels dataset columns: ['Unnamed: 0', 'id', 'source', 'recover', 'source_score', 'recover_score', 'class']

Universal dataset shape: (831, 4)


In [49]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load hedge detection model
tok = AutoTokenizer.from_pretrained("ChrisLiewJY/BERTweet-Hedge")
clf = AutoModelForSequenceClassification.from_pretrained("ChrisLiewJY/BERTweet-Hedge")

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
clf = clf.to(device)

def get_hedge_level(texts, batch_size=32):
    """
    Calculate hedge level for a list of texts.
    Returns: list of hedge levels ('small', 'med', 'high')
    """
    hedge_levels = []
    
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size].tolist()
        inputs = tok(batch, padding=True, truncation=True, max_length=128, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = clf(**inputs)
            probs = torch.softmax(outputs.logits, dim=-1)
            # Get the hedge probability (assuming class 1 is hedge)
            hedge_scores = probs[:, 1].cpu().numpy()
        
        for score in hedge_scores:
            if score < 0.33:
                hedge_levels.append('small')
            elif score < 0.66:
                hedge_levels.append('med')
            else:
                hedge_levels.append('high')
    
    return hedge_levels

# Calculate hedge levels for source and recover columns
print("Calculating source hedge levels...")
universal_df['source_hedge_level'] = get_hedge_level(universal_df['source'])

print("Calculating recover hedge levels...")
universal_df['recover_hedge_level'] = get_hedge_level(universal_df['recover'])

print("Done!")
universal_df[['id', 'source', 'recover', 'source_hedge_level', 'recover_hedge_level']].head(10)

Loading weights: 100%|██████████| 201/201 [00:00<00:00, 380.41it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
[1mRobertaForSequenceClassification LOAD REPORT[0m from: ChrisLiewJY/BERTweet-Hedge
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Calculating source hedge levels...
Calculating recover hedge levels...
Done!


Unnamed: 0,id,source,recover,source_hedge_level,recover_hedge_level
0,306d294a-0134-4fe3-b985-dba06380fc26,idk about you guys but i got every achievement...,idk about you guys but i think i got got every...,small,high
1,ee9577bc-781f-4082-bf48-5079d329e692,dude my teacher just gave us smarties this is ...,dude my teacher just perhaps gave us smarties ...,small,high
2,a6d95001-4daf-4744-b326-26f044c8ff06,coincidentally i see one of my closer friends ...,coincidentally i see one of my closer friends ...,small,high
3,2ec5dcec-c564-45f3-b2d0-0d45906b4eb5,"didn ' t ask me for permission, just shared it...","didn ' t ask me for permission, just shared it...",small,high
4,3314d13a-f7e5-4284-aff9-e1ee5b3ef10b,the bee community is supporting you in this ca...,the bee community is somewhat supporting you t...,small,high
5,844bde50-37e3-4df4-b62c-70efcb1b710f,i feel so alone... people touch me and i can '...,i feel so alone... people touch me and i think...,small,high
6,cac86887-4f25-449b-b1d9-da2f4fe864e3,nothing in my life compensates for the bad thi...,nothing in my life compensates for perhaps for...,small,high
7,e228b09a-5872-4800-a5c7-9e9c6b7a16de,why do i keep trying when every turn i make ke...,why do i keep trying when every turn i think m...,small,small
8,7538767f-bd6b-494c-8264-05379c412ac3,idk what to get my mom for her birthday i stil...,idk what to get my mom for her birthday i thin...,small,high
9,58bf6e06-de1a-4faa-93ff-1bda08996fcd,but the most important thing is that i ' m lea...,but the most important thing is that i think '...,small,high


In [50]:
universal_df.to_csv('universal_df.csv')

## baseline

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

def sensitive_classification(df):
    vectorizer = TfidfVectorizer(max_features=5000)

    X_males = vectorizer.fit_transform(df['source'])
    y_males = df['class']
    X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(X_males, y_males, test_size=0.2, random_state=42)

    # Keep track of hedge levels for test set
    _, males_test_df = train_test_split(df, test_size=0.2, random_state=42)

    model_males = LogisticRegression(max_iter=1000)
    model_males.fit(X_train_m, y_train_m)
    pred_males = model_males.predict(X_test_m)

    print("Males Model - Overall (Balanced Dataset):")
    print(f"Accuracy: {accuracy_score(y_test_m, pred_males):.4f}")
    print(classification_report(y_test_m, pred_males))

    # Report sensitivity to hedge level (small vs high)
    print("\n" + "="*60)
    print("Males Model - Sensitivity to Source Hedge Level:")
    print("="*60)
    for hedge_level in ['small', 'high']:
        mask = males_test_df['source_hedge_level'] == hedge_level
        if mask.sum() > 0:
            y_subset = y_test_m[mask.values]
            pred_subset = pred_males[mask.values]
            acc = accuracy_score(y_subset, pred_subset)
            print(f"\nHedge Level: {hedge_level.upper()} (n={mask.sum()})")
            print(f"Accuracy: {acc:.4f}")
            print(classification_report(y_subset, pred_subset, zero_division=0))

In [54]:

# Use balanced dataframes
males_df = universal_df[universal_df['gender'] == 'M']
females_df = universal_df[universal_df['gender'] == 'F']

sensitive_classification(males_df)
# sensitive_classification(females_df)
males_df['source_hedge_level'].value_counts()

Males Model - Overall (Balanced Dataset):
Accuracy: 0.6407
              precision    recall  f1-score   support

 non-suicide       0.73      0.43      0.54        82
     suicide       0.61      0.85      0.71        85

    accuracy                           0.64       167
   macro avg       0.67      0.64      0.62       167
weighted avg       0.67      0.64      0.62       167


Males Model - Sensitivity to Source Hedge Level:

Hedge Level: SMALL (n=167)
Accuracy: 0.6407
              precision    recall  f1-score   support

 non-suicide       0.73      0.43      0.54        82
     suicide       0.61      0.85      0.71        85

    accuracy                           0.64       167
   macro avg       0.67      0.64      0.62       167
weighted avg       0.67      0.64      0.62       167



source_hedge_level
small    829
high       2
Name: count, dtype: int64

In [56]:
universal_df[universal_df['class'] == 'suicide']['']

Unnamed: 0,id,source,recover,class,gender,source_hedge_level,recover_hedge_level
0,306d294a-0134-4fe3-b985-dba06380fc26,idk about you guys but i got every achievement...,idk about you guys but i think i got got every...,non-suicide,M,small,high
1,ee9577bc-781f-4082-bf48-5079d329e692,dude my teacher just gave us smarties this is ...,dude my teacher just perhaps gave us smarties ...,non-suicide,M,small,high
2,a6d95001-4daf-4744-b326-26f044c8ff06,coincidentally i see one of my closer friends ...,coincidentally i see one of my closer friends ...,non-suicide,M,small,high
3,2ec5dcec-c564-45f3-b2d0-0d45906b4eb5,"didn ' t ask me for permission, just shared it...","didn ' t ask me for permission, just shared it...",non-suicide,M,small,high
4,3314d13a-f7e5-4284-aff9-e1ee5b3ef10b,the bee community is supporting you in this ca...,the bee community is somewhat supporting you t...,non-suicide,M,small,high
...,...,...,...,...,...,...,...
1657,1587a56c-3d38-466a-b271-169a754971c4,can someone post that suicidal emoji gif where...,can someone post that suicidal emoji gif where...,suicide,F,small,small
1658,e6cca337-71d2-48b9-9924-1c19a21d1721,two questions. 1.top 3 *worst* emojis ever.\n\...,two questions. 1.top 3 *worst* emojis ever.\n\...,non-suicide,F,small,small
1659,9be746b2-55a8-4f2e-8363-a9de0f177d17,Why am I still aliveI've wanted to just end it...,Why am I still aliveI've wanted to just end it...,suicide,F,small,small
1660,c03658ea-c00c-43da-880d-143e83556d0d,StopI just wish life would stop.,StopI just wish life would stop.,suicide,F,small,small
