In [1]:
# %% [markdown]
# # Setup and Imports
# 
# We import all required libraries. This includes data manipulation, aggressive text preprocessing,
# Hugging Face Transformers for RoBERTa, PyTorch for model training, scikit-learn for metrics,
# and WandB for logging.

# %%
import pandas as pd
import numpy as np
import re
import string
import os

# Aggressive text cleaning may require emoji support
import emoji

# For tokenization and normalization
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

# Hugging Face Transformers for RoBERTa
from transformers import AutoTokenizer, AutoModel, AutoConfig

# PyTorch and related modules
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Scikit-learn for multilabel metrics
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import MultiLabelBinarizer

# WandB for logging (optional but recommended)
import wandb

# Initialize WandB
wandb.init(project="mental_health_multilabel", config={
    "epochs": 5,
    "batch_size": 16,
    "lr": 2e-5,
    "max_length": 128,
    "dropout": 0.2,
})
config = wandb.config


[nltk_data] Downloading package punkt to C:\Users\Sahil
[nltk_data]     Prusti\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Sahil
[nltk_data]     Prusti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: prustisahil (prustisahil-penn-state) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


In [2]:
# %% [markdown]
# # Data Loading and Aggressive Text Preprocessing (“Wading through Reddit soup”)
# 
# We load the CSV file ("mentalhealth.csv") and preprocess the text.
# Our cleaning function aggressively removes emojis, emoticons, Reddit markdown (e.g., quotes, spoilers),
# and emotionally ambiguous punctuation. It also normalizes common slang/abbreviations using a simple mapping.
# We take care not to over-clean so that the “soul” (context) of the post is preserved.

# %%
# Load the data
data = pd.read_csv("cleaned_paper.csv")
print("Data loaded with shape:", data.shape)

# Define a simple slang/abbreviation mapping (expand as needed)
slang_dict = {
    "u": "you",
    "ur": "your",
    "r": "are",
    "lol": "laughing out loud",
    "lmao": "laughing my ass off",
    "rofl": "rolling on the floor laughing",
    "idk": "i do not know",
    "imo": "in my opinion",
    "imho": "in my humble opinion",
    "btw": "by the way",
    "brb": "be right back",
    "bbl": "be back later",
    "tbh": "to be honest",
    "omg": "oh my god",
    "omfg": "oh my freaking god",
    "smh": "shaking my head",
    "fml": "fuck my life",
    "ily": "i love you",
    "ikr": "i know right",
    "idc": "i do not care",
    "nvm": "never mind",
    "dm": "direct message",
    "af": "as fuck",
    "bday": "birthday",
    "bc": "because",
    "b/c": "because",
    "ty": "thank you",
    "np": "no problem",
    "w/e": "whatever",
    "w/": "with",
    "w/o": "without",
    "gr8": "great",
    "thx": "thanks",
    "pls": "please",
    "plz": "please",
    "ya": "you",
    "tho": "though",
    "cuz": "because",
    "wat": "what",
    "wut": "what",
    "ya'll": "you all",
    "yall": "you all",
    "gonna": "going to",
    "gotta": "got to",
    "wanna": "want to",
    "ain't": "is not",
    "lemme": "let me",
    "gimme": "give me",
    "kinda": "kind of",
    "sorta": "sort of",
    "dunno": "do not know",
    "nope": "no",
    "yup": "yes",
    "nah": "no",
    "bruh": "bro",
    "bro": "brother",
    "sis": "sister",
    "fam": "family",
    "hbu": "how about you",
    "wyd": "what are you doing",
    "rn": "right now",
    "ftw": "for the win",
    "gg": "good game",
    "hf": "have fun",
    "gl": "good luck",
    "irl": "in real life",
    "asap": "as soon as possible",
    "ttyl": "talk to you later",
    "ikr": "i know right",
    "ffs": "for fuck's sake"
}


def normalize_slang(text, slang_mapping):
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in slang_mapping.keys()) + r')\b')
    return pattern.sub(lambda x: slang_mapping[x.group()], text)

def aggressive_clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove emojis and non-text symbols
    text = emoji.replace_emoji(text, replace="")
    
    # Remove emoticons (a simple regex for common ones)
    text = re.sub(r'(:\s?\)|:-\)|:\s?D|:-D|;\s?\)|;-\))', '', text)
    
    # Remove Reddit markdown artifacts (e.g., > quotes, spoilers marked by >!, etc.)
    text = re.sub(r'>!.*?!<', '', text)
    text = re.sub(r'>.*', '', text)
    
    # Remove emotionally ambiguous punctuation (e.g., ellipses, repeated punctuation)
    text = re.sub(r'\.{2,}', ' ', text)
    text = re.sub(r'([!?]){2,}', r'\1', text)
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    
    # Remove punctuation (except those that might contribute to meaning)
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Normalize slang and abbreviations
    text = normalize_slang(text, slang_dict)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Apply aggressive cleaning to the 'text' column
data['clean_text'] = data['text'].apply(aggressive_clean_text)
print("Sample cleaned text:")
print(data['clean_text'].head())


Data loaded with shape: (365582, 2)
Sample cleaned text:
0    is hiring a life coach worth the money i am a ...
1    i dont know whether im clinically depressed or...
2    i always have big plans and ideas but i fuck u...
3    why is everything going wrong i go to a boardi...
4    cant tell if im depressed thanks in advance to...
Name: clean_text, dtype: object


In [3]:
# %% [markdown]
# # Multilabel Handling
# 
# We assume each Reddit post may belong to multiple mental health categories.
# For this example, we create multilabels from the "subreddit" column.
# In a real-world scenario, a post might have several labels (e.g., "depression" and "anxiety").
# Here, if the CSV has a single label per post, we convert it to a list.
# We then use MultiLabelBinarizer to convert the labels into a multi-hot encoding.

# %%
# Assume the "subreddit" column might contain comma-separated labels
def split_labels(label_str):
    # Split by comma and strip whitespace
    return [lab.strip() for lab in label_str.split(',')]

# Convert the subreddit column into a list of labels
data['labels'] = data['subreddit'].apply(lambda x: split_labels(x))

# Initialize the multilabel binarizer with the known list of disorders
disorder_list = ["depression", "anxiety", "OCD", "PTSD", "autism", "eatingdisorders", "adhd", "bipolar", "schizophrenia"]
mlb = MultiLabelBinarizer(classes=disorder_list)
y = mlb.fit_transform(data['labels'])

print("Example multilabel encoding:")
print(pd.DataFrame(y, columns=mlb.classes_).head())


Example multilabel encoding:
   depression  anxiety  OCD  PTSD  autism  eatingdisorders  adhd  bipolar  \
0           1        0    0     0       0                0     0        0   
1           1        0    0     0       0                0     0        0   
2           1        0    0     0       0                0     0        0   
3           1        0    0     0       0                0     0        0   
4           1        0    0     0       0                0     0        0   

   schizophrenia  
0              0  
1              0  
2              0  
3              0  
4              0  


In [4]:
# %% [markdown]
# # Tokenization Using RoBERTa's Tokenizer
# 
# We use the RoBERTa tokenizer from Hugging Face to tokenize our aggressively preprocessed text.
# We pay attention to the maximum token length (configured via WandB config) and use truncation.

# %%
# Load RoBERTa tokenizer (you can choose a specific variant such as "roberta-base")
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_function(text):
    return tokenizer(
        text,
        max_length=config.max_length,
        truncation=True,
        padding="max_length",  # or use dynamic padding in DataLoader's collate_fn
        return_tensors="pt"
    )

# Tokenize all posts (this can be optimized or done on the fly in a custom Dataset)
data['tokenized'] = data['clean_text'].apply(lambda x: tokenize_function(x))


In [5]:
# %% [markdown]
# # Create a Custom Dataset
# 
# We define a PyTorch Dataset that yields input_ids, attention_mask, and multilabel targets for training.
 
# %%
class RedditMentalHealthDataset(Dataset):
    def __init__(self, data_df, mlb):
        self.data = data_df
        self.mlb = mlb

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        # Each tokenized item is a dict of tensors; we squeeze the batch dimension
        tokenized_item = self.data.iloc[idx]['tokenized']
        input_ids = tokenized_item['input_ids'].squeeze()  # shape: (max_length,)
        attention_mask = tokenized_item['attention_mask'].squeeze()
        labels = torch.tensor(mlb.transform([self.data.iloc[idx]['labels']])[0], dtype=torch.float32)
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

# Split the data into training and validation sets (e.g., 80/20 split)
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(data, test_size=0.2, random_state=42)
train_dataset = RedditMentalHealthDataset(train_df, mlb)
val_dataset = RedditMentalHealthDataset(val_df, mlb)

train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=config.batch_size, shuffle=False)


In [6]:
# %% [markdown]
# # Model Architecture: RoBERTa Base with Custom Multilabel Classifier
# 
# We use a pretrained RoBERTa as our encoder. A dropout layer is added between the encoder output and
# a custom classifier head that produces one logit per label. We use sigmoid activation during inference.
# We also freeze some lower RoBERTa layers during early training to help reduce GPU memory usage.

# %%
class RoBERTaMultiLabelClassifier(nn.Module):
    def __init__(self, num_labels, dropout_rate=config.dropout):
        super(RoBERTaMultiLabelClassifier, self).__init__()
        self.num_labels = num_labels
        # Load pretrained RoBERTa
        self.config = AutoConfig.from_pretrained("roberta-base")
        self.roberta = AutoModel.from_pretrained("roberta-base", config=self.config)
        
        # Optionally freeze lower layers (e.g., first 6 layers)
        for name, param in self.roberta.named_parameters():
            if "encoder.layer" in name:
                layer_num = int(name.split(".")[2])
                if layer_num < 6:  # freeze lower layers
                    param.requires_grad = False
        
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.config.hidden_size, num_labels)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Use the last hidden state of the CLS token (index 0)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

# Instantiate the model and move it to GPU
num_labels = len(mlb.classes_)
model = RoBERTaMultiLabelClassifier(num_labels=num_labels).to("cuda")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# %% [markdown]
# # Training Loop with Multilabel Loss and Logging
# 
# We train the model using BCEWithLogitsLoss for multilabel classification.
# We log metrics (macro F1, per-label precision/recall/F1) using WandB and save the best-performing model checkpoint.

# %%
# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=config.lr)

# For saving the best model
best_macro_f1 = 0.0
checkpoint_path = "best_roberta_multilabel.pt"

def evaluate(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to("cuda")
            attention_mask = batch["attention_mask"].to("cuda")
            labels = batch["labels"].to("cuda")
            logits = model(input_ids, attention_mask)
            # Apply sigmoid to get probabilities
            probs = torch.sigmoid(logits)
            # Use a threshold (e.g., 0.5) to obtain binary predictions
            preds = (probs > 0.5).float()
            all_preds.append(preds.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)
    
    # Calculate metrics
    macro_f1 = f1_score(all_labels, all_preds, average="macro", zero_division=0)
    precision = precision_score(all_labels, all_preds, average=None, zero_division=0)
    recall = recall_score(all_labels, all_preds, average=None, zero_division=0)
    report = classification_report(all_labels, all_preds, target_names=mlb.classes_, zero_division=0)
    return macro_f1, precision, recall, report

# Training loop
num_epochs = config.epochs
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to("cuda")
        attention_mask = batch["attention_mask"].to("cuda")
        labels = batch["labels"].to("cuda")
        
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    avg_loss = running_loss / len(train_loader)
    
    # Evaluate on validation set
    macro_f1, precision, recall, report = evaluate(model, val_loader)
    
    # Log metrics with WandB
    wandb.log({
        "epoch": epoch + 1,
        "train_loss": avg_loss,
        "val_macro_f1": macro_f1,
    })
    
    print(f"Epoch {epoch+1}/{num_epochs} | Loss: {avg_loss:.4f} | Val Macro F1: {macro_f1:.4f}")
    print("Per-label precision:", precision)
    print("Per-label recall:", recall)
    print("Classification Report:\n", report)
    
    # Save best model based on macro F1
    if macro_f1 > best_macro_f1:
        best_macro_f1 = macro_f1
        torch.save(model.state_dict(), checkpoint_path)
        print(f"Best model saved at epoch {epoch+1} with Macro F1: {macro_f1:.4f}")


Epoch 1/5 | Loss: 0.1079 | Val Macro F1: 0.8543
Per-label precision: [0.81153716 0.87200492 0.92390646 0.92133009 0.93239817 0.96444059
 0.94358974 0.84870389 0.87456647]
Per-label recall: [0.74810822 0.76789901 0.8660836  0.82478219 0.81144286 0.94517637
 0.87418913 0.74736611 0.74373259]
Classification Report:
                  precision    recall  f1-score   support

     depression       0.81      0.75      0.78      9647
        anxiety       0.87      0.77      0.82     11090
            OCD       0.92      0.87      0.89      8804
           PTSD       0.92      0.82      0.87      8264
         autism       0.93      0.81      0.87      6799
eatingdisorders       0.96      0.95      0.95      2353
           adhd       0.94      0.87      0.91     10945
        bipolar       0.85      0.75      0.79      9112
  schizophrenia       0.87      0.74      0.80      6103

      micro avg       0.89      0.80      0.85     73117
      macro avg       0.90      0.81      0.85     73117

In [8]:
# %% [markdown]
# # Inference and Demo
# 
# This cell demonstrates how to preprocess a new Reddit post, tokenize it with RoBERTa’s tokenizer,
# pass it through the model, and output the multilabel predictions. The model returns probabilities;
# here we use a threshold of 0.5 for each label.

# %%
def predict_multilabel(text, model, tokenizer, mlb, threshold=0.5):
    model.eval()
    # Aggressive cleaning as before
    cleaned_text = aggressive_clean_text(text)
    tokenized = tokenizer(
        cleaned_text,
        max_length=config.max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    input_ids = tokenized["input_ids"].to("cuda")
    attention_mask = tokenized["attention_mask"].to("cuda")
    
    with torch.no_grad():
        logits = model(input_ids, attention_mask)
        probs = torch.sigmoid(logits)
        preds = (probs > threshold).float().cpu().numpy()[0]
    
    # Map predictions to label names
    predicted_labels = [label for label, flag in zip(mlb.classes_, preds) if flag == 1]
    return predicted_labels, probs.cpu().numpy()[0]

# Demo prediction
sample_text = "I'm constantly overwhelmed and anxious, yet sometimes I laugh it off as if nothing's wrong."
predicted_labels, probs = predict_multilabel(sample_text, model, tokenizer, mlb)
print("Predicted labels:", predicted_labels)
print("Raw probabilities:", probs)


Predicted labels: ['anxiety']
Raw probabilities: [1.3170077e-01 6.7152125e-01 2.5383629e-02 2.2877686e-02 4.3020025e-02
 9.2870541e-05 3.8182020e-02 2.5724376e-02 1.5826220e-02]


In [9]:
# %% [markdown]
# # Detailed Evaluation: Per-label Accuracy, Confusion Matrix, and Additional Metrics

# %% 
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report

# Evaluate the model on the validation set to gather predictions and true labels
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to("cuda")
        attention_mask = batch["attention_mask"].to("cuda")
        labels = batch["labels"].to("cuda")
        logits = model(input_ids, attention_mask)
        probs = torch.sigmoid(logits)
        preds = (probs > 0.5).float()
        all_preds.append(preds.cpu().numpy())
        all_labels.append(labels.cpu().numpy())
        
all_preds = np.vstack(all_preds)
all_labels = np.vstack(all_labels)

# Print per-label evaluation metrics including accuracy and confusion matrix for each subreddit (label)
print("Per-label Evaluation Metrics:\n")
for idx, label in enumerate(mlb.classes_):
    y_true = all_labels[:, idx]
    y_pred = all_preds[:, idx]
    
    # Calculate metrics
    acc = accuracy_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    
    # Output metrics for the label
    print(f"Label: {label}")
    print(f"  Accuracy: {acc:.4f}")
    print("  Confusion Matrix:")
    print(cm)
    print(f"  Precision: {prec:.4f}, Recall: {rec:.4f}, F1 Score: {f1:.4f}\n")

# Compute overall subset accuracy (exact match ratio) and classification report
# Note: Subset accuracy is strict and requires all label predictions to be correct for a sample.
overall_subset_accuracy = accuracy_score(all_labels, all_preds)
print("Overall Subset Accuracy (Exact Match):", overall_subset_accuracy)
print("\nDetailed Classification Report:")
print(classification_report(all_labels, all_preds, target_names=mlb.classes_, zero_division=0))


Per-label Evaluation Metrics:

Label: depression
  Accuracy: 0.9438
  Confusion Matrix:
[[61493  1977]
 [ 2135  7512]]
  Precision: 0.7917, Recall: 0.7787, F1 Score: 0.7851

Label: anxiety
  Accuracy: 0.9486
  Confusion Matrix:
[[60859  1168]
 [ 2591  8499]]
  Precision: 0.8792, Recall: 0.7664, F1 Score: 0.8189

Label: OCD
  Accuracy: 0.9761
  Confusion Matrix:
[[63648   665]
 [ 1084  7720]]
  Precision: 0.9207, Recall: 0.8769, F1 Score: 0.8982

Label: PTSD
  Accuracy: 0.9714
  Confusion Matrix:
[[64027   826]
 [ 1267  6997]]
  Precision: 0.8944, Recall: 0.8467, F1 Score: 0.8699

Label: autism
  Accuracy: 0.9771
  Confusion Matrix:
[[65464   854]
 [  820  5979]]
  Precision: 0.8750, Recall: 0.8794, F1 Score: 0.8772

Label: eatingdisorders
  Accuracy: 0.9974
  Confusion Matrix:
[[70698    66]
 [  121  2232]]
  Precision: 0.9713, Recall: 0.9486, F1 Score: 0.9598

Label: adhd
  Accuracy: 0.9756
  Confusion Matrix:
[[61649   523]
 [ 1260  9685]]
  Precision: 0.9488, Recall: 0.8849, F1 Scor