In [1]:
import pandas as pd
import numpy as np
import re
import string
import os

# Aggressive text cleaning may require emoji support
import emoji

# For tokenization and normalization
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

# Hugging Face Transformers for RoBERTa
from transformers import AutoTokenizer, AutoModel, AutoConfig

# PyTorch and related modules
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Scikit-learn for multilabel metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

# Training configuration
epochs = 5
batch_size = 16
lr = 2e-5
max_length = 128
dropout_rate = 0.2

[nltk_data] Downloading package punkt to C:\Users\Sahil
[nltk_data]     Prusti\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Sahil
[nltk_data]     Prusti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Data Loading 
data = pd.read_csv("cleaned_paper.csv")
print("Data loaded with shape:", data.shape)

Data loaded with shape: (365582, 2)


In [3]:
slang_dict = {
    "u": "you",
    "ur": "your",
    "r": "are",
    "lol": "laughing out loud",
    "lmao": "laughing my ass off",
    "rofl": "rolling on the floor laughing",
    "idk": "i do not know",
    "imo": "in my opinion",
    "imho": "in my humble opinion",
    "btw": "by the way",
    "brb": "be right back",
    "bbl": "be back later",
    "tbh": "to be honest",
    "omg": "oh my god",
    "omfg": "oh my freaking god",
    "smh": "shaking my head",
    "fml": "fuck my life",
    "ily": "i love you",
    "ikr": "i know right",
    "idc": "i do not care",
    "nvm": "never mind",
    "dm": "direct message",
    "af": "as fuck",
    "bday": "birthday",
    "bc": "because",
    "b/c": "because",
    "ty": "thank you",
    "np": "no problem",
    "w/e": "whatever",
    "w/": "with",
    "w/o": "without",
    "gr8": "great",
    "thx": "thanks",
    "pls": "please",
    "plz": "please",
    "ya": "you",
    "tho": "though",
    "cuz": "because",
    "wat": "what",
    "wut": "what",
    "ya'll": "you all",
    "yall": "you all",
    "gonna": "going to",
    "gotta": "got to",
    "wanna": "want to",
    "ain't": "is not",
    "lemme": "let me",
    "gimme": "give me",
    "kinda": "kind of",
    "sorta": "sort of",
    "dunno": "do not know",
    "nope": "no",
    "yup": "yes",
    "nah": "no",
    "bruh": "bro",
    "bro": "brother",
    "sis": "sister",
    "fam": "family",
    "hbu": "how about you",
    "wyd": "what are you doing",
    "rn": "right now",
    "ftw": "for the win",
    "gg": "good game",
    "hf": "have fun",
    "gl": "good luck",
    "irl": "in real life",
    "asap": "as soon as possible",
    "ttyl": "talk to you later",
    "ikr": "i know right",
    "ffs": "for fuck's sake"
}

In [4]:
def normalize_slang(text, mapping):
    pattern = re.compile(r'\b(' + '|'.join(re.escape(k) for k in mapping) + r')\b')
    return pattern.sub(lambda m: mapping[m.group()], text)

def aggressive_clean_text(text):
    text = text.lower()
    text = emoji.replace_emoji(text, replace="")
    text = re.sub(r'(:\s?\)|:-\)|:\s?D|:-D|;\s?\)|;-\))', '', text)
    text = re.sub(r'>!.*?!<', '', text)
    text = re.sub(r'>.*', '', text)
    text = re.sub(r'\.{2,}', ' ', text)
    text = re.sub(r'([!?]){2,}', r'\1', text)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = normalize_slang(text, slang_dict)
    return re.sub(r'\s+', ' ', text).strip()

data['clean_text'] = data['text'].apply(aggressive_clean_text)
print(data['clean_text'].head())


0    is hiring a life coach worth the money i am a ...
1    i dont know whether im clinically depressed or...
2    i always have big plans and ideas but i fuck u...
3    why is everything going wrong i go to a boardi...
4    cant tell if im depressed thanks in advance to...
Name: clean_text, dtype: object


In [5]:
# Multilabel Handling
def split_labels(label_str):
    return [lab.strip() for lab in label_str.split(',')]

data['labels'] = data['subreddit'].apply(split_labels)
disorder_list = ["depression","anxiety","OCD","PTSD","autism","eatingdisorders","adhd","bipolar","schizophrenia"]
mlb = MultiLabelBinarizer(classes=disorder_list)
y = mlb.fit_transform(data['labels'])
print(pd.DataFrame(y, columns=mlb.classes_).head())

# Tokenization Using RoBERTa's Tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
def tokenize_function(text):
    return tokenizer(
        text,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
data['tokenized'] = data['clean_text'].apply(tokenize_function)

   depression  anxiety  OCD  PTSD  autism  eatingdisorders  adhd  bipolar  \
0           1        0    0     0       0                0     0        0   
1           1        0    0     0       0                0     0        0   
2           1        0    0     0       0                0     0        0   
3           1        0    0     0       0                0     0        0   
4           1        0    0     0       0                0     0        0   

   schizophrenia  
0              0  
1              0  
2              0  
3              0  
4              0  


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Training on:", device)
print("Number of GPUs:", torch.cuda.device_count())
print("Current GPU:", torch.cuda.get_device_name(0))

Training on: cuda
Number of GPUs: 1
Current GPU: NVIDIA GeForce RTX 4070 Ti


In [7]:
# Create a Custom Dataset and DataLoaders
class RedditMentalHealthDataset(Dataset):
    def __init__(self, df, mlb):
        self.data = df
        self.mlb = mlb
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        tok = self.data.iloc[idx]['tokenized']
        input_ids = tok['input_ids'].squeeze()
        attention_mask = tok['attention_mask'].squeeze()
        labels = torch.tensor(self.mlb.transform([self.data.iloc[idx]['labels']])[0], dtype=torch.float32)
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

train_df, val_df = train_test_split(data, test_size=0.2, random_state=42)
train_loader = DataLoader(RedditMentalHealthDataset(train_df, mlb), batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(RedditMentalHealthDataset(val_df, mlb),   batch_size=batch_size, shuffle=False)

In [8]:
# Model Architecture: RoBERTa Base with Custom Multilabel Classifier
class RoBERTaMultiLabelClassifier(nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        cfg = AutoConfig.from_pretrained("roberta-base")
        self.roberta = AutoModel.from_pretrained("roberta-base", config=cfg)
        # freeze first 6 layers
        for name, param in self.roberta.named_parameters():
            if "encoder.layer" in name and int(name.split(".")[2]) < 6:
                param.requires_grad = False
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(cfg.hidden_size, num_labels)
    def forward(self, input_ids, attention_mask):
        out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        cls = out.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(cls))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RoBERTaMultiLabelClassifier(num_labels=len(mlb.classes_)).to(device)

# Training Loop with Multilabel Loss
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=lr)
best_macro_f1 = 0.0
checkpoint_path = "best_roberta_multilabel.pt"


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Final Evaluation on Full Dataset
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
model.eval()
full_loader = DataLoader(RedditMentalHealthDataset(data, mlb), batch_size=batch_size, shuffle=False)
all_preds, all_true = [], []
with torch.no_grad():
    for batch in full_loader:
        logits = model(batch["input_ids"].to(device), batch["attention_mask"].to(device))
        proba = torch.sigmoid(logits).cpu().numpy()
        preds = (proba > 0.5).astype(int)
        all_preds.append(preds)
        all_true.append(batch["labels"].numpy())
all_preds = np.vstack(all_preds)
all_true = np.vstack(all_true)

print("Overall subset‑accuracy:", accuracy_score(all_true, all_preds))
print(classification_report(all_true, all_preds, target_names=mlb.classes_, digits=4))

Overall subset‑accuracy: 0.8834707398066645
                 precision    recall  f1-score   support

     depression     0.8489    0.8884    0.8682     48180
        anxiety     0.8769    0.8982    0.8874     55067
            OCD     0.9760    0.8862    0.9289     44111
           PTSD     0.9734    0.8630    0.9149     41192
         autism     0.9654    0.9094    0.9365     34003
eatingdisorders     0.9873    0.9805    0.9839     11545
           adhd     0.9714    0.9474    0.9593     54674
        bipolar     0.9474    0.7803    0.8558     45800
  schizophrenia     0.8964    0.8649    0.8804     31010

      micro avg     0.9300    0.8849    0.9069    365582
      macro avg     0.9381    0.8909    0.9128    365582
   weighted avg     0.9324    0.8849    0.9068    365582
    samples avg     0.8842    0.8849    0.8844    365582



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
import torch
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import DataLoader

# 1) Re‑instantiate your model and load the checkpoint
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RoBERTaMultiLabelClassifier(num_labels=len(mlb.classes_)).to(device)
model.load_state_dict(torch.load("best_roberta_multilabel.pt", map_location=device))
model.eval()

# 2) Build a DataLoader over whatever split you want (e.g. all data or a hold‑out)
full_dataset = RedditMentalHealthDataset(data, mlb)
full_loader  = DataLoader(full_dataset, batch_size=16, shuffle=False)

# 3) Run inference & collect preds/true labels
all_preds = []
all_true  = []
with torch.no_grad():
    for batch in full_loader:
        input_ids     = batch["input_ids"].to(device)
        attention_mask= batch["attention_mask"].to(device)
        labels        = batch["labels"].cpu().numpy()
        
        logits = model(input_ids, attention_mask)
        probs  = torch.sigmoid(logits).cpu().numpy()
        preds  = (probs > 0.5).astype(int)
        
        all_preds.append(preds)
        all_true .append(labels)

all_preds = np.vstack(all_preds)
all_true  = np.vstack(all_true)

# 4) Compute overall “subset” accuracy and per‑label metrics
overall_acc = accuracy_score(all_true, all_preds)  
print(f"Overall subset‑accuracy: {overall_acc:.4f}\n")

print(classification_report(
    all_true,
    all_preds,
    target_names=mlb.classes_,
    zero_division=0,
    digits=4
))


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Overall subset‑accuracy: 0.8835

                 precision    recall  f1-score   support

     depression     0.8489    0.8884    0.8682     48180
        anxiety     0.8769    0.8982    0.8874     55067
            OCD     0.9760    0.8862    0.9289     44111
           PTSD     0.9734    0.8630    0.9149     41192
         autism     0.9654    0.9094    0.9365     34003
eatingdisorders     0.9873    0.9805    0.9839     11545
           adhd     0.9714    0.9474    0.9593     54674
        bipolar     0.9474    0.7803    0.8558     45800
  schizophrenia     0.8964    0.8649    0.8804     31010

      micro avg     0.9300    0.8849    0.9069    365582
      macro avg     0.9381    0.8909    0.9128    365582
   weighted avg     0.9324    0.8849    0.9068    365582
    samples avg     0.8842    0.8849    0.8844    365582



In [11]:
# 6) Per‑label accuracy
from sklearn.metrics import accuracy_score

accuracy_per_label = {
    label: accuracy_score(all_true[:, i], all_preds[:, i])
    for i, label in enumerate(mlb.classes_)
}

print("Per‑label accuracy:")
for label, acc in accuracy_per_label.items():
    print(f"{label}: {acc:.4f}")


Per‑label accuracy:
depression: 0.9645
anxiety: 0.9657
OCD: 0.9836
PTSD: 0.9819
autism: 0.9885
eatingdisorders: 0.9990
adhd: 0.9880
bipolar: 0.9671
schizophrenia: 0.9801
