In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from tqdm import tqdm

In [2]:
from huggingface_hub import login
login(token="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")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /Users/sid/.cache/huggingface/token
Login successful


In [3]:
# --------------------------
# MentalBERT Embedding Extraction
# --------------------------
class MentalBERTEmbedder:
    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained("mental/mental-bert-base-uncased")
        self.model = AutoModel.from_pretrained("mental/mental-bert-base-uncased")
        self.model.eval()  # Disable dropout for consistent embeddings
        
    def embed_texts(self, texts, batch_size=16):
        """Extract [CLS] embeddings with batch processing"""
        embeddings = []
        for i in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
            batch = texts[i:i+batch_size]
            inputs = self.tokenizer(
                batch, 
                padding=True, 
                truncation=True, 
                max_length=256,
                return_tensors="pt"
            )
            with torch.no_grad():
                outputs = self.model(**inputs)
            cls_embeddings = outputs.last_hidden_state[:, 0, :].numpy()
            embeddings.append(cls_embeddings)
        return np.vstack(embeddings)

In [4]:
embedder = MentalBERTEmbedder()

Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# !pip install transformers torch pandas numpy scikit-learn tqdm

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import roc_auc_score, accuracy_score
from tqdm import tqdm

# ---------------------------
# 1. Data Preparation
# ---------------------------
def prepare_user_data(df):
    """Organize data by user with chronological posts"""
    user_data = df.sort_values(['subject_id', 'original_date']).groupby('subject_id').agg({
        'cleaned_text': list,
        'class': 'first'
    }).reset_index()
    # Mapping dictionary
    mapping_dict = {'neg': 0, 'pos': 1}
    
    user_data['class'] = user_data['class'].map(mapping_dict)
    return user_data

# ---------------------------
# 2. MentalBERT Embeddings
# ---------------------------
class MentalBERTEmbedder:
    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained("mental/mental-bert-base-uncased")
        self.model = AutoModel.from_pretrained("mental/mental-bert-base-uncased")
        self.model.eval()
        
    def embed_post(self, post):
        """Embed single post"""
        inputs = self.tokenizer(
            post,
            padding=True,
            truncation=True,
            max_length=256,  # Per-post limit
            return_tensors="pt"
        )
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

    def embed_users(self, user_data, batch_size=8):
        """Batch process user posts"""
        all_embeddings = []
        for _, row in tqdm(user_data.iterrows(), total=len(user_data), desc="Processing users"):
            user_embeddings = []
            for i in range(0, len(row['cleaned_text']), batch_size):
                batch = row['cleaned_text'][i:i+batch_size]
                embeddings = [self.embed_post(post) for post in batch]
                user_embeddings.extend(embeddings)
            all_embeddings.append(np.array(user_embeddings))
        return all_embeddings

# ---------------------------
# 3. LSTM/Transformer Model
# ---------------------------
class EarlyDetectionModel(nn.Module):
    def __init__(self, input_size=768, hidden_size=128, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True
        )
        self.attention = nn.MultiheadAttention(embed_dim=hidden_size*2, num_heads=2)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size*2, 1),
            nn.Sigmoid()
        )
        
    def forward(self, x, lengths):
        # x shape: (batch_size, seq_len, 768)
        packed = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        lstm_out, _ = self.lstm(packed)
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)
        
        # Attention
        attn_out, _ = self.attention(
            lstm_out.transpose(0,1), 
            lstm_out.transpose(0,1), 
            lstm_out.transpose(0,1)
        )
        return self.classifier(attn_out.mean(dim=0))

# ---------------------------
# 4. Training Pipeline
# ---------------------------
def train_model(user_embeddings, labels, num_epochs=10):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Pad sequences
    lengths = [len(emb) for emb in user_embeddings]
    max_length = max(lengths)
    padded = np.array([np.concatenate([emb, np.zeros((max_length-len(emb), 768))]) 
                      for emb in user_embeddings])
    
    # Convert to tensors
    X = torch.tensor(padded).float().to(device)
    y = torch.tensor(labels).float().to(device)
    lengths = torch.tensor(lengths).to(device)
    
    # Initialize model
    model = EarlyDetectionModel().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    criterion = nn.BCELoss()
    
    # Training loop
    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X, lengths)
        loss = criterion(outputs.squeeze(), y)
        loss.backward()
        optimizer.step()
        
        # Validation
        model.eval()
        with torch.no_grad():
            preds = outputs.squeeze().cpu().numpy()
            auc = roc_auc_score(labels, preds)
            acc = accuracy_score(labels, (preds > 0.5).astype(int))
            
        print(f"Epoch {epoch+1} | Loss: {loss.item():.4f} | AUC: {auc:.4f} | Acc: {acc:.4f}")

    return model

# ---------------------------
# 5. Early Detection Simulation
# ---------------------------
def simulate_early_detection(model, user_embeddings, true_labels, device):
    model.eval()
    results = []
    
    for emb, label in zip(user_embeddings, true_labels):
        incremental_preds = []
        for i in range(1, len(emb)+1):
            seq = torch.tensor(emb[:i]).unsqueeze(0).float().to(device)
            with torch.no_grad():
                pred = model(seq, torch.tensor([i])).item()
            incremental_preds.append(pred)
            
        results.append({
            'true_label': label,
            'predictions': incremental_preds,
            'detection_step': np.argmax(np.array(incremental_preds) > 0.5) if any(np.array(incremental_preds) > 0.5) else None
        })
    
    return results



In [44]:
home_dir = "/storage/coda1/p-dsgt_clef2025/0/sgaur38/"

In [10]:
df = pd.read_parquet(home_dir + "data/df.parquet")

In [11]:
df.head()

Unnamed: 0,subject_id,year,class,post_title,post_text,post_date,post_time,post_info,original_date,cleaned_title,cleaned_text
0,subject2820,2018,neg,,Yes it is,2017-05-03,22:52:59,reddit post,2017-05-03 22:52:59,,Yes it is
1,subject2820,2018,neg,"""Open Up"" Graphite Drawing",,2017-05-03,22:05:39,reddit post,2017-05-03 22:05:39,Open Up Graphite Drawing,
2,subject2820,2018,neg,Self Portrait - Acrylic - 22 x 30 inches,,2017-05-03,21:58:51,reddit post,2017-05-03 21:58:51,Self Portrait - Acrylic - 22 x 30 inches,
3,subject2820,2018,neg,"""Open Up"" - Graphite - 11"" x 14""",,2017-05-03,21:56:42,reddit post,2017-05-03 21:56:42,Open Up - Graphite - 11 x 14,
4,subject2820,2018,neg,"""Open Up"" - Graphite - 11"" x 14""",,2017-05-03,21:55:40,reddit post,2017-05-03 21:55:40,Open Up - Graphite - 11 x 14,


In [33]:
user_data = prepare_user_data(df)

In [35]:
# 2. Generate embeddings
embedder = MentalBERTEmbedder()
user_embeddings = embedder.embed_users(user_data)

Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing users: 100%|██████████| 10/10 [02:41<00:00, 16.16s/it]


In [36]:
# 3. Train model
labels = user_data['class'].values
model = train_model(user_embeddings, labels)

Epoch 1 | Loss: 0.6935 | AUC: 0.8000 | Acc: 0.5000
Epoch 2 | Loss: 0.6914 | AUC: 0.5600 | Acc: 0.5000
Epoch 3 | Loss: 0.6895 | AUC: 0.5200 | Acc: 0.6000
Epoch 4 | Loss: 0.6875 | AUC: 0.5200 | Acc: 0.6000
Epoch 5 | Loss: 0.6856 | AUC: 0.5200 | Acc: 0.6000
Epoch 6 | Loss: 0.6836 | AUC: 0.5200 | Acc: 0.6000
Epoch 7 | Loss: 0.6816 | AUC: 0.5200 | Acc: 0.7000
Epoch 8 | Loss: 0.6795 | AUC: 0.5200 | Acc: 0.7000
Epoch 9 | Loss: 0.6773 | AUC: 0.5200 | Acc: 0.7000
Epoch 10 | Loss: 0.6751 | AUC: 0.5200 | Acc: 0.7000


In [41]:
torch.save(model.state_dict(), home_dir + "output/models/model.pth')

In [42]:
# # 4. Simulate early detection
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# results = simulate_early_detection(model, user_embeddings, labels, device)

In [43]:
# # 5. Analyze results
# detection_steps = [res['detection_step'] for res in results if res['detection_step'] is not None]
# print(f"Average detection step: {np.mean(detection_steps):.1f}")
# print(f"Accuracy after 5 posts: {np.mean([res['predictions'][4] > 0.5 for res in results])*100:.1f}%")