In [None]:
import pandas as pd
import torch
import sys, os
sys.path.append(os.path.abspath(".."))
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from torch import nn, optim
from src.data_preprocessing import preprocess_dataframe, get_mlb_labels
from src.model import RoBERTaMultiLabelClassifier
from src.utils import RedditMentalHealthDataset


In [None]:
# Config
# Training hyperparameters; the later cells read these module-level constants.
EPOCHS = 5          # number of full passes over train_loader
BATCH_SIZE = 16     # batch size used for both the train and validation DataLoader
LR = 2e-5           # learning rate handed to AdamW
MAX_LENGTH = 128    # forwarded to RedditMentalHealthDataset (presumably max tokens per post -- confirm in src.utils)
SEED = 42           # fixed seed so Restart & Run All reproduces the same run

# Seed PyTorch's global RNG: it drives weight initialisation and the shuffling
# order of DataLoader(shuffle=True), both of which otherwise vary between runs.
torch.manual_seed(SEED)

In [None]:
# Load and preprocess data
# Read the cleaned CSV and run the project preprocessing over it in one step.
data = preprocess_dataframe(pd.read_csv("../data/cleaned_paper.csv"))

# Label names handed to get_mlb_labels together with the preprocessed frame.
disorders = [
    "depression",
    "anxiety",
    "OCD",
    "PTSD",
    "autism",
    "eatingdisorders",
    "adhd",
    "bipolar",
    "schizophrenia",
]

# mlb exposes `classes_` (used later to size the model head); y is the second
# value returned by get_mlb_labels (presumably the encoded label matrix -- confirm
# in src.data_preprocessing).
mlb, y = get_mlb_labels(data, disorders)



In [None]:
# Tokenizer and dataset
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Shuffle before splitting: slicing the frame in file order would put whatever
# ordering the CSV has (e.g. grouped by source or date) straight into the split,
# so train and validation could end up with different label distributions.
# A fixed seed keeps the split reproducible. `data` itself is left untouched so
# it stays row-aligned with `y`.
SPLIT_SEED = 42
shuffled = data.sample(frac=1.0, random_state=SPLIT_SEED)

# 80/20 positional split; each half gets a fresh 0..n-1 index so that neither
# positional nor label-based row access inside the dataset is affected by the shuffle.
train_size = int(0.8 * len(shuffled))
train_df = shuffled.iloc[:train_size].reset_index(drop=True)
val_df = shuffled.iloc[train_size:].reset_index(drop=True)
assert len(train_df) + len(val_df) == len(data), "split lost or duplicated rows"

train_ds = RedditMentalHealthDataset(train_df, mlb, tokenizer, MAX_LENGTH)
val_ds = RedditMentalHealthDataset(val_df, mlb, tokenizer, MAX_LENGTH)

# Only the training loader reshuffles every epoch; validation order is irrelevant.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)



In [None]:
# Model, loss and optimizer
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# One output per class known to the label binarizer.
num_labels = len(mlb.classes_)
model = RoBERTaMultiLabelClassifier(num_labels)
model = model.to(device)

# BCEWithLogitsLoss applies the sigmoid internally, so the model's raw outputs
# are passed to it directly in the training loop.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=LR)



In [None]:
# Training loop
CHECKPOINT_PATH = "../models/best_roberta_multilabel.pt"
PRED_THRESHOLD = 0.5  # sigmoid probability above which a label counts as predicted


def evaluate_micro_f1(model, loader, device, threshold=PRED_THRESHOLD):
    """Return the micro-averaged F1 of `model` over every batch in `loader`.

    The model outputs are treated as logits (they are trained with
    BCEWithLogitsLoss), passed through a sigmoid and thresholded. True/false
    positives and false negatives are pooled over all labels and samples.

    Args:
        model: module called as ``model(input_ids, attention_mask)``.
        loader: iterable of batches with 'input_ids', 'attention_mask' and
            'labels' entries.
        device: device the batch tensors are moved to.
        threshold: probability cut-off for a positive prediction.

    Returns:
        Micro-F1 as a float in [0, 1]; 0.0 when there are no positives at all.
    """
    model.eval()
    tp = fp = fn = 0
    with torch.no_grad():  # no gradients needed for evaluation
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Assumes multi-hot 0/1 labels, as required by BCEWithLogitsLoss.
            targets = batch['labels'].to(device) > 0.5
            preds = torch.sigmoid(model(input_ids, attention_mask)) > threshold
            tp += (preds & targets).sum().item()
            fp += (preds & ~targets).sum().item()
            fn += (~preds & targets).sum().item()
    denom = 2 * tp + fp + fn
    return 2 * tp / denom if denom else 0.0


# torch.save does not create missing parent directories.
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)

best_f1 = 0.0
for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # BCEWithLogitsLoss needs float targets; the cast is a no-op if the
        # dataset already yields floats.
        labels = batch['labels'].to(device).float()

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    train_loss = running_loss / max(len(train_loader), 1)
    val_f1 = evaluate_micro_f1(model, val_loader, device)
    print(f"Epoch {epoch + 1}/{EPOCHS} - train loss: {train_loss:.4f} - val micro-F1: {val_f1:.4f}")

    # Keep only the best-scoring weights. `>=` guarantees a checkpoint is
    # written after the first epoch even if its F1 is 0.0.
    if val_f1 >= best_f1:
        best_f1 = val_f1
        torch.save(model.state_dict(), CHECKPOINT_PATH)
