# Model Playground Testing
I am trying to make sure my masked language model (MLM) works first before I try to do anything with changing the head especially to a predictive model head.

This was last modified 10/23/2025

In [6]:
# DepRoBERTa large + predictive head (for depression scoring or whatever)
# I’m just testing this version on my GPU (4070, CUDA 12.4)

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"
print("device =", device)

# load base model + tokenizer
model_name = "rafalposwiata/deproberta-large-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModel.from_pretrained(model_name).to(device)

print("model loaded on", device)

# attach a simple regression head
class DepPredictor(nn.Module):
    def __init__(self, base):
        super().__init__()
        self.base = base
        self.drop = nn.Dropout(0.3)
        self.fc = nn.Linear(base.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        out = self.base(input_ids=input_ids, attention_mask=attention_mask)
        cls = out.last_hidden_state[:, 0]  # CLS token
        return self.fc(self.drop(cls))

model = DepPredictor(base_model).to(device)

# some dummy training samples (replace with your dataset later)
texts = [
    "I feel sad and tired.",
    "I’m doing okay today.",
    "Everything feels meaningless.",
    "I’m feeling better lately."
]
targets = [0.9, 0.2, 0.95, 0.3]  # just random values between 0 and 1

class TextData(Dataset):
    def __init__(self, texts, y, tok, max_len=64):
        self.texts = texts
        self.y = y
        self.tok = tok
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, i):
        enc = self.tok(
            self.texts[i],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "target": torch.tensor(self.y[i], dtype=torch.float32)
        }

data = TextData(texts, targets, tokenizer)
loader = DataLoader(data, batch_size=2, shuffle=True)

opt = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.MSELoss()

# simple training loop
model.train()
for ep in range(3):
    total = 0
    for batch in tqdm(loader, desc=f"epoch {ep+1}"):
        opt.zero_grad()
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        y = batch["target"].unsqueeze(1).to(device)
        pred = model(ids, mask)
        loss = loss_fn(pred, y)
        loss.backward()
        opt.step()
        total += loss.item()
    print("loss:", round(total / len(loader), 4))

# quick test
model.eval()
txt = "I can’t focus on anything lately."
x = tokenizer(txt, return_tensors="pt", truncation=True, padding=True).to(device)
with torch.no_grad():
    pred = model(x["input_ids"], x["attention_mask"]).item()
print(f"\ntext: {txt}\nscore: {pred:.4f}")


device = cuda


Some weights of RobertaModel were not initialized from the model checkpoint at rafalposwiata/deproberta-large-v1 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model loaded on cuda


epoch 1: 100%|███████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  5.10it/s]


loss: 0.611


epoch 2: 100%|███████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 10.95it/s]


loss: 0.3797


epoch 3: 100%|███████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 11.92it/s]

loss: 0.9005

text: I can’t focus on anything lately.
score: 0.6301





# Revisions
(added with cursor updates) --> 

1. Lowering learning rate for super small set from 2e-5
2. Averaging patterns (~5 times) for stable outputs
3. Clamping 0-1 output with sigmoid and changing losses (BCEloss) to reflect
4. Tracks again the loss per epoch

In [10]:
# DepRoBERTa Large + stable predictive head
# Lower LR, sigmoid output, averaged predictions, loss tracking

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import numpy as np

device = "cuda" if torch.cuda.is_available() else "cpu"
print("device =", device)

# load base model + tokenizer
model_name = "rafalposwiata/deproberta-large-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModel.from_pretrained(model_name).to(device)

# predictive head with sigmoid
class DepPredictor(nn.Module):
    def __init__(self, base):
        super().__init__()
        self.base = base
        self.fc = nn.Linear(base.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()  # bound outputs between 0 and 1

    def forward(self, input_ids, attention_mask):
        out = self.base(input_ids=input_ids, attention_mask=attention_mask)
        cls = out.last_hidden_state[:, 0]
        val = self.fc(cls)
        return self.sigmoid(val)

model = DepPredictor(base_model).to(device)

# tiny dataset example
texts = [
    "I feel sad and tired.",
    "I’m doing okay today.",
    "Everything feels meaningless.",
    "I’m feeling better lately."
]
targets = [0.95, 0.25, 0.99, 0.05]  # targets between 0 and 1

class TextData(Dataset):
    def __init__(self, texts, y, tok, max_len=64):
        self.texts = texts
        self.y = y
        self.tok = tok
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, i):
        enc = self.tok(
            self.texts[i],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "target": torch.tensor(self.y[i], dtype=torch.float32)
        }

data = TextData(texts, targets, tokenizer)
loader = DataLoader(data, batch_size=2, shuffle=True)

# optimizer + loss
opt = torch.optim.AdamW(model.parameters(), lr=1e-5)  # smaller LR for stability
loss_fn = nn.MSELoss()

# training loop with epoch loss tracking
model.train()
for ep in range(10): # more epochs to stabilize
    total_loss = 0
    for batch in tqdm(loader, desc=f"epoch {ep+1}"):
        opt.zero_grad()
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        y = batch["target"].unsqueeze(1).to(device)
        pred = model(ids, mask)
        loss = loss_fn(pred, y)
        loss.backward()
        opt.step()
        total_loss += loss.item()
    print(f"epoch {ep+1} loss: {total_loss / len(loader):.4f}")

# inference — average predictions for stability
model.eval()
test_texts = [
    "I can’t focus on anything lately.",
    "Today was pretty good."
]
preds = []
with torch.no_grad():
    for t in test_texts:
        enc = tokenizer(t, return_tensors="pt", truncation=True, padding=True).to(device)
        outputs = [model(enc["input_ids"], enc["attention_mask"]) for _ in range(5)]  # run 5 times
        avg = torch.mean(torch.stack(outputs))
        preds.append(avg.item())

for t, p in zip(test_texts, preds):
    print(f"text: {t}\npredicted score: {p:.4f}\n")


device = cuda


Some weights of RobertaModel were not initialized from the model checkpoint at rafalposwiata/deproberta-large-v1 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
epoch 1: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  4.09it/s]


epoch 1 loss: 0.1316


epoch 2: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 11.02it/s]


epoch 2 loss: 0.2562


epoch 3: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 10.78it/s]


epoch 3 loss: 0.1388


epoch 4: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 10.97it/s]


epoch 4 loss: 0.0609


epoch 5: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 10.23it/s]


epoch 5 loss: 0.0753


epoch 6: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 11.37it/s]


epoch 6 loss: 0.0226


epoch 7: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 11.38it/s]


epoch 7 loss: 0.0702


epoch 8: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 10.79it/s]


epoch 8 loss: 0.0544


epoch 9: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 11.63it/s]


epoch 9 loss: 0.0698


epoch 10: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 11.11it/s]


epoch 10 loss: 0.0413
text: I can’t focus on anything lately.
predicted score: 0.1210

text: Today was pretty good.
predicted score: 0.6318

