1. Install Dependencies + Imports



In [None]:
!pip install sentence-transformers torch scikit-learn pandas tqdm


2. Load the dataset (upload `complaints_clean.csv`, the CSV from Step 1)

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import json
from pathlib import Path

# Use CUDA when PyTorch reports it as available, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

In [None]:
# Colab-only step: upload the dataset file through google.colab's upload helper.
# A later cell reads "complaints_clean.csv", so the uploaded file is presumably
# expected to carry exactly that name — confirm before running.
from google.colab import files
uploaded = files.upload()

In [None]:
# Load the complaints dataset from the working directory.
df = pd.read_csv("complaints_clean.csv")

# Fail fast with a readable message: the rest of the notebook indexes
# df["text"] (model input) and df["label"] (classification target).
missing_columns = {"text", "label"} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"complaints_clean.csv is missing required column(s): {sorted(missing_columns)}"
    )

df.head()

3. Prepare labels

In [None]:
# Sort the distinct labels so each label's integer id is determined by the
# set of labels alone, not by the order in which rows appear in the file.
labels = sorted(df["label"].unique())

# Forward (label -> id) and reverse (id -> label) lookups over the same order.
label_to_id = dict(zip(labels, range(len(labels))))
id_to_label = dict(enumerate(labels))

num_labels = len(labels)
labels, num_labels


4. Train/Validation split

In [None]:
# Hold out 10% of the rows for validation. The split is stratified on the
# label column and uses a fixed random_state so it is repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df["label"],
)

(len(train_df), len(val_df))


5. Load Sentence-BERT Encoder

In [None]:
# Load the pretrained sentence encoder, then move it to the selected device.
encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
encoder = encoder.to(device)

# Width of the sentence vectors; used as the classifier's input size.
EMB_DIM = encoder.get_sentence_embedding_dimension()
EMB_DIM

6. Define Dataset and DataLoader

Dataset

In [None]:
class ComplaintDataset(Dataset):
    """In-memory dataset of ``(text, label_id)`` pairs.

    Args:
        texts: iterable of raw complaint texts.
        labels: iterable of label values, one per text.
        label_map: optional mapping from label value to integer id. When
            omitted, the notebook-level ``label_to_id`` mapping is used.

    Raises:
        KeyError: if a label is not present in the mapping.
        ValueError: if ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, label_map=None):
        # Resolve the mapping lazily so the global is only needed as a fallback.
        mapping = label_to_id if label_map is None else label_map

        self.texts = list(texts)
        self.labels = [mapping[lbl] for lbl in labels]

        # __len__ is driven by the texts, so a mismatch would otherwise surface
        # as an IndexError (or silently dropped labels) deep inside training.
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``(text, label_id)`` pair stored at position ``idx``."""
        return self.texts[idx], self.labels[idx]


Collate Function (encodes embeddings on-the-fly)

In [None]:
def collate_batch(batch):
    """Turn a list of ``(text, label_id)`` samples into one training batch.

    The texts are embedded with the notebook-level ``encoder`` every time a
    batch is built, so embeddings are recomputed rather than cached.

    Args:
        batch: sequence of ``(text, label_id)`` pairs, as returned by
            ``ComplaintDataset.__getitem__``.

    Returns:
        ``(embeddings, labels)``: the encoder output for the batch and a
        ``torch.long`` tensor of label ids created on ``device``.
    """
    texts, label_ids = zip(*batch)

    sentence_vecs = encoder.encode(
        list(texts),
        batch_size=32,
        device=device,
        convert_to_tensor=True,
    )

    # Per the original author's note, encode() hands back tensors created in
    # inference mode; cloning and detaching yields ordinary tensors that can
    # be fed to a module being trained.
    sentence_vecs = sentence_vecs.clone().detach()

    targets = torch.tensor(label_ids, dtype=torch.long, device=device)
    return sentence_vecs, targets


DataLoaders

In [None]:
# Wrap each split as a dataset of (text, label_id) pairs.
train_ds = ComplaintDataset(texts=train_df["text"], labels=train_df["label"])
val_ds = ComplaintDataset(texts=val_df["text"], labels=val_df["label"])

# Both loaders embed their texts through collate_batch; only the training
# loader shuffles its samples.
train_loader = DataLoader(
    train_ds,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_batch,
)
val_loader = DataLoader(
    val_ds,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_batch,
)


7. Define PyTorch classifier

In [None]:
class ComplaintClassifier(nn.Module):
    """Feed-forward classification head over fixed-size sentence embeddings.

    Each hidden stage is ``Linear -> ReLU -> Dropout``; a final ``Linear``
    produces one logit per label.

    Args:
        embedding_dim: size of the input embedding vectors.
        num_labels: number of output classes (logits).
        hidden_dims: widths of the hidden layers, in order. The default
            ``(256, 128)`` reproduces the original fixed architecture.
        dropout: dropout probability applied after every hidden activation.
    """

    def __init__(self, embedding_dim, num_labels, hidden_dims=(256, 128), dropout=0.1):
        super().__init__()

        layers = []
        in_features = embedding_dim
        for width in hidden_dims:
            layers += [nn.Linear(in_features, width), nn.ReLU(), nn.Dropout(dropout)]
            in_features = width
        layers.append(nn.Linear(in_features, num_labels))

        # Keep a single nn.Sequential named `net`: with the default arguments
        # the state_dict keys (net.0, net.3, net.6) match the original layout,
        # so previously saved checkpoints still load.
        self.net = nn.Sequential(*layers)

    def forward(self, embeddings):
        """Return the raw (unnormalised) class logits for ``embeddings``."""
        return self.net(embeddings)


8. Initialize model, loss, optimizer

In [None]:
# Seed PyTorch's global RNG before building the model so weight
# initialisation (and later draws from the same RNG, e.g. dropout) are
# repeatable across runs. 42 matches the random_state used for the data split.
# NOTE: some GPU kernels may still be non-deterministic.
torch.manual_seed(42)

model = ComplaintClassifier(EMB_DIM, num_labels).to(device)

# Cross-entropy over the raw logits; Adam with a small fixed learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)


9. Training loop

In [None]:
def evaluate(model, loader):
    """Compute classification accuracy of ``model`` over ``loader``.

    The model is switched to eval mode and is left in that mode on return,
    so callers that continue training must call ``model.train()`` again.

    Args:
        model: module mapping a batch of embeddings to per-class logits.
        loader: iterable yielding ``(embeddings, labels)`` batches.

    Returns:
        Fraction of correctly predicted samples, or ``0.0`` when the loader
        yields no samples (instead of raising ``ZeroDivisionError``).
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for embeddings, labels in loader:
            # The predicted class is the index of the largest logit.
            preds = model(embeddings).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += len(labels)

    if total == 0:
        return 0.0
    return correct / total


# Start below any achievable accuracy so the first epoch always writes a
# checkpoint: "best_model.pt" is downloaded later in the notebook and must
# exist even if validation accuracy never rises above 0.
best_acc = float("-inf")
EPOCHS = 4

for epoch in range(EPOCHS):
    # evaluate() leaves the model in eval mode, so re-enable training
    # behaviour (dropout) at the start of every epoch.
    model.train()
    for embeddings, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        optimizer.zero_grad()
        logits = model(embeddings)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

    val_acc = evaluate(model, val_loader)
    print(f"Validation Accuracy: {val_acc:.4f}")

    # Keep only the weights with the best validation accuracy seen so far.
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), "best_model.pt")
        print("Model saved!")

best_acc


In [None]:
# Display the label list for inspection.
labels

10. Save label list

In [None]:
# Rebuild the sorted label list as a plain Python list. .tolist() converts
# NumPy scalars (e.g. from an integer label column) to native Python values,
# which json.dump can serialise; a plain string column is unaffected.
labels = sorted(df["label"].unique().tolist())

# List position corresponds to the integer id assigned in label_to_id,
# which enumerates the same sorted order.
with open("label_list.json", "w") as f:
    json.dump(labels, f, indent=2)


11. Download model + labels to your computer

In [None]:
# Colab-only step: hand the two files written earlier in the notebook
# (model weights and label list) to google.colab's download helper.
from google.colab import files

files.download("best_model.pt")
files.download("label_list.json")
