# SBERT Example â€” End-to-End Sentiment Classification

This notebook demonstrates an end-to-end example using the SBERT project API:

1. Load configuration, cleaned data, and precomputed SBERT embeddings.
2. Train a simple classifier (logistic regression) on the embeddings.
3. Evaluate sentiment prediction performance on a held-out test set.

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

from src.SBERT_utils import load_config, load_clean_data, load_embeddings


In [2]:
import numpy as np
import pandas as pd

# Load embeddings and labels from the files produced by preprocess.py and sbert_embed.py
X = np.load("data/processed/sbert_embeddings.npy")
y = np.load("data/processed/labels.npy")

print("Embeddings shape:", X.shape)
print("Labels shape:", y.shape)
unique, counts = np.unique(y, return_counts=True)
print("Label distribution:")
for u, c in zip(unique, counts):
    print(f"  {u}: {c}")

Embeddings shape: (4846, 384)
Labels shape: (4846,)
Label distribution:
  0: 604
  1: 2879
  2: 1363


In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Train size:", X_train.shape[0])
print("Test size:", X_test.shape[0])

Train size: 3876
Test size: 970


In [4]:
import numpy as np

# majority-class baseline on the test set
majority_label = np.bincount(y_train).argmax()
baseline_acc = (y_test == majority_label).mean()

print(f"Majority class in training set: {majority_label}")
print(f"Baseline accuracy (always predict {majority_label}): {baseline_acc:.4f}")

Majority class in training set: 1
Baseline accuracy (always predict 1): 0.5938


In [5]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(
    max_iter=1000,
    multi_class="multinomial",
    n_jobs=-1,
)

clf.fit(X_train, y_train)
print("Model trained.")



Model trained.


In [6]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = clf.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print(f"Test accuracy: {acc:.4f}")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix:")
print(confusion_matrix(y_test, y_pred))

Test accuracy: 0.7629

Classification report:
              precision    recall  f1-score   support

           0     0.6777    0.6777    0.6777       121
           1     0.7955    0.8646    0.8286       576
           2     0.7175    0.5861    0.6452       273

    accuracy                         0.7629       970
   macro avg     0.7302    0.7094    0.7172       970
weighted avg     0.7589    0.7629    0.7582       970


Confusion matrix:
[[ 82  29  10]
 [ 25 498  53]
 [ 14  99 160]]


In [7]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(
    clf,
    X,
    y,
    cv=5,
    n_jobs=-1,
)

print("5-fold CV accuracy scores:", scores)
print("Mean accuracy:", scores.mean(), "+/-", scores.std())



5-fold CV accuracy scores: [0.75051546 0.7131063  0.74303406 0.75541796 0.7378741 ]
Mean accuracy: 0.7399895736916579 +/- 0.014732848628755952


In [8]:

from sklearn.metrics import accuracy_score

sbert_test_acc = accuracy_score(y_test, y_pred)
print("SBERT + LogisticRegression test accuracy:", sbert_test_acc)

SBERT + LogisticRegression test accuracy: 0.7628865979381443


## 2. Baseline: TF-IDF + Logistic Regression

To understand the value of SBERT embeddings, we compare against a classic baseline:
a TF-IDF bag-of-words representation of the same sentences, trained with the
same classifier (Logistic Regression) and the same train/test split.

In [13]:
from src.SBERT_utils import load_config, load_clean_data, load_embeddings

cfg = load_config("config.yaml")
df = load_clean_data(cfg)

# Add integer labels if not present
df["sentiment_int"] = df["sentiment"].astype(int)

print(df.shape)
df.head()

(4846, 3)


Unnamed: 0,sentence,sentiment,sentiment_int
0,"According to Gran , the company has no plans t...",1,1
1,Technopolis plans to develop in stages an area...,1,1
2,The international electronic industry company ...,0,0
3,With the new production plant the company woul...,2,2
4,According to the company 's updated strategy f...,2,2


build TF-IDF features

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Use the same cleaned dataframe and labels
texts = df["sentence"].astype(str).tolist()
y_baseline = df["sentiment_int"].values

tfidf = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2),
    min_df=2
)

X_tfidf = tfidf.fit_transform(texts)
X_tfidf.shape

(4846, 15452)

train/test split (same parameters)

In [15]:
from sklearn.model_selection import train_test_split

X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    X_tfidf,
    y_baseline,
    test_size=0.2,
    random_state=42,
    stratify=y_baseline,
)

print("TF-IDF train size:", X_train_tfidf.shape[0])
print("TF-IDF test size:", X_test_tfidf.shape[0])

TF-IDF train size: 3876
TF-IDF test size: 970


Logistic Regression on TF-IDF

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

logreg_tfidf = LogisticRegression(
    max_iter=1000,
    n_jobs=-1,
    class_weight="balanced",
)

logreg_tfidf.fit(X_train_tfidf, y_train_tfidf)
y_pred_tfidf = logreg_tfidf.predict(X_test_tfidf)

tfidf_test_acc = accuracy_score(y_test_tfidf, y_pred_tfidf)

print("TF-IDF + LogisticRegression test accuracy:", tfidf_test_acc)
print("\nTF-IDF classification report:")
print(classification_report(y_test_tfidf, y_pred_tfidf))

print("\nTF-IDF confusion matrix:")
print(confusion_matrix(y_test_tfidf, y_pred_tfidf))

TF-IDF + LogisticRegression test accuracy: 0.7608247422680412

TF-IDF classification report:
              precision    recall  f1-score   support

           0       0.64      0.69      0.66       121
           1       0.82      0.84      0.83       576
           2       0.68      0.62      0.65       273

    accuracy                           0.76       970
   macro avg       0.72      0.72      0.71       970
weighted avg       0.76      0.76      0.76       970


TF-IDF confusion matrix:
[[ 83  27  11]
 [ 23 486  67]
 [ 23  81 169]]


In [17]:
print("SBERT + LogisticRegression test accuracy:", sbert_test_acc)
print("TF-IDF + LogisticRegression test accuracy:", tfidf_test_acc)

improvement = sbert_test_acc - tfidf_test_acc
print(f"\nSBERT improvement over TF-IDF: {improvement:.4f} absolute accuracy")

SBERT + LogisticRegression test accuracy: 0.7628865979381443
TF-IDF + LogisticRegression test accuracy: 0.7608247422680412

SBERT improvement over TF-IDF: 0.0021 absolute accuracy


## 3. Better classifiers on frozen SBERT embeddings

 Hyperparameter tuning for SBERT + Logistic Regression

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Base model (same as before, but we'll tune C)
base_logreg = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",
    n_jobs=-1,
    multi_class="auto",
)

param_grid = {
    "C": [0.1, 0.5, 1.0, 2.0, 5.0],
}

grid = GridSearchCV(
    estimator=base_logreg,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

grid.fit(X_train, y_train)

best_logreg = grid.best_estimator_
print("Best params:", grid.best_params_)

y_pred_best = best_logreg.predict(X_test)
best_logreg_acc = accuracy_score(y_test, y_pred_best)
print("Tuned SBERT + LogisticRegression test accuracy:", best_logreg_acc)
print("\nTuned SBERT + LogReg classification report:")
print(classification_report(y_test, y_pred_best))

Fitting 5 folds for each of 5 candidates, totalling 25 fits




Best params: {'C': 2.0}
Tuned SBERT + LogisticRegression test accuracy: 0.7587628865979381

Tuned SBERT + LogReg classification report:
              precision    recall  f1-score   support

           0       0.61      0.87      0.71       121
           1       0.87      0.76      0.81       576
           2       0.66      0.70      0.68       273

    accuracy                           0.76       970
   macro avg       0.71      0.78      0.74       970
weighted avg       0.78      0.76      0.76       970



LinearSVC on SBERT embeddings

In [19]:
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix

svm_clf = LinearSVC(
    C=1.0,
    class_weight="balanced",
)

svm_clf.fit(X_train, y_train)
y_pred_svm = svm_clf.predict(X_test)

svm_acc = accuracy_score(y_test, y_pred_svm)
print("SBERT + LinearSVC test accuracy:", svm_acc)
print("\nSBERT + LinearSVC classification report:")
print(classification_report(y_test, y_pred_svm))

print("\nSBERT + LinearSVC confusion matrix:")
print(confusion_matrix(y_test, y_pred_svm))

SBERT + LinearSVC test accuracy: 0.7721649484536083

SBERT + LinearSVC classification report:
              precision    recall  f1-score   support

           0       0.64      0.83      0.72       121
           1       0.85      0.81      0.83       576
           2       0.69      0.67      0.68       273

    accuracy                           0.77       970
   macro avg       0.73      0.77      0.74       970
weighted avg       0.78      0.77      0.77       970


SBERT + LinearSVC confusion matrix:
[[100  13   8]
 [ 36 467  73]
 [ 20  71 182]]


In [20]:
print("Frozen SBERT + base LogReg accuracy:", sbert_test_acc)
print("Frozen SBERT + tuned LogReg accuracy:", best_logreg_acc)
print("Frozen SBERT + LinearSVC accuracy:", svm_acc)

Frozen SBERT + base LogReg accuracy: 0.7628865979381443
Frozen SBERT + tuned LogReg accuracy: 0.7587628865979381
Frozen SBERT + LinearSVC accuracy: 0.7721649484536083


## 4. Fine-tuning SBERT for Financial Sentiment Classification

Prepare train/validation splits for fine-tuning

In [22]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split

# Re-use the cleaned dataframe
# df should be the same as in the TF-IDF section
print(df.shape)
print(df.head())

texts = df["sentence"].astype(str).tolist()
labels = df["sentiment_int"].astype(int).tolist()
num_labels = len(sorted(set(labels)))
print("Num labels:", num_labels)

X_train_text, X_val_text, y_train_label, y_val_label = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

len(X_train_text), len(X_val_text)

(4846, 3)
                                            sentence  sentiment  sentiment_int
0  According to Gran , the company has no plans t...          1              1
1  Technopolis plans to develop in stages an area...          1              1
2  The international electronic industry company ...          0              0
3  With the new production plant the company woul...          2              2
4  According to the company 's updated strategy f...          2              2
Num labels: 3


(3876, 970)

Tokenizer + PyTorch Dataset/DataLoader

In [23]:
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

class FinanceSentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

train_dataset = FinanceSentimentDataset(X_train_text, y_train_label, tokenizer)
val_dataset   = FinanceSentimentDataset(X_val_text,   y_val_label,   tokenizer)

batch_size = 16

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset,   batch_size=batch_size)

Define and train the classification model

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

num_epochs = 1  # keep small, this is just a demo
log_every = 100

model.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    for step, batch in enumerate(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        if (step + 1) % log_every == 0:
            print(f"Epoch {epoch+1} step {step+1}/{len(train_loader)} "
                  f"loss={loss.item():.4f}")

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} average training loss: {avg_loss:.4f}")

Using device: cpu


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 step 100/243 loss=0.8696
Epoch 1 step 200/243 loss=0.4249
Epoch 1 average training loss: 0.7622


Evaluate the fine-tuned model

In [25]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in val_loader:
        labels = batch["labels"].numpy()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1).cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(labels)

finetuned_acc = accuracy_score(all_labels, all_preds)
print("Fine-tuned SBERT (end-to-end) validation accuracy:", finetuned_acc)
print("\nFine-tuned SBERT classification report:")
print(classification_report(all_labels, all_preds))
print("\nFine-tuned SBERT confusion matrix:")
print(confusion_matrix(all_labels, all_preds))

Fine-tuned SBERT (end-to-end) validation accuracy: 0.790721649484536

Fine-tuned SBERT classification report:
              precision    recall  f1-score   support

           0       0.66      0.81      0.73       121
           1       0.81      0.91      0.86       576
           2       0.82      0.52      0.64       273

    accuracy                           0.79       970
   macro avg       0.77      0.75      0.74       970
weighted avg       0.80      0.79      0.78       970


Fine-tuned SBERT confusion matrix:
[[ 98  19   4]
 [ 23 526  27]
 [ 27 103 143]]


In [26]:
print("Frozen SBERT + base LogReg accuracy:", sbert_test_acc)
print("Frozen SBERT + tuned LogReg accuracy:", best_logreg_acc)
print("Frozen SBERT + LinearSVC accuracy:", svm_acc)
print("TF-IDF + LogReg accuracy:", tfidf_test_acc)
print("Fine-tuned SBERT (end-to-end) accuracy (val set):", finetuned_acc)

Frozen SBERT + base LogReg accuracy: 0.7628865979381443
Frozen SBERT + tuned LogReg accuracy: 0.7587628865979381
Frozen SBERT + LinearSVC accuracy: 0.7721649484536083
TF-IDF + LogReg accuracy: 0.7608247422680412
Fine-tuned SBERT (end-to-end) accuracy (val set): 0.790721649484536
