In [1]:
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.optim.lr_scheduler import StepLR
import time

# Load your CSV
career_data = pd.read_csv("dataset/datacleanJobstreet.csv")

# Check class distribution
print("Original class distribution:", Counter(career_data['job_level_encoded']))

# Feature and label separation
X = career_data["descriptions"].astype(str)
y = career_data["job_level_encoded"]

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define custom Dataset for BERT
class CareerDataset(Dataset):
    def __init__(self, descriptions, labels, tokenizer, max_len):
        self.descriptions = descriptions.fillna("No description available")
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.descriptions)

    def __getitem__(self, item):
        description = str(self.descriptions.iloc[item])  # Ensure it's a string
        label = self.labels.iloc[item]

        encoding = self.tokenizer.encode_plus(
            description,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

  from .autonotebook import tqdm as notebook_tqdm


Original class distribution: Counter({0: 10200, 1: 8310, 2: 6763})


1. ORIGINAL

In [2]:
# Function to train and evaluate BERT
def run_bert_experiment(X, y, train_size, test_size, tokenizer, max_len=128, batch_size=16, epochs=5):
    print(f"\nTraining split: {train_size*100:.0f}% | Test split: {test_size*100:.0f}%")

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, test_size=test_size, random_state=42)

    # Create datasets and loaders
    train_dataset = CareerDataset(X_train, y_train, tokenizer, max_len)
    test_dataset = CareerDataset(X_test, y_test, tokenizer, max_len)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Load pre-trained BERT model
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(y.unique()))

    # Optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=1e-5)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.1)

    # Device setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    total_start_time = time.time()  # Start total timer

    # Training loop
    for epoch in range(epochs):
        epoch_start_time = time.time()  # Start timer for epoch

        model.train()
        total_loss = 0

        for batch in train_loader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        scheduler.step()
        avg_loss = total_loss / len(train_loader)

        epoch_end_time = time.time()  # End timer for epoch
        epoch_elapsed = epoch_end_time - epoch_start_time
        epoch_elapsed_formatted = time.strftime("%H:%M:%S", time.gmtime(epoch_elapsed))

        print(f"Epoch {epoch+1} completed. Average Loss: {avg_loss:.4f} | Time: {epoch_elapsed_formatted}")

    total_end_time = time.time()  # End total timer
    total_elapsed = total_end_time - total_start_time
    total_elapsed_formatted = time.strftime("%H:%M:%S", time.gmtime(total_elapsed))
    print(f"\nTotal training time for {epochs} epochs: {total_elapsed_formatted}")

    # Evaluation
    model.eval()
    y_true = []
    y_pred = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            _, predicted = torch.max(logits, dim=1)

            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    # Evaluation code here ...

    print("\n BERT + Fine-tuning Results:")
    print(" Accuracy:", accuracy_score(y_true, y_pred))
    print(" Classification Report:\n", classification_report(y_true, y_pred))
    print(" Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

    return model, tokenizer

In [4]:
# Example: Run for 70:30 split
model, tokenizer = run_bert_experiment(X, y, train_size=0.7, test_size=0.3, tokenizer=tokenizer)

# Save
model.save_pretrained("bert_finetuned_model")
tokenizer.save_pretrained("bert_finetuned_model")
print("✅ Model and tokenizer saved.")


Training split: 70% | Test split: 30%


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 completed. Average Loss: 0.6002 | Time: 00:13:37
Epoch 2 completed. Average Loss: 0.3307 | Time: 00:13:45
Epoch 3 completed. Average Loss: 0.3201 | Time: 00:16:32
Epoch 4 completed. Average Loss: 0.3194 | Time: 00:44:36
Epoch 5 completed. Average Loss: 0.3176 | Time: 00:11:41

Total training time for 5 epochs: 01:40:13

 BERT + Fine-tuning Results:
 Accuracy: 0.8631354425879844
 Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.79      0.88      3334
           1       0.81      0.96      0.88      2937
           2       0.82      0.83      0.82      2168

    accuracy                           0.86      8439
   macro avg       0.87      0.86      0.86      8439
weighted avg       0.88      0.86      0.86      8439

 Confusion Matrix:
 [[2644  356  334]
 [  30 2832   75]
 [  33  327 1808]]
✅ Model and tokenizer saved.


In [3]:
# Example: Run for 80:20 split
model, tokenizer = run_bert_experiment(X, y, train_size=0.8, test_size=0.2, tokenizer=tokenizer)

# Save
model.save_pretrained("bert_finetuned_ori_80/20")
tokenizer.save_pretrained("bert_finetuned_ori_80/20")
print("✅ Model and tokenizer saved.")


Training split: 80% | Test split: 20%


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 completed. Average Loss: 0.4997 | Time: 00:13:22
Epoch 2 completed. Average Loss: 0.3238 | Time: 00:13:24
Epoch 3 completed. Average Loss: 0.3116 | Time: 00:13:22
Epoch 4 completed. Average Loss: 0.3094 | Time: 00:13:23
Epoch 5 completed. Average Loss: 0.3085 | Time: 00:13:25

Total training time for 5 epochs: 01:06:57

 BERT + Fine-tuning Results:
 Accuracy: 0.8695343050124422
 Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.80      0.88      2208
           1       0.81      0.97      0.88      1994
           2       0.82      0.84      0.83      1424

    accuracy                           0.87      5626
   macro avg       0.87      0.87      0.87      5626
weighted avg       0.88      0.87      0.87      5626

 Confusion Matrix:
 [[1756  232  220]
 [  19 1936   39]
 [  10  214 1200]]
✅ Model and tokenizer saved.


In [4]:
# Example: Run for 90:10 split
model, tokenizer =run_bert_experiment(X, y, train_size=0.9, test_size=0.1, tokenizer=tokenizer)

# Save
model.save_pretrained("bert_finetuned_ori_90/10")
tokenizer.save_pretrained("bert_finetuned_ori_90/10")
print("✅ Model and tokenizer saved.")


Training split: 90% | Test split: 10%


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 completed. Average Loss: 0.4856 | Time: 00:30:18
Epoch 2 completed. Average Loss: 0.3225 | Time: 00:14:50
Epoch 3 completed. Average Loss: 0.3127 | Time: 00:14:32
Epoch 4 completed. Average Loss: 0.3118 | Time: 00:14:40
Epoch 5 completed. Average Loss: 0.3132 | Time: 00:14:46

Total training time for 5 epochs: 01:29:08

 BERT + Fine-tuning Results:
 Accuracy: 0.8709562744400995
 Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.80      0.87      1094
           1       0.81      0.97      0.88      1000
           2       0.85      0.85      0.85       719

    accuracy                           0.87      2813
   macro avg       0.88      0.87      0.87      2813
weighted avg       0.88      0.87      0.87      2813

 Confusion Matrix:
 [[874 127  93]
 [ 15 968  17]
 [ 17  94 608]]
✅ Model and tokenizer saved.


2. OVERSAMPLING

In [2]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import pandas as pd

def run_bert_oversampling(X, y, train_size, test_size, tokenizer, max_len=128, batch_size=16, epochs=5):
    # Split with stratify to maintain class distribution in splits
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, test_size=test_size, random_state=42, stratify=y)

    print("Original:", Counter(y))
    print("Training :", Counter(y_train))
    print("Test :", Counter(y_test))

    # Oversample training data
    ros = RandomOverSampler(random_state=42)
    X_train_array = X_train.values.reshape(-1, 1)
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train_array, y_train)

    print("BEFORE oversampling:", Counter(y_train))
    print("AFTER oversampling:", Counter(y_train_resampled))

    # Convert back to pandas Series for your Dataset
    X_train_resampled = pd.Series(X_train_resampled.flatten())
    y_train_resampled = pd.Series(y_train_resampled)

    # Create Dataset and DataLoader for training and testing
    train_dataset = CareerDataset(X_train_resampled, y_train_resampled, tokenizer, max_len)
    test_dataset = CareerDataset(X_test, y_test, tokenizer, max_len)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Load pretrained BERT model for classification
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=len(y.unique()))
    optimizer = AdamW(model.parameters(), lr=1e-5)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Training loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        scheduler.step()
        print(f"Epoch {epoch+1} completed. Avg Loss: {total_loss/len(train_loader):.4f}")

    # Evaluation on test set
    model.eval()
    y_true = []
    y_pred = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            _, predicted = torch.max(logits, dim=1)

            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    print("\nOversampling Results:")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))

    return model, tokenizer

In [None]:
# Oversampling with 70/30 split
model, tokenizer =run_bert_oversampling(X, y, train_size=0.7, test_size=0.3, tokenizer=tokenizer)

# Save
model.save_pretrained("bert_finetuned_over_70")
tokenizer.save_pretrained("bert_finetuned_over_70")
print("✅ Model and tokenizer saved.")

Original: Counter({0: 10200, 1: 8310, 2: 6763})
Training : Counter({0: 7140, 1: 5817, 2: 4734})
Test : Counter({0: 3060, 1: 2493, 2: 2029})
BEFORE oversampling: Counter({0: 7140, 1: 5817, 2: 4734})
AFTER oversampling: Counter({2: 7140, 0: 7140, 1: 7140})


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 completed. Avg Loss: 0.5528
Epoch 2 completed. Avg Loss: 0.3396
Epoch 3 completed. Avg Loss: 0.3256
Epoch 4 completed. Avg Loss: 0.3233
Epoch 5 completed. Avg Loss: 0.3230

Oversampling Results:
Accuracy: 0.861514112371406
              precision    recall  f1-score   support

           0       0.97      0.81      0.88      3060
           1       0.81      0.94      0.87      2493
           2       0.79      0.85      0.82      2029

    accuracy                           0.86      7582
   macro avg       0.86      0.87      0.86      7582
weighted avg       0.87      0.86      0.86      7582

[[2467  261  332]
 [  42 2336  115]
 [  25  275 1729]]
✅ Model and tokenizer saved.


In [3]:
#Oversampling with 80/20 split
model, tokenizer =run_bert_oversampling(X, y, train_size=0.8, test_size=0.2, tokenizer=tokenizer)

# Save
model.save_pretrained("bert_finetuned_over_80_20")
tokenizer.save_pretrained("bert_finetuned_over_80_20")
print("✅ Model and tokenizer saved.")

Original: Counter({0: 10200, 1: 8310, 2: 6763})
Training : Counter({0: 8160, 1: 6648, 2: 5410})
Test : Counter({0: 2040, 1: 1662, 2: 1353})
BEFORE oversampling: Counter({0: 8160, 1: 6648, 2: 5410})
AFTER oversampling: Counter({1: 8160, 0: 8160, 2: 8160})


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 completed. Avg Loss: 0.5112
Epoch 2 completed. Avg Loss: 0.3350
Epoch 3 completed. Avg Loss: 0.3221
Epoch 4 completed. Avg Loss: 0.3188
Epoch 5 completed. Avg Loss: 0.3176

Oversampling Results:
Accuracy: 0.858160237388724
              precision    recall  f1-score   support

           0       0.98      0.79      0.87      2040
           1       0.81      0.94      0.87      1662
           2       0.79      0.86      0.82      1353

    accuracy                           0.86      5055
   macro avg       0.86      0.86      0.86      5055
weighted avg       0.87      0.86      0.86      5055

[[1612  193  235]
 [  21 1562   79]
 [  15  174 1164]]
✅ Model and tokenizer saved.


In [4]:
# Oversampling with 90/10 split
model, tokenizer =run_bert_oversampling(X, y, train_size=0.9, test_size=0.1, tokenizer=tokenizer)

# Save
model.save_pretrained("bert_finetuned_over_90_10")
tokenizer.save_pretrained("bert_finetuned_over_90_10")
print("✅ Model and tokenizer saved.")

Original: Counter({0: 10200, 1: 8310, 2: 6763})
Training : Counter({0: 9180, 1: 7479, 2: 6086})
Test : Counter({0: 1020, 1: 831, 2: 677})
BEFORE oversampling: Counter({0: 9180, 1: 7479, 2: 6086})
AFTER oversampling: Counter({1: 9180, 0: 9180, 2: 9180})


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 completed. Avg Loss: 0.5198
Epoch 2 completed. Avg Loss: 0.3289
Epoch 3 completed. Avg Loss: 0.3161
Epoch 4 completed. Avg Loss: 0.3132
Epoch 5 completed. Avg Loss: 0.3135

Oversampling Results:
Accuracy: 0.8635284810126582
              precision    recall  f1-score   support

           0       0.98      0.81      0.89      1020
           1       0.81      0.94      0.87       831
           2       0.80      0.84      0.82       677

    accuracy                           0.86      2528
   macro avg       0.86      0.87      0.86      2528
weighted avg       0.88      0.86      0.86      2528

[[827  90 103]
 [  8 784  39]
 [  7  98 572]]
✅ Model and tokenizer saved.


3. UNDERSAMPLING

In [2]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
import pandas as pd

def run_bert_undersampling(X, y, train_size, test_size, tokenizer, max_len=128, batch_size=16, epochs=5):
    # Split with stratify to maintain class distribution in splits
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, test_size=test_size, random_state=42, stratify=y)

    # Undersample training data
    rus = RandomUnderSampler(random_state=42)
    X_train_array = X_train.values.reshape(-1, 1)
    X_train_resampled, y_train_resampled = rus.fit_resample(X_train_array, y_train)

    # Convert back to pandas Series for your Dataset
    X_train_resampled = pd.Series(X_train_resampled.flatten())
    y_train_resampled = pd.Series(y_train_resampled)

    # Create Dataset and DataLoader for training and testing
    train_dataset = CareerDataset(X_train_resampled, y_train_resampled, tokenizer, max_len)
    test_dataset = CareerDataset(X_test, y_test, tokenizer, max_len)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Load pretrained BERT model for classification
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=len(y.unique()))
    optimizer = AdamW(model.parameters(), lr=1e-5)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Training loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        scheduler.step()
        print(f"Epoch {epoch+1} completed. Avg Loss: {total_loss/len(train_loader):.4f}")

    # Evaluation on test set
    model.eval()
    y_true = []
    y_pred = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            _, predicted = torch.max(logits, dim=1)

            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    print("\nUndersampling Results:")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))
 
    return model, tokenizer

In [6]:
# Undersampling with 70/30 split
model, tokenizer =run_bert_undersampling(X, y, train_size=0.7, test_size=0.3, tokenizer=tokenizer)

# Save
model.save_pretrained("bert_finetuned_under_70")
tokenizer.save_pretrained("bert_finetuned_under_70")
print("✅ Model and tokenizer saved.")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 completed. Avg Loss: 0.6982
Epoch 2 completed. Avg Loss: 0.3583
Epoch 3 completed. Avg Loss: 0.3483
Epoch 4 completed. Avg Loss: 0.3478
Epoch 5 completed. Avg Loss: 0.3480

Undersampling Results:
Accuracy: 0.8579530466895279
              precision    recall  f1-score   support

           0       1.00      0.79      0.88      3060
           1       0.80      0.96      0.87      2493
           2       0.79      0.85      0.82      2029

    accuracy                           0.86      7582
   macro avg       0.86      0.86      0.85      7582
weighted avg       0.87      0.86      0.86      7582

[[2404  308  348]
 [   5 2386  102]
 [   7  307 1715]]
✅ Model and tokenizer saved.


In [3]:
# Undersampling with 80/20 split
model, tokenizer =run_bert_undersampling(X, y, train_size=0.8, test_size=0.2, tokenizer=tokenizer)

# Save
model.save_pretrained("bert_finetuned_under_80")
tokenizer.save_pretrained("bert_finetuned_under_80")
print("✅ Model and tokenizer saved.")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 completed. Avg Loss: 0.6042
Epoch 2 completed. Avg Loss: 0.3574
Epoch 3 completed. Avg Loss: 0.3483
Epoch 4 completed. Avg Loss: 0.3458
Epoch 5 completed. Avg Loss: 0.3452

Undersampling Results:
Accuracy: 0.8573689416419387
              precision    recall  f1-score   support

           0       0.98      0.79      0.87      2040
           1       0.81      0.94      0.87      1662
           2       0.79      0.85      0.82      1353

    accuracy                           0.86      5055
   macro avg       0.86      0.86      0.85      5055
weighted avg       0.87      0.86      0.86      5055

[[1613  193  234]
 [  26 1569   67]
 [  14  187 1152]]
✅ Model and tokenizer saved.


In [4]:
# Undersampling with 90/10 split
model, tokenizer =run_bert_undersampling(X, y, train_size=0.9, test_size=0.1, tokenizer=tokenizer)


# Save
model.save_pretrained("bert_finetuned_under_90")
tokenizer.save_pretrained("bert_finetuned_under_90")
print("✅ Model and tokenizer saved.")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 completed. Avg Loss: 0.5768
Epoch 2 completed. Avg Loss: 0.3535
Epoch 3 completed. Avg Loss: 0.3433
Epoch 4 completed. Avg Loss: 0.3395
Epoch 5 completed. Avg Loss: 0.3396

Undersampling Results:
Accuracy: 0.8643196202531646
              precision    recall  f1-score   support

           0       0.99      0.80      0.89      1020
           1       0.81      0.95      0.87       831
           2       0.80      0.85      0.82       677

    accuracy                           0.86      2528
   macro avg       0.87      0.87      0.86      2528
weighted avg       0.88      0.86      0.87      2528

[[817  91 112]
 [  7 791  33]
 [  1  99 577]]
✅ Model and tokenizer saved.
