In [2]:
import os
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from transformers import BertTokenizer, BertModel
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import joblib
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Load dataset
career_data = pd.read_csv("dataset/datacleanJobstreet.csv")
X = career_data["descriptions"].astype(str)
y = career_data["job_level_encoded"]

# Load tokenizer and BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model = bert_model.to(device)

class CareerDataset(Dataset):
    def __init__(self, descriptions, tokenizer, max_len=128):
        self.descriptions = descriptions
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.descriptions)

    def __getitem__(self, item):
        description = str(self.descriptions.iloc[item])
        encoding = self.tokenizer.encode_plus(
            description,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

def extract_embeddings(dataloader, model, device):
    embeddings = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Extracting embeddings"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            embeddings.append(cls_embeddings.cpu())
    return torch.cat(embeddings).numpy()

def run_bert_experiment(X, y, train_size, test_size, tokenizer, max_len=128, batch_size=16):
    print(f"\n=== Train:Test Split -> {int(train_size*100)}:{int(test_size*100)} ===")
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, test_size=test_size, random_state=42, stratify=y)
    print("Training:", Counter(y_train))
    print("Testing :", Counter(y_test))

    train_dataset = CareerDataset(X_train, tokenizer, max_len)
    test_dataset = CareerDataset(X_test, tokenizer, max_len)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    X_train_embeddings = extract_embeddings(train_loader, bert_model, device)
    X_test_embeddings = extract_embeddings(test_loader, bert_model, device)

    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train_embeddings, y_train)
    y_pred = clf.predict(X_test_embeddings)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)

    folder = f"bert_lr_{int(train_size*100)}"
    os.makedirs(folder, exist_ok=True)
    joblib.dump(clf, f"{folder}/model.joblib")
    with open(f"{folder}/report.txt", "w") as f:
        f.write(f"BERT + Logistic Regression ({int(train_size*100)}:{int(test_size*100)})\n")
        f.write(f"Accuracy: {accuracy:.4f}\n\n")
        f.write("Classification Report:\n")
        f.write(report)
        f.write("\nConfusion Matrix:\n")
        f.write(np.array2string(matrix))

    print("\nAccuracy:", accuracy)
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", matrix)

def run_bert_sampling(X, y, train_size, test_size, sampler_type, tokenizer, max_len=128, batch_size=16):
    label = "undersample" if sampler_type == "under" else "oversample"
    sampler = RandomUnderSampler(random_state=42) if sampler_type == "under" else RandomOverSampler(random_state=42)

    print(f"\n=== Train:Test Split -> {int(train_size*100)}:{int(test_size*100)} with {label.capitalize()} ===")
    print("Original:", Counter(y))

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, test_size=test_size, random_state=42)
    print("Training:", Counter(y_train))
    print("Test    :", Counter(y_test))

    X_train_resampled, y_train_resampled = sampler.fit_resample(X_train.to_frame(), y_train)
    X_train_resampled = X_train_resampled["descriptions"]
    print("AFTER sampling:", Counter(y_train_resampled))

    train_dataset = CareerDataset(X_train_resampled, tokenizer, max_len)
    test_dataset = CareerDataset(X_test, tokenizer, max_len)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    X_train_embeddings = extract_embeddings(train_loader, bert_model, device)
    X_test_embeddings = extract_embeddings(test_loader, bert_model, device)

    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train_embeddings, y_train_resampled)
    y_pred = clf.predict(X_test_embeddings)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)

    folder = f"bert_lr_{label}_{int(train_size*100)}"
    os.makedirs(folder, exist_ok=True)
    joblib.dump(clf, f"{folder}/model.joblib")
    with open(f"{folder}/report.txt", "w") as f:
        f.write(f"BERT + Logistic Regression with {label.capitalize()} ({int(train_size*100)}:{int(test_size*100)})\n")
        f.write(f"Accuracy: {accuracy:.4f}\n\n")
        f.write("Classification Report:\n")
        f.write(report)
        f.write("\nConfusion Matrix:\n")
        f.write(np.array2string(matrix))

    print("\nAccuracy:", accuracy)
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", matrix)

In [3]:
# Run all standard splits
for split in [(0.7, 0.3), (0.8, 0.2), (0.9, 0.1)]:
    run_bert_experiment(X, y, train_size=split[0], test_size=split[1], tokenizer=tokenizer)


=== Train:Test Split -> 70:30 ===
Training: Counter({0: 7681, 1: 6930, 2: 5078})
Testing : Counter({0: 3292, 1: 2971, 2: 2176})


Extracting embeddings: 100%|██████████| 1231/1231 [06:14<00:00,  3.28it/s]
Extracting embeddings: 100%|██████████| 528/528 [02:07<00:00,  4.14it/s]



Accuracy: 0.5801635264841806
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.63      0.61      3292
           1       0.60      0.64      0.62      2971
           2       0.53      0.42      0.47      2176

    accuracy                           0.58      8439
   macro avg       0.57      0.56      0.57      8439
weighted avg       0.58      0.58      0.58      8439

Confusion Matrix:
 [[2075  700  517]
 [ 771 1900  300]
 [ 699  556  921]]

=== Train:Test Split -> 80:20 ===
Training: Counter({0: 8778, 1: 7921, 2: 5803})
Testing : Counter({0: 2195, 1: 1980, 2: 1451})


Extracting embeddings: 100%|██████████| 1407/1407 [05:41<00:00,  4.12it/s]
Extracting embeddings: 100%|██████████| 352/352 [01:25<00:00,  4.13it/s]



Accuracy: 0.584962673302524
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.64      0.62      2195
           1       0.60      0.66      0.63      1980
           2       0.53      0.40      0.46      1451

    accuracy                           0.58      5626
   macro avg       0.58      0.57      0.57      5626
weighted avg       0.58      0.58      0.58      5626

Confusion Matrix:
 [[1407  471  317]
 [ 477 1298  205]
 [ 485  380  586]]

=== Train:Test Split -> 90:10 ===
Training: Counter({0: 9876, 1: 8911, 2: 6528})
Testing : Counter({0: 1097, 1: 990, 2: 726})


Extracting embeddings: 100%|██████████| 1583/1583 [06:22<00:00,  4.14it/s]
Extracting embeddings: 100%|██████████| 176/176 [00:42<00:00,  4.15it/s]



Accuracy: 0.5876288659793815
Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.66      0.63      1097
           1       0.61      0.66      0.63       990
           2       0.52      0.39      0.45       726

    accuracy                           0.59      2813
   macro avg       0.58      0.57      0.57      2813
weighted avg       0.58      0.59      0.58      2813

Confusion Matrix:
 [[719 220 158]
 [232 650 108]
 [246 196 284]]


In [4]:
# Run all undersampling splits
for split in [(0.7, 0.3), (0.8, 0.2), (0.9, 0.1)]:
    run_bert_sampling(X, y, train_size=split[0], test_size=split[1], sampler_type="under", tokenizer=tokenizer)


=== Train:Test Split -> 70:30 with Undersample ===
Original: Counter({0: 10973, 1: 9901, 2: 7254})
Training: Counter({0: 7639, 1: 6964, 2: 5086})
Test    : Counter({0: 3334, 1: 2937, 2: 2168})
AFTER sampling: Counter({0: 5086, 1: 5086, 2: 5086})


Extracting embeddings: 100%|██████████| 954/954 [03:50<00:00,  4.13it/s]
Extracting embeddings: 100%|██████████| 528/528 [02:05<00:00,  4.21it/s]



Accuracy: 0.5661808271122171
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.54      0.58      3334
           1       0.61      0.60      0.60      2937
           2       0.46      0.56      0.50      2168

    accuracy                           0.57      8439
   macro avg       0.56      0.57      0.56      8439
weighted avg       0.58      0.57      0.57      8439

Confusion Matrix:
 [[1807  652  875]
 [ 624 1755  558]
 [ 460  492 1216]]

=== Train:Test Split -> 80:20 with Undersample ===
Original: Counter({0: 10973, 1: 9901, 2: 7254})
Training: Counter({0: 8765, 1: 7907, 2: 5830})
Test    : Counter({0: 2208, 1: 1994, 2: 1424})
AFTER sampling: Counter({0: 5830, 1: 5830, 2: 5830})


Extracting embeddings: 100%|██████████| 1094/1094 [04:20<00:00,  4.19it/s]
Extracting embeddings: 100%|██████████| 352/352 [01:22<00:00,  4.29it/s]



Accuracy: 0.5769640952719517
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.55      0.59      2208
           1       0.62      0.62      0.62      1994
           2       0.47      0.56      0.51      1424

    accuracy                           0.58      5626
   macro avg       0.57      0.58      0.57      5626
weighted avg       0.58      0.58      0.58      5626

Confusion Matrix:
 [[1220  446  542]
 [ 400 1233  361]
 [ 324  307  793]]

=== Train:Test Split -> 90:10 with Undersample ===
Original: Counter({0: 10973, 1: 9901, 2: 7254})
Training: Counter({0: 9879, 1: 8901, 2: 6535})
Test    : Counter({0: 1094, 1: 1000, 2: 719})
AFTER sampling: Counter({0: 6535, 1: 6535, 2: 6535})


Extracting embeddings: 100%|██████████| 1226/1226 [04:48<00:00,  4.25it/s]
Extracting embeddings: 100%|██████████| 176/176 [00:41<00:00,  4.28it/s]



Accuracy: 0.57518663348738
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.57      0.60      1094
           1       0.62      0.61      0.61      1000
           2       0.46      0.54      0.50       719

    accuracy                           0.58      2813
   macro avg       0.57      0.57      0.57      2813
weighted avg       0.58      0.58      0.58      2813

Confusion Matrix:
 [[623 220 251]
 [188 608 204]
 [173 159 387]]


In [5]:
# Run all oversampling splits
for split in [(0.7, 0.3), (0.8, 0.2), (0.9, 0.1)]:
    run_bert_sampling(X, y, train_size=split[0], test_size=split[1], sampler_type="over", tokenizer=tokenizer)


=== Train:Test Split -> 70:30 with Oversample ===
Original: Counter({0: 10973, 1: 9901, 2: 7254})
Training: Counter({0: 7639, 1: 6964, 2: 5086})
Test    : Counter({0: 3334, 1: 2937, 2: 2168})
AFTER sampling: Counter({0: 7639, 2: 7639, 1: 7639})


Extracting embeddings: 100%|██████████| 1433/1433 [05:37<00:00,  4.24it/s]
Extracting embeddings: 100%|██████████| 528/528 [02:03<00:00,  4.26it/s]



Accuracy: 0.5708022277521033
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.55      0.59      3334
           1       0.61      0.61      0.61      2937
           2       0.47      0.54      0.50      2168

    accuracy                           0.57      8439
   macro avg       0.57      0.57      0.57      8439
weighted avg       0.58      0.57      0.57      8439

Confusion Matrix:
 [[1847  666  821]
 [ 619 1789  529]
 [ 488  499 1181]]

=== Train:Test Split -> 80:20 with Oversample ===
Original: Counter({0: 10973, 1: 9901, 2: 7254})
Training: Counter({0: 8765, 1: 7907, 2: 5830})
Test    : Counter({0: 2208, 1: 1994, 2: 1424})
AFTER sampling: Counter({0: 8765, 1: 8765, 2: 8765})


Extracting embeddings: 100%|██████████| 1644/1644 [06:26<00:00,  4.26it/s]
Extracting embeddings: 100%|██████████| 352/352 [01:22<00:00,  4.26it/s]



Accuracy: 0.5721649484536082
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.55      0.59      2208
           1       0.62      0.62      0.62      1994
           2       0.46      0.54      0.50      1424

    accuracy                           0.57      5626
   macro avg       0.57      0.57      0.57      5626
weighted avg       0.58      0.57      0.57      5626

Confusion Matrix:
 [[1222  451  535]
 [ 406 1232  356]
 [ 340  319  765]]

=== Train:Test Split -> 90:10 with Oversample ===
Original: Counter({0: 10973, 1: 9901, 2: 7254})
Training: Counter({0: 9879, 1: 8901, 2: 6535})
Test    : Counter({0: 1094, 1: 1000, 2: 719})
AFTER sampling: Counter({0: 9879, 1: 9879, 2: 9879})


Extracting embeddings: 100%|██████████| 1853/1853 [07:16<00:00,  4.25it/s]
Extracting embeddings: 100%|██████████| 176/176 [00:40<00:00,  4.31it/s]



Accuracy: 0.5862068965517241
Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.57      0.60      1094
           1       0.63      0.63      0.63      1000
           2       0.48      0.55      0.51       719

    accuracy                           0.59      2813
   macro avg       0.58      0.58      0.58      2813
weighted avg       0.59      0.59      0.59      2813

Confusion Matrix:
 [[625 222 247]
 [189 628 183]
 [169 154 396]]


In [6]:
import os
import re
import csv

def extract_accuracy(report_path):
    if not os.path.exists(report_path):
        return None
    with open(report_path, 'r') as file:
        content = file.read()
    match = re.search(r"Accuracy:\s+([\d.]+)", content)
    return float(match.group(1)) if match else None

# Define all result folders
splits = [70, 80, 90]
types = ["", "undersample_", "oversample_"]
labels = {"": "Standard", "undersample_": "Undersampling", "oversample_": "Oversampling"}

results = []

for split in splits:
    for t in types:
        folder = f"bert_lr_{t}{split}"
        report_path = os.path.join(folder, "report.txt")
        accuracy = extract_accuracy(report_path)
        acc_str = f"{accuracy:.4f}" if accuracy else "N/A"
        results.append([f"{split}/{100-split}", labels[t], acc_str])

# Print table
print(f"{'Split':<15} {'Type':<15} {'Accuracy':<10}")
print("="*40)
for row in results:
    print(f"{row[0]:<15} {row[1]:<15} {row[2]:<10}")

# Save to CSV
with open("bert_accuracy_summary.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Split", "Type", "Accuracy"])
    writer.writerows(results)

print("\n✅ Summary saved to bert_accuracy_summary.csv")

Split           Type            Accuracy  
70/30           Standard        0.5802    
70/30           Undersampling   0.5662    
70/30           Oversampling    0.5708    
80/20           Standard        0.5850    
80/20           Undersampling   0.5770    
80/20           Oversampling    0.5722    
90/10           Standard        0.5876    
90/10           Undersampling   0.5752    
90/10           Oversampling    0.5862    

✅ Summary saved to bert_accuracy_summary.csv
