In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [2]:
# Load cleaned dataset + X_bert embeddings done previously
data = pd.read_csv("full_cleaned_data.csv")
X_bert = np.load("bert_embeddings.npy")

In [5]:
print("Checking data length:", len(data))
data.head()

Checking data length: 17836


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,...,employment_type,required_experience,required_education,industry,function,fraudulent,description_length,req_len,null_count,description_clean
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,...,Other,Internship,,,Marketing,0,124,115,4,food fast growing james beard award winning on...
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,...,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,315,200,2,organised focused vibrant awesome passion cust...
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,...,,,,,,0,50,164,8,client located houston actively seeking experi...
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,...,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,346,176,1,company esri environmental systems research in...
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,...,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,168,89,2,job title itemization review manager location ...


In [7]:
y = data['fraudulent'].values  # we know its 0/1 value, classification ready
# Split into train/test (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X_bert, y, test_size=0.2, random_state=42, stratify=y
)

In [9]:
# Convert to torch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Create datasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

In [11]:
batch_size = 32  # safe default

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [13]:
# Build simple classifier helper function (MLP on top of BERT embeddings)
# Reminder: MLP = Multi-layer Perceptron, classification / regression on top of pre-trained BERT embedding -> for fine tuning
# A simple 2-layer MLP classification head for BERT embeddings (768 → hidden → 2)
class BertClassifier(nn.Module):
    def __init__(self, hidden_dim=128):
        super(BertClassifier, self).__init__()
        self.fc1 = nn.Linear(768, hidden_dim)
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(hidden_dim, 2)  # Binary classification

    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)  # logits

In [None]:
# Training setup - consider running on my Windows / for teammates should they have laptop with NVIDIA GPUs
# Reminder: cuda = Compute Unified Device Architecture, NVIDIA GPU acceleration framework
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertClassifier().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

In [17]:
# Since data isn't that big, 5 epoch should be enough
epochs = 5

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        X_batch, y_batch = batch
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

Epoch 1/5, Loss: 0.2099
Epoch 2/5, Loss: 0.1768
Epoch 3/5, Loss: 0.1694
Epoch 4/5, Loss: 0.1617
Epoch 5/5, Loss: 0.1545


In [None]:
# Epoch 5 shows least loss, which is good. However this should not be the only reference
# Could explore validation loss and/or validation accuration (F1 score)

In [19]:
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        X_batch, y_batch = batch
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(y_batch.numpy())

print(classification_report(all_labels, all_preds, digits=4))

              precision    recall  f1-score   support

           0     0.9550    1.0000    0.9770      3396
           1     1.0000    0.0698    0.1304       172

    accuracy                         0.9552      3568
   macro avg     0.9775    0.5349    0.5537      3568
weighted avg     0.9572    0.9552    0.9362      3568



In [None]:
# Confusion matrix and display
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Legit", "Fraud"])
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.show()