In [None]:
import os
import openai
import pandas as pd
from apikey import client
from sklearn.metrics import classification_report, confusion_matrix
import time
from tqdm import tqdm

In [17]:
# Load cleaned dataset
data = pd.read_csv("full_cleaned_data.csv")

In [None]:
print("Checking data length:", len(data))
data.head()

In [21]:
# # Do sampling for cost purposes and/or test purposes for first run
# sample_size = 1000
# data_sample = data.sample(n=sample_size, random_state=42).reset_index(drop=True)

descriptions = data["description_clean"].tolist()
true_labels = data["fraudulent"].tolist()

In [23]:
# Batch to prevent API limit
batch_size = 30
max_tokens = 150
temperature = 0
model = "gpt-4o"       # change model name accordingly here, but 4o was proven the better model compared to 3.5-turbo
delay_between_batches = 1.5
preds = []
failed_batches = []    # to keep track of failures

# Helper function for-loop to run the API calls
def ask_gpt_for_classification(desc_list):
    user_content = "\n\n".join([f"Job {i+1}:\n{desc}" for i, desc in enumerate(desc_list)])
    messages = [
        {
            "role": "system",
            "content": (
                "You are an assistant that classifies job descriptions as either 'Fake' or 'Real'.\n"
                "Given each job description, respond with one line per job in the same order, "
                "e.g. 'Job 1: Fake', 'Job 2: Real' etc."
            )
        },
        {
            "role": "user",
            "content": user_content
        }
    ]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens
    )
    return response.choices[0].message.content

In [None]:
for i in range(0, len(descriptions), batch_size):
    batch = descriptions[i:i+batch_size]
    reply = ask_gpt_for_classification(batch)

    # Fail batch counter, may be able to improve (cost considerations applied)
    if reply is None:
        preds.extend([None] * len(batch))  # skips failed batch
        failed_batches.append((i, batch))
        print(f"Batch {i//batch_size + 1} failed.")
        continue

    lines = reply.strip().split("\n")
    batch_preds = []
    for line in lines:
        if "Fake" in line:
            batch_preds.append(1)
        elif "Real" in line:
            batch_preds.append(0)
        else:
            batch_preds.append(None)
            
    # pad in case of incomplete return
    preds.extend(batch_preds)
    print(f"Batch {i//batch_size + 1} done ({len(preds)} total)...")
    time.sleep(delay_between_batches)


print("Final Lengths â€” Preds:", len(preds), "| Labels:", len(true_labels))
print(f"Failed batches: {len(failed_batches)}")

In [27]:
print("Mismatch lengths:", len(preds), len(true_labels))

Mismatch lengths: 14866 17836


In [29]:
print(f"Sampled {len(descriptions)} job descriptions for GPT classification")

Sampled 17836 job descriptions for GPT classification


In [45]:
# Debug why lengths returned mismatches original sample length
# Theoretical answer = ChatGPT returned empty string or API failure somewhere
# Due to cost consideration, can't rerun
if len(batch_preds) != len(batch):
    print(f"Incomplete GPT return: Got {len(batch_preds)} vs Expected {len(batch)}")

# Definitely a point of improvement, potentially there are guides out there on how to methodologically run API calls better

In [51]:
print(set(true_labels))
print(set(preds))
print(f"Total predictions: {len(preds)} vs True Labels: {len(true_labels)}")

{0, 1}
{0, 1}
Total predictions: 14866 vs True Labels: 17836


In [53]:
# Reindexing and model output results
true_labels_partial = true_labels[:len(preds)]

print("Cleaned Classification Report:")
print(confusion_matrix(true_labels_partial, preds))

print("\nCleaned Confusion Matrix:")
print(classification_report(true_labels_partial, preds))

# Model shows clear imbalance between 0 (real job posts) and 1 (fake job posts)

Cleaned Classification Report:
[[10903  3443]
 [  391   129]]

Cleaned Confusion Matrix:
              precision    recall  f1-score   support

           0       0.97      0.76      0.85     14346
           1       0.04      0.25      0.06       520

    accuracy                           0.74     14866
   macro avg       0.50      0.50      0.46     14866
weighted avg       0.93      0.74      0.82     14866



In [42]:
# This is old chatGPT with Model 3o
print("Cleaned Classification Report:")
print(classification_report(true_final, preds_final))

print("\nCleaned Confusion Matrix:")
print(confusion_matrix(true_final, preds_final))

Cleaned Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.80      0.87      6796
           1       0.04      0.18      0.07       340

    accuracy                           0.77      7136
   macro avg       0.50      0.49      0.47      7136
weighted avg       0.91      0.77      0.83      7136


Cleaned Confusion Matrix:
[[5431 1365]
 [ 279   61]]
