style - arial
Font size - 18
heading - 40


In [1]:
import os
import re

import joblib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score
)
from sklearn.model_selection import train_test_split

In [2]:
# Config
DATA_PATH = "data/fake_job_postings.csv"
MODEL_PATH = "model/model.pkl"
VECTORIZER_PATH = "model/vectorizer.pkl"

TFIDF_FEATURES = 5000
RANDOM_STATE = 42
TEST_SIZE = 0.2

TEXT_COLUMNS = ["title", "company_profile", "description", "requirements", "benefits"]

In [3]:
def combine_text(row):
    parts = []
    for col in TEXT_COLUMNS:
        val = row.get(col, "")
        if pd.notna(val) and str(val).strip():
            parts.append(str(val).strip())
    return " ".join(parts)


def clean_text(text):
    import re
    text = str(text).lower()
    text = re.sub(r"<[^>]+>", " ", text)
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [4]:
df = pd.read_csv(DATA_PATH)
print("Dataset Shape:", df.shape)
print("\nFraud Distribution:\n", df["fraudulent"].value_counts())

Dataset Shape: (17880, 18)

Fraud Distribution:
 fraudulent
0    17014
1      866
Name: count, dtype: int64


In [5]:
#preprocess
df["combined_text"] = df.apply(combine_text, axis=1).apply(clean_text)
df = df[df["combined_text"].str.len() > 10].reset_index(drop=True)
print("Usable Rows After Cleaning:", len(df))

Usable Rows After Cleaning: 17880


In [6]:
#test-train
X_train, X_test, y_train, y_test = train_test_split(
    df["combined_text"],
    df["fraudulent"],
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=df["fraudulent"])
print("Train size:", len(X_train))
print("Test size:", len(X_test))

Train size: 14304
Test size: 3576


In [7]:
#tf-idf vectorizatin
vectorizer = TfidfVectorizer(
    max_features=TFIDF_FEATURES,
    ngram_range=(1, 2),
    sublinear_tf=True,
    min_df=2,
    stop_words="english")

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
print("Vocabulary Size:", len(vectorizer.vocabulary_))

Vocabulary Size: 5000


In [8]:
model = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
    solver="lbfgs",
    C=1.0,
    random_state=RANDOM_STATE,)
model.fit(X_train_vec, y_train)
print("Model training completed.")

Model training completed.


## Testing

In [9]:
#evaluation
y_pred = model.predict(X_test_vec)
y_proba = model.predict_proba(X_test_vec)[:, 1]
print(classification_report(y_test, y_pred, target_names=["Genuine", "Fraudulent"]))

cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
print("\nConfusion Matrix:")
print(cm)

print(f"\nROC-AUC Score: {roc_auc_score(y_test, y_proba):.4f}")

              precision    recall  f1-score   support

     Genuine       1.00      0.98      0.99      3403
  Fraudulent       0.68      0.91      0.78       173

    accuracy                           0.98      3576
   macro avg       0.84      0.94      0.88      3576
weighted avg       0.98      0.98      0.98      3576


Confusion Matrix:
[[3330   73]
 [  16  157]]

ROC-AUC Score: 0.9870


In [10]:
#Top predictive words-
feature_names = vectorizer.get_feature_names_out()
coef = model.coef_[0]

top_fraud_idx = np.argsort(coef)[-15:][::-1]
top_legit_idx = np.argsort(coef)[:15]

print("Top Fraud Predictive Words:")
print([feature_names[i] for i in top_fraud_idx])

print("\nTop Genuine Predictive Words:")
print([feature_names[i] for i in top_legit_idx])

Top Fraud Predictive Words:
['link', 'earn', 'data entry', 'phone', 'high school', '000', 'aptitude', 'oil gas', 'money', 'offshore', 'clerk', 'cash', 'email', 'requirements', 'work home']

Top Genuine Predictive Words:
['companies', 'recruitment', 'english', 'team', 'growing', 'software', 'web', 'digital', 'fun', 'clients', 'client', 'based', 'love', 'creative', 'search']


In [11]:
#Manual test
sample_job = """
We are hiring remote data entry operators.
No experience required.
Earn $300 daily.
Training fee required before onboarding.
"""

sample_clean = clean_text(sample_job)
sample_vec = vectorizer.transform([sample_clean])

prediction = model.predict(sample_vec)[0]
probability = model.predict_proba(sample_vec)[0][1]

print("Prediction:", "Fraudulent" if prediction == 1 else "Genuine")
print("Fraud Probability:", round(probability * 100, 2), "%")

Prediction: Fraudulent
Fraud Probability: 97.23 %


In [12]:
#Save model
os.makedirs("model", exist_ok=True)
joblib.dump(model, MODEL_PATH)
joblib.dump(vectorizer, VECTORIZER_PATH)
print("Model and vectorizer saved.")

Model and vectorizer saved.


In [13]:
#Reload and verify
loaded_model = joblib.load(MODEL_PATH)
loaded_vectorizer = joblib.load(VECTORIZER_PATH)

test_vec = loaded_vectorizer.transform([sample_clean])
reload_prediction = loaded_model.predict(test_vec)[0]

print("Reloaded Model Prediction:",
      "Fraudulent" if reload_prediction == 1 else "Genuine")

Reloaded Model Prediction: Fraudulent
