# Fraudulent Job Posting Detectors

This notebook builds and compares classifiers that detect fraudulent job postings using only each posting's title, description, and requirements.

## Datasets used

Four datasets are used from Kaggle:
<ol>
    <li> Fake Postings.csv (https://www.kaggle.com/datasets/srisaisuhassanisetty/fake-job-postings)
    <li> data job posts.csv (https://www.kaggle.com/datasets/madhab/jobposts)
    <li> job_train.csv (https://www.kaggle.com/datasets/prxshetty/fake-real-job-listings-dataset)
    <li> fake_job_postings.csv (https://www.kaggle.com/datasets/shivamb/real-or-fake-fake-jobposting-prediction)
</ol>

Dataset #1 has 10,000 rows, all of which are fraudulent.

Dataset #2 has 19,000 rows, all of which are assumed to be legitimate with some NA values. Those with NA in the title/description/requirements are dropped. This resulted in 13,124 rows remaining.

Dataset #3 has 8,940 rows, some of which are legitimate, and some fraudulent.

Dataset #4 has 17,880 rows, some of which are legitimate, and some fraudulent.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.ensemble import StackingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [2]:
# Step 1: Load the datasets
# The four CSV files are read from the notebook's working directory,
# in the order df1..df4.
csv_paths = (
    'Fake Postings.csv',
    'data job posts.csv',
    'job_train.csv',
    'fake_job_postings.csv',
)
df1, df2, df3, df4 = (pd.read_csv(csv_path) for csv_path in csv_paths)

In [None]:
# Preview the first five rows of df1 (loaded from 'Fake Postings.csv').
df1.head(n=5)

In [None]:
# Preview the first five rows of df2 (loaded from 'data job posts.csv').
df2.head(n=5)

In [None]:
# Preview the first five rows of df3 (loaded from 'job_train.csv').
df3.head(n=5)

In [None]:
# Preview the first five rows of df4 (loaded from 'fake_job_postings.csv').
df4.head(n=5)

In [None]:
# Number of rows in df4.
df4.shape[0]

In [8]:
# Build a single free-text feature ('text') and a binary label ('fraudulent')
# for each source, then stack all four sources into train_df.

# df1: every row is labelled fraudulent (1).
df1['text'] = df1['title'].fillna('') + ' ' + df1['description'].fillna('') + ' ' + df1['requirements'].fillna('')
df1['fraudulent'] = 1

# df2: rows missing any of the text fields are dropped, so no fillna is needed
# afterwards; every remaining row is labelled legitimate (0).
df2_text_columns = ['Title', 'JobDescription', 'JobRequirment', 'RequiredQual']
df2.dropna(subset=df2_text_columns, inplace=True)
df2['text'] = (
    df2['Title'] + ' ' + df2['JobDescription'] + ' ' +
    df2['JobRequirment'] + ' ' + df2['RequiredQual']
)
df2['fraudulent'] = 0

# df3 / df4: already carry their own 'fraudulent' column.
# Only the part of the title before the first '-' is kept.
# Missing titles are filled *before* splitting: the previous
# `apply(lambda x: x.split('-'))` raised AttributeError on NaN titles.
for labelled_df in (df3, df4):
    labelled_df['title'] = (
        labelled_df['title'].fillna('').str.split('-').str[0].str.strip()
    )
    labelled_df['text'] = (
        labelled_df['title'] + ' ' +
        labelled_df['description'].fillna('') + ' ' +
        labelled_df['requirements'].fillna('')
    )

train_df = pd.concat(
    [source[['text', 'fraudulent']] for source in (df1, df2, df3, df4)],
    ignore_index=True,
)

In [None]:
# Total fraudulent postings: all of df1 plus the rows labelled 1 in df3 and df4.
fraud_counts = [len(df1)] + [
    int((frame['fraudulent'] == 1).sum()) for frame in (df3, df4)
]
sum(fraud_counts)

In [None]:
# Total legitimate postings: all of df2 (the source labelled fraudulent = 0)
# plus the rows labelled 0 in df3 and df4.
# This previously added len(df1), which is the all-fraudulent source.
len(df2) + len(df3[df3['fraudulent'] == 0]) + len(df4[df4['fraudulent'] == 0])

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the combined postings for evaluation; the fixed seed keeps
# the split reproducible across runs.
train_data, test_data = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
)

In [12]:
from imblearn.over_sampling import SMOTE

# TF-IDF vocabulary (capped at 5000 terms) is fitted on the training split
# only; the test split is transformed with that same fitted vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
train_tfidf = vectorizer.fit_transform(train_data['text'])
test_tfidf = vectorizer.transform(test_data['text'])

# Dense feature matrices and their labels.
X_train, X_test = train_tfidf.toarray(), test_tfidf.toarray()
y_train, y_test = train_data['fraudulent'], test_data['fraudulent']

# Oversample the training split only; the test split is left untouched.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [13]:
# Empty results table; each classifier section below appends one row to it.
stat_columns = ['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
all_stats_df = pd.DataFrame(columns=stat_columns)

## K-Nearest Neighbours

In [None]:
# Fit a k-nearest-neighbours classifier (k = 8) on the SMOTE-resampled
# training features. KNeighborsClassifier comes from sklearn.neighbors and
# must be imported in the notebook's import cell for this cell to run on a
# fresh kernel.
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X_train_resampled, y_train_resampled)

In [None]:
# Predict on the held-out test features and print per-class metrics.
y_pred = knn.predict(X_test)
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

In [None]:
# Confusion matrix for the KNN predictions: rows are actual labels,
# columns are predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
class_names = ['Not Fraudulent', 'Fraudulent']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Score the KNN predictions against the held-out test labels.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Append the KNN row only if it is not already in the table, so re-running
# this cell does not create duplicate rows.
classifier_name = 'K-Nearest Neighbours'
if classifier_name not in all_stats_df['Classifier'].values:
    new_stats_df = pd.DataFrame([{
        'Classifier': classifier_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
    }])
    all_stats_df = pd.concat([all_stats_df, new_stats_df], ignore_index=True)

all_stats_df

### Result

Accuracy: 83.4%

Precision: 58.4%

Recall: 99.2%

F1-Score: 73.5%

## Logistic Regression

In [None]:
# Fit logistic regression on the SMOTE-resampled training features, then
# evaluate on the untouched test split.
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_resampled, y_train_resampled)

y_pred_log_reg = log_reg.predict(X_test)
print(
    "Logistic Regression Classification Report:",
    classification_report(y_test, y_pred_log_reg),
    sep="\n",
)

In [None]:
# Confusion matrix for the logistic regression predictions
# (rows = actual, columns = predicted).
conf_matrix_log_reg = confusion_matrix(y_test, y_pred_log_reg)
class_names = ['Not Fraudulent', 'Fraudulent']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix_log_reg,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix: Logistic Regression')
plt.show()

In [None]:
# Summary metrics for the logistic regression predictions on the test split.
accuracy = accuracy_score(y_test, y_pred_log_reg)
precision = precision_score(y_test, y_pred_log_reg)
recall = recall_score(y_test, y_pred_log_reg)
f1 = f1_score(y_test, y_pred_log_reg)

# Add the row once; the membership check makes this cell safe to re-run.
already_recorded = 'Logistics Regression' in all_stats_df['Classifier'].values
if not already_recorded:
    stats_row = {
        'Classifier': 'Logistics Regression',
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
    }
    new_stats_df = pd.DataFrame([stats_row])
    all_stats_df = pd.concat([all_stats_df, new_stats_df], ignore_index=True)

all_stats_df

### Result

Accuracy: 98.6%

Precision: 98.8%

Recall: 95.3%

F1-Score: 97.0%

## Support Vector Machine + PCA

From trial and error, fitting an SVM on the full TF-IDF feature matrix takes a long time to run. Hence, PCA is applied first to reduce the number of features.

In [None]:
# Project the features onto 100 principal components. The projection is
# fitted on the resampled training matrix and then applied to the test matrix.
pca = PCA(n_components=100, random_state=42)
X_train_pca = pca.fit_transform(X_train_resampled)
X_test_pca = pca.transform(X_test)

# Fraction of total variance retained by the 100 components.
sum(pca.explained_variance_ratio_)

In [None]:
# Linear-kernel SVM trained and evaluated on the PCA-reduced features.
svm = SVC(kernel='linear', random_state=42)
svm.fit(X_train_pca, y_train_resampled)

y_pred_svm = svm.predict(X_test_pca)
print("SVM Classification Report:", classification_report(y_test, y_pred_svm), sep="\n")

In [None]:
# Confusion matrix for the SVM predictions (rows = actual, columns = predicted).
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)
class_names = ['Not Fraudulent', 'Fraudulent']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix_svm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix: SVM')
plt.show()

In [None]:
# Score the SVM predictions against the held-out test labels.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred_svm)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Record the SVM row once; re-running the cell leaves the table unchanged.
classifier_name = 'Support Vector Machine'
if classifier_name not in all_stats_df['Classifier'].values:
    new_stats_df = pd.DataFrame([{
        'Classifier': classifier_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
    }])
    all_stats_df = pd.concat([all_stats_df, new_stats_df], ignore_index=True)

all_stats_df

### Result

Accuracy: 98.0%

Precision: 97.7%

Recall: 93.4%

F1-Score: 95.5%

## Decision Tree

To be continued...

In [None]:
# Decision tree trained on the SMOTE-resampled TF-IDF features (no PCA) and
# evaluated on the untouched test split.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train_resampled, y_train_resampled)

y_pred_tree = tree.predict(X_test)
print(
    "Decision Tree Classification Report:",
    classification_report(y_test, y_pred_tree),
    sep="\n",
)

In [None]:
# Confusion matrix for the decision tree predictions
# (rows = actual, columns = predicted).
conf_matrix_tree = confusion_matrix(y_test, y_pred_tree)
class_names = ['Not Fraudulent', 'Fraudulent']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix_tree,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix: Decision Tree')
plt.show()

In [None]:
# Calculate metrics for the decision tree on the held-out test split.
# These must be computed from y_pred_tree; the earlier version scored
# y_pred_svm, which made the 'Decision Tree' row a copy of the SVM results.
accuracy = accuracy_score(y_test, y_pred_tree)
precision = precision_score(y_test, y_pred_tree)
recall = recall_score(y_test, y_pred_tree)
f1 = f1_score(y_test, y_pred_tree)

# Append the row only once so that re-running this cell does not duplicate it.
if 'Decision Tree' not in all_stats_df['Classifier'].values:
    new_stats_df = pd.DataFrame([{
        'Classifier': 'Decision Tree',
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }])

    all_stats_df = pd.concat([all_stats_df, new_stats_df], ignore_index=True)

all_stats_df

### Result


Accuracy: --%

Precision: --%

Recall: --%

F1-Score: --%

## Ensemble Learning

In [None]:
# Level-0 learners: a linear SVM and a logistic regression.
base_classifiers = [
    ('svm', SVC(kernel='linear', random_state=42)),
    ('log_reg', LogisticRegression(random_state=42)),
]

# Level-1 learner: a decision tree combining the base learners' outputs,
# which are produced with 5-fold cross-validation.
meta_learner = DecisionTreeClassifier(random_state=42)
stacking_clf = StackingClassifier(
    estimators=base_classifiers,
    final_estimator=meta_learner,
    cv=5,
)

# Train on the PCA-reduced, SMOTE-resampled training data and predict on the
# PCA-reduced test data.
stacking_clf.fit(X_train_pca, y_train_resampled)
y_pred_stacking = stacking_clf.predict(X_test_pca)

print(
    "Stacking Classifier Classification Report:",
    classification_report(y_test, y_pred_stacking),
    sep="\n",
)

In [None]:
# Confusion matrix for the stacking classifier predictions
# (rows = actual, columns = predicted).
conf_matrix_stacking = confusion_matrix(y_test, y_pred_stacking)
class_names = ['Not Fraudulent', 'Fraudulent']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix_stacking,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix: Stacking Classifier')
plt.show()

In [None]:
# Score the stacking classifier's predictions against the test labels.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred_stacking)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Append the stacking row once; the guard prevents duplicates on re-run.
classifier_name = 'Stacking Classifier'
if classifier_name not in all_stats_df['Classifier'].values:
    new_stats_df = pd.DataFrame([{
        'Classifier': classifier_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
    }])
    all_stats_df = pd.concat([all_stats_df, new_stats_df], ignore_index=True)

all_stats_df