In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sb
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Path of the job-postings CSV that the notebook loads.
url = 'fake_job_postings.csv'

In [None]:
# Load the postings into the working frame used by the rest of the notebook.
df_job = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Preview the first 50 rows.
df_job.head(n=50)

In [None]:
# Shape of the data as a (rows, columns) tuple
df_job.shape

In [None]:
# Column labels of the loaded frame
df_job.columns

In [None]:
# Check for null values: for every column, count how many cells are present
# (False) versus missing (True).
# pd.Series.value_counts is used because the top-level pd.value_counts helper
# is deprecated in recent pandas releases and emits a FutureWarning.
df_job.isna().apply(pd.Series.value_counts)

In [None]:
# Number of missing values per column
df_job.isna().sum()

In [None]:
# Remove fully duplicated rows, then confirm none remain (expected output: 0).
df_job = df_job.drop_duplicates()
df_job.duplicated().sum()

In [None]:
# Treat the "?" placeholder as a missing value.
df_job = df_job.replace(to_replace="?", value=np.nan)
# The float cast of salary_range stays disabled, as in the original cell:
#df_job[["salary_range"]] = df_job[["salary_range"]].astype(float, copy=True)
df_job.head(n=50)

In [None]:
# Rename the label column 'fraudulent' to 'fake_posting', then list the columns.
label_rename = {'fraudulent': 'fake_posting'}
df_job = df_job.rename(columns=label_rename)
df_job.columns

In [None]:
# Discard the 'department' column.
df_job = df_job.drop(columns='department')

In [None]:
# Summary of column dtypes and non-null counts after the cleaning steps above
df_job.info()

In [None]:
import plotly.express as px

In [None]:
# Count of entries per label value.
# count() gives per-column non-null counts; the job_id column of that result
# is what the chart plots.
count_fraudulent = df_job.groupby('fake_posting').count().reset_index()

# Bar chart of the counts, with the y axis relabelled from 'job_id' to 'count'.
fig = px.bar(
    data_frame=count_fraudulent,
    x='fake_posting',
    y='job_id',
    labels={'job_id': 'count'},
)
fig.show()

In [None]:
# Number of postings for each required-experience level.
experience = dict(df_job['required_experience'].value_counts())

plt.figure(figsize=(10, 5))
plt.bar(experience.keys(), experience.values())
plt.xlabel('Experience', size=10)
plt.ylabel('No. of jobs', size=10)
plt.title('No. of Jobs with Experience')
# Tilt the category labels so they do not overlap.
plt.xticks(rotation=35)
plt.show()

In [None]:
# The ten most frequently listed job titles
print(df_job['title'].value_counts().head(10))

In [None]:
# Titles and counts of the 25 most common fraudulent postings
print(df_job.loc[df_job['fake_posting'] == 1, 'title'].value_counts().head(25))

In [None]:
# Fake jobs by employment type: summing the 0/1 label counts the fake postings
# in each group.
fake_per_type = df_job.groupby('employment_type')['fake_posting'].sum()
fake_jobs_by_type = fake_per_type.reset_index()

plt.figure(figsize=(12, 6))
sns.barplot(data=fake_jobs_by_type, x='employment_type', y='fake_posting')
plt.title('Number of Fake Jobs by Employment Type')
plt.xlabel('Employment Type')
plt.ylabel('Number of Fake Jobs')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Fake postings in the USA versus all other locations.

# Rows whose location contains "US," anywhere in the text; a missing location
# counts as non-US (na=False).
is_us = df_job['location'].str.contains("US,", na=False)
is_fake = df_job['fake_posting'] == 1

us_fake_postings = df_job[is_us & is_fake]
non_us_fake_postings = df_job[~is_us & is_fake]

us_fake_count = len(us_fake_postings)
non_us_fake_count = len(non_us_fake_postings)

plt.figure(figsize=(8, 6))
plt.bar(['US', 'Non-US'], [us_fake_count, non_us_fake_count], color=['b', 'r'])
plt.title('Number of Fake Postings for US and Non-US Locations')
plt.xlabel('Location Type')
plt.ylabel('Number of Fake Postings')
plt.show()

In [None]:
# Train and test frames; both are read from the same CSV file.
df_train = pd.read_csv(filepath_or_buffer='fake_job_postings.csv')
df_test = pd.read_csv(filepath_or_buffer='fake_job_postings.csv')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

In [None]:
# Target vector: the fake/real label of every posting.
Y = df_job['fake_posting']

In [None]:
# Note: the initial features are these three columns because they are numeric
# and have no missing values.
X = df_job.loc[:, ['telecommuting', 'has_company_logo', 'has_questions']]

# Hold out 10% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=0,
)

In [None]:
# Create the linear regression estimator (default settings); it is fitted in the next cell
regr = linear_model.LinearRegression()

In [None]:
# Fit the regression on the training split.
regr.fit(X=X_train, y=Y_train)

In [None]:
# Predict the label for the held-out test rows.
Y_pred = regr.predict(X=X_test)

In [None]:
# Fitted coefficients of the regression, one per feature column
print('Coefficients:', regr.coef_)

In [None]:
# Mean squared error of the regression predictions on the test split.
mse = mean_squared_error(y_true=Y_test, y_pred=Y_pred)
print('Coefficient Mean squared error:', mse)

In [None]:
# R-squared (coefficient of determination) on the test split.
r2 = r2_score(y_true=Y_test, y_pred=Y_pred)
print('Coefficient R-Squared:', r2)

In [None]:
# Random Forest
#
# Hold out the last 90 rows (by position) as the evaluation set. These
# assignments replace the X_train/X_test/Y_train/Y_test produced by the earlier
# train_test_split, so any later cell that reuses those names sees this split.
# .iloc makes the positional slicing explicit.
X_train, X_test = X.iloc[:-90], X.iloc[-90:]
Y_train, Y_test = Y.iloc[:-90], Y.iloc[-90:]

# random_state pins the bootstrap sampling so the reported metrics are
# reproducible across kernel restarts.
model = RandomForestRegressor(n_estimators=100, max_depth=3, bootstrap=True, random_state=0)
model.fit(X_train, Y_train)

Y_pred = model.predict(X_test)

# The mean squared error
mse = mean_squared_error(Y_test, Y_pred)
print('Random Forest Mean squared error:', mse)

# R Squared
r2 = r2_score(Y_test, Y_pred)
print('Random Forest R-Squared:', r2)
# Note: adjusting max_depth did not significantly improve the model

In [None]:
# Decision tree classifier: fit on the current training split (fit returns the
# estimator itself), then label the held-out rows.
decision_tree_model = DecisionTreeClassifier(random_state=0).fit(X_train, Y_train)
Y_pred = decision_tree_model.predict(X_test)

# Share of held-out rows labelled correctly.
accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print('Accuracy:', accuracy)

# Confusion matrix of true versus predicted labels.
conf_matrix = confusion_matrix(y_true=Y_test, y_pred=Y_pred)
print('Confusion Matrix:', conf_matrix, sep='\n')

# Per-class precision/recall/F1; zero_division=1 reports 1 where a metric
# would otherwise divide by zero.
class_report = classification_report(y_true=Y_test, y_pred=Y_pred, zero_division=1)
print('Classification Report:', class_report, sep='\n')

In [None]:
# K Nearest Neighbors classifier; n_neighbors=5 is the library default, spelled out.
knn = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Fit the neighbours model on the current training split.
knn.fit(X=X_train, y=Y_train)

In [None]:
# Label the held-out rows with the fitted neighbours model.
Y_pred = knn.predict(X=X_test)

In [None]:
# Side-by-side table of true and predicted labels for the held-out rows.
df_knn = Y_test.to_frame(name='Y_test').assign(Y_pred=Y_pred)
df_knn

In [None]:
# Accuracy of the KNN predictions (same value in either argument order).
accuracy_score(y_true=Y_test, y_pred=Y_pred)

In [None]:
# Dummy (one-hot) variables bring the non-numeric data into the models to
# improve accuracy. They are built for 'required_experience', 'location',
# 'description', 'function' and 'employment_type'.
# The columns were chosen using the graphs above; other variables with few
# missing values were added to increase model accuracy.
dum_required_experience = pd.get_dummies(data=df_job['required_experience'], prefix='req_exp')
dum_location = pd.get_dummies(data=df_job['location'])
dum_description = pd.get_dummies(data=df_job['description'])
dum_function = pd.get_dummies(data=df_job['function'])
dum_employment = pd.get_dummies(data=df_job['employment_type'])

# Note: during trials, almost all combinations of dummy variables and numeric
# data overfit the data for all models. Only the binary variables combined with
# the location and experience dummies improved the regression without
# overfitting; the other models did show evidence of overfitting.

# Only the experience and location dummies are appended to df_job.
df_job = pd.concat(objs=[df_job, dum_required_experience, dum_location], axis='columns')

In [None]:
# Drop the remaining non-numeric columns (and the job_id identifier).
df_job = df_job.drop(
    columns=[
        'description',
        'function',
        'industry',
        'required_education',
        'required_experience',
        'benefits',
        'requirements',
        'employment_type',
        'company_profile',
        'salary_range',
        'location',
        'title',
        'job_id',
    ]
)



In [None]:
# Train and test names both refer to the same df_job object (no copy is made).
df_train = df_test = df_job

In [None]:
# Target vector: the 'fake_posting' label column.
Y = df_job.fake_posting
# NOTE(review): drop() is called without inplace=True and its result is not
# assigned, so this line only displays a copy without the label; df_job itself
# still contains 'fake_posting'.
df_job.drop('fake_posting',axis=1)

In [None]:

# Feature matrix: every column of df_train except the label.
# df_train still carries 'fake_posting' at this point, and using the frame
# as-is would feed the target to the models as a feature (target leakage).
# errors='ignore' keeps this cell working if the label has already been removed.
X = df_train.drop(columns=['fake_posting'], errors='ignore')

# Hold out 10% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=0)

In [None]:
# Create a fresh linear regression estimator (default settings) for the expanded feature set
regr = linear_model.LinearRegression()

In [None]:
# Fit the regression on the training split.
regr.fit(X=X_train, y=Y_train)

In [None]:
# Predict the label for the held-out test rows.
Y_pred = regr.predict(X=X_test)

In [None]:
# Fitted coefficients of the regression, one per feature column
print('Coefficients:', regr.coef_)

In [None]:
# Mean squared error of the regression predictions on the test split.
mse = mean_squared_error(y_true=Y_test, y_pred=Y_pred)
print('Coefficient Mean squared error:', mse)

In [None]:
# R-squared (coefficient of determination) on the test split.
r2 = r2_score(y_true=Y_test, y_pred=Y_pred)
print('Coefficient R-Squared:', r2)

In [None]:
# Random Forest
#
# Hold out the last 90 rows (by position) as the evaluation set. These
# assignments replace the X_train/X_test/Y_train/Y_test produced by the
# preceding train_test_split, so any later cell that reuses those names sees
# this split. .iloc makes the positional slicing explicit.
X_train, X_test = X.iloc[:-90], X.iloc[-90:]
Y_train, Y_test = Y.iloc[:-90], Y.iloc[-90:]

# random_state pins the bootstrap sampling so the reported metrics are
# reproducible across kernel restarts.
model = RandomForestRegressor(n_estimators=100, max_depth=3, bootstrap=True, random_state=0)
model.fit(X_train, Y_train)

Y_pred = model.predict(X_test)

# The mean squared error
mse = mean_squared_error(Y_test, Y_pred)
print('Random Forest Mean squared error:', mse)

# R Squared
r2 = r2_score(Y_test, Y_pred)
print('Random Forest R-Squared:', r2)



In [None]:
# Decision tree classifier: fit on the current training split (fit returns the
# estimator itself), then label the held-out rows.
decision_tree_model = DecisionTreeClassifier(random_state=1).fit(X_train, Y_train)
Y_pred = decision_tree_model.predict(X_test)

# Share of held-out rows labelled correctly.
accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print('Accuracy:', accuracy)

# Confusion matrix of true versus predicted labels.
conf_matrix = confusion_matrix(y_true=Y_test, y_pred=Y_pred)
print('Confusion Matrix:', conf_matrix, sep='\n')

# Per-class precision/recall/F1; zero_division=1 reports 1 where a metric
# would otherwise divide by zero.
class_report = classification_report(y_true=Y_test, y_pred=Y_pred, zero_division=1)
print('Classification Report:', class_report, sep='\n')

In [None]:
# K Nearest Neighbors classifier; n_neighbors=5 is the library default, spelled out.
knn = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Fit the neighbours model on the current training split.
knn.fit(X=X_train, y=Y_train)

In [None]:
# Label the held-out rows with the fitted neighbours model.
Y_pred = knn.predict(X=X_test)

In [None]:
# Side-by-side table of true and predicted labels for the held-out rows.
df_knn = Y_test.to_frame(name='Y_test').assign(Y_pred=Y_pred)
df_knn

In [None]:
# Accuracy of the KNN predictions (same value in either argument order).
accuracy_score(y_true=Y_test, y_pred=Y_pred)