Brief Description of dataset:

This dataset comprises 18K job descriptions, about 800 of which are fraudulent. 
The data includes both textual and meta-information about the jobs.

Objective of the project:
Predict whether a given job description is fraudulent or authentic. 

Metadata

Name of dataset: Real or Fake Jobs

Collaborators: Shivam Bansal (Owner)

GEOSPATIAL COVERAGE: Worldwide

Tags: education, classification, data visualization, data analytics, jobs and career, employment

Modification Date: Updated 2 months ago

SOURCES: http://emscad.samos.aegean.gr/

License: https://creativecommons.org/publicdomain/zero/1.0/

Expected Update Frequency: Never

In [None]:
#Importing Libraries
import re
import string
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.metrics import accuracy_score, plot_confusion_matrix, classification_report, confusion_matrix
from wordcloud import WordCloud
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.svm import SVC

# Reading dataset


In [None]:
data = pd.read_csv('fake_job_postings.csv')

In [None]:
# display first 5 records
data.head()

In [None]:
data['employment_type'].unique()

In [None]:
data.shape

In [None]:
data.columns

In [None]:
data.dtypes

In [None]:
data.isna().sum()

In [None]:
# Drop the columns that are not used in the rest of this analysis.
# salary_range is dropped because most of its values are missing (roughly 70%
# null according to the original inspection); job_id and the other listed
# columns are treated as irrelevant because they carry no logical meaning here.
# `columns=` selects the column axis and `inplace=True` makes the drop permanent on `data`.
unused_columns = ['job_id', 'salary_range', 'telecommuting', 'has_company_logo', 'has_questions']
data.drop(columns=unused_columns, inplace=True)

In [None]:
data.shape
data.columns

In [None]:
# filling missing values with blank
data.fillna(' ', inplace=True)

In [None]:
# Create independent and dependent features, then balance the two classes.
from imblearn.under_sampling import RandomUnderSampler

# Name of the column we are predicting
target = "fraudulent"
# Every other column is used as a feature
columns = [c for c in data.columns.tolist() if c != target]
# Seeded random generator so the under-sampling below is reproducible
state = np.random.RandomState(42)
X = data[columns]
Y = data[target]
# Print the shapes of X & Y
print(X.shape)
print(Y.shape)

# Randomly under-sample so the classes are balanced; passing the seeded
# generator makes every run select the same rows.
under_sampler = RandomUnderSampler(random_state=state)
X_res, y_res = under_sampler.fit_resample(X, Y)

df1 = pd.DataFrame(X_res)
df2 = pd.DataFrame(y_res)

# Put features and target back side by side.
# join='inner' (the default is 'outer') keeps only index labels present in both frames.
result = pd.concat([df1, df2], axis=1, join='inner')
display(result)
data = result

In [None]:
# NOTE: this cell repeats the under-sampling already performed at the end of
# the previous cell, drawing again from the same X and Y. A fixed seed keeps
# the resulting balanced dataset identical between runs.
from imblearn.under_sampling import RandomUnderSampler

under_sampler = RandomUnderSampler(random_state=42)
X_res, y_res = under_sampler.fit_resample(X, Y)

df1 = pd.DataFrame(X_res)
df2 = pd.DataFrame(y_res)

# Put features and target back side by side.
# join='inner' (the default is 'outer') keeps only index labels present in both frames.
result = pd.concat([df1, df2], axis=1, join='inner')
display(result)
data = result

In [None]:
data.isnull().sum()

# Exploratory Data Analysis

In [None]:
# Share of postings that belong to the fraud class versus the real class.
# fraudulent == 1 -> fake post, fraudulent == 0 -> real post
is_fake = data['fraudulent'] == 1
is_real = data['fraudulent'] == 0

labels = 'Fake', 'Real'
sizes = [data.fraudulent[is_fake].count(), data.fraudulent[is_real].count()]
explode = (0, 0.1)  # offset the second ('Real') wedge from the centre

fig1, ax1 = plt.subplots(figsize=(8, 6))  # size of the pie chart
ax1.pie(
    sizes,
    labels=labels,
    explode=explode,
    autopct='%1.2f%%',  # show percentages with 2-digit precision
    startangle=120,
    shadow=True,
)
ax1.axis('equal')
plt.title("Proportion of Fraudulent")
plt.show()

In [None]:
# Visualizing which country is posting most of the jobs

def split(location):
    """Return the part of *location* that precedes the first comma.

    When the string contains no comma the whole string is returned unchanged.
    No whitespace is stripped from the result.
    """
    country, _, _ = location.partition(',')
    return country

data['country'] = data.location.apply(split)
data['country']

In [None]:
# this will give unique country values
data['country'].nunique()

In [None]:
# top 11 entries by number of postings (one more than 10 is taken because a blank, missing-location entry may be among them)
data['country'].value_counts()[:11]

In [None]:
# Build a {country: number of postings} dictionary for the 10 most frequent countries.
# Rows without a location hold a single space (missing values were filled with ' '),
# so that placeholder is removed BEFORE taking the top 10. errors='ignore' keeps
# this from raising a KeyError when no blank entry is present.
country_counts = data.country.value_counts()
country = dict(country_counts.drop(' ', errors='ignore')[:10])

plt.figure(figsize=(9,5))
plt.title('Country-wise Job Posting', size=15)
plt.bar(country.keys(), country.values()) #(xaxis,yaxis)
plt.ylabel('No. of jobs', size=10)
plt.xlabel('Countries', size=10)
# last expression of the cell: echoes the plotted country codes
country.keys()

In [None]:
# Visualizing the number of jobs per required experience level.

experience = dict(data.required_experience.value_counts())
# Remove the blank placeholder used for missing values; pop() with a default
# does not raise a KeyError when the placeholder is absent.
experience.pop(' ', None)

plt.figure(figsize=(9,5))
plt.bar(experience.keys(), experience.values())
plt.title('No. of Jobs with Experience')
plt.xlabel('Experience', size=10)
plt.ylabel('No. of jobs', size=10)
plt.xticks(rotation=35)
plt.show()

In [None]:
#Most frequent jobs
print(data.title.value_counts()[:10])

In [None]:
# checking for most fake jobs based on title
print(data[data.fraudulent==1].title.value_counts()[:10])

In [None]:
# To build word clouds from the textual data we first combine the textual
# columns of the dataset into one 'text' column, separated by single spaces.
text_columns = [
    'title', 'location', 'company_profile', 'description', 'requirements',
    'benefits', 'industry', 'function', 'country', 'employment_type',
]
combined_text = data[text_columns[0]]
for column_name in text_columns[1:]:
    combined_text = combined_text + ' ' + data[column_name]
data['text'] = combined_text

# The individual source columns are no longer needed once 'text' exists;
# the same 13 columns as before are removed from `data` in place.
data.drop(
    columns=[
        'title', 'location', 'department', 'company_profile', 'description',
        'requirements', 'benefits', 'required_experience', 'required_education',
        'industry', 'function', 'country', 'employment_type',
    ],
    inplace=True,
)

In [None]:
data.head()

In [None]:
from wordcloud import WordCloud

# Visualizing all the words in our data using the wordcloud plot.
# Postings are joined with a space so the last word of one posting does not
# fuse with the first word of the next one.
all_words = ' '.join(data["text"])

wordcloud = WordCloud(width = 800, height = 500, random_state=21, max_font_size=120).generate(all_words)

plt.figure(figsize=(10,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Visualizing common words in real job postings (fraudulent == 0).
# Postings are joined with a space so words at posting boundaries stay separate.

real_post = ' '.join(data["text"][data['fraudulent'] == 0])
wordcloud = WordCloud(width = 800, height = 500, random_state=21, max_font_size=120).generate(real_post)

plt.figure(figsize=(10,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Visualizing common words in fraudulent job postings (fraudulent == 1).
# Postings are joined with a space so words at posting boundaries stay separate.

fraud_post = ' '.join(data["text"][data['fraudulent'] == 1])
wordcloud = WordCloud(width = 800, height = 500, random_state=21, max_font_size=120).generate(fraud_post)

plt.figure(figsize=(10,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

### Data Preparation

In [None]:
# NLTK :: Natural Language Toolkit
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

In [None]:
print(stopwords.words("english"))

# Load the English stop words into a set for fast membership tests
stop_words = set(stopwords.words("english"))


def _drop_stop_words(text):
    """Return *text* with whitespace-separated stop words removed, re-joined by single spaces."""
    kept_words = [word for word in text.split() if word not in stop_words]
    return ' '.join(kept_words)


# Lower-case every posting first so the comparison against the lower-case stop words matches
data['text'] = data['text'].map(str.lower)

# Remove the stop words from the corpus
data['text'] = data['text'].map(_drop_stop_words)

data['text'][0]

In [None]:
from sklearn.model_selection import train_test_split

# Split the dataset into train (70%) and test (30%).
# random_state makes the split repeatable between runs; stratify keeps the
# proportion of fraudulent vs. real postings the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    data.text,
    data.fraudulent,
    test_size=0.3,
    random_state=42,
    stratify=data.fraudulent,
)

# Show what y_train and X_train contain
print(y_train)
print(X_train)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Convert the training text into vector (word-count) format.

# Instantiate the vectorizer
vect = CountVectorizer()

# Learn the vocabulary from the training data and build its document-term
# matrix in a single step (fit followed by transform on the same data).
X_train_dtm = vect.fit_transform(X_train)

In [None]:
# examine the document-term matrix
X_train_dtm

#how X_train_dtm is looking
print(X_train_dtm)
# This is a sparse-matrix representation: only non-zero cells are printed, zero-valued cells are omitted

In [None]:
# transform testing data using fitted vocabulary into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

## Model Building

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, plot_confusion_matrix, classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
import pickle

## Logistic Reg. Model

In [None]:
lr = LogisticRegression()

%time lr.fit(X_train_dtm, y_train)


In [None]:
# Predict the class of every posting in the test document-term matrix
y_pred_lr = lr.predict(X_test_dtm)

# Compute the evaluation metrics first, then print them
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_report = classification_report(y_test, y_pred_lr)
lr_confusion = confusion_matrix(y_test, y_pred_lr)

print("Classification Accuracy:", lr_accuracy)
print("Classification Report\n")
print(lr_report)
print("Confusion Matrix\n")
print(lr_confusion)

In [None]:
# Confusion matrix of the logistic regression predictions as a heat map

import seaborn as sns

cm = confusion_matrix(y_test, y_pred_lr)

plt.figure(figsize=(10, 7))
# heatmap draws on the current figure and returns the axes it used
heatmap_ax = sns.heatmap(data=cm, annot=True, fmt='d')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Truth')

## Naive Bayes Classifier

In [None]:
# Naive Bayes Classifier
# We use the Multinomial Naive Bayes variant because the features are word counts taken from long text documents;
# for count data of this kind the multinomial variant is generally the advisable choice.
# %time reports the time the system takes to execute the statement

nb = MultinomialNB()
%time nb.fit(X_train_dtm, y_train)

In [None]:
# Predict the class of every posting in the test document-term matrix
y_pred_nb = nb.predict(X_test_dtm)

# Model accuracy, per-class report and confusion matrix
print("Classification Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Classification Report\n")
print(classification_report(y_test, y_pred_nb))
print("Confusion Matrix\n")
print(confusion_matrix(y_test, y_pred_nb))


# Same confusion matrix drawn as a heat map
cm = confusion_matrix(y_test,y_pred_nb)

plt.figure(figsize = (10,7))
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')

## Linear SVC (Liblinear lib.)

In [None]:
lrsvc = LinearSVC()
%time lrsvc.fit(X_train_dtm, y_train)

In [None]:
# Predict the class of every posting in the test document-term matrix
y_pred_lrsvc = lrsvc.predict(X_test_dtm)

# Compute the evaluation metrics first, then print them
lrsvc_accuracy = accuracy_score(y_test, y_pred_lrsvc)
lrsvc_report = classification_report(y_test, y_pred_lrsvc)
lrsvc_confusion = confusion_matrix(y_test, y_pred_lrsvc)

print("Classification Accuracy:", lrsvc_accuracy)
print("Classification Report\n")
print(lrsvc_report)
print("Confusion Matrix\n")
print(lrsvc_confusion)

In [None]:
# Confusion matrix of the linear SVC predictions as a heat map

import seaborn as sns

cm = confusion_matrix(y_test, y_pred_lrsvc)

plt.figure(figsize=(10, 7))
# heatmap draws on the current figure and returns the axes it used
heatmap_ax = sns.heatmap(data=cm, annot=True, fmt='d')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Truth')

## RandomForestClassifier

In [None]:
rf = RandomForestClassifier()
%time rf.fit(X_train_dtm, y_train)

In [None]:
# Predict the class of every posting in the test document-term matrix
y_pred_rf = rf.predict(X_test_dtm)

# Compute the evaluation metrics first, then print them
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)

print("Classification Accuracy:", rf_accuracy)
print("Classification Report\n")
print(rf_report)
print("Confusion Matrix\n")
print(rf_confusion)

## Decision Tree Classifier

In [None]:
#instantiate a Decision Tree Classifier
dt = DecisionTreeClassifier()

#train the model 
# using X_train_dtm (timing it with an IPython "magic command")
%time dt.fit(X_train_dtm, y_train)

In [None]:
# Predict the class of every posting in the test document-term matrix
y_pred_class = dt.predict(X_test_dtm)

# Compute the evaluation metrics first, then print them
dt_accuracy = accuracy_score(y_test, y_pred_class)
dt_report = classification_report(y_test, y_pred_class)
dt_confusion = confusion_matrix(y_test, y_pred_class)

print("Classification Accuracy:", dt_accuracy)
print("Classification Report\n")
print(dt_report)
print("Confusion Matrix\n")
print(dt_confusion)

In [None]:
# Confusion matrix of the decision tree predictions as a heat map

import seaborn as sns

cm = confusion_matrix(y_test, y_pred_class)

plt.figure(figsize=(10, 7))
# heatmap draws on the current figure and returns the axes it used
heatmap_ax = sns.heatmap(data=cm, annot=True, fmt='d')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Truth')

## Building a Predictive System

In [None]:
# Predicting for a data set
X_train

In [None]:
print(X_train[184])

In [None]:
input_data=["internship (advertising) gr, , initiative led talented, energetic inspirational group young greek leaders called “global shapers athens hub”, umbrella world economic forum. nutshell, objective corporate-business community align global shapers order develop implement internship program, whereby companies commit taking board selected talented graduates 6-month internships – order offer work experience / skill building ultimately support development young talent enter local market. 6-month full-time paid internship position largest corporations greece. internship program includes classroom job training, team projects, networking profound business leaders greek job market, community service non-governmental organizations. information: #url_3a192fa44cc0cec563d796313a1fbbbaf5543bb685aa98e0143dc082adc1ab4f#candidates applying one job families, matched job family relevant studies/profile. bachelor's degree majors (aei, tei, college)limited working experiencefluent english working experience leading corporationsholistic classroom training personal developmenton job coaching specific projectsregular sessions senior leaders participating organizations marketing advertising advertising gr full-time"]

In [None]:
# Turn the raw text into a feature vector using the already-fitted vocabulary
input_data_features = vect.transform(input_data)

# Predict with the random forest model
prediction = rf.predict(input_data_features)
print(prediction)

# A predicted label of 1 is reported as fraudulent; anything else as real
verdict = 'Fraudulant Job' if prediction[0] == 1 else 'Real Job'
print(verdict)

In [None]:
# checking whether the predicted result was correct or not
print(y_train[184])

In [None]:
# converting the model into pickle file
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, plot_confusion_matrix, classification_report, confusion_matrix
from flask import Flask, render_template, request, jsonify, flash
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

# Train a random forest on the cleaned dataset and export the fitted
# vectorizer and model as pickle files.
# NOTE(review): nothing in the visible notebook writes 'clean_fakejobs.csv';
# confirm where this file is produced.
data = pd.read_csv('clean_fakejobs.csv')

# Splitting dataset in train and test
X_train, X_test, y_train, y_test = train_test_split(
    data.text, data.fraudulent, test_size=0.3)

# Converting the data into vector format
#  instantiate the vectorizer
vect = CountVectorizer()

# learn training data vocabulary, then use it to create a document-term matrix
# fit
vect.fit(X_train)

# transform training data
X_train_dtm = vect.transform(X_train)

X_test_dtm = vect.transform(X_test)

# instantiate a Random Forest classifier
rf = RandomForestClassifier()

clf = rf.fit(X_train_dtm, y_train)
y_pred = clf.predict(X_test_dtm)

# Save the vectorizer; the with-block closes the file even if pickling fails
vectfile = 'vectorizer.pkl'
with open(vectfile, 'wb') as vect_out:
    pickle.dump(vect, vect_out)

# Saving model to disk
with open('model.pkl', 'wb') as model_out:
    pickle.dump(clf, model_out)

# Load the model back from the file that was just written
with open('model.pkl', 'rb') as model_in:
    model = pickle.load(model_in)