In [1]:
import numpy as np
import pandas as pd
import random

# Load the job-postings table; the path is relative to the current working directory.
jobs = pd.read_csv(filepath_or_buffer='./fake-job-posting.csv')

In [2]:
# Let rendered outputs show up to 200 rows before pandas truncates them.
pd.options.display.max_rows = 200

In [3]:
# Column labels of the loaded postings frame.
jobs.columns

Index(['job_id', 'title', 'location', 'department', 'salary_range',
       'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent'],
      dtype='object')

In [4]:
# (rows, columns) of the frame as loaded, before any filtering.
jobs.shape

(17880, 18)

In [5]:
# Preview the first rows to see what each column holds.
jobs.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [6]:
# Per-column non-null counts and dtypes.
jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               17880 non-null  int64 
 1   title                17880 non-null  object
 2   location             17534 non-null  object
 3   department           6333 non-null   object
 4   salary_range         2868 non-null   object
 5   company_profile      14572 non-null  object
 6   description          17879 non-null  object
 7   requirements         15185 non-null  object
 8   benefits             10670 non-null  object
 9   telecommuting        17880 non-null  int64 
 10  has_company_logo     17880 non-null  int64 
 11  has_questions        17880 non-null  int64 
 12  employment_type      14409 non-null  object
 13  required_experience  10830 non-null  object
 14  required_education   9775 non-null   object
 15  industry             12977 non-null  object
 16  func

In [7]:
# Report, for every column, the share of rows whose value is missing.
nan_report_line = '{0:<20}\t Percent of NaN value: {1:.2f}%'
n_rows = jobs.shape[0]
for column_name in jobs.columns:
    n_missing = jobs[column_name].isnull().sum()
    print(nan_report_line.format(column_name, 100 * (n_missing / n_rows)))

job_id              	 Percent of NaN value: 0.00%
title               	 Percent of NaN value: 0.00%
location            	 Percent of NaN value: 1.94%
department          	 Percent of NaN value: 64.58%
salary_range        	 Percent of NaN value: 83.96%
company_profile     	 Percent of NaN value: 18.50%
description         	 Percent of NaN value: 0.01%
requirements        	 Percent of NaN value: 15.07%
benefits            	 Percent of NaN value: 40.32%
telecommuting       	 Percent of NaN value: 0.00%
has_company_logo    	 Percent of NaN value: 0.00%
has_questions       	 Percent of NaN value: 0.00%
employment_type     	 Percent of NaN value: 19.41%
required_experience 	 Percent of NaN value: 39.43%
required_education  	 Percent of NaN value: 45.33%
industry            	 Percent of NaN value: 27.42%
function            	 Percent of NaN value: 36.10%
fraudulent          	 Percent of NaN value: 0.00%


In [8]:
# Classification uses the job title, description, requirements and
# company_profile (all other columns are dropped later for evaluation), so
# discard every posting that is missing any of the three text fields below.
jobs = jobs.dropna(subset=['description', 'requirements', 'company_profile'])

In [9]:
# Shape after discarding rows with a missing description, requirements or company_profile.
jobs.shape

(12631, 18)

In [10]:
# Index labels of the remaining postings flagged as fraudulent (fraudulent == 1).
fake = jobs.loc[jobs['fraudulent'].eq(1)].index

In [11]:
# Number of fraudulent postings available for sampling.
fake.size

272

In [12]:
# Index labels of the remaining postings flagged as genuine (fraudulent == 0).
real = jobs.loc[jobs['fraudulent'].eq(0)].index

In [13]:
# Number of genuine postings available for sampling.
real.size

12359

In [14]:
# Draw an equal number of real and fake postings (200 each) as a balanced pool;
# the 100 postings for evaluation are drawn from this pool in a later cell.
#
# The module-level generator is seeded first: without a fixed seed, every
# kernel restart selects different postings, so the exported evaluation set
# could not be regenerated.
SAMPLE_SEED = 42
PER_CLASS_SAMPLE_SIZE = 200

random.seed(SAMPLE_SEED)
real_random = random.sample(list(real), PER_CLASS_SAMPLE_SIZE)
fake_random = random.sample(list(fake), PER_CLASS_SAMPLE_SIZE)

In [15]:
# Pool the two draws into one list: real index labels first, then fake ones.
random_sample = [*real_random, *fake_random]

In [16]:
# Sanity check: the pool should hold 200 + 200 = 400 index labels.
len(random_sample)

400

In [17]:
# Draw the 100 postings to evaluate from the pool of 200 real + 200 fake postings.
# A dedicated, seeded generator makes this draw repeatable for a given pool,
# independent of how much of the global `random` state was consumed earlier.
# NOTE: the draw is not stratified, so the real/fake split among the 100 is
# not guaranteed to be 50/50.
EVAL_SAMPLE_SEED = 2024
EVAL_SAMPLE_SIZE = 100

eval_sample = random.Random(EVAL_SAMPLE_SEED).sample(random_sample, EVAL_SAMPLE_SIZE)

In [18]:
# Pull the sampled postings out of the filtered frame, in sampled order.
subset = jobs.loc[eval_sample, :]

In [19]:
# subset

In [20]:
# Keep only the posting id, the four text fields used for evaluation (title,
# company_profile, description, requirements) and the label.
# Selecting the columns to keep, instead of dropping the rest, makes this cell
# safe to re-run (a second drop would raise KeyError on the already-removed
# columns) and removes the duplicated 'telecommuting' entry from the old list.
EVAL_COLUMNS = ['job_id', 'title', 'company_profile', 'description',
                'requirements', 'fraudulent']
subset = subset[EVAL_COLUMNS]

In [21]:
# Inspect the sampled postings with only the retained columns (label still present).
subset 

Unnamed: 0,job_id,title,company_profile,description,requirements,fraudulent
11572,11573,Director of Engineering,Aptitude Staffing Solutions has redesigned the...,Reporting to the VP of Service Provider Engine...,Skills and Requirements: MSEE with five years ...,1
8692,8693,Payroll Clerk,At DynCorp International it is our commitment ...,Payroll Clerk Job Purpose: Responsible for Com...,Required Tools Needed below:1. A good computer...,1
5808,5809,Employee Relations Specialist,As the industry’s largest supply contracting c...,Act as a first point of contact for employee-r...,Minimum of three (3) years of work Novationoff...,1
1405,1406,Software Engineer / PHP Developer,PINT is a leader in the web industry with 20 y...,PINT is seeking an associate software engineer...,BS and/or Masters in CS or equivalent work exp...,0
1960,1961,Mobile Developer-Back End,Aptitude Staffing Solutions has redesigned the...,"Managing over 200 TB of data, including 34 tri...",What You Offer:Experience in writing clean and...,1
6788,6789,M14 5HT Business Admin Apprenticeship availabl...,Established on the principles that full time e...,This is fantastic opportunity for someone want...,Governmnt funding is only available for 16-18 ...,0
2368,2369,Director of Product Marketing for Advertisers,Aptitude Staffing Solutions has redesigned the...,Seeking a Director of Product Marketing to pos...,Skills and Requirements:BS/BA or equivalent fr...,1
17391,17392,Associate Product Manager,Want to build a 21st century financial service...,"TransferWise is a VC-backed, international mon...",2+ years of product management experienceAnaly...,0
15395,15396,Blipp Architect (Full Stack Developer),"As augmented reality jobs go, one at Blippar i...",We’re looking for phenomenal full-stack develo...,"We need bright, engaging individuals who are e...",0
7998,7999,Graduates: English Teacher Abroad (Conversatio...,We help teachers get safe &amp; secure jobs ab...,"Play with kids, get paid for it Love travel? J...",University degree required. TEFL / TESOL / CEL...,0


In [22]:
# Hide the ground-truth label so evaluators cannot see it.
human_eval = subset.drop(columns='fraudulent')

In [23]:
# Inspect the frame handed to the evaluators (no `fraudulent` column).
human_eval

Unnamed: 0,job_id,title,company_profile,description,requirements
11572,11573,Director of Engineering,Aptitude Staffing Solutions has redesigned the...,Reporting to the VP of Service Provider Engine...,Skills and Requirements: MSEE with five years ...
8692,8693,Payroll Clerk,At DynCorp International it is our commitment ...,Payroll Clerk Job Purpose: Responsible for Com...,Required Tools Needed below:1. A good computer...
5808,5809,Employee Relations Specialist,As the industry’s largest supply contracting c...,Act as a first point of contact for employee-r...,Minimum of three (3) years of work Novationoff...
1405,1406,Software Engineer / PHP Developer,PINT is a leader in the web industry with 20 y...,PINT is seeking an associate software engineer...,BS and/or Masters in CS or equivalent work exp...
1960,1961,Mobile Developer-Back End,Aptitude Staffing Solutions has redesigned the...,"Managing over 200 TB of data, including 34 tri...",What You Offer:Experience in writing clean and...
6788,6789,M14 5HT Business Admin Apprenticeship availabl...,Established on the principles that full time e...,This is fantastic opportunity for someone want...,Governmnt funding is only available for 16-18 ...
2368,2369,Director of Product Marketing for Advertisers,Aptitude Staffing Solutions has redesigned the...,Seeking a Director of Product Marketing to pos...,Skills and Requirements:BS/BA or equivalent fr...
17391,17392,Associate Product Manager,Want to build a 21st century financial service...,"TransferWise is a VC-backed, international mon...",2+ years of product management experienceAnaly...
15395,15396,Blipp Architect (Full Stack Developer),"As augmented reality jobs go, one at Blippar i...",We’re looking for phenomenal full-stack develo...,"We need bright, engaging individuals who are e..."
7998,7999,Graduates: English Teacher Abroad (Conversatio...,We help teachers get safe &amp; secure jobs ab...,"Play with kids, get paid for it Love travel? J...",University degree required. TEFL / TESOL / CEL...


In [25]:
# Export the 100 unlabeled postings for the project members to evaluate.
# The file is comma-separated (the pandas default) and written as UTF-8 with a
# BOM; the frame's index is not written.
human_eval.to_csv(
    'human_eval.csv',
    encoding='utf-8-sig',
    index=False,
)

In [27]:
# len(subset[subset['fraudulent'] == 1].index)