#### Import Libraries Needed

In [1]:
# Import the libraries needed
import pandas as pd

#### Exploratory Data Analysis

In [2]:
# Load the dataset
jobs_df = pd.read_csv("emscad_v1.csv")

In [3]:
# View the dataset
jobs_df.head(5)

Unnamed: 0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,in_balanced_dataset
0,Marketing Intern,"US, NY, New York",Marketing,,"<h3>We're Food52, and we've created a groundbr...","<p>Food52, a fast-growing, James Beard Award-w...",<ul>\r\n<li>Experience with content management...,,f,t,f,Other,Internship,,,Marketing,f,f
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"<h3>90 Seconds, the worlds Cloud Video Product...",<p>Organised - Focused - Vibrant - Awesome!<br...,<p><b>What we expect from you:</b></p>\r\n<p>Y...,<h3><b>What you will get from us</b></h3>\r\n<...,f,t,f,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,f,f
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,<h3></h3>\r\n<p>Valor Services provides Workfo...,"<p>Our client, located in Houston, is actively...",<ul>\r\n<li>Implement pre-commissioning and co...,,f,t,f,,,,,,f,f
3,Account Executive - Washington DC,"US, DC, Washington",Sales,,<p>Our passion for improving quality of life t...,<p><b>THE COMPANY: ESRI – Environmental System...,<ul>\r\n<li>\r\n<b>EDUCATION: </b>Bachelor’s o...,<p>Our culture is anything but corporate—we ha...,f,t,f,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,f,f
4,Bill Review Manager,"US, FL, Fort Worth",,,<p>SpotSource Solutions LLC is a Global Human ...,<p><b>JOB TITLE:</b> Itemization Review Manage...,<p><b>QUALIFICATIONS:</b></p>\r\n<ul>\r\n<li>R...,<p>Full Benefits Offered</p>,f,t,t,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,f,f


In [4]:
# Look at the columns involved
jobs_df.columns

Index(['title', 'location', 'department', 'salary_range', 'company_profile',
       'description', 'requirements', 'benefits', 'telecommuting',
       'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent', 'in_balanced_dataset'],
      dtype='object')

In [5]:
jobs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
title                  17880 non-null object
location               17534 non-null object
department             6333 non-null object
salary_range           2868 non-null object
company_profile        14572 non-null object
description            17880 non-null object
requirements           15191 non-null object
benefits               10684 non-null object
telecommuting          17880 non-null object
has_company_logo       17880 non-null object
has_questions          17880 non-null object
employment_type        14409 non-null object
required_experience    10830 non-null object
required_education     9775 non-null object
industry               12977 non-null object
function               11425 non-null object
fraudulent             17880 non-null object
in_balanced_dataset    17880 non-null object
dtypes: object(18)
memory usage: 2.5+ MB


In [6]:
# Check for missing values
jobs_df.isnull().sum()

title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                0
requirements            2689
benefits                7196
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
in_balanced_dataset        0
dtype: int64

#### Data Wrangling

In [7]:
# Deal with missing values and drop unneccessary columns

# Location missing values will be assigned none
jobs_df['location'] = jobs_df.location.fillna('none')

# department missing values will be assigned not specified
jobs_df['department'] = jobs_df.department.fillna('not specified')

# drop salary range, benefits, telecommuting, has_questions (not compulsory) in the context of Nigeria
jobs_df.drop(['salary_range', 'benefits','telecommuting', 'in_balanced_dataset','has_questions'],
             axis=1, inplace=True)

# Company profile missing values will be assigned none
jobs_df['company_profile'] = jobs_df.company_profile.fillna('none')

# Company profile missing values will be assigned not specified
jobs_df['requirements'] = jobs_df.requirements.fillna('not specified')

# employment_type missing values will be assigned not specified
jobs_df['employment_type'] = jobs_df.employment_type.fillna('not specified')

# required_experience missing values will be assigned not specified
jobs_df['required_experience'] = jobs_df.required_experience.fillna('not specified')

# required_education missing values will be assigned not specified
jobs_df['required_education'] = jobs_df.required_education.fillna('not specified')

# industry missing values will be assigned not specified
jobs_df['industry'] = jobs_df.industry.fillna('not specified')

# function missing values will be assigned not specified
jobs_df['function'] = jobs_df.function.fillna('not specified')

In [8]:
# Confirm that missing values have been dealt with
jobs_df.isnull().sum()

title                  0
location               0
department             0
company_profile        0
description            0
requirements           0
has_company_logo       0
employment_type        0
required_experience    0
required_education     0
industry               0
function               0
fraudulent             0
dtype: int64

In [9]:
jobs_df.head()


Unnamed: 0,title,location,department,company_profile,description,requirements,has_company_logo,employment_type,required_experience,required_education,industry,function,fraudulent
0,Marketing Intern,"US, NY, New York",Marketing,"<h3>We're Food52, and we've created a groundbr...","<p>Food52, a fast-growing, James Beard Award-w...",<ul>\r\n<li>Experience with content management...,t,Other,Internship,not specified,not specified,Marketing,f
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,"<h3>90 Seconds, the worlds Cloud Video Product...",<p>Organised - Focused - Vibrant - Awesome!<br...,<p><b>What we expect from you:</b></p>\r\n<p>Y...,t,Full-time,Not Applicable,not specified,Marketing and Advertising,Customer Service,f
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever",not specified,<h3></h3>\r\n<p>Valor Services provides Workfo...,"<p>Our client, located in Houston, is actively...",<ul>\r\n<li>Implement pre-commissioning and co...,t,not specified,not specified,not specified,not specified,not specified,f
3,Account Executive - Washington DC,"US, DC, Washington",Sales,<p>Our passion for improving quality of life t...,<p><b>THE COMPANY: ESRI – Environmental System...,<ul>\r\n<li>\r\n<b>EDUCATION: </b>Bachelor’s o...,t,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,f
4,Bill Review Manager,"US, FL, Fort Worth",not specified,<p>SpotSource Solutions LLC is a Global Human ...,<p><b>JOB TITLE:</b> Itemization Review Manage...,<p><b>QUALIFICATIONS:</b></p>\r\n<ul>\r\n<li>R...,t,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,f


In [10]:
# Remove html tags
jobs_df['company_profile'] = jobs_df.company_profile.str.replace(r'<[^>]*>', '')
jobs_df['description'] = jobs_df.description.str.replace(r'<[^>]*>', '')
jobs_df['requirements'] = jobs_df.requirements.str.replace(r'<[^>]*>', '')

In [11]:
jobs_df.head()

Unnamed: 0,title,location,department,company_profile,description,requirements,has_company_logo,employment_type,required_experience,required_education,industry,function,fraudulent
0,Marketing Intern,"US, NY, New York",Marketing,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",\r\nExperience with content management systems...,t,Other,Internship,not specified,not specified,Marketing,f
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:\r\nYour key responsib...,t,Full-time,Not Applicable,not specified,Marketing and Advertising,Customer Service,f
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever",not specified,\r\nValor Services provides Workforce Solution...,"Our client, located in Houston, is actively se...",\r\nImplement pre-commissioning and commission...,t,not specified,not specified,not specified,not specified,not specified,f
3,Account Executive - Washington DC,"US, DC, Washington",Sales,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,\r\n\r\nEDUCATION: Bachelor’s or Master’s in G...,t,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,f
4,Bill Review Manager,"US, FL, Fort Worth",not specified,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review Manager\r\nLOCAT...,QUALIFICATIONS:\r\n\r\nRN license in the State...,t,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,f


In [12]:
# Remove non word characters and trailing white space 
for column in jobs_df.columns:
    jobs_df[column] = jobs_df[column].str.replace(r'\W', ' ').str.replace(r'\s$','')


#### Vectorization of text columns

In [13]:
jobs_df.head()

Unnamed: 0,title,location,department,company_profile,description,requirements,has_company_logo,employment_type,required_experience,required_education,industry,function,fraudulent
0,Marketing Intern,US NY New York,Marketing,We re Food52 and we ve created a groundbreaki...,Food52 a fast growing James Beard Award winn...,Experience with content management systems a...,t,Other,Internship,not specified,not specified,Marketing,f
1,Customer Service Cloud Video Production,NZ Auckland,Success,90 Seconds the worlds Cloud Video Production ...,Organised Focused Vibrant Awesome Do you...,What we expect from you Your key responsibil...,t,Full time,Not Applicable,not specified,Marketing and Advertising,Customer Service,f
2,Commissioning Machinery Assistant CMA,US IA Wever,not specified,Valor Services provides Workforce Solutions ...,Our client located in Houston is actively se...,Implement pre commissioning and commissionin...,t,not specified,not specified,not specified,not specified,not specified,f
3,Account Executive Washington DC,US DC Washington,Sales,Our passion for improving quality of life thro...,THE COMPANY ESRI Environmental Systems Rese...,EDUCATION Bachelor s or Master s in GIS ...,t,Full time,Mid Senior level,Bachelor s Degree,Computer Software,Sales,f
4,Bill Review Manager,US FL Fort Worth,not specified,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE Itemization Review Manager LOCATIO...,QUALIFICATIONS RN license in the State of ...,t,Full time,Mid Senior level,Bachelor s Degree,Hospital Health Care,Health Care Provider,f


In [14]:
# Encode binary columns t as 1 and f as 0
jobs_df['has_company_logo'] = jobs_df.has_company_logo.map({'t': 1, 'f':0})
jobs_df['fraudulent'] = jobs_df.fraudulent.map({'t': 1, 'f':0})

In [15]:
# TFIDF vectorization of text

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
vect = CountVectorizer()

In [18]:
# jobs_df["requirements"] = vect.fit_transform(jobs_df.requirements).toarray()

In [19]:
jobs_df

Unnamed: 0,title,location,department,company_profile,description,requirements,has_company_logo,employment_type,required_experience,required_education,industry,function,fraudulent
0,Marketing Intern,US NY New York,Marketing,We re Food52 and we ve created a groundbreaki...,Food52 a fast growing James Beard Award winn...,Experience with content management systems a...,1,Other,Internship,not specified,not specified,Marketing,0
1,Customer Service Cloud Video Production,NZ Auckland,Success,90 Seconds the worlds Cloud Video Production ...,Organised Focused Vibrant Awesome Do you...,What we expect from you Your key responsibil...,1,Full time,Not Applicable,not specified,Marketing and Advertising,Customer Service,0
2,Commissioning Machinery Assistant CMA,US IA Wever,not specified,Valor Services provides Workforce Solutions ...,Our client located in Houston is actively se...,Implement pre commissioning and commissionin...,1,not specified,not specified,not specified,not specified,not specified,0
3,Account Executive Washington DC,US DC Washington,Sales,Our passion for improving quality of life thro...,THE COMPANY ESRI Environmental Systems Rese...,EDUCATION Bachelor s or Master s in GIS ...,1,Full time,Mid Senior level,Bachelor s Degree,Computer Software,Sales,0
4,Bill Review Manager,US FL Fort Worth,not specified,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE Itemization Review Manager LOCATIO...,QUALIFICATIONS RN license in the State of ...,1,Full time,Mid Senior level,Bachelor s Degree,Hospital Health Care,Health Care Provider,0
5,Accounting Clerk,US MD,not specified,none,Job Overview Apex is an environmental consult...,not specified,0,not specified,not specified,not specified,not specified,not specified,0
6,Head of Content m f,DE BE Berlin,ANDROIDPIT,Founded in 2009 the Fonpit AG rose with its i...,Your Responsibilities Manage the Englis...,Your Know How ...,1,Full time,Mid Senior level,Master s Degree,Online Media,Management,0
7,Lead Guest Service Specialist,US CA San Francisco,not specified,Airenvy s mission is to provide lucrative yet ...,Who is Airenvy Hey there We are seasoned en...,Experience with CRM software live chat and...,1,not specified,not specified,not specified,not specified,not specified,0
8,HP BSM SME,US FL Pensacola,not specified,Solutions3 is a woman owned small business who...,Implementation Configuration Testing Train...,MUST BE A US CITIZEN An active TS SCI cleara...,1,Full time,Associate,not specified,Information Technology and Services,not specified,0
9,Customer Service Associate Part Time,US AZ Phoenix,not specified,Novitex Enterprise Solutions formerly Pitney ...,The Customer Service Associate will be based i...,Minimum Requirements Minimum of 6 months c...,1,Part time,Entry level,High School or equivalent,Financial Services,Customer Service,0


In [20]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re

In [21]:
stop_words = stopwords.words('english')
stemmer = SnowballStemmer('english')

In [22]:
jobs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 13 columns):
title                  17880 non-null object
location               17880 non-null object
department             17880 non-null object
company_profile        17880 non-null object
description            17880 non-null object
requirements           17880 non-null object
has_company_logo       17880 non-null int64
employment_type        17880 non-null object
required_experience    17880 non-null object
required_education     17880 non-null object
industry               17880 non-null object
function               17880 non-null object
fraudulent             17880 non-null int64
dtypes: int64(2), object(11)
memory usage: 1.8+ MB


In [23]:
def clean_series(column_name):
    jobs_df[column_name] = jobs_df[column_name].apply(lambda x: " ".join([i for i in x.lower().split() if i not in stop_words]))

In [24]:
text_columns = ['title', 'location', 'department', 'company_profile', 'description', 'requirements','employment_type','required_experience','required_education','industry','function']
for column in text_columns:
    clean_series(column)

In [25]:
jobs_df

Unnamed: 0,title,location,department,company_profile,description,requirements,has_company_logo,employment_type,required_experience,required_education,industry,function,fraudulent
0,marketing intern,us ny new york,marketing,food52 created groundbreaking award winning co...,food52 fast growing james beard award winning ...,experience content management systems major pl...,1,,internship,specified,specified,marketing,0
1,customer service cloud video production,nz auckland,success,90 seconds worlds cloud video production servi...,organised focused vibrant awesome passion cust...,expect key responsibility communicate client 9...,1,full time,applicable,specified,marketing advertising,customer service,0
2,commissioning machinery assistant cma,us ia wever,specified,valor services provides workforce solutions me...,client located houston actively seeking experi...,implement pre commissioning commissioning proc...,1,specified,specified,specified,specified,specified,0
3,account executive washington dc,us dc washington,sales,passion improving quality life geography heart...,company esri environmental systems research in...,education bachelor master gis business adminis...,1,full time,mid senior level,bachelor degree,computer software,sales,0
4,bill review manager,us fl fort worth,specified,spotsource solutions llc global human capital ...,job title itemization review manager location ...,qualifications rn license state texas diploma ...,1,full time,mid senior level,bachelor degree,hospital health care,health care provider,0
5,accounting clerk,us md,specified,none,job overview apex environmental consulting fir...,specified,0,specified,specified,specified,specified,specified,0
6,head content f,de berlin,androidpit,founded 2009 fonpit ag rose international web ...,responsibilities manage english speaking edito...,know university college degree journalism medi...,1,full time,mid senior level,master degree,online media,management,0
7,lead guest service specialist,us ca san francisco,specified,airenvy mission provide lucrative yet hassle f...,airenvy hey seasoned entrepreneurs heart san f...,experience crm software live chat phones inclu...,1,specified,specified,specified,specified,specified,0
8,hp bsm sme,us fl pensacola,specified,solutions3 woman owned small business whose fo...,implementation configuration testing training ...,must us citizen active ts sci clearance requir...,1,full time,associate,specified,information technology services,specified,0
9,customer service associate part time,us az phoenix,specified,novitex enterprise solutions formerly pitney b...,customer service associate based phoenix az ri...,minimum requirements minimum 6 months customer...,1,part time,entry level,high school equivalent,financial services,customer service,0


In [26]:
# creates a function that will use TFIDF vectorizer to vectorize all text data
from sklearn.feature_extraction.text import TfidfVectorizer

# initialize the vectorizer
vectorizer = TfidfVectorizer()

def vectorize_data(column):
    jobs_df[column] = list(vectorizer.fit_transform(jobs_df[column]))
text_columns = ['title','location', 'department', 'company_profile', 'description', 'requirements','employment_type','required_experience','required_education','industry','function']

# Loop through the text columns and vectorize each
for column in text_columns:
    vectorize_data(column)

In [27]:
jobs_df.head()

Unnamed: 0,title,location,department,company_profile,description,requirements,has_company_logo,employment_type,required_experience,required_education,industry,function,fraudulent
0,"(0, 2573)\t0.6288422241252729\n (0, 2180)\t...","(0, 2110)\t0.21706654053831323\n (0, 1544)\...","(0, 603)\t1.0","(0, 5099)\t0.2524693768126376\n (0, 3140)\t...","(0, 12440)\t0.3870868746517863\n (0, 11835)...","(0, 7568)\t0.02432877660181197\n (0, 4608)\...",1,,"(0, 5)\t1.0","(0, 15)\t1.0","(0, 169)\t1.0","(0, 29)\t1.0",0
1,"(0, 1139)\t0.29957132073424836\n (0, 3732)\...","(0, 1547)\t0.6854601229029301\n (0, 204)\t0...","(0, 898)\t1.0","(0, 9865)\t0.06223663516589484\n (0, 9711)\...","(0, 11835)\t0.030987910065148127\n (0, 1370...","(0, 7568)\t0.06319480174944025\n (0, 12154)...",1,"(0, 1)\t0.7236816152835696\n (0, 5)\t0.6901...","(0, 0)\t1.0","(0, 15)\t1.0","(0, 105)\t0.7071067811865476\n (0, 2)\t0.70...","(0, 12)\t0.7071067811865475\n (0, 43)\t0.70...",0
2,"(0, 976)\t0.5522509653607304\n (0, 2519)\t0...","(0, 2110)\t0.13262695274623532\n (0, 1022)\...","(0, 865)\t1.0","(0, 8413)\t0.053307093398233744\n (0, 5481)...","(0, 27503)\t0.0998738183938091\n (0, 6133)\...","(0, 4191)\t0.07551776042991121\n (0, 19583)...",1,"(0, 3)\t1.0","(0, 9)\t1.0","(0, 15)\t1.0","(0, 169)\t1.0","(0, 44)\t1.0",0
3,"(0, 188)\t0.3359182045580619\n (0, 1547)\t0...","(0, 2110)\t0.20303951769611242\n (0, 638)\t...","(0, 818)\t1.0","(0, 5516)\t0.10715105551850897\n (0, 4579)\...","(0, 21801)\t0.02912913550820874\n (0, 34191...","(0, 7568)\t0.16469734074378767\n (0, 7448)\...",1,"(0, 1)\t0.7236816152835696\n (0, 5)\t0.6901...","(0, 7)\t0.6173605717152858\n (0, 8)\t0.6173...","(0, 1)\t0.7273530111753033\n (0, 6)\t0.6862...","(0, 29)\t0.6923313651264267\n (0, 167)\t0.7...","(0, 41)\t1.0",0
4,"(0, 595)\t0.6912149302348252\n (0, 3526)\t0...","(0, 2110)\t0.14120914071134305\n (0, 798)\t...","(0, 865)\t1.0","(0, 3140)\t0.05912221605528912\n (0, 2000)\...","(0, 15957)\t0.07857712466263424\n (0, 30256...","(0, 7568)\t0.09861152468320698\n (0, 4169)\...",1,"(0, 1)\t0.7236816152835696\n (0, 5)\t0.6901...","(0, 7)\t0.6173605717152858\n (0, 8)\t0.6173...","(0, 1)\t0.7273530111753033\n (0, 6)\t0.6862...","(0, 78)\t0.5904112938872235\n (0, 76)\t0.55...","(0, 23)\t0.5773502691896256\n (0, 8)\t0.577...",0
