In [1]:
import json as j # for parsing text in json before loading into dataframe 
import pandas as pd # for creating and manipulating the dataframes
from pandas.io.json import json_normalize #package for flattening json in pandas df

import re # REGEX for text
from nltk.corpus import stopwords # for removal of stop words
from nltk.stem import SnowballStemmer # for word stemming

from nltk import word_tokenize # for tokenization of words
from nltk.util import ngrams # for generating unigrams, bigrams, trigrams
import numpy as np # for working with matrices 
import matplotlib.pyplot as plt # for charting

from sklearn.pipeline import Pipeline # for creating an analysis pipeline
from sklearn.model_selection import train_test_split # for splitting dataset into training and testing
from sklearn.feature_extraction.text import TfidfVectorizer # for TF-IDF
from sklearn.cluster import KMeans #for generating document clusters
from sklearn.metrics import adjusted_rand_score #for generating random index
from sklearn.metrics import precision_recall_fscore_support #for evaluating model
from sklearn.svm import LinearSVC # for generating model
from sklearn.naive_bayes import MultinomialNB # for generating model
from sklearn.ensemble import RandomForestClassifier # for generating model

# Step 1: Parse and Load

In [2]:
#load first json object
with open('D:/CAPSTONE_NEW/indeed_com-jobs_us_deduped_n_merged_20170817_113447896253141.json', encoding="utf-8-sig") as data_file:
    json_data1 = j.load(data_file)

In [3]:
#put data file 1 into a pandas df
#parent node is 'page'
#print first 5 rows
jobs_data1 = json_normalize(json_data1['page'])
jobs_data1.head(5)

Unnamed: 0,pageurl,record.company_name,record.description,record.direct_apply_link_on_the_job_page,record.direct_link_to_the_job_page,record.location,record.postedAt,record.postedAtText,record.sub_domain,record.title,record.uniq_id
0,http://www.indeed.com/cmp/Tileland-Interior-De...,Tileland Interior Decor LLC,Office assistantResponsibilities include: Prep...,,,,2016-08-11 12:27:57 +0000,7 days ago,indeed.com,office assistant,f458526241a5e46962bad2a97d8da5d3
1,http://www.indeed.com/viewjob?jk=c5d36d1df92067d8,ASRC Federal,Financial Investigators - Asset Forfeiture -El...,http://www.indeed.com/rc/clk?jk=c5d36d1df92067...,http://www.indeed.com/rc/clk?jk=c5d36d1df92067...,,2016-08-01 12:27:57 +0000,17 days ago,indeed.com,Financial Investigators - Asset Forfeiture -El...,d47b5ae26d622ddb898cda70325236db
2,http://www.indeed.com/viewjob?jk=eb4d64f6aee216f0,"MSLA, A Medical Corporation",Employment Summary\nSummary of Job Function an...,http://www.indeed.com/rc/clk?jk=eb4d64f6aee216...,http://www.indeed.com/rc/clk?jk=eb4d64f6aee216...,,2016-08-02 12:27:57 +0000,16 days ago,indeed.com,Provider Network Development Manager,e91c1b43c93e7d78d4ea8694e19c3e79
3,http://www.indeed.com/viewjob?jk=b0b80ca728afedee,TriWest Healthcare Alliance,"Profile\n\n\nJob Summary\n\n\nSupervises, trai...",http://www.indeed.com/rc/clk?jk=b0b80ca728afed...,http://www.indeed.com/rc/clk?jk=b0b80ca728afed...,,2016-08-04 12:27:57 +0000,14 days ago,indeed.com,"Supervisor, DSPO (El Paso TX)",7b2c2fe8b0b241934465f53007c90944
4,http://www.indeed.com/viewjob?jk=50d5a5844e56e24b,Tyco,Position Summary:\nThe National Account Field ...,http://www.indeed.com/rc/clk?jk=50d5a5844e56e2...,http://www.indeed.com/rc/clk?jk=50d5a5844e56e2...,,2016-08-17 12:27:57 +0000,1 day ago,indeed.com,Commercial Sales Estimator (Proposal Support),9c864078cd298c8047a0e60e8ac63e3f


In [4]:
#load second json object
with open('D:/CAPSTONE_NEW/indeed_com-jobs_us_deduped_n_merged_20170817_113502269355122.json', encoding="utf-8-sig") as data_file:
    json_data2 = j.load(data_file)

In [5]:
#put data file 2 into a pandas df
#parent node is 'page'
#print first 5 rows
jobs_data2 = json_normalize(json_data2['page'])
jobs_data2.head(5)

Unnamed: 0,pageurl,record.company_name,record.description,record.direct_apply_link_on_the_job_page,record.direct_link_to_the_job_page,record.location,record.postedAt,record.postedAtText,record.sub_domain,record.title,record.uniq_id
0,http://www.indeed.com/viewjob?jk=3ff48bf17c8ba7d8,United States Postal Service,United States Postal Service\nExternal Publica...,,,,2016-09-30 08:32:27 +0000,2 days ago,indeed.com,HOLIDAY CLERK ASSISTANT,e21916b80e1a180d36acda730b3bb3c8
1,http://www.indeed.com/viewjob?jk=5a4065bc5279c2ed,The Home Depot,"Department Supervisors train, coach and develo...",http://www.indeed.com/rc/clk?jk=5a4065bc5279c2...,http://www.indeed.com/rc/clk?jk=5a4065bc5279c2...,,2016-09-02 08:32:27 +0000,30+ days ago,indeed.com,Department Supervisor,9080f8ba6a86029047743e1d7cf24e80
2,http://www.indeed.com/viewjob?jk=36a5566bd1b6e2a9,UNITED PARCEL SERVICE,"UPS is accepting applications for temporary, s...",http://www.indeed.com/rc/clk?jk=36a5566bd1b6e2...,http://www.indeed.com/rc/clk?jk=36a5566bd1b6e2...,,2016-09-22 08:32:27 +0000,10 days ago,indeed.com,Package Delivery Driver,b6b9d20292d8f226a045ed0811f81685
3,http://www.indeed.com/company/Stat-Staff-Profe...,Stat Staff Professionals Inc.,We're a staffing organization for RN's / Allie...,,,,2016-09-27 08:32:27 +0000,5 days ago,indeed.com,CST / ST - Surgical Tech,c838ad0ea83d3bde3186a480da7b2ce5
4,http://www.indeed.com/company/Walgreens/jobs/P...,Walgreens,Assists in the control of extra shifts and ove...,,,,2016-09-02 08:32:27 +0000,30+ days ago,indeed.com,Pharmacist,dc92c84e227efb05f48d240782ce7cab


In [6]:
#load second json object
with open('D:/CAPSTONE_NEW/indeed_com-jobs_us_deduped_n_merged_20170817_113515380318786.json', encoding="utf-8-sig") as data_file:
    json_data3 = j.load(data_file)

In [7]:
#put data file 3 into a pandas df
#parent node is 'page'
#print first 5 rows
jobs_data3 = json_normalize(json_data3['page'])
jobs_data3.head(5)

Unnamed: 0,pageurl,record.company_name,record.description,record.direct_apply_link_on_the_job_page,record.direct_link_to_the_job_page,record.location,record.postedAt,record.postedAtText,record.sub_domain,record.title,record.uniq_id
0,http://www.indeed.com/viewjob?jk=be7edbb74b6be0bb,Hilton Garden Inn,"Prepare ingredients for cooking, including por...",http://www.indeed.com/rc/clk?jk=be7edbb74b6be0...,http://www.indeed.com/rc/clk?jk=be7edbb74b6be0...,,2016-08-13 08:31:42 +0000,30+ days ago,indeed.com,Banquet Cook,1e8651519814524b570a34c577b5e1a3
1,http://www.indeed.com/viewjob?jk=67ff3f5c954799ca,Onepath,Job Description\nOnepath is seeking skilled lo...,http://www.indeed.com/rc/clk?jk=67ff3f5c954799...,http://www.indeed.com/rc/clk?jk=67ff3f5c954799...,,2016-08-27 08:31:42 +0000,16 days ago,indeed.com,Alarm Installation Technician,ea94198a6d1d55d6070dc941f17f1eba
2,http://www.indeed.com/cmp/Doss-Logistics-Inc./...,Doss Logistics Inc.,Doss Logistics is hiring for a OTR Truck Drive...,,,,2016-09-03 08:31:42 +0000,9 days ago,indeed.com,OTR Truck Driver (Home every weekend),bce1f30c1f041a2ecadec346831a3ecd
3,http://www.indeed.com/viewjob?jk=0485d12ddd522e15,Clovis Community Medical Center,Job Title: Infection Control Spec Fullt Time 8...,http://www.indeed.com/rc/clk?jk=0485d12ddd522e...,http://www.indeed.com/rc/clk?jk=0485d12ddd522e...,,2016-08-13 08:31:42 +0000,30+ days ago,indeed.com,Infection Control Spec Fullt Time 8 Hour Days,c382d4dc18ea3589a16b84861d47e483
4,http://www.indeed.com/viewjob?jk=72ce244fc23fc878,City of Fresno,The City of Fresno is currently accepting appl...,http://www.indeed.com/rc/clk?jk=72ce244fc23fc8...,http://www.indeed.com/rc/clk?jk=72ce244fc23fc8...,,2016-09-08 08:31:42 +0000,4 days ago,indeed.com,Industrial Electrician,046a4f5a8423554ae0aca69ff2cb91b9


In [8]:
#load second json object
with open('D:/CAPSTONE_NEW/indeed_com-jobs_us_deduped_n_merged_20170817_113529266152000.json', encoding="utf-8-sig") as data_file:
    json_data4 = j.load(data_file)

In [9]:
#put data file 4 into a pandas df
#parent node is 'page'
#print first 5 rows
jobs_data4 = json_normalize(json_data4['page'])
jobs_data4.head(5)

Unnamed: 0,pageurl,record.company_name,record.description,record.direct_apply_link_on_the_job_page,record.direct_link_to_the_job_page,record.location,record.postedAt,record.postedAtText,record.sub_domain,record.title,record.uniq_id
0,http://www.indeed.com/viewjob?jk=b4c7156c7d1cce8c,Academy Sports + Outdoors,Team Sports Sales Associate\n\nDescription\n\n...,http://www.indeed.com/rc/clk?jk=b4c7156c7d1cce...,http://www.indeed.com/rc/clk?jk=b4c7156c7d1cce...,"Tuscaloosa, AL",2016-11-08 00:27:12 +0000,22 hours ago,indeed.com,Team Sports Sales Associate,33f7c38ef0c59b051085d693b0bcd4f3
1,http://www.indeed.com/viewjob?jk=13612cef68d6fc1d,Wells Fargo,"Job Description\n\n\nAt Wells Fargo, our visio...",http://www.indeed.com/rc/clk?jk=13612cef68d6fc...,http://www.indeed.com/rc/clk?jk=13612cef68d6fc...,"Forest, VA",2016-11-08 13:27:12 +0000,9 hours ago,indeed.com,TELLER,77d8ec26352b981399b9cd87d2c7a194
2,http://www.indeed.com/viewjob?jk=1969d1d3e19628cb,Interactions,Provides a comprehensive event marketing strat...,http://www.indeed.com/rc/clk?jk=1969d1d3e19628...,http://www.indeed.com/rc/clk?jk=1969d1d3e19628...,"Hanover, PA",2016-11-07 22:27:12 +0000,1 day ago,indeed.com,DI Special Services Sales Advisor - Hanover Ma...,ee9477f210efa6b2b38f2d6f05b62cff
3,http://www.indeed.com/company/College-Chefs/jo...,College Chefs,"Progressive, rapidly expanding company looking...",,,"Tuscaloosa, AL",2016-10-09 22:27:12 +0000,30+ days ago,indeed.com,Executive Chef,c56653b7e5d6d0095cec262a02630053
4,http://www.indeed.com/viewjob?jk=dad38502b724b95a,Randall Reilly,Our Sales Division is looking for a dynamic sa...,http://www.indeed.com/rc/clk?jk=dad38502b724b9...,http://www.indeed.com/rc/clk?jk=dad38502b724b9...,"Tuscaloosa, AL",2016-11-08 16:27:12 +0000,6 hours ago,indeed.com,Sales National Acct Executive,344f7f7fd3e11c4a4968adda502e21b8


In [10]:
#merge jobs dataframes
full_data = pd.concat([jobs_data1, jobs_data2, jobs_data3, jobs_data4], ignore_index=True)

In [11]:
full_data.dtypes

pageurl                                     object
record.company_name                         object
record.description                          object
record.direct_apply_link_on_the_job_page    object
record.direct_link_to_the_job_page          object
record.location                             object
record.postedAt                             object
record.postedAtText                         object
record.sub_domain                           object
record.title                                object
record.uniq_id                              object
dtype: object

In [12]:
#save clean dataframe to disk
full_data.to_pickle('D:/CAPSTONE_NEW/jobs_data_full.pkl')

# Step 2: Text Pre-Processing

In [13]:
#load pickled dataframe
jobs_data = pd.read_pickle('D:/CAPSTONE_NEW/jobs_data_full.pkl')

In [14]:
jobs_data.nunique()

pageurl                                     200444
record.company_name                          44951
record.description                          161355
record.direct_apply_link_on_the_job_page    158999
record.direct_link_to_the_job_page          158999
record.location                              13371
record.postedAt                              79704
record.postedAtText                             81
record.sub_domain                                1
record.title                                113237
record.uniq_id                              200444
dtype: int64

In [47]:
#determine 50 most frequent job titles
freq_titles1 = jobs_data.groupby(['record.title']).size().reset_index(name='counts').sort_values('counts', ascending=False).head(50)
freq_titles1

Unnamed: 0,record.title,counts
68729,Physical Therapist,317
24229,Customer Service Representative,247
3362,Administrative Assistant,229
15732,Case Manager,224
73424,Project Manager,217
53322,Maintenance Technician,196
42059,Help Wanted,185
24307,Customer Service Representative - State Farm A...,178
2163,Account Associate - State Farm Agent Team Member,175
60890,Occupational Therapist,171


In [15]:
jobs_data.tail()

Unnamed: 0,pageurl,record.company_name,record.description,record.direct_apply_link_on_the_job_page,record.direct_link_to_the_job_page,record.location,record.postedAt,record.postedAtText,record.sub_domain,record.title,record.uniq_id
200439,http://www.indeed.com/viewjob?jk=eb30c82a2581c715,SERG Restaurant Group,Responsible for serving both alcoholic & non-a...,,,,2016-08-14 14:43:52 +0000,30+ days ago,indeed.com,Cocktail Server - Poseidon & The Rooftop Bar,35284039646bd91580494441c918b1c1
200440,http://www.indeed.com/viewjob?jk=ebb9dc956329c930,Michaels,Business Unit :\n\n\nMichaels Stores\n\n\nTitl...,http://www.indeed.com/rc/clk?jk=ebb9dc956329c9...,http://www.indeed.com/rc/clk?jk=ebb9dc956329c9...,,2016-08-14 14:43:52 +0000,30+ days ago,indeed.com,Department Manager,94351cf60f862b36adb6c1edc67089e9
200441,http://www.indeed.com/viewjob?jk=1bcdaa85ff13fb43,MedPro Healthcare Staffing,"MedPro Healthcare Staffing, a Joint Commission...",,,,2016-08-14 14:43:52 +0000,30+ days ago,indeed.com,Travel Physical Therapist,ed0dbad4ac8b8b382aeb28a308cacebc
200442,http://www.indeed.com/cmp/BENEDICT-COLLEGE/job...,ANGELO STATE UNIVERSITY,We are looking for several adjunct Instructors...,,,,2016-08-17 14:43:52 +0000,27 days ago,indeed.com,NEEDING PUBLIC SPEAKING INSTRUCTORS,bb8f74edc980dc911b5c3b4c9b77494a
200443,http://www.indeed.com/viewjob?jk=5b6643a9ede1ccc8,Chenega Corporation,The\n\nUnarmed Guard\n\nis primarily responsib...,http://www.indeed.com/rc/clk?jk=5b6643a9ede1cc...,http://www.indeed.com/rc/clk?jk=5b6643a9ede1cc...,,2016-08-14 14:43:52 +0000,30+ days ago,indeed.com,Unarmed Guard USCIS (FT),6d05b1f7629b8f8683cb6a134f207605


In [16]:
#create word stemming object
stemmer = SnowballStemmer('english')
#create stop words object
s_words = stopwords.words('english')

In [17]:
#create clean role description data
#special character removal, stemming and stop word removal
#seperate new stemmed words with space only
jobs_data['clean_description'] = jobs_data['record.description'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", str(x)).split() if i not in s_words]).lower())

In [18]:
#print first 5 rows to show clean role description data
jobs_data['clean_description'].head(5)

0    offic assistantrespons includ prepar sale orde...
1    financi investig asset forfeitur el paso asrc ...
2    employ summari summari job function purpos man...
3    profil job summari supervis train coach develo...
4    posit summari the nation account field support...
Name: clean_description, dtype: object

In [19]:
#create clean job title data
#special character removal, stemming and stop word removal
#seperate new stemmed words with space only
jobs_data['clean_titles'] = jobs_data['record.title'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", str(x)).split() if i not in s_words]).lower())

In [20]:
#print first 5 rows to show clean job title data
jobs_data['clean_titles'].head(5)

0                                offic assist
1    financi investig asset forfeitur el paso
2                provid network develop manag
3                  supervisor dspo el paso tx
4          commerci sale estim propos support
Name: clean_titles, dtype: object

In [21]:
#count number of unique cases per column 
jobs_data.nunique()
#it is important to reduce the number of unique job titles
#unique job title count after text pre-processing has improved from 33898 out of 50295 (67%) to 32286 out of 50295 (64%)
#in comparison to job titles, all job descriptions should be unique to avoid a skew in tf-idf results
#only 42831 out of 50296 jobs are actually unique after cleaning, so duplicates must be removed by matching

pageurl                                     200444
record.company_name                          44951
record.description                          161355
record.direct_apply_link_on_the_job_page    158999
record.direct_link_to_the_job_page          158999
record.location                              13371
record.postedAt                              79704
record.postedAtText                             81
record.sub_domain                                1
record.title                                113237
record.uniq_id                              200444
clean_description                           155880
clean_titles                                105475
dtype: int64

In [22]:
#remove duplicate job postings based on clean descriptions
jobs_data.drop_duplicates(subset='clean_description', keep='first', inplace=True)

In [23]:
#count number of unique cases per column 
jobs_data.nunique()
#there are now 42831 total job postings, where 7465 duplicates were removed

pageurl                                     155880
record.company_name                          44279
record.description                          155879
record.direct_apply_link_on_the_job_page    118909
record.direct_link_to_the_job_page          118909
record.location                              11640
record.postedAt                              68744
record.postedAtText                             79
record.sub_domain                                1
record.title                                105060
record.uniq_id                              155880
clean_description                           155880
clean_titles                                 98562
dtype: int64

In [24]:
#remove duplicate job postings based on combination of same company name and same job title
jobs_data.drop_duplicates(subset=['record.company_name','clean_titles'], keep='first', inplace=True)

In [25]:
jobs_data.nunique()
#there are now 39464 total job postings, where 3367 duplicates were removed

pageurl                                     135826
record.company_name                          44279
record.description                          135825
record.direct_apply_link_on_the_job_page    102574
record.direct_link_to_the_job_page          102574
record.location                              10634
record.postedAt                              63077
record.postedAtText                             79
record.sub_domain                                1
record.title                                103469
record.uniq_id                              135826
clean_description                           135826
clean_titles                                 98562
dtype: int64

In [26]:
#vectors of unhelpful double characters will be created, so we need to remove them
jobs_data['clean_description'].replace(to_replace=r"\s([a-z])\1+", value="", regex=True, inplace=True)

In [27]:
#there are a noticeable number of job titles with 'state farm agent team member'
#this text should be removed as it is branding that reduces number of cases per job title
jobs_data['clean_titles'].replace({' state farm agent team member':''}, regex=True, inplace=True)

In [28]:
jobs_data.nunique()
#there are now 39464 total job postings, where 3367 duplicates were removed

pageurl                                     135826
record.company_name                          44279
record.description                          135825
record.direct_apply_link_on_the_job_page    102574
record.direct_link_to_the_job_page          102574
record.location                              10634
record.postedAt                              63077
record.postedAtText                             79
record.sub_domain                                1
record.title                                103469
record.uniq_id                              135826
clean_description                           135825
clean_titles                                 98542
dtype: int64

In [29]:
#save clean dataframe to disk and free memory
jobs_data.to_pickle('D:/CAPSTONE_NEW/jobs_data_clean.pkl')
import gc
gc.collect()

7

# Step 3: Text Mining

In [30]:
#load pickled dataframe
final_data = pd.read_pickle('D:/CAPSTONE_NEW/jobs_data_clean.pkl')

In [31]:
#determine 200 most frequent job titles for use as predictors, counts by job title are n=30 or more
#this is because kmeans clustering is not effective as dimensional reduction due to all of the noisy text
freq_titles = final_data.groupby(['clean_titles']).size().reset_index(name='counts').sort_values('counts', ascending=False).head(200)
freq_titles

Unnamed: 0,clean_titles,counts
22242,custom servic repres,442
64501,physic therapist,331
533,account manag,293
1498,administr assist,261
12182,case manag,245
69250,project manag,232
49818,mainten technician,210
301,account associ,190
38772,help want,186
19888,cook,181


In [32]:
#remove all cases from dataframe except those with top 200 job titles 
final_data = final_data[jobs_data['clean_titles'].isin(freq_titles['clean_titles'])]
final_data.nunique()
#there are now 4318 unique total job postings, where 35146 were removed that had same job title counts of n<10

pageurl                                     14148
record.company_name                         10538
record.description                          14148
record.direct_apply_link_on_the_job_page     9357
record.direct_link_to_the_job_page           9357
record.location                              3653
record.postedAt                             11766
record.postedAtText                            61
record.sub_domain                               1
record.title                                  796
record.uniq_id                              14148
clean_description                           14148
clean_titles                                  200
dtype: int64

In [33]:
#generate numerical category labels for job titles to be used during classification
final_data["clean_titles_cat"] = final_data["clean_titles"].astype('category').cat.codes
final_data.head()

Unnamed: 0,pageurl,record.company_name,record.description,record.direct_apply_link_on_the_job_page,record.direct_link_to_the_job_page,record.location,record.postedAt,record.postedAtText,record.sub_domain,record.title,record.uniq_id,clean_description,clean_titles,clean_titles_cat
0,http://www.indeed.com/cmp/Tileland-Interior-De...,Tileland Interior Decor LLC,Office assistantResponsibilities include: Prep...,,,,2016-08-11 12:27:57 +0000,7 days ago,indeed.com,office assistant,f458526241a5e46962bad2a97d8da5d3,offic assistantrespons includ prepar sale orde...,offic assist,121
5,http://www.indeed.com/viewjob?jk=b02626aaeaa5b670,Franchise: Audi Pasadena,Sales Consultant\n\n\nThis position contribute...,http://www.indeed.com/rc/clk?jk=b02626aaeaa5b6...,http://www.indeed.com/rc/clk?jk=b02626aaeaa5b6...,,2016-08-12 12:27:57 +0000,6 days ago,indeed.com,Sales Consultant,63c2ebf15b716afdea9598fd8833eb97,sale consult this posit contribut success rusn...,sale consult,166
6,http://www.indeed.com/cmp/Network-Medical-Mana...,Network Medical Management,SUMMARYDirectly responsible for coordinating a...,,,,2016-08-10 12:27:57 +0000,8 days ago,indeed.com,Human Resources Assistant,98daf5ad5ded0346d76d43068d0118e3,summarydirect respons coordin aspect locat via...,human resourc assist,78
9,http://www.indeed.com/viewjob?jk=74b4486fba962240,KIK Custom Products,COMPANY:\n\nAs one of North America’s largest ...,http://www.indeed.com/rc/clk?jk=74b4486fba9622...,http://www.indeed.com/rc/clk?jk=74b4486fba9622...,,2016-07-22 12:27:57 +0000,27 days ago,indeed.com,Human Resources Generalist,e71d38c8cddc483737d077bb10e778c4,compani as one north america largest independ ...,human resourc generalist,80
12,http://www.indeed.com/viewjob?jk=210f7df8479839f1,"CST Brands, Inc.",WE ARE THE CORNERSTORE!\n\n\nWe are recruiting...,http://www.indeed.com/rc/clk?jk=210f7df8479839...,http://www.indeed.com/rc/clk?jk=210f7df8479839...,,2016-07-25 12:27:57 +0000,24 days ago,indeed.com,Store Manager,16e23c7845fdfce13680d64a8f38a645,we are the cornerstor we recruit store manag n...,store manag,188


In [34]:
#create tokenized job descriptions as independent features for classification
#(unigrams and bigrams) from words with more than 1 letter in clean job descriptions
#print first 20 feature names
vectorizer = TfidfVectorizer(token_pattern=r'\w{2,}', ngram_range=(1,2), stop_words='english', sublinear_tf=True)
desc_matrix = vectorizer.fit_transform(final_data['clean_description'])
feature_names = vectorizer.get_feature_names()
print(feature_names[:20])

['aaa', 'aaa car', 'aaa northeast', 'aacr', 'aacr wac', 'aamco', 'aamco transmiss', 'aaron', 'aaron provid', 'aaski', 'aaski technolog', 'ab', 'ab ac', 'ab ae', 'ab brake', 'ab car', 'ab function', 'ab maximimum', 'ab maximum', 'ab pay']


In [35]:
#display the total number of features produced for job descriptions
len(feature_names)

808932

In [39]:
#display top 100 features from job descriptions based on Tf-Idf scores
indices = np.argsort(vectorizer.idf_)[::-1]
top_n = 100
top_features = [feature_names[i] for i in indices[:top_n]]
print(top_features)

['zynex suit', 'hns organ', 'hns mission', 'hns manag', 'hns', 'hne bod', 'hne', 'hnbhs regulatori', 'hnbhs facilit', 'hnbhs', 'hna fleet', 'hna dress', 'hna', 'hn summari', 'hn hs', 'hmw high', 'hmw', 'hns nurs', 'hns perform', 'hmso', 'hns polici', 'hoa fine', 'hoa duti', 'hoa certif', 'hoa build', 'ho mental', 'ho cultur', 'hntb team', 'hntb servic', 'hntb presenc', 'hntb local', 'hntb current', 'hntb cultur', 'hntb corpor', 'hntb', 'hns support', 'hmso revis', 'hmsnm org', 'hoa notic', 'hmo ncqa', 'hmo manag', 'hmo dental', 'hmo case', 'hmis program', 'hmis increas', 'hmis dhmh', 'hmis client', 'hmis catch', 'hmis basic', 'hmi robot', 'hmi program', 'hmi logix', 'hmi laboratori', 'hmi graphic', 'hmi experi', 'hmo mco', 'hmo option', 'hmsnm', 'hmo plan', 'hms support', 'hms provid', 'hms profit', 'hms polici', 'hms offer', 'hms non', 'hms feder', 'hms competit', 'hmrc partner', 'hmrc', 'hmoo tpa', 'hmoo medicar', 'hmoo health', 'hmoo dental', 'hmo requir', 'hoa manag', 'hoa prior', 

In [40]:
#save final new dataframe to disk and free memory
final_data.to_pickle('D:/CAPSTONE_NEW/jobs_data_final.pkl')
gc.collect()

98

# Step 4: Text Classification

In [41]:
#load pickled dataframe
model_data = pd.read_pickle('D:/CAPSTONE_NEW/jobs_data_final.pkl')

In [42]:
#create classifier object with train and test data sets
#include assessment tests within object
#job skills as independent variable and job titles as dependent variable

def train(classifier, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=33)

    classifier.fit(X_train, y_train)
    print('Initial Model Accuracy: %s' % classifier.score(X_test, y_test))
   
    predicted = classifier.predict(X_test)
    precision, recall, fscore, support = precision_recall_fscore_support(y_test, predicted, average='weighted')
    print('Precision: {}'.format(precision))
    print('Recall: {}'.format(recall))
    print('F-score: {}'.format(fscore))
    print('Support: {}'.format(support))

    return classifier

In [43]:
#run classification using NaiveBayes
clf1 = Pipeline([
    ('vectorizer', TfidfVectorizer(token_pattern=r'\w{2,}', ngram_range=(1,2), stop_words='english', sublinear_tf=True)),
    ('classifier', MultinomialNB(alpha=0.05)),
])
 
nb_model = train(clf1, model_data['clean_description'], model_data['clean_titles_cat'])
#produces extremely poor accuracy

Initial Model Accuracy: 0.27043580683156654
Precision: 0.38250341153014505
Recall: 0.27043580683156654
F-score: 0.21984668507027957
Support: None


  'precision', 'predicted', average, warn_for)


In [44]:
#run classification using RandomForests
clf2 = Pipeline([
    ('vectorizer', TfidfVectorizer(token_pattern=r'\w{2,}', ngram_range=(1,2), stop_words='english', sublinear_tf=True)),
    ('classifier', RandomForestClassifier()),
])
 
rf_model = train(clf2, model_data['clean_description'], model_data['clean_titles_cat'])
#produces poor accuracy

Initial Model Accuracy: 0.3491166077738516
Precision: 0.38404365554211395
Recall: 0.3491166077738516
F-score: 0.3302604970853345
Support: None


  'precision', 'predicted', average, warn_for)


In [45]:
#run classification using SVM
clf3 = Pipeline([
    ('vectorizer', TfidfVectorizer(token_pattern=r'\w{2,}', ngram_range=(1,2), stop_words='english', sublinear_tf=True)),
    ('classifier', LinearSVC(C=1.0, penalty='l1', max_iter=1000, dual=False)),
])
 
svm_model = train(clf3, model_data['clean_description'], model_data['clean_titles_cat'])
#produces good accuracy

Initial Model Accuracy: 0.6713780918727915
Precision: 0.6975553576659304
Recall: 0.6713780918727915
F-score: 0.6594118359383865
Support: None


  'precision', 'predicted', average, warn_for)
