 # NLP for job listing classification

### Using NLP for job listing classification in the city of New York

In [195]:
# For exploratory data analysis and plotting libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

#for model building
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report

# For text preprocessing
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from spacy.lang.en import English
from spacy.lang.es import Spanish

# For BERT pre-trained Language Model
from transformers import BertModel
from transformers import  BertTokenizer, BertForSequenceClassification

import tensorflow as tf
print ("TF version:", tf.__version__)


TF version: 2.10.0


# Loading the data

In [196]:
# Loading the data
df = pd.read_csv("../Data/NYC_Jobs.csv", low_memory=False)

# Data exploration

In [197]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362 entries, 0 to 6361
Data columns (total 30 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Job ID                         6362 non-null   int64  
 1   Agency                         6362 non-null   object 
 2   Posting Type                   6362 non-null   object 
 3   # Of Positions                 6362 non-null   int64  
 4   Business Title                 6362 non-null   object 
 5   Civil Service Title            6362 non-null   object 
 6   Title Classification           6362 non-null   object 
 7   Title Code No                  6362 non-null   object 
 8   Level                          6362 non-null   object 
 9   Job Category                   6362 non-null   object 
 10  Full-Time/Part-Time indicator  6118 non-null   object 
 11  Career Level                   6362 non-null   object 
 12  Salary Range From              6362 non-null   f

In [198]:
df.head()

Unnamed: 0,Job ID,Agency,Posting Type,# Of Positions,Business Title,Civil Service Title,Title Classification,Title Code No,Level,Job Category,...,Additional Information,To Apply,Hours/Shift,Work Location 1,Recruitment Contact,Residency Requirement,Posting Date,Post Until,Posting Updated,Process Date
0,585084,DEPT OF HEALTH/MENTAL HYGIENE,Internal,1,Deputy EEO Officer (Agency Attorney),AGENCY ATTORNEY,Non-Competitive-5,30087,3,Health Legal Affairs,...,SPECIAL NOTE 1.\tSelected candidate will be re...,To Apply: Please submit resume and cover lette...,,,,New York City residency is generally required ...,07/12/2023,,07/12/2023,08/01/2023
1,577063,DEPT OF HEALTH/MENTAL HYGIENE,External,5,Institutional Aide (per diem),INSTITUTIONAL AIDE,Non-Competitive-5,81803,0,Building Operations & Maintenance,...,SPECIAL NOTE 1.\tSelected candidates will be r...,"TO APPLY, PLEASE SUBMIT RESUME AND COVER LETTE...",,,,New York City residency is generally required ...,03/10/2023,,05/04/2023,08/01/2023
2,540287,DEPT OF ENVIRONMENT PROTECTION,External,1,Machinist,MACHINIST,Competitive-1,92610,0,Building Operations & Maintenance,...,Appointments are subject to OMB approval. For...,Click the âApply Nowâ button,,,,New York City residency is generally required ...,07/16/2022,,07/16/2022,08/01/2023
3,573619,DEPT OF ENVIRONMENT PROTECTION,External,2,Policy Analyst,STRATEGIC INITIATIVE SPECIALIS,Non-Competitive-5,50940,0,"Policy, Research & Analysis",...,Driver License Requirement: At the time of app...,Click on âApply Nowâ and submit a resume a...,35 hours/week,59-17 Junction Blvd Corona Ny,,New York City residency is generally required ...,02/15/2023,,02/15/2023,08/01/2023
4,589409,OFF OF PAYROLL ADMINISTRATION,Internal,2,Help Desk Level 1 Representative,CLERICAL ASSOCIATE,Competitive-1,10251,2,"Technology, Data & Innovation Policy, Research...",...,#O-143 & O-154,Current NYC employees may apply to Job ID: 589...,35 Hours/Day Shift,5 Manhattan West,,New York City residency is generally required ...,06/06/2023,,06/21/2023,08/01/2023


In [199]:
df.describe()

Unnamed: 0,Job ID,# Of Positions,Salary Range From,Salary Range To,Recruitment Contact
count,6362.0,6362.0,6362.0,6362.0,0.0
mean,575095.083622,2.417322,61685.868704,85714.222621,
std,21232.953278,8.991682,30822.616545,45143.110546,
min,468473.0,1.0,0.0,15.45,
25%,568104.5,1.0,49033.0,61438.0,
50%,582919.0,1.0,60000.0,82504.5,
75%,590094.0,1.0,75504.0,109409.0,
max,595861.0,250.0,231796.0,252165.0,


In [200]:
df.columns

Index(['Job ID', 'Agency', 'Posting Type', '# Of Positions', 'Business Title',
       'Civil Service Title', 'Title Classification', 'Title Code No', 'Level',
       'Job Category', 'Full-Time/Part-Time indicator', 'Career Level',
       'Salary Range From', 'Salary Range To', 'Salary Frequency',
       'Work Location', 'Division/Work Unit', 'Job Description',
       'Minimum Qual Requirements', 'Preferred Skills',
       'Additional Information', 'To Apply', 'Hours/Shift', 'Work Location 1',
       'Recruitment Contact', 'Residency Requirement', 'Posting Date',
       'Post Until', 'Posting Updated', 'Process Date'],
      dtype='object')

# Data cleaning

In [201]:
clean_df = df.copy(deep=True)

In [202]:
clean_df['Posting Date'].head()

0    07/12/2023
1    03/10/2023
2    07/16/2022
3    02/15/2023
4    06/06/2023
Name: Posting Date, dtype: object

In [203]:
# Parsing 'Job Description' column
clean_df = pd.read_csv("../Data/NYC_Jobs.csv",
                low_memory=False,
                parse_dates = ['Posting Date'])

In [204]:
clean_df['Posting Date'].dtype

dtype('<M8[ns]')

In [205]:
df['Posting Date'].head(20)

0     07/12/2023
1     03/10/2023
2     07/16/2022
3     02/15/2023
4     06/06/2023
5     05/17/2023
6     05/04/2023
7     07/28/2023
8     06/23/2023
9     07/31/2023
10    02/25/2023
11    07/21/2023
12    05/30/2023
13    04/12/2023
14    02/15/2023
15    02/15/2023
16    01/04/2023
17    01/09/2023
18    06/30/2022
19    10/25/2022
Name: Posting Date, dtype: object

In [206]:
# Sort DataFrame in posting date
clean_df.sort_values(by=['Posting Date'], inplace=True, ascending=True)
clean_df['Posting Date'].head(20)

3235   2020-06-30
4488   2021-01-28
1928   2021-07-14
1615   2021-07-26
1109   2021-07-26
4308   2021-07-30
287    2021-07-30
5008   2021-08-02
5154   2021-08-02
4640   2021-08-04
6313   2021-08-04
4099   2021-08-11
5459   2021-08-11
1056   2021-08-24
3209   2021-08-24
285    2021-08-26
453    2021-08-26
2834   2021-08-30
6287   2021-09-03
44     2021-09-03
Name: Posting Date, dtype: datetime64[ns]

In [207]:
clean_df['Post Until'].isna().sum()

4310

In [208]:
# Imputing values
clean_df['Post Until'].fillna('No Deadline', inplace=True)

In [209]:
clean_df['Post Until'].head()

3235    No Deadline
4488    No Deadline
1928    No Deadline
1615    No Deadline
1109    No Deadline
Name: Post Until, dtype: object

In [210]:
# Identify duplicates (rows and columns)
duplicates_r = clean_df.duplicated(subset=['Job ID', 'Agency', 'Posting Type', '# Of Positions', 'Business Title',
       'Civil Service Title', 'Title Classification', 'Title Code No', 'Level',
       'Job Category', 'Full-Time/Part-Time indicator', 'Career Level',
       'Salary Range From', 'Salary Range To', 'Salary Frequency',
       'Work Location', 'Division/Work Unit', 'Job Description',
       'Minimum Qual Requirements', 'Preferred Skills',
       'Additional Information', 'To Apply', 'Hours/Shift', 'Work Location 1',
       'Recruitment Contact', 'Residency Requirement', 'Posting Date',
       'Post Until', 'Posting Updated', 'Process Date'])
duplicate_rows = clean_df[duplicates_r]
duplicate_rows.shape[0]

112

In [211]:
#Transposing the DataFrame
transposed_clean_df = clean_df.transpose()
duplicated_columns_b = transposed_clean_df.duplicated()
duplicated_columns = transposed_clean_df[duplicated_columns_b]
duplicated_columns

Unnamed: 0,3235,4488,1928,1615,1109,4308,287,5008,5154,4640,...,187,2703,2160,4318,4340,3846,1994,433,5703,419


In [212]:
transposed_clean_df.head()

Unnamed: 0,3235,4488,1928,1615,1109,4308,287,5008,5154,4640,...,187,2703,2160,4318,4340,3846,1994,433,5703,419
Job ID,534657,512652,468473,469953,469953,469360,469360,470441,470441,468476,...,595693,595694,595693,595688,595686,595687,595861,595694,595686,595691
Agency,NYC HOUSING AUTHORITY,HRA/DEPT OF SOCIAL SERVICES,DEPARTMENT OF TRANSPORTATION,HRA/DEPT OF SOCIAL SERVICES,HRA/DEPT OF SOCIAL SERVICES,NYC HOUSING AUTHORITY,NYC HOUSING AUTHORITY,NYC HOUSING AUTHORITY,NYC HOUSING AUTHORITY,NYC HOUSING AUTHORITY,...,DEPARTMENT OF BUILDINGS,DEPARTMENT OF BUILDINGS,DEPARTMENT OF BUILDINGS,DEPARTMENT OF BUILDINGS,DEPARTMENT OF BUILDINGS,DEPARTMENT OF BUILDINGS,DEPARTMENT OF BUILDINGS,DEPARTMENT OF BUILDINGS,DEPARTMENT OF BUILDINGS,DEPARTMENT OF BUILDINGS
Posting Type,Internal,Internal,Internal,External,Internal,External,Internal,External,Internal,Internal,...,External,External,Internal,External,Internal,Internal,External,Internal,External,Internal
# Of Positions,1,2,1,1,1,1,1,1,1,1,...,2,2,2,6,20,2,1,2,20,1
Business Title,Elevator Oversight Team Administrator,UNIT CLERK,OFFICE MANAGER - Brooklyn B/C Office,CONTRACT ANALYST,CONTRACT ANALYST,Chief,Chief,Heating Oversight Team Specialist,Heating Oversight Team Specialist,Director of Public Housing Tenancy Operations,...,OATH/ECB Hearing Attorney,OATH/ECB Hearing Representative,OATH/ECB Hearing Attorney,Plumbing Inspector,Construction Inspector,Electrical Inspector,Plan Examiner,OATH/ECB Hearing Representative,Construction Inspector,Assistant Plan Examiner


In [213]:
# Checking the values of different columns
clean_df['Work Location'].value_counts()

55 Water St Ny Ny                 436
42-09 28th Street                 425
96-05 Horace Harding Expway       321
30-30 Thomson Ave L I City Qns    292
4 World Trade Center              284
                                 ... 
1278 Sedgwick Ave., Bronx           1
50-16 59Th Pl., Queens              1
92-24 Rockaway Beach Blvd Quee      1
120 W 82Nd St., N.Y.                1
Vendor & Contract Mgmt              1
Name: Work Location, Length: 382, dtype: int64

In [214]:
# Checking Equality Comparison between columns
clean_df['Work Location'] == clean_df['Work Location 1']

3235    False
4488    False
1928    False
1615    False
1109    False
        ...  
3846    False
1994    False
433     False
5703    False
419     False
Length: 6362, dtype: bool

In [215]:
# Checking duplicates
clean_df['Job ID'].is_unique

False

In [216]:
# Removing the duplicates
clean_df.drop_duplicates(subset=['Job ID'], inplace=True)

In [217]:
clean_df['Job ID'].is_unique

True

In [218]:
clean_df_1 = clean_df.set_index('Job ID')

In [219]:
clean_df_1.head(3)

Unnamed: 0_level_0,Agency,Posting Type,# Of Positions,Business Title,Civil Service Title,Title Classification,Title Code No,Level,Job Category,Full-Time/Part-Time indicator,...,Additional Information,To Apply,Hours/Shift,Work Location 1,Recruitment Contact,Residency Requirement,Posting Date,Post Until,Posting Updated,Process Date
Job ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
534657,NYC HOUSING AUTHORITY,Internal,1,Elevator Oversight Team Administrator,SUPERVISOR OF ELEVATOR MAINTEN,Competitive-1,10076,M2,"Public Safety, Inspections, & Enforcement",F,...,1.\tPreference will be given to employees who ...,Click the Apply Now button.,,,,NYCHA has no residency requirements.,2020-06-30,No Deadline,08/30/2022,08/01/2023
512652,HRA/DEPT OF SOCIAL SERVICES,Internal,2,UNIT CLERK,CLERICAL ASSOCIATE,Competitive-1,10251,03,Constituent Services & Community Programs Comm...,F,...,**LOAN FORGIVENESS The federal government pro...,APPLICANTS MUST BE PERMANENT IN THE CLERICAL A...,,,,New York City residency is generally required ...,2021-01-28,No Deadline,08/11/2022,08/01/2023
468473,DEPARTMENT OF TRANSPORTATION,Internal,1,OFFICE MANAGER - Brooklyn B/C Office,ADM MANAGER-NON-MGRL FRM M1/M2,Competitive-1,1002C,02,Constituent Services & Community Programs,,...,Note: This position is open to qualified perso...,*** IN ORDER TO BE CONSIDERED FOR THIS POSITIO...,,"16 Court Street, Brooklyn, New York",,New York City residency is generally required ...,2021-07-14,No Deadline,07/14/2021,08/01/2023


In [220]:
clean_df.columns

Index(['Job ID', 'Agency', 'Posting Type', '# Of Positions', 'Business Title',
       'Civil Service Title', 'Title Classification', 'Title Code No', 'Level',
       'Job Category', 'Full-Time/Part-Time indicator', 'Career Level',
       'Salary Range From', 'Salary Range To', 'Salary Frequency',
       'Work Location', 'Division/Work Unit', 'Job Description',
       'Minimum Qual Requirements', 'Preferred Skills',
       'Additional Information', 'To Apply', 'Hours/Shift', 'Work Location 1',
       'Recruitment Contact', 'Residency Requirement', 'Posting Date',
       'Post Until', 'Posting Updated', 'Process Date'],
      dtype='object')

In [221]:
hours_shift = clean_df['Hours/Shift']
hours_shift

3235                   NaN
4488                   NaN
1928                   NaN
1615                   NaN
4308                   NaN
               ...        
5128                   NaN
4648                   NaN
187                    NaN
2703                   NaN
4340    40 hours (minimum)
Name: Hours/Shift, Length: 3296, dtype: object

In [222]:
clean_df['Hours/Shift'].fillna('Not specified', inplace=True)
hours_shift

3235         Not specified
4488         Not specified
1928         Not specified
1615         Not specified
4308         Not specified
               ...        
5128         Not specified
4648         Not specified
187          Not specified
2703         Not specified
4340    40 hours (minimum)
Name: Hours/Shift, Length: 3296, dtype: object

# Text preprocessing

In [223]:
#Selecting the relevant columns we are going to work with
text_columns = ['Business Title','Job Category', 'Career Level',
       'Work Location', 'Job Description',
       'Minimum Qual Requirements', 'Preferred Skills',
       'Additional Information', 'To Apply', 'Work Location 1']

In [224]:
#Converting the columns to lowercase
clean_df[text_columns] = clean_df[text_columns].apply(lambda x: x.str.lower())
clean_df[text_columns].head(3)

Unnamed: 0,Business Title,Job Category,Career Level,Work Location,Job Description,Minimum Qual Requirements,Preferred Skills,Additional Information,To Apply,Work Location 1
3235,elevator oversight team administrator,"public safety, inspections, & enforcement",manager,environmental health & safety,the new york city housing authority (nycha) is...,1. seven years of satisfactory full-time exper...,"1. possess strong conceptual, organ...",1.\tpreference will be given to employees who ...,click the apply now button.,
4488,unit clerk,constituent services & community programs comm...,experienced (non-manager),"400 8th ave., n.y.",the hiv/aids services administration (hasa) is...,qualification requirements a four-year high s...,,**loan forgiveness the federal government pro...,applicants must be permanent in the clerical a...,
1928,office manager - brooklyn b/c office,constituent services & community programs,experienced (non-manager),16 court st,*** in order to be considered for this positio...,1. a baccalaureate degree from an accredited c...,,note: this position is open to qualified perso...,*** in order to be considered for this positio...,"16 court street, brooklyn, new york"


In [225]:
#Filling NaN values
clean_df['Preferred Skills'].fillna('', inplace=True) 

In [226]:
clean_df['Work Location 1'].fillna('', inplace=True) 

In [227]:
clean_df[text_columns].head(5)

Unnamed: 0,Business Title,Job Category,Career Level,Work Location,Job Description,Minimum Qual Requirements,Preferred Skills,Additional Information,To Apply,Work Location 1
3235,elevator oversight team administrator,"public safety, inspections, & enforcement",manager,environmental health & safety,the new york city housing authority (nycha) is...,1. seven years of satisfactory full-time exper...,"1. possess strong conceptual, organ...",1.\tpreference will be given to employees who ...,click the apply now button.,
4488,unit clerk,constituent services & community programs comm...,experienced (non-manager),"400 8th ave., n.y.",the hiv/aids services administration (hasa) is...,qualification requirements a four-year high s...,,**loan forgiveness the federal government pro...,applicants must be permanent in the clerical a...,
1928,office manager - brooklyn b/c office,constituent services & community programs,experienced (non-manager),16 court st,*** in order to be considered for this positio...,1. a baccalaureate degree from an accredited c...,,note: this position is open to qualified perso...,*** in order to be considered for this positio...,"16 court street, brooklyn, new york"
1615,contract analyst,administration & human resources social services,experienced (non-manager),4 world trade center,career services/procurement & contract adminis...,1. a masterâs degree from an accredited coll...,,**loan forgiveness the federal government pro...,applicants must be in the permanent in the sta...,
4308,chief,legal affairs,manager,law-housing litigation,the new york city housing authority (nycha) la...,admission to the new york state bar; and four ...,â¢ ability to perform complex tasks and mana...,1. resume and cover letter must also include ...,click the apply now button.,


In [228]:
#Converting non-string data types to strings
clean_df[text_columns] = clean_df[text_columns].astype(str)

#Removing Special Characters and Punctuation:
clean_df[text_columns] = clean_df[text_columns].apply(lambda tokens: [re.sub(r'[^a-zA-Z0-9]', '', token) for token in tokens])

In [229]:
#Stopword removal
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
clean_df[text_columns] = clean_df[text_columns].apply(lambda tokens: [token for token in tokens if token not in stop_words])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\R\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [230]:
clean_df[text_columns].head(3)

Unnamed: 0,Business Title,Job Category,Career Level,Work Location,Job Description,Minimum Qual Requirements,Preferred Skills,Additional Information,To Apply,Work Location 1
3235,elevatoroversightteamadministrator,publicsafetyinspectionsenforcement,manager,environmentalhealthsafety,thenewyorkcityhousingauthoritynychaisthenation...,1sevenyearsofsatisfactoryfulltimeexperienceint...,1possessstrongconceptualorganizationalanalytic...,1preferencewillbegiventoemployeeswhohaveserved...,clicktheapplynowbutton,
4488,unitclerk,constituentservicescommunityprogramscommunicat...,experiencednonmanager,4008thaveny,thehivaidsservicesadministrationhasaisthemostc...,qualificationrequirementsafouryearhighschooldi...,,loanforgivenessthefederalgovernmentprovidesstu...,applicantsmustbepermanentintheclericalassociat...,
1928,officemanagerbrooklynbcoffice,constituentservicescommunityprograms,experiencednonmanager,16courtst,inordertobeconsideredforthispositioncandidates...,1abaccalaureatedegreefromanaccreditedcollegean...,,notethispositionisopentoqualifiedpersonswithad...,inordertobeconsideredforthispositioncandidates...,16courtstreetbrooklynnewyork
