 # NLP for job listing classification

### Using NLP for job listing classification in the city of New York

In [1]:
# For exploratory data analysis and plotting libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

#for model building
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report

# For text preprocessing
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from spacy.lang.en import English
from spacy.lang.es import Spanish

# For BERT pre-trained Language Model
from transformers import BertModel
from transformers import  BertTokenizer, BertForSequenceClassification

import tensorflow as tf
print ("TF version:", tf.__version__)


TF version: 2.10.0


# Loading the data

In [2]:
# Loading the data
df = pd.read_csv("../Data/NYC_Jobs.csv", low_memory=False)

# Data exploration

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362 entries, 0 to 6361
Data columns (total 30 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Job ID                         6362 non-null   int64  
 1   Agency                         6362 non-null   object 
 2   Posting Type                   6362 non-null   object 
 3   # Of Positions                 6362 non-null   int64  
 4   Business Title                 6362 non-null   object 
 5   Civil Service Title            6362 non-null   object 
 6   Title Classification           6362 non-null   object 
 7   Title Code No                  6362 non-null   object 
 8   Level                          6362 non-null   object 
 9   Job Category                   6362 non-null   object 
 10  Full-Time/Part-Time indicator  6118 non-null   object 
 11  Career Level                   6362 non-null   object 
 12  Salary Range From              6362 non-null   f

In [4]:
df.head()

Unnamed: 0,Job ID,Agency,Posting Type,# Of Positions,Business Title,Civil Service Title,Title Classification,Title Code No,Level,Job Category,...,Additional Information,To Apply,Hours/Shift,Work Location 1,Recruitment Contact,Residency Requirement,Posting Date,Post Until,Posting Updated,Process Date
0,585084,DEPT OF HEALTH/MENTAL HYGIENE,Internal,1,Deputy EEO Officer (Agency Attorney),AGENCY ATTORNEY,Non-Competitive-5,30087,3,Health Legal Affairs,...,SPECIAL NOTE 1.\tSelected candidate will be re...,To Apply: Please submit resume and cover lette...,,,,New York City residency is generally required ...,07/12/2023,,07/12/2023,08/01/2023
1,577063,DEPT OF HEALTH/MENTAL HYGIENE,External,5,Institutional Aide (per diem),INSTITUTIONAL AIDE,Non-Competitive-5,81803,0,Building Operations & Maintenance,...,SPECIAL NOTE 1.\tSelected candidates will be r...,"TO APPLY, PLEASE SUBMIT RESUME AND COVER LETTE...",,,,New York City residency is generally required ...,03/10/2023,,05/04/2023,08/01/2023
2,540287,DEPT OF ENVIRONMENT PROTECTION,External,1,Machinist,MACHINIST,Competitive-1,92610,0,Building Operations & Maintenance,...,Appointments are subject to OMB approval. For...,Click the âApply Nowâ button,,,,New York City residency is generally required ...,07/16/2022,,07/16/2022,08/01/2023
3,573619,DEPT OF ENVIRONMENT PROTECTION,External,2,Policy Analyst,STRATEGIC INITIATIVE SPECIALIS,Non-Competitive-5,50940,0,"Policy, Research & Analysis",...,Driver License Requirement: At the time of app...,Click on âApply Nowâ and submit a resume a...,35 hours/week,59-17 Junction Blvd Corona Ny,,New York City residency is generally required ...,02/15/2023,,02/15/2023,08/01/2023
4,589409,OFF OF PAYROLL ADMINISTRATION,Internal,2,Help Desk Level 1 Representative,CLERICAL ASSOCIATE,Competitive-1,10251,2,"Technology, Data & Innovation Policy, Research...",...,#O-143 & O-154,Current NYC employees may apply to Job ID: 589...,35 Hours/Day Shift,5 Manhattan West,,New York City residency is generally required ...,06/06/2023,,06/21/2023,08/01/2023


In [5]:
df.describe()

Unnamed: 0,Job ID,# Of Positions,Salary Range From,Salary Range To,Recruitment Contact
count,6362.0,6362.0,6362.0,6362.0,0.0
mean,575095.083622,2.417322,61685.868704,85714.222621,
std,21232.953278,8.991682,30822.616545,45143.110546,
min,468473.0,1.0,0.0,15.45,
25%,568104.5,1.0,49033.0,61438.0,
50%,582919.0,1.0,60000.0,82504.5,
75%,590094.0,1.0,75504.0,109409.0,
max,595861.0,250.0,231796.0,252165.0,


In [6]:
df.columns

Index(['Job ID', 'Agency', 'Posting Type', '# Of Positions', 'Business Title',
       'Civil Service Title', 'Title Classification', 'Title Code No', 'Level',
       'Job Category', 'Full-Time/Part-Time indicator', 'Career Level',
       'Salary Range From', 'Salary Range To', 'Salary Frequency',
       'Work Location', 'Division/Work Unit', 'Job Description',
       'Minimum Qual Requirements', 'Preferred Skills',
       'Additional Information', 'To Apply', 'Hours/Shift', 'Work Location 1',
       'Recruitment Contact', 'Residency Requirement', 'Posting Date',
       'Post Until', 'Posting Updated', 'Process Date'],
      dtype='object')

# Data cleaning

In [7]:
df['Posting Date'].head()

0    07/12/2023
1    03/10/2023
2    07/16/2022
3    02/15/2023
4    06/06/2023
Name: Posting Date, dtype: object

In [8]:
# Parsing 'Job Description' column
df = pd.read_csv("../Data/NYC_Jobs.csv",
                low_memory=False,
                parse_dates = ['Posting Date'])

In [9]:
df['Posting Date'].dtype

dtype('<M8[ns]')

In [10]:
df['Posting Date'].head(20)

0    2023-07-12
1    2023-03-10
2    2022-07-16
3    2023-02-15
4    2023-06-06
5    2023-05-17
6    2023-05-04
7    2023-07-28
8    2023-06-23
9    2023-07-31
10   2023-02-25
11   2023-07-21
12   2023-05-30
13   2023-04-12
14   2023-02-15
15   2023-02-15
16   2023-01-04
17   2023-01-09
18   2022-06-30
19   2022-10-25
Name: Posting Date, dtype: datetime64[ns]

In [11]:
# Sort DataFrame in posting date
df.sort_values(by=['Posting Date'], inplace=True, ascending=True)
df['Posting Date'].head(20)

3235   2020-06-30
4488   2021-01-28
1928   2021-07-14
1615   2021-07-26
1109   2021-07-26
4308   2021-07-30
287    2021-07-30
5008   2021-08-02
5154   2021-08-02
4640   2021-08-04
6313   2021-08-04
4099   2021-08-11
5459   2021-08-11
1056   2021-08-24
3209   2021-08-24
285    2021-08-26
453    2021-08-26
2834   2021-08-30
6287   2021-09-03
44     2021-09-03
Name: Posting Date, dtype: datetime64[ns]

In [12]:
df['Post Until'].isna().sum()

4310

In [13]:
# Imputing values
df['Post Until'].fillna('No Deadline', inplace=True)

In [14]:
df['Post Until'].tail

<bound method NDFrame.tail of 3235    No Deadline
4488    No Deadline
1928    No Deadline
1615    No Deadline
1109    No Deadline
           ...     
3846    31-AUG-2023
1994    31-AUG-2023
433     31-AUG-2023
5703    31-AUG-2023
419     31-AUG-2023
Name: Post Until, Length: 6362, dtype: object>

In [15]:
# Identify duplicates (rows and columns)
duplicates_r = df.duplicated(subset=['Job ID', 'Agency', 'Posting Type', '# Of Positions', 'Business Title',
       'Civil Service Title', 'Title Classification', 'Title Code No', 'Level',
       'Job Category', 'Full-Time/Part-Time indicator', 'Career Level',
       'Salary Range From', 'Salary Range To', 'Salary Frequency',
       'Work Location', 'Division/Work Unit', 'Job Description',
       'Minimum Qual Requirements', 'Preferred Skills',
       'Additional Information', 'To Apply', 'Hours/Shift', 'Work Location 1',
       'Recruitment Contact', 'Residency Requirement', 'Posting Date',
       'Post Until', 'Posting Updated', 'Process Date'])
duplicate_rows = df[duplicates_r]
duplicate_rows.shape[0]

112

In [16]:
#Transposing the DataFrame
transposed_df = df.transpose()
duplicated_columns_b = transposed_df.duplicated()
duplicated_columns = transposed_df[duplicated_columns_b]
duplicated_columns

Unnamed: 0,3235,4488,1928,1615,1109,4308,287,5008,5154,4640,...,187,2703,2160,4318,4340,3846,1994,433,5703,419


In [21]:
transposed_df.head()

Unnamed: 0,3235,4488,1928,1615,1109,4308,287,5008,5154,4640,...,187,2703,2160,4318,4340,3846,1994,433,5703,419
Job ID,534657,512652,468473,469953,469953,469360,469360,470441,470441,468476,...,595693,595694,595693,595688,595686,595687,595861,595694,595686,595691
Agency,NYC HOUSING AUTHORITY,HRA/DEPT OF SOCIAL SERVICES,DEPARTMENT OF TRANSPORTATION,HRA/DEPT OF SOCIAL SERVICES,HRA/DEPT OF SOCIAL SERVICES,NYC HOUSING AUTHORITY,NYC HOUSING AUTHORITY,NYC HOUSING AUTHORITY,NYC HOUSING AUTHORITY,NYC HOUSING AUTHORITY,...,DEPARTMENT OF BUILDINGS,DEPARTMENT OF BUILDINGS,DEPARTMENT OF BUILDINGS,DEPARTMENT OF BUILDINGS,DEPARTMENT OF BUILDINGS,DEPARTMENT OF BUILDINGS,DEPARTMENT OF BUILDINGS,DEPARTMENT OF BUILDINGS,DEPARTMENT OF BUILDINGS,DEPARTMENT OF BUILDINGS
Posting Type,Internal,Internal,Internal,External,Internal,External,Internal,External,Internal,Internal,...,External,External,Internal,External,Internal,Internal,External,Internal,External,Internal
# Of Positions,1,2,1,1,1,1,1,1,1,1,...,2,2,2,6,20,2,1,2,20,1
Business Title,Elevator Oversight Team Administrator,UNIT CLERK,OFFICE MANAGER - Brooklyn B/C Office,CONTRACT ANALYST,CONTRACT ANALYST,Chief,Chief,Heating Oversight Team Specialist,Heating Oversight Team Specialist,Director of Public Housing Tenancy Operations,...,OATH/ECB Hearing Attorney,OATH/ECB Hearing Representative,OATH/ECB Hearing Attorney,Plumbing Inspector,Construction Inspector,Electrical Inspector,Plan Examiner,OATH/ECB Hearing Representative,Construction Inspector,Assistant Plan Examiner


In [23]:
# Checking the values of different columns
df['Work Location'].value_counts()

55 Water St Ny Ny                 436
42-09 28th Street                 425
96-05 Horace Harding Expway       321
30-30 Thomson Ave L I City Qns    292
4 World Trade Center              284
                                 ... 
1278 Sedgwick Ave., Bronx           1
50-16 59Th Pl., Queens              1
92-24 Rockaway Beach Blvd Quee      1
120 W 82Nd St., N.Y.                1
Vendor & Contract Mgmt              1
Name: Work Location, Length: 382, dtype: int64

# Text preprocessing

In [None]:
#Converting the columns to lowercase
#df.columns = df.columns.str.lower()