# Import Necessary Libraries

In [1]:
#spacy
import spacy
from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc

#gensim
import gensim
from gensim import corpora

#Visualization
from spacy import displacy
from wordcloud import WordCloud
import plotly.express as px
import matplotlib.pyplot as plt

#nltk
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download(['stopwords','wordnet'])
from nltk.tokenize import word_tokenize

#Data loading/ Data manipulation
import pandas as pd
import numpy as np

# Ensure required NLTK data is downloaded
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anadu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Anadu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anadu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anadu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Import the CSV file

In [2]:
# Path to the raw postings CSV, relative to the notebook's working directory.
file_path = "data/Linkedin Job Postings (2023-2024)/postings.csv"
# Parse the whole file into a DataFrame using pandas' default settings.
data = pd.read_csv(filepath_or_buffer=file_path)

# Display the first few rows
# print(data.head())



In [3]:
# Bind the loaded postings to `df`, the name the following cells operate on.
df = pd.DataFrame(data=data)

# Preview the first five rows as plain text.
print(df.head(n=5))



     job_id            company_name   
0    921716   Corcoran Sawyer Smith  \
1   1829192                     NaN   
2  10998357  The National Exemplar    
3  23221523  Abrams Fensterman, LLP   
4  35982263                     NaN   

                                               title   
0                              Marketing Coordinator  \
1                  Mental Health Therapist/Counselor   
2                        Assitant Restaurant Manager   
3  Senior Elder Law / Trusts and Estates Associat...   
4                                 Service Technician   

                                         description  max_salary pay_period   
0  Job descriptionA leading real estate firm in N...        20.0     HOURLY  \
1  At Aspen Therapy and Wellness , we are committ...        50.0     HOURLY   
2  The National Exemplar is accepting application...     65000.0     YEARLY   
3  Senior Associate Attorney - Elder Law / Trusts...    175000.0     YEARLY   
4  Looking for HVAC service tech 

In [4]:
# Keep only the columns we need: "title", "description", "skills_desc".
# .copy() makes the result an independent frame, so adding a column to it in a
# later cell does not trigger pandas' SettingWithCopyWarning.
df = df[["title", "description", "skills_desc"]].copy()
print(df.head())


                                               title   
0                              Marketing Coordinator  \
1                  Mental Health Therapist/Counselor   
2                        Assitant Restaurant Manager   
3  Senior Elder Law / Trusts and Estates Associat...   
4                                 Service Technician   

                                         description   
0  Job descriptionA leading real estate firm in N...  \
1  At Aspen Therapy and Wellness , we are committ...   
2  The National Exemplar is accepting application...   
3  Senior Associate Attorney - Elder Law / Trusts...   
4  Looking for HVAC service tech with experience ...   

                                         skills_desc  
0  Requirements: \n\nWe are seeking a College or ...  
1                                                NaN  
2  We are currently accepting resumes for FOH - A...  
3  This position requires a baseline understandin...  
4                                                Na

In [5]:
# Combine the description and skills_desc columns into a single text field.
# A space is inserted between the two parts so the last word of the description
# is not fused with the first word of the skills text.
# Note: concatenation propagates NaN, so a row missing either column ends up
# with NaN in the combined column.
df['combined_skills_desc'] = df['description'] + ' ' + df['skills_desc']
print(df.head())

                                               title   
0                              Marketing Coordinator  \
1                  Mental Health Therapist/Counselor   
2                        Assitant Restaurant Manager   
3  Senior Elder Law / Trusts and Estates Associat...   
4                                 Service Technician   

                                         description   
0  Job descriptionA leading real estate firm in N...  \
1  At Aspen Therapy and Wellness , we are committ...   
2  The National Exemplar is accepting application...   
3  Senior Associate Attorney - Elder Law / Trusts...   
4  Looking for HVAC service tech with experience ...   

                                         skills_desc   
0  Requirements: \n\nWe are seeking a College or ...  \
1                                                NaN   
2  We are currently accepting resumes for FOH - A...   
3  This position requires a baseline understandin...   
4                                             

In [6]:
# Write the current frame to a new CSV file, leaving out the index column.
df.to_csv(
    path_or_buf='data/Linkedin Job Postings (2023-2024)/postings_combined_desc.csv',
    index=False,
)

In [7]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how="any")
print(df.head())

                                               title   
0                              Marketing Coordinator  \
2                        Assitant Restaurant Manager   
3  Senior Elder Law / Trusts and Estates Associat...   
8                              Respiratory Therapist   
9                                     Worship Leader   

                                         description   
0  Job descriptionA leading real estate firm in N...  \
2  The National Exemplar is accepting application...   
3  Senior Associate Attorney - Elder Law / Trusts...   
8  At Children’s, the region’s only full-service ...   
9  It is an exciting time to be a part of our chu...   

                                         skills_desc   
0  Requirements: \n\nWe are seeking a College or ...  \
2  We are currently accepting resumes for FOH - A...   
3  This position requires a baseline understandin...   
8  • Requires the ability to communicate effectiv...   
9  Knowledge, Skills and Abilities: 1. Profici

In [8]:
# Drop the two source text columns; the combined column stays.
df = df.drop(['description', 'skills_desc'], axis=1)
print(df.head())

                                               title   
0                              Marketing Coordinator  \
2                        Assitant Restaurant Manager   
3  Senior Elder Law / Trusts and Estates Associat...   
8                              Respiratory Therapist   
9                                     Worship Leader   

                                combined_skills_desc  
0  Job descriptionA leading real estate firm in N...  
2  The National Exemplar is accepting application...  
3  Senior Associate Attorney - Elder Law / Trusts...  
8  At Children’s, the region’s only full-service ...  
9  It is an exciting time to be a part of our chu...  


In [9]:
# The model has to be installed before it can be loaded, e.g.:
# !python -m spacy download en_core_web_lg

# JSONL file holding the skill patterns that a later cell loads into the entity ruler.
skill_pattern_path = "skill_patterns.jsonl"

# Load the spaCy pipeline used for skill extraction.
nlp = spacy.load(name="en_core_web_lg")

# Test of the spaCy library

In [9]:
# import spacy
# from spacy.matcher import Matcher
# import json

# # Load the spaCy model
# nlp = spacy.load("en_core_web_lg")

# # Path to the JSONL skill pattern file
# skill_pattern_path = "jz_skill_patterns.jsonl"

# # Load skill patterns from the JSONL file
# with open(skill_pattern_path, "r") as file:
#     skill_patterns = [json.loads(line) for line in file]

# # Initialize the Matcher
# matcher = Matcher(nlp.vocab)

# # Add skill patterns to the Matcher
# for pattern in skill_patterns:
#     matcher.add(pattern["label"], [pattern["pattern"]])

# # Example resume text
# resume_text = """
#          HR ADMINISTRATOR/MARKETING ASSOCIATE

# HR ADMINISTRATOR       Summary     Dedicated Customer Service Manager with 15+ years of experience in Hospitality and Customer Service Management.   Respected builder and leader of customer-focused teams; strives to instill a shared, enthusiastic commitment to customer service.         Highlights         Focused on customer satisfaction  Team management  Marketing savvy  Conflict resolution techniques     Training and development  Skilled multi-tasker  Client relations specialist           Accomplishments      Missouri DOT Supervisor Training Certification  Certified by IHG in Customer Loyalty and Marketing by Segment   Hilton Worldwide General Manager Training Certification  Accomplished Trainer for cross server hospitality systems such as    Hilton OnQ  ,   Micros    Opera PMS   , Fidelio    OPERA    Reservation System (ORS) ,   Holidex    Completed courses and seminars in customer service, sales strategies, inventory control, loss prevention, safety, time management, leadership and performance assessment.        Experience      HR Administrator/Marketing Associate

# HR Administrator     Dec 2013   to   Current      Company Name   ï¼   City  ,   State     Helps to develop policies, directs and coordinates activities such as employment, compensation, labor relations, benefits, training, and employee services.  Prepares employee separation notices and related documentation  Keeps records of benefits plans participation such as insurance and pension plan, personnel transactions such as hires, promotions, transfers, performance reviews, and terminations, and employee statistics for government reporting.  Advises management in appropriate resolution of employee relations issues.  Administers benefits programs such as life, health, dental, insurance, pension plans, vacation, sick leave, leave of absence, and employee assistance.     Marketing Associate Â    Designed and created marketing collateral for sales meetings, trade shows and company executives.  Managed the in-house advertising program consisting of print and media collateral pieces.  Assisted in the complete design and launch of the company's website in 2 months.  Created an official company page on Facebook to facilitate interaction with customers.  Analyzed ratings and programming features of competitors to evaluate the effectiveness of marketing strategies.         Advanced Medical Claims Analyst     Mar 2012   to   Dec 2013      Company Name   ï¼   City  ,   State     Reviewed medical bills for the accuracy of the treatments, tests, and hospital stays prior to sanctioning the claims.  Trained to interpret the codes (ICD-9, CPT) and terminology commonly used in medical billing to fully understand the paperwork that is submitted by healthcare providers.  Required to have organizational and analytical skills as well as computer skills, knowledge of medical terminology and procedures, statistics, billing standards, data analysis and laws regarding medical billing.         
Assistant General Manager     Jun 2010   to   Dec 2010      Company Name   ï¼   City  ,   State     Performed duties including but not limited to, budgeting and financial management, accounting, human resources, payroll and purchasing.  Established and maintained close working relationships with all departments of the hotel to ensure maximum operation, productivity, morale and guest service.  Handled daily operations and reported directly to the corporate office.  Hired and trained staff on overall objectives and goals with an emphasis on high customer service.  Marketing and Advertising, working on public relations with the media, government and local businesses and Chamber of Commerce.         Executive Support / Marketing Assistant     Jul 2007   to   Jun 2010      Company Name   ï¼   City  ,   State     Provided assistance to various department heads - Executive, Marketing, Customer Service, Human Resources.  Managed front-end operations to ensure friendly and efficient transactions.  Ensured the swift resolution of customer issues to preserve customer loyalty while complying with company policies.  Exemplified the second-to-none customer service delivery in all interactions with customers and potential clients.         Reservation & Front Office Manager     Jun 2004   to   Jul 2007      Company Name   ï¼   City  ,   State          Owner/ Partner     Dec 2001   to   May 2004      Company Name   ï¼   City  ,   State          Price Integrity Coordinator     Aug 1999   to   Dec 2001      Company Name   ï¼   City  ,   State          Education      N/A  ,   Business Administration   1999     Jefferson College   ï¼   City  ,   State       Business Administration  Marketing / Advertising         High School Diploma  ,   College Prep. 
studies   1998     Sainte Genevieve Senior High   ï¼   City  ,   State       Awarded American Shrubel Leadership Scholarship to Jefferson College         Skills     Accounting, ads, advertising, analytical skills, benefits, billing, budgeting, clients, Customer Service, data analysis, delivery, documentation, employee relations, financial management, government relations, Human Resources, insurance, labor relations, layout, Marketing, marketing collateral, medical billing, medical terminology, office, organizational, payroll, performance reviews, personnel, policies, posters, presentations, public relations, purchasing, reporting, statistics, website.    
# """

# # Preprocess the text
# resume_text = re.sub(r"\s+", " ", resume_text)
# resume_text = resume_text.strip()
# # To lowercase
# resume_text = resume_text.lower()

# # Process the text
# doc = nlp(resume_text)

# # Apply the matcher
# matches = matcher(doc)

# # Extract matched skills
# # extracted_skills = [doc[start:end].text for match_id, start, end in matches]
# # print("Extracted Skills:", extracted_skills)

# # Extract matched skills without duplicates
# extracted_skills = list(set([doc[start:end].text for match_id, start, end in matches]))
# print("Extracted Skills:", extracted_skills)

In [10]:
# Attach the rule-based entity ruler and load the skill patterns into it.
# Reuse the component when it is already in the pipeline, so re-running this
# cell does not fail because a pipe named "entity_ruler" already exists.
if "entity_ruler" in nlp.pipe_names:
    ruler = nlp.get_pipe("entity_ruler")
else:
    ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_pattern_path)
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

We define two Python functions: `get_skills` extracts every skill mentioned in a piece of text and returns them as a list, and `unique_skills` removes duplicates from that list. Later we apply both functions to each job posting in the dataset to create a new feature called `skills`, which will help us visualize trends and patterns within the dataset.

In [11]:
def get_skills(text):
    """Return the text of every entity labelled ``SKILL`` found in ``text``.

    The text is run through the notebook-level ``nlp`` pipeline. Matches are
    returned in the order they appear in ``doc.ents``, duplicates included.

    Args:
        text: The string to analyse.

    Returns:
        list[str]: The matched skill strings; empty when nothing matches.
    """
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == "SKILL"]


def unique_skills(x):
    """Return the distinct items of ``x`` as a list, in first-seen order.

    Keeping first-occurrence order makes the result the same on every run; going
    through a ``set`` gives an order that can change between interpreter sessions.

    Args:
        x: An iterable of hashable items (e.g. a list of skill strings).

    Returns:
        list: Each distinct item of ``x`` exactly once.
    """
    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(x))

# Cleaning Job Description Text using NLTK

In [12]:
print(df.columns)

# Work on a copy so the cleaning steps below do not touch `df`.
data = df.copy()

# Everything this pattern matches is replaced by a space:
#   @mentions, any character that is not a letter/digit/space/tab,
#   scheme://... URLs, a leading "rt", and text from "http" up to the next double quote.
# Raw string so the regex escapes (\w, \S, \/) are not read as Python string escapes.
noise_pattern = re.compile(r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"')

# Build the stop-word set and the lemmatizer once, instead of once per word / per row.
stop_words = set(stopwords.words("english"))
lm = WordNetLemmatizer()

clean = []
for raw_text in data["combined_skills_desc"]:
    # Strip noise, lowercase, and split on whitespace.
    words = noise_pattern.sub(" ", raw_text).lower().split()
    # Drop stop words, lemmatize what is left, and join back into one string.
    clean.append(" ".join(lm.lemmatize(word) for word in words if word not in stop_words))

Index(['title', 'combined_skills_desc'], dtype='object')


In [13]:
# Preview the frame before the cleaned text is attached.
print(df.head())

# Add the cleaned text as a new column, then export the frame as CSV (index omitted).
df["cleaned_JD"] = clean
df.to_csv(
    path_or_buf="data/Linkedin Job Postings (2023-2024)/cleaned_JD.csv",
    index=False,
)

                                               title   
0                              Marketing Coordinator  \
2                        Assitant Restaurant Manager   
3  Senior Elder Law / Trusts and Estates Associat...   
8                              Respiratory Therapist   
9                                     Worship Leader   

                                combined_skills_desc  
0  Job descriptionA leading real estate firm in N...  
2  The National Exemplar is accepting application...  
3  Senior Associate Attorney - Elder Law / Trusts...  
8  At Children’s, the region’s only full-service ...  
9  It is an exciting time to be a part of our chu...  


In [15]:
# Store the cleaned text on `data`, then build the per-row skill list in one chain:
# lowercase -> extract SKILL entities -> remove duplicates.
data["Clean_JD"] = clean
data["skills"] = (
    data["Clean_JD"]
    .str.lower()
    .apply(get_skills)
    .apply(unique_skills)
)
data.head()

Unnamed: 0,title,combined_skills_desc,Clean_JD,skills
0,Marketing Coordinator,Job descriptionA leading real estate firm in N...,job descriptiona leading real estate firm new ...,"[medium, advertising, graphic design, email ma..."
2,Assitant Restaurant Manager,The National Exemplar is accepting application...,national exemplar accepting application assist...,"[teamwork, organization, customer service]"
3,Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,senior associate attorney elder law trust esta...,"[search engine, analytics, marketing, problem ..."
8,Respiratory Therapist,"At Children’s, the region’s only full-service ...",child region full service pediatric healthcare...,"[documentation, support, schedule, professiona..."
9,Worship Leader,It is an exciting time to be a part of our chu...,exciting time part church looking right energe...,"[organization, medium, graphic design, twitter..."


In [36]:
# Start from an independent copy of `data`.
df = data.copy()

# List the columns before reshaping.
print(df.columns)

# Remove the combined_skills_desc column and rename Clean_JD to "text".
df = df.drop("combined_skills_desc", axis=1).rename({"Clean_JD": "text"}, axis="columns")
print(df.head())



Index(['title', 'combined_skills_desc', 'Clean_JD', 'skills'], dtype='object')
                                               title   
0                              Marketing Coordinator  \
2                        Assitant Restaurant Manager   
3  Senior Elder Law / Trusts and Estates Associat...   
8                              Respiratory Therapist   
9                                     Worship Leader   

                                                text   
0  job descriptiona leading real estate firm new ...  \
2  national exemplar accepting application assist...   
3  senior associate attorney elder law trust esta...   
8  child region full service pediatric healthcare...   
9  exciting time part church looking right energe...   

                                              skills  
0  [medium, advertising, graphic design, email ma...  
2         [teamwork, organization, customer service]  
3  [search engine, analytics, marketing, problem ...  
8  [documentation, support,

In [40]:
# Export the final frame to a new CSV file, leaving out the index column.
df.to_csv(
    path_or_buf="data/Linkedin Job Postings (2023-2024)/jd_final.csv",
    index=False,
)

: 