In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import nltk
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
import string

## Loading the Occupation-Education Dataset

In [2]:
occ_edu_df = pd.read_csv("data/occupation_education.csv", delimiter="|")
occ_edu_df.shape

(2210, 2)

## Loading the Occupation-Skillset Dataset

In [3]:
occ_skill_df = pd.read_csv("data/occupation_skill.csv", delimiter="|")
occ_skill_df.head()

Unnamed: 0,OccupationCode,Skill
0,11-3012.00,Learning Strategies
1,11-3031.03,Mathematics
2,11-3111.00,Instructing
3,11-9032.00,Reading Comprehension
4,11-9121.00,Science


## Loading the Occupation-Experience Dataset

In [4]:
occ_exp_df = pd.read_csv("data/occupation_experience.csv", delimiter="|")
occ_exp_df.head()

Unnamed: 0,OccupationCode,ExperienceLevel
0,11-1011.00,"Over 1 year, up to and including 2 years"
1,11-1011.00,Over 10 years
2,11-1011.00,"Over 2 years, up to and including 4 years"
3,11-1011.00,"Over 4 years, up to and including 6 years"
4,11-1011.00,"Over 6 years, up to and including 8 years"


## Loading the Occupation Dataset

In [5]:
occ_df = pd.read_csv("data/occupation.csv", delimiter="|")
occ_df.head()

Unnamed: 0,OccupationCode,OccupationTitle,OccupationDescription
0,11-1011.00,Chief Executives,Determine and formulate policies and provide o...
1,11-1011.03,Chief Sustainability Officers,"Communicate and coordinate with management, sh..."
2,11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ..."
3,11-1031.00,Legislators,"Develop, introduce, or enact laws and statutes..."
4,11-2011.00,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici..."


In [6]:
# Removing duplicates in the OccupationTitle column
occ_df = occ_df.drop_duplicates(subset=["OccupationTitle"])
print(occ_df.shape[0])

1016


## Merge the Occupation and Occupation-Skillset DataFrames

In [7]:
jobs = occ_df.merge(occ_edu_df, on="OccupationCode")
jobs = jobs.merge(occ_skill_df, on="OccupationCode")
jobs = jobs.merge(occ_exp_df, on="OccupationCode")
jobs = jobs.rename(columns={"OccupationCode": "JobCode",
                            "OccupationTitle": "Title",
                            "OccupationDescription": "Description"})
jobs.head()

Unnamed: 0,JobCode,Title,Description,EducationLevel,Skill,ExperienceLevel
0,11-1011.00,Chief Executives,Determine and formulate policies and provide o...,Bachelor's Degree,Technology Design,"Over 1 year, up to and including 2 years"
1,11-1011.00,Chief Executives,Determine and formulate policies and provide o...,Bachelor's Degree,Technology Design,Over 10 years
2,11-1011.00,Chief Executives,Determine and formulate policies and provide o...,Bachelor's Degree,Technology Design,"Over 2 years, up to and including 4 years"
3,11-1011.00,Chief Executives,Determine and formulate policies and provide o...,Bachelor's Degree,Technology Design,"Over 4 years, up to and including 6 years"
4,11-1011.00,Chief Executives,Determine and formulate policies and provide o...,Bachelor's Degree,Technology Design,"Over 6 years, up to and including 8 years"


In [8]:
# The INPUT: user's information
user_profile = {
    "skills": "Wilhelm Publishing Threshold",
    "education": "Associate's Degree",
    "experience": "3 years of experience"
}

# Convert the above object into a DataFrame
user_df = pd.DataFrame([user_profile])

In [9]:
# Function to perform tokenization, stop word removal, and
# stemming/lemmatization
def tokenize(text):
    tokens = text.split() # simple tokenization by splitting text on spaces
    tokens = [token.lower() for token in tokens if token.isalpha()] # remove non-alphabetic tokens and convert to lowercase
    return tokens

In [10]:
# Vectorize the text features using TF-IDF
tfidf = TfidfVectorizer(stop_words="english")

# Concatenate 'Title' and 'Description' columns for each job
corpus = jobs["Skill"] + " " + \
            jobs["Description"] + " " + \
            jobs["EducationLevel"]
tfidf_matrix = tfidf.fit_transform(corpus)

# Vectorize user's skills
user_data = user_df["skills"] + " " + \
            user_df["education"] + " " + \
            user_df["experience"]
user_data_tokens = tokenize(user_data[0])
user_data = " ".join(user_data_tokens)
user_skill_vector = tfidf.transform([user_data])

# Choose top k-related jobs
k = 5

# Calculate the cosine similarity between the user and each job based on
# the text features
cosine_similarity_indices = cosine_similarity(
                                user_skill_vector, \
                                tfidf_matrix).argsort()[0][::-1]
recommended_titles = jobs.iloc[cosine_similarity_indices]["Title"] \
                        .drop_duplicates()
recommended_jobs = []
for i in range(0, k):
    recommended_jobs.append(recommended_titles.iloc[i])
print(recommended_jobs)

['Plumbers, Pipefitters, and Steamfitters', 'Carpenters', 'Drywall and Ceiling Tile Installers', 'Advertising and Promotions Managers', 'Compliance Managers']


In [21]:
occ_postings_df = pd.read_csv("data/occupation_jobposting.csv",
                              delimiter="|",
                             encoding="cp1252")
occ_postings_df.head()

Unnamed: 0,OccupationCode,JobTitle,JobPostingLink
0,25-9049.00,Instructor's Assistant,https://www.glassdoor.ca/partner/jobListing.ht...
1,29-1213.00,Brow Expert Beautician Estheticians,https://www.glassdoor.ca/partner/jobListing.ht...
2,53-7051.00,AZ / Class 1 Owner Operator Long Haul,https://www.glassdoor.ca/partner/jobListing.ht...
3,31-9091.00,"Associate Dentist - Kallanpally Dental, Slave ...",https://www.glassdoor.ca/partner/jobListing.ht...
4,43-6014.00,Executive Administrative Assistant,https://www.glassdoor.ca/partner/jobListing.ht...


In [25]:
job_recommended_list = {"jobs": []}
for job in recommended_jobs:
    job_title_postings = {}
    job_title_postings["title"] = job
    job_title_postings["url"] = []
    for i in range(occ_postings_df.shape[0]):
        if occ_postings_df.iloc[i, 1].lower() in job.lower():
            job_title_postings["url"].append(occ_postings_df.iloc[i, 2])
    job_recommended_list["jobs"].append(job_title_postings)

print(job_recommended_list)


{'jobs': [{'title': 'Plumbers, Pipefitters, and Steamfitters', 'url': ['https://www.glassdoor.com/job-listing/pipefitter-stc-industrial-group-JV_KO0,10_KE11,31.htm?jl=1008459218076&pos=102&ao=1110586&s=58&guid=000001872c2ba5939d7f2ac1d4a2fef1&src=GD_JOB_AD&t=SR&vt=w&uido=A63304B0593C1B92ACDF619295BA197D&ea=1&cs=1_82a19d4b&cb=1680073271021&jobListingId=1008459218076&cpc=451933188B21919D&jrtk=3-0-1gsm2n9dlkhq7801-1gsm2n9f5h7je801-760538a7cef9a07f--6NYlbfkN0BSj-91pRXoy0lNcNWaeAUDdWkq-KtJsWcP0fl2cdJD4kg_C1JmXuuJfVula4ExGZy85kCb-btkDzI33suV9LwoiCKfJz7G6mNNtSdtDZsbBOqBkS79SuzMq0QRDcQhG3oBs87JowPmawyRos8zHVYRgsrka5p-X8k4Pv00b-2Zf-g4C31jFIPckZMdg8B3eRGGxXqRrZjoM_q89c32XU-n5LoD2J_Kpnkqrg-WYczxLykDWWZ37IqlM4r_OhDrmRucNJ8YL_fdy-__WAcDdQ2c8HA6Rj42zm85eQePqEbesenrMbglThbhwZuqMnQy7ZK5x7MiX9n3ppYNrKpb46pAQQ358jaXple4i4wEVOtYyToLyJjptMAmp7c7b1ghfHL2uNuVCWyjTXK73vgzpZ6dywoPv4dh1bh3Wc9ydyyYF7JuPcpAZCtiz_9CmxamI6gmRni4I1yLnEy_ERGXXYHhp5q925pRf0NcJe-p4GYKzMeQpyfdroAe-Y_Jal_TU08OfEmIjMc0rqYCNxSi5uQakFPSYwL