In [1]:
import pandas as pd
import numpy as np
import re
import math
import spacy
from tqdm import tqdm
from gensim.models.phrases import Phrases, Phraser
from collections import defaultdict
import multiprocessing
from gensim.models import Word2Vec, KeyedVectors
from time import time
import matplotlib.pyplot as plt
import os
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string

In [2]:
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')

In [3]:
stop_words = set(stopwords.words('english'))

# Assembling Corpus

In [4]:
# If you do not want to build word embeddings based on the sentences of a particular dataset, set it to False
TO_READ = {
    "jobs" : True,
    "modules" : True,
    "lightcast" : True,
    "stackof" : True,
    "github" : True,
    "chatgpt" : True
}

In [5]:
sentences = [] # Stores all sentences to be fed into Gensim's Word2Vec model

### Gathering Jobs Datasets

In [6]:
JOB_READ = "../../Data/jobs/"
MODULE_READ = "../../../Data/university_courses/"
LIGHTCAST_READ = "../../Data/skills/"
STACKOF_READ = "../../Data/NER_annotated_data/StackOverflow/"
GH_READ = "../../Data/NER_annotated_data/GitHub/"
CHATGPT_READ = "../../Data/NER_annotated_data/ChatGPT/"

WRITE_PATH = "./Models/"

In [7]:
if TO_READ["jobs"]:
    sentences.extend(pd.read_csv(JOB_READ + "mycareersfuture_query-data_science.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "mycareersfuture_query-data_engineer.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "mycareersfuture_query-data_analyst.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "mycareersfuture_query-machine_learning_engineer.csv")['Description'].values.tolist())

    sentences.extend(pd.read_csv(JOB_READ + "jobstreet_query-machine_learning_engineer.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "jobstreet_query-machine_learning_engineer.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "jobstreet_query-machine_learning_engineer.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "jobstreet_query-machine_learning_engineer.csv")['Description'].values.tolist())

In [8]:
# sentences contains all the descriptions in the four scraped datasets
print(len(sentences))
sentences[-1]

1258


"Why Work for Us We Power the Nation. Make the most of your talents and develop products that can create impact on a national scale. We are an in-house software team, assembled to move with speed and deliver with quality. We Build Reliable Solutions. For Customers, Company and Country. You will be part of the Digital Technology Team and together, you will innovate, create, and deploy digital products that will empower more than 3,800 employees within SP Group and improve the quality of life for the 1.6 million commercial, industrial and residential customers that SP Group serves. We build solutions that enable sustainable high quality lifestyles and help consumers save energy and cost, as well as supporting national goals for a sustainable livable city. Now, imagine the impact you can create. What You’ll Do: Create and maintain multiple robust and high-performance data processing pipeline within Cloud, Private Data Centre and Hybrid data ecosystem Assemble large, complex data sets from

### Gathering Module Datasets

In [9]:
UNI_MODDESC_MAPPING = {
    "nus_dsa_mods.xlsx" : "mod_desc",
    "NTU_course_info.csv" : "Course Aims",
    "SMU_course_info.csv" : "Description",
    "SUSS_course_info.csv" : "module description",
    "SUTD_course_info.csv" : "Module description",
    "SIT_Module_Info.csv" : "Module Description "
}

SKIP_ROWS = {
    "nus_dsa_mods.xlsx" : 0,
    "NTU_course_info.csv" : 0,
    "SMU_course_info.csv" : 1,
    "SUSS_course_info.csv" : 0,
    "SUTD_course_info.csv" : 5,
    "SIT_Module_Info.csv" : 0
}

In [10]:
mod_descriptions = pd.Series([], dtype='object')
for uni, description_col in UNI_MODDESC_MAPPING.items():
    print(f"Gathering module descriptions from {uni}")
    try:
        table = pd.read_excel(MODULE_READ + uni, skiprows=SKIP_ROWS[uni])
    except:
        table = pd.read_csv(MODULE_READ + uni, skiprows=SKIP_ROWS[uni], encoding_errors='ignore')
    
    table_desc = table[UNI_MODDESC_MAPPING[uni]].dropna().reset_index(drop=True)
    mod_descriptions = pd.concat([mod_descriptions, table_desc]).reset_index(drop=True)
    
display(mod_descriptions)

Gathering module descriptions from nus_dsa_mods.xlsx
Gathering module descriptions from NTU_course_info.csv
Gathering module descriptions from SMU_course_info.csv
Gathering module descriptions from SUSS_course_info.csv
Gathering module descriptions from SUTD_course_info.csv
Gathering module descriptions from SIT_Module_Info.csv


0      This module introduces the fundamental concept...
1      The abundance of data being harvested from var...
2      This module is a first course in linear algebr...
3      This is a course in single-variable calculus. ...
4      This module introduces students to the design ...
                             ...                        
173    Students will be grouped into teams of 5-6 and...
174    To keep up-to-date with the advances in techno...
175    Students will undertake an eight-month Integra...
176    This is a major individual project that is to ...
177    This module will endow students with the under...
Length: 178, dtype: object

In [11]:
if TO_READ['modules']:
    for description in mod_descriptions:
        sentences.append(description)

In [12]:
print(len(sentences))
sentences[-1]

1436


'This module will endow students with the understanding of the new challenges big data introduces, in particular in the area of IoT and the currently available solutions. These include (i) challenges pertaining to the modelling, accessing, and storing of big data, (ii) an understanding of the fundamentals of systems designed to store and access big data, (iii) programming paradigms for efficient scalable access to big data, and (iv) data processing methodology to facilitate big data analytics. The module will have a particular emphasis on the impact of the desiderata of scalability and efficiency in big data infrastructures, and expose students with a number of different cloud-based NoSQL systems and their design and implementation details, showing how they can achieve efficiency and scalability. '

### Gathering Lightcast Datasets

In [13]:
file_path = "lightcast_skills_queries-data_analysis_machine learning_ML_statistic.csv"

In [14]:
skills = pd.read_csv(LIGHTCAST_READ + file_path)
skill_descriptions = skills['Skill_Description']

In [15]:
if TO_READ['lightcast']:
    for description in skill_descriptions:
        sentences.append(description)

In [16]:
print(len(sentences))
sentences[-1]

1923


'The National Vital Statistics System (NVSS) is an inter-governmental system of sharing data on the vital statistics of the population of the United States. It involves coordination between the different state health departments of the US states and the National Center for Health Statistics, a division of the Centers for Disease Control and Prevention.'

### Gathering StackOverflow and GitHub Datasets

In [17]:
stack_of_txt_files = [filename for filename in os.listdir(STACKOF_READ) if ".txt" in filename and "2" not in filename]
stack_of_txt_files

['dev.txt', 'test.txt', 'train.txt']

In [18]:
if TO_READ['stackof']:
    for dataset in stack_of_txt_files:
        with open(f"./../../Data/NER_annotated_data/StackOverflow/{dataset}", "r", encoding = "utf-8") as f:
            sentence = ""
            for line in f:
                if line == "\n":
                    sentences.append(sentence[1:])
                    sentence = ""
                else:
                    word, man_label, comp_label = line.split("\t")
                    word = word.strip()
                    if word in string.punctuation:
                        sentence += word
                    else:
                        sentence += " " + word

In [19]:
if TO_READ['github']:
    github_filename = "gh_test.txt"
    with open(GH_READ + github_filename, "r", encoding = "utf-8") as f:
        sentence = ""
        for line in f:
            if line == "\n":
                sentences.append(sentence[1:])
                sentence = ""
            else:
                word, man_label, comp_label = line.split("\t")
                word = word.strip()
                if word in string.punctuation:
                    sentence += word
                else:
                    sentence += " " + word

In [20]:
print(len(sentences))
sentences[-1]

23485


'Change-type: patch Signed-off-by: Theodor Gherzan theodor@resin.io'

### Gathering ChatGPT Dataset

In [21]:
if TO_READ['chatgpt']:
    chatgpt_filename = "chatgpt_sentences.txt"
    with open(CHATGPT_READ + chatgpt_filename, "r", encoding = 'utf-8') as f:
        for line in f:
            sentences.append(line)

In [22]:
print(len(sentences))
sentences[-1]

23735


'Familiarity with agile software development methodologies'

### Data Cleaning

In [23]:
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

In [24]:
def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

In [25]:
brief_cleaning = [re.sub(r'http\S+', ' ', row) for row in sentences] # Removes all websites from text
brief_cleaning = [re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in brief_cleaning] # Retains only alphabets in text

### Lemmatization

In [26]:
lemmatizer = WordNetLemmatizer()

In [27]:
brief_cleaning = [' '.join([lemmatizer.lemmatize(word) for word in sentence.split(" ")]) for sentence in brief_cleaning]
brief_cleaning

["instruction for interested applicant what you will do we are looking for a data scientist a part of our machine learning team the ideal candidate will leverage strong collaboration skill and ability to extract valuable insight from highly complex medical insurance data set to ask the right question and find the right answer you will have great opportunity to work with data scientist to understand and learn about how we can leverage ai ml in the health insurance medical field to detect fraud waste improve automation efficiency promote vitality duty and responsibility analyze raw data assessing quality cleansing structuring for downstream processing be heavily involved to bring analytical prototype to production with the data engineering dev ops team become a subject matter expert in the health insurance domain generate actionable insight for business improvement help to develop customizable report production ready dashboard for client requirement bachelor's degree or equivalent experi

### Removing Punctuations

In [28]:
regular_punct = list(string.punctuation)
def remove_punctuation(text, punct_list):
    for punc in punct_list:
        if punc in text:
            text = text.replace(punc, ' ')
    return text.strip()

brief_cleaning = [remove_punctuation(sentence, regular_punct) for sentence in brief_cleaning]

### Further cleaning

In [29]:
txt = []
for doc in tqdm(nlp.pipe(brief_cleaning, batch_size=5000)):
    cleaned_doc = cleaning(doc)
    if cleaned_doc:
        txt.append(cleaned_doc)
sent = [row.split() for row in txt]
sent

23735it [00:38, 616.96it/s] 


[['instruction',
  'interested',
  'applicant',
  'look',
  'data',
  'scientist',
  'machine',
  'learning',
  'team',
  'ideal',
  'candidate',
  'leverage',
  'strong',
  'collaboration',
  'skill',
  'ability',
  'extract',
  'valuable',
  'insight',
  'highly',
  'complex',
  'medical',
  'insurance',
  'datum',
  'set',
  'ask',
  'right',
  'question',
  'find',
  'right',
  'answer',
  'great',
  'opportunity',
  'work',
  'datum',
  'scientist',
  'understand',
  'learn',
  'leverage',
  'ai',
  'ml',
  'health',
  'insurance',
  'medical',
  'field',
  'detect',
  'fraud',
  'waste',
  'improve',
  'automation',
  'efficiency',
  'promote',
  'vitality',
  'duty',
  'responsibility',
  'analyze',
  'raw',
  'datum',
  'assess',
  'quality',
  'cleansing',
  'structure',
  'downstream',
  'processing',
  'heavily',
  'involved',
  'bring',
  'analytical',
  'prototype',
  'production',
  'data',
  'engineering',
  'dev',
  'op',
  'team',
  'subject',
  'matter',
  'expert',
 

### Generating Bigrams - Do not use because it tends to produce worse results

In [30]:
# MIN_BIGRAM_COUNT = 100
# phrases = Phrases(sent, min_count=MIN_BIGRAM_COUNT, progress_per=10000)
# sentences = phrases[sent]

# Training Model

In [31]:
cores = multiprocessing.cpu_count()

In [32]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1,
                     vector_size=300)

In [33]:
start_time = time()
w2v_model.build_vocab(sent, progress_per=10000)
print(f'Time Elapsed: {time() - start_time} seconds')

Time Elapsed: 0.12141680717468262 seconds


In [34]:
start_time = time()
w2v_model.train(sent, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
print(f'Time Elapsed: {time() - start_time} seconds')

Time Elapsed: 7.0149900913238525 seconds


In [35]:
query = "python"
print(f"Words similar to {query}")
print(w2v_model.wv.most_similar(positive=[query]))
print()

query = "visualization"
print(f"Words similar to {query}")
print(w2v_model.wv.most_similar(positive=[query]))
print()

query = "warehouse"
print(f"Words similar to {query}")
print(w2v_model.wv.most_similar(positive=[query]))
print()

query = "singapore"
print(f"Words similar to {query}")
print(w2v_model.wv.most_similar(positive=[query]))

Words similar to python
[('java', 0.8467201590538025), ('scala', 0.8154768943786621), ('proficient', 0.7810153961181641), ('golang', 0.7653834819793701), ('c', 0.7616414427757263), ('r', 0.7551167011260986), ('sql', 0.7467094659805298), ('scripting', 0.7231290936470032), ('programming', 0.7029889822006226), ('language', 0.7024306058883667)]

Words similar to visualization
[('tableau', 0.8576967120170593), ('visualisation', 0.7869049906730652), ('powerbi', 0.7643397450447083), ('bi', 0.7302674055099487), ('qlik', 0.6432563066482544), ('modelling', 0.6265978217124939), ('dashboard', 0.6265103220939636), ('advanced', 0.6262879371643066), ('statistical', 0.6202511787414551), ('sa', 0.6181435585021973)]

Words similar to warehouse
[('lake', 0.8457064032554626), ('etl', 0.7750259637832642), ('ingestion', 0.7697190642356873), ('warehousing', 0.7190618515014648), ('snowflake', 0.6951940655708313), ('premise', 0.6771638989448547), ('informatica', 0.668594241142273), ('big', 0.6661220788955688),

In [36]:
print(w2v_model.wv.similarity("python", "c"))
print(w2v_model.wv.similarity("python", "singapore"))
print(w2v_model.wv.similarity("data", "visualization"))

0.76164144
-0.14092433
0.43733197


In [37]:
# Getting all words in the model's vocabulary
for k, v in w2v_model.wv.key_to_index.items():
    print(k)

datum
experience
tiktok
team
machine
work
system
s
learn
data
business
model
build
include
skill
user
algorithm
e
develop
recommendation
create
new
product
learning
platform
solution
problem
use
computer
science
code
tool
risk
lead
need
apply
project
inspire
good
technology
responsibility
content
analysis
strong
ha
t
like
search
n
application
value
qualification
bring
provide
etc
creativity
design
mission
file
support
improve
joy
understand
technical
c
form
video
commerce
information
analytic
related
engineering
process
strategy
want
global
service
solve
area
doe
year
python
software
mining
development
database
management
communication
m
technique
understanding
degree
try
programming
add
set
relate
field
short
opportunity
method
ability
high
people
deep
requirement
time
sql
u
environment
large
change
language
destination
singapore
mobile
job
structure
look
scale
knowledge
run
drive
d
g
way
function
end
state
discipline
quality
performance
follow
industry
optimize
office
implement
goal


In [38]:
# Viewing all bigrams
for word, ind in w2v_model.wv.key_to_index.items():
    if "_" in word:
        print(word)

In [40]:
# Viewing all bigrams with "data" in it
# printed = []
# for i in range(len(phrases[sent])):
#     for word in phrases[sent][i]:
#         if "data" in word and "_" in word and word not in printed:
#             print(word)
#             printed.append(word)

In [41]:
# Saving Word Vectors into human-readable format (non-binary)
# word_vectors = w2v_model.wv
# word_vectors.save_word2vec_format(WRITE_PATH + "w2v.wordvectors")

In [42]:
# Saving Word Vectors into spaCy format (binary)
# !python -m spacy init vectors en "./Models/w2v.wordvectors" "./Models/"

In [43]:
# Loading model
# w2v_model = KeyedVectors.load_word2vec_format(WRITE_PATH + "w2v.wordvectors", binary = False)

## Link to Trained spaCy Model

In [44]:
import spacy
from spacy import displacy

In [45]:
nlp_ner = spacy.load("model-best")

In [46]:
doc = nlp_ner(
'''
In statistics, exploratory data analysis is an approach to analyzing data sets to summarize their main characteristics, often using statistical graphics and other data visualization methods. A statistical model can be used or not, but primarily EDA is for seeing what the data can tell us beyond the formal modeling or hypothesis testing task. Exploratory data analysis was promoted by John Tukey to encourage statisticians to explore the data, and possibly formulate hypotheses that could lead to new data collection and experiments. EDA is different from initial data analysis (IDA), which focuses more narrowly on checking assumptions required for model fitting and hypothesis testing, and handling missing values and making transformations of variables as needed. EDA encompasses IDA.
'''
)
spacy.displacy.render(doc, style = 'ent', jupyter=True)

In [47]:
doc.ents

(statistics,
 exploratory data analysis,
 often using statistical graphics,
 data visualization,
 modeling,
 hypothesis testing,
 Exploratory data analysis,
 possibly,
 hypotheses,
 data analysis,
 hypothesis testing,
 handling missing values)

In [48]:
w2v_model.wv.similarity("math", "statistic")

0.7972245

In [49]:
def get_score(job_desc, mod_desc, verbose = 1):
    job_desc = nlp_ner(job_desc)
    mod_desc = nlp_ner(mod_desc)
    scores = []
    OOV = [] # Stores out of vocabulary words
    
    for job_ents in job_desc.ents:
        job_ents = job_ents.text.lower() # Get the string value, rather than a spaCy SPAN object
        job_ents = lemmatizer.lemmatize(job_ents)
        job_ents = remove_punctuation(job_ents, regular_punct)
        for job_ent in job_ents.split():
            if job_ent not in w2v_model.wv: # If job_ent not found in vocabulary
                if job_ent not in OOV and job_ent not in stop_words:
                    OOV.append(job_ent)
                    if verbose:
                        print(f"JOB: {job_ent} not found in vocabulary")
                scores.append(0)
                continue
            max_cossim = -1
            best_mod_ent = None
            for mod_ents in mod_desc.ents:
                mod_ents = mod_ents.text.lower() # Get the string value, rather than a spaCy SPAN object
                mod_ents = lemmatizer.lemmatize(mod_ents)
                mod_ents = remove_punctuation(mod_ents, regular_punct)
                for mod_ent in mod_ents.split():
                    if mod_ent not in w2v_model.wv:
                        if mod_ent not in stop_words and mod_ent not in OOV:
                            OOV.append(mod_ent)
                            if verbose:
                                print(f"MODULE: {mod_ent} not found in vocabulary")
                    else:
                        cos_sim = w2v_model.wv.similarity(job_ent, mod_ent)
                        if cos_sim >= max_cossim:
                            max_cossim = cos_sim
                            best_mod_ent = mod_ent
                if best_mod_ent == None:
                    if verbose:
                        print(f"No matching skills found for {job_ent} in {mod_desc}")
                    scores.append(0)
                else:
                    score = (math.pi - math.acos(max_cossim)) * 100 / math.pi
                    scores.append(score)

    return np.mean(np.array(scores))

In [50]:
job_desc = """
What a College Intern - Data Science does at HP:
Attached to the "Smart Manufacturing Application and Research Center".
Work with an enterprising team of data scientists and build solutions to track, analyze and visualize the manufacturing and outbound quality of our supplies.
Generate deep insights through the analysis of data and understanding of operational processes and turn them into actionable recommendations.
Develop methodologies for optimizing our business processes through data visualization, real-time monitoring, predictive analytics etc.
Are you a high-performer? We are looking for an individual with:
Studying Bachelor’s degree in Computer Science, Business Analytics, Information Systems, Industrial Engineering, Statistics with good experience in programming.
Excellent analytical thinking, programming (using R/Python is desirable), and problem-solving skills.
Knowledge of data analytics, data warehousing, database management (preferably using SQL) and data visualization using RShiny and Plotly.
Fundamental knowledge of statistics and probability.
Good visualization skills to create real-time dashboards and/or reports to inform trends and insights.
    """

In [51]:
mod_desc1 = """
This module covers common algorithmic techniques for solving optimisation problems, and introduces students to approaches for finding good-enough solutions to NP-hard problems. Topics covered include linear and integer programming, network flow algorithms, local search heuristics, approximation algorithms, and randomized algorithms. Through analysis and application of the techniques to a variety of canonical problems, students develop confidence to (i) appropriately model a given optimisation problem, (ii) apply appropriate algorithmic techniques to solve the problem, (iii) analyse the properties of the problem and candidate algorithms, such as time and space complexity, convergence, approximability, and optimality bound.
"""

In [52]:
mod_desc2 = """
Data visualisation is an essential tool for data analytics. This module is an introduction to data cleaning, exploration, analysis and visualisation. Students will learn how to take raw data, extract meaningful information, use statistical tools, and make visualisations. Topics include: programming in R, introduction to data storage systems, data manipulation, exploratory data analysis, dimension reduction, statistical graphics for univariate, multivariate (high-dimensional), temporal and spatial data, basic design principles and critical evaluation of visual displays of data.
"""

In [53]:
print(get_score(job_desc, mod_desc1, verbose = 0))
print(get_score(job_desc, mod_desc2, verbose = 0))

62.17018107466963
69.46593781468314
