In [1]:
import pandas as pd
import numpy as np
import re
import spacy
from tqdm import tqdm
from gensim.models.phrases import Phrases, Phraser
from collections import defaultdict
import multiprocessing
from gensim.models import Word2Vec, KeyedVectors
from time import time
import matplotlib.pyplot as plt
import os
import string

# Assembling Corpus

In [2]:
# If you do not want to build word embeddings based on the sentences of a particular dataset, set it to False
TO_READ = {
    "jobs" : True,
    "modules" : True,
    "lightcast" : True,
    "stackof" : True,
    "github" : True,
    "chatgpt" : True
}

In [3]:
sentences = [] # Stores all sentences to be fed into Gensim's Word2Vec model

### Gathering Jobs Datasets

In [4]:
JOB_READ = "../../Data/jobs/"
MODULE_READ = "../../../Data/university_courses/"
LIGHTCAST_READ = "../../Data/skills/"
STACKOF_READ = "../../Data/NER_annotated_data/StackOverflow/"
GH_READ = "../../Data/NER_annotated_data/GitHub/"
CHATGPT_READ = "../../Data/NER_annotated_data/ChatGPT/"

WRITE_PATH = "./Models/"

In [5]:
if TO_READ["jobs"]:
    sentences.extend(pd.read_csv(JOB_READ + "mycareersfuture_query-data_science.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "mycareersfuture_query-data_engineer.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "mycareersfuture_query-data_analyst.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "mycareersfuture_query-machine_learning_engineer.csv")['Description'].values.tolist())

    sentences.extend(pd.read_csv(JOB_READ + "jobstreet_query-machine_learning_engineer.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "jobstreet_query-machine_learning_engineer.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "jobstreet_query-machine_learning_engineer.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "jobstreet_query-machine_learning_engineer.csv")['Description'].values.tolist())

In [6]:
# sentences contains all the descriptions in the four scraped datasets
print(len(sentences))
sentences[-1]

1258


"Why Work for Us We Power the Nation. Make the most of your talents and develop products that can create impact on a national scale. We are an in-house software team, assembled to move with speed and deliver with quality. We Build Reliable Solutions. For Customers, Company and Country. You will be part of the Digital Technology Team and together, you will innovate, create, and deploy digital products that will empower more than 3,800 employees within SP Group and improve the quality of life for the 1.6 million commercial, industrial and residential customers that SP Group serves. We build solutions that enable sustainable high quality lifestyles and help consumers save energy and cost, as well as supporting national goals for a sustainable livable city. Now, imagine the impact you can create. What You’ll Do: Create and maintain multiple robust and high-performance data processing pipeline within Cloud, Private Data Centre and Hybrid data ecosystem Assemble large, complex data sets from

### Gathering Module Datasets

In [7]:
UNI_MODDESC_MAPPING = {
    "nus_dsa_mods.xlsx" : "mod_desc",
    "NTU_course_info.csv" : "Course Aims",
    "SMU_course_info.csv" : "Description",
    "SUSS_course_info.csv" : "module description",
    "SUTD_course_info.csv" : "Module description",
    "SIT_Module_Info.csv" : "Module Description "
}

SKIP_ROWS = {
    "nus_dsa_mods.xlsx" : 0,
    "NTU_course_info.csv" : 0,
    "SMU_course_info.csv" : 1,
    "SUSS_course_info.csv" : 0,
    "SUTD_course_info.csv" : 5,
    "SIT_Module_Info.csv" : 0
}

In [8]:
mod_descriptions = pd.Series([], dtype='object')
for uni, description_col in UNI_MODDESC_MAPPING.items():
    print(f"Gathering module descriptions from {uni}")
    try:
        table = pd.read_excel(MODULE_READ + uni, skiprows=SKIP_ROWS[uni])
    except:
        table = pd.read_csv(MODULE_READ + uni, skiprows=SKIP_ROWS[uni], encoding_errors='ignore')
    
    table_desc = table[UNI_MODDESC_MAPPING[uni]].dropna().reset_index(drop=True)
    mod_descriptions = pd.concat([mod_descriptions, table_desc]).reset_index(drop=True)
    
display(mod_descriptions)

Gathering module descriptions from nus_dsa_mods.xlsx
Gathering module descriptions from NTU_course_info.csv
Gathering module descriptions from SMU_course_info.csv
Gathering module descriptions from SUSS_course_info.csv
Gathering module descriptions from SUTD_course_info.csv
Gathering module descriptions from SIT_Module_Info.csv


0      This module introduces the fundamental concept...
1      The abundance of data being harvested from var...
2      This module is a first course in linear algebr...
3      This is a course in single-variable calculus. ...
4      This module introduces students to the design ...
                             ...                        
173    Students will be grouped into teams of 5-6 and...
174    To keep up-to-date with the advances in techno...
175    Students will undertake an eight-month Integra...
176    This is a major individual project that is to ...
177    This module will endow students with the under...
Length: 178, dtype: object

In [9]:
if TO_READ['modules']:
    for description in mod_descriptions:
        sentences.append(description)

In [10]:
print(len(sentences))
sentences[-1]

1436


'This module will endow students with the understanding of the new challenges big data introduces, in particular in the area of IoT and the currently available solutions. These include (i) challenges pertaining to the modelling, accessing, and storing of big data, (ii) an understanding of the fundamentals of systems designed to store and access big data, (iii) programming paradigms for efficient scalable access to big data, and (iv) data processing methodology to facilitate big data analytics. The module will have a particular emphasis on the impact of the desiderata of scalability and efficiency in big data infrastructures, and expose students with a number of different cloud-based NoSQL systems and their design and implementation details, showing how they can achieve efficiency and scalability. '

### Gathering Lightcast Datasets

In [11]:
file_path = "lightcast_skills_queries-data_analysis_machine learning_ML_statistic.csv"

In [12]:
skills = pd.read_csv(LIGHTCAST_READ + file_path)
skill_descriptions = skills['Skill_Description']

In [13]:
if TO_READ['lightcast']:
    for description in skill_descriptions:
        sentences.append(description)

In [14]:
print(len(sentences))
sentences[-1]

1923


'The National Vital Statistics System (NVSS) is an inter-governmental system of sharing data on the vital statistics of the population of the United States. It involves coordination between the different state health departments of the US states and the National Center for Health Statistics, a division of the Centers for Disease Control and Prevention.'

### Gathering StackOverflow and GitHub Datasets

In [15]:
stack_of_txt_files = [filename for filename in os.listdir(STACKOF_READ) if ".txt" in filename and "2" not in filename]
stack_of_txt_files

['dev.txt', 'test.txt', 'train.txt']

In [16]:
if TO_READ['stackof']:
    for dataset in stack_of_txt_files:
        with open(f"./../../Data/NER_annotated_data/StackOverflow/{dataset}", "r", encoding = "utf-8") as f:
            sentence = ""
            for line in f:
                if line == "\n":
                    sentences.append(sentence[1:])
                    sentence = ""
                else:
                    word, man_label, comp_label = line.split("\t")
                    word = word.strip()
                    if word in string.punctuation:
                        sentence += word
                    else:
                        sentence += " " + word

In [17]:
if TO_READ['github']:
    github_filename = "gh_test.txt"
    with open(GH_READ + github_filename, "r", encoding = "utf-8") as f:
        sentence = ""
        for line in f:
            if line == "\n":
                sentences.append(sentence[1:])
                sentence = ""
            else:
                word, man_label, comp_label = line.split("\t")
                word = word.strip()
                if word in string.punctuation:
                    sentence += word
                else:
                    sentence += " " + word

In [18]:
print(len(sentences))
sentences[-1]

23485


'Change-type: patch Signed-off-by: Theodor Gherzan theodor@resin.io'

### Gathering ChatGPT Dataset

In [19]:
if TO_READ['chatgpt']:
    chatgpt_filename = "chatgpt_sentences.txt"
    with open(CHATGPT_READ + chatgpt_filename, "r", encoding = 'utf-8') as f:
        for line in f:
            sentences.append(line)

In [20]:
print(len(sentences))
sentences[-1]

23735


'Familiarity with agile software development methodologies'

### Data Cleaning

In [21]:
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

In [22]:
def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

In [23]:
brief_cleaning = [re.sub(r'http\S+', ' ', row) for row in sentences] # Removes all websites from text
brief_cleaning = [re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in brief_cleaning] # Retains only alphabets in text
brief_cleaning

["instructions for interested applicants what you will do we are looking for a data scientist as part of our machine learning team the ideal candidate will leverage strong collaboration skills and ability to extract valuable insights from highly complex medical insurance data sets to ask the right questions and find the right answers you will have great opportunity to work with data scientist to understand and learn about how we can leverage ai ml in the health insurance medical field to detect fraud waste improve automation efficiency promote vitality duties and responsibilities analyze raw data assessing quality cleansing structuring for downstream processing be heavily involved to bring analytical prototypes to production with the data engineering dev ops teams become a subject matter expert in the health insurance domain generate actionable insights for business improvements help to develop customizable reports production ready dashboards for clients requirements bachelor's degree 

In [24]:
txt = []
for doc in tqdm(nlp.pipe(brief_cleaning, batch_size=5000)):
    cleaned_doc = cleaning(doc)
    if cleaned_doc:
        txt.append(cleaned_doc)
txt

23735it [00:45, 521.13it/s] 


["instruction interested applicant look data scientist machine learning team ideal candidate leverage strong collaboration skill ability extract valuable insight highly complex medical insurance datum set ask right question find right answer great opportunity work datum scientist understand learn leverage ai ml health insurance medical field detect fraud waste improve automation efficiency promote vitality duty responsibility analyze raw datum assess quality cleansing structure downstream processing heavily involved bring analytical prototype production data engineering dev op team subject matter expert health insurance domain generate actionable insight business improvement help develop customizable report production ready dashboard client requirement bachelor degree equivalent experience quantitative field statistic mathematics computer science engineering etc year ' experience quantitative analytic datum modeling ability write robust code python good understanding database system sq

### Generating Bigrams

In [25]:
MIN_BIGRAM_COUNT = 100
sent = [row.split() for row in txt]
phrases = Phrases(sent, min_count=MIN_BIGRAM_COUNT, progress_per=10000)
sentences = phrases[sent]

# Training Model

In [26]:
cores = multiprocessing.cpu_count()

In [27]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1,
                     vector_size=300)

In [28]:
start_time = time()
w2v_model.build_vocab(sentences, progress_per=10000)
print(f'Time Elapsed: {time() - start_time} seconds')

Time Elapsed: 0.7684860229492188 seconds


In [29]:
start_time = time()
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
print(f'Time Elapsed: {time() - start_time} seconds')

Time Elapsed: 25.849844455718994 seconds


In [30]:
query = "python"
print(f"Words similar to {query}")
print(w2v_model.wv.most_similar(positive=[query]))
print()

query = "visualization"
print(f"Words similar to {query}")
print(w2v_model.wv.most_similar(positive=[query]))
print()

query = "warehouse"
print(f"Words similar to {query}")
print(w2v_model.wv.most_similar(positive=[query]))
print()

query = "singapore"
print(f"Words similar to {query}")
print(w2v_model.wv.most_similar(positive=[query]))

Words similar to python
[('r', 0.8026129007339478), ('sas', 0.7884186506271362), ('sql', 0.779222846031189), ('scala', 0.7686433792114258), ('python_java', 0.7482452988624573), ('librarie', 0.7462363243103027), ('familiarity', 0.7217974066734314), ('programming_language', 0.7117031812667847), ('golang', 0.7106770277023315), ('scripting', 0.6923781037330627)]

Words similar to visualization
[('tableau', 0.8194629549980164), ('visualisation', 0.7777239680290222), ('powerbi', 0.7775654792785645), ('bi', 0.7450905442237854), ('dashboard', 0.6827767491340637), ('tool', 0.6671122312545776), ('modelling', 0.661561131477356), ('advanced', 0.6504266858100891), ('sas', 0.6446869969367981), ('etl', 0.6268311142921448)]

Words similar to warehouse
[('lake', 0.857105016708374), ('ingestion', 0.777418315410614), ('warehousing', 0.7628757953643799), ('premise', 0.7225556373596191), ('etl', 0.716988742351532), ('snowflake', 0.7153854966163635), ('rdbms', 0.67666095495224), ('informatica', 0.6723101735

In [31]:
print(w2v_model.wv.similarity("python", "c"))
print(w2v_model.wv.similarity("python", "singapore"))
print(w2v_model.wv.similarity("data", "visualization"))

0.61957693
-0.104582
0.43646386


In [32]:
# Getting all words in the model's vocabulary
for k, v in w2v_model.wv.key_to_index.items():
    print(k)

datum
experience
work
team
tiktok
data
business
model
user
system
build
machine_learn
use
solution
algorithm
skill
include
need
apply
product
code
project
content
good
responsibility
analysis
platform
like
develop
problem
application
new
recommendation
provide
design
technology
'
tool
file
support
create
risk
e_commerce
etc
machine_learning
analytic
process
search
engineering
want
strong
service
learn
value
science
database
management
understanding
add
try
s
method
ability
understand
requirement
set
sql
c
inspire_creativity
bring_joy
change
strategy
datum_mining
recommendation_system
look
drive
run
knowledge
year
opportunity
way
high
'_m
function
form_mobile
destination_short
tiktok_lead
technical
time
e_g
quality
performance
optimize
software
improve
development
implement
information
computer_science
require
able
python
video_mission
result
key
engineer
lead
follow
stakeholder
base
test
role
error
job
pipeline
security
nlp
big
ensure
ml
field
office_include
example
new_york
issue
obje

In [33]:
# Viewing all bigrams
for word, ind in w2v_model.wv.key_to_index.items():
    if "_" in word:
        print(word)

machine_learn
e_commerce
machine_learning
inspire_creativity
bring_joy
datum_mining
recommendation_system
'_m
form_mobile
destination_short
tiktok_lead
e_g
computer_science
video_mission
office_include
new_york
los_angeles
london_paris
berlin_dubai
tiktok_global
large_scale
problem_solve
mission_inspire
team_responsible
programming_language
relate_technical
follow_area
computer_vision
structure_algorithm
mumbai_singapore
jakarta_seoul
strong_communication
creativity_bring
cross_functional
relate_field
modern_machine
search_engine
'_ve
experience_unique
technique_improve
perspective_platform
globe_workplace
connect_people
degree_computer
create_inclusive
develop_state
art_machine
environment_reflect
space_employee
value_skill
joy_achieve
goal_committed
celebrate_diverse
voice_create
tiktok_commit
community_reach
software_development
develop_highly
scalable_classifier
build_industry
learning_model
passionate_hope
solve_challenging
deep_learning
risk_control
understand_product
teamwork_sk

In [34]:
# Viewing all bigrams with "data" in it
printed = []
for i in range(len(phrases[sent])):
    for word in phrases[sent][i]:
        if "data" in word and "_" in word and word not in printed:
            print(word)
            printed.append(word)

In [35]:
# Saving Word Vectors into human-readable format (non-binary)
word_vectors = w2v_model.wv
word_vectors.save_word2vec_format(WRITE_PATH + "w2v.wordvectors")

In [36]:
# Saving Word Vectors into spaCy format (binary)
!python -m spacy init vectors en "./Models/w2v.wordvectors" "./Models/"

[38;5;4m[i] Creating blank nlp object for language 'en'[0m
[38;5;2m[+] Successfully converted 2277 vectors[0m
[38;5;2m[+] Saved nlp object with vectors to output directory. You can now use
the path to it in your config as the 'vectors' setting in [initialize].[0m
C:\Users\ernest.liu\Documents\git\dsa3101-2220-12-ds\Backend\Code\NER
Model\Models


[2023-03-23 21:16:14,222] [INFO] Reading vectors from Models\w2v.wordvectors

0it [00:00, ?it/s]
1401it [00:00, 13799.77it/s]
2277it [00:00, 13978.91it/s]
[2023-03-23 21:16:14,388] [INFO] Loaded vectors from Models\w2v.wordvectors


In [37]:
# Loading model
w2v_model = KeyedVectors.load_word2vec_format(WRITE_PATH + "w2v.wordvectors", binary = False)