In [1]:
import pandas as pd
import numpy as np
import re
import spacy
from tqdm import tqdm
from gensim.models.phrases import Phrases, Phraser
from collections import defaultdict
import multiprocessing
from gensim.models import Word2Vec, KeyedVectors
from time import time
import matplotlib.pyplot as plt

# Assembling Corpus

In [2]:
read_path = "../../Data/jobs/"
write_path = "./Models/"

In [3]:
sentences = pd.read_csv(read_path + "mycareersfuture_query-data_science.csv")['Description'].values.tolist()
sentences.extend(pd.read_csv(read_path + "mycareersfuture_query-data_engineer.csv")['Description'].values.tolist())
sentences.extend(pd.read_csv(read_path + "mycareersfuture_query-data_analyst.csv")['Description'].values.tolist())
sentences.extend(pd.read_csv(read_path + "mycareersfuture_query-machine_learning_engineer.csv")['Description'].values.tolist())

sentences.extend(pd.read_csv(read_path + "jobstreet_query-machine_learning_engineer.csv")['Description'].values.tolist())
sentences.extend(pd.read_csv(read_path + "jobstreet_query-machine_learning_engineer.csv")['Description'].values.tolist())
sentences.extend(pd.read_csv(read_path + "jobstreet_query-machine_learning_engineer.csv")['Description'].values.tolist())
sentences.extend(pd.read_csv(read_path + "jobstreet_query-machine_learning_engineer.csv")['Description'].values.tolist())

In [4]:
# sentences contains all the descriptions in the four scraped datasets
sentences

["Instructions for interested applicants\r\nhttps://aia.wd3.myworkdayjobs.com/amplifyhealthexternal/job/Singapore-SG-Amplify-Health/Data-Science_JR-36535\r\n\r\nWhat you will do?\r\n\r\nWe are looking for a Data Scientist as part of our Machine Learning Team. The ideal candidate will leverage strong collaboration skills and ability to extract valuable insights from highly complex medical & insurance data sets to ask the right questions and find the right answers. You will have great opportunity to work with Data Scientist to understand and learn about how we can leverage AI/ML in the health insurance & medical field to detect fraud & waste, improve automation efficiency & promote vitality.\r\n\r\nDuties and Responsibilities\r\n• Analyze raw data: assessing quality, cleansing, structuring for downstream processing\r\n• Be heavily involved to bring analytical prototypes to production with the data engineering & dev-ops teams\r\n• Become a subject-matter expert in the health & insurance d

### Data Cleaning

In [5]:
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

In [6]:
def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

In [7]:
brief_cleaning = [re.sub(r'http\S+', ' ', row) for row in sentences] # Removes all websites from text
brief_cleaning = [re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in brief_cleaning] # Retains only alphabets in text
brief_cleaning

["instructions for interested applicants what you will do we are looking for a data scientist as part of our machine learning team the ideal candidate will leverage strong collaboration skills and ability to extract valuable insights from highly complex medical insurance data sets to ask the right questions and find the right answers you will have great opportunity to work with data scientist to understand and learn about how we can leverage ai ml in the health insurance medical field to detect fraud waste improve automation efficiency promote vitality duties and responsibilities analyze raw data assessing quality cleansing structuring for downstream processing be heavily involved to bring analytical prototypes to production with the data engineering dev ops teams become a subject matter expert in the health insurance domain generate actionable insights for business improvements help to develop customizable reports production ready dashboards for clients requirements bachelor's degree 

In [8]:
txt = []
for doc in tqdm(nlp.pipe(brief_cleaning, batch_size=5000)):
    cleaned_doc = cleaning(doc)
    if cleaned_doc:
        txt.append(cleaned_doc)
txt

1258it [00:44, 28.33it/s]


["instruction interested applicant look data scientist machine learning team ideal candidate leverage strong collaboration skill ability extract valuable insight highly complex medical insurance datum set ask right question find right answer great opportunity work datum scientist understand learn leverage ai ml health insurance medical field detect fraud waste improve automation efficiency promote vitality duty responsibility analyze raw datum assess quality cleansing structure downstream processing heavily involved bring analytical prototype production data engineering dev op team subject matter expert health insurance domain generate actionable insight business improvement help develop customizable report production ready dashboard client requirement bachelor degree equivalent experience quantitative field statistic mathematics computer science engineering etc year ' experience quantitative analytic datum modeling ability write robust code python good understanding database system sq

### Generating Bigrams

In [9]:
MIN_BIGRAM_COUNT = 100
sent = [row.split() for row in txt]
phrases = Phrases(sent, min_count=MIN_BIGRAM_COUNT, progress_per=10000)
sentences = phrases[sent]

# Training Model

In [10]:
cores = multiprocessing.cpu_count()

In [11]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1,
                     vector_size=300)

In [12]:
start_time = time()
w2v_model.build_vocab(sentences, progress_per=10000)
print(f'Time Elapsed: {time() - start_time} seconds')

Time Elapsed: 0.9352293014526367 seconds


In [13]:
start_time = time()
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
print(f'Time Elapsed: {time() - start_time} seconds')

Time Elapsed: 33.46021318435669 seconds


In [14]:
query = "python"
print(f"Words similar to {query}")
print(w2v_model.wv.most_similar(positive=[query]))
print()

query = "visualization"
print(f"Words similar to {query}")
print(w2v_model.wv.most_similar(positive=[query]))
print()

query = "warehouse"
print(f"Words similar to {query}")
print(w2v_model.wv.most_similar(positive=[query]))
print()

query = "singapore"
print(f"Words similar to {query}")
print(w2v_model.wv.most_similar(positive=[query]))

Words similar to python
[('javascript', 0.8606737852096558), ('sas', 0.8428760766983032), ('r', 0.8382299542427063), ('python_java', 0.8191605806350708), ('scala', 0.8109925389289856), ('scripting', 0.8014379143714905), ('sql', 0.785336971282959), ('java', 0.739366888999939), ('c', 0.7274278402328491), ('proficiency', 0.7034691572189331)]

Words similar to visualization
[('tableau', 0.8548766374588013), ('visualisation', 0.8166266679763794), ('bi', 0.7883120179176331), ('powerbi', 0.7580701112747192), ('dashboard', 0.7385857701301575), ('excel', 0.7164304256439209), ('sas', 0.7135964035987854), ('qlik', 0.7130809426307678), ('modelling', 0.6874540448188782), ('reporting', 0.6630122065544128)]

Words similar to warehouse
[('lake', 0.8905083537101746), ('warehousing', 0.7711740732192993), ('ingestion', 0.7668590545654297), ('etl', 0.7572290897369385), ('snowflake', 0.7249391078948975), ('big', 0.6915439963340759), ('premise', 0.6898927688598633), ('cloudera', 0.6821742057800293), ('infor

In [15]:
print(w2v_model.wv.similarity("python", "c"))
print(w2v_model.wv.similarity("python", "singapore"))
print(w2v_model.wv.similarity("data", "visualization"))

0.7274278
-0.08734734
0.36319077


In [16]:
# Getting all words in the model's vocabulary
for k, v in w2v_model.wv.key_to_index.items():
    print(k)

datum
experience
tiktok
team
work
business
data
system
model
machine_learn
recommendation
build
skill
algorithm
solution
user
machine_learning
responsibility
product
technology
develop
platform
apply
content
tool
good
include
risk
e_commerce
mission_inspire
design
creativity_bring
understand
project
strategy
analytic
provide
support
engineering
strong
etc
mining
qualification
science
understanding
search
ability
requirement
improve
related
management
year
new
service
application
opportunity
joy
mobile_video
short_form
analysis
lead_destination
problem
sql
learn
drive
technical
deep
s
optimize
high
quality
knowledge
solid
engineer
process
stakeholder
responsible
nlp
pipeline
performance
development
paris_berlin
singapore_jakarta
global_office
seoul_tokyo
include_los
angeles_new
york_london
ml
need
role
e_g
job
ensure
area
relevant
security
large_scale
discipline
computer_science
big
cross_functional
degree_computer
python
spark
implement
software
ai
lead
infrastructure
c
technique
rank


In [17]:
# Viewing all bigrams
for word, ind in w2v_model.wv.key_to_index.items():
    if "_" in word:
        print(word)

machine_learn
machine_learning
e_commerce
mission_inspire
creativity_bring
mobile_video
short_form
lead_destination
paris_berlin
singapore_jakarta
global_office
seoul_tokyo
include_los
angeles_new
york_london
e_g
large_scale
computer_science
cross_functional
degree_computer
problem_solve
qualification_bachelor
follow_area
structure_algorithm
dubai_mumbai
strong_communication
product_objective
computer_vision
programming_language
relate_field
technique_improve
search_engine
commit_create
platform_connect
inclusive_space
employee_value
unique_perspective
people_globe
develop_state
joy_achieve
community_reach
goal_committed
voice_create
celebrate_diverse
environment_reflect
industry_lead
develop_highly
scalable_classifier
software_development
passionate_hope
solve_challenging
risk_control
teamwork_skill
c_python
relate_technical
information_retrieval
cut_edge
include_limit
recommender_system
pursue_bold
live_stream
communication_teamwork
trust_safety
science_relate
account_integrity
gener

In [18]:
# Viewing all bigrams with "data" in it
printed = []
for i in range(len(phrases[sent])):
    for word in phrases[sent][i]:
        if "data" in word and "_" in word and word not in printed:
            print(word)
            printed.append(word)

In [19]:
# Saving Word Vectors into human-readable format (non-binary)
word_vectors = w2v_model.wv
word_vectors.save_word2vec_format(write_path + "w2v.wordvectors")

In [20]:
# Saving Word Vectors into spaCy format (binary)
!python -m spacy init vectors en "./Models/w2v.wordvectors" "./Models/"

[38;5;4m[i] Creating blank nlp object for language 'en'[0m
[38;5;2m[+] Successfully converted 1386 vectors[0m
[38;5;2m[+] Saved nlp object with vectors to output directory. You can now use
the path to it in your config as the 'vectors' setting in [initialize].[0m
C:\Users\ernest.liu\Documents\git\dsa3101-2220-12-ds\Backend\Code\NER
Model\Models


[2023-03-22 14:23:46,881] [INFO] Reading vectors from Models\w2v.wordvectors

0it [00:00, ?it/s]
489it [00:00, 4877.98it/s]
1211it [00:00, 6219.92it/s]
1386it [00:00, 6130.63it/s]
[2023-03-22 14:23:47,117] [INFO] Loaded vectors from Models\w2v.wordvectors


In [21]:
# Loading model
w2v_model = KeyedVectors.load_word2vec_format(write_path + "w2v.wordvectors", binary = False)