In [1]:
import pandas as pd
import numpy as np
import re
import math
import spacy
from tqdm import tqdm, trange
from gensim.models.phrases import Phrases, Phraser
from collections import defaultdict
import multiprocessing
from gensim.models import Word2Vec, KeyedVectors
from time import time
import matplotlib.pyplot as plt
import os
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from datasets import load_dataset
from multiprocess import Process, cpu_count
import multiprocess as mp

In [2]:
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')
# nltk.download('punkt')

# Assembling Corpus

In [3]:
# If you do not want to build word embeddings based on the sentences of a particular dataset, set it to False
TO_READ = {
    "jobs" : True,
    "modules" : True,
    "lightcast" : True,
    "stackof" : True,
    "github" : True,
    "chatgpt" : True,
    "huggingface" : True
}

In [4]:
sentences = [] # Stores all sentences to be fed into Gensim's Word2Vec model

### Gathering Jobs Datasets

In [5]:
JOB_READ = "../../Data/jobs/"
MODULE_READ = "../../../Data/university_courses/"
LIGHTCAST_READ = "../../Data/skills/"
STACKOF_READ = "../../Data/NER_annotated_data/StackOverflow/"
GH_READ = "../../Data/NER_annotated_data/GitHub/"
CHATGPT_READ = "../../Data/NER_annotated_data/ChatGPT/"

NER_WRITE_PATH = "./Model/"
W2V_WRITE_PATH = "../W2V Model/"

In [6]:
if TO_READ["jobs"]:
    sentences.extend(pd.read_csv(JOB_READ + "mycareersfuture_query-data_science.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "mycareersfuture_query-data_engineer.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "mycareersfuture_query-data_analyst.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "mycareersfuture_query-machine_learning_engineer.csv")['Description'].values.tolist())

    sentences.extend(pd.read_csv(JOB_READ + "jobstreet_query-machine_learning_engineer.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "jobstreet_query-machine_learning_engineer.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "jobstreet_query-machine_learning_engineer.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "jobstreet_query-machine_learning_engineer.csv")['Description'].values.tolist())

In [7]:
# sentences contains all the descriptions in the four scraped datasets
print(len(sentences))
sentences[-1]

1258


"Why Work for Us We Power the Nation. Make the most of your talents and develop products that can create impact on a national scale. We are an in-house software team, assembled to move with speed and deliver with quality. We Build Reliable Solutions. For Customers, Company and Country. You will be part of the Digital Technology Team and together, you will innovate, create, and deploy digital products that will empower more than 3,800 employees within SP Group and improve the quality of life for the 1.6 million commercial, industrial and residential customers that SP Group serves. We build solutions that enable sustainable high quality lifestyles and help consumers save energy and cost, as well as supporting national goals for a sustainable livable city. Now, imagine the impact you can create. What You’ll Do: Create and maintain multiple robust and high-performance data processing pipeline within Cloud, Private Data Centre and Hybrid data ecosystem Assemble large, complex data sets from

### Gathering Module Datasets

In [8]:
UNI_MODDESC_MAPPING = {
    "nus_dsa_mods.xlsx" : "mod_desc",
    "NTU_course_info.csv" : "Course Aims",
    "SMU_course_info.csv" : "Description",
    "SUSS_course_info.csv" : "module description",
    "SUTD_course_info.csv" : "Module description",
    "SIT_Module_Info.csv" : "Module Description "
}

UNI_MODCODE_MAPPING = {
    "nus_dsa_mods.xlsx" : "mod_code",
    "NTU_course_info.csv" : "Module Code",
    "SMU_course_info.csv" : "Module Code",
    "SUSS_course_info.csv" : "module code",
    "SUTD_course_info.csv" : "Module code",
    "SIT_Module_Info.csv" : "Module Code"
}

UNI_MODNAME_MAPPING = {
    "nus_dsa_mods.xlsx" : "mod_name",
    "NTU_course_info.csv" : "Module Name",
    "SMU_course_info.csv" : "Module Name",
    "SUSS_course_info.csv" : "module name",
    "SUTD_course_info.csv" : "Module Title",
    "SIT_Module_Info.csv" : "Module Title"
}

SKIP_ROWS = {
    "nus_dsa_mods.xlsx" : 0,
    "NTU_course_info.csv" : 0,
    "SMU_course_info.csv" : 1,
    "SUSS_course_info.csv" : 0,
    "SUTD_course_info.csv" : 5,
    "SIT_Module_Info.csv" : 0
}

In [9]:
mod_descriptions = pd.Series([], dtype='object')
for uni, description_col in UNI_MODDESC_MAPPING.items():
    print(f"Gathering module descriptions from {uni}")
    try:
        table = pd.read_excel(MODULE_READ + uni, skiprows=SKIP_ROWS[uni])
    except:
        table = pd.read_csv(MODULE_READ + uni, skiprows=SKIP_ROWS[uni], encoding_errors='ignore')
    
    table_desc = table[UNI_MODDESC_MAPPING[uni]].dropna().reset_index(drop=True)
    mod_descriptions = pd.concat([mod_descriptions, table_desc]).reset_index(drop=True)
    
display(mod_descriptions)

Gathering module descriptions from nus_dsa_mods.xlsx
Gathering module descriptions from NTU_course_info.csv
Gathering module descriptions from SMU_course_info.csv
Gathering module descriptions from SUSS_course_info.csv
Gathering module descriptions from SUTD_course_info.csv
Gathering module descriptions from SIT_Module_Info.csv


0      This module introduces the fundamental concept...
1      The abundance of data being harvested from var...
2      This module is a first course in linear algebr...
3      This is a course in single-variable calculus. ...
4      This module introduces students to the design ...
                             ...                        
173    Students will be grouped into teams of 5-6 and...
174    To keep up-to-date with the advances in techno...
175    Students will undertake an eight-month Integra...
176    This is a major individual project that is to ...
177    This module will endow students with the under...
Length: 178, dtype: object

In [10]:
if TO_READ['modules']:
    for description in mod_descriptions:
        sentences.append(description)

In [11]:
print(len(sentences))
sentences[-1]

1436


'This module will endow students with the understanding of the new challenges big data introduces, in particular in the area of IoT and the currently available solutions. These include (i) challenges pertaining to the modelling, accessing, and storing of big data, (ii) an understanding of the fundamentals of systems designed to store and access big data, (iii) programming paradigms for efficient scalable access to big data, and (iv) data processing methodology to facilitate big data analytics. The module will have a particular emphasis on the impact of the desiderata of scalability and efficiency in big data infrastructures, and expose students with a number of different cloud-based NoSQL systems and their design and implementation details, showing how they can achieve efficiency and scalability. '

### Gathering Lightcast Datasets

In [12]:
file_path = "lightcast_skills_queries-data_analysis_machine learning_ML_statistic.csv"

In [13]:
skills = pd.read_csv(LIGHTCAST_READ + file_path)
skill_descriptions = skills['Skill_Description']

In [14]:
if TO_READ['lightcast']:
    for description in skill_descriptions:
        sentences.append(description)

In [15]:
print(len(sentences))
sentences[-1]

1923


'The National Vital Statistics System (NVSS) is an inter-governmental system of sharing data on the vital statistics of the population of the United States. It involves coordination between the different state health departments of the US states and the National Center for Health Statistics, a division of the Centers for Disease Control and Prevention.'

### Gathering StackOverflow and GitHub Datasets

In [16]:
stack_of_txt_files = [filename for filename in os.listdir(STACKOF_READ) if ".txt" in filename and "2" not in filename]
stack_of_txt_files

['dev.txt', 'test.txt', 'train.txt']

In [17]:
if TO_READ['stackof']:
    for dataset in stack_of_txt_files:
        with open(f"./../../Data/NER_annotated_data/StackOverflow/{dataset}", "r", encoding = "utf-8") as f:
            sentence = ""
            for line in f:
                if line == "\n":
                    sentences.append(sentence[1:])
                    sentence = ""
                else:
                    word, man_label, comp_label = line.split("\t")
                    word = word.strip()
                    if word in string.punctuation:
                        sentence += word
                    else:
                        sentence += " " + word

In [18]:
if TO_READ['github']:
    github_filename = "gh_test.txt"
    with open(GH_READ + github_filename, "r", encoding = "utf-8") as f:
        sentence = ""
        for line in f:
            if line == "\n":
                sentences.append(sentence[1:])
                sentence = ""
            else:
                word, man_label, comp_label = line.split("\t")
                word = word.strip()
                if word in string.punctuation:
                    sentence += word
                else:
                    sentence += " " + word

In [19]:
print(len(sentences))
sentences[-1]

23485


'Change-type: patch Signed-off-by: Theodor Gherzan theodor@resin.io'

### Gathering ChatGPT Dataset

In [20]:
if TO_READ['chatgpt']:
    chatgpt_filename = "chatgpt_sentences.txt"
    with open(CHATGPT_READ + chatgpt_filename, "r", encoding = 'utf-8') as f:
        for line in f:
            sentences.append(line)

In [21]:
print(len(sentences))
sentences[-1]

23735


'Familiarity with agile software development methodologies'

### Gathering Huggingface Dataset
Credits: https://huggingface.co/datasets/jpwahle/dblp-discovery-dataset

In [22]:
if TO_READ["huggingface"]:
    start_time = time()
    hf_dataset = load_dataset("jpwahle/dblp-discovery-dataset", 'papers', split='train[:2000000]') # Take only first 2000000 rows out of 5 million. Otherwise my computer runs into OOM issues LOL
    abstracts = hf_dataset['abstract']
    filtered_dataset = []
    for i, row in tqdm(enumerate(hf_dataset['s2fieldsofstudy'])):
        if not row: continue
        categories = row['category']
        if "Mathematics" not in categories:
            continue
        else:
            filtered_dataset.append(abstracts[i])
    print(f"Dataset loaded in {time() - start_time} seconds")
    sentences.extend(filtered_dataset)

Found cached dataset dblp-discovery-dataset (C:/Users/ernest.liu/.cache/huggingface/datasets/jpwahle___dblp-discovery-dataset/papers/2.0.0/ede8044004622bf018789b8c32a1b7bada6460a02b59e36e9a77b4dd4d5f5247)
2000000it [00:02, 837268.57it/s] 

Dataset loaded in 65.71200489997864 seconds





In [23]:
len(sentences)

401013

In [24]:
sentences[-3]

'Human action recognition from video clips has become an active research field in recent years. Each action has its unique shape and a motion sequence can be suitably represented by a histogram. In this paper a histogram based action recognition method is presented. Motion history images are a good spatiotemporal template for action representation. In the present method, we use local binary patterns of directional motion history images for the histogram representation. We measured the performance of the proposed method along with some variants of it by employing KTH action dataset and found higher accuracy. The presented results also justify the superiority of the proposed method compared to other approaches for action recognition found in literature.'

# Data Cleaning

In [25]:
HTML_PATTERN = re.compile('<.*?>')
STOP_WORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def cleaning(chunk):
    
    # Importing libraries for parallelization later
    import re
    import nltk
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    HTML_PATTERN = re.compile('<.*?>')
    STOP_WORDS = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    
    outputs = []
    for target_input in chunk:
        # convert to lower case
        target_input = target_input.lower()

        # remove websites
        target_input = re.sub(r'http\S+', ' ', target_input)

        # remove html tags
        target_input = re.sub(HTML_PATTERN, ' ', target_input)

        # remove all non-alphabets
        target_input = re.sub("[^A-Za-z']+", ' ', target_input)

        #will remove extra spaces
        target_input = re.sub(r'\s+',' ',target_input)

        # remove stopwords and lemmatize
        target_input_tokens = nltk.word_tokenize(target_input)
        target_input_tokens_wo_stopwords = [lemmatizer.lemmatize(i) for i in target_input_tokens if i not in STOP_WORDS and i]
        
        outputs.append(target_input_tokens_wo_stopwords)
    
    return outputs

In [26]:
cores = cpu_count()
n_chunks = cores - 1
chunk_size = len(sentences) // n_chunks
chunks = [sentences[i:i+chunk_size] for i in range(0, len(sentences), chunk_size)]

In [27]:
start_time = time()
with mp.Pool(processes=n_chunks) as pool:
    sent = pool.map(cleaning, chunks)

    # Flatten the list of examples
    print("Flattening...")
    sent = [example for chunk in sent for example in chunk]
end_time = time()
print(f"Data cleaning completed in {end_time - start_time} seconds with {n_chunks} cores")

Flattening...
Data cleaning completed in 223.20317959785461 seconds with 7 cores


# Training Word2Vec Model

In [28]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1,
                     vector_size=300)

In [29]:
start_time = time()
w2v_model.build_vocab(sent, progress_per=10000)
print(f'Time to build vocabulary: {time() - start_time} seconds')

Time to build vocabulary: 9.845680475234985 seconds


In [30]:
start_time = time()
w2v_model.train(sent, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
print(f'Time to train model: {time() - start_time} seconds')

Time to train model: 998.1999423503876 seconds


# Evaluating Model

In [31]:
query = "python"
print(f"Words similar to {query}")
print(w2v_model.wv.most_similar(positive=[query]))
print()

query = "visualization"
print(f"Words similar to {query}")
print(w2v_model.wv.most_similar(positive=[query]))
print()

query = "warehouse"
print(f"Words similar to {query}")
print(w2v_model.wv.most_similar(positive=[query]))
print()

query = "singapore"
print(f"Words similar to {query}")
print(w2v_model.wv.most_similar(positive=[query]))

Words similar to python
[('golang', 0.6223884224891663), ('java', 0.6103256344795227), ('scala', 0.5778589248657227), ('scripting', 0.5599056482315063), ('numpy', 0.5341468453407288), ('perl', 0.5338581204414368), ('talend', 0.5041490197181702), ('applet', 0.48489707708358765), ('kera', 0.47871559858322144), ('matlab', 0.46168455481529236)]

Words similar to visualization
[('visualisation', 0.6365019083023071), ('visualizing', 0.5592391490936279), ('visualize', 0.5084337592124939), ('glyph', 0.4658353626728058), ('exploratory', 0.4636591374874115), ('graphic', 0.4279281795024872), ('tool', 0.4206080138683319), ('visualizes', 0.4193439781665802), ('mining', 0.4139707386493683), ('medical', 0.4135166108608246)]

Words similar to warehouse
[('warehousing', 0.4651496112346649), ('olap', 0.41488489508628845), ('etl', 0.4090724587440491), ('cloudera', 0.40825262665748596), ('ingestion', 0.40533730387687683), ('talend', 0.3955431878566742), ('devops', 0.392263799905777), ('postgresql', 0.3874

In [32]:
print(w2v_model.wv.similarity("python", "c"))
print(w2v_model.wv.similarity("python", "singapore"))
print(w2v_model.wv.similarity("data", "visualization"))

0.24961944
0.017878931
0.37362587


In [33]:
# Displaying all words in the model's vocabulary
for k, v in w2v_model.wv.key_to_index.items():
    print(k)

algorithm
method
problem
paper
model
system
based
result
proposed
time
n
data
image
show
using
function
two
set
approach
number
also
one
new
used
graph
linear
network
performance
present
order
solution
scheme
case
information
state
matrix
k
point
technique
analysis
parameter
space
error
first
bound
given
control
signal
class
optimal
channel
e
study
g
structure
different
use
property
propose
well
application
distribution
rate
condition
value
feature
code
process
design
non
work
complexity
noise
known
input
p
estimation
high
x
's
simulation
example
term
large
optimization
constraint
multiple
general
presented
random
probability
shown
theory
learning
type
obtained
equation
approximation
filter
form
sequence
mean
r
real
size
many
vector
power
local
efficient
finite
variable
framework
however
several
numerical
polynomial
dynamic
estimate
consider
provide
object
c
low
edge
simple
representation
measure
output
novel
field
f
compared
nonlinear
give
level
dimensional
three
l
distance
single
may

# Saving Model
#### Please don't run anything under this section, I will kms

In [34]:
# Saving Word2Vec model to disk
# w2v_model.save(W2V_WRITE_PATH + "w2v.model")

In [35]:
# Saving Word Vectors into human-readable format (non-binary)
# word_vectors = w2v_model.wv
# word_vectors.save_word2vec_format(W2V_WRITE_PATH + "w2v.wordvectors")

In [36]:
# Saving Word Vectors into spaCy format (binary)
# !python -m spacy init vectors en "../W2V Model/w2v.wordvectors" "./Model"

In [37]:
# Loading model
# w2v_model = Word2Vec.load(W2V_WRITE_PATH + "w2v.model")

In [38]:
# Loading word vectors
# word_vectors = KeyedVectors.load_word2vec_format(W2V_WRITE_PATH + "w2v.wordvectors", binary = False)