In [1]:
import pandas as pd
import numpy as np
import re
import math
import spacy
from tqdm import tqdm, trange
from gensim.models.phrases import Phrases, Phraser
from collections import defaultdict
import multiprocessing
from gensim.models import Word2Vec, KeyedVectors
from time import time
import matplotlib.pyplot as plt
import os
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from datasets import load_dataset
from multiprocess import Process, cpu_count
import multiprocess as mp

In [2]:
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')
# nltk.download('punkt')

# Assembling Corpus

In [3]:
# If you do not want to build word embeddings based on the sentences of a particular dataset, set it to False
TO_READ = {
    "jobs" : True,
    "modules" : True,
    "lightcast" : True,
    "stackof" : True,
    "github" : True,
    "chatgpt" : True,
    "huggingface" : True
}

In [4]:
sentences = [] # Stores all sentences to be fed into Gensim's Word2Vec model

### Gathering Jobs Datasets

In [5]:
JOB_READ = "../../Data/jobs/"
MODULE_READ = "../../../Data/university_courses/"
LIGHTCAST_READ = "../../Data/skills/"
STACKOF_READ = "../../Data/NER_annotated_data/StackOverflow/"
GH_READ = "../../Data/NER_annotated_data/GitHub/"
CHATGPT_READ = "../../Data/NER_annotated_data/ChatGPT/"

WRITE_PATH = "./Models/"

In [6]:
if TO_READ["jobs"]:
    sentences.extend(pd.read_csv(JOB_READ + "mycareersfuture_query-data_science.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "mycareersfuture_query-data_engineer.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "mycareersfuture_query-data_analyst.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "mycareersfuture_query-machine_learning_engineer.csv")['Description'].values.tolist())

    sentences.extend(pd.read_csv(JOB_READ + "jobstreet_query-machine_learning_engineer.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "jobstreet_query-machine_learning_engineer.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "jobstreet_query-machine_learning_engineer.csv")['Description'].values.tolist())
    sentences.extend(pd.read_csv(JOB_READ + "jobstreet_query-machine_learning_engineer.csv")['Description'].values.tolist())

In [7]:
# sentences contains all the descriptions in the four scraped datasets
print(len(sentences))
sentences[-1]

1258


"Why Work for Us We Power the Nation. Make the most of your talents and develop products that can create impact on a national scale. We are an in-house software team, assembled to move with speed and deliver with quality. We Build Reliable Solutions. For Customers, Company and Country. You will be part of the Digital Technology Team and together, you will innovate, create, and deploy digital products that will empower more than 3,800 employees within SP Group and improve the quality of life for the 1.6 million commercial, industrial and residential customers that SP Group serves. We build solutions that enable sustainable high quality lifestyles and help consumers save energy and cost, as well as supporting national goals for a sustainable livable city. Now, imagine the impact you can create. What You’ll Do: Create and maintain multiple robust and high-performance data processing pipeline within Cloud, Private Data Centre and Hybrid data ecosystem Assemble large, complex data sets from

### Gathering Module Datasets

In [8]:
UNI_MODDESC_MAPPING = {
    "nus_dsa_mods.xlsx" : "mod_desc",
    "NTU_course_info.csv" : "Course Aims",
    "SMU_course_info.csv" : "Description",
    "SUSS_course_info.csv" : "module description",
    "SUTD_course_info.csv" : "Module description",
    "SIT_Module_Info.csv" : "Module Description "
}

SKIP_ROWS = {
    "nus_dsa_mods.xlsx" : 0,
    "NTU_course_info.csv" : 0,
    "SMU_course_info.csv" : 1,
    "SUSS_course_info.csv" : 0,
    "SUTD_course_info.csv" : 5,
    "SIT_Module_Info.csv" : 0
}

In [9]:
mod_descriptions = pd.Series([], dtype='object')
for uni, description_col in UNI_MODDESC_MAPPING.items():
    print(f"Gathering module descriptions from {uni}")
    try:
        table = pd.read_excel(MODULE_READ + uni, skiprows=SKIP_ROWS[uni])
    except:
        table = pd.read_csv(MODULE_READ + uni, skiprows=SKIP_ROWS[uni], encoding_errors='ignore')
    
    table_desc = table[UNI_MODDESC_MAPPING[uni]].dropna().reset_index(drop=True)
    mod_descriptions = pd.concat([mod_descriptions, table_desc]).reset_index(drop=True)
    
display(mod_descriptions)

Gathering module descriptions from nus_dsa_mods.xlsx
Gathering module descriptions from NTU_course_info.csv
Gathering module descriptions from SMU_course_info.csv
Gathering module descriptions from SUSS_course_info.csv
Gathering module descriptions from SUTD_course_info.csv
Gathering module descriptions from SIT_Module_Info.csv


0      This module introduces the fundamental concept...
1      The abundance of data being harvested from var...
2      This module is a first course in linear algebr...
3      This is a course in single-variable calculus. ...
4      This module introduces students to the design ...
                             ...                        
173    Students will be grouped into teams of 5-6 and...
174    To keep up-to-date with the advances in techno...
175    Students will undertake an eight-month Integra...
176    This is a major individual project that is to ...
177    This module will endow students with the under...
Length: 178, dtype: object

In [10]:
if TO_READ['modules']:
    for description in mod_descriptions:
        sentences.append(description)

In [11]:
print(len(sentences))
sentences[-1]

1436


'This module will endow students with the understanding of the new challenges big data introduces, in particular in the area of IoT and the currently available solutions. These include (i) challenges pertaining to the modelling, accessing, and storing of big data, (ii) an understanding of the fundamentals of systems designed to store and access big data, (iii) programming paradigms for efficient scalable access to big data, and (iv) data processing methodology to facilitate big data analytics. The module will have a particular emphasis on the impact of the desiderata of scalability and efficiency in big data infrastructures, and expose students with a number of different cloud-based NoSQL systems and their design and implementation details, showing how they can achieve efficiency and scalability. '

### Gathering Lightcast Datasets

In [12]:
file_path = "lightcast_skills_queries-data_analysis_machine learning_ML_statistic.csv"

In [13]:
skills = pd.read_csv(LIGHTCAST_READ + file_path)
skill_descriptions = skills['Skill_Description']

In [14]:
if TO_READ['lightcast']:
    for description in skill_descriptions:
        sentences.append(description)

In [15]:
print(len(sentences))
sentences[-1]

1923


'The National Vital Statistics System (NVSS) is an inter-governmental system of sharing data on the vital statistics of the population of the United States. It involves coordination between the different state health departments of the US states and the National Center for Health Statistics, a division of the Centers for Disease Control and Prevention.'

### Gathering StackOverflow and GitHub Datasets

In [16]:
stack_of_txt_files = [filename for filename in os.listdir(STACKOF_READ) if ".txt" in filename and "2" not in filename]
stack_of_txt_files

['dev.txt', 'test.txt', 'train.txt']

In [17]:
if TO_READ['stackof']:
    for dataset in stack_of_txt_files:
        with open(f"./../../Data/NER_annotated_data/StackOverflow/{dataset}", "r", encoding = "utf-8") as f:
            sentence = ""
            for line in f:
                if line == "\n":
                    sentences.append(sentence[1:])
                    sentence = ""
                else:
                    word, man_label, comp_label = line.split("\t")
                    word = word.strip()
                    if word in string.punctuation:
                        sentence += word
                    else:
                        sentence += " " + word

In [18]:
if TO_READ['github']:
    github_filename = "gh_test.txt"
    with open(GH_READ + github_filename, "r", encoding = "utf-8") as f:
        sentence = ""
        for line in f:
            if line == "\n":
                sentences.append(sentence[1:])
                sentence = ""
            else:
                word, man_label, comp_label = line.split("\t")
                word = word.strip()
                if word in string.punctuation:
                    sentence += word
                else:
                    sentence += " " + word

In [19]:
print(len(sentences))
sentences[-1]

23485


'Change-type: patch Signed-off-by: Theodor Gherzan theodor@resin.io'

### Gathering ChatGPT Dataset

In [20]:
if TO_READ['chatgpt']:
    chatgpt_filename = "chatgpt_sentences.txt"
    with open(CHATGPT_READ + chatgpt_filename, "r", encoding = 'utf-8') as f:
        for line in f:
            sentences.append(line)

In [21]:
print(len(sentences))
sentences[-1]

23735


'Familiarity with agile software development methodologies'

### Gathering Huggingface Dataset
Credits: https://huggingface.co/datasets/jpwahle/dblp-discovery-dataset

In [91]:
if TO_READ["huggingface"]:
    start_time = time()
    hf_dataset = load_dataset("jpwahle/dblp-discovery-dataset", 'papers', split='train[:1000000]') # Take only first 100000 rows out of 5 million. Otherwise my computer runs into OOM issues LOL
    abstracts = hf_dataset['abstract']
    filtered_dataset = []
    for i, row in tqdm(enumerate(hf_dataset['s2fieldsofstudy'])):
        if not row: continue
        categories = row['category']
        if "Mathematics" not in categories:
            continue
        else:
            filtered_dataset.append(abstracts[i])
    print(f"Dataset loaded in {time() - start_time} seconds")
    sentences.extend(filtered_dataset)

Found cached dataset dblp-discovery-dataset (C:/Users/ernest.liu/.cache/huggingface/datasets/jpwahle___dblp-discovery-dataset/papers/2.0.0/ede8044004622bf018789b8c32a1b7bada6460a02b59e36e9a77b4dd4d5f5247)
1000000it [00:00, 1311826.03it/s]

Dataset loaded in 26.111464738845825 seconds





In [92]:
len(sentences)

514553

In [105]:
sentences[-3]

'This paper deals with the problem of fault (or disturbance) decoupling in nonlinear systems. A new method is proposed to increase dimension of state subspace which is insensitive to fault (or disturbance). This method is based on a nonlinear filter defined by means of the generalized output injection. An example is presented which illustrates results.'

# Data Cleaning

In [24]:
HTML_PATTERN = re.compile('<.*?>')
STOP_WORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def cleaning(chunk):
    import re
    import nltk
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    HTML_PATTERN = re.compile('<.*?>')
    STOP_WORDS = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    outputs = []
    for target_input in chunk:
        # convert to lower case
        target_input = target_input.lower()

        # remove websites
        target_input = re.sub(r'http\S+', ' ', target_input)

        # remove html tags
        target_input = re.sub(HTML_PATTERN, ' ', target_input)

        # remove all non-alphabets
        target_input = re.sub("[^A-Za-z']+", ' ', target_input)

        #will remove extra spaces
        target_input = re.sub(r'\s+',' ',target_input)

        # remove stopwords and lemmatize
        target_input_tokens = nltk.word_tokenize(target_input)
        target_input_tokens_wo_stopwords = [lemmatizer.lemmatize(i) for i in target_input_tokens if i not in STOP_WORDS and i]
        
        outputs.append(target_input_tokens_wo_stopwords)
    
    return outputs

In [25]:
# start_time = time()
# sent = [cleaning(sentence) for sentence in sentences]
# print(f"Data cleaning completed in {time() - start_time} seconds")

In [26]:
cores = cpu_count()
n_chunks = cores - 1
chunk_size = len(sentences) // n_chunks
chunks = [sentences[i:i+chunk_size] for i in range(0, len(sentences), chunk_size)]

In [27]:
start_time = time()
with mp.Pool(processes=n_chunks) as pool:
    sent = pool.map(cleaning, chunks)

    # Flatten the list of examples
    print("Flattening...")
    sent = [example for chunk in sent for example in chunk]
end_time = time()
print(f"Data cleaning completed in {end_time - start_time} seconds with {n_chunks} cores")

Flattening...
Data cleaning completed in 54.09854531288147 seconds with 7 cores


### Generating Bigrams - Do not use because it tends to produce worse results

In [30]:
# MIN_BIGRAM_COUNT = 100
# phrases = Phrases(sent, min_count=MIN_BIGRAM_COUNT, progress_per=10000)
# sentences = phrases[sent]

# Training Model

In [31]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1,
                     vector_size=300)

In [32]:
start_time = time()
w2v_model.build_vocab(sent, progress_per=10000)
print(f'Time to build vocabulary: {time() - start_time} seconds')

Time to build vocabulary: 2.3174614906311035 seconds


In [33]:
start_time = time()
w2v_model.train(sent, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
print(f'Time to train model: {time() - start_time} seconds')

Time to train model: 281.89956426620483 seconds


In [34]:
query = "python"
print(f"Words similar to {query}")
print(w2v_model.wv.most_similar(positive=[query]))
print()

query = "visualization"
print(f"Words similar to {query}")
print(w2v_model.wv.most_similar(positive=[query]))
print()

query = "warehouse"
print(f"Words similar to {query}")
print(w2v_model.wv.most_similar(positive=[query]))
print()

query = "singapore"
print(f"Words similar to {query}")
print(w2v_model.wv.most_similar(positive=[query]))

Words similar to python
[('golang', 0.5953328013420105), ('java', 0.5062761902809143), ('perl', 0.49531954526901245), ('numpy', 0.4851156771183014), ('scala', 0.4706411063671112), ('vba', 0.450999915599823), ('scripting', 0.4478495419025421), ('sql', 0.43705251812934875), ('pytorch', 0.42267802357673645), ('talend', 0.415731281042099)]

Words similar to visualization
[('visualisation', 0.5771949887275696), ('tool', 0.5286591053009033), ('visualizing', 0.5177838206291199), ('interactive', 0.45796141028404236), ('glyph', 0.45366370677948), ('visualize', 0.4480428695678711), ('scatterplots', 0.4300623834133148), ('infovis', 0.42647597193717957), ('graphic', 0.4126529395580292), ('exploration', 0.408756285905838)]

Words similar to warehouse
[('warehousing', 0.5517992973327637), ('mart', 0.4812602698802948), ('etl', 0.47809740900993347), ('database', 0.437243789434433), ('snowflake', 0.42627203464508057), ('materialized', 0.4228900969028473), ('olap', 0.4165637195110321), ('rdbms', 0.40453

In [35]:
print(w2v_model.wv.similarity("python", "c"))
print(w2v_model.wv.similarity("python", "singapore"))
print(w2v_model.wv.similarity("data", "visualization"))

0.38070914
0.08412752
0.325069


In [36]:
# Getting all words in the model's vocabulary
for k, v in w2v_model.wv.key_to_index.items():
    print(k)

system
data
model
based
paper
algorithm
method
result
network
problem
using
time
approach
performance
proposed
information
show
user
used
application
two
present
new
also
one
design
technique
use
analysis
image
set
different
study
process
number
work
learning
control
high
e
feature
's
level
function
propose
well
service
first
scheme
simulation
large
order
solution
state
task
structure
case
power
real
however
n
framework
research
software
many
provide
environment
object
error
language
channel
multiple
rate
communication
tool
technology
dynamic
node
support
space
signal
experiment
code
point
type
parameter
sensor
may
given
quality
knowledge
several
test
machine
cost
architecture
term
novel
low
source
graph
efficient
important
value
implementation
development
resource
domain
g
presented
need
existing
multi
search
class
computer
non
optimal
three
area
detection
energy
way
developed
input
web
strategy
human
mobile
component
experimental
single
device
processing
property
compared
linear
exam

In [37]:
# Viewing all bigrams
for word, ind in w2v_model.wv.key_to_index.items():
    if "_" in word:
        print(word)

In [38]:
# Viewing all bigrams with "data" in it
# printed = []
# for i in range(len(phrases[sent])):
#     for word in phrases[sent][i]:
#         if "data" in word and "_" in word and word not in printed:
#             print(word)
#             printed.append(word)

In [39]:
# Saving Word Vectors into human-readable format (non-binary)
# word_vectors = w2v_model.wv
# word_vectors.save_word2vec_format(WRITE_PATH + "w2v.wordvectors")

In [40]:
# Saving Word Vectors into spaCy format (binary)
# !python -m spacy init vectors en "./Models/w2v.wordvectors" "./Models/"

In [41]:
# Loading model
# w2v_model = KeyedVectors.load_word2vec_format(WRITE_PATH + "w2v.wordvectors", binary = False)

## Link to Trained spaCy Model

In [42]:
import spacy
from spacy import displacy

In [43]:
nlp_ner = spacy.load("model-best")

In [44]:
doc = nlp_ner(
'''
In statistics, exploratory data analysis is an approach to analyzing data sets to summarize their main characteristics, often using statistical graphics and other data visualization methods. A statistical model can be used or not, but primarily EDA is for seeing what the data can tell us beyond the formal modeling or hypothesis testing task. Exploratory data analysis was promoted by John Tukey to encourage statisticians to explore the data, and possibly formulate hypotheses that could lead to new data collection and experiments. EDA is different from initial data analysis (IDA), which focuses more narrowly on checking assumptions required for model fitting and hypothesis testing, and handling missing values and making transformations of variables as needed. EDA encompasses IDA.
'''
)
spacy.displacy.render(doc, style = 'ent', jupyter=True)

In [45]:
doc.ents

(statistics,
 exploratory data analysis,
 often using statistical graphics,
 data visualization,
 modeling,
 hypothesis testing,
 Exploratory data analysis,
 possibly,
 hypotheses,
 data analysis,
 hypothesis testing,
 handling missing values)

In [61]:
w2v_model.wv.similarity("mathematics", "english")

0.13650116

In [47]:
def get_score(job_desc, mod_desc, verbose = 1):
    job_desc = nlp_ner(job_desc)
    mod_desc = nlp_ner(mod_desc)
    scores = []
    OOV = [] # Stores out of vocabulary words
    
    for job_ents in job_desc.ents:
        job_ents = cleaning([job_ents.text])[0]
        for job_ent in job_ents:
            if job_ent not in w2v_model.wv: # If job_ent not found in vocabulary
                if job_ent not in OOV and job_ent not in STOP_WORDS:
                    OOV.append(job_ent)
                    if verbose:
                        print(f"JOB: {job_ent} not found in vocabulary")
                scores.append(0)
                continue
            max_cossim = -1
            best_mod_ent = None
            for mod_ents in mod_desc.ents:
                mod_ents = cleaning([mod_ents.text])[0]
                for mod_ent in mod_ents:
                    if mod_ent not in w2v_model.wv:
                        if mod_ent not in STOP_WORDS and mod_ent not in OOV:
                            OOV.append(mod_ent)
                            if verbose:
                                print(f"MODULE: {mod_ent} not found in vocabulary")
                    else:
                        cos_sim = w2v_model.wv.similarity(job_ent, mod_ent)
                        if cos_sim >= max_cossim:
                            max_cossim = cos_sim
                            best_mod_ent = mod_ent
                if best_mod_ent == None:
                    if verbose:
                        print(f"No matching skills found for {job_ent} in {mod_desc}")
                    scores.append(0)
                else:
                    score = (math.pi - math.acos(max_cossim)) * 100 / math.pi
                    scores.append(score)

    return np.mean(np.array(scores))

In [48]:
job_desc = """
What a College Intern - Data Science does at HP:
Attached to the "Smart Manufacturing Application and Research Center".
Work with an enterprising team of data scientists and build solutions to track, analyze and visualize the manufacturing and outbound quality of our supplies.
Generate deep insights through the analysis of data and understanding of operational processes and turn them into actionable recommendations.
Develop methodologies for optimizing our business processes through data visualization, real-time monitoring, predictive analytics etc.
Are you a high-performer? We are looking for an individual with:
Studying Bachelor’s degree in Computer Science, Business Analytics, Information Systems, Industrial Engineering, Statistics with good experience in programming.
Excellent analytical thinking, programming (using R/Python is desirable), and problem-solving skills.
Knowledge of data analytics, data warehousing, database management (preferably using SQL) and data visualization using RShiny and Plotly.
Fundamental knowledge of statistics and probability.
Good visualization skills to create real-time dashboards and/or reports to inform trends and insights.
    """

In [49]:
mod_desc1 = """
This module covers common algorithmic techniques for solving optimisation problems, and introduces students to approaches for finding good-enough solutions to NP-hard problems. Topics covered include linear and integer programming, network flow algorithms, local search heuristics, approximation algorithms, and randomized algorithms. Through analysis and application of the techniques to a variety of canonical problems, students develop confidence to (i) appropriately model a given optimisation problem, (ii) apply appropriate algorithmic techniques to solve the problem, (iii) analyse the properties of the problem and candidate algorithms, such as time and space complexity, convergence, approximability, and optimality bound.
"""

In [50]:
mod_desc2 = """
Data visualisation is an essential tool for data analytics. This module is an introduction to data cleaning, exploration, analysis and visualisation. Students will learn how to take raw data, extract meaningful information, use statistical tools, and make visualisations. Topics include: programming in R, introduction to data storage systems, data manipulation, exploratory data analysis, dimension reduction, statistical graphics for univariate, multivariate (high-dimensional), temporal and spatial data, basic design principles and critical evaluation of visual displays of data.
"""

In [51]:
print(f"Score: {get_score(job_desc, mod_desc1, verbose = 1)}")
print(f"Score: {get_score(job_desc, mod_desc2, verbose = 1)}")

JOB: rshiny not found in vocabulary
JOB: plotly not found in vocabulary
Score: 57.41024398421825
JOB: rshiny not found in vocabulary
JOB: plotly not found in vocabulary
Score: 66.2972698837946


## Finding best module per skill

In [106]:
UNI_MODCODE_MAPPING = {
    "nus_dsa_mods.xlsx" : "mod_code",
    "NTU_course_info.csv" : "Module Code",
    "SMU_course_info.csv" : "Module Code",
    "SUSS_course_info.csv" : "module code",
    "SUTD_course_info.csv" : "Module code",
    "SIT_Module_Info.csv" : "Module Code"
}

UNI_MODNAME_MAPPING = {
    "nus_dsa_mods.xlsx" : "mod_name",
    "NTU_course_info.csv" : "Module Name",
    "SMU_course_info.csv" : "Module Name",
    "SUSS_course_info.csv" : "module name",
    "SUTD_course_info.csv" : "Module Title",
    "SIT_Module_Info.csv" : "Module Title"
}

In [120]:
modules = pd.DataFrame([], dtype='object', columns = ["school", "code", "name", "description"])
for uni, description_col in UNI_MODDESC_MAPPING.items():
    school_name = uni.split("_")[0].upper()
    print(f"Gathering module descriptions from {school_name}")
    try:
        table = pd.read_excel(MODULE_READ + uni, skiprows=SKIP_ROWS[uni])
    except:
        table = pd.read_csv(MODULE_READ + uni, skiprows=SKIP_ROWS[uni], encoding_errors='ignore')
    
    table = table[[UNI_MODCODE_MAPPING[uni], UNI_MODNAME_MAPPING[uni], UNI_MODDESC_MAPPING[uni]]].dropna().reset_index(drop=True)
    table.rename(columns = {
        UNI_MODCODE_MAPPING[uni] : "code",
        UNI_MODNAME_MAPPING[uni] : "name",
        UNI_MODDESC_MAPPING[uni] : "description"
    }, inplace=True)
    table["school"] = school_name
    
    modules = pd.concat([modules, table], axis = 0).reset_index(drop=True)
    
display(modules)

Gathering module descriptions from NUS
Gathering module descriptions from NTU
Gathering module descriptions from SMU
Gathering module descriptions from SUSS
Gathering module descriptions from SUTD
Gathering module descriptions from SIT


Unnamed: 0,school,code,name,description
0,NUS,CS1010,Programming Methodology,This module introduces the fundamental concept...
1,NUS,DSA1101,Introduction to Data Science,The abundance of data being harvested from var...
2,NUS,MA2001,Linear Algebra I,This module is a first course in linear algebr...
3,NUS,MA2002,Calculus,This is a course in single-variable calculus. ...
4,NUS,CS2040,Data Structures and Algorithms,This module introduces students to the design ...
...,...,...,...,...
173,SIT,ICT3211,Integrative Team Project,Students will be grouped into teams of 5-6 and...
174,SIT,ICT3210/ICT3110,Industry Certification Module,To keep up-to-date with the advances in techno...
175,SIT,CSC3002,Integrated Work Study Programme,Students will undertake an eight-month Integra...
176,SIT,ICT4001,Capstone Project,This is a major individual project that is to ...


In [129]:
start_time = time()
modules_copy = modules.copy()
modules_copy['score'] = modules.description.apply(lambda x: get_score(job_desc, x, verbose = 0))
end_time = time()
print(f"Time taken: {end_time - start_time} seconds")
display(modules_copy)

Time taken: 41.21925973892212 seconds


Unnamed: 0,school,code,name,description,score
0,NUS,CS1010,Programming Methodology,This module introduces the fundamental concept...,59.845216
1,NUS,DSA1101,Introduction to Data Science,The abundance of data being harvested from var...,64.006379
2,NUS,MA2001,Linear Algebra I,This module is a first course in linear algebr...,56.428487
3,NUS,MA2002,Calculus,This is a course in single-variable calculus. ...,53.830623
4,NUS,CS2040,Data Structures and Algorithms,This module introduces students to the design ...,61.133539
...,...,...,...,...,...
173,SIT,ICT3211,Integrative Team Project,Students will be grouped into teams of 5-6 and...,0.000000
174,SIT,ICT3210/ICT3110,Industry Certification Module,To keep up-to-date with the advances in techno...,50.562230
175,SIT,CSC3002,Integrated Work Study Programme,Students will undertake an eight-month Integra...,0.000000
176,SIT,ICT4001,Capstone Project,This is a major individual project that is to ...,0.000000


In [131]:
modules_copy.groupby('school').agg(max(modules_copy['score']))

TypeError: 'float' object is not callable