In [1]:
import pandas as pd
import numpy as np
import re
import math
import spacy
from tqdm import tqdm, trange
from gensim.models.phrases import Phrases, Phraser
from collections import defaultdict
import multiprocessing
from gensim.models import Word2Vec, KeyedVectors
from time import time
import matplotlib.pyplot as plt
import os
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from datasets import load_dataset
from multiprocess import Process, cpu_count
import multiprocess as mp
import spacy
from spacy import displacy

In [2]:
W2V_MODEL_PATH = "../W2V Model/"
NER_MODEL_PATH = "./model-best"

In [3]:
CUSTOM_OPTIONS = {"colors" : {"SKILL" : "#78C0E0"}}

In [4]:
w2v_model = Word2Vec.load(W2V_MODEL_PATH + "w2v.model")

In [5]:
nlp_ner = spacy.load("model-best")



### Gathering Module Datasets

In [6]:
# Getting cached results
skill_sch_code = modules_copy = pd.read_csv('../../Data/skill_sch_code.csv')
modules_copy = pd.read_csv('../../../Data/university_courses/All_courses_info.csv')

### Function for preprocessing data

In [7]:
HTML_PATTERN = re.compile('<.*?>')
STOP_WORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def cleaning(chunk):
    
    # Importing libraries for parallelization later
    import re
    import nltk
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    HTML_PATTERN = re.compile('<.*?>')
    STOP_WORDS = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    
    outputs = []
    for target_input in chunk:
        # convert to lower case
        target_input = target_input.lower()

        # remove websites
        target_input = re.sub(r'http\S+', ' ', target_input)

        # remove html tags
        target_input = re.sub(HTML_PATTERN, ' ', target_input)

        # remove all non-alphabets
        target_input = re.sub("[^A-Za-z']+", ' ', target_input)

        #will remove extra spaces
        target_input = re.sub(r'\s+',' ',target_input)

        # remove stopwords and lemmatize
        target_input_tokens = nltk.word_tokenize(target_input)
        target_input_tokens_wo_stopwords = [lemmatizer.lemmatize(i) for i in target_input_tokens if i not in STOP_WORDS and i]
        
        outputs.append(target_input_tokens_wo_stopwords)
    
    return outputs

### Functions for Generating Scores

In [8]:
def calc_score(cos_sim):
    """
    Calculates a score from 0 to 100 given a cosine-similarity score that ranges from -1 to 1
    
    Details:
    Let A be the angle between two vectors u and v. In other words, cos_sim = cos(A)
    score = A/180 * 100
    """
    
    # To handle computational rounding errors. Sometimes cos_sim is 1.00001 or -1.00001
    if cos_sim > 1: cos_sim = 1
    elif cos_sim < -1: cos_sim = -1
    
    return (math.pi - math.acos(cos_sim)) * 100 / math.pi

In [9]:
# def get_doc2doc_score(job_desc, mod_desc, verbose = 1):
#     """
#     Computes the similarity score between two documents. 
#     This function is available if the front-end team wants to compare module against module.
#     """
#     job_desc = nlp_ner(job_desc)
#     mod_desc = nlp_ner(mod_desc)
#     scores = []
#     OOV = [] # Stores out of vocabulary words
    
#     for job_ents in job_desc.ents:
#         job_ents = cleaning([job_ents.text])[0]
#         for job_ent in job_ents:
#             if job_ent not in w2v_model.wv: # If job_ent not found in vocabulary
#                 if job_ent not in OOV and job_ent not in STOP_WORDS:
#                     OOV.append(job_ent)
#                     if verbose:
#                         print(f"JOB: {job_ent} not found in vocabulary")
#                 scores.append(0)
#                 continue
#             max_cossim = -1
#             best_mod_ent = None
#             for mod_ents in mod_desc.ents:
#                 mod_ents = cleaning([mod_ents.text])[0]
#                 for mod_ent in mod_ents:
#                     if mod_ent not in w2v_model.wv:
#                         if mod_ent not in STOP_WORDS and mod_ent not in OOV:
#                             OOV.append(mod_ent)
#                             if verbose:
#                                 print(f"MODULE: {mod_ent} not found in vocabulary")
#                     else:
#                         cos_sim = w2v_model.wv.similarity(job_ent, mod_ent)
                        
#                         # To handle computational rounding errors. Sometimes cos_sim is 1.00001 or -1.00001
#                         if cos_sim > 1: cos_sim = 1
#                         elif cos_sim < -1: cos_sim = -1
                            
#                         if cos_sim >= max_cossim:
#                             max_cossim = cos_sim
#                             best_mod_ent = mod_ent
#                 if best_mod_ent == None:
#                     if verbose:
#                         print(f"No matching skills found for {job_ent} in {mod_desc}")
#                     scores.append(0)
#                 else:
#                     score = calc_score(max_cossim)
#                     scores.append(score)

#     return np.mean(np.array(scores))

In [10]:
def process_job_desc(job_desc):
    job_desc = nlp_ner(job_desc)
    result = [cleaning([job_ents.text])[0] for job_ents in job_desc.ents]
    return result

In [11]:
def get_skill2mod_score(skill, mod_desc):
    """
    Generates a score and the skill token identified in `mod_desc` with the closest match to `skill`
    """
    mod_desc = nlp_ner(mod_desc)
    max_score = 0
    best_ent = None
    if skill not in w2v_model.wv:
        return max_score, best_ent
    for mod_ents in mod_desc.ents:
        mod_ents = cleaning([mod_ents.text])[0]
        for mod_ent in mod_ents:
            if mod_ent in w2v_model.wv:
                cos_sim = w2v_model.wv.similarity(mod_ent, skill)
                
                # To handle computational rounding errors. Sometimes cos_sim is 1.00001 or -1.00001
                if cos_sim > 1: cos_sim = 1
                elif cos_sim < -1: cos_sim = -1
                    
                score = calc_score(cos_sim)
                if max_score < score:
                    best_ent = mod_ent
                max_score = max(max_score, score)
            else:
                max_score = max(max_score, 0)
    return max_score, best_ent

In [12]:
def get_school_scores(all_schools):
    """
    Assigns a score to every school. Score ranges from 0 to 100
    """
    all_scores = {}
    num_skills = len(all_schools)
    for skill in all_schools.keys():
        schools = all_schools[skill].keys()
        for school in schools:
            score = all_schools[skill][school][1]
            if school not in all_scores: all_scores[school] = 0
            all_scores[school] += score
    for school, total_score in all_scores.items():
        all_scores[school] = total_score / num_skills
    return all_scores

In [13]:
def get_mod_recommendations(job_desc):
    """
    WARNING: This function takes about 1-2 minutes to run
    Given a job description (JD), this function will identify all skills within the JD, 
    and recommend a module per school for each identified skill
    
    Input: Job description (str)
    Output: Module recommendations, School scores (dictionary D1 of the format below, dictionary D2 of the format below)
    
    Format of D1:
    {
        JD_skill_1 : {
            school_1 : (school_1_module, score_for_school_1_module),
            school_2 : (school_2_module, score_for_school_2_module),
            school_3 : (school_3_module, score_for_school_3_module),
            ...
        },
        JD_skill_2 : {
            school_1 : (school_1_module, score_for_school_1_module),
            school_2 : (school_2_module, score_for_school_2_module),
            school_3 : (school_3_module, score_for_school_3_module),
            ...
        },
        JD_skill_3 : {
            school_1 : (school_1_module, score_for_school_1_module),
            school_2 : (school_2_module, score_for_school_2_module),
            school_3 : (school_3_module, score_for_school_3_module),
            ...
        },
        ...
    }
    
    Format of D2:
    {
        school_1 : score,
        school_2 : score,
        school_3 : score,
        ...
    }
    """
    
    all_schools = {}
    job_desc = process_job_desc(job_desc)
    # for ent in tqdm(nlp_ner(job_desc).ents):
    for skill_words in job_desc:
        # skill_words = cleaning([ent.text])[0]
        best_mods = {}
        for skill_word in skill_words:
            # modules_copy = modules.copy()
            global modules_copy
            modules_copy['score'] = modules_copy.description.apply(lambda x: get_skill2mod_score(skill_word, x)[0])
            modules_copy = modules_copy.sort_values('score', ascending=False).drop_duplicates('school')
            for i, row in modules_copy.iterrows():
                school, code, name, description, score = row
                if school not in best_mods or best_mods[school][1]:
                    best_mods[school] = (code, score)
        all_schools[" ".join(skill_words)] = best_mods
    
    return all_schools, get_school_scores(all_schools)

### Demo

In [14]:
job_desc = """
What a College Intern - Data Science does at HP:
Attached to the "Smart Manufacturing Application and Research Center".
Work with an enterprising team of data scientists and build solutions to track, analyze and visualize the manufacturing and outbound quality of our supplies.
Generate deep insights through the analysis of data and understanding of operational processes and turn them into actionable recommendations.
Develop methodologies for optimizing our business processes through data visualization, real-time monitoring, predictive analytics etc.
Are you a high-performer? We are looking for an individual with:
Studying Bachelor’s degree in Computer Science, Business Analytics, Information Systems, Industrial Engineering, Statistics with good experience in programming.
Excellent analytical thinking, programming (using R/Python is desirable), and problem-solving skills.
Knowledge of data analytics, data warehousing, database management (preferably using SQL) and data visualization using RShiny and Plotly.
Fundamental knowledge of statistics and probability.
Good visualization skills to create real-time dashboards and/or reports to inform trends and insights.
"""

In [15]:
mod_desc1 = """
This module covers common algorithmic techniques for solving optimisation problems, and introduces students to approaches for finding good-enough solutions to NP-hard problems. Topics covered include linear and integer programming, network flow algorithms, local search heuristics, approximation algorithms, and randomized algorithms. Through analysis and application of the techniques to a variety of canonical problems, students develop confidence to (i) appropriately model a given optimisation problem, (ii) apply appropriate algorithmic techniques to solve the problem, (iii) analyse the properties of the problem and candidate algorithms, such as time and space complexity, convergence, approximability, and optimality bound.
"""

In [16]:
mod_desc2 = """
Data visualisation is an essential tool for data analytics. This module is an introduction to data cleaning, exploration, analysis and visualisation. Students will learn how to take raw data, extract meaningful information, use statistical tools, and make visualisations. Topics include: programming in R, introduction to data storage systems, data manipulation, exploratory data analysis, dimension reduction, statistical graphics for univariate, multivariate (high-dimensional), temporal and spatial data, basic design principles and critical evaluation of visual displays of data.
"""

In [17]:
spacy.displacy.render(nlp_ner(job_desc), style = 'ent', jupyter=True, options = CUSTOM_OPTIONS)

In [18]:
spacy.displacy.render(nlp_ner(mod_desc1), style = 'ent', jupyter=True, options = CUSTOM_OPTIONS)

In [19]:
spacy.displacy.render(nlp_ner(mod_desc2), style = 'ent', jupyter=True, options = CUSTOM_OPTIONS)

In [20]:
print(f"Score: {get_skill2mod_score('visualization', mod_desc1)}")
print(f"Score: {get_skill2mod_score('visualization', mod_desc2)}")

Score: (53.63886179258952, 'search')
Score: (71.64724786243553, 'visualisation')


In [21]:
start_time = time()
mod_reco, school_scores = get_mod_recommendations(job_desc)
end_time = time()

In [22]:
print(f"Computation completed in {end_time - start_time} seconds")

Computation completed in 14.676666975021362 seconds


In [23]:
mod_reco

{'analyze visualize manufacturing': {'SUTD': ('50.004', 58.52840059754694),
  'NTU': ('CZ2001', 62.6140016341832),
  'SMU': ('DSA303', 58.956459646216985),
  'SUSS': ('BUS352', 57.9056215054029),
  'SIT': ('CSC3004', 62.6140016341832),
  'NUS': ('CS3230', 54.496028975205185)},
 'analysis data': {'SMU': ('DSA303', 100.0),
  'NUS': ('CS3230', 54.32250243861272),
  'SIT': ('CSC3004', 100.0),
  'SUTD': ('50.004', 100.0),
  'SUSS': ('BUS352', 100.0),
  'NTU': ('CZ2001', 54.797027106435614)},
 'process': {'SMU': ('DSA303', 65.85597138377442),
  'SUSS': ('BUS352', 65.85597138377442),
  'SIT': ('CSC3004', 60.54888335940564),
  'SUTD': ('50.004', 60.26273047384029),
  'NUS': ('CS3230', 60.26273047384029),
  'NTU': ('CZ2001', 55.68825398972828)},
 'data visualization': {'SIT': ('CSC3004', 62.255992556413865),
  'SUSS': ('BUS352', 61.68361234252909),
  'SMU': ('DSA303', 65.23758139037687),
  'SUTD': ('50.004', 61.63652565514192),
  'NTU': ('CZ2001', 60.460646445028004),
  'NUS': ('CS3230', 60.045

In [24]:
school_scores

{'SUTD': 59.405246628435506,
 'NTU': 60.58777159317864,
 'SMU': 59.28550919364833,
 'SUSS': 65.64914730686675,
 'SIT': 64.12027401045816,
 'NUS': 56.711694150164355}