In [62]:
import os

import numpy as np
import pandas as pd
import requests
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from ssg_sea.extract_skills import extract_skills, batch_extract_skills
from transformers import pipeline


#initializing language model
model = SentenceTransformer('all-mpnet-base-v2')

#intitializing JINZHA span extraction model
token_skill_classifier = pipeline(model="jjzha/jobbert_skill_extraction", aggregation_strategy="first")
token_knowledge_classifier = pipeline(model="jjzha/jobbert_knowledge_extraction", aggregation_strategy="first")

In [32]:
import os
# Show the current working directory of the notebook kernel
os.getcwd()

'/Users/eugenechua/Downloads/semantic_skill_extractions/notebooks'

In [33]:
# Data folder, resolved relative to the notebook's working directory (the
# project's notebooks/ folder) instead of a user-specific absolute path.
DATA_DIR = os.path.join(os.pardir, 'data')

# Master skill list: prefer `dup_parent` where present, falling back to the
# skill's own title. `assign` builds a new frame, so no column is written
# onto a slice of the raw table.
skill_master = pd.read_csv(os.path.join(DATA_DIR, 'skill_master_dedup_06nov2022.csv'))
df = skill_master.assign(
    merged_title=skill_master['dup_parent'].combine_first(skill_master['skill_title']),
    source='skill title',
)[['merged_title', 'skill_title', 'skill_id', 'source']]

# Including the 2k skills descriptions. The title becomes `merged_title` and
# the description takes the `skill_title` slot, so both tables share one schema.
skill_descriptions = pd.read_csv(os.path.join(DATA_DIR, 'unique_2k_skills.csv'))
df2 = (
    skill_descriptions[['skill_title', 'skill_description_final', 'skill_id']]
    .rename(columns={'skill_title': 'merged_title', 'skill_description_final': 'skill_title'})
    .assign(source='full description')
    .dropna()
)

# Stack both tables and drop every row that still has a missing value
df_cleaned = pd.concat([df, df2], ignore_index=True).dropna()
print(df_cleaned.shape)

(5833, 4)


In [11]:
# Encode every entry of the `skill_title` column into a sentence embedding
b = df_cleaned['skill_title'].tolist()
embeddings = model.encode(b)

The next **2 cells** simply save the embeddings to disk and load them back, for easy access during deployment.

In [15]:
from numpy import savetxt

# Write the embeddings into the project's data folder. The path is relative to
# the notebook's working directory (the notebooks/ folder) rather than a
# user-specific absolute path, so the cell works on any checkout.
embeddings_path = os.path.join(os.pardir, 'data', 'skills_embeddings.csv')
savetxt(embeddings_path, embeddings, delimiter=',')


In [16]:
from numpy import loadtxt

# Read the saved embeddings back from the project's data folder. The path is
# relative to the notebook's working directory (the notebooks/ folder) rather
# than a user-specific absolute path.
embeddings_path = os.path.join(os.pardir, 'data', 'skills_embeddings.csv')
embeddings = loadtxt(embeddings_path, delimiter=',')
print(embeddings.shape)

(5833, 768)


In [37]:

# One row per skill: (embedding vector, skill text, source label).
# The frame keeps the default integer column names 0, 1 and 2.
skill_rows = zip(embeddings, df_cleaned['skill_title'], df_cleaned['source'])
df_clean = pd.DataFrame(list(skill_rows))
df_clean.head(10)

Unnamed: 0,0,1,2
0,"[0.0645764097571373, 0.007018603850156069, -0....",supplier sourcing,skill title
1,"[-0.025321509689092636, -0.04846109449863434, ...",heavy lifting machinery operation,skill title
2,"[0.010486770421266556, -0.008886554278433323, ...",JavaFX,skill title
3,"[-0.021288812160491943, -0.06264938414096832, ...",perform rigging of suspended scaffold,skill title
4,"[-0.05150282755494118, 0.0588972233235836, -0....",Unreal Engine,skill title
5,"[-0.12205683439970016, 0.03769751638174057, -0...",measurement of building and construction works,skill title
6,"[-0.004400251433253288, -0.007667871192097664,...",security education and awareness,skill title
7,"[-0.021364398300647736, -0.03291257470846176, ...",Travis CI,skill title
8,"[0.025099849328398705, 0.036317531019449234, -...",cleanliness and contamination control,skill title
9,"[0.018627600744366646, 0.02188299596309662, -0...","health, hygiene and nutrition for children",skill title


In [20]:
# Sample job description passed to the extractors below. Note that "Java" in
# this text is a place (Java, Indonesia), not the programming language.
test = """
This data scientist job requires the ability to do exploratory data analysis, machine learning and statistical modeling. 
In addition, we require the candidate to make good presentations to senior management and leadership. One special request is that we will need the candidate to work remotely from Java, Indonesia.
"""

In [43]:
#Function to merge each span into the preceding one when the two are adjacent --> part of the original JINZHA script

def aggregate_span(results):
    """Merge adjacent extracted spans into single multi-word spans.

    Two consecutive spans are treated as adjacent when the later one starts
    exactly one character after the earlier one ends (``start == end + 1``).
    A merged span keeps the first span's fields, with ``word`` joined by a
    space and ``end`` extended to the end of the last merged span.

    Parameters
    ----------
    results : list of dict
        Span dicts carrying at least the ``"start"``, ``"end"`` and ``"word"``
        keys, in the order in which they should be compared.

    Returns
    -------
    list of dict
        New list of merged spans; an empty list when ``results`` is empty.
        The dicts passed in are not modified.
    """
    new_results = []
    current_result = None

    for result in results:
        if current_result is not None and result["start"] == current_result["end"] + 1:
            # Adjacent to the span being built: extend it.
            current_result["word"] += " " + result["word"]
            current_result["end"] = result["end"]
        else:
            if current_result is not None:
                new_results.append(current_result)
            # Shallow copy so merging never rewrites the caller's dicts.
            current_result = dict(result)

    # Flush the last span (nothing to flush when the input was empty).
    if current_result is not None:
        new_results.append(current_result)

    return new_results

In [44]:
def find_similar(q, k):
    """Return the ``k`` catalogue entries most similar to the phrase ``q``.

    The phrase is embedded with the notebook-level ``model`` and compared, by
    cosine similarity, against every row of the notebook-level ``embeddings``.
    The skill text and source label of each hit are read from columns ``1``
    and ``2`` of ``df_clean`` at the same row positions.

    Parameters
    ----------
    q : str
        Phrase to match (passed as-is to ``model.encode``).
    k : int
        Number of best matches to return; must be at least 1.

    Returns
    -------
    pandas.DataFrame or None
        One row per match, best first, with the columns ``score``, ``skill``,
        ``phrase`` and ``matched_by``. ``None`` (after printing
        "No skills matched") when the best similarity is exactly 0.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    # A non-positive k would make the `[-k:]` slice below return (almost) the
    # whole catalogue instead of the top k rows.
    if k < 1:
        raise ValueError("k must be at least 1, got {!r}".format(k))

    query_embedding = model.encode(q)
    scores = cosine_similarity([query_embedding], embeddings)[0]

    # argsort is ascending: keep the last k positions, then flip to best-first.
    top_positions = scores.argsort()[-k:][::-1]

    if scores[top_positions[0]] == 0:
        print("No skills matched")
        return None

    matches = [
        {
            'score': float(scores[pos]),
            'skill': df_clean[1].iloc[pos],
            'phrase': q,
            'matched_by': df_clean[2].iloc[pos],
        }
        for pos in top_positions
    ]
    return pd.DataFrame(matches)

In [45]:
def ner_combined(text, threshold=0.555):
    """Extract skills from ``text`` with two approaches and return both results.

    1. Span extraction + embeddings: the skill and knowledge token classifiers
       tag spans in ``text``, adjacent spans are merged with
       ``aggregate_span``, and each resulting phrase is mapped to its single
       best catalogue match with ``find_similar``.
    2. SEA v1: ``extract_skills(text)`` is called and every ``skill_title``
       found in its nested result is collected.

    Parameters
    ----------
    text : str
        Text to extract skills from.
    threshold : float, optional
        Minimum similarity score a span match needs to be listed in the
        ``mpnet_JINZHAspan`` column. Defaults to 0.555.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``df_final``: one row with the columns ``Text``, ``mpnet_JINZHAspan``
        (sorted matched skills scoring at least ``threshold``) and ``SEAv1``
        (sorted SEA v1 skill titles).
        ``span_extracted``: all span matches, best score first, one row per
        distinct skill, not filtered by ``threshold``.
    """
    output_skills = token_skill_classifier(text)
    output_knowledge = token_knowledge_classifier(text)

    # Replace the pipelines' "entity_group" key with a fixed "entity" label.
    for results, label in ((output_skills, "Skill"), (output_knowledge, "Knowledge")):
        for result in results:
            if result.get("entity_group"):
                result["entity"] = label
                del result["entity_group"]

    output = output_skills + output_knowledge

    # Only aggregate when something was tagged.
    spans = aggregate_span(output) if len(output) > 0 else []
    skill_list = [span['word'] for span in spans]

    #mpnet portion with span extraction
    # Collect all matches first and concatenate once instead of growing a
    # frame inside the loop. find_similar returns None when nothing matched.
    # Later phrases are placed first; row order can decide which duplicate
    # skill survives when scores tie.
    matches = [find_similar(phrase, 1) for phrase in skill_list]
    matches = [match for match in reversed(matches) if match is not None]
    if matches:
        df_skills_extracted = pd.concat(matches, ignore_index=True)
    else:
        df_skills_extracted = pd.DataFrame(columns=['score', 'skill', 'phrase', 'matched_by'])

    span_extracted = (
        df_skills_extracted
        .sort_values('score', ascending=False)
        .drop_duplicates(subset='skill', keep='first')
    )

    span_ex_list = list(span_extracted[span_extracted['score'] >= threshold]['skill'].sort_values())

    #SEA_V1 portion
    sea_v1_list = sorted(
        skill['skill_title']
        for item in extract_skills(text).values()
        for skill in item.values()
    )

    df_final = pd.DataFrame([[text, span_ex_list, sea_v1_list]], columns=['Text', 'mpnet_JINZHAspan', 'SEAv1'])

    return df_final, span_extracted

In [53]:
# ner_combined returns a pair: (one-row comparison frame, per-phrase span matches)
extracted_skills = ner_combined(test)
extracted_skill_df, _span_matches = extracted_skills

In [54]:
# Display the one-row comparison frame (columns: Text, mpnet_JINZHAspan, SEAv1)
extracted_skill_df

Unnamed: 0,Text,mpnet_JINZHAspan,SEAv1
0,\nThis data scientist job requires the ability...,"[business presentation delivery, data and stat...","[Java, big data analytics]"


In [60]:
# Print the skill list produced by span extraction + mpnet matching
for span_skills in extracted_skill_df['mpnet_JINZHAspan']:
    print(span_skills)

['business presentation delivery', 'data and statistical analysis', 'data visualisation', 'machine learning application', 'research data analysis']


In [61]:
# Print the skill list produced by SEA v1
for sea_skills in extracted_skill_df['SEAv1']:
    print(sea_skills)

['Java', 'big data analytics']


We note that SEA V1 not only returned fewer skills, but also **wrongly** extracted Java, seemingly treating it as a programming language.

Let's try posting the same query to ADA 002!

In [63]:
# Post the same text to the hosted skills-finder service
SKILLS_FINDER_URL = 'https://ssg-course-search-ai.herokuapp.com/skills_finder'

# requests has no default timeout; without one a stalled server would hang the cell.
response = requests.post(SKILLS_FINDER_URL, json={'query': test}, timeout=60)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
response.raise_for_status()

# `r` holds the parsed JSON body (a dict with a 'matches' list)
r = response.json()
print(r)

{'matches': [{'id': '2606bd4a7d123eb7a47bfc6460afaa88a171812d89ae81c98948ccb42483ad08', 'score': 0.830264628, 'metadata': {'skill_description_final': 'Data Collection and Analysis: Skill in data analysis and interpretation to identify opportunities and risks, and provide meaningful insights to inform decision making. Utilize appropriate techniques and tools to collect, extract and interpret data, and engage stakeholders to communicate findings.', 'skill_title': 'Data Collection and Analysis', 'skill_type': 'SFw TSC'}}, {'id': '11e651fbc6af74d20fd3740b27da9fad919c050e73454c69511183f6da6e3dcd', 'score': 0.820142746, 'metadata': {'skill_description_final': 'Infographics and Data Visualisation: Create visualisations, infographics and reports to communicate data insights and drive data-driven decision-making. Lead and coach teams in the use of data visualisation tools to enable effective storytelling.', 'skill_title': 'Infographics and Data Visualisation', 'skill_type': 'SFw TSC'}}, {'id': 

In [74]:
# Print the title of every returned match scoring at least 0.8
strong_matches = (match for match in r['matches'] if match['score'] >= 0.8)
for match in strong_matches:
    print(match['metadata']['skill_title'])

Data Collection and Analysis
Infographics and Data Visualisation
Analytics and Computational Modelling
Data and Statistical Analysis
Data Analysis and Interpretation
Business Data Analysis
Data Storytelling and Visualisation
Data Analytics System Design
Data Visualisation
Data-Mining and Modelling
Data Migration
Computational Modelling
Laboratory Data Analysis
