### Tech Data Generation

In [10]:
import pickle
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import json
from generator import SkillsGenerator, DatasetGenerator
from tqdm.notebook import tqdm

from transformers import AutoModel, AutoTokenizer


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
## Loading the embedded taxonomy
# NOTE(review): unpickling runs arbitrary code from the file, so this path
# must only ever point at a trusted, locally produced pickle.
emb_sh = "_jbEn"
with open(f"../../../data/taxonomy/taxonomy_embeddings{emb_sh}.pkl", "rb") as taxonomy_file:
    emb_tax = pickle.load(taxonomy_file)

# The skill name is whatever precedes the first " : " in "name+definition".
emb_tax["name"] = [
    name_and_definition.split(" : ")[0]
    for name_and_definition in emb_tax["name+definition"]
]


## popularity measures
with open("frequency_vals.json", "r") as frequency_file:
    F = json.loads(frequency_file.read())

## Loading the combination dist
# Despite the ".npy" extension the file is parsed as plain text by np.loadtxt.
combination_dist = np.loadtxt("assoc_skilled_dist.npy")

In [71]:
def save_generation(text_file, res):
    """Write generated (skills, job posting) pairs to a human-readable text file.

    Each pair produces three lines: ``skills : <skills>``, the job posting
    (or ``[NULL]`` when the posting is ``None``), and a separator made of
    100 dashes.

    Parameters
    ----------
    text_file : str or os.PathLike
        Destination path. Missing parent directories are created.
    res : iterable of (skills, job_posting)
        ``skills`` is rendered with ``str``; ``job_posting`` is a string or
        ``None``.
    """
    from pathlib import Path

    separator = "-" * 100
    chunks = []
    for skills, job_posting in res:
        chunks.append("skills : " + str(skills) + "\n")
        chunks.append((job_posting if job_posting is not None else "[NULL]") + "\n")
        chunks.append(separator + "\n")

    target = Path(text_file)
    # The "generated/" folder may not exist on a fresh checkout.
    target.parent.mkdir(parents=True, exist_ok=True)

    # Postings contain non-ASCII characters (e.g. bullet points); an explicit
    # encoding keeps the write independent of the platform default.
    with open(target, "w", encoding="utf-8") as f:
        f.write("".join(chunks))

    print("Result saved in : ./" + str(text_file))

In [5]:
# Preview the first five taxonomy rows, including the derived "name" column.
emb_tax.head(5)

Unnamed: 0,unique_id,name+definition,embeddings,name
0,0,manage musical staff : Assign and manage staff...,"[[tensor(0.5317), tensor(-0.4808), tensor(-0.0...",manage musical staff
1,1,supervise correctional procedures : Supervise ...,"[[tensor(0.1043), tensor(-0.5770), tensor(-0.1...",supervise correctional procedures
2,2,apply anti-oppressive practices : Identify opp...,"[[tensor(0.9992), tensor(-0.7967), tensor(-0.6...",apply anti-oppressive practices
3,3,control compliance of railway vehicles regulat...,"[[tensor(0.6206), tensor(-0.4386), tensor(0.66...",control compliance of railway vehicles regulat...
4,4,identify available services : Identify the dif...,"[[tensor(1.0326), tensor(0.0204), tensor(-0.98...",identify available services


In [8]:
## loading the tech skills to reduce the skills we're assessing
tech_skills = pd.read_csv("../../../../esco/digitalSkillsCollection_en.csv")

# ESCO stores several alternative labels per row in a single newline-separated
# cell, so each cell is split into individual labels before matching; empty
# cells (NaN) are dropped instead of being added to the name set.
alt_labels = (
    tech_skills.altLabels.dropna()
    .astype(str)
    .str.split("\n")
    .explode()
    .str.strip()
)
tech_skills_names = set(tech_skills.preferredLabel.dropna()).union(
    alt_labels[alt_labels != ""]
)

# Filter once and reuse the result for both the report and later cells.
tech_emb_tax = emb_tax[emb_tax.name.isin(tech_skills_names)]

print("Complete number of skills : ", len(emb_tax.index))
print("Number of tech skills : ", len(tech_emb_tax.index))

Complete number of skills :  13896
Number of tech skills :  1201


In [14]:
## loading the reference dataset
test = pd.read_csv(
    "https://raw.githubusercontent.com/jensjorisdecorte/Skill-Extraction-benchmark/main/tech_test_annotations.csv"
)

# Discard annotations whose label is one of the two placeholder values.
test = test.loc[~test["label"].isin(["LABEL NOT PRESENT", "UNDERSPECIFIED"])]

# One row per sentence, with all of its labels and spans gathered into lists.
multilabel = (
    test.groupby("sentence")
    .agg({"label": pd.Series.tolist, "span": pd.Series.tolist})
    .reset_index()
)

# "<list of labels> : <sentence>" for each sentence.
multilabel["skill+sentence"] = (
    multilabel["label"].map(str) + " : " + multilabel["sentence"]
)
multilabel.head(5)

Unnamed: 0,sentence,label,span,skill+sentence
0,( Cypress for frontend is a must have pytest f...,[Solidity],[Solidity],['Solidity'] : ( Cypress for frontend is a mus...
1,* Ability to analyse requirements and interact...,"[analyse business requirements, communicate wi...","[analyse requirements, interact directly with ...","['analyse business requirements', 'communicate..."
2,* Ability to support the developers in your te...,"[manage a team, lead others, mentor individual...","[support the developers in your team, leading,...","['manage a team', 'lead others', 'mentor indiv..."
3,* Ability to work in large collaborative teams...,"[work in teams, meet commitments, work in team...","[work in large collaborative teams, achieve or...","['work in teams', 'meet commitments', 'work in..."
4,* Ability to work in large collaborative teams...,"[work in teams, meet commitments, think innova...","[work in large collaborative teams, achieve or...","['work in teams', 'meet commitments', 'think i..."


In [77]:
# Hugging Face checkpoint used for both the embedding model and its tokenizer.
word_emb = "jjzha/jobbert-base-cased"
word_emb_model = AutoModel.from_pretrained(word_emb)
word_emb_tokenizer = AutoTokenizer.from_pretrained(word_emb)

# Skill sampler, built on the tech-only subset of the taxonomy.
gen = SkillsGenerator(
    taxonomy=tech_emb_tax,
    taxonomy_is_embedded=True,
    combination_dist=combination_dist,
    popularity=F,
)

# Dataset generator, given the full taxonomy and the reference sentences.
datagen = DatasetGenerator(emb_tax, multilabel, word_emb_model, word_emb_tokenizer)

# NOTE(review): parameter meanings below are carried over from the original
# inline notes; confirm against generator.py.
stochastic_gen_args = dict(
    total_generations=50,            # number of samples
    threshold=0.6,                   # skills less similar than this are not considered
    beam_size=50,                    # number of candidate skills considered
    temperature_skill=1,             # we want to see diversity in generated words
    temperature_pairing=2,           # popularity to be skewed toward popular skills
    temperature_sample_size=0.1,
    frequency_select=True,           # whether we select within the NN according to frequency
    upper_bound_skill_matching=0,    # 0 here, 5 in the pairing sections
)

Some weights of BertModel were not initialized from the model checkpoint at jjzha/jobbert-base-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Zero shot GPT-4 with single skills

In [36]:
# Draw skill samples using the sampling arguments defined in the setup cell.
# NOTE(review): presumably a one-shot iterator (tqdm reports no total below);
# re-run this cell before calling generate_ds again — TODO confirm.
generations = gen.stochastic_inf_iter(**stochastic_gen_args)

In [37]:
# nb_few_shots=0: no few-shot examples are requested for this zero-shot run.
res = datagen.generate_ds(generations, specific_few_shots=True, nb_few_shots=0)

0it [00:00, ?it/s]

----------------------------------------------------------------------------------------------------
You are a hiring manager for a big company. You need to define write a job opening for different skill requirements in your company.

You are a hiring manager for a big company and your goal is to write the perfect sentence to describe job that uses a set of skills. You'll be given a set of skill, the job posting will reference each of them explicitely or implicitely. The job you describe must include capailities in each of these skills. No external skills should be mentionned. The description of the job should be one line long and be as specific as possible.

skills: ['e-agriculture']
----------------------------------------------------------------------------------------------------
You are a hiring manager for a big company. You need to define write a job opening for different skill requirements in your company.

You are a hiring manager for a big company and your goal is to write th

In [40]:
# Write the (skills, job posting) pairs of the single-skill run to a text file.
text_file = "generated/zero_shot_gpt4_tech_50_samples.txt"
save_generation(text_file, res)

Result saved in : ./generated/zero_shot_gpt4_tech_50_samples.txt


### Zero shot GPT-4 with skills pairing

In [67]:
# Sampling arguments for the skill-pairing run.
# NOTE(review): parameter meanings below are carried over from the original
# inline notes; confirm against generator.py.
stochastic_gen_args = dict(
    total_generations=50,            # number of samples
    threshold=0.87,                  # skills less similar than this are not considered
    beam_size=50,                    # number of candidate skills considered
    temperature_skill=1,             # we want to see diversity in generated words
    temperature_pairing=2,           # popularity to be skewed toward popular skills
    temperature_sample_size=0.1,
    frequency_select=True,           # whether we select within the NN according to frequency
    upper_bound_skill_matching=5,
)

generations = gen.stochastic_inf_iter(**stochastic_gen_args)

In [68]:
# nb_few_shots=0: no few-shot examples are requested for this zero-shot run.
res = datagen.generate_ds(generations, specific_few_shots=True, nb_few_shots=0)

0it [00:00, ?it/s]

----------------------------------------------------------------------------------------------------
You are a hiring manager for a big company. You need to define write a job opening for different skill requirements in your company.

You are a hiring manager for a big company and your goal is to write the perfect sentence to describe job that uses a set of skills. You'll be given a set of skill, the job posting will reference each of them explicitely or implicitely. The job you describe must include capailities in each of these skills. No external skills should be mentionned. The description of the job should be one line long and be as specific as possible.

skills: ['manage open publications', 'monitor ICT research']
----------------------------------------------------------------------------------------------------
You are a hiring manager for a big company. You need to define write a job opening for different skill requirements in your company.

You are a hiring manager for a big c

In [69]:
# Write the (skills, job posting) pairs of the skill-pairing run to a text file.
text_file = "generated/zero_shot_gpt4_tech_50_samples_skill_pairing.txt"
save_generation(text_file, res)

Result saved in : ./generated/zero_shot_gpt4_tech_50_samples_skill_pairing.txt


### Specific few shots GPT-4 with pairing

In [91]:
# Sampling arguments for the few-shot run with skill pairing.
# NOTE(review): parameter meanings are carried over from the original inline
# notes; confirm against generator.py.
stochastic_gen_args = {
    "total_generations": 10,  # number of samples
    "threshold": 0.87,  # skills less similar than this are not considered
    "beam_size": 50,  # number of candidate skills considered
    "temperature_skill": 1,  # we want to see diversity in generated words
    "temperature_pairing": 2,  # popularity to be skewed toward popular skills
    "temperature_sample_size": 0.05,
    "frequency_select": True,  # whether we select within the NN according to frequency
    "upper_bound_skill_matching": 5,
}

generations = gen.stochastic_inf_iter(**stochastic_gen_args)

# Single source of truth for the generate_ds call below. The key was
# previously misspelled "specific_gew_shots" and the dict was never used.
generation_args = {
    "generations": generations,  # sample generator
    "specific_few_shots": True,  # enable the use of specific few shots
    "nb_few_shots": 3,  # number of few shots to use
    "shot_sim_treshold": 0.75,
}

# NOTE(review): "shot_sim_treshold" is intentionally not forwarded. The
# original call never passed it and the generate_ds signature is not visible
# from this notebook; confirm the parameter name in generator.py first.
res = datagen.generate_ds(
    generation_args["generations"],
    specific_few_shots=generation_args["specific_few_shots"],
    nb_few_shots=generation_args["nb_few_shots"],
)

0it [00:00, ?it/s]

##################################################
[0.75619775 0.75559354 0.7481957 ]
input skills :  ['manage quantitative data', 'inspect data', 'manage research data', 'record data from biomedical tests', 'collect ICT data', 'use data processing techniques']
top sim ids :  [ 91 206 210]
associated_skills :  ["skills: ['data extraction, transformation and loading tools', 'use databases', 'data extraction, transformation and loading tools']"
 "skills: ['analyse test data']"
 "skills: ['develop automated software tests', 'assess reliability of data', 'visual presentation techniques']"]
##################################################
----------------------------------------------------------------------------------------------------
You are a hiring manager for a big company. You need to define write a job opening for different skill requirements in your company.

You are a hiring manager for a big company and your goal is to write the perfect sentence to describe job that uses a set

In [94]:
# Inspect which few-shot examples are retrieved for a hand-picked skill set.
# NOTE(review): 3 is presumably the number of shots and 0.0 a similarity
# threshold — TODO confirm against generator.py.
datagen.generate_specific_few_shots(["SQL", "Scala"], 3, 0.0)

##################################################
[0.76983726 0.747653   0.7474016 ]
input skills :  ['SQL', 'Scala']
top sim ids :  [107 335 196]
["skills: ['develop test procedures', 'Objective-C', 'Java (computer programming)']\nJob Opening : Experience with TDD ( test driven development ) Knowledge of other analytics tools and programming languages especially Objective C and Java.\n", "skills: ['tools for ICT test automation']\nJob Opening : • Very good understanding of test automation frameworks ..\n", "skills: ['database management systems', 'Python (computer programming)']\nJob Opening : Strong familiarity with database structures webapp design in Python and ORMs ( Django ORM SQLAlchemy ) and can adapt them as required.\n"]
##################################################


["skills: ['develop test procedures', 'Objective-C', 'Java (computer programming)']\nJob Opening : Experience with TDD ( test driven development ) Knowledge of other analytics tools and programming languages especially Objective C and Java.\n",
 "skills: ['tools for ICT test automation']\nJob Opening : • Very good understanding of test automation frameworks ..\n",
 "skills: ['database management systems', 'Python (computer programming)']\nJob Opening : Strong familiarity with database structures webapp design in Python and ORMs ( Django ORM SQLAlchemy ) and can adapt them as required.\n"]