## Load libraries

In [1]:
# Libraries to work cross-platform
import os

# Libraries to work with dataset
import numpy as np
import pandas as pd
import ast                      # convert string to list after importing csv data
# import dask.dataframe as dd
import pickle

# Libraries to pre-process data
# import spacy
from gensim.models.keyedvectors import KeyedVectors

# Libraries to visualize data
# from wordcloud import WordCloud
# import matplotlib.pyplot as plt
# import seaborn as sns

# Libraries for monitoring operation process
from tqdm import tqdm

## Configure and declare global variables

In [2]:
# Project root per operating system family, keyed by the value of os.name.
BASE_DIR_BY_OS = {
    "nt": "E:/THIENDHB_GOOGLEDRIVE/MASTER TILBURG/THESIS/",
    "posix": "/media/pinkalinux/WORK/THIENDHB_GOOGLEDRIVE/MASTER TILBURG/THESIS/",
}

os_name = os.name

# Fail here with a clear message instead of a NameError in a later cell
# when the notebook runs on a platform without a configured project root.
if os_name not in BASE_DIR_BY_OS:
    raise RuntimeError(
        "No BASE_DIR configured for os.name = " + repr(os_name)
    )

BASE_DIR = BASE_DIR_BY_OS[os_name]
# Input and output folders share the same layout on every platform.
INPUT_DIR = BASE_DIR + "DATASET/INPUT/"
OUTPUT_DIR = BASE_DIR + "DATASET/OUTPUT/"

SEED = 6886

# IPython magic selecting the inline matplotlib backend. Note that the
# matplotlib import in the import cell is commented out.
%matplotlib inline

## Import data

In [3]:
# Read the long-format skill token table and preview its first 25 rows
skill_tokens_infile = OUTPUT_DIR + "skill_tokens_long_df.csv"
skill_tokens_long_df = pd.read_csv(skill_tokens_infile)
skill_tokens_long_df.head(25)

Unnamed: 0,row_id,key_id,job_id,skill_id,column_id,column_type,skill,skill_lemma,skill_token
0,10101,101,1,1,1,job_description,ameria investment consulting company,ameria investment consult company,"['ameria', 'investment', 'consult', 'company']"
1,10201,102,1,2,1,job_description,requires high level,require high level,"['require', 'high', 'level']"
2,10301,103,1,3,1,job_description,provides highly responsible,provide highly responsible,"['provide', 'highly', 'responsible']"
3,10401,104,1,4,1,job_description,complex staff assistance,complex staff assistance,"['complex', 'staff', 'assistance']"
4,10501,105,1,5,1,job_description,chief financial officer,chief financial officer,"['chief', 'financial', 'officer']"
5,20101,201,2,1,1,job_description,irex currently seeks,irex currently seek,"['irex', 'currently', 'seek']"
6,20201,202,2,2,1,job_description,position reports directly,position report directly,"['position', 'report', 'directly']"
7,20301,203,2,3,1,job_description,cc program manager,cc program manager,"['cc', 'program', 'manager']"
8,20401,204,2,4,1,job_description,yerevan office,yerevan office,"['yerevan', 'office']"
9,20501,205,2,5,1,job_description,community connections,community connection,"['community', 'connection']"


In [4]:
# Turn the stringified token lists read from the CSV back into Python lists.
# Values that are already lists are kept as they are, so re-running this cell
# does not fail on previously parsed rows.
skill_tokens_long_df["skill_token"] = skill_tokens_long_df["skill_token"].map(
    lambda tokens: tokens if isinstance(tokens, list) else ast.literal_eval(tokens)
)
skill_tokens_long_df

Unnamed: 0,row_id,key_id,job_id,skill_id,column_id,column_type,skill,skill_lemma,skill_token
0,10101,101,1,1,1,job_description,ameria investment consulting company,ameria investment consult company,"[ameria, investment, consult, company]"
1,10201,102,1,2,1,job_description,requires high level,require high level,"[require, high, level]"
2,10301,103,1,3,1,job_description,provides highly responsible,provide highly responsible,"[provide, highly, responsible]"
3,10401,104,1,4,1,job_description,complex staff assistance,complex staff assistance,"[complex, staff, assistance]"
4,10501,105,1,5,1,job_description,chief financial officer,chief financial officer,"[chief, financial, officer]"
...,...,...,...,...,...,...,...,...,...
257200,190010103,1900101,19001,1,3,job_qualification,ra financial system,ra financial system,"[ra, financial, system]"
257201,190010203,1900102,19001,2,3,job_qualification,higher legal education,high legal education,"[high, legal, education]"
257202,190010303,1900103,19001,3,3,job_qualification,high pressure environment,high pressure environment,"[high, pressure, environment]"
257203,190010403,1900104,19001,4,3,job_qualification,professional work experience,professional work experience,"[professional, work, experience]"


In [5]:
# Read the reduced (half-size) word vectors saved in Gensim KeyedVectors format
word_embeddings_infile = OUTPUT_DIR + "skill_halfsize_word_norm_vectors.kv"
word_embeddings = KeyedVectors.load(word_embeddings_infile)

## Create document embeddings

In [6]:
def vectorize(key_list, doc_list, kv, vector_size):
    """Build one embedding per document by averaging its known word vectors.

    Args:
        key_list: Unique ids (job_id + skill_id) of the documents, indexed in
            parallel with ``doc_list``.
        doc_list: Documents, each an iterable of tokens.
        kv: Word embedding exposing ``has_index_for(token)`` and
            ``get_vector(token)`` (Gensim Keyed Vectors format).
        vector_size: Dimension of a single word embedding.

    Returns:
        Dictionary mapping each document id to the mean of the vectors of its
        in-vocabulary tokens. A document without any in-vocabulary token maps
        to its own zero vector of length ``vector_size``.
    """
    features = {}

    progress = tqdm(enumerate(doc_list))
    progress.set_description(desc="Creating document embeddings")

    for idx, tokens in progress:
        key_id = key_list[idx]
        # Tokens missing from the embedding vocabulary are skipped.
        known_vectors = [
            kv.get_vector(token) for token in tokens if kv.has_index_for(token)
        ]
        if known_vectors:
            features[key_id] = np.asarray(known_vectors).mean(axis=0)
        else:
            # Allocate a fresh array per document: a single shared zero vector
            # would let an in-place edit of one entry change all the others.
            features[key_id] = np.zeros(vector_size)
    return features

In [7]:
# Dimension of a single word vector, read from the loaded embeddings
vector_size = word_embeddings.vector_size
vector_size

150

In [8]:
# Embed every skill phrase that was extracted from the job description column
description_mask = skill_tokens_long_df["column_type"] == "job_description"
skill_description_dict = vectorize(
    skill_tokens_long_df.loc[description_mask, "key_id"].tolist(),
    skill_tokens_long_df.loc[description_mask, "skill_token"].tolist(),
    word_embeddings,
    vector_size,
)

Creating document embeddings: : 79921it [00:01, 52963.31it/s]


In [9]:
# Save document embeddings for skills from job description in file
skill_description_dict_outfile = (
    OUTPUT_DIR + "skill_description_halfsize_long_dict.pkl"
)
# The context manager closes the file handle once the dump has finished.
with open(skill_description_dict_outfile, "wb") as outfile:
    pickle.dump(skill_description_dict, outfile)

In [10]:
# Embed every skill phrase that was extracted from the job requirement column
requirement_mask = skill_tokens_long_df["column_type"] == "job_requirement"
skill_requirement_dict = vectorize(
    skill_tokens_long_df.loc[requirement_mask, "key_id"].tolist(),
    skill_tokens_long_df.loc[requirement_mask, "skill_token"].tolist(),
    word_embeddings,
    vector_size,
)

Creating document embeddings: : 84470it [00:01, 55500.65it/s]


In [11]:
# Save document embeddings for skills from job requirement in file
skill_requirement_dict_outfile = (
    OUTPUT_DIR + "skill_requirement_halfsize_long_dict.pkl"
)
# The context manager closes the file handle once the dump has finished.
with open(skill_requirement_dict_outfile, "wb") as outfile:
    pickle.dump(skill_requirement_dict, outfile)

In [12]:
# Embed every skill phrase that was extracted from the job qualification column
qualification_mask = skill_tokens_long_df["column_type"] == "job_qualification"
skill_qualification_dict = vectorize(
    skill_tokens_long_df.loc[qualification_mask, "key_id"].tolist(),
    skill_tokens_long_df.loc[qualification_mask, "skill_token"].tolist(),
    word_embeddings,
    vector_size,
)

Creating document embeddings: : 92814it [00:01, 54373.29it/s]


In [13]:
# Save document embeddings for skills from job qualification in file
skill_qualification_dict_outfile = (
    OUTPUT_DIR + "skill_qualification_halfsize_long_dict.pkl"
)
# The context manager closes the file handle once the dump has finished.
with open(skill_qualification_dict_outfile, "wb") as outfile:
    pickle.dump(skill_qualification_dict, outfile)

In [19]:
# Gather the document ids seen in each of the 3 columns, then allocate a
# zero-filled matrix with one row per id and one vector-wide slot per column
description_keys = set(skill_description_dict)
requirement_keys = set(skill_requirement_dict)
qualification_keys = set(skill_qualification_dict)
key_sets = set().union(description_keys, requirement_keys, qualification_keys)

doc_length, doc_width = len(key_sets), 3 * vector_size
skill_wide_embeddings = np.zeros(shape=(doc_length, doc_width))
doc_length, doc_width

(94937, 450)

In [20]:
# Concat embeddings of 3 columns with corresponding key_id (job_id + skill_id).
# Each source is described once as (embeddings by key_id, row-key suffix,
# slot index in the wide matrix) instead of repeating the same branch 3 times.
column_sources = (
    (skill_description_dict, "01", 0),
    (skill_requirement_dict, "02", 1),
    (skill_qualification_dict, "03", 2),
)

skill_embeddings_dict = {}
for idx, key in tqdm(
    enumerate(sorted(key_sets)),
    total=len(key_sets),
    desc="Concatenating column embeddings",
):
    for source_dict, suffix, slot in column_sources:
        if key not in source_dict:
            # The slot keeps its initial value when the column lacks this key.
            continue
        vector = source_dict[key]
        start = slot * vector_size
        skill_wide_embeddings[idx, start:start + vector_size] = vector
        # The long-format dictionary key is the key_id followed by the column suffix.
        skill_embeddings_dict[str(key) + suffix] = vector

94937it [00:00, 204614.55it/s]


In [21]:
# Sanity check: shape of the wide matrix and number of entries in the long dictionary
skill_wide_embeddings.shape, len(skill_embeddings_dict)

((94937, 450), 257205)

In [22]:
# Preview the first 20 columns of the first two rows of the wide matrix
skill_wide_embeddings[:2, :20]

array([[-8.80736906e-08,  1.87548409e-07, -3.83345991e-07,
         6.80101039e-07,  6.15872580e-07,  1.52150369e-06,
        -8.49366188e-07, -3.29465479e-01,  3.61264169e-01,
        -6.92119524e-02,  8.23668391e-02,  3.52487326e-01,
         3.65901947e-01, -3.32510695e-02, -1.55614853e-01,
         1.37356669e-01, -7.10459054e-02, -2.78310776e-01,
        -5.27126729e-01,  7.15583265e-02],
       [-7.99701638e-08, -1.04489580e-07,  4.71790855e-07,
         5.44738612e-07, -1.93728943e-06,  1.91306458e-06,
        -1.63167715e-06, -2.26008549e-01,  1.83149025e-01,
         5.50982296e-01, -3.73337120e-02, -4.97710565e-03,
        -5.29644072e-01, -9.71275344e-02,  6.14784062e-01,
        -2.78548002e-02,  4.23982859e-01,  3.18652868e-01,
         1.34607747e-01, -3.40985507e-01]])

In [23]:
# Write the concatenated (wide) skill embeddings to a compressed numpy archive
skill_wide_embeddings_outfile = f"{OUTPUT_DIR}skill_wide_halfsize_embeddings.npz"
arrays_to_save = {"skill_wide_embeddings": skill_wide_embeddings}
np.savez_compressed(skill_wide_embeddings_outfile, **arrays_to_save)

In [24]:
# Save the long-format embeddings dictionary; the context manager closes the
# file handle once the dump has finished.
with open(OUTPUT_DIR + "skill_halfsize_embeddings_dict.pkl", "wb") as outfile:
    pickle.dump(skill_embeddings_dict, outfile)