## Load libraries

In [83]:
# Libraries to work with dataset
import pandas as pd
import numpy as np

# Library to create embeddings
import nltk
from gensim.models import KeyedVectors

# Libraries to visualize data
from tqdm import tqdm     # displaying progress bar while running computation

## Configurate and declare global variables

In [84]:
BASE_DIR = "E:/THIENDHB_GOOGLEDRIVE/MASTER TILBURG/THESIS/"
INPUT_DIR = BASE_DIR + "DATASET/INPUT/"
OUTPUT_DIR = BASE_DIR + "DATASET/OUTPUT/"

## Import data

In [85]:
# Load clean job title dataset from file
job_df = pd.read_csv(OUTPUT_DIR + "clean_title_df.csv")
job_df

Unnamed: 0,job_title
0,chief financial officer
1,full time community connection intern pay inte...
2,country coordinator
3,bcc specialist
4,software developer
...,...
18987,senior creative ux ui designer
18988,category development manager
18989,operational marketing manager
18990,head online sale department


In [86]:
raw_job_df = pd.read_csv(INPUT_DIR + "refined_jobpost_data.csv", usecols=['job_title'])
raw_job_df

Unnamed: 0,job_title
0,Chief Financial Officer
1,Full-time Community Connections Intern (paid i...
2,Country Coordinator
3,BCC Specialist
4,Software Developer
...,...
18987,Senior Creative UX/ UI Designer
18988,Category Development Manager
18989,Operational Marketing Manager
18990,Head of Online Sales Department


## Create embeddings

In [87]:
def create_vocab_list(df, col):
    vocab = []
    for doc in tqdm(df[col]):
        words = [word for word in nltk.word_tokenize(str(doc))]
        vocab.extend(words)

    return vocab

In [88]:
tqdm.pandas(desc="Creating Job Title Corpus")
full_vocab_list = create_vocab_list(job_df, 'job_title')

100%|█████████████████████████████████████████████████████████████████████████| 18992/18992 [00:01<00:00, 13556.04it/s]


In [89]:
print(len(full_vocab_list))
full_vocab_list[:10]

56353


['chief',
 'financial',
 'officer',
 'full',
 'time',
 'community',
 'connection',
 'intern',
 'pay',
 'internship']

In [90]:
vocab_list = list(set(full_vocab_list))
print(len(vocab_list))
vocab_list[:10]

2617


['initiative',
 'reality',
 'sr',
 'software',
 'kindergarten',
 'pharmacist',
 'rating',
 'field',
 'yoga',
 'saleswoman']

In [91]:
# load the Stanford GloVe model in Word2Vec format
pre_model = KeyedVectors.load_word2vec_format(INPUT_DIR + "w2v_from_glove.6B.100d.txt")

In [92]:
model_vocab = pre_model.index_to_key

In [93]:
out_of_vocab = [token for token in vocab_list if token not in model_vocab]
print(len(out_of_vocab))
out_of_vocab

186


['tchambarak',
 'wostayn',
 'marag',
 'srecialist',
 'dcfta',
 'truthing',
 'kumayri',
 'instumintation',
 'procredit',
 'flotator',
 'gortsaranain',
 'dcop',
 'usgaap',
 'sanoshops',
 'internatioonal',
 'usarmenia',
 'marzes',
 'ettl',
 'sayat',
 'bankmail',
 'brsc',
 'aparan',
 'hrci',
 'jfdp',
 'angularjs',
 'gnoseological',
 'gegharqunik',
 'materiological',
 'methodologist',
 'spesialist',
 'prip',
 'aregak',
 'assuarance',
 'tsapatagh',
 'developmant',
 'achajour',
 'merchendiser',
 'chambarak',
 'sncos',
 'baldinini',
 'tecnical',
 'horeca',
 'cedc',
 'stepanavan',
 'wvmeer',
 'scibm',
 'sisian',
 'bookkepeer',
 'offier',
 'iwrm',
 'languagetranslator',
 'specilalist',
 'airticket',
 'achitect',
 'modelmaker',
 'becd',
 'biling',
 'adminsitrative',
 'ugrad',
 'involver',
 'enpard',
 'dceo',
 'careercenter',
 'charentsavan',
 'pmdi',
 'uplistsikhe',
 'techincal',
 'dispetcher',
 'nushikyan',
 'vlac',
 'assitant',
 'dafi',
 'ceed',
 'amasia',
 'kajaran',
 'mounter',
 'enpi',
 'viz

In [95]:
def create_corpus(df, col):
    corpus = []
    for doc in tqdm(df[col]):
        words = [word for word in nltk.word_tokenize(str(doc))]
        corpus.append(words)

    return corpus

In [96]:
def vectorize(doc_list, keyed_vectors):
    """Generate vectors for list of documents using a Word Embedding

    Args:
        doc_list: List of documents
        keyed_vectors: Gensim's Word Embedding

    Returns:
        List of document vectors (mean of word vectors)
    """
    features = []

    for tokens in doc_list:
        zero_vector = np.zeros(keyed_vectors.vector_size)
        vectors = []
        for token in tokens:
            if token in keyed_vectors:
                try:
                    vectors.append(keyed_vectors[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            avg_vec = vectors.mean(axis=0)
            features.append(avg_vec)
        else:
            features.append(zero_vector)
    return features

In [97]:
title_docs = create_corpus(job_df, 'job_title')
title_embeddings = vectorize(title_docs, keyed_vectors=pre_model)
len(title_embeddings), len(title_embeddings[0])

100%|█████████████████████████████████████████████████████████████████████████| 18992/18992 [00:01<00:00, 11907.23it/s]


(18992, 100)

In [101]:
title_docs[:5]

[['chief', 'financial', 'officer'],
 ['full', 'time', 'community', 'connection', 'intern', 'pay', 'internship'],
 ['country', 'coordinator'],
 ['bcc', 'specialist'],
 ['software', 'developer']]

In [99]:
title_embeddings[:1]

[array([ 3.69033334e-03, -8.16303313e-01,  2.58175343e-01, -5.07819951e-01,
         2.10846677e-01, -6.16106331e-01, -2.74685651e-01, -3.47249359e-01,
        -2.43132666e-01,  1.37167662e-01,  1.98463321e-01, -3.64103280e-02,
         3.89976650e-01,  3.74276638e-01, -4.11023349e-01, -5.15526652e-01,
         3.82256657e-01, -4.81333345e-01, -8.08190048e-01,  5.29996715e-02,
         2.01666844e-03, -3.53026003e-01,  1.31509900e-02,  6.21566735e-02,
        -5.93280017e-01, -5.18303327e-02,  6.04100013e-03, -2.52666682e-01,
        -1.05693340e-01,  3.43269974e-01,  3.63386661e-01,  3.38946670e-01,
        -7.02413321e-01, -3.53622675e-01, -5.72921634e-01, -1.58606663e-01,
        -2.40019992e-01,  3.60786676e-01,  5.92480004e-01, -7.88466707e-02,
        -4.56159979e-01, -2.84089327e-01,  4.85139996e-01, -5.09253331e-02,
         3.02446663e-01,  6.68813288e-01, -1.62749991e-01, -2.43542984e-01,
        -4.65536684e-01, -5.84923327e-01,  1.28453359e-01, -6.46554351e-01,
        -1.4

In [100]:
# Save embeddings file in numpy format
title_embeddings_outfile = OUTPUT_DIR + "title_embeddings"
np.save(title_embeddings_outfile, title_embeddings)

# Save job title tokens in file
title_docs_outfile = OUTPUT_DIR + "title_docs.csv"
df = raw_job_df.copy(deep=True)
df["clean_job_title"] = pd.Series(title_docs)
pd.DataFrame(data=df).to_csv(title_docs_outfile, index=False)