## Load libraries

In [57]:
# Libraries to work cross-platform
import os

# Libraries to work with dataset
import numpy as np
import pandas as pd

# Libraries to pre-process data
import nltk
from nltk.corpus import stopwords
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from rake_nltk import Rake

# Libraries for monitoring operation process
from tqdm import tqdm

## Configurate and declare global variables

In [2]:
os_name = os.name

if os_name == 'nt':
    BASE_DIR = "E:/THIENDHB_GOOGLEDRIVE/MASTER TILBURG/THESIS/"
    INPUT_DIR = BASE_DIR + "DATASET/INPUT/"
    OUTPUT_DIR = BASE_DIR + "DATASET/OUTPUT/"
elif os_name == 'posix':
    BASE_DIR = "/media/pinkalinux/WORK/THIENDHB_GOOGLEDRIVE/MASTER TILBURG/THESIS/"
    INPUT_DIR = BASE_DIR + "DATASET/INPUT/"
    OUTPUT_DIR = BASE_DIR + "DATASET/OUTPUT/"

SEED = 6886

%matplotlib inline

## Import data

In [3]:
df_raw = pd.read_csv(INPUT_DIR + "refined_jobpost_data.csv")
pd.set_option('display.max_colwidth', None)

## Explore data

In [4]:
skill_columns = ["job_description", "job_requirement", "job_qualification"]
skill_df = df_raw[["job_id"] + skill_columns]
skill_df.head(2)

Unnamed: 0,job_id,job_description,job_requirement,job_qualification
0,1,"AMERIA Investment Consulting Company is seeking a\r\nChief Financial Officer. This position manages the company's fiscal and\r\nadministrative functions, provides highly responsible and technically\r\ncomplex staff assistance to the Executive Director. The work performed\r\nrequires a high level of technical proficiency in financial management\r\nand investment management, as well as management, supervisory, and\r\nadministrative skills.","- Supervises financial management and administrative staff, including\r\nassigning responsibilities, reviewing employees' work processes and\r\nproducts, counseling employees, giving performance evaluations, and\r\nrecommending disciplinary action;\r\n- Serves as member of management team participating in both strategic\r\nand operational planning for the company;\r\n- Directs and oversees the company's financial management activities,\r\nincluding establishing and monitoring internal controls, managing cash\r\nand investments, and managing the investment portfolio in collaboration\r\nwith the Investment team leader. This includes, but is not limited to,\r\nevaluation of investment risk, concentration risk, fund deployment\r\nlevels, adequacy of loss and liquidity reserves Assists investment team\r\nin development of proper documentation and internal systems;\r\n- Directs and oversees the annual budgeting process, including\r\ndeveloping projections for financial planning, and preparing budgets;\r\n- Prepares external and internal financial management reports, such as\r\naudited financial statements, tax returns, and reports for the board of\r\ndirectors and company staff;\r\n- Develops, implements, and maintains efficient and effective accounting\r\nsystems and controls to ensure compliance with national and\r\ninternational accounting standards and principles, sufficiency of fund\r\naccounting, and comprehensiveness of data for reporting and compliance\r\nrequirements;\r\n- Ensures contract compliance, including interpreting and monitoring\r\ncontracts with clients, submitting required reports, and monitoring\r\ncovenants and other contract terms;\r\n- Oversees the design, implementation and maintenance of computer-based\r\ninformation system. Oversees records retention (both manual and\r\ncomputer-based) and file maintenance activities;\r\n- Serves as company's risk manager, including evaluating loss exposure\r\nand obtaining insurance as appropriate;\r\n- Manages other administrative operations, such as facilities\r\nmanagement, payroll administration, office operations, and\r\nadministrative support;\r\n- Monitors corporate compliance with by-laws and articles of\r\nincorporation regarding corporate registration and reporting of\r\nfundraising operations.","To perform this job successfully, an\r\nindividual must be able to perform each essential duty satisfactorily.\r\nThe requirements listed below are representative of the knowledge,\r\nskill, and/or ability required.\r\nKnowledge of:\r\n- Generally accepted accounting principles;\r\n- Local accounting standards and legislation;\r\n- State reporting requirements pertaining to accounting;\r\n- Principles and practices of financial management and budgeting;\r\n- Principles and practices of financial systems design and analysis;\r\n- Principles and practices of contract management, records management,\r\nand risk management;\r\n- Principles and practices of management and supervision;\r\n- Principles and practices of information systems management.\r\nAbility to:\r\n- Apply sound fiscal and administrative practices to the company's\r\nactivities;\r\n- Plan, organize and supervise the work of subordinate employees,\r\nincluding training them, assigning and evaluating their work, and\r\nproviding job performance feedback;\r\n- Critically analyze fiscal and administrative policies, practices,\r\nprocedures, and systems, and recommend and implement changes as needed;\r\n- Gather and synthesize financial information from a variety of sources\r\nand present it to a variety of audiences with differing financial\r\nmanagement and analysis expertise;\r\n- Prepare detailed, comprehensive financial reports, including\r\nexplanatory text;\r\n- Operate IBM-compatible personal computer, including word processing,\r\nspreadsheet, and database software applications;\r\n- Operate specialized software applications that support the financial\r\nmanagement and budgeting functions.\r\nQualifications:\r\n- A minimum of 5-7 years Accounting/ Corporate Finance/ Banking\r\nexperience, including a role as a CFO;\r\n- Excellent finance and accounting technical skills coupled with a\r\ndemonstrated knowledge of all key financial functions in an consulting\r\ncompany context - accounting, finance, control, treasury, reserving, and\r\nreporting;\r\n- Strong financial planning and analytical skills and experience and the\r\nability to work closely with and support the CEO and other executives in\r\nstrategic development and implementation;\r\n- Excellent leadership, management and supervisory track record of\r\nattracting, selecting, developing, rewarding and retaining high-caliber,\r\naccounting and finance executive and teams who achieve business goals;\r\n- An undergraduate degree in finance, business, or other related\r\ndiscipline is required. A CPA, CFA, ACCA or other financial\r\ncertification is highly preferred, as is a Masters degree in Business\r\nAdministration, Accounting or Finance;\r\n- Fluency in English, Armenian and Russian with outstanding writing\r\nskills;\r\n- Excellent analytical, communication, teamwork, interpersonal skills;\r\n- Need to be well organized and detail-oriented as well as goal/ result\r\ndriven and able to deal with complex issues."
1,2,IREX currently seeks to fill the position of a paid\r\nIntern for the Community Connections (CC) Program. The position is based\r\nin the Yerevan office however applicants must be willing to travel\r\nthroughout Armenia as necessary. This position reports directly to the\r\nCC Program Manager.,"- Presenting the CC program to interested parties; \r\n- Assisting in planning and scheduling of programmatic meetings and\r\nevents (this includes coordinating logistics for CC staff, visitors and\r\nparticipants);\r\n- Assisting the Program Staff;\r\n- Translation/Interpretation from Armenian to English and vice versa;\r\n- Helping create, maintain and update the CC filing system and\r\ndatabases;\r\n- Completing general administrative tasks for the CC program within the\r\noffice;\r\n- Other duties as assigned/ needed.",- Bachelor's Degree; Master's is preferred;\r\n- Excellent skills in spoken and written English and Armenian languages;\r\n- Past English to Armenian translation and Armenian to English\r\ntranslation experience;\r\n- Good communication and public speaking skills;\r\n- Ability to work independently and as part of a team.\r\nREMUNERATION: Commensurate with experience.


## Text preprocessing

In [5]:
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Helper function to convert nltk POS tags to wordnet POS tags"""
    if nltk_tag.startswith("J"):
        return wordnet.ADJ
    elif nltk_tag.startswith("V"):
        return wordnet.VERB
    elif nltk_tag.startswith("N"):
        return wordnet.NOUN
    elif nltk_tag.startswith("R"):
        return wordnet.ADV
    else:
        return wordnet.NOUN


def lemmatize_token(token):
    lemmatizer = WordNetLemmatizer()
    nltk_tagged = nltk.pos_tag(token)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            # if no tag found then use as it is
            lemmatized_sentence.append(word)
        else:
            # else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))

    return " ".join(lemmatized_sentence)


class pre_process_text:
    """Function to pre process text once for all steps:
    - Replace all consecutive special characters by 1 single character "|"
    - Removing URLs
    - Removing stop words
    - Removing special characters and numbers and replace by 1 space
    - Removing short words with length < 2
    - Lowercasing
    - Lemmatizing

    Inputs:
    - text: raw text to normalize
    - spec_chars: list of special characters to remove
    - stop_words: set of stopwords to remove

    Return: normalized text
    """

    def __init__(self, text, spec_chars, stop_words):
        self.text = text
        self.spec_chars = spec_chars
        self.stop_words = stop_words

    def convert2string(self):
        self.text = str(self.text)
        return self

    def lowercase(self):
        # convert to string and lowercasing
        self.text = self.text.lower()
        return self

    def remove_url(self):
        # remove URLs
        self.text = re.sub(r"http\S+", "", self.text)
        self.text = re.sub(r"www\S+", "", self.text)
        return self

    def remove_spec_chars(self):
        # remove special characters and numbers
        # self.text = self.text.translate(str.maketrans("", "", self.spec_chars))
        for char in self.spec_chars:
            self.text = self.text.replace(char, " ")
        self.text = re.sub("[ ]{2,}", " ", self.text)
        return self

    def replace_spec_chars(self):
        # replace special characters and numbers by "|"
        for char in self.spec_chars:
            self.text = self.text.replace(char, "|")
        self.text = re.sub("[|]{2,}", "|", self.text)
        return self

    def get_tokens(self):
        # tokenize
        self.text = nltk.word_tokenize(self.text)
        return self

    def join_phrases(self):
        # join keyphrases into 1 document
        self.text = [word for phrase in self.text for word in phrase.split()]
        return self

    def remove_stopwords(self):
        # remove stop words
        self.text = [word for word in self.text if word not in self.stop_words]
        self.text = " ".join(self.text)
        return self

    def remove_shortwords(self):
        # remove words with length < 2
        self.text = [word for word in self.text if len(word) >= 2]
        return self

    def lemmatize(self):
        # lemmatize
        self.text = lemmatize_token(self.text)
        return self

    def clean(self):
        self = self.convert2string()
        self = self.lowercase()
        self = self.remove_url()
        self = self.replace_spec_chars()
        return self

    def extract_keyphrases(self):
        # extract keyphrases
        rake_nltk_var = Rake(
            stopwords=self.stop_words,
            punctuations=self.spec_chars,
            min_length=1,
            max_length=5,
        )
        rake_nltk_var.extract_keywords_from_text(self.text)
        self.text = rake_nltk_var.get_ranked_phrases()[:20]
        return self.text

    def normalize(self):
        self = self.join_phrases()
        self = self.remove_stopwords()
        self = self.remove_spec_chars()
        self = self.get_tokens()
        self = self.remove_shortwords()
        self = self.lemmatize()
        self = self.get_tokens()
        return self.text

In [6]:
# Set stopwords and special characters to remove
stop_words = set(stopwords.words("english"))
spec_chars = string.punctuation + string.digits + "\n\r"

In [7]:
tqdm.pandas(desc="Job Skill Keyphrase extracting")
clean_skill_df = skill_df[skill_columns].progress_applymap(
    lambda x: pre_process_text(x, spec_chars=spec_chars, stop_words=stop_words)
    .clean()
    .extract_keyphrases()
)
clean_skill_df.insert(0, 'job_id', skill_df['job_id'])
clean_skill_df.head(3)

Job Skill Keyphrase extracting: 100%|██████████████████████████████████████████| 56976/56976 [00:19<00:00, 2922.71it/s]


Unnamed: 0,job_id,job_description,job_requirement,job_qualification
0,1,"[ameria investment consulting company, provides highly responsible, complex staff assistance, chief financial officer, investment management, financial management, work performed, technical proficiency, position manages, high level, executive director, administrative skills, administrative functions, company, management, well, technically, supervisory, seeking, requires]","[liquidity reserves assists investment team, incorporation regarding corporate registration, including evaluating loss exposure, internal financial management reports, investment team leader, management team participating, recommending disciplinary action, giving performance evaluations, audited financial statements, annual budgeting process, supervises financial management, financial management activities, monitors corporate compliance, submitting required reports, oversees records retention, international accounting standards, file maintenance activities, ensures contract compliance, monitoring internal controls, investment risk]","[providing job performance feedback, operate specialized software applications, state reporting requirements pertaining, generally accepted accounting principles, accounting technical skills coupled, database software applications, supervisory track record, essential duty satisfactorily, critically analyze fiscal, compatible personal computer, apply sound fiscal, local accounting standards, synthesize financial information, strong financial planning, comprehensive financial reports, key financial functions, including word processing, achieve business goals, financial systems design, information systems management]"
1,2,"[yerevan office however applicants must, irex currently seeks, position reports directly, cc program manager, throughout armenia, community connections, program, position, cc, willing, travel, paid, necessary, intern, fill, based]","[completing general administrative tasks, includes coordinating logistics, cc filing system, cc program within, cc program, cc staff, program staff, vice versa, programmatic meetings, interested parties, helping create, visitors, update, translation, scheduling, presenting, planning, participants, office, needed]","[public speaking skills, excellent skills, work independently, good communication, written english, past english, armenian translation, armenian languages, translation experience, english, armenian, experience, team, spoken, remuneration, preferred, part, master, degree, commensurate]"
2,3,"[time position, primary contact, international organizations, environmental ngos, public outreach, public agencies, public, strengthening, serve, network, growing, full, cenn, businesses]","[provide environmental information, primary local contact, prepare cenn seminars, international organizations, country director, caucasus region, armenian offices, armenian ngos, cenn, workshops, working, strategy, serving, policy, participating, organize, helping, defining, businesses, armenia]","[environmentally related field, environmental issues specific, years relevant, written fluency, salary commensurate, working, russian, remuneration, plus, oral, knowledge, experience, english, degree, armenian, armenia]"


In [8]:
clean_skill_df["combi1"] = (
    clean_skill_df["job_description"] + clean_skill_df["job_requirement"]
)
clean_skill_df["combi2"] = (
    clean_skill_df["job_description"] + clean_skill_df["job_qualification"]
)
clean_skill_df["combi3"] = (
    clean_skill_df["job_requirement"] + clean_skill_df["job_qualification"]
)
clean_skill_df["skills"] = (
    clean_skill_df["job_description"]
    + clean_skill_df["job_requirement"]
    + clean_skill_df["job_qualification"]
)
clean_skill_df.head(3)

Unnamed: 0,job_id,job_description,job_requirement,job_qualification,combi1,combi2,combi3,skills
0,1,"[ameria investment consulting company, provides highly responsible, complex staff assistance, chief financial officer, investment management, financial management, work performed, technical proficiency, position manages, high level, executive director, administrative skills, administrative functions, company, management, well, technically, supervisory, seeking, requires]","[liquidity reserves assists investment team, incorporation regarding corporate registration, including evaluating loss exposure, internal financial management reports, investment team leader, management team participating, recommending disciplinary action, giving performance evaluations, audited financial statements, annual budgeting process, supervises financial management, financial management activities, monitors corporate compliance, submitting required reports, oversees records retention, international accounting standards, file maintenance activities, ensures contract compliance, monitoring internal controls, investment risk]","[providing job performance feedback, operate specialized software applications, state reporting requirements pertaining, generally accepted accounting principles, accounting technical skills coupled, database software applications, supervisory track record, essential duty satisfactorily, critically analyze fiscal, compatible personal computer, apply sound fiscal, local accounting standards, synthesize financial information, strong financial planning, comprehensive financial reports, key financial functions, including word processing, achieve business goals, financial systems design, information systems management]","[ameria investment consulting company, provides highly responsible, complex staff assistance, chief financial officer, investment management, financial management, work performed, technical proficiency, position manages, high level, executive director, administrative skills, administrative functions, company, management, well, technically, supervisory, seeking, requires, liquidity reserves assists investment team, incorporation regarding corporate registration, including evaluating loss exposure, internal financial management reports, investment team leader, management team participating, recommending disciplinary action, giving performance evaluations, audited financial statements, annual budgeting process, supervises financial management, financial management activities, monitors corporate compliance, submitting required reports, oversees records retention, international accounting standards, file maintenance activities, ensures contract compliance, monitoring internal controls, investment risk]","[ameria investment consulting company, provides highly responsible, complex staff assistance, chief financial officer, investment management, financial management, work performed, technical proficiency, position manages, high level, executive director, administrative skills, administrative functions, company, management, well, technically, supervisory, seeking, requires, providing job performance feedback, operate specialized software applications, state reporting requirements pertaining, generally accepted accounting principles, accounting technical skills coupled, database software applications, supervisory track record, essential duty satisfactorily, critically analyze fiscal, compatible personal computer, apply sound fiscal, local accounting standards, synthesize financial information, strong financial planning, comprehensive financial reports, key financial functions, including word processing, achieve business goals, financial systems design, information systems management]","[liquidity reserves assists investment team, incorporation regarding corporate registration, including evaluating loss exposure, internal financial management reports, investment team leader, management team participating, recommending disciplinary action, giving performance evaluations, audited financial statements, annual budgeting process, supervises financial management, financial management activities, monitors corporate compliance, submitting required reports, oversees records retention, international accounting standards, file maintenance activities, ensures contract compliance, monitoring internal controls, investment risk, providing job performance feedback, operate specialized software applications, state reporting requirements pertaining, generally accepted accounting principles, accounting technical skills coupled, database software applications, supervisory track record, essential duty satisfactorily, critically analyze fiscal, compatible personal computer, apply sound fiscal, local accounting standards, synthesize financial information, strong financial planning, comprehensive financial reports, key financial functions, including word processing, achieve business goals, financial systems design, information systems management]","[ameria investment consulting company, provides highly responsible, complex staff assistance, chief financial officer, investment management, financial management, work performed, technical proficiency, position manages, high level, executive director, administrative skills, administrative functions, company, management, well, technically, supervisory, seeking, requires, liquidity reserves assists investment team, incorporation regarding corporate registration, including evaluating loss exposure, internal financial management reports, investment team leader, management team participating, recommending disciplinary action, giving performance evaluations, audited financial statements, annual budgeting process, supervises financial management, financial management activities, monitors corporate compliance, submitting required reports, oversees records retention, international accounting standards, file maintenance activities, ensures contract compliance, monitoring internal controls, investment risk, providing job performance feedback, operate specialized software applications, state reporting requirements pertaining, generally accepted accounting principles, accounting technical skills coupled, database software applications, supervisory track record, essential duty satisfactorily, critically analyze fiscal, compatible personal computer, apply sound fiscal, local accounting standards, synthesize financial information, strong financial planning, comprehensive financial reports, key financial functions, including word processing, achieve business goals, financial systems design, information systems management]"
1,2,"[yerevan office however applicants must, irex currently seeks, position reports directly, cc program manager, throughout armenia, community connections, program, position, cc, willing, travel, paid, necessary, intern, fill, based]","[completing general administrative tasks, includes coordinating logistics, cc filing system, cc program within, cc program, cc staff, program staff, vice versa, programmatic meetings, interested parties, helping create, visitors, update, translation, scheduling, presenting, planning, participants, office, needed]","[public speaking skills, excellent skills, work independently, good communication, written english, past english, armenian translation, armenian languages, translation experience, english, armenian, experience, team, spoken, remuneration, preferred, part, master, degree, commensurate]","[yerevan office however applicants must, irex currently seeks, position reports directly, cc program manager, throughout armenia, community connections, program, position, cc, willing, travel, paid, necessary, intern, fill, based, completing general administrative tasks, includes coordinating logistics, cc filing system, cc program within, cc program, cc staff, program staff, vice versa, programmatic meetings, interested parties, helping create, visitors, update, translation, scheduling, presenting, planning, participants, office, needed]","[yerevan office however applicants must, irex currently seeks, position reports directly, cc program manager, throughout armenia, community connections, program, position, cc, willing, travel, paid, necessary, intern, fill, based, public speaking skills, excellent skills, work independently, good communication, written english, past english, armenian translation, armenian languages, translation experience, english, armenian, experience, team, spoken, remuneration, preferred, part, master, degree, commensurate]","[completing general administrative tasks, includes coordinating logistics, cc filing system, cc program within, cc program, cc staff, program staff, vice versa, programmatic meetings, interested parties, helping create, visitors, update, translation, scheduling, presenting, planning, participants, office, needed, public speaking skills, excellent skills, work independently, good communication, written english, past english, armenian translation, armenian languages, translation experience, english, armenian, experience, team, spoken, remuneration, preferred, part, master, degree, commensurate]","[yerevan office however applicants must, irex currently seeks, position reports directly, cc program manager, throughout armenia, community connections, program, position, cc, willing, travel, paid, necessary, intern, fill, based, completing general administrative tasks, includes coordinating logistics, cc filing system, cc program within, cc program, cc staff, program staff, vice versa, programmatic meetings, interested parties, helping create, visitors, update, translation, scheduling, presenting, planning, participants, office, needed, public speaking skills, excellent skills, work independently, good communication, written english, past english, armenian translation, armenian languages, translation experience, english, armenian, experience, team, spoken, remuneration, preferred, part, master, degree, commensurate]"
2,3,"[time position, primary contact, international organizations, environmental ngos, public outreach, public agencies, public, strengthening, serve, network, growing, full, cenn, businesses]","[provide environmental information, primary local contact, prepare cenn seminars, international organizations, country director, caucasus region, armenian offices, armenian ngos, cenn, workshops, working, strategy, serving, policy, participating, organize, helping, defining, businesses, armenia]","[environmentally related field, environmental issues specific, years relevant, written fluency, salary commensurate, working, russian, remuneration, plus, oral, knowledge, experience, english, degree, armenian, armenia]","[time position, primary contact, international organizations, environmental ngos, public outreach, public agencies, public, strengthening, serve, network, growing, full, cenn, businesses, provide environmental information, primary local contact, prepare cenn seminars, international organizations, country director, caucasus region, armenian offices, armenian ngos, cenn, workshops, working, strategy, serving, policy, participating, organize, helping, defining, businesses, armenia]","[time position, primary contact, international organizations, environmental ngos, public outreach, public agencies, public, strengthening, serve, network, growing, full, cenn, businesses, environmentally related field, environmental issues specific, years relevant, written fluency, salary commensurate, working, russian, remuneration, plus, oral, knowledge, experience, english, degree, armenian, armenia]","[provide environmental information, primary local contact, prepare cenn seminars, international organizations, country director, caucasus region, armenian offices, armenian ngos, cenn, workshops, working, strategy, serving, policy, participating, organize, helping, defining, businesses, armenia, environmentally related field, environmental issues specific, years relevant, written fluency, salary commensurate, working, russian, remuneration, plus, oral, knowledge, experience, english, degree, armenian, armenia]","[time position, primary contact, international organizations, environmental ngos, public outreach, public agencies, public, strengthening, serve, network, growing, full, cenn, businesses, provide environmental information, primary local contact, prepare cenn seminars, international organizations, country director, caucasus region, armenian offices, armenian ngos, cenn, workshops, working, strategy, serving, policy, participating, organize, helping, defining, businesses, armenia, environmentally related field, environmental issues specific, years relevant, written fluency, salary commensurate, working, russian, remuneration, plus, oral, knowledge, experience, english, degree, armenian, armenia]"


In [9]:
# Split phrases to word tokens
tqdm.pandas(desc="Job Skill Normalizing text")
skill_tokens_df = clean_skill_df[skill_columns].progress_applymap(
    lambda x: pre_process_text(x, spec_chars=spec_chars, stop_words=stop_words)
    .normalize()
)
skill_tokens_df.insert(0, 'job_id', skill_df['job_id'])
skill_tokens_df.head(3)

Job Skill Normalizing text: 100%|███████████████████████████████████████████████| 56976/56976 [02:33<00:00, 371.82it/s]


Unnamed: 0,job_id,job_description,job_requirement,job_qualification
0,1,"[ameria, investment, consult, company, provide, highly, responsible, complex, staff, assistance, chief, financial, officer, investment, management, financial, management, work, perform, technical, proficiency, position, manage, high, level, executive, director, administrative, skill, administrative, function, company, management, well, technically, supervisory, seek, require]","[liquidity, reserve, assist, investment, team, incorporation, regard, corporate, registration, include, evaluating, loss, exposure, internal, financial, management, report, investment, team, leader, management, team, participate, recommend, disciplinary, action, give, performance, evaluation, audit, financial, statement, annual, budgeting, process, supervise, financial, management, financial, management, activity, monitor, corporate, compliance, submitting, require, report, oversees, record, retention, international, accounting, standard, file, maintenance, activity, ensure, contract, compliance, monitoring, internal, control, investment, risk]","[provide, job, performance, feedback, operate, specialize, software, application, state, reporting, requirement, pertain, generally, accept, accounting, principle, account, technical, skill, couple, database, software, application, supervisory, track, record, essential, duty, satisfactorily, critically, analyze, fiscal, compatible, personal, computer, apply, sound, fiscal, local, accounting, standard, synthesize, financial, information, strong, financial, planning, comprehensive, financial, report, key, financial, function, include, word, processing, achieve, business, goal, financial, system, design, information, system, management]"
1,2,"[yerevan, office, however, applicant, must, irex, currently, seek, position, report, directly, cc, program, manager, throughout, armenia, community, connection, program, position, cc, willing, travel, pay, necessary, intern, fill, base]","[complete, general, administrative, task, include, coordinate, logistics, cc, file, system, cc, program, within, cc, program, cc, staff, program, staff, vice, versa, programmatic, meeting, interested, party, help, create, visitor, update, translation, schedule, present, planning, participant, office, need]","[public, speaking, skill, excellent, skill, work, independently, good, communication, write, english, past, english, armenian, translation, armenian, languages, translation, experience, english, armenian, experience, team, speak, remuneration, preferred, part, master, degree, commensurate]"
2,3,"[time, position, primary, contact, international, organization, environmental, ngos, public, outreach, public, agency, public, strengthen, serve, network, grow, full, cenn, business]","[provide, environmental, information, primary, local, contact, prepare, cenn, seminar, international, organization, country, director, caucasus, region, armenian, office, armenian, ngos, cenn, workshop, work, strategy, serve, policy, participate, organize, help, define, business, armenia]","[environmentally, related, field, environmental, issue, specific, year, relevant, write, fluency, salary, commensurate, work, russian, remuneration, plus, oral, knowledge, experience, english, degree, armenian, armenia]"


In [10]:
skill_tokens_df["combi1"] = (
    skill_tokens_df["job_description"] + skill_tokens_df["job_requirement"]
)
skill_tokens_df["combi2"] = (
    skill_tokens_df["job_description"] + skill_tokens_df["job_qualification"]
)
skill_tokens_df["combi3"] = (
    skill_tokens_df["job_requirement"] + skill_tokens_df["job_qualification"]
)
skill_tokens_df["skills"] = (
    skill_tokens_df["job_description"]
    + skill_tokens_df["job_requirement"]
    + skill_tokens_df["job_qualification"]
)
skill_tokens_df.head(3)

Unnamed: 0,job_id,job_description,job_requirement,job_qualification,combi1,combi2,combi3,skills
0,1,"[ameria, investment, consult, company, provide, highly, responsible, complex, staff, assistance, chief, financial, officer, investment, management, financial, management, work, perform, technical, proficiency, position, manage, high, level, executive, director, administrative, skill, administrative, function, company, management, well, technically, supervisory, seek, require]","[liquidity, reserve, assist, investment, team, incorporation, regard, corporate, registration, include, evaluating, loss, exposure, internal, financial, management, report, investment, team, leader, management, team, participate, recommend, disciplinary, action, give, performance, evaluation, audit, financial, statement, annual, budgeting, process, supervise, financial, management, financial, management, activity, monitor, corporate, compliance, submitting, require, report, oversees, record, retention, international, accounting, standard, file, maintenance, activity, ensure, contract, compliance, monitoring, internal, control, investment, risk]","[provide, job, performance, feedback, operate, specialize, software, application, state, reporting, requirement, pertain, generally, accept, accounting, principle, account, technical, skill, couple, database, software, application, supervisory, track, record, essential, duty, satisfactorily, critically, analyze, fiscal, compatible, personal, computer, apply, sound, fiscal, local, accounting, standard, synthesize, financial, information, strong, financial, planning, comprehensive, financial, report, key, financial, function, include, word, processing, achieve, business, goal, financial, system, design, information, system, management]","[ameria, investment, consult, company, provide, highly, responsible, complex, staff, assistance, chief, financial, officer, investment, management, financial, management, work, perform, technical, proficiency, position, manage, high, level, executive, director, administrative, skill, administrative, function, company, management, well, technically, supervisory, seek, require, liquidity, reserve, assist, investment, team, incorporation, regard, corporate, registration, include, evaluating, loss, exposure, internal, financial, management, report, investment, team, leader, management, team, participate, recommend, disciplinary, action, give, performance, evaluation, audit, financial, statement, annual, budgeting, process, supervise, financial, management, financial, management, activity, monitor, corporate, compliance, submitting, require, report, oversees, record, retention, international, accounting, standard, file, maintenance, activity, ensure, contract, compliance, monitoring, internal, control, ...]","[ameria, investment, consult, company, provide, highly, responsible, complex, staff, assistance, chief, financial, officer, investment, management, financial, management, work, perform, technical, proficiency, position, manage, high, level, executive, director, administrative, skill, administrative, function, company, management, well, technically, supervisory, seek, require, provide, job, performance, feedback, operate, specialize, software, application, state, reporting, requirement, pertain, generally, accept, accounting, principle, account, technical, skill, couple, database, software, application, supervisory, track, record, essential, duty, satisfactorily, critically, analyze, fiscal, compatible, personal, computer, apply, sound, fiscal, local, accounting, standard, synthesize, financial, information, strong, financial, planning, comprehensive, financial, report, key, financial, function, include, word, processing, achieve, business, goal, financial, system, design, ...]","[liquidity, reserve, assist, investment, team, incorporation, regard, corporate, registration, include, evaluating, loss, exposure, internal, financial, management, report, investment, team, leader, management, team, participate, recommend, disciplinary, action, give, performance, evaluation, audit, financial, statement, annual, budgeting, process, supervise, financial, management, financial, management, activity, monitor, corporate, compliance, submitting, require, report, oversees, record, retention, international, accounting, standard, file, maintenance, activity, ensure, contract, compliance, monitoring, internal, control, investment, risk, provide, job, performance, feedback, operate, specialize, software, application, state, reporting, requirement, pertain, generally, accept, accounting, principle, account, technical, skill, couple, database, software, application, supervisory, track, record, essential, duty, satisfactorily, critically, analyze, fiscal, compatible, personal, computer, apply, ...]","[ameria, investment, consult, company, provide, highly, responsible, complex, staff, assistance, chief, financial, officer, investment, management, financial, management, work, perform, technical, proficiency, position, manage, high, level, executive, director, administrative, skill, administrative, function, company, management, well, technically, supervisory, seek, require, liquidity, reserve, assist, investment, team, incorporation, regard, corporate, registration, include, evaluating, loss, exposure, internal, financial, management, report, investment, team, leader, management, team, participate, recommend, disciplinary, action, give, performance, evaluation, audit, financial, statement, annual, budgeting, process, supervise, financial, management, financial, management, activity, monitor, corporate, compliance, submitting, require, report, oversees, record, retention, international, accounting, standard, file, maintenance, activity, ensure, contract, compliance, monitoring, internal, control, ...]"
1,2,"[yerevan, office, however, applicant, must, irex, currently, seek, position, report, directly, cc, program, manager, throughout, armenia, community, connection, program, position, cc, willing, travel, pay, necessary, intern, fill, base]","[complete, general, administrative, task, include, coordinate, logistics, cc, file, system, cc, program, within, cc, program, cc, staff, program, staff, vice, versa, programmatic, meeting, interested, party, help, create, visitor, update, translation, schedule, present, planning, participant, office, need]","[public, speaking, skill, excellent, skill, work, independently, good, communication, write, english, past, english, armenian, translation, armenian, languages, translation, experience, english, armenian, experience, team, speak, remuneration, preferred, part, master, degree, commensurate]","[yerevan, office, however, applicant, must, irex, currently, seek, position, report, directly, cc, program, manager, throughout, armenia, community, connection, program, position, cc, willing, travel, pay, necessary, intern, fill, base, complete, general, administrative, task, include, coordinate, logistics, cc, file, system, cc, program, within, cc, program, cc, staff, program, staff, vice, versa, programmatic, meeting, interested, party, help, create, visitor, update, translation, schedule, present, planning, participant, office, need]","[yerevan, office, however, applicant, must, irex, currently, seek, position, report, directly, cc, program, manager, throughout, armenia, community, connection, program, position, cc, willing, travel, pay, necessary, intern, fill, base, public, speaking, skill, excellent, skill, work, independently, good, communication, write, english, past, english, armenian, translation, armenian, languages, translation, experience, english, armenian, experience, team, speak, remuneration, preferred, part, master, degree, commensurate]","[complete, general, administrative, task, include, coordinate, logistics, cc, file, system, cc, program, within, cc, program, cc, staff, program, staff, vice, versa, programmatic, meeting, interested, party, help, create, visitor, update, translation, schedule, present, planning, participant, office, need, public, speaking, skill, excellent, skill, work, independently, good, communication, write, english, past, english, armenian, translation, armenian, languages, translation, experience, english, armenian, experience, team, speak, remuneration, preferred, part, master, degree, commensurate]","[yerevan, office, however, applicant, must, irex, currently, seek, position, report, directly, cc, program, manager, throughout, armenia, community, connection, program, position, cc, willing, travel, pay, necessary, intern, fill, base, complete, general, administrative, task, include, coordinate, logistics, cc, file, system, cc, program, within, cc, program, cc, staff, program, staff, vice, versa, programmatic, meeting, interested, party, help, create, visitor, update, translation, schedule, present, planning, participant, office, need, public, speaking, skill, excellent, skill, work, independently, good, communication, write, english, past, english, armenian, translation, armenian, languages, translation, experience, english, armenian, experience, team, speak, remuneration, preferred, part, master, degree, commensurate]"
2,3,"[time, position, primary, contact, international, organization, environmental, ngos, public, outreach, public, agency, public, strengthen, serve, network, grow, full, cenn, business]","[provide, environmental, information, primary, local, contact, prepare, cenn, seminar, international, organization, country, director, caucasus, region, armenian, office, armenian, ngos, cenn, workshop, work, strategy, serve, policy, participate, organize, help, define, business, armenia]","[environmentally, related, field, environmental, issue, specific, year, relevant, write, fluency, salary, commensurate, work, russian, remuneration, plus, oral, knowledge, experience, english, degree, armenian, armenia]","[time, position, primary, contact, international, organization, environmental, ngos, public, outreach, public, agency, public, strengthen, serve, network, grow, full, cenn, business, provide, environmental, information, primary, local, contact, prepare, cenn, seminar, international, organization, country, director, caucasus, region, armenian, office, armenian, ngos, cenn, workshop, work, strategy, serve, policy, participate, organize, help, define, business, armenia]","[time, position, primary, contact, international, organization, environmental, ngos, public, outreach, public, agency, public, strengthen, serve, network, grow, full, cenn, business, environmentally, related, field, environmental, issue, specific, year, relevant, write, fluency, salary, commensurate, work, russian, remuneration, plus, oral, knowledge, experience, english, degree, armenian, armenia]","[provide, environmental, information, primary, local, contact, prepare, cenn, seminar, international, organization, country, director, caucasus, region, armenian, office, armenian, ngos, cenn, workshop, work, strategy, serve, policy, participate, organize, help, define, business, armenia, environmentally, related, field, environmental, issue, specific, year, relevant, write, fluency, salary, commensurate, work, russian, remuneration, plus, oral, knowledge, experience, english, degree, armenian, armenia]","[time, position, primary, contact, international, organization, environmental, ngos, public, outreach, public, agency, public, strengthen, serve, network, grow, full, cenn, business, provide, environmental, information, primary, local, contact, prepare, cenn, seminar, international, organization, country, director, caucasus, region, armenian, office, armenian, ngos, cenn, workshop, work, strategy, serve, policy, participate, organize, help, define, business, armenia, environmentally, related, field, environmental, issue, specific, year, relevant, write, fluency, salary, commensurate, work, russian, remuneration, plus, oral, knowledge, experience, english, degree, armenian, armenia]"


## Transform to skill-driven data

In [11]:
# skill_tokens_long_df = clean_skill_df[['job_id', 'skills']].explode('skills')
# skill_tokens_long_df

In [12]:
# skill_tokens_long_df.insert(0, 'skill_id', 0)

In [13]:
# skill_tokens_long_df["skill_id"] = skill_tokens_long_df.groupby("job_id")["job_id"].rank(
#     method="first", ascending=True
# )

In [14]:
# skill_tokens_long_df["skill_id"] = skill_tokens_long_df["skill_id"].apply(lambda x: int(x))

In [15]:
# skill_tokens_long_df.iloc[70:76,]

In [41]:
tmp1 = clean_skill_df[["job_id", "job_description"]].explode("job_description")

# skill_tokens_long_df = pd.merge(skill_tokens_long_df, tmp1, how="left", on=["job_id", "skills"])

tmp1["skill_id"] = tmp1.groupby("job_id")["job_id"].rank(
    method="first", ascending=True
)

In [42]:
tmp1.iloc[:25, ]

Unnamed: 0,job_id,job_description,skill_id
0,1,ameria investment consulting company,1.0
0,1,provides highly responsible,2.0
0,1,complex staff assistance,3.0
0,1,chief financial officer,4.0
0,1,investment management,5.0
0,1,financial management,6.0
0,1,work performed,7.0
0,1,technical proficiency,8.0
0,1,position manages,9.0
0,1,high level,10.0


In [43]:
tmp2 = clean_skill_df[["job_id", "job_requirement"]].explode("job_requirement")

tmp2["skill_id"] = tmp2.groupby("job_id")["job_id"].rank(
    method="first", ascending=True
)

In [44]:
tmp2.iloc[:25, ]

Unnamed: 0,job_id,job_requirement,skill_id
0,1,liquidity reserves assists investment team,1.0
0,1,incorporation regarding corporate registration,2.0
0,1,including evaluating loss exposure,3.0
0,1,internal financial management reports,4.0
0,1,investment team leader,5.0
0,1,management team participating,6.0
0,1,recommending disciplinary action,7.0
0,1,giving performance evaluations,8.0
0,1,audited financial statements,9.0
0,1,annual budgeting process,10.0


In [45]:
skill_tokens_long_df = pd.merge(tmp1, tmp2, how="outer", on=["job_id", "skill_id"])

In [46]:
skill_tokens_long_df.iloc[:25, ]

Unnamed: 0,job_id,job_description,skill_id,job_requirement
0,1,ameria investment consulting company,1.0,liquidity reserves assists investment team
1,1,provides highly responsible,2.0,incorporation regarding corporate registration
2,1,complex staff assistance,3.0,including evaluating loss exposure
3,1,chief financial officer,4.0,internal financial management reports
4,1,investment management,5.0,investment team leader
5,1,financial management,6.0,management team participating
6,1,work performed,7.0,recommending disciplinary action
7,1,technical proficiency,8.0,giving performance evaluations
8,1,position manages,9.0,audited financial statements
9,1,high level,10.0,annual budgeting process


In [47]:
skill_tokens_long_df.iloc[70:76, ]

Unnamed: 0,job_id,job_description,skill_id,job_requirement
70,5,unprovided,1.0,sql servers maintenance activities
71,6,sell menswear,1.0,unprovided
72,6,saleswoman,2.0,
73,6,accessories,3.0,
74,7,institute assistance foundation,1.0,unprovided
75,7,armenian branch office,2.0,


In [48]:
tmp3 = clean_skill_df[["job_id", "job_qualification"]].explode("job_qualification")

tmp3["skill_id"] = tmp3.groupby("job_id")["job_id"].rank(method="first", ascending=True)

In [50]:
tmp3.iloc[:25, ]

Unnamed: 0,job_id,job_qualification,skill_id
0,1,providing job performance feedback,1.0
0,1,operate specialized software applications,2.0
0,1,state reporting requirements pertaining,3.0
0,1,generally accepted accounting principles,4.0
0,1,accounting technical skills coupled,5.0
0,1,database software applications,6.0
0,1,supervisory track record,7.0
0,1,essential duty satisfactorily,8.0
0,1,critically analyze fiscal,9.0
0,1,compatible personal computer,10.0


In [51]:
skill_tokens_long_df = pd.merge(
    skill_tokens_long_df, tmp3, how="outer", on=["job_id", "skill_id"]
)

In [52]:
skill_tokens_long_df.iloc[:25, ]

Unnamed: 0,job_id,job_description,skill_id,job_requirement,job_qualification
0,1,ameria investment consulting company,1.0,liquidity reserves assists investment team,providing job performance feedback
1,1,provides highly responsible,2.0,incorporation regarding corporate registration,operate specialized software applications
2,1,complex staff assistance,3.0,including evaluating loss exposure,state reporting requirements pertaining
3,1,chief financial officer,4.0,internal financial management reports,generally accepted accounting principles
4,1,investment management,5.0,investment team leader,accounting technical skills coupled
5,1,financial management,6.0,management team participating,database software applications
6,1,work performed,7.0,recommending disciplinary action,supervisory track record
7,1,technical proficiency,8.0,giving performance evaluations,essential duty satisfactorily
8,1,position manages,9.0,audited financial statements,critically analyze fiscal
9,1,high level,10.0,annual budgeting process,compatible personal computer


In [54]:
skill_tokens_long_df.iloc[70:85, ]

Unnamed: 0,job_id,job_description,skill_id,job_requirement,job_qualification
70,5,unprovided,1.0,sql servers maintenance activities,database software development
71,6,sell menswear,1.0,unprovided,excellent communication skills
72,6,saleswoman,2.0,,years old
73,6,accessories,3.0,,starting salary
74,7,institute assistance foundation,1.0,unprovided,one year minimum experience
75,7,armenian branch office,2.0,,strong organizational skills
76,7,seeking applications,3.0,,quick learning skills
77,7,open society,4.0,,handle confidential issues
78,7,chief accountant,5.0,,armenian taxation laws
79,7,administrative expenses,6.0,,university degree


In [31]:
# skill_tokens_long_df['type'] = np.nan
# skill_tokens_long_df.loc[skill_tokens_long_df['job_description'].notnull(), ['type']] = 'job_description'
# skill_tokens_long_df

In [32]:
# skill_tokens_long_df.loc[
#     skill_tokens_long_df["job_requirement"].notnull()
#     & pd.isnull(skill_tokens_long_df["type"]),
#     ["type"]
# ] = "job_requirement"

In [56]:
# skill_tokens_long_df.iloc[:25, ]

In [34]:
# skill_tokens_long_df.loc[
#     skill_tokens_long_df["job_qualification"].notnull() & pd.isnull(skill_tokens_long_df["type"]),
#     ["type"],
# ] = "job_qualification"

In [35]:
# skill_tokens_long_df.iloc[4:20, ]

In [36]:
# skill_tokens_long_df.iloc[70:76, ]

In [67]:
token_col_list = [
    "skill_description_tokens",
    "skill_requirement_tokens",
    "skill_qualification_tokens",
]
text_col_list = ["job_description", "job_requirement", "job_qualification"]
# tmp_long_df = skill_tokens_long_df.copy(deep=True)
tqdm_bar = tqdm(desc='Tokenizing skills for each column', total=len(token_col_list))
for idx, i in enumerate(token_col_list):
    j = text_col_list[idx]
    check_for_nan = skill_tokens_long_df[j].isnull()
    skill_tokens_long_df[i] = skill_tokens_long_df[j].apply(
        lambda x: nltk.word_tokenize(x)
        if pd.notnull(x)
        else nltk.word_tokenize("unprovided")
    )
    tqdm_bar.update(1)
tqdm_bar.close()

Tokenizing skills for each column: 100%|█████████████████████████████████████████████████| 3/3 [01:07<00:00, 22.38s/it]


In [68]:
skill_tokens_long_df.iloc[70:76, ]

Unnamed: 0,job_id,job_description,skill_id,job_requirement,job_qualification,skill_description_tokens,skill_requirement_tokens,skill_qualification_tokens
70,5,unprovided,1.0,sql servers maintenance activities,database software development,[unprovided],"[sql, servers, maintenance, activities]","[database, software, development]"
71,6,sell menswear,1.0,unprovided,excellent communication skills,"[sell, menswear]",[unprovided],"[excellent, communication, skills]"
72,6,saleswoman,2.0,,years old,[saleswoman],[unprovided],"[years, old]"
73,6,accessories,3.0,,starting salary,[accessories],[unprovided],"[starting, salary]"
74,7,institute assistance foundation,1.0,unprovided,one year minimum experience,"[institute, assistance, foundation]",[unprovided],"[one, year, minimum, experience]"
75,7,armenian branch office,2.0,,strong organizational skills,"[armenian, branch, office]",[unprovided],"[strong, organizational, skills]"


In [69]:
skill_tokens_long_df = skill_tokens_long_df[
    [
        "skill_id",
        "job_id",
        "job_description",
        "skill_description_tokens",
        "job_requirement",
        "skill_requirement_tokens",
        "job_qualification",
        "skill_qualification_tokens",
    ]
]

In [75]:
tqdm.pandas(desc="Job Skill Converting ID to integer")
skill_tokens_long_df["skill_id"] = skill_tokens_long_df["skill_id"].progress_apply(
    lambda x: int(x)
)

Job Skill Converting ID to integer: 100%|█████████████████████████████████| 361584/361584 [00:00<00:00, 1089267.95it/s]


In [76]:
skill_tokens_long_df.iloc[:25, ]

Unnamed: 0,skill_id,job_id,job_description,skill_description_tokens,job_requirement,skill_requirement_tokens,job_qualification,skill_qualification_tokens
0,1,1,ameria investment consulting company,"[ameria, investment, consulting, company]",liquidity reserves assists investment team,"[liquidity, reserves, assists, investment, team]",providing job performance feedback,"[providing, job, performance, feedback]"
1,2,1,provides highly responsible,"[provides, highly, responsible]",incorporation regarding corporate registration,"[incorporation, regarding, corporate, registration]",operate specialized software applications,"[operate, specialized, software, applications]"
2,3,1,complex staff assistance,"[complex, staff, assistance]",including evaluating loss exposure,"[including, evaluating, loss, exposure]",state reporting requirements pertaining,"[state, reporting, requirements, pertaining]"
3,4,1,chief financial officer,"[chief, financial, officer]",internal financial management reports,"[internal, financial, management, reports]",generally accepted accounting principles,"[generally, accepted, accounting, principles]"
4,5,1,investment management,"[investment, management]",investment team leader,"[investment, team, leader]",accounting technical skills coupled,"[accounting, technical, skills, coupled]"
5,6,1,financial management,"[financial, management]",management team participating,"[management, team, participating]",database software applications,"[database, software, applications]"
6,7,1,work performed,"[work, performed]",recommending disciplinary action,"[recommending, disciplinary, action]",supervisory track record,"[supervisory, track, record]"
7,8,1,technical proficiency,"[technical, proficiency]",giving performance evaluations,"[giving, performance, evaluations]",essential duty satisfactorily,"[essential, duty, satisfactorily]"
8,9,1,position manages,"[position, manages]",audited financial statements,"[audited, financial, statements]",critically analyze fiscal,"[critically, analyze, fiscal]"
9,10,1,high level,"[high, level]",annual budgeting process,"[annual, budgeting, process]",compatible personal computer,"[compatible, personal, computer]"


In [77]:
skill_tokens_long_df.shape

(361584, 8)

## Save pre-processed text to csv file

In [70]:
# Save cleaned job skills in file
clean_skill_df_outfile = OUTPUT_DIR + "clean_skill_df.csv"
clean_skill_df.to_csv(clean_skill_df_outfile, index=False)

In [71]:
# Save cleaned job skill tokens in file
skill_tokens_df_outfile = OUTPUT_DIR + "skill_tokens_df.csv"
skill_tokens_df.to_csv(skill_tokens_df_outfile, index=False)

In [78]:
# Save cleaned job skill tokens in file
skill_tokens_long_df_outfile = OUTPUT_DIR + "skill_tokens_long_df.csv"
skill_tokens_long_df.to_csv(skill_tokens_long_df_outfile, index=False)