## Load libraries

In [1]:
# Libraries to work with dataset
import pandas as pd

# Libraries to preprocess text
import nltk
# from nltk.corpus import stopwords
import re
import string
# from nltk.stem import WordNetLemmatizer
# from nltk.corpus import wordnet
# from collections import Counter
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Libraries to visualize data
from tqdm import tqdm     # displaying progress bar while running computation

# Download necessary resources
# nltk.download('punkt')
# nltk.download('stopwords')

## Configure and declare global variables

In [2]:
import os

# Root folder of the thesis project. It defaults to the original local path,
# but can be overridden with the THESIS_BASE_DIR environment variable so the
# notebook can run on another machine without editing this cell.
BASE_DIR = (
    os.environ.get("THESIS_BASE_DIR")
    or "E:/THIENDHB_GOOGLEDRIVE/MASTER TILBURG/THESIS/"
)
# The directories below are built by plain string concatenation, so make sure
# the root ends with exactly one "/".
BASE_DIR = BASE_DIR.rstrip("/\\") + "/"

# Folder the raw/refined input data is read from.
INPUT_DIR = BASE_DIR + "DATASET/INPUT/"
# Folder the pre-processed results are written to.
OUTPUT_DIR = BASE_DIR + "DATASET/OUTPUT/"

## Import data

In [3]:
# Show full cell contents so long job titles are not truncated when displayed.
pd.set_option("display.max_colwidth", None)

# Only the posting identifier and its raw title are needed in this notebook.
jobpost_csv = INPUT_DIR + "refined_jobpost_data.csv"
title_df = pd.read_csv(jobpost_csv, usecols=["job_id", "job_title"])

## Explore data

In [4]:
# (number of rows, number of columns) of the loaded job-title table
title_df.shape

(18992, 2)

In [5]:
# Preview the first 10 rows to see what the raw titles look like
title_df.head(10)

Unnamed: 0,job_id,job_title
0,1,Chief Financial Officer
1,2,Full-time Community Connections Intern (paid internship)
2,3,Country Coordinator
3,4,BCC Specialist
4,5,Software Developer
5,6,Saleswoman
6,7,Chief Accountant/ Finance Assistant
7,8,Non-paid part or full time Programmatic Intern
8,9,Assistant to Managing Director
9,10,"Program Assistant (INL), FSN-8; FP-6*"


In [6]:
# Column names of the loaded table, as a NumPy array
title_df.columns.values

array(['job_id', 'job_title'], dtype=object)

## Text preprocessing

In [15]:
# def nltk_tag_to_wordnet_tag(nltk_tag):
#     """Helper function to convert nltk POS tags to wordnet POS tags"""
#     if nltk_tag.startswith("J"):
#         return wordnet.ADJ
#     elif nltk_tag.startswith("V"):
#         return wordnet.VERB
#     elif nltk_tag.startswith("N"):
#         return wordnet.NOUN
#     elif nltk_tag.startswith("R"):
#         return wordnet.ADV
#     else:
#         return None


# def lemmatize_token(token):
#     lemmatizer = WordNetLemmatizer()
#     nltk_tagged = nltk.pos_tag(token)
#     wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
#     lemmatized_sentence = []
#     for word, tag in wordnet_tagged:
#         if tag is None:
#             # if no tag found then use as it is
#             lemmatized_sentence.append(word)
#         else:
#             # else use the tag to lemmatize the token
#             lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))

#     return " ".join(lemmatized_sentence)


class pre_process_text:
    """Chainable normalizer for one piece of raw text.

    Every step method updates ``self.text`` and returns ``self`` so that steps
    can be chained; ``normalize()`` runs the whole pipeline in this order:

    - converting the input to a string (missing values become "")
    - lowercasing
    - removing URLs
    - replacing every entry of ``spec_chars`` with a space
    - tokenizing (``nltk.word_tokenize``) and dropping tokens shorter than 2
    - lemmatizing and removing stop words (spaCy pipeline ``pre_model``)

    Inputs:
    - text: raw text to normalize
    - spec_chars: iterable of characters/substrings to strip out
    - stop_words: collection of extra stop words to drop while lemmatizing,
      on top of the tokens spaCy already flags as stop words; may be None

    Note: ``lemmatize()`` (and therefore ``normalize()``) looks up a
    module-level spaCy pipeline named ``pre_model`` at call time, so that
    pipeline must be loaded before either method is called.

    Return (from ``normalize()``): the normalized text as one space-joined string
    """

    def __init__(self, text, spec_chars, stop_words):
        self.text = text
        self.spec_chars = spec_chars
        self.stop_words = stop_words

    def convert2string(self):
        """Coerce ``self.text`` to ``str``; None and float NaN become ""."""
        value = self.text
        # pandas represents empty cells as float NaN; str() would turn that
        # (or None) into the literal words "nan"/"None", which would then
        # survive the pipeline as a bogus token.
        is_missing = value is None or (isinstance(value, float) and value != value)
        self.text = "" if is_missing else str(value)
        return self

    def lowercase(self):
        """Lowercase ``self.text`` (expects a string)."""
        self.text = self.text.lower()
        return self

    def remove_url(self):
        """Delete every whitespace-free run starting with "http" or "www"."""
        self.text = re.sub(r"http\S+", "", self.text)
        self.text = re.sub(r"www\S+", "", self.text)
        return self

    def remove_spec_chars(self):
        """Replace each entry of ``spec_chars`` with a space, then collapse
        runs of two or more spaces into a single space."""
        # A replace loop (rather than str.translate) keeps support for
        # multi-character entries when spec_chars is a list of strings.
        for char in self.spec_chars:
            self.text = self.text.replace(char, " ")
        self.text = re.sub("[ ]{2,}", " ", self.text)
        return self

    def get_tokens(self):
        """Tokenize ``self.text`` with nltk; ``self.text`` becomes a list."""
        self.text = nltk.word_tokenize(self.text)
        return self

    def remove_shortwords(self):
        """Drop tokens shorter than 2 characters and join the remaining
        tokens back into one space-separated string."""
        kept = [word for word in self.text if len(word) >= 2]
        self.text = " ".join(kept)
        return self

    def lemmatize(self):
        """Lemmatize ``self.text`` with the spaCy pipeline ``pre_model`` and
        remove stop words; the result is a space-joined string of lemmas."""
        # Honour the caller-supplied stop words in addition to spaCy's own
        # is_stop flag; tolerate stop_words being None or empty.
        extra_stops = self.stop_words or ()
        lemmas = [
            token.lemma_
            for token in pre_model(self.text)
            if not token.is_stop and token.text not in extra_stops
        ]
        self.text = " ".join(lemmas)
        return self

    def normalize(self):
        """Run the full pipeline and return the normalized string."""
        return (
            self.convert2string()
            .lowercase()
            .remove_url()
            .remove_spec_chars()
            .get_tokens()
            .remove_shortwords()
            .lemmatize()
            .text
        )

In [8]:
# Load spaCy's pre-trained English pipeline "en_core_web_lg"; it is the
# `pre_model` object that pre_process_text.lemmatize() calls.
pre_model = spacy.load("en_core_web_lg")
# Components currently in the pipeline
pre_model.pipe_names

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

In [9]:
# Remove the `parser` and `ner` components if they are present, then show
# which components remain in the pipeline.
for component in ("parser", "ner"):
    if component in pre_model.pipe_names:
        pre_model.remove_pipe(component)
pre_model.pipe_names

['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer']

In [10]:
# Characters replaced by spaces during cleaning: ASCII punctuation, digits
# and line-break characters.
spec_chars = "".join([string.punctuation, string.digits, "\n\r"])
# Stop-word set built from spaCy's English STOP_WORDS list.
stop_words = {*STOP_WORDS}

In [16]:
# Normalize every job title (lowercase, strip URLs/special characters,
# drop short tokens, lemmatize, remove stop words) with a progress bar.
# Series.progress_apply is used instead of DataFrame.progress_applymap because
# the latter relies on DataFrame.applymap, which is deprecated in pandas >= 2.1.
tqdm.pandas(desc="Job Title Text Preprocessing")
clean_title_df = pd.DataFrame(
    {
        # keep the identifier so the cleaned titles can be joined back later
        "job_id": title_df["job_id"],
        "job_title_lemma": title_df["job_title"].progress_apply(
            lambda title: pre_process_text(
                title, spec_chars=spec_chars, stop_words=stop_words
            ).normalize()
        ),
    }
)
clean_title_df

Job Title Text Preprocessing: 100%|█████████████████████████████████████████████| 18992/18992 [00:43<00:00, 441.07it/s]


Unnamed: 0,job_id,job_title_lemma
0,1,chief financial officer
1,2,time community connection intern pay internship
2,3,country coordinator
3,4,bcc specialist
4,5,software developer
...,...,...
18987,18997,senior creative ux ui designer
18988,18998,category development manager
18989,18999,operational marketing manager
18990,19000,head online sale department


In [17]:
# Split each lemmatized title into a list of word tokens.
# Series.progress_apply is used instead of DataFrame.progress_applymap because
# the latter relies on DataFrame.applymap, which is deprecated in pandas >= 2.1.
tqdm.pandas(desc="Job Title word Tokenizing")
clean_title_df["job_title_token"] = clean_title_df["job_title_lemma"].progress_apply(
    lambda lemma_text: pre_process_text(
        lemma_text, spec_chars=spec_chars, stop_words=stop_words
    ).get_tokens().text
)
clean_title_df

Job Title word Tokenizing: 100%|██████████████████████████████████████████████| 18992/18992 [00:01<00:00, 12758.52it/s]


Unnamed: 0,job_id,job_title_lemma,job_title_token
0,1,chief financial officer,"[chief, financial, officer]"
1,2,time community connection intern pay internship,"[time, community, connection, intern, pay, internship]"
2,3,country coordinator,"[country, coordinator]"
3,4,bcc specialist,"[bcc, specialist]"
4,5,software developer,"[software, developer]"
...,...,...,...
18987,18997,senior creative ux ui designer,"[senior, creative, ux, ui, designer]"
18988,18998,category development manager,"[category, development, manager]"
18989,18999,operational marketing manager,"[operational, marketing, manager]"
18990,19000,head online sale department,"[head, online, sale, department]"


In [18]:
# Merge columns with original job_title for later use of visualization.
# Any `job_title` column already present in clean_title_df (i.e. this cell was
# run before) is dropped first, so re-running the cell gives the same frame
# instead of producing duplicated job_title_x / job_title_y columns.
clean_title_df = title_df.merge(
    clean_title_df.drop(columns=["job_title"], errors="ignore"),
    how="inner",
    on="job_id",
)
clean_title_df

Unnamed: 0,job_id,job_title,job_title_lemma,job_title_token
0,1,Chief Financial Officer,chief financial officer,"[chief, financial, officer]"
1,2,Full-time Community Connections Intern (paid internship),time community connection intern pay internship,"[time, community, connection, intern, pay, internship]"
2,3,Country Coordinator,country coordinator,"[country, coordinator]"
3,4,BCC Specialist,bcc specialist,"[bcc, specialist]"
4,5,Software Developer,software developer,"[software, developer]"
...,...,...,...,...
18987,18997,Senior Creative UX/ UI Designer,senior creative ux ui designer,"[senior, creative, ux, ui, designer]"
18988,18998,Category Development Manager,category development manager,"[category, development, manager]"
18989,18999,Operational Marketing Manager,operational marketing manager,"[operational, marketing, manager]"
18990,19000,Head of Online Sales Department,head online sale department,"[head, online, sale, department]"


In [85]:
# def remove_rare_words(text, rare_words):
#     return " ".join([word for word in text.split() if word not in rare_words])

In [86]:
# # Remove rare words
# title_vocab = [
#     item
#     for sublist in clean_title_df["job_title"].values.tolist()
#     for item in nltk.word_tokenize(sublist)
# ]
# rare_words = [k for (k, v) in Counter(title_vocab).items() if v <= 1]

# print(len(rare_words))
# print(rare_words)

In [87]:
# tqdm.pandas(desc="Job Title Removing rare words")
# clean_title_df = clean_title_df.progress_applymap(
#     lambda x: remove_rare_words(x, rare_words=rare_words)
# )
# clean_title_df

## Save pre-processed text to csv file

In [19]:
# Write the cleaned titles to the output folder, without the row index.
clean_title_csv = OUTPUT_DIR + "clean_title_df.csv"
clean_title_df.to_csv(clean_title_csv, index=False)