## Load libraries

In [77]:
# Standard library
import os
import re
import string

# Libraries to work with dataset
import pandas as pd

# Libraries to preprocess text
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
# from collections import Counter

# Libraries to visualize data
from tqdm import tqdm     # displaying progress bar while running computation

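# Download necessary resources (uncomment on the first run).
# Note: lemmatize_token additionally needs the NLTK WordNet corpus and POS-tagger model.
# nltk.download('punkt')
# nltk.download('stopwords')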

## Configure and declare global variables

In [78]:
# Root folder of the thesis project. Set the THESIS_BASE_DIR environment variable to
# run the notebook on another machine; when it is unset (or empty) the original local
# path is used, so existing runs behave exactly as before.
DEFAULT_BASE_DIR = "E:/THIENDHB_GOOGLEDRIVE/MASTER TILBURG/THESIS/"
# Normalise to exactly one trailing slash so the string concatenations below stay valid
# whether or not the override ends with a separator.
BASE_DIR = (os.environ.get("THESIS_BASE_DIR") or DEFAULT_BASE_DIR).rstrip("/\\") + "/"
INPUT_DIR = BASE_DIR + "DATASET/INPUT/"
OUTPUT_DIR = BASE_DIR + "DATASET/OUTPUT/"

## Import data

In [79]:
# Show full cell contents when frames are displayed (no column-width truncation)
pd.set_option("display.max_colwidth", None)

# Load only the job_title column of the refined job-post dataset
title_df = pd.read_csv(
    INPUT_DIR + "refined_jobpost_data.csv",
    usecols=["job_title"],
)

## Explore data

In [80]:
# Sanity-check the (rows, columns) size of the loaded frame
title_df.shape

(18992, 1)

In [81]:
# Preview the first 10 raw job titles before any cleaning
title_df.head(10)

Unnamed: 0,job_title
0,Chief Financial Officer
1,Full-time Community Connections Intern (paid internship)
2,Country Coordinator
3,BCC Specialist
4,Software Developer
5,Saleswoman
6,Chief Accountant/ Finance Assistant
7,Non-paid part or full time Programmatic Intern
8,Assistant to Managing Director
9,"Program Assistant (INL), FSN-8; FP-6*"


In [82]:
# Confirm which columns were loaded (only job_title is requested via usecols)
title_df.columns.values

array(['job_title'], dtype=object)

## Text preprocessing

In [83]:
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Map an nltk POS tag onto the corresponding wordnet POS constant.

    The decision is made on the tag's leading letter: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag has no wordnet counterpart and yields None.
    """
    if nltk_tag.startswith("J"):
        return wordnet.ADJ
    if nltk_tag.startswith("V"):
        return wordnet.VERB
    if nltk_tag.startswith("N"):
        return wordnet.NOUN
    if nltk_tag.startswith("R"):
        return wordnet.ADV
    # No wordnet equivalent for this tag
    return None


def lemmatize_token(token):
    """POS-tag the given tokens, lemmatize each one, and join them with single spaces.

    Inputs:
    - token: passed straight to nltk.pos_tag; the caller in this file passes a list
      of word tokens

    Return: one string containing the lemmatized words separated by spaces
    """
    wnl = WordNetLemmatizer()
    lemmas = []
    for word, pos in nltk.pos_tag(token):
        wn_pos = nltk_tag_to_wordnet_tag(pos)
        # Words whose tag has no wordnet counterpart are kept unchanged
        lemmas.append(word if wn_pos is None else wnl.lemmatize(word, wn_pos))
    return " ".join(lemmas)


class pre_process_text:
    """Chainable normaliser for one piece of raw text.

    normalize() runs the whole pipeline in this order:
    - Converting the input to a string (missing values become "")
    - Lowercasing
    - Removing URLs
    - Removing special characters and numbers
    - Tokenizing
    - Removing stop words
    - Lemmatizing

    Rare-word removal is NOT performed by this class.

    Inputs:
    - text: raw text to normalize
    - spec_chars: special characters to remove (a string of characters or a list of strings)
    - stop_words: set of stopwords to remove

    Every step mutates self.text and returns self, so steps can be chained.
    self.text is a string up to get_tokens(), a list of tokens until lemmatize(),
    and a string again afterwards.

    Return (from normalize): normalized text
    """

    def __init__(self, text, spec_chars, stop_words):
        self.text = text
        self.spec_chars = spec_chars
        self.stop_words = stop_words

    def convert2string(self):
        # Missing values (None, or float NaN such as pandas produces for empty cells)
        # would otherwise be stringified into the literal words "None" / "nan" and
        # survive the pipeline as if they were real text.
        is_nan = isinstance(self.text, float) and self.text != self.text
        if self.text is None or is_nan:
            self.text = ""
        else:
            self.text = str(self.text)
        return self

    def lowercase(self):
        # lowercase the text (expects convert2string to have run first)
        self.text = self.text.lower()
        return self

    def remove_url(self):
        # remove URLs: anything from "http" or "www" up to the next whitespace
        self.text = re.sub(r"http\S+", "", self.text)
        self.text = re.sub(r"www\S+", "", self.text)
        return self

    def remove_spec_chars(self):
        # replace every special character / digit with a space, then collapse
        # runs of spaces into a single space
        for char in self.spec_chars:
            self.text = self.text.replace(char, " ")
        self.text = re.sub("[ ]{2,}", " ", self.text)
        return self

    def replace_spec_chars(self):
        # replace every special character / digit with "|", then collapse
        # runs of "|" into a single "|"
        for char in self.spec_chars:
            self.text = self.text.replace(char, "|")
        self.text = re.sub("[|]{2,}", "|", self.text)
        return self

    def get_tokens(self):
        # tokenize: self.text becomes a list of tokens
        self.text = nltk.word_tokenize(self.text)
        return self

    def remove_stopwords(self):
        # drop tokens that are in the stop-word set (expects a token list)
        self.text = [word for word in self.text if word not in self.stop_words]
        return self

    def lemmatize(self):
        # lemmatize the token list; self.text becomes a space-joined string again
        self.text = lemmatize_token(self.text)
        return self

    def normalize(self):
        # Each step returns self, so the pipeline is a plain method chain.
        # NOTE(review): replace_spec_chars leaves "|" markers behind that are only
        # cleaned up by remove_spec_chars when "|" is itself part of spec_chars
        # (it is in string.punctuation). Step order is kept as in the original.
        return (
            self.convert2string()
            .lowercase()
            .remove_url()
            .replace_spec_chars()
            .remove_spec_chars()
            .get_tokens()
            .remove_stopwords()
            .lemmatize()
            .text
        )

In [84]:
# Characters to strip (punctuation, digits, line breaks) and English stop words to drop
spec_chars = "".join((string.punctuation, string.digits, "\n\r"))
stop_words = set(stopwords.words("english"))

# Run every cell of the frame through the full normalisation pipeline, with a progress bar.
# NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 in favour of
# DataFrame.map; kept here to preserve behaviour with the installed tqdm/pandas.
tqdm.pandas(desc="Job Title Text Preprocessing")
clean_title_df = title_df.progress_applymap(
    lambda raw_title: pre_process_text(raw_title, spec_chars, stop_words).normalize()
)
clean_title_df

Job Title Text Preprocessing: 100%|████████████████████████████████████████████| 18992/18992 [00:13<00:00, 1435.65it/s]


Unnamed: 0,job_title
0,chief financial officer
1,full time community connection intern pay internship
2,country coordinator
3,bcc specialist
4,software developer
...,...
18987,senior creative ux ui designer
18988,category development manager
18989,operational marketing manager
18990,head online sale department


In [85]:
# def remove_rare_words(text, rare_words):
#     return " ".join([word for word in text.split() if word not in rare_words])

In [86]:
# # Remove rare words
# title_vocab = [
#     item
#     for sublist in clean_title_df["job_title"].values.tolist()
#     for item in nltk.word_tokenize(sublist)
# ]
# rare_words = [k for (k, v) in Counter(title_vocab).items() if v <= 1]

# print(len(rare_words))
# print(rare_words)

In [87]:
# tqdm.pandas(desc="Job Title Removing rare words")
# clean_title_df = clean_title_df.progress_applymap(
#     lambda x: remove_rare_words(x, rare_words=rare_words)
# )
# clean_title_df

## Save pre-processed text to csv file

In [88]:
# Persist the cleaned titles to the output folder, leaving out the index column
clean_title_path = OUTPUT_DIR + "clean_title_df.csv"
clean_title_df.to_csv(clean_title_path, index=False)