In [None]:
import re
import json
import unicodedata
import html
import textdistance
stanza.download('en') # download English model
nlp = stanza.Pipeline('en') # initialize English neural pipeline

In [None]:
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
nltk.download('stopwords')
nltk.download('punkt')

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
stop_words = set(stopwords.words('english'))

<h1>Job Posting modell</h1>

In [None]:
class JobPosting:
    def __init__(self, **kwargs):
        if 'json' in kwargs:
            self.__dict__ = json.loads(kwargs.get("json"))
            self.processedDescription = []
            self.title_prefixes = []
            self.title_suffixes = []
            self.title_middle_parts = []
            self.title_occupations = []
        else:
            self.id = kwargs.get("_id")
            self.title = kwargs.get("title")
            self.date = kwargs.get("date")
            self.description = kwargs.get("description")
            self.ner_sentences = []
            self.ner_tokens = []
            self.upos_sentences = []
            self.xpos_sentences = []
            self.lemmaDescSentences = []
            self.originalDescSentences = []
            self.processedDescription = []
            self.title_prefixes = []
            self.title_suffixes = []
            self.title_middle_parts = []
            self.title_occupations = []
        
    def __eq__(self, other):
        return self.id == other.id
    
    def __hash__(self):
        return hash(self.id)
        
    def append_ner_sentences(self, sentences):
        self.ner_sentences.append(sentences)

    def append_ner_tokens(self, sentences):
        self.ner_tokens.append(sentences)

    def append_upos_sentences(self, sentences):
        self.upos_sentences.append(sentences)

    def append_xpos_sentences(self, sentences):
        self.xpos_sentences.append(sentences)
        
    def appendLemmaDescSentence(self, sentence):
        self.lemmaDescSentences.append(sentence)
        
    def appendOriginalDescSentence(self, sentence):
        self.originalDescSentences.append(sentence)
    
    def appendProcessedDescription(self, sentence):
        self.processedDescription.append(sentence)

<h3>Restore job postings from file</h3>

In [None]:
#using a loop to avoid loading the hole file into memory
import os

postings = set()

with open(f'cleaning_descriptions/identified_postings_1910.jl') as file_in:    
    for line in file_in:
        postings.add(JobPosting(json = line))

len(postings)

<h1>Skill modell and data fetching from file</h1>

In [27]:
class Skill:
    def __init__(self, label):
        self.label = label
        self.nlpLabel = ''
        self.lemmaLabel = ''
        
    def setNlpLabel(self, nlpLabel):
        self.nlpLabel = nlpLabel
        
    def setLemmaLabel(self, lemmaLabel):
        self.lemmaLabel = lemmaLabel

skills = set()
        
with open('cleaning_descriptions/all_ict_skill_labels.csv') as file:
    lines = file.readlines()
    
for skill in [skill.strip() for skill in lines]:
    skills.add(Skill(skill))
    
for skill in skills:
    nlpResult = nlp(skill.label)
    skill.setNlpLabel(nlpResult)
    skill.setLemmaLabel(' '.join([word.lemma for word in nlpResult.sentences[0].words]))   


<h1>Text preprocessing and similarity calculation</h1>

In [28]:
#this and that are list objects
#0 <= acceptance <=1 
#the higher ts is the more similar thisElem and thatElem
def text_similarity(similarityObject, this, that, acceptance):
    results = []
    for thisElem in this:
        for thatElem in that:
            ts = similarityObject.similarity(thisElem, thatElem)
            nts = similarityObject.normalized_similarity(thisElem, thatElem)
            if(nts > acceptance):
                results.append((thisElem, thatElem, nts, ts))
    return results

def text_similarities(similarityObjects, acceptanceBaseObject, this, that, acceptance):
    results = []
    for thisElem in this:
        for thatElem in that:
            tss = []
            ntss = []
            
            for i, similarityObject in enumerate(similarityObjects):
                nts = similarityObject.normalized_similarity(thisElem, thatElem)
                if(i == acceptanceBaseObject and nts <= acceptance):
                    break
                tss.append(similarityObject.similarity(thisElem, thatElem))
                ntss.append(nts)
            
            if(ntss):
                results.append((thisElem, thatElem, ntss, tss))
    return results

def text_normalized_similarities(similarityObjects, acceptanceBaseObject, this, that, acceptance):
    results = []
    for thisElem in this:
        for thatElem in that:
            ntss = []
            
            for i, similarityObject in enumerate(similarityObjects):
                nts = similarityObject.normalized_similarity(thisElem, thatElem)
                if(i == acceptanceBaseObject and nts <= acceptance):
                    break                
                ntss.append(nts)
            
            if(ntss):
                results.append((thisElem, thatElem, ntss))
    return results

#generate n-grams from an input string where N in [1,...,maxLength]
def ngrams(input, maxLength, minLength = 1):
    input = input.split(' ')
    output = []
    for n in range(minLength, maxLength+1):
        for i in range(len(input)-n+1):
            output.append(input[i:i+n])
    return [' '.join(word) for word in output]

def fixed_ngrams(input, n):
    input = input.split(' ')
    output = []
    for i in range(len(input)-n+1):
        output.append(input[i:i+n])
    return [' '.join(word) for word in output]

def removeMarkupAndControl(text):
    text = re.sub('<.*>', ' ', re.sub('</?\w[^>]*>', ' ', text)) #remove html tags
        #text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ').strip() #remove unnecessary control chars and trim
    text = ''.join(ch for ch in text if (unicodedata.category(ch)[0]!="C")) #and unicodedata.category(ch)[0]!="P")) #C control, P punctuation http://www.fileformat.info/info/unicode/category/index.htm
    text = re.sub(' +', ' ', text) #more than one whitespace    
    return text

def separateMarkupAndControl(text):
    text = re.sub('<.*>', '.', re.sub('</?\w[^>]*>', '.', text)) #remove html tags
        #text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ').strip() #remove unnecessary control chars and trim
    text = ''.join(ch for ch in text if (unicodedata.category(ch)[0]!="C")) #and unicodedata.category(ch)[0]!="P")) #C control, P punctuation http://www.fileformat.info/info/unicode/category/index.htm
    text = re.sub(' +', ' ', text) #more than one whitespace    
    text = re.sub('\.+', '. ', text)
    text = re.sub('^\. ', '', text)
    return text

<h1>Cleaning descriptions</h1>

In [29]:
identifying_pharses = {"job description","responsibilities","responsibility","competence","skill","experience","qualification","education","technical requirements",
                       "the role","core duties","qualities","requirement","ideal candidate","we would like from","personal attributes","activities",
                       "candidate will have"}

irrelevant_phrases = {"successful candidate", "bank holiday", "bank holidays", "about us","what we offer","salary is competitive","we would like to give you",
                      "what can expect from us","equal opportunity","equal opportunities","apply now","bonus scheme","please contact","further information",
                      "further details", "chesterzoo", "interviews held", "candidate privacy", "paste following link", "copy paste following", "employment agency", 
                      "temporary workers", "annual leave", "competitive basic salary", "competitive salary", "criminal conviction", "sexual", "sexual orientation", 
                      "national origin", "disability", "ethnicity", "diversity", "inclusion", "gym membership", "competitive pay", 
                      "on-site parking", "pension scheme", "free parking", "criminal offence", "employee assistance programme", "flexible working", "Willingness", 
                      "work flexible hours","eligibility to work", "flexible approach to working", "flexible attitude to working", "flexible with working", "flexible benefits",
                      "competative starting salary","development budget","employee assistance","free office fruit","free fruit","days vacation",
                      "pension contributions", "employee support programme","flexi time", "flexi-time","cycle to work","salary range","work from home","working from home",
                      "days holiday", "benefits package", "employee benefits", "0113 ","01244","01582","01865","01908", "deadline for applications",
                      "private health care", "linkedIn learning", "cafe on-site", "send cv", "send your cv", "please send", "mon","fri","monday","friday", "mondays",
                      "fridays", "weekend", "massage", "days of the week", "days of week", "days per week", "day's holiday", "pension plan", "career progression",
                      "childcare", "months free", "not eligible", "analyst programme", "charity", "birthday", "anniversary", "parental", "maternity","paternity",
                      "minutes walk","train station","fantastic team","passionate people", "recruitment process", "accountability", "out of hours", "days a week",
                      "home working", "social event", "per week", "24x7x365", "required"}
                      

#Patterns that if a paragraph contains, the whole paragraph will be removed. In paragraph I mean <li> tags mostly under a <ul>.
irrelevant_phrases_pattern = re.compile("|".join([f'\\b{re.escape(ips)}\\b' for ips in irrelevant_phrases]), re.I)
skill_pattern = re.compile("|".join([f'\\b{re.escape(skill.label)}\\b' for skill in skills]), re.I)
web_pattern = re.compile("www\\ [a-zA-Z0-9]*\\ com|www\\.[a-zA-Z0-9]*\\.com|http://|https://")
pound_pattern = re.compile("£[0-9]*")
hour_pattern = re.compile("\\b(\d{1,2})(am|pm)\\b")
month_pattern = re.compile("\\b(\d{1,2})(\s{0,1})months\\b")

corp_spec_stopwords = {"year", "years", "preferred", "equivalent", "experience", "proficient", "like", "000", "ability", "able", "etc", "field", "minimum", 
                       "must", "excellent", "outstanding", "strong", "would", "advantageous", "beneficial", "mandatory", "within", "24X7"}
#.^$*+?()[{\|
punct_list = r'[“”‘’!"\$%&\'\(\)\*+,\./:;<=>?@\[\]\^`\{\|\}~\\]' #-–_
punct_map = punct = {'’':' ','!':' ','"':' ','$':' ','%':' ','&':' ','\'':' ','(':' ',')':' ','*':' ','+':' ',',':' ','-':' ','.':' ','/':' ',':':' ',';':' ','<':' ','=':' ','>':' ','?':' ','@':' ','[':' ','\\':' ',']':' ','^':' ','_':' ','`':' ','{':' ','|':' ','}':' ','~':' ','–':' '}
change_dict = {'&amp;':'&'}

<h1>Approach 1: decompose job descriptions and retain informative parts</h1>

In [30]:
import bs4
from bs4 import BeautifulSoup

def find_list_sibling(t):
    next_sibling = t.next_sibling
    for i in range(3):
        if next_sibling:
            if next_sibling.name == 'ul':
                return next_sibling
            else:
                next_sibling = next_sibling.next_sibling
        else:
            return None

for posting in postings:
        soup = BeautifulSoup(' '.join(posting.description).replace('\n', '').replace('\r', ''), "html.parser")
        all_ps = soup.find_all('p')
        all_hs = soup.find_all(re.compile('^h[1-6]$'))
        informative_list = set()
        for t in all_ps + all_hs:
            for ip in identifying_pharses:
                if ip in t.get_text().lower():
                    ul = find_list_sibling(t)
                    if ul:
                        informative_list.add(ul)
                    else:
                        if t.next_sibling and isinstance(t.next_sibling, bs4.element.Tag):
                            if 'ul' in [c.name for c in t.next_sibling.contents]:
                                for c in t.next_sibling.contents:
                                    if c.name == 'ul':
                                        informative_list.add(c)
                    if not ul:
                        if t.parent.next_sibling and isinstance(t.parent.next_sibling, bs4.element.Tag):
                            if 'ul' in [c.name for c in t.parent.next_sibling.contents]:
                                for c in t.parent.next_sibling.contents:
                                    if c.name == 'ul':
                                        informative_list.add(c)
        if not informative_list:
            #display(HTML(' '.join(posting.description)))
            all_uls = soup.find_all('ul')
            for ul in all_uls:
                skill_match = re.findall(skill_pattern, ul.get_text(" ", strip=True))
                irrelevant_phrases_match = re.findall(irrelevant_phrases_pattern, ul.get_text(" ", strip=True))
                if skill_match and not irrelevant_phrases_match:
                    informative_list.add(ul)
                    
        if not informative_list:
            for t in soup.findAll(text=True):
                skill_match = re.findall(skill_pattern, t)
                irrelevant_phrases_match = re.findall(irrelevant_phrases_pattern, t)
                if skill_match and not irrelevant_phrases_match:
                    informative_list.add(t.strip())
                    
        if informative_list:
             posting.processedDescription = list(informative_list)
                
print(len([p for p in postings if p.processedDescription]))

5231


In [31]:
for posting in postings:
    posting.originalDescSentences = posting.processedDescription

In [32]:
for posting in postings:
    posting.processedDescription = posting.originalDescSentences

<h2>remove li or paragraph that contain one of the irrelevant patterns</h2>

In [33]:
for i, p in enumerate(postings):
    for paragraph in p.processedDescription:
        if isinstance(paragraph, bs4.element.Tag):
            if paragraph.name == 'ul':
                for li in paragraph.find_all('li'):
                    if re.findall(irrelevant_phrases_pattern, str(li)) or re.findall(web_pattern, str(li)) or re.findall(pound_pattern, str(li)) or re.findall(hour_pattern, str(li)) or re.findall(month_pattern, str(li)):
                        li.decompose()

In [34]:
for i, p in enumerate(postings):
    not_irrelevant = []
    for d in p.processedDescription:
        if not (re.findall(irrelevant_phrases_pattern, str(d)) or re.findall(web_pattern, str(d)) or re.findall(pound_pattern, str(d)) or re.findall(hour_pattern, str(d)) or re.findall(month_pattern, str(d))):
            not_irrelevant.append(str(d))
    p.processedDescription = not_irrelevant

<h2>remove markup, control, punctuation, corpus specific stopwords, and tokenize</h2>

In [35]:
for i, p in enumerate(postings):
    pd_texts = []
    for pd in p.processedDescription:
        #text = f'{pd.text} ' if isinstance(pd, bs4.element.Tag) else pd
        #pd_texts.append(removeMarkupAndControl(pd).strip())
        pd_texts.append(separateMarkupAndControl(pd).strip())
    p.processedDescription = pd_texts

In [38]:
for i, posting in enumerate(postings):
    texts = []
    for paragraph in posting.processedDescription:
        for sentence in tokenizer.tokenize(paragraph): #create sentences
            sentence = re.sub('&amp;', '&', re.sub('\d{1,3}\)', '', re.sub('(^-|–|_\s)|(\s-|–|_\s)', '', re.sub('0ffice', 'Office', re.sub('0365', 'O365', sentence)))))
            #sentence = sentence.translate(str.maketrans(punct)).lower() #remove punctuations
            sentence = ' '.join([w for w in word_tokenize(sentence, preserve_line=True) if w.lower() not in stop_words.union(corp_spec_stopwords) or w == 'IT']) #remove sw
            for s in re.split(punct_list, sentence):
                if s.strip() and len(s.strip()) > 2:
                    texts.append(s.strip())
    seen = set()    
    posting.processedDescription = [x for x in texts if not (x in seen or seen.add(x))]   

In [39]:
for i, posting in enumerate(postings):
    sentences = []
    for sentence in posting.processedDescription:
        if len(sentence.split(' ')) > 3:
            sentences = sentences + ngrams(sentence, 4, 2)
        else:
            sentences.append(sentence)    
    
    posting.processedDescription = [d for d in sentences if not (re.findall(irrelevant_phrases_pattern, str(d)) or re.findall(web_pattern, str(d)) or re.findall(pound_pattern, str(d)) or re.findall(hour_pattern, str(d)) or re.findall(month_pattern, str(d)))]
    #for s in sentences:
    #    if len(s.split(' ')) < 3:
    #        nlp_result = nlp(s)
    #        upos = ' '.join([word.upos for word in nlp_result.sentences[0].words])
    #        print(f'{s} - {upos}')

In [44]:
num_start_pattern = re.compile("^\d+\\b")
for p in postings:
    new_p_desc = []
    for li in p.processedDescription:
        if re.findall(num_start_pattern, li): 
            number = re.findall(num_start_pattern, li)
            if not new_p_desc:
                if len(li.split(' ')) > 2:
                    new_p_desc.append(' '.join(li.split(" ")[1:]))
                else:
                    continue
            elif re.findall(re.compile("^\d+$"), li):
                new_p_desc[-1] = f'{new_p_desc[-1]} {li}'
            elif new_p_desc[-1].split(' ')[-1] == number[0]:
                new_p_desc[-1] = f'{" ".join(new_p_desc[-1].split(" ")[:-1])} {li}'
            elif number[0] in new_p_desc[-1]:
                new_p_desc[-1] = f'{" ".join([new_p_desc[-1]] + [w for w in li.split(" ") if w not in new_p_desc[-1]])}'
            else:
                if len(number[0]) < 4 and len(li.split(' ')) < 3:
                    continue
                elif len(number[0]) < 4:
                    new_p_desc.append(' '.join(li.split(" ")[1:]))
                else:
                    new_p_desc[-1] = f'{new_p_desc[-1]} {li}'
        else:
            new_p_desc.append(li)
        p.processedDescription = new_p_desc

In [41]:
for p in postings:
    new_p_desc = []
    for li in p.processedDescription:
        if re.findall(re.compile("^# #"), li): 
            if len(li.split(' ')) > 2:
                new_p_desc.append(' '.join(li.split(" ")[2:]))
            else:
                continue
        elif re.findall(re.compile("^#"), li):
            new_p_desc.append(' '.join(li.split(" ")[1:]))
        else:
            new_p_desc.append(li)
        p.processedDescription = new_p_desc

<h1>Test and copy</h1>

In [45]:
rawPostings = [p for p in postings if len(p.processedDescription) > 2]
print(len(rawPostings))

5018


In [46]:
temp_pattern = re.compile("^#")
for i, p in enumerate(rawPostings):
    for j, li in enumerate(p.processedDescription):
        if re.findall(re.compile("^\d+\\b"), str(li)) or re.findall(irrelevant_phrases_pattern, str(li)) or re.findall(web_pattern, str(li)) or re.findall(pound_pattern, str(li)) or re.findall(hour_pattern, str(li)) or re.findall(temp_pattern, str(li)):
            print(f'---{li}')