In [1]:

# Toggle IPython's pretty printing of cell results (the recorded output reports it as OFF)
%pprint

Pretty printing has been turned OFF



---
# Load needed libraries and functions

In [2]:

from nltk import pos_tag
from pandas import DataFrame
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
import nltk
import re
import sys

# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

In [7]:

# NOTE(review): storage is a project-local module, presumably resolved through the
# '../py' entry added to sys.path in the imports cell - confirm
from storage import Storage

# Create the project's storage helper; later cells hand it to the scraping and Cypher helpers
s = Storage()

# This is from https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
# (presumably a mapping from Penn Treebank tag to its description - confirm against the saved object)
pos_tag_description_dict = s.load_object('pos_tag_description_dict')

In [4]:

# Header-analysis helper, created with verbose output switched off
from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(verbose=False)

# Web-scraping helper; its secrets_json attribute supplies the neo4j connection settings
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(s=s)
neo4j_secrets_dict = wsu.secrets_json['neo4j']
uri = neo4j_secrets_dict['connect_url']
user = neo4j_secrets_dict['username']
password = neo4j_secrets_dict['password']

# Get the neo4j object
from cypher_utils import CypherUtilities
cu = CypherUtilities(
    uri=uri,
    user=user,
    password=password,
    driver=None,
    s=s,
    ha=ha,
)

In [25]:

# Create the tokenizer
HTML_SCANNER_REGEX = re.compile(r'</?\w+|\w+[#\+]*|:|\.|\?')
def html_regex_tokenizer(corpus):
    """Split an HTML string into the tokens recognised by HTML_SCANNER_REGEX.

    A token is one of: the start of an opening or closing tag ('<b', '</b'),
    a run of word characters optionally followed by '#' or '+' characters
    ('C#', 'C++'), a colon, a period, or a question mark. Any other text
    (whitespace, '>', commas, quotes, ...) is dropped.

    Parameters
    ----------
    corpus : str
        The text to tokenize.

    Returns
    -------
    list of str
        The matched tokens in the order they occur in corpus.
    """
    
    # The pattern has no capture groups, so findall yields the full text of each match
    return HTML_SCANNER_REGEX.findall(corpus)


---
# Training

In [144]:

# Read-only query: fetch every NavigableParents node that carries the complete set of
# is_* flag properties, returning its HTML string together with those flags.
# The property checks use IS NOT NULL because the EXISTS(property) form is no
# longer accepted by Neo4j 5.
cypher_str = '''
    // Find all NavigableParents nodes in the graph so that pandas can filter out any nodes that are not required qualifications
    MATCH (np:NavigableParents)
    
    WHERE
        np.is_header IS NOT NULL
        AND np.is_task_scope IS NOT NULL
        AND np.is_minimum_qualification IS NOT NULL
        AND np.is_preferred_qualification IS NOT NULL
        AND np.is_legal_notification IS NOT NULL
        AND np.is_job_title IS NOT NULL
        AND np.is_office_location IS NOT NULL
        AND np.is_job_duration IS NOT NULL
        AND np.is_supplemental_pay IS NOT NULL
        AND np.is_educational_requirement IS NOT NULL
        AND np.is_interview_procedure IS NOT NULL
        AND np.is_corporate_scope IS NOT NULL
        AND np.is_posting_date IS NOT NULL
        AND np.is_other IS NOT NULL
    
    // Return the navigable parent and important features
    RETURN
        np.navigable_parent AS navigable_parent,
        np.is_header AS is_header,
        np.is_task_scope AS is_task_scope,
        np.is_minimum_qualification AS is_minimum_qualification,
        np.is_preferred_qualification AS is_preferred_qualification,
        np.is_legal_notification AS is_legal_notification,
        np.is_job_title AS is_job_title,
        np.is_office_location AS is_office_location,
        np.is_job_duration AS is_job_duration,
        np.is_supplemental_pay AS is_supplemental_pay,
        np.is_educational_requirement AS is_educational_requirement,
        np.is_interview_procedure AS is_interview_procedure,
        np.is_corporate_scope AS is_corporate_scope,
        np.is_posting_date AS is_posting_date,
        np.is_other AS is_other;'''
row_objs_list = []
with cu.driver.session() as session:
    
    # The statement only reads from the graph, so it runs in a read transaction
    row_objs_list = session.read_transaction(cu.do_cypher_tx, cypher_str)
if row_objs_list:
    pos_html_strs_df = DataFrame(row_objs_list)
    
    # (46453, 15) in the recorded run
    print(pos_html_strs_df.shape)
else:
    
    # Say so explicitly; otherwise the first symptom is a NameError in a later cell
    print('The query returned no rows, so pos_html_strs_df was not created')

(46453, 15)


In [145]:

# Label suffix for each flag column, listed in ascending priority: when a row has
# several flags equal to 'True', the entry nearest the bottom of this list wins
suffix_by_flag_column = [
    ('is_other', '-O'),
    ('is_posting_date', '-PD'),
    ('is_corporate_scope', '-CS'),
    ('is_interview_procedure', '-IP'),
    ('is_supplemental_pay', '-SP'),
    ('is_job_duration', '-JD'),
    ('is_office_location', '-OL'),
    ('is_job_title', '-JT'),
    ('is_legal_notification', '-LN'),
    ('is_task_scope', '-TS'),
    ('is_preferred_qualification', '-PQ'),
    ('is_educational_requirement', '-ER'),
    ('is_minimum_qualification', '-RQ'),
]

# Start from an empty label so that re-running this cell gives the same result
pos_html_strs_df['part_of_speech'] = ''
for flag_column, suffix in suffix_by_flag_column:
    mask_series = (pos_html_strs_df[flag_column] == 'True')
    pos_html_strs_df.loc[mask_series, 'part_of_speech'] = suffix

# Prefix the suffix with H for header rows and O for every other row
mask_series = (pos_html_strs_df.is_header == 'True')
pos_html_strs_df.loc[mask_series, 'part_of_speech'] = 'H' + pos_html_strs_df.loc[mask_series, 'part_of_speech']
pos_html_strs_df.loc[~mask_series, 'part_of_speech'] = 'O' + pos_html_strs_df.loc[~mask_series, 'part_of_speech']

# (46453, 16) in the recorded run
print(pos_html_strs_df.shape)

(46453, 16)


In [146]:

# Show each HTML string next to the label derived for it and its header flag
columns_list = ['navigable_parent', 'part_of_speech', 'is_header']
pos_html_strs_df[columns_list]

Unnamed: 0,navigable_parent,part_of_speech,is_header
0,<b>Key Qualifications:</b>,H-RQ,True
1,• Work closely with our media agency to mar...,O-TS,False
2,• Maintain and further optimize a set of da...,O-TS,False
3,• Conduct ad hoc reporting with insightful ...,O-TS,False
4,• Partner with our WW data science team to ...,O-TS,False
...,...,...,...
46448,<li>Experience with dashboard and visualizatio...,O-PQ,False
46449,<li>Experience with management and monitoring ...,O-PQ,False
46450,<li>Experience with GxP data.</li>,O-PQ,False
46451,<i>This position is classified as exempt. The ...,O-SP,False


In [147]:

from imblearn.under_sampling import RandomUnderSampler

# Get the sampling strategy limit: the median of the per-label row counts
vc_srs = pos_html_strs_df.part_of_speech.value_counts()
sampling_strategy_limit = int(vc_srs.median())

# Rebalance the data with the sampling strategy limit: labels above the limit are cut
# down to it, labels at or below the limit keep all of their rows
counts_dict = pos_html_strs_df.groupby('part_of_speech').count().navigable_parent.to_dict()
sampling_strategy = {k: min(sampling_strategy_limit, v) for k, v in counts_dict.items()}

# Seed the sampler so that the same rows are drawn on every run
rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)

# Define the tuple of arrays
data = rus.fit_resample(
    pos_html_strs_df.navigable_parent.values.reshape(-1, 1), pos_html_strs_df.part_of_speech.values.reshape(-1, 1)
)

# Recreate the Pandas DataFrame
rebalanced_df = DataFrame(data[0], columns=['navigable_parent'])
rebalanced_df['part_of_speech'] = data[1]

# NOTE(review): the sentence and label lists built in the following cells are taken from
# pos_html_strs_df, not from rebalanced_df - confirm whether training is meant to use
# the rebalanced rows

# (4473, 2) in the recorded run
print(rebalanced_df.shape)
rebalanced_df

(4473, 2)


Unnamed: 0,navigable_parent,part_of_speech
0,<b>Careers at UnitedHealthcare Employer &amp; ...,H-CS
1,<b>About Us</b>,H-CS
2,<b>Primer</b>,H-CS
3,We’re fostering a culture of shared values acr...,H-CS
4,<b>About Cognizant Cognizant (Nasdaq-100:</b>,H-CS
...,...,...
4468,<p>Design superb visualizations and access too...,O-TS
4469,<li>Performs additional duties as assigned</li>,O-TS
4470,<ots>Subject matter expert in the automated te...,O-TS
4471,<li>Train business users to use self-service t...,O-TS


In [160]:

# Sentences to parse: one HTML string per row of the labelled frame; peek at the last two
sentences_list = list(pos_html_strs_df['navigable_parent'])
sentences_list[-2:]

['<i>This position is classified as exempt. The anticipated annual base salary range for candidates who will work in San Diego or remotely is $197,011 to $274,358. The final base salary offered to the successful candidate will be dependent upon several factors that may include but are not limited to the type and length of experience within the job, type and length of experience within the job, education, and other factors. Treeline Biosciences is a multi-state employer, and this salary range may not reflect positions that work in other cities or states.</i>', '<b>Notice to Search Firms/Third-Party Recruitment Agencies (Recruiters)</b>']

In [161]:

# Labels to apply: the derived label of each row, in the same order as the sentences
pos_symbols_list = list(pos_html_strs_df['part_of_speech'])
pos_symbols_list[-2:]

['O-SP', 'H-LN']

In [162]:

# Tokenize the sentences: every HTML string becomes its own list of tokens
tokens_list = list(map(html_regex_tokenizer, sentences_list))
tokens_list[-2:]

[['<i', 'This', 'position', 'is', 'classified', 'as', 'exempt', '.', 'The', 'anticipated', 'annual', 'base', 'salary', 'range', 'for', 'candidates', 'who', 'will', 'work', 'in', 'San', 'Diego', 'or', 'remotely', 'is', '197', '011', 'to', '274', '358', '.', 'The', 'final', 'base', 'salary', 'offered', 'to', 'the', 'successful', 'candidate', 'will', 'be', 'dependent', 'upon', 'several', 'factors', 'that', 'may', 'include', 'but', 'are', 'not', 'limited', 'to', 'the', 'type', 'and', 'length', 'of', 'experience', 'within', 'the', 'job', 'type', 'and', 'length', 'of', 'experience', 'within', 'the', 'job', 'education', 'and', 'other', 'factors', '.', 'Treeline', 'Biosciences', 'is', 'a', 'multi', 'state', 'employer', 'and', 'this', 'salary', 'range', 'may', 'not', 'reflect', 'positions', 'that', 'work', 'in', 'other', 'cities', 'or', 'states', '.', '</i'], ['<b', 'Notice', 'to', 'Search', 'Firms', 'Third', 'Party', 'Recruitment', 'Agencies', 'Recruiters', '</b']]

In [163]:

# Get the parts of speech. pos_tag_sents is NLTK's batch form of pos_tag: it tags the whole
# list of token lists in a single call and returns one list of (token, tag) tuples per
# sentence, instead of fetching the tagger again for each sentence
pos_tags_list = nltk.pos_tag_sents(tokens_list)
pos_tags_list[-2:]

[[('<i', 'NN'), ('This', 'DT'), ('position', 'NN'), ('is', 'VBZ'), ('classified', 'VBN'), ('as', 'IN'), ('exempt', 'NN'), ('.', '.'), ('The', 'DT'), ('anticipated', 'JJ'), ('annual', 'JJ'), ('base', 'NN'), ('salary', 'JJ'), ('range', 'NN'), ('for', 'IN'), ('candidates', 'NNS'), ('who', 'WP'), ('will', 'MD'), ('work', 'VB'), ('in', 'IN'), ('San', 'NNP'), ('Diego', 'NNP'), ('or', 'CC'), ('remotely', 'RB'), ('is', 'VBZ'), ('197', 'CD'), ('011', 'CD'), ('to', 'TO'), ('274', 'CD'), ('358', 'CD'), ('.', '.'), ('The', 'DT'), ('final', 'JJ'), ('base', 'NN'), ('salary', 'NN'), ('offered', 'VBD'), ('to', 'TO'), ('the', 'DT'), ('successful', 'JJ'), ('candidate', 'NN'), ('will', 'MD'), ('be', 'VB'), ('dependent', 'JJ'), ('upon', 'IN'), ('several', 'JJ'), ('factors', 'NNS'), ('that', 'WDT'), ('may', 'MD'), ('include', 'VB'), ('but', 'CC'), ('are', 'VBP'), ('not', 'RB'), ('limited', 'VBN'), ('to', 'TO'), ('the', 'DT'), ('type', 'NN'), ('and', 'CC'), ('length', 'NN'), ('of', 'IN'), ('experience', 'NN

In [164]:

# Define features to be used in the CRF model
def word2features(sent, i):
    """Build the feature dictionary for one token of a tagged sentence.

    Parameters
    ----------
    sent : sequence of (token, tag) pairs
        The tagged sentence.
    i : int
        Position of the token within sent.

    Returns
    -------
    dict
        The token under 'word' and its tag under 'postag'.
    """
    token_text = sent[i][0]
    token_tag = sent[i][1]
    
    return dict(word=token_text, postag=token_tag)

def sent2features(sent):
    """Return one word2features dictionary per token of sent, in token order."""
    feature_dicts_list = []
    for token_index in range(len(sent)):
        feature_dicts_list.append(word2features(sent, token_index))
    
    return feature_dicts_list

def sent2labels(pos_symbol, sent):
    """Return a list that repeats pos_symbol once for every element of sent."""
    
    return [pos_symbol for _ in range(len(sent))]

# zip stops at the shorter input, so a length mismatch would silently drop sentences;
# fail loudly instead
assert len(pos_tags_list) == len(pos_symbols_list), 'pos_tags_list and pos_symbols_list must be the same length'

# Prepare the training and test data: one feature dictionary and one label per token,
# with every token of a sentence carrying that sentence's label
X = [sent2features(pos_tags) for pos_tags in pos_tags_list]
y = [sent2labels(pos_symbol, pos_tags) for pos_tags, pos_symbol in zip(pos_tags_list, pos_symbols_list)]

# Seed the split so the train/test partition, and the scores computed from it, are repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the CRF model
pos_symbol_crf = CRF()

# Train the model
pos_symbol_crf.fit(X_train, y_train)

# Predict the labels for the test data
y_pred = pos_symbol_crf.predict(X_test)


---
# Visualization

In [153]:

from sklearn.preprocessing import MultiLabelBinarizer

# Create the MultiLabelBinarizer object
mlb = MultiLabelBinarizer()

# Fit and transform the y_test and y_pred sequences. Each sentence's list of per-token
# labels is reduced to the set of labels it contains, so the resulting indicator
# matrices describe which labels occur in a sentence, not how many tokens carry them.
# NOTE(review): the classes are learned from y_test only - confirm how labels that
# occur solely in y_pred are meant to be handled.
y_test_transformed = mlb.fit_transform(y_test)
y_pred_transformed = mlb.transform(y_pred)

In [159]:

# Compute the classification report as a dictionary keyed by label
report_dict = classification_report(
    y_test_transformed,
    y_pred_transformed,
    target_names=mlb.classes_,
    zero_division=0,
    output_dict=True,
)

# One row per label, displayed with the lowest f1-score first
df = DataFrame.from_dict(report_dict, orient='index')
df.index.name = 'pos_symbol'
df.sort_values('f1-score')

Unnamed: 0_level_0,precision,recall,f1-score,support
pos_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
H-JD,0.0,0.0,0.0,4
H-PD,0.0,0.0,0.0,2
O-O,0.0,0.0,0.0,11
H-LN,0.333333,0.142857,0.2,14
H-IP,0.571429,0.16,0.25,25
H-JT,0.416667,0.384615,0.4,13
H-O,0.45,0.45,0.45,20
H-RQ,0.479167,0.442308,0.46,52
H-TS,0.4,0.566038,0.46875,53
O-PQ,0.432836,0.707317,0.537037,41


In [155]:

import random

# Pick one featurized sentence at random and show the label the CRF assigns to it
random_feature_dicts_list = random.choice(X)
print(random_feature_dicts_list)
if random_feature_dicts_list:
    
    # Keep the predictions under their own name: pos_symbols_list holds the training labels
    # that are zipped with pos_tags_list when X and y are built, so overwriting it here
    # would corrupt any later re-run of the training cell
    predicted_symbols_list = pos_symbol_crf.predict_single(random_feature_dicts_list)
    
    # Every token of the sentence is expected to receive the same label
    assert len(set(predicted_symbols_list)) == 1
    display(predicted_symbols_list[0])

[{'word': '<ocs', 'postag': 'JJ'}, {'word': 'Client', 'postag': 'NN'}, {'word': ':', 'postag': ':'}, {'word': 'T', 'postag': 'NNP'}, {'word': 'MOBILE', 'postag': 'NNP'}, {'word': 'US', 'postag': 'NNP'}, {'word': 'INC', 'postag': 'NNP'}, {'word': '.', 'postag': '.'}, {'word': '</ocs', 'postag': 'NN'}]


'O-CS'

In [156]:

# List every HTML string labelled H-JD (header rows flagged as job duration)
mask_series = pos_html_strs_df['part_of_speech'].eq('H-JD')
pos_html_strs_df.loc[mask_series, 'navigable_parent'].tolist()

["It's a 12 months contract, Please let me know if this would be something you’d be interested in learning more and feel free to call me in case you’ve any questions.", '<span style="color:#201f1e">Duration- Full time Hire</span>', '<b>Duration:</b>', '<b>Duration</b>', '<b>Employee Status :</b>', '<b>Employee Status</b>', '<b>Job Schedule:</b>', '<b>Regular, consistent and punctual attendance. Must be able to work nights and weekends, variable schedule(s) as necessary.</b>', '<b>Schedule</b>', '<b>Shift :</b>', '<b>Shift</b>', '<b>Standard Hours</b>', '<b>Work schedule:</b>', '<b>Working time:</b>', '<div class="jobsearch-JobDescriptionSection-sectionItemKey icl-u-textBold">Job Type</div>', '<div>Contract</div>', '<p>Contract Length:</p>', '<p>Schedule:</p>', 'Day Job', 'Fixed Term Contractor', '12+ Contract', 'Start ASAP', 'Duration Long Term Contract', '<b>Minimum 2 years</b>', 'Full-Time / LONG TERM Contract (like a Perm position)', '4 years', '<span style="color:black">Days</span>

In [157]:

# List every HTML string labelled H-PD (header rows flagged as posting date)
mask_series = pos_html_strs_df['part_of_speech'].eq('H-PD')
pos_html_strs_df.loc[mask_series, 'navigable_parent'].tolist()

['<b>Job Posting :</b>', '<b>Job Posting</b>', '<div>Published</div>', '<b>Publication Date:</b>']

In [158]:

# List every HTML string labelled O-O (non-header rows flagged as other)
mask_series = pos_html_strs_df['part_of_speech'].eq('O-O')
pos_html_strs_df.loc[mask_series, 'navigable_parent'].tolist()

['<span style="font-size:12px">Hello Dave Babbitt</span>', '#LI-AH1', '<span style="font-weight: 400;">#li-remote</span>', ')', '<b>#LI-DNI</b>', '<b>3</b>', '<b>or</b>', '<i>.</i>', '<i>Learn why</i>', '<i>at least</i>', '<i>here</i>', '<li>Or any combination of education and experience to meet the above requirements</li>', '<p>#LI-SAP1</p>', '<p>OR</p>', '<p>·</p>', 'None', 'The', 'and', '<li>XX</li>', 'Hiring', '<li>Others as applicable</li>', 'candidates for this role', '<b>10+</b>', '<span style="margin:0px;font-size:8pt;line-height:normal;color:rgb(102,102,102)">Phone:\xa0+631 (267) 4883 Ext. - 160</span>', '<span style="margin:0px;font-size:8pt;line-height:normal;color:rgb(102,102,102)">Mob: +1631-641-2812</span>', '<span style="font-weight: 400;">-</span>', ').', '!', '•', '/', 'job preferences', 'you are', '<span class="css-kyg8or eu4oa1w0">Posted 1 day ago</span>', 'No matching', '<div>#LI-REMOTE</div>', '<span class="css-kyg8or eu4oa1w0">Urgently hiring</span>', 'n', 'In add