In [1]:

%pprint

Pretty printing has been turned OFF



---
# Load needed libraries and functions

In [2]:

from neo4j.exceptions import ServiceUnavailable
from nltk import pos_tag
from pandas import DataFrame
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
import humanize
import nltk
import os
import re
import sys
import time
import warnings

# Suppress every warning for the rest of the session (this also hides deprecation notices)
warnings.filterwarnings('ignore')
# NOTE(review): duration and freq are not referenced in any visible cell -- presumably the
# length and pitch of an audible "finished" beep used elsewhere; confirm or remove
duration = 1000  # milliseconds
freq = 880  # Hz

# Put the sibling '../py' folder on the module search path; the project modules imported
# by the setup cell (storage, ha_utils, cypher_utils, ...) are presumably located there
# Insert at 1, 0 is the script path (or '' in REPL)
if ('../py' not in sys.path): sys.path.insert(1, '../py')

In [3]:

# Build, in dependency order, the helper objects used throughout the notebook
# (s, ha, wsu, cu, hc, lru, slrcu, ssgdcu, scrfcu, crf) and report how long that took.
# NOTE(review): the project modules are imported here instead of in the import cell --
# presumably because they only resolve once '../py' has been added to sys.path; confirm
# t0 and t1 start at the same instant; t1 is reset by later cells to time individual steps
t0 = t1 = time.time()

# Get the Storage object
from storage import Storage
s = Storage(
    data_folder_path=os.path.abspath('../data'),
    saves_folder_path=os.path.abspath('../saves')
)

# Get the HeaderAnalysis object
from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

# Get the WebScrapingUtilities object
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(
    s=s,
    secrets_json_path=os.path.abspath('../data/secrets/jh_secrets.json')
)
# The Neo4j connection settings are read from the secrets JSON instead of being hard-coded here
uri = wsu.secrets_json['neo4j']['connect_url']
user =  wsu.secrets_json['neo4j']['username']
password = wsu.secrets_json['neo4j']['password']

# Get the CypherUtilities object and Neo4j driver
from cypher_utils import CypherUtilities
cu = CypherUtilities(
    uri=uri, user=user, password=password, driver=None, s=s, ha=ha
)

# Connectivity check: print the server agent string. When the database is unreachable
# (ServiceUnavailable) a hint is printed and the error re-raised; any other exception is
# only printed and the rest of the setup continues.
try:
    version_str = cu.driver.get_server_info().agent
    print(f'======== {version_str} ========')
except ServiceUnavailable as e:
    print('You need to start Neo4j as a console')
    raise
except Exception as e:
    print(f'{e.__class__}: {str(e).strip()}')

# Get the HeaderCategories object
from hc_utils import HeaderCategories
hc = HeaderCategories(cu=cu, verbose=False)

# Get the LrUtilities object
from lr_utils import LrUtilities
lru = LrUtilities(ha=ha, cu=cu, hc=hc, verbose=False)

# Get the SectionLRClassifierUtilities object
from section_classifier_utils import SectionLRClassifierUtilities
slrcu = SectionLRClassifierUtilities(ha=ha, cu=cu, verbose=False)

# Get the SectionSGDClassifierUtilities object
from section_classifier_utils import SectionSGDClassifierUtilities
ssgdcu = SectionSGDClassifierUtilities(ha=ha, cu=cu, verbose=False)

# Get the SectionCRFClassifierUtilities object
from section_classifier_utils import SectionCRFClassifierUtilities
scrfcu = SectionCRFClassifierUtilities(cu=cu, ha=ha, verbose=False)

# Get the CrfUtilities object (the only helper created with verbose=True)
from crf_utils import CrfUtilities
crf = CrfUtilities(
    ha=ha, hc=hc, cu=cu, lru=lru, slrcu=slrcu, scrfcu=scrfcu, ssgdcu=ssgdcu, verbose=True
)

# Elapsed time is measured from t1, i.e. from the top of this cell
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Utility libraries created in {duration_str}')

Utility libraries created in 6 seconds


In [4]:

# Check if the scrfcu has built its parts-of-speech conditional random field elements
# Parts-of-speech CRF elements normally built in 29 minutes and 57 seconds
# NOTE(review): the build is skipped when scrfcu has a 'pos_symbol_crf' attribute, but
# availability is reported from 'pos_predict_percent_fit_dict'; the SGD and LR cells use
# a single attribute for both tests -- confirm the two names are meant to differ here
t1 = time.time()
if not hasattr(scrfcu, 'pos_symbol_crf'):
    scrfcu.build_pos_conditional_random_field_elements(verbose=True)
if hasattr(scrfcu, 'pos_predict_percent_fit_dict'):
    print('predict_single is available')
else:
    print('predict_single is not available')
# The elapsed time is printed whether or not a build actually ran
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech conditional random field elements built in {duration_str}')

predict_single is available
Parts-of-speech conditional random field elements built in 2 seconds


In [5]:

# Attach a previously saved CRF model to the crf helper, if one exists.
# The saved object's name is the last space-separated word of the first sentence of
# crf.word2features's docstring (this raises AttributeError if that docstring is None).
crf_name = crf.word2features.__doc__.split('.')[0].split(' ')[-1]
if s and s.pickle_exists(crf_name):
    crf.CRF = s.load_object(crf_name)
# crf.CRF may also have been set before this cell ran; either way report whether it is there
if hasattr(crf, 'CRF'):
    print('predict_single is available')
else:
    print('predict_single is not available')

predict_single is available


In [6]:

# Check if the ssgdcu has built its parts-of-speech stochastic gradient descent elements;
# build them only when the 'pos_predict_percent_fit_dict' attribute is missing
t1 = time.time()
if not hasattr(ssgdcu, 'pos_predict_percent_fit_dict'):
    ssgdcu.build_pos_stochastic_gradient_descent_elements(sampling_strategy_limit=None, verbose=True)
if hasattr(ssgdcu, 'pos_predict_percent_fit_dict'):
    print('predict_single is available')
else:
    print('predict_single is not available')
# The elapsed time is printed whether or not a build actually ran
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech stochastic gradient descent elements built in {duration_str}')

I have 49,102 labeled parts of speech in here
predict_single is available
Parts-of-speech stochastic gradient descent elements built in 10 seconds


In [7]:

# Check if the slrcu has built its parts-of-speech logistic regression elements;
# build them only when the 'pos_predict_percent_fit_dict' attribute is missing
t1 = time.time()
if not hasattr(slrcu, 'pos_predict_percent_fit_dict'):
    slrcu.build_pos_logistic_regression_elements(verbose=False)
if hasattr(slrcu, 'pos_predict_percent_fit_dict'):
    print('predict_single is available')
else:
    print('predict_single is not available')
# The elapsed time is printed whether or not a build actually ran
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech logistic regression elements built in {duration_str}')

predict_single is available
Parts-of-speech logistic regression elements built in 7 seconds


In [8]:

# This is from https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
pos_tag_description_dict = s.load_object('pos_tag_description_dict')

In [9]:

# Tokenizer for HTML fragments. A token is one of: an opening or closing tag name
# without its '>' (e.g. '<div', '</div'), a word optionally followed by '#'/'+'
# characters (e.g. 'C#', 'C++'), a colon, a period or a question mark.
HTML_SCANNER_REGEX = re.compile(r'</?\w+|\w+[#\+]*|:|\.|\?')


def html_regex_tokenizer(corpus):
    """Return the tokens of ``corpus`` in the order HTML_SCANNER_REGEX finds them.

    Characters matched by none of the alternatives (whitespace, '>', other
    punctuation) are skipped. An empty string gives an empty list.
    """
    tokens = []
    for hit in HTML_SCANNER_REGEX.finditer(corpus):
        tokens.append(hit.group())

    return tokens


---
# Data Exploration
This stage involves loading and examining the dataset to gain a better understanding of its structure, content, and quality. Data exploration may involve tasks such as data visualization, summary statistics, and data cleaning.

In [7]:

# Ask the Cypher helper to repopulate the parts-of-speech relationships and time the call;
# with verbose=True the helper echoes Cypher (the recorded output shows a MERGE of a
# SUMMARIZES relationship from PartsOfSpeech to NavigableParents)
# NOTE(review): this cell times itself with t0, overwriting the session-start timestamp
# set in the setup cell; the other cells use t1 for per-cell timing
t0 = time.time()
cu.populate_pos_relationships(verbose=True)
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech relationships repopulated in {duration_str}')


                        MATCH
                            (pos:PartsOfSpeech {is_header: false, is_other: true}),
                            (np:NavigableParents {is_header: false, is_other: true})
                        MERGE (pos)-[r:SUMMARIZES]->(np);
Parts-of-speech relationships repopulated in 4 seconds


In [18]:

cypher_str = '''
    // Filter for NavigableParents nodes with an unambiguous SUMMARIZES relationship
    MATCH (np:NavigableParents)
    WHERE size((np)<-[:SUMMARIZES]-(:PartsOfSpeech)) = 1
    WITH np
    
    // Find all NavigableParents nodes in the graph with an incoming SUMMARIZES relationship to a PartsOfSpeech node
    MATCH (np)<-[r:SUMMARIZES]-(pos:PartsOfSpeech)
    
    // Return the navigable parent and important properties
    RETURN
        pos.pos_symbol AS pos_symbol,
        pos.is_header AS pos_is_header,
        pos.is_task_scope AS pos_is_task_scope,
        pos.is_minimum_qualification AS pos_is_minimum_qualification,
        pos.is_preferred_qualification AS pos_is_preferred_qualification,
        pos.is_legal_notification AS pos_is_legal_notification,
        pos.is_job_title AS pos_is_job_title,
        pos.is_office_location AS pos_is_office_location,
        pos.is_job_duration AS pos_is_job_duration,
        pos.is_supplemental_pay AS pos_is_supplemental_pay,
        pos.is_educational_requirement AS pos_is_educational_requirement,
        pos.is_interview_procedure AS pos_is_interview_procedure,
        pos.is_corporate_scope AS pos_is_corporate_scope,
        pos.is_posting_date AS pos_is_posting_date,
        pos.is_other AS pos_is_other,
        
        np.navigable_parent AS navigable_parent,
        np.is_header AS np_is_header,
        np.is_task_scope AS np_is_task_scope,
        np.is_minimum_qualification AS np_is_minimum_qualification,
        np.is_preferred_qualification AS np_is_preferred_qualification,
        np.is_legal_notification AS np_is_legal_notification,
        np.is_job_title AS np_is_job_title,
        np.is_office_location AS np_is_office_location,
        np.is_job_duration AS np_is_job_duration,
        np.is_supplemental_pay AS np_is_supplemental_pay,
        np.is_educational_requirement AS np_is_educational_requirement,
        np.is_interview_procedure AS np_is_interview_procedure,
        np.is_corporate_scope AS np_is_corporate_scope,
        np.is_posting_date AS np_is_posting_date,
        np.is_other AS np_is_other;'''
# Run the query held in cypher_str and load the returned rows into a DataFrame.
# The statement only MATCHes and RETURNs, so it is executed in a read transaction
# (a write transaction is unnecessary and, on a cluster, is routed to the leader).
row_objs_list = []
with cu.driver.session() as session:
    row_objs_list = session.read_transaction(cu.do_cypher_tx, cypher_str)

# pos_html_strs_df is only (re)built when the query returned at least one row
if row_objs_list:
    pos_html_strs_df = DataFrame(row_objs_list)

    # (47670, 30)
    print(pos_html_strs_df.shape)

(47670, 30)


In [9]:

# Prove that only unambiguous relationships exist: take the most frequent navigable
# parent and show its row(s) transposed, one property per line
navigable_parent = pos_html_strs_df.navigable_parent.value_counts().head(1).index[0]
mask_series = (pos_html_strs_df.navigable_parent == navigable_parent)
df = pos_html_strs_df[mask_series].T

# Keep only the properties that are set, i.e. anything other than False.
# The first column is selected by position because its label is the original row
# index, which is 0 only when the matching row happens to be the first one.
mask_series = (df.iloc[:, 0] != False)
df[mask_series]

Unnamed: 0,0
pos_symbol,H-TS
pos_is_header,True
pos_is_task_scope,True
navigable_parent,<div>Leadership &amp; Talent Management:</div>
np_is_header,True
np_is_task_scope,True


In [10]:

# Show up to five rows where any category flag on the PartsOfSpeech node differs from the
# same flag on the NavigableParents node it summarizes
# (is_header is not in the list, so header disagreements are not examined here)
features_list = ['is_task_scope', 'is_minimum_qualification', 'is_preferred_qualification', 'is_legal_notification', 'is_job_title',
                 'is_office_location', 'is_job_duration', 'is_supplemental_pay', 'is_educational_requirement', 'is_interview_procedure',
                 'is_corporate_scope', 'is_posting_date', 'is_other']
# Start from a scalar False; the first |= against a boolean Series turns it into a Series
mask_series = False
for feature_str in features_list:
    mask_series |= (pos_html_strs_df[f'pos_{feature_str}'] != pos_html_strs_df[f'np_{feature_str}'])
# Transpose so that each mismatching row becomes a column that reads top to bottom
df = pos_html_strs_df[mask_series].head(5).T
df

Unnamed: 0,4091,7169,7422,12084,12928
pos_symbol,O-TS,O-TS,O-TS,O-RQ,O-RQ
pos_is_header,False,False,False,False,False
pos_is_task_scope,True,True,True,False,False
pos_is_minimum_qualification,False,False,False,True,True
pos_is_preferred_qualification,False,False,False,False,False
pos_is_legal_notification,False,False,False,False,False
pos_is_job_title,False,False,False,False,False
pos_is_office_location,False,False,False,False,False
pos_is_job_duration,False,False,False,False,False
pos_is_supplemental_pay,False,False,False,False,False


In [11]:

columns_list = ['navigable_parent', 'pos_symbol', 'pos_is_header']
pos_html_strs_df[columns_list].sample(5)

Unnamed: 0,navigable_parent,pos_symbol,pos_is_header
3859,<ots>Lead the Compensation workstream to desig...,O-TS,False
33273,<ojt>Role ID: 4634030</ojt>,O-JT,False
43084,<oip>Role Primary Contact (Email ID): prashant...,O-IP,False
36196,<ojt>Role ID: 4587609</ojt>,O-JT,False
43567,"<oip>Role Primary Contact: Vora,Devang</oip>",O-IP,False



---
# Data Preparation
This stage involves preparing the dataset for training by transforming, cleaning, and pre-processing the data. Data preparation may include tasks such as feature selection, normalization, and data augmentation.

In [19]:

# Build the HTML-string -> POS-symbol lookup and the ground-truth label list
t1 = time.time()

# Keys are the navigable_parent strings, values their pos_symbol
part_of_speech_dict = pos_html_strs_df.set_index('navigable_parent')['pos_symbol'].to_dict()

# One single-element tuple per dictionary entry, in dictionary order
y_actual = [(actual_symbol,) for actual_symbol in part_of_speech_dict.values()]

duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Actual POS symbol list created in {duration_str}')

Actual POS symbol list created in 0 seconds


In [21]:

[f'crf.CRF.{fn}' for fn in dir(crf.CRF) if 'predict' in fn.lower()]

['crf.CRF.predict', 'crf.CRF.predict_marginals', 'crf.CRF.predict_marginals_single', 'crf.CRF.predict_single']

In [22]:

crf.CRF.predict?

[1;31mSignature:[0m [0mcrf[0m[1;33m.[0m[0mCRF[0m[1;33m.[0m[0mpredict[0m[1;33m([0m[0mX[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Make a prediction.

Parameters
----------
X : list of lists of dicts
    feature dicts in python-crfsuite format

Returns
-------
y : list of lists of strings
    predicted labels
[1;31mFile:[0m      c:\users\daveb\onedrive\documents\github\job-hunting\jh_env\lib\site-packages\sklearn_crfsuite\estimator.py
[1;31mType:[0m      method

In [16]:

# Predict one POS symbol per labeled HTML string with the saved CRF, in the same order
# as part_of_speech_dict so the result lines up with y_actual
# NOTE(review): predict_single is handed the raw HTML string. The help text recorded for
# crf.CRF.predict describes feature dicts in python-crfsuite format, so predict_single
# presumably expects a sequence of feature dicts for one item as well. The recorded
# classification report (only O-IP has non-zero scores) suggests the input is not being
# featurized -- confirm and build the features before predicting.
t1 = time.time()
from pandas import Series

y_predicted = []
# NOTE(review): the loop variable reuses (and overwrites) the global navigable_parent
for navigable_parent in part_of_speech_dict.keys():
    predictions_list = crf.CRF.predict_single(navigable_parent)
    # Default label when the model returns nothing
    pos_symbol = 'O-O'
    if predictions_list:
        # Majority vote over the returned labels
        pos_symbol = Series(predictions_list).mode().squeeze()
        # With a tie, mode() has several entries and squeeze() does not give a str;
        # fall back to the first returned label
        if type(pos_symbol) != str:
            pos_symbol = predictions_list[0]
    # Single-element tuple, matching the shape of the y_actual entries
    y_predicted.append((pos_symbol, ))
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Predicted labels created in {duration_str}')

Predicted labels created in 12 seconds


In [17]:

y_predicted[-2:]

[('O-IP',), ('O-IP',)]

In [18]:

y_actual[-2:]

[('O-O',), ('O-O',)]

In [19]:

# One-hot encode the actual and predicted labels so classification_report can score them
from sklearn.preprocessing import MultiLabelBinarizer

# Create the MultiLabelBinarizer object
mlb = MultiLabelBinarizer()

# Fit the label set on y_actual, then encode y_predicted with that same label set
# (only labels that occur in y_actual get a column)
y_test_transformed = mlb.fit_transform(y_actual)
y_pred_transformed = mlb.transform(y_predicted)
# Display the learned label order; it supplies the report's target_names
mlb.classes_

array(['H-CS', 'H-ER', 'H-IP', 'H-JD', 'H-JT', 'H-LN', 'H-O', 'H-OL',
       'H-PD', 'H-PQ', 'H-RQ', 'H-SP', 'H-TS', 'O-CS', 'O-ER', 'O-IP',
       'O-JD', 'O-JT', 'O-LN', 'O-O', 'O-OL', 'O-PD', 'O-PQ', 'O-RQ',
       'O-SP', 'O-TS'], dtype=object)

In [20]:

# Compute the classification report for the saved CRF as a DataFrame, one row per label
# (output_dict=True returns a dict instead of text; zero_division=0 is passed so labels
# with no predictions or no support get a score of 0)
pos_symbol_crf_df = DataFrame.from_dict(classification_report(y_test_transformed, y_pred_transformed, target_names=mlb.classes_,
                                                              zero_division=0, output_dict=True), orient='index')
pos_symbol_crf_df.index.name = 'pos_symbol'
# Best-recalled labels first
pos_symbol_crf_df.sort_values('recall', ascending=False)

Unnamed: 0_level_0,precision,recall,f1-score,support
pos_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
O-IP,0.10241,0.999385,0.185782,4882
micro avg,0.102349,0.102349,0.102349,47670
weighted avg,0.010488,0.102349,0.019026,47670
samples avg,0.102349,0.102349,0.102349,47670
macro avg,0.003939,0.038438,0.007145,47670
H-JT,0.0,0.0,0.0,77
H-LN,0.0,0.0,0.0,68
H-IP,0.0,0.0,0.0,103
H-JD,0.0,0.0,0.0,31
O-TS,0.0,0.0,0.0,6983



---
# Data Preparation
This stage involves preparing the dataset for training by transforming, cleaning, and pre-processing the data. Data preparation may include tasks such as feature selection, normalization, and data augmentation.

In [None]:

# Rebalance the data with the sampling strategy limit
from imblearn.under_sampling import RandomUnderSampler

# Get the sampling strategy limit: the median number of examples per POS symbol
vc_srs = pos_html_strs_df.pos_symbol.value_counts()
sampling_strategy_limit = int(vc_srs.median())
print(f'The median count for all symbols is close to {sampling_strategy_limit}. We are going to under-sample to that.')

# Get the random under-sampler: cap every symbol at the limit, leave rarer symbols untouched
counts_dict = pos_html_strs_df.groupby('pos_symbol').count().navigable_parent.to_dict()
sampling_strategy = {k: min(sampling_strategy_limit, v) for k, v in counts_dict.items()}

# Seed the sampler so that the same rows are kept on every run
rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)

# Define the tuple of arrays: X is a single column of HTML strings, y the 1-D label vector
resampled_data = rus.fit_resample(
    pos_html_strs_df.navigable_parent.values.reshape(-1, 1), pos_html_strs_df.pos_symbol.values
)

# Recreate the Pandas DataFrame
rebalanced_df = DataFrame(resampled_data[0], columns=['navigable_parent'])
rebalanced_df['pos_symbol'] = resampled_data[1]

# (4491, 2)
print(rebalanced_df.shape)
rebalanced_df.groupby('pos_symbol').count().sort_values('navigable_parent').head(10)

In [None]:

# Sentences to parse
sentences_list = rebalanced_df.navigable_parent.tolist()
sentences_list[-2:]

In [None]:

# Labels to apply
pos_symbols_list = rebalanced_df.pos_symbol.tolist()
pos_symbols_list[-2:]

In [None]:

# Tokenize the sentences
tokens_list = [html_regex_tokenizer(sentence) for sentence in sentences_list]
tokens_list[-2:]


---
# CRF Training using Parts-of-Speech

In [None]:

# Get the parts of speech
pos_tags_list = [pos_tag(tokens) for tokens in tokens_list]
pos_tags_list[-2:]

In [21]:

# Feature and label builders for the word / POS-tag CRF
def sent2features(sent):
    """Return one feature dict per item of ``sent``.

    Each dict comes from the module-level ``word2features``, looked up at call
    time, so a later redefinition of that name changes what this returns.
    """
    features = []
    for idx in range(len(sent)):
        features.append(word2features(sent, idx))

    return features


def sent2labels(pos_symbol, sent):
    """Return ``pos_symbol`` repeated once for every item of ``sent``."""

    return [pos_symbol for _ in range(len(sent))]


def word2features(sent, i):
    """Describe item ``i`` of ``sent`` by its word (index 0) and POS tag (index 1)."""
    token = sent[i]

    return {'word': token[0], 'postag': token[1]}

# Prepare the training and test data
# (pos_tags_list and pos_symbols_list come from the preceding cells; the recorded
# NameError shows this cell was last run without them)
X = [sent2features(pos_tags) for pos_tags in pos_tags_list]

# Every token of a sentence gets the sentence's POS symbol as its label.
# The loop variable is named pos_tags so it does not shadow nltk's pos_tag function.
y = [sent2labels(pos_symbol, pos_tags) for pos_tags, pos_symbol in zip(pos_tags_list, pos_symbols_list)]

# Seed the split so the reported scores are reproducible from run to run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the CRF model
pos_symbol_crf = CRF()

# Train the model
pos_symbol_crf.fit(X_train, y_train)

# Predict the labels for the test data
y_pred = pos_symbol_crf.predict(X_test)
y_pred[-2:], y_test[-2:]

NameError: name 'pos_tags_list' is not defined

In [None]:

X_test[-2:]


---
# Visualization

In [None]:

# One-hot encode the CRF's test labels and predictions for classification_report
# NOTE(review): MultiLabelBinarizer is already imported by an earlier cell; the re-import
# is redundant when the notebook is run from top to bottom
from sklearn.preprocessing import MultiLabelBinarizer

# Create the MultiLabelBinarizer object
mlb = MultiLabelBinarizer()

# Fit and transform the y_test and y_pred sequences
# (each item is a per-token label list, so a row marks which labels occur in that sentence)
y_test_transformed = mlb.fit_transform(y_test)
y_pred_transformed = mlb.transform(y_pred)

In [None]:

# Compute the classification report for the newly trained CRF as a DataFrame, one row per
# label (output_dict=True returns a dict instead of text)
# NOTE(review): this reassigns pos_symbol_crf_df, the name an earlier cell uses for the
# saved model's report
pos_symbol_crf_df = DataFrame.from_dict(classification_report(y_test_transformed, y_pred_transformed, target_names=mlb.classes_,
                                                              zero_division=0, output_dict=True), orient='index')
pos_symbol_crf_df.index.name = 'pos_symbol'
# Best-recalled labels first
pos_symbol_crf_df.sort_values('recall', ascending=False)

In [None]:

# Spot-check the trained CRF on a single HTML string: tokenize, POS-tag, featurize, predict
# NOTE(review): navigable_parent is whatever an earlier cell left behind (it is assigned by
# the unambiguity check and reused as a loop variable when building y_predicted)
# NOTE(review): this overwrites tokens_list, pos_tags_list, X and y_pred with single-item
# lists, so any later cell that reuses those names no longer sees the full data set
tokens_list = [html_regex_tokenizer(navigable_parent)]
pos_tags_list = [pos_tag(tokens) for tokens in tokens_list]
X = [sent2features(pos_tags) for pos_tags in pos_tags_list]
y_pred = pos_symbol_crf.predict(X)
y_pred

In [None]:

# Build per-token label sequences for the logistic-regression classifier so it can be
# scored the same way as the CRF: its single predicted symbol, and the actual symbol,
# are each repeated once per token of the sentence
y_pred = []
y_test = []
for sentence, ys_list in zip(sentences_list, y):

    # The actual symbol is the first label of the sentence; skip sentences without one
    test_symbol = ys_list[0] if ys_list else None
    if test_symbol is None:
        continue

    # Tokenize from sentences_list instead of reading the global tokens_list: the
    # single-string spot-check cell replaces tokens_list with a one-item list, which
    # would silently cut this evaluation down to a single sentence
    token_count = len(html_regex_tokenizer(sentence))
    y_pred.append([slrcu.predict_single(sentence)] * token_count)
    y_test.append([test_symbol] * token_count)
y_pred[-2:], y_test[-2:]

In [None]:

# One-hot encode the logistic-regression labels and predictions for classification_report
# NOTE(review): MultiLabelBinarizer is already imported by an earlier cell; the re-import
# is redundant when the notebook is run from top to bottom
from sklearn.preprocessing import MultiLabelBinarizer

# Create the MultiLabelBinarizer object
mlb = MultiLabelBinarizer()

# Fit and transform the y_test and y_pred sequences
# (each item is a per-token label list, so a row marks which labels occur in that sentence)
y_test_transformed = mlb.fit_transform(y_test)
y_pred_transformed = mlb.transform(y_pred)
# Display the learned label order; it supplies the report's target_names
mlb.classes_

In [None]:

# Compute the classification report for the logistic-regression predictions as a
# DataFrame, one row per label (output_dict=True returns a dict instead of text)
pos_symbol_lr_df = DataFrame.from_dict(classification_report(y_test_transformed, y_pred_transformed, target_names=mlb.classes_,
                                                             zero_division=0, output_dict=True), orient='index')
pos_symbol_lr_df.index.name = 'pos_symbol'
# Best-recalled labels first
pos_symbol_lr_df.sort_values('recall', ascending=False)


---
# Model Selection
This stage involves selecting the appropriate model or algorithm for the task at hand. Model selection may involve tasks such as hyperparameter tuning, cross-validation, and model evaluation.

In [10]:

import random

# Load the saved header patterns; start from an empty mapping when the pickle is missing
# so the failure below is explicit instead of a NameError
if s and s.pickle_exists('HEADER_PATTERN_DICT'):
    header_pattern_dict = s.load_object('HEADER_PATTERN_DICT')
else:
    header_pattern_dict = {}

# Pick a random saved HTML file that has an entry in header_pattern_dict.
# Filtering first (instead of re-drawing until a match turns up) gives the same uniform
# choice but cannot loop forever when no file qualifies.
SAVES_HTML_FOLDER = os.path.join(s.saves_folder, 'html')
files_list = os.listdir(SAVES_HTML_FOLDER)
candidate_files_list = [fn for fn in files_list if fn in header_pattern_dict]
if not candidate_files_list:
    raise ValueError(f'No file in {SAVES_HTML_FOLDER} has an entry in HEADER_PATTERN_DICT')
file_name = random.choice(candidate_files_list)

# Turn the file's child strings into the feature tuples the CRF helper consumes
child_strs_list = ha.get_child_strs_from_file(file_name=file_name)
child_tags_list = ha.get_child_tags_list(child_strs_list)
feature_dict_list = cu.get_feature_dict_list(child_tags_list, child_strs_list)
feature_tuple_list = []
for feature_dict in feature_dict_list:
    feature_tuple_list.append(hc.get_feature_tuple(
        feature_dict, pos_lr_predict_single=slrcu.predict_single, pos_crf_predict_single=scrfcu.predict_single,
        pos_sgd_predict_single=ssgdcu.predict_single
    ))

# List the feature names produced for the middle item of the file (a middle position is
# presumably chosen so that both look-behind and look-ahead features are present),
# shortest name first
middle_index = len(feature_tuple_list) // 2
features_list = sorted(crf.word2features(feature_tuple_list, middle_index).keys(), key=len)
features_list

['postag', 'position', '-3:postag', '-2:postag', '-1:postag', '+0:postag', '+1:postag', '+2:postag', '+3:postag', 'tag.span_set', 'tag.lists_set', 'tag.forms_set', 'tag.anchor_set', 'tag.tables_set', 'tag.frames_set', 'tag.general_set', '-3:tag.span_set', '-2:tag.span_set', '-1:tag.span_set', '+0:tag.span_set', '+1:tag.span_set', '+2:tag.span_set', '+3:tag.span_set', 'tag.null_element', '-3:previous==tag', '-3:tag.lists_set', '-3:tag.forms_set', '-2:previous==tag', '-2:tag.lists_set', '-2:tag.forms_set', '-1:previous==tag', '-1:tag.lists_set', '-1:tag.forms_set', '+0:previous==tag', '+0:tag.lists_set', '+0:tag.forms_set', '+1:previous==tag', '+1:tag.lists_set', '+1:tag.forms_set', '+2:previous==tag', '+2:tag.lists_set', '+2:tag.forms_set', '+3:previous==tag', '+3:tag.lists_set', '+3:tag.forms_set', '-3:tag.anchor_set', '-3:tag.tables_set', '-3:tag.frames_set', '-2:tag.anchor_set', '-2:tag.tables_set', '-2:tag.frames_set', '-1:tag.anchor_set', '-1:tag.tables_set', '-1:tag.frames_set', '

In [None]:

features_list = ['+1:postag', '+1:tag.basic_text_set', '+1:tag.block_elements_set', '+1:tag.document_body_elements_set',
                 '+1:tag.inline_elements_set', '+1:tag.lists_set', '+1:tag.null_element',
                 '+1:tag.other_block_elements_set', '+1:tag.phrase_elements_set', '+1:tag.presentation_set', '+1:tag.section_headings_set',
                 '+1:tag==previous', '+2:postag', '+2:tag.basic_text_set',
                 '+2:tag.block_elements_set', '+2:tag.document_body_elements_set', '+2:tag.inline_elements_set', '+2:tag.lists_set',
                 '+2:tag.null_element', '+2:tag.other_block_elements_set', '+2:tag.phrase_elements_set',
                 '+2:tag.presentation_set', '+2:tag.section_headings_set', '+2:tag==previous', '+3:postag', '+3:tag.basic_text_set',
                 '+3:tag.block_elements_set', '+3:tag.document_body_elements_set',
                 '+3:tag.inline_elements_set', '+3:tag.lists_set', '+3:tag.null_element', '+3:tag.other_block_elements_set',
                 '+3:tag.phrase_elements_set', '+3:tag.presentation_set', '+3:tag.section_headings_set',
                 '+3:tag==previous', '-1:postag', '-1:previous==tag', '-1:tag.basic_text_set', '-1:tag.block_elements_set',
                 '-1:tag.document_body_elements_set', '-1:tag.inline_elements_set', '-1:tag.lists_set',
                 '-1:tag.null_element', '-1:tag.other_block_elements_set', '-1:tag.phrase_elements_set', '-1:tag.presentation_set',
                 '-1:tag.section_headings_set', 'bias', 'child_str.pos_lr_predict_single', 'position',
                 'postag', 'tag.basic_text_set', 'tag.block_elements_set', 'tag.consecutive_next_tags', 'tag.document_body_elements_set',
                 'tag.inline_elements_set', 'tag.lists_set', 'tag.null_element',
                 'tag.other_block_elements_set', 'tag.phrase_elements_set', 'tag.presentation_set', 'tag.section_headings_set', 'BOS', 'EOS']
# Print, for every position of a 10-token sentence, the boundary marker (BOS / EOS) and
# the '<sign><offset>:postag' key of every neighbour within three positions that exists
token_count = 10
for i in range(token_count):
    if i == 0:
        print('BOS')
    if i == token_count - 1:
        print('EOS')
    for word_offset in range(-3, 4):

        # Skip offsets that would fall before the first or after the last token
        if not (0 <= i + word_offset < token_count):
            continue
        sign_str = '+' if word_offset >= 0 else '-'
        feature_key = f'{sign_str}{abs(word_offset)}:postag'
        print(feature_key)

In [11]:

cypher_str = '''
    // Find all files that contain labeled HTML strings
    MATCH (np:NavigableParents)  // Find all nodes with the label `NavigableParents` and assign them to the variable `np`
    WHERE
        (np.is_header IS NOT NULL)  // Filter nodes that have the property `is_header`
        AND (np.is_task_scope IS NOT NULL)  // Filter nodes that have the property `is_task_scope`
        AND (np.is_minimum_qualification IS NOT NULL)  // Filter nodes that have the property `is_minimum_qualification`
        AND (np.is_preferred_qualification IS NOT NULL)  // Filter nodes that have the property `is_preferred_qualification`
        AND (np.is_legal_notification IS NOT NULL)  // Filter nodes that have the property `is_legal_notification`
        AND (np.is_job_title IS NOT NULL)  // Filter nodes that have the property `is_job_title`
        AND (np.is_office_location IS NOT NULL)  // Filter nodes that have the property `is_office_location`
        AND (np.is_job_duration IS NOT NULL)  // Filter nodes that have the property `is_job_duration`
        AND (np.is_supplemental_pay IS NOT NULL)  // Filter nodes that have the property `is_supplemental_pay`
        AND (np.is_educational_requirement IS NOT NULL)  // Filter nodes that have the property `is_educational_requirement`
        AND (np.is_interview_procedure IS NOT NULL)  // Filter nodes that have the property `is_interview_procedure`
        AND (np.is_corporate_scope IS NOT NULL)  // Filter nodes that have the property `is_corporate_scope`
        AND (np.is_posting_date IS NOT NULL)  // Filter nodes that have the property `is_posting_date`
        AND (np.is_other IS NOT NULL)  // Filter nodes that have the property `is_other`
    WITH np  // Store the value of each `np` node
    
    // Find all nodes connected to `np` by an outgoing `NEXT` relationship, and assign the relationship to `r`
    MATCH (np:NavigableParents)-[r:NEXT]->(:NavigableParents)
    
    // Return the value of the `file_name` property of the `r` relationship, with the alias `file_name`
    RETURN DISTINCT r.file_name AS file_name;
'''
# Run the query held in cypher_str and load the distinct file names into a DataFrame.
# The statement only MATCHes and RETURNs, so it is executed in a read transaction
# (a write transaction is unnecessary and, on a cluster, is routed to the leader).
row_objs_list = []
with cu.driver.session() as session:
    row_objs_list = session.read_transaction(cu.do_cypher_tx, cypher_str)

# filenames_df is only (re)built when the query returned at least one row
if row_objs_list:
    filenames_df = DataFrame(row_objs_list)

    # (7896, 1) on the last recorded run; the row count changes as files are added
    print(filenames_df.shape)

(7896, 1)


In [12]:

from sklearn.preprocessing import MultiLabelBinarizer

# Create the MultiLabelBinarizer object
mlb = MultiLabelBinarizer()

In [13]:

from tqdm import tqdm_notebook as tqdm
# from tqdm.notebook import tqdm
import random
from itertools import groupby

# Assume only one feature to be used in the CRF model
def word2features(feature_tuples_list, i):
    """Build a one-entry feature dictionary for position ``i``, selected by ``FEATURE_STR``.

    Only the feature named by the module-level ``FEATURE_STR`` is computed, so a
    model can be fitted on one candidate feature at a time.

    Parameters
    ----------
    feature_tuples_list : sequence of 3-item sequences
        One item per position; index 0 is read as the tag, index 1 as the child
        string and index 2 as the part-of-speech label.
    i : int
        Position of the item to describe.

    Returns
    -------
    dict
        ``{FEATURE_STR: value}``. The value is ``'O'`` when ``FEATURE_STR`` names no
        known feature, or names a neighbour (or ``tag.consecutive_next_tags``
        window) that does not fit inside the sequence at position ``i``.

    Notes
    -----
    Reads the globals ``FEATURE_STR``, ``cu`` and, for the ``child_str.*`` features,
    ``scrfcu`` / ``slrcu`` / ``ssgdcu``. The element sets are looked up on ``cu``
    with ``getattr`` by the names listed in ``cu.elements_sets_list``.
    """
    null_element = 'plaintext'
    this_feature_tuple = feature_tuples_list[i]
    this_tag = this_feature_tuple[0]
    child_str = this_feature_tuple[1]
    this_pos = this_feature_tuple[2]
    token_count = len(feature_tuples_list)

    # Look-behind features ('-1:', '-2:', '-3:') need that many preceding items
    for offset in (1, 2, 3):
        if i < offset:
            break
        previous_tag = feature_tuples_list[i - offset][0]

        # Every look-behind 'previous==tag' compares the earlier tag with the current tag
        if (FEATURE_STR == f'-{offset}:previous==tag'):
            return {FEATURE_STR: previous_tag == this_tag}
        for es in cu.elements_sets_list:
            if (FEATURE_STR == f'-{offset}:tag.{es}'):
                return {FEATURE_STR: previous_tag in getattr(cu, es)}
        if (FEATURE_STR == f'-{offset}:tag.null_element'):
            return {FEATURE_STR: previous_tag == null_element}
        if (FEATURE_STR == f'-{offset}:postag'):
            return {FEATURE_STR: feature_tuples_list[i - offset][2]}

    # Look-ahead features ('+1:', '+2:', '+3:') need that many following items
    for offset in (1, 2, 3):
        if i >= token_count - offset:
            break
        ahead_tag = feature_tuples_list[i + offset][0]
        for es in cu.elements_sets_list:
            if (FEATURE_STR == f'+{offset}:tag.{es}'):
                return {FEATURE_STR: ahead_tag in getattr(cu, es)}
        if (FEATURE_STR == f'+{offset}:tag.null_element'):
            return {FEATURE_STR: ahead_tag == null_element}

        # Look-ahead 'previous==tag' compares a tag with the one immediately before it
        if (FEATURE_STR == f'+{offset}:previous==tag'):
            return {FEATURE_STR: ahead_tag == feature_tuples_list[i + offset - 1][0]}
        if (FEATURE_STR == f'+{offset}:postag'):
            return {FEATURE_STR: feature_tuples_list[i + offset][2]}

        # Only computed when at least two items follow position i
        if (offset == 2) and (FEATURE_STR == 'tag.consecutive_next_tags'):
            labels_list = [label for _tag, _child_str, label in feature_tuples_list][i:]

            # Length of each run of identical labels from position i onward; the feature
            # is the length of the second run, or 0 when there is only one run
            run_lengths_list = [len(list(run)) for _label, run in groupby(labels_list)]
            consecutive_next_tags = run_lengths_list[1] if len(run_lengths_list) > 1 else 0

            return {FEATURE_STR: consecutive_next_tags}

    # Features of the current position
    if (FEATURE_STR == 'BOS'):
        return {FEATURE_STR: (i <= 0)}
    if (FEATURE_STR == 'EOS'):
        return {FEATURE_STR: (i >= token_count-1)}

    # Classifier-based features are only available once the classifier has been fitted
    if (FEATURE_STR == 'child_str.pos_crf_predict_single') and hasattr(scrfcu, 'pos_predict_percent_fit_dict'):
        return {FEATURE_STR: scrfcu.predict_single(child_str)}
    if (FEATURE_STR == 'child_str.pos_lr_predict_single') and hasattr(slrcu, 'pos_predict_percent_fit_dict'):
        return {FEATURE_STR: slrcu.predict_single(child_str)}
    if (FEATURE_STR == 'child_str.pos_sgd_predict_single') and hasattr(ssgdcu, 'pos_predict_percent_fit_dict'):
        return {FEATURE_STR: ssgdcu.predict_single(child_str)}
    if (FEATURE_STR == 'position'):
        return {FEATURE_STR: i+1}
    if (FEATURE_STR in ['+0:postag', 'postag']):
        return {FEATURE_STR: this_pos}
    for es in cu.elements_sets_list:
        if (FEATURE_STR in [f'+0:tag.{es}', f'tag.{es}']):
            return {FEATURE_STR: this_tag in getattr(cu, es)}
    if (FEATURE_STR in ['+0:tag.null_element', 'tag.null_element']):
        return {FEATURE_STR: this_tag == null_element}

    # Unknown feature name, or a neighbour outside the sequence
    return {FEATURE_STR: 'O'}

In [14]:

def get_pos_and_feature_tuples_list(file_name):
    """
    Build the aligned label and feature sequences for a single file.

    The child strings of the file are read through ``ha``, turned into
    feature dictionaries by ``cu`` and into feature tuples by ``hc`` (with
    every ``pos_*_predict_single`` argument left as None), and finally passed
    through ``sent2features``. Child strings that have no entry in
    ``part_of_speech_dict`` are dropped together with their features.

    Parameters
    ----------
    file_name : str
        Name of the file handed to ``ha.get_child_strs_from_file``.

    Returns
    -------
    tuple of (list, list)
        ``pos_list`` holds the labels found in ``part_of_speech_dict`` and
        ``feature_tuples_list`` holds the matching ``sent2features`` items,
        in the original order.
    """
    child_strs_list = ha.get_child_strs_from_file(file_name=file_name)
    child_tags_list = ha.get_child_tags_list(child_strs_list)

    # One feature tuple per feature dictionary, with no classifier predictions attached
    feature_tuple_list = [
        hc.get_feature_tuple(
            feature_dict, pos_lr_predict_single=None, pos_crf_predict_single=None, pos_sgd_predict_single=None
        )
        for feature_dict in cu.get_feature_dict_list(child_tags_list, child_strs_list)
    ]

    # Unlabeled child strings come back as None here
    candidate_labels = [part_of_speech_dict.get(child_str) for child_str in child_strs_list]
    candidate_features = sent2features(feature_tuple_list)

    # Keep only the positions that actually carry a label
    labeled_pairs = [
        (label, features)
        for label, features in zip(candidate_labels, candidate_features)
        if label is not None
    ]
    pos_list = [label for label, _ in labeled_pairs]
    feature_tuples_list = [features for _, features in labeled_pairs]

    return pos_list, feature_tuples_list

In [15]:

def get_row_dict(X_train, X_test, y_train, y_test):
    """
    Fit the module-level ``crf`` and summarize how well it predicts O-RQ.

    The function relies on three names from the enclosing notebook scope:
    ``crf`` (the model that is fitted here), ``mlb`` (the binarizer used to
    turn label sequences into indicator rows) and ``FEATURE_STR`` (the name
    of the feature currently being evaluated).

    Parameters
    ----------
    X_train, X_test : list
        Feature sequences used for fitting and for prediction.
    y_train, y_test : list
        Label sequences aligned with ``X_train`` and ``X_test``.

    Returns
    -------
    dict
        A row with the keys ``feature_str``, ``o_rq_recall`` and
        ``o_rq_f1_score``. Both metrics are NaN when the classification
        report has no ``O-RQ`` row (for instance when no test sequence
        contains that label), so the resulting column stays numeric.
    """
    crf.fit(X_train, y_train)

    # Predict the labels for the test data
    y_pred = crf.predict(X_test)

    # Fit and transform the y_test and y_pred sequences
    y_test_transformed = mlb.fit_transform(y_test)
    y_pred_transformed = mlb.transform(y_pred)

    # Compute the classification report
    report_dict = classification_report(
        y_test_transformed, y_pred_transformed, target_names=mlb.classes_, zero_division=0, output_dict=True
    )
    df = DataFrame.from_dict(report_dict, orient='index')

    def _o_rq_metric(column_name):
        # Squeezing an empty selection would hand back an empty Series
        # instead of a scalar, so fall back to NaN when O-RQ is absent
        if 'O-RQ' in df.index:
            return df.loc['O-RQ', column_name]
        return float('nan')

    # Grab the recall and f1 score of the O-RQ classification
    # as our metrics and add it as a row in our to-be-dataframe
    # (A high recall is desirable when the cost of a false negative prediction is high)
    row_dict = {
        'feature_str': FEATURE_STR,
        'o_rq_recall': _o_rq_metric('recall'),
        'o_rq_f1_score': _o_rq_metric('f1-score')
    }

    return row_dict

In [None]:

# Number of files used for the comparison, and the seed that keeps both the
# file sample and the train/test split identical for every feature
O_RQ_FILES_COUNT = 200
O_RQ_RANDOM_STATE = 0

# The per-feature evaluation is expensive, so reuse the pickled metrics when available
if s.pickle_exists('o_rq_metrics_df'):
    o_rq_metrics_df = s.load_object('o_rq_metrics_df')
else:
    rows_list = []
    pbar_features = tqdm(features_list, total=len(features_list))

    # Sample WITHOUT replacement: a file drawn twice could land in both the
    # training and the test split and inflate the scores
    # (assumes filenames_df.file_name is a pandas Series -- TODO confirm)
    unique_file_names = filenames_df.file_name.drop_duplicates()
    files_list = unique_file_names.sample(
        n=min(O_RQ_FILES_COUNT, len(unique_file_names)), random_state=O_RQ_RANDOM_STATE
    ).tolist()

    # FEATURE_STR is deliberately a module-level name: get_row_dict reads it
    for FEATURE_STR in pbar_features:
        pbar_features.set_description(FEATURE_STR)

        # Prepare the training and test data
        X = []
        y = []
        for file_name in files_list:
            pos_list, feature_tuples_list = get_pos_and_feature_tuples_list(file_name)
            y.append(pos_list)
            X.append(feature_tuples_list)

        # Same seed for every feature, so each one is scored on the same split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=O_RQ_RANDOM_STATE
        )

        # Create the CRF model (module-level, because get_row_dict fits it)
        crf = CRF()

        # Train the model; a failure on one feature is reported and skipped
        # so that the remaining features are still evaluated
        try:
            row_dict = get_row_dict(X_train, X_test, y_train, y_test)
            rows_list.append(row_dict)

        except Exception as e:
            print(f'{e.__class__.__name__} error in {FEATURE_STR}: {str(e).strip()}')
    o_rq_metrics_df = DataFrame(rows_list)
    s.store_objects(o_rq_metrics_df=o_rq_metrics_df)

  0%|          | 0/212 [00:00<?, ?it/s]

In [None]:

# Keep only the features whose O-RQ f1 score beats the score obtained with
# the parts-of-speech tag alone, ordered from the lowest to the highest f1 score
mask_series = (o_rq_metrics_df.feature_str == 'postag')
o_rq_f1_score = o_rq_metrics_df.loc[mask_series, 'o_rq_f1_score'].squeeze()
mask_series = (o_rq_metrics_df.o_rq_f1_score > o_rq_f1_score)
features_list = (
    o_rq_metrics_df.loc[mask_series]
    .sort_values(by='o_rq_f1_score', ascending=True)
    .feature_str
    .tolist()
)

In [None]:

# Report the O-RQ f1 score obtained with the parts-of-speech tag alone, then
# show the features whose O-RQ recall is lower than the postag-only recall,
# lowest recall first
mask_series = (o_rq_metrics_df.feature_str == 'postag')
postag_metrics_df = o_rq_metrics_df.loc[mask_series]
o_rq_f1_score = postag_metrics_df.o_rq_f1_score.squeeze()
print(f'O-RQ just using the postag gets an f1 score of {o_rq_f1_score}')
o_rq_recall = postag_metrics_df.o_rq_recall.squeeze()
mask_series = (o_rq_metrics_df.o_rq_recall < o_rq_recall)
o_rq_metrics_df.loc[mask_series].sort_values(by='o_rq_recall', ascending=True)

In [None]:

# Join the names of the features whose O-RQ recall is under 0.8 into a
# parenthesized, pipe-separated alternation
mask_series = (o_rq_metrics_df.o_rq_recall < 0.8)
f"({'|'.join(o_rq_metrics_df.loc[mask_series, 'feature_str'])})"


---
# Data Exploration

In [None]:

import random

# Pick one of the sequences stored in X at random and show it
# NOTE(review): X is only built when the metrics pickle is missing -- confirm it exists on a fresh run
random_feature_dicts_list = random.choice(X)
print(random_feature_dicts_list)

# An empty sequence gives the model nothing to predict, so skip it
if random_feature_dicts_list:
    pos_symbols_list = pos_symbol_crf.predict_single(random_feature_dicts_list)

    # Every item of the sequence is expected to receive the same symbol
    distinct_pos_symbols = set(pos_symbols_list)
    assert len(distinct_pos_symbols) == 1
    display(pos_symbols_list[0])

In [None]:

# List the navigable parents whose part-of-speech symbol is H-JD
mask_series = (pos_html_strs_df.pos_symbol == 'H-JD')
pos_html_strs_df.loc[mask_series, 'navigable_parent'].tolist()

In [None]:

# List the navigable parents whose part-of-speech symbol is H-PD
mask_series = (pos_html_strs_df.pos_symbol == 'H-PD')
pos_html_strs_df.loc[mask_series, 'navigable_parent'].tolist()

In [None]:

# List the navigable parents whose part-of-speech symbol is O-O
mask_series = (pos_html_strs_df.pos_symbol == 'O-O')
pos_html_strs_df.loc[mask_series, 'navigable_parent'].tolist()