In [1]:

%pprint

Pretty printing has been turned OFF



---
# Load needed libraries and functions

In [2]:

from neo4j.exceptions import ServiceUnavailable
from nltk import pos_tag
from pandas import DataFrame
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
import humanize
import nltk
import os
import re
import sys
import time
import warnings

# Silence every warning for the rest of the session
warnings.filterwarnings('ignore')

# Duration in milliseconds and frequency in Hz
duration, freq = 1000, 880

# Position 0 is the script path (or '' in a REPL), so the local modules folder goes in at position 1
sys.path.insert(1, '../py')

In [3]:

t0 = time.time()

# Create the storage object that the other utilities share
from storage import Storage
s = Storage(
    data_folder_path=os.path.abspath('../data'),
    saves_folder_path=os.path.abspath('../saves')
)

from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

# Read the Neo4j connection settings out of the secrets file
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(
    s=s,
    secrets_json_path=os.path.abspath('../data/secrets/jh_secrets.json')
)
neo4j_secrets_dict = wsu.secrets_json['neo4j']
uri = neo4j_secrets_dict['connect_url']
user = neo4j_secrets_dict['username']
password = neo4j_secrets_dict['password']

# Create the Cypher helper (driver=None is passed, so no existing driver is handed in)
from cypher_utils import CypherUtilities
cu = CypherUtilities(
    uri=uri, user=user, password=password, driver=None, s=s, ha=ha
)

# Report the server agent string; an unreachable server stops the notebook,
# while any other failure is only printed
try:
    version_str = cu.driver.get_server_info().agent
    print(f'======== {version_str} ========')
except ServiceUnavailable:
    print('You need to start Neo4j as a console')
    raise
except Exception as e:
    print(f'{e.__class__}: {str(e).strip()}')

# Create the remaining helpers, each built on the ones above
from hc_utils import HeaderCategories
hc = HeaderCategories(cu=cu, verbose=False)

from lr_utils import LrUtilities
lru = LrUtilities(ha=ha, cu=cu, hc=hc, verbose=False)

from crf_utils import CrfUtilities
crf = CrfUtilities(ha=ha, hc=hc, cu=cu, lru=lru, verbose=True)

elapsed_seconds = time.time() - t0
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
print(f'Utility libraries created in {duration_str}')

Utility libraries created in 3 seconds


In [4]:

# Build the parts-of-speech CRF elements only when crf does not expose its predictor yet
# (an earlier run reported 46,327 labeled parts of speech, built in 19 minutes and 50 seconds)
t0 = time.time()
pos_crf_is_built = hasattr(crf, 'pos_crf_predict_single')
if not pos_crf_is_built:
    crf.build_pos_conditional_random_field_elements(verbose=True)
elapsed_seconds = time.time() - t0
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech CRF elements built in {duration_str}')

I have 46,408 labeled parts of speech in here
Parts-of-speech CRF elements built in 20 minutes and 56 seconds


In [None]:

# Load the stored object named 'pos_tag_description_dict' (presumably tag -> description; verify against the save).
# The tag set is from https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
pos_tag_description_dict = s.load_object('pos_tag_description_dict')

In [None]:

# Token pattern: the start of an opening or closing tag (without its '>'), a word that may be
# followed by '#' or '+' characters, or one of the punctuation marks ':', '.' and '?'
HTML_SCANNER_REGEX = re.compile(r'</?\w+|\w+[#\+]*|:|\.|\?')
def html_regex_tokenizer(corpus):
    """Return every HTML_SCANNER_REGEX match found in corpus, in order of appearance."""
    return HTML_SCANNER_REGEX.findall(corpus)


---
# Data Exploration
This stage involves loading and examining the dataset to gain a better understanding of its structure, content, and quality. Data exploration may involve tasks such as data visualization, summary statistics, and data cleaning.

In [None]:

# Run cu.populate_pos_relationships and report how long it took
t0 = time.time()
cu.populate_pos_relationships(verbose=True)
elapsed_seconds = time.time() - t0
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech relationships repopulated in {duration_str}')

In [5]:

# Fetch every NavigableParents node that is summarized by exactly one PartsOfSpeech node,
# together with the category flags stored on both nodes
cypher_str = '''
    // Filter for NavigableParents nodes with an unambiguous SUMMARIZES relationship
    MATCH (np:NavigableParents)
    WHERE size((np)<-[:SUMMARIZES]-(:PartsOfSpeech)) = 1
    WITH np
    
    // Find all NavigableParents nodes in the graph with an incoming SUMMARIZES relationship to a PartsOfSpeech node
    MATCH (np)<-[r:SUMMARIZES]-(pos:PartsOfSpeech)
    
    // Return the navigable parent and important properties
    RETURN
        pos.pos_symbol AS pos_symbol,
        pos.is_header AS pos_is_header,
        pos.is_task_scope AS pos_is_task_scope,
        pos.is_minimum_qualification AS pos_is_minimum_qualification,
        pos.is_preferred_qualification AS pos_is_preferred_qualification,
        pos.is_legal_notification AS pos_is_legal_notification,
        pos.is_job_title AS pos_is_job_title,
        pos.is_office_location AS pos_is_office_location,
        pos.is_job_duration AS pos_is_job_duration,
        pos.is_supplemental_pay AS pos_is_supplemental_pay,
        pos.is_educational_requirement AS pos_is_educational_requirement,
        pos.is_interview_procedure AS pos_is_interview_procedure,
        pos.is_corporate_scope AS pos_is_corporate_scope,
        pos.is_posting_date AS pos_is_posting_date,
        pos.is_other AS pos_is_other,
        
        np.navigable_parent AS navigable_parent,
        np.is_header AS np_is_header,
        np.is_task_scope AS np_is_task_scope,
        np.is_minimum_qualification AS np_is_minimum_qualification,
        np.is_preferred_qualification AS np_is_preferred_qualification,
        np.is_legal_notification AS np_is_legal_notification,
        np.is_job_title AS np_is_job_title,
        np.is_office_location AS np_is_office_location,
        np.is_job_duration AS np_is_job_duration,
        np.is_supplemental_pay AS np_is_supplemental_pay,
        np.is_educational_requirement AS np_is_educational_requirement,
        np.is_interview_procedure AS np_is_interview_procedure,
        np.is_corporate_scope AS np_is_corporate_scope,
        np.is_posting_date AS np_is_posting_date,
        np.is_other AS np_is_other;'''
row_objs_list = []
with cu.driver.session() as session:

    # The query only reads from the graph, so run it as a managed read transaction
    row_objs_list = session.execute_read(cu.do_cypher_tx, cypher_str)

# pos_html_strs_df is only (re)created when the query returned rows
if row_objs_list:
    pos_html_strs_df = DataFrame(row_objs_list)
    
    # (46327, 30) on an earlier run
    print(pos_html_strs_df.shape)

(46408, 30)


In [None]:

# Prove that only unambiguous relationships exist: take the most frequent navigable parent
# and show the fields of its first row that are not the string 'False'
navigable_parent = pos_html_strs_df.navigable_parent.value_counts().head(1).index[0]
mask_series = (pos_html_strs_df.navigable_parent == navigable_parent)
df = pos_html_strs_df[mask_series].T

# After the transpose the columns carry the original row labels, so the first
# column has to be taken by position; a lookup by the label 0 only works when
# the matching row happens to be row 0
# NOTE(review): the comparison assumes the flags are stored as the strings 'True'/'False' - confirm
mask_series = (df.iloc[:, 0] != 'False')
df[mask_series]

In [None]:

features_list = ['is_task_scope', 'is_minimum_qualification', 'is_preferred_qualification', 'is_legal_notification', 'is_job_title', 'is_office_location', 'is_job_duration', 'is_supplemental_pay',
                 'is_educational_requirement', 'is_interview_procedure', 'is_corporate_scope', 'is_posting_date', 'is_other']

# Keep the rows where any PartsOfSpeech flag differs from the matching NavigableParents flag
mask_series = False
for feature_str in features_list:
    pos_column_name, np_column_name = f'pos_{feature_str}', f'np_{feature_str}'
    mask_series = mask_series | (pos_html_strs_df[pos_column_name] != pos_html_strs_df[np_column_name])

# Show up to five of those rows, one per column
df = pos_html_strs_df[mask_series].head(5).T
df

In [None]:

# Show five random rows of the HTML string, its part-of-speech symbol, and the PartsOfSpeech header flag
columns_list = ['navigable_parent', 'pos_symbol', 'pos_is_header']
pos_html_strs_df[columns_list].sample(5)


---
# Data Preparation
This stage involves preparing the dataset for training by transforming, cleaning, and pre-processing the data. Data preparation may include tasks such as feature selection, normalization, and data augmentation.

In [43]:

# Build the navigable parent -> symbol lookup here as well, because this cell sits before
# the cell that otherwise defines it and would fail on a freshly restarted kernel
part_of_speech_dict = pos_html_strs_df.set_index('navigable_parent').pos_symbol.to_dict()

# Predict one symbol per distinct navigable parent, wrapped in a 1-tuple, in the dictionary's key order
t0 = time.time()
y_predicted = [(crf.pos_crf_predict_single(navigable_parent), ) for navigable_parent in part_of_speech_dict.keys()]
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
print(f'Predicted labels created in {duration_str}')

Predicted labels created in 34 seconds


In [39]:

# Prepare the actual labels: map every distinct navigable parent to its symbol,
# then wrap each symbol in a 1-tuple, in the dictionary's order
part_of_speech_dict = pos_html_strs_df.set_index('navigable_parent').pos_symbol.to_dict()
y_actual = [(pos_symbol, ) for pos_symbol in part_of_speech_dict.values()]

In [44]:

# Show the last two predicted label tuples
y_predicted[-2:]

[('O-RQ',), ('O-O',)]

In [40]:

# Show the last two actual label tuples
y_actual[-2:]

[('O-O',), ('O-O',)]

In [45]:

from sklearn.preprocessing import MultiLabelBinarizer

# Create the MultiLabelBinarizer object
mlb = MultiLabelBinarizer()

# Fit the binarizer on the actual labels, then encode the predicted labels with the same classes
y_test_transformed = mlb.fit_transform(y_actual)
y_pred_transformed = mlb.transform(y_predicted)
mlb.classes_

array(['H-CS', 'H-ER', 'H-IP', 'H-JD', 'H-JT', 'H-LN', 'H-O', 'H-OL',
       'H-PD', 'H-PQ', 'H-RQ', 'H-SP', 'H-TS', 'O-CS', 'O-ER', 'O-IP',
       'O-JD', 'O-JT', 'O-LN', 'O-O', 'O-OL', 'O-PD', 'O-PQ', 'O-RQ',
       'O-SP', 'O-TS'], dtype=object)

In [46]:

# Compute the classification report as a dictionary, one entry per symbol plus the averages
report_dict = classification_report(
    y_test_transformed, y_pred_transformed, target_names=mlb.classes_, zero_division=0, output_dict=True
)

# Turn it into a table with one row per entry, and list the best-recalled symbols first
pos_symbol_crf_df = DataFrame.from_dict(report_dict, orient='index')
pos_symbol_crf_df.index.name = 'pos_symbol'
pos_symbol_crf_df.sort_values('recall', ascending=False)

Unnamed: 0_level_0,precision,recall,f1-score,support
pos_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
O-JT,0.99838,0.997672,0.998026,9881
O-PD,0.983498,0.990033,0.986755,301
O-RQ,0.935158,0.981415,0.957728,17164
O-IP,0.99538,0.978732,0.986986,4843
O-JD,0.983022,0.969849,0.976391,597
O-TS,0.985175,0.964537,0.974747,6683
weighted avg,0.950302,0.949082,0.947768,46408
micro avg,0.949307,0.949082,0.949195,46408
samples avg,0.949082,0.949082,0.949082,46408
O-CS,0.979167,0.898089,0.936877,785



---
# Data Preparation
This stage involves preparing the dataset for training by transforming, cleaning, and pre-processing the data. Data preparation may include tasks such as feature selection, normalization, and data augmentation.

In [32]:

# Rebalance the data with the sampling strategy limit
from imblearn.under_sampling import RandomUnderSampler

# Get the sampling strategy limit (the median class size)
vc_srs = pos_html_strs_df.pos_symbol.value_counts()
sampling_strategy_limit = int(vc_srs.median())

# Cap every class at the limit; classes that are already smaller keep all of their rows
counts_dict = pos_html_strs_df.groupby('pos_symbol').count().navigable_parent.to_dict()
sampling_strategy = {k: min(sampling_strategy_limit, v) for k, v in counts_dict.items()}

# Seed the under-sampler so that re-running the notebook keeps the same rows
rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)

# Resample the HTML strings (as a one-column feature matrix) against their symbols;
# the result is the tuple (resampled strings, resampled symbols)
resampled_data = rus.fit_resample(
    pos_html_strs_df.navigable_parent.values.reshape(-1, 1), pos_html_strs_df.pos_symbol.values.reshape(-1, 1)
)

# Recreate the Pandas DataFrame
rebalanced_df = DataFrame(resampled_data[0], columns=['navigable_parent'])
rebalanced_df['pos_symbol'] = resampled_data[1]

# (4491, 2) on an earlier run
print(rebalanced_df.shape)
rebalanced_df.groupby('pos_symbol').count().sort_values('navigable_parent').head(10)

(4285, 2)


Unnamed: 0_level_0,navigable_parent
pos_symbol,Unnamed: 1_level_1
H-PD,5
H-JD,28
O-ER,31
H-ER,61
H-LN,62
H-JT,73
O-O,73
H-OL,86
H-IP,92
O-LN,95


In [33]:

# Sentences to parse: the HTML strings of the rebalanced data, in row order
sentences_list = rebalanced_df.navigable_parent.tolist()
sentences_list[-2:]

['<ots>GIS application developer.</ots>', "<ots>Test Planning &amp; execution: Expertise in the requirement's analysis, test planning, designing and execution of the test scripts.</ots>"]

In [34]:

# Labels to apply: the symbol of each rebalanced row, aligned with sentences_list
pos_symbols_list = rebalanced_df.pos_symbol.tolist()
pos_symbols_list[-2:]

['O-TS', 'O-TS']

In [35]:

# Tokenize the sentences with the regex tokenizer (one token list per sentence)
tokens_list = [html_regex_tokenizer(sentence) for sentence in sentences_list]
tokens_list[-2:]

[['<ots', 'GIS', 'application', 'developer', '.', '</ots'], ['<ots', 'Test', 'Planning', 'amp', 'execution', ':', 'Expertise', 'in', 'the', 'requirement', 's', 'analysis', 'test', 'planning', 'designing', 'and', 'execution', 'of', 'the', 'test', 'scripts', '.', '</ots']]

In [36]:

def sent2features(sent):
    """Return the word2features result for every position of sent, in order."""
    return [word2features(sent, idx) for idx, _ in enumerate(sent)]
def sent2labels(pos_symbol, sent):
    """Return a list that repeats pos_symbol once for every element of sent."""
    return [pos_symbol for _ in sent]


---
# CRF Training using Parts-of-Speech

In [37]:

# Get the parts of speech: run NLTK's pos_tag over each token list, giving (token, tag) pairs per sentence
pos_tags_list = [pos_tag(tokens) for tokens in tokens_list]
pos_tags_list[-2:]

[[('<ots', 'NNS'), ('GIS', 'NNP'), ('application', 'NN'), ('developer', 'NN'), ('.', '.'), ('</ots', 'NNS')], [('<ots', 'JJ'), ('Test', 'NNP'), ('Planning', 'NNP'), ('amp', 'JJ'), ('execution', 'NN'), (':', ':'), ('Expertise', 'NN'), ('in', 'IN'), ('the', 'DT'), ('requirement', 'NN'), ('s', 'NN'), ('analysis', 'NN'), ('test', 'NN'), ('planning', 'NN'), ('designing', 'NN'), ('and', 'CC'), ('execution', 'NN'), ('of', 'IN'), ('the', 'DT'), ('test', 'NN'), ('scripts', 'NN'), ('.', '.'), ('</ots', 'NNS')]]

In [38]:

# Define features to be used in the CRF model
def word2features(sent, i):
    """Return the 'word' and 'postag' features of the (word, tag) pair at position i of sent."""
    word, postag = sent[i][0], sent[i][1]

    return dict(word=word, postag=postag)

# Prepare the training and test data: one feature dictionary per token, and the sentence's
# symbol repeated once per token (the loop variable no longer shadows the imported pos_tag)
X = [sent2features(pos_tags) for pos_tags in pos_tags_list]
y = [sent2labels(pos_symbol, pos_tags) for pos_tags, pos_symbol in zip(pos_tags_list, pos_symbols_list)]

# Seed the split so the held-out sentences, and the scores computed from them, are the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the CRF model
pos_symbol_crf = CRF()

# Train the model
pos_symbol_crf.fit(X_train, y_train)

# Predict the labels for the test data
y_pred = pos_symbol_crf.predict(X_test)
y_pred[-2:], y_test[-2:]

([['O-JD', 'O-JD', 'O-JD', 'O-JD'], ['O-PQ', 'O-PQ', 'O-PQ', 'O-PQ', 'O-PQ', 'O-PQ', 'O-PQ', 'O-PQ', 'O-PQ', 'O-PQ', 'O-PQ', 'O-PQ', 'O-PQ', 'O-PQ', 'O-PQ', 'O-PQ']], [['O-JD', 'O-JD', 'O-JD', 'O-JD'], ['H-RQ', 'H-RQ', 'H-RQ', 'H-RQ', 'H-RQ', 'H-RQ', 'H-RQ', 'H-RQ', 'H-RQ', 'H-RQ', 'H-RQ', 'H-RQ', 'H-RQ', 'H-RQ', 'H-RQ', 'H-RQ']])

In [39]:

# Show the feature dictionaries of the last two held-out sentences
X_test[-2:]

[[{'word': '<strong', 'postag': 'RB'}, {'word': 'Full', 'postag': 'NNP'}, {'word': 'time', 'postag': 'NN'}, {'word': '</strong', 'postag': 'NN'}], [{'word': '<span', 'postag': 'JJ'}, {'word': 'style', 'postag': 'NN'}, {'word': 'font', 'postag': 'JJ'}, {'word': 'weight', 'postag': 'NN'}, {'word': ':', 'postag': ':'}, {'word': '400', 'postag': 'CD'}, {'word': 'Previous', 'postag': 'JJ'}, {'word': 'experience', 'postag': 'NN'}, {'word': 'building', 'postag': 'VBG'}, {'word': 'end', 'postag': 'NN'}, {'word': 'to', 'postag': 'TO'}, {'word': 'end', 'postag': 'VB'}, {'word': 'machine', 'postag': 'NN'}, {'word': 'learning', 'postag': 'VBG'}, {'word': 'systems', 'postag': 'NNS'}, {'word': '</span', 'postag': 'VBP'}]]


---
# Visualization

In [40]:

from sklearn.preprocessing import MultiLabelBinarizer

# Create the MultiLabelBinarizer object
mlb = MultiLabelBinarizer()

# Fit the binarizer on the held-out label sequences, then encode the predicted sequences with the same classes
y_test_transformed = mlb.fit_transform(y_test)
y_pred_transformed = mlb.transform(y_pred)

In [41]:

# Compute the classification report as a dictionary, one entry per symbol plus the averages
report_dict = classification_report(
    y_test_transformed, y_pred_transformed, target_names=mlb.classes_, zero_division=0, output_dict=True
)

# Turn it into a table with one row per entry, and list the best-recalled symbols first
pos_symbol_crf_df = DataFrame.from_dict(report_dict, orient='index')
pos_symbol_crf_df.index.name = 'pos_symbol'
pos_symbol_crf_df.sort_values('recall', ascending=False)

Unnamed: 0_level_0,precision,recall,f1-score,support
pos_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
O-JT,1.0,1.0,1.0,50
O-PD,0.955556,0.955556,0.955556,45
O-JD,0.962963,0.945455,0.954128,55
O-IP,0.958333,0.938776,0.948454,49
H-ER,0.75,0.9,0.818182,10
O-PQ,0.507042,0.8,0.62069,45
O-TS,0.877193,0.78125,0.826446,64
O-ER,0.5,0.75,0.6,4
O-OL,0.647059,0.75,0.694737,44
O-RQ,0.813953,0.744681,0.777778,47


In [48]:

# Classify one HTML string with the trained CRF. The intermediate results get names of their
# own so the corpus-wide tokens_list, pos_tags_list, X and y_pred stay intact for the cells
# that zip them against sentences_list and y
# NOTE(review): navigable_parent is whatever an earlier cell last assigned - confirm it is the intended sample
single_tokens_list = [html_regex_tokenizer(navigable_parent)]
single_pos_tags_list = [pos_tag(tokens) for tokens in single_tokens_list]
single_X = [sent2features(pos_tags) for pos_tags in single_pos_tags_list]
single_y_pred = pos_symbol_crf.predict(single_X)
single_y_pred

'O-IP'

In [50]:

# Build the parts-of-speech logistic regression elements only when lru
# does not have the POS_PREDICT_PERCENT_FIT_DICT attribute yet
if not hasattr(lru, 'POS_PREDICT_PERCENT_FIT_DICT'):
    lru.build_pos_logistic_regression_elements(verbose=True)

In [47]:

# Repeat each sentence's logistic regression prediction, and its actual symbol, once per token
y_pred = []
y_test = []
lr_symbols_list = [lru.pos_lr_predict_single(child_str) for child_str in sentences_list]
actual_symbols_list = [ys_list[0] if ys_list else None for ys_list in y]
for tl, pred_symbol, test_symbol in zip(tokens_list, lr_symbols_list, actual_symbols_list):

    # A sentence with an empty label list has nothing to score
    if test_symbol is None:
        continue
    token_count = len(tl)
    y_pred.append([pred_symbol] * token_count)
    y_test.append([test_symbol] * token_count)
y_pred[-2:], y_test[-2:]

([['H-IP', 'H-IP', 'H-IP', 'H-IP', 'H-IP', 'H-IP', 'H-IP', 'H-IP', 'H-IP', 'H-IP'], ['O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS']], [['O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS'], ['O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS']])

In [48]:

from sklearn.preprocessing import MultiLabelBinarizer

# Create the MultiLabelBinarizer object
mlb = MultiLabelBinarizer()

# Fit the binarizer on the actual label sequences, then encode the logistic regression
# label sequences with the same classes
y_test_transformed = mlb.fit_transform(y_test)
y_pred_transformed = mlb.transform(y_pred)
mlb.classes_

array(['H-CS', 'H-ER', 'H-IP', 'H-JD', 'H-JT', 'H-LN', 'H-O', 'H-OL',
       'H-PD', 'H-PQ', 'H-RQ', 'H-SP', 'H-TS', 'O-CS', 'O-ER', 'O-IP',
       'O-JD', 'O-JT', 'O-LN', 'O-O', 'O-OL', 'O-PD', 'O-PQ', 'O-RQ',
       'O-SP', 'O-TS'], dtype=object)

In [49]:

# Compute the classification report as a dictionary, one entry per symbol plus the averages
report_dict = classification_report(
    y_test_transformed, y_pred_transformed, target_names=mlb.classes_, zero_division=0, output_dict=True
)

# Turn it into a table with one row per entry, and list the best-recalled symbols first
pos_symbol_lr_df = DataFrame.from_dict(report_dict, orient='index')
pos_symbol_lr_df.index.name = 'pos_symbol'
pos_symbol_lr_df.sort_values('recall', ascending=False)

Unnamed: 0_level_0,precision,recall,f1-score,support
pos_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
O-ER,0.496241,0.835443,0.622642,79
O-LN,0.265101,0.669492,0.379808,118
H-SP,0.316,0.467456,0.377088,169
H-O,0.060606,0.40708,0.105505,113
H-CS,0.775194,0.395257,0.52356,253
H-LN,0.362069,0.344262,0.352941,61
H-IP,0.306818,0.238938,0.268657,113
O-TS,0.235294,0.237154,0.23622,253
H-TS,0.059076,0.217391,0.092905,253
O-SP,0.410853,0.209486,0.277487,253



---
# Model Selection
This stage involves selecting the appropriate model or algorithm for the task at hand. Model selection may involve tasks such as hyperparameter tuning, cross-validation, and model evaluation.

In [86]:

# Names of the candidate CRF features
features_list = ['+1:postag', '+1:tag.basic_text_set', '+1:tag.block_elements_set', '+1:tag.document_body_elements_set', '+1:tag.inline_elements_set', '+1:tag.lists_set', '+1:tag.null_element',
                 '+1:tag.other_block_elements_set', '+1:tag.phrase_elements_set', '+1:tag.presentation_set', '+1:tag.section_headings_set', '+1:tag==previous', '+2:postag', '+2:tag.basic_text_set',
                 '+2:tag.block_elements_set', '+2:tag.document_body_elements_set', '+2:tag.inline_elements_set', '+2:tag.lists_set', '+2:tag.null_element', '+2:tag.other_block_elements_set', '+2:tag.phrase_elements_set',
                 '+2:tag.presentation_set', '+2:tag.section_headings_set', '+2:tag==previous', '+3:postag', '+3:tag.basic_text_set', '+3:tag.block_elements_set', '+3:tag.document_body_elements_set',
                 '+3:tag.inline_elements_set', '+3:tag.lists_set', '+3:tag.null_element', '+3:tag.other_block_elements_set', '+3:tag.phrase_elements_set', '+3:tag.presentation_set', '+3:tag.section_headings_set',
                 '+3:tag==previous', '-1:postag', '-1:previous==tag', '-1:tag.basic_text_set', '-1:tag.block_elements_set', '-1:tag.document_body_elements_set', '-1:tag.inline_elements_set', '-1:tag.lists_set',
                 '-1:tag.null_element', '-1:tag.other_block_elements_set', '-1:tag.phrase_elements_set', '-1:tag.presentation_set', '-1:tag.section_headings_set', 'bias', 'child_str.pos_lr_predict_single', 'position',
                 'postag', 'tag.basic_text_set', 'tag.block_elements_set', 'tag.consecutive_next_tags', 'tag.document_body_elements_set', 'tag.inline_elements_set', 'tag.lists_set', 'tag.null_element',
                 'tag.other_block_elements_set', 'tag.phrase_elements_set', 'tag.presentation_set', 'tag.section_headings_set', 'BOS', 'EOS']

# List the files that contain at least one fully labeled HTML string
cypher_str = '''
    // Find all files that contain labeled HTML strings
    MATCH (np:NavigableParents)  // Find all nodes with the label `NavigableParents` and assign them to the variable `np`
    WHERE
        (np.is_header IS NOT NULL)  // Filter nodes that have the property `is_header`
        AND (np.is_task_scope IS NOT NULL)  // Filter nodes that have the property `is_task_scope`
        AND (np.is_minimum_qualification IS NOT NULL)  // Filter nodes that have the property `is_minimum_qualification`
        AND (np.is_preferred_qualification IS NOT NULL)  // Filter nodes that have the property `is_preferred_qualification`
        AND (np.is_legal_notification IS NOT NULL)  // Filter nodes that have the property `is_legal_notification`
        AND (np.is_job_title IS NOT NULL)  // Filter nodes that have the property `is_job_title`
        AND (np.is_office_location IS NOT NULL)  // Filter nodes that have the property `is_office_location`
        AND (np.is_job_duration IS NOT NULL)  // Filter nodes that have the property `is_job_duration`
        AND (np.is_supplemental_pay IS NOT NULL)  // Filter nodes that have the property `is_supplemental_pay`
        AND (np.is_educational_requirement IS NOT NULL)  // Filter nodes that have the property `is_educational_requirement`
        AND (np.is_interview_procedure IS NOT NULL)  // Filter nodes that have the property `is_interview_procedure`
        AND (np.is_corporate_scope IS NOT NULL)  // Filter nodes that have the property `is_corporate_scope`
        AND (np.is_posting_date IS NOT NULL)  // Filter nodes that have the property `is_posting_date`
        AND (np.is_other IS NOT NULL)  // Filter nodes that have the property `is_other`
    WITH np  // Store the value of each `np` node
    MATCH (np:NavigableParents)-[r:NEXT]->(:NavigableParents)  // Find all nodes connected to `np` by an outgoing `NEXT` relationship, and assign the relationship to `r`
    RETURN DISTINCT r.file_name AS file_name;  // Return the value of the `file_name` property of the `r` relationship, with the alias `file_name`
'''
row_objs_list = []
with cu.driver.session() as session:

    # The query only reads from the graph, so run it as a managed read transaction
    row_objs_list = session.execute_read(cu.do_cypher_tx, cypher_str)

# filenames_df is only (re)created when the query returned rows
if row_objs_list:
    filenames_df = DataFrame(row_objs_list)
    
    # (7711, 1) on an earlier run
    print(filenames_df.shape)

(7723, 1)


In [23]:

from hc_utils import HeaderCategories
from lr_utils import LrUtilities
from sklearn.preprocessing import MultiLabelBinarizer

# Replace hc with a fresh HeaderCategories object
hc = HeaderCategories(cu=cu, verbose=False)
    
# Keep the total creation time to less than one hour by adjusting the sampling strategy limit
# NOTE(review): this replaces the lru created earlier, so attributes that were added to the old
# object (the hasattr guards elsewhere check for them) are presumably gone afterwards - confirm
lru = LrUtilities(ha=ha, cu=cu, hc=hc, verbose=False)

# Create the MultiLabelBinarizer object
mlb = MultiLabelBinarizer()

In [19]:

# Retrain the is-header classifier only when lru does not expose ISHEADER_PREDICT_PERCENT_FIT yet
needs_retraining = not hasattr(lru, 'ISHEADER_PREDICT_PERCENT_FIT')
if needs_retraining:

    # Build the is-header logistic regression elements first when ISHEADER_LR is absent
    lr_is_built = hasattr(lru, 'ISHEADER_LR')
    if not lr_is_built:
        lru.build_isheader_logistic_regression_elements(verbose=True)

    lru.retrain_isheader_classifier(verbose=True)

In [None]:

from tqdm import tqdm_notebook as tqdm
# from tqdm.notebook import tqdm
import random
from itertools import groupby

# Assume only one feature to be used in the CRF model

# The 'tag.*' features that can be asked of the current tag or, with a window prefix, of a neighbor
_TAG_FEATURE_NAMES = frozenset([
    'tag.basic_text_set', 'tag.block_elements_set', 'tag.document_body_elements_set', 'tag.inline_elements_set',
    'tag.lists_set', 'tag.null_element', 'tag.other_block_elements_set', 'tag.phrase_elements_set',
    'tag.presentation_set', 'tag.section_headings_set'
])

# Window prefixes and the position offsets they stand for
_WINDOW_OFFSETS = {'-1': -1, '+1': 1, '+2': 2, '+3': 3}

def word2features(feature_tuples_list, i):
    """
    Build the one-entry feature dictionary that the global FEATURE_STR selects
    for position i of feature_tuples_list.

    NOTE: this redefines the two-feature word2features used for the token-level CRF,
    so sent2features calls this version once the cell has run.

    Parameters
    ----------
    feature_tuples_list : sequence
        Items indexed as (tag, child_str, pos); only elements 0, 1 and 2 are read.
    i : int
        Position of the item to describe.

    Returns
    -------
    dict
        {FEATURE_STR: value}. The value is 'O' when FEATURE_STR is not a known
        feature, when a windowed feature ('-1:', '+1:', '+2:', '+3:') points
        outside the sequence, when 'tag.consecutive_next_tags' is asked within the
        last two positions, or when 'child_str.pos_lr_predict_single' is asked
        while lru has no POS_PREDICT_PERCENT_FIT_DICT attribute.
    """
    null_element = 'plaintext'
    this_feature_tuple = feature_tuples_list[i]
    this_tag = this_feature_tuple[0]
    child_str = this_feature_tuple[1]
    this_pos = this_feature_tuple[2]
    tuples_count = len(feature_tuples_list)

    def get_tag_feature(tag, tag_feature_str):
        """Evaluate one of the _TAG_FEATURE_NAMES features for a single tag."""
        if tag_feature_str == 'tag.null_element':
            return tag == null_element

        # Every other 'tag.<name>' feature tests membership in the cu attribute called <name>
        return tag in getattr(cu, tag_feature_str[len('tag.'):])

    # Windowed features are spelled '<offset>:<base>', e.g. '-1:postag' or '+2:tag.lists_set'
    offset_str, separator_str, base_str = FEATURE_STR.partition(':')
    if separator_str and (offset_str in _WINDOW_OFFSETS):
        offset = _WINDOW_OFFSETS[offset_str]
        if offset < 0:
            is_in_window = (i > 0)
        else:
            is_in_window = (i < tuples_count - offset)
        if is_in_window:
            neighbor_tuple = feature_tuples_list[i + offset]
            neighbor_tag = neighbor_tuple[0]
            if base_str in _TAG_FEATURE_NAMES:
                return {FEATURE_STR: get_tag_feature(neighbor_tag, base_str)}
            if base_str == 'postag':
                return {FEATURE_STR: neighbor_tuple[2]}

            # Looking back, the neighbor's tag is compared with the current tag
            if (offset < 0) and (base_str == 'previous==tag'):
                return {FEATURE_STR: neighbor_tag == this_tag}

            # Looking ahead, the neighbor's tag is compared with the tag just before the neighbor
            if (offset > 0) and (base_str == 'tag==previous'):
                return {FEATURE_STR: neighbor_tag == feature_tuples_list[i + offset - 1][0]}

    # Only scan the remaining labels when this feature is the one requested
    if (FEATURE_STR == 'tag.consecutive_next_tags') and (i < tuples_count - 2):
        labels_list = [feature_tuple[2] for feature_tuple in feature_tuples_list[i:]]
        run_lengths_list = [len(list(run)) for _, run in groupby(labels_list)]

        # 0 when the current label repeats right away, otherwise the length of the run that follows it
        if run_lengths_list[0] > 1:
            return {FEATURE_STR: 0}

        return {FEATURE_STR: run_lengths_list[1]}

    if FEATURE_STR == 'BOS':
        return {FEATURE_STR: (i <= 0)}
    if FEATURE_STR == 'EOS':
        return {FEATURE_STR: (i >= tuples_count - 1)}
    if FEATURE_STR == 'bias':
        return {FEATURE_STR: 1.0}
    if (FEATURE_STR == 'child_str.pos_lr_predict_single') and hasattr(lru, 'POS_PREDICT_PERCENT_FIT_DICT'):
        return {FEATURE_STR: lru.pos_lr_predict_single(child_str)}
    if FEATURE_STR == 'position':
        return {FEATURE_STR: i + 1}
    if FEATURE_STR == 'postag':
        return {FEATURE_STR: this_pos}
    if FEATURE_STR in _TAG_FEATURE_NAMES:
        return {FEATURE_STR: get_tag_feature(this_tag, FEATURE_STR)}

    # Anything that was not answered above gets the placeholder value
    return {FEATURE_STR: 'O'}

In [48]:

def get_pos_and_feature_tuples_list(file_name):
    """
    Build the aligned label and feature sequences for one file.

    Parameters
    ----------
    file_name
        Passed straight through to ``ha.get_child_strs_from_file``.

    Returns
    -------
    tuple
        ``(pos_list, feature_tuples_list)``: the parts of speech found in
        ``part_of_speech_dict`` for the file's child strings, and the matching
        entries of ``sent2features``' output. Child strings with no entry in
        ``part_of_speech_dict`` are left out of both lists.
    """

    def _resolve_is_header(flag, child_str):
        # Keep whatever ha.get_is_header_list already decided
        if flag is not None:
            return flag

        # Otherwise fall back to the lru prediction: position 0 maps to True, position 1 to False
        # NOTE(review): assumes the first value returned is the "is a header" probability -- confirm in lru
        probs_list = lru.ISHEADER_PREDICT_PERCENT_FIT(child_str)

        return [True, False][probs_list.index(max(probs_list))]

    # Pull the raw child strings and their tags out of the file
    child_strs_list = ha.get_child_strs_from_file(file_name=file_name)
    child_tags_list = ha.get_child_tags_list(child_strs_list)

    # Fill in any undecided header flags
    is_header_list = [
        _resolve_is_header(flag, child_str)
        for flag, child_str in zip(ha.get_is_header_list(child_strs_list), child_strs_list)
    ]

    # Convert each feature dictionary into a feature tuple (no LR/CRF predictors supplied)
    feature_tuple_list = [
        hc.get_feature_tuple(feature_dict, pos_lr_predict_single=None, pos_crf_predict_single=None)
        for feature_dict in hc.get_feature_dict_list(child_tags_list, is_header_list, child_strs_list)
    ]

    # Look up the known part of speech of each child string (None when unknown)
    noned_pos_list = [part_of_speech_dict.get(child_str) for child_str in child_strs_list]
    noned_feature_tuple_list = sent2features(feature_tuple_list)

    # Keep only the positions that have a known part of speech
    labelled_pairs_list = [
        (pos, feature_tuple)
        for pos, feature_tuple in zip(noned_pos_list, noned_feature_tuple_list)
        if pos is not None
    ]
    pos_list = [pos for pos, _ in labelled_pairs_list]
    feature_tuples_list = [feature_tuple for _, feature_tuple in labelled_pairs_list]

    return pos_list, feature_tuples_list

In [118]:

def get_row_dict(X_train, X_test, y_train, y_test):
    """
    Fit the notebook-level ``crf`` model and summarize how well it labels O-RQ.

    Relies on three notebook-level names: ``crf`` (the model being fitted),
    ``mlb`` (the binarizer, which is re-fitted here on ``y_test``) and
    ``FEATURE_STR`` (recorded in the returned row).

    Parameters
    ----------
    X_train, y_train
        Feature sequences and label sequences used to fit ``crf``.
    X_test, y_test
        Feature sequences and label sequences used for evaluation.

    Returns
    -------
    dict
        ``{'feature_str', 'o_rq_recall', 'o_rq_f1_score'}``. The two metrics
        are NaN when the classification report has no 'O-RQ' entry (that is,
        when 'O-RQ' never occurs in ``y_test``), instead of the empty Series
        that squeezing an empty selection would produce.
    """
    crf.fit(X_train, y_train)

    # Predict the labels for the test data
    y_pred = crf.predict(X_test)

    # Binarize both label sequences using the classes seen in y_test
    # (labels that only appear in y_pred are ignored by mlb.transform)
    y_test_transformed = mlb.fit_transform(y_test)
    y_pred_transformed = mlb.transform(y_pred)

    # Compute the classification report as a dictionary keyed by class name
    report_dict = classification_report(
        y_test_transformed, y_pred_transformed, target_names=mlb.classes_, zero_division=0, output_dict=True
    )

    # Grab the recall and f1 score of the O-RQ classification
    # as our metrics and add it as a row in our to-be-dataframe
    # (A high recall is desirable when the cost of a false negative prediction is high)
    o_rq_dict = report_dict.get('O-RQ', {})
    row_dict = {
        'feature_str': FEATURE_STR,
        'o_rq_recall': o_rq_dict.get('recall', float('nan')),
        'o_rq_f1_score': o_rq_dict.get('f1-score', float('nan'))
    }

    return row_dict

In [None]:

# Imported here because the only other import of random sits in a later cell
import random

# Load the cached O-RQ metrics if they exist; otherwise score each candidate
# feature by fitting a fresh CRF model for it and store the results
if s.pickle_exists('o_rq_metrics_df'):
    o_rq_metrics_df = s.load_object('o_rq_metrics_df')
else:
    rows_list = []
    pbar_features = tqdm(features_list, total=len(features_list))

    # Sample the files WITHOUT replacement: a file drawn twice by random.choices
    # could end up on both sides of the train/test split and inflate the scores
    unique_file_names_list = filenames_df.file_name.unique().tolist()
    files_list = random.sample(unique_file_names_list, k=min(200, len(unique_file_names_list)))

    for FEATURE_STR in pbar_features:
        pbar_features.set_description(FEATURE_STR)

        # Prepare the training and test data
        # NOTE(review): rebuilt for every feature, presumably because sent2features
        # reads the global FEATURE_STR -- confirm before hoisting out of the loop
        X = []
        y = []
        for file_name in files_list:
            pos_list, feature_tuples_list = get_pos_and_feature_tuples_list(file_name)
            y.append(pos_list)
            X.append(feature_tuples_list)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        # Create the CRF model (get_row_dict reads it through this global name)
        # NOTE(review): this rebinds `crf`, which the setup cell bound to a CrfUtilities instance
        crf = CRF()

        # Train the model and record its O-RQ metrics; a failure on one
        # feature is reported and skipped so the remaining features still run
        try:
            row_dict = get_row_dict(X_train, X_test, y_train, y_test)
            rows_list.append(row_dict)

        except Exception as e:
            print(f'{e.__class__} error in {FEATURE_STR}: {str(e).strip()}')
    o_rq_metrics_df = DataFrame(rows_list)
    s.store_objects(o_rq_metrics_df=o_rq_metrics_df)

In [None]:

# Keep just the features whose O-RQ f1 score beats the one obtained
# from the parts-of-speech tag alone, ordered from lowest to highest f1 score
mask_series = o_rq_metrics_df['feature_str'].eq('postag')
o_rq_f1_score = o_rq_metrics_df.loc[mask_series, 'o_rq_f1_score'].squeeze()
mask_series = (o_rq_metrics_df['o_rq_f1_score'] > o_rq_f1_score)
features_list = (
    o_rq_metrics_df.loc[mask_series]
    .sort_values('o_rq_f1_score', ascending=True)['feature_str']
    .tolist()
)

In [111]:

# Report the O-RQ f1 score of the postag feature, then show the features
# whose O-RQ recall is lower than the postag feature's recall (lowest first)
mask_series = o_rq_metrics_df['feature_str'].eq('postag')
o_rq_f1_score = o_rq_metrics_df.loc[mask_series, 'o_rq_f1_score'].squeeze()
print(f'O-RQ just using the postag gets an f1 score of {o_rq_f1_score}')
o_rq_recall = o_rq_metrics_df.loc[mask_series, 'o_rq_recall'].squeeze()
mask_series = (o_rq_metrics_df['o_rq_recall'] < o_rq_recall)
o_rq_metrics_df.loc[mask_series].sort_values('o_rq_recall', ascending=True)

O-RQ just using the postag gets an f1 score of 0.9411764705882354


Unnamed: 0,feature_str,o_rq_recall,o_rq_f1_score
52,tag.basic_text_set,0.0,0.0
49,child_str.pos_lr_predict_single,0.0,0.0
56,tag.inline_elements_set,0.0,0.0
59,tag.other_block_elements_set,0.028571,0.055556
58,tag.null_element,0.03125,0.058824
10,+1:tag.section_headings_set,0.054054,0.095238
48,bias,0.0625,0.102564
7,+1:tag.other_block_elements_set,0.0625,0.114286
8,+1:tag.phrase_elements_set,0.0625,0.111111
50,position,0.064516,0.111111


In [114]:

# Join the names of the features with an O-RQ recall under 0.8
# into one parenthesized, pipe-delimited string
mask_series = (o_rq_metrics_df['o_rq_recall'] < 0.8)
'({})'.format('|'.join(o_rq_metrics_df.loc[mask_series, 'feature_str']))

'(+1:tag.basic_text_set|+1:tag.block_elements_set|+1:tag.document_body_elements_set|+1:tag.inline_elements_set|+1:tag.lists_set|+1:tag.null_element|+1:tag.other_block_elements_set|+1:tag.phrase_elements_set|+1:tag.presentation_set|+1:tag.section_headings_set|+1:tag==previous|+2:postag|+2:tag.block_elements_set|+2:tag.document_body_elements_set|+2:tag.phrase_elements_set|+3:postag|-1:previous==tag|-1:tag.block_elements_set|-1:tag.document_body_elements_set|-1:tag.lists_set|-1:tag.phrase_elements_set|-1:tag.presentation_set|bias|child_str.pos_lr_predict_single|position|tag.basic_text_set|tag.block_elements_set|tag.document_body_elements_set|tag.inline_elements_set|tag.lists_set|tag.null_element|tag.other_block_elements_set|tag.phrase_elements_set|tag.presentation_set|tag.section_headings_set|EOS)'


---
# Data Exploration

In [None]:

import random

# Spot-check the pos_symbol_crf model on one randomly chosen element of X
random_feature_dicts_list = random.choice(X)
print(random_feature_dicts_list)

# Skip the prediction when the chosen element is empty
if random_feature_dicts_list:
    pos_symbols_list = pos_symbol_crf.predict_single(random_feature_dicts_list)

    # The check below requires every predicted symbol in the sequence to be the same
    assert len(set(pos_symbols_list)) == 1
    display(pos_symbols_list[0])

In [None]:

# List every navigable parent labeled with the H-JD symbol
mask_series = pos_html_strs_df['pos_symbol'].eq('H-JD')
pos_html_strs_df.loc[mask_series, 'navigable_parent'].tolist()

In [None]:

# List every navigable parent labeled with the H-PD symbol
mask_series = pos_html_strs_df['pos_symbol'].eq('H-PD')
pos_html_strs_df.loc[mask_series, 'navigable_parent'].tolist()

In [None]:

# List every navigable parent labeled with the O-O symbol
mask_series = pos_html_strs_df['pos_symbol'].eq('O-O')
pos_html_strs_df.loc[mask_series, 'navigable_parent'].tolist()