In [1]:

# Toggle IPython's pretty printing of cell results (the recorded output shows it ended up OFF).
# NOTE(review): %pprint is a toggle, so running this cell a second time switches it back ON.
%pprint

Pretty printing has been turned OFF



---
# Load needed libraries and functions

In [2]:

from neo4j.exceptions import ServiceUnavailable
from nltk import pos_tag
from pandas import DataFrame
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn_crfsuite import CRF
import humanize
import nltk
import os
import re
import sys
import time
import warnings

# Suppress every warning for the rest of the session
warnings.simplefilter('ignore')

# A duration in milliseconds and a frequency in Hz
# NOTE(review): neither value is used in the visible cells — presumably kept for an audible signal; confirm
duration, freq = 1000, 880

# Make the sibling ../py folder importable; index 0 is left alone because it
# holds the script path (or '' in a REPL)
sys.path.insert(1, '../py')

In [3]:

t0 = time.time()

# Storage helper pointed at the project's data and saves folders
from storage import Storage
s = Storage(
    data_folder_path=os.path.abspath('../data'),
    saves_folder_path=os.path.abspath('../saves'),
)

from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

# Read the Neo4j connection settings out of the secrets JSON
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(
    s=s,
    secrets_json_path=os.path.abspath('../data/secrets/jh_secrets.json'),
)
uri, user, password = (
    wsu.secrets_json['neo4j'][key]
    for key in ('connect_url', 'username', 'password')
)

# Cypher helper; no driver is handed in (driver=None), yet cu.driver is used
# below, so it presumably opens its own — TODO confirm
from cypher_utils import CypherUtilities
cu = CypherUtilities(
    uri=uri, user=user, password=password, driver=None, s=s, ha=ha
)

# Ask the server for its agent string as a connectivity check
try:
    version_str = cu.driver.get_server_info().agent
    print(f'======== {version_str} ========')
except ServiceUnavailable:
    # Without a reachable server nothing below can work, so re-raise
    print('You need to start Neo4j as a console')
    raise
except Exception as e:
    # NOTE(review): any other failure is only printed and the cell carries on
    print(f'{e.__class__}: {str(e).strip()}')

# Remaining helpers, each built from the objects created above
from hc_utils import HeaderCategories
hc = HeaderCategories(cu=cu, verbose=False)

from lr_utils import LrUtilities
lru = LrUtilities(ha=ha, cu=cu, hc=hc, verbose=False)

from crf_utils import CrfUtilities
crf = CrfUtilities(ha=ha, hc=hc, cu=cu, lru=lru, verbose=True)

# Report how long the whole setup took
duration_str = humanize.precisedelta(
    time.time() - t0, minimum_unit='seconds', format='%0.0f'
)
print(f'Utility libraries created in {duration_str}')

Utility libraries created in 8 seconds



---
# Data Preparation
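This stage prepares the dataset for comparison: it repopulates the parts-of-speech relationships in Neo4j, queries every navigable parent that has exactly one part-of-speech label, and collects those actual labels.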

In [4]:

# Repopulate the parts-of-speech relationships (verbose mode echoes the
# Cypher being run) and report the elapsed time
t0 = time.time()
cu.populate_pos_relationships(verbose=True)
duration_str = humanize.precisedelta(
    time.time() - t0, minimum_unit='seconds', format='%0.0f'
)
print(f'Parts-of-speech relationships repopulated in {duration_str}')


                        MATCH (pos:PartsOfSpeech {is_header: 'False', is_other: 'True'}), (np:NavigableParents {is_header: 'False', is_other: 'True'})
                        MERGE (pos)-[r:SUMMARIZES]->(np);
Parts-of-speech relationships repopulated in 7 seconds


In [5]:

# Read-only Cypher statement: for each NavigableParents node that has exactly
# one incoming SUMMARIZES relationship from a PartsOfSpeech node, return the
# part-of-speech symbol, the navigable_parent property, and the is_* flags of
# both nodes (30 aliases in total, prefixed pos_ or np_ where they would clash)
cypher_str = '''
    // Filter for NavigableParents nodes with an unambiguous SUMMARIZES relationship
    MATCH (np:NavigableParents)
    WHERE size((np)<-[:SUMMARIZES]-(:PartsOfSpeech)) = 1
    WITH np
    
    // Find all NavigableParents nodes in the graph with an incoming SUMMARIZES relationship to a PartsOfSpeech node
    MATCH (np)<-[r:SUMMARIZES]-(pos:PartsOfSpeech)
    
    // Return the navigable parent and important properties
    RETURN
        pos.pos_symbol AS pos_symbol,
        pos.is_header AS pos_is_header,
        pos.is_task_scope AS pos_is_task_scope,
        pos.is_minimum_qualification AS pos_is_minimum_qualification,
        pos.is_preferred_qualification AS pos_is_preferred_qualification,
        pos.is_legal_notification AS pos_is_legal_notification,
        pos.is_job_title AS pos_is_job_title,
        pos.is_office_location AS pos_is_office_location,
        pos.is_job_duration AS pos_is_job_duration,
        pos.is_supplemental_pay AS pos_is_supplemental_pay,
        pos.is_educational_requirement AS pos_is_educational_requirement,
        pos.is_interview_procedure AS pos_is_interview_procedure,
        pos.is_corporate_scope AS pos_is_corporate_scope,
        pos.is_posting_date AS pos_is_posting_date,
        pos.is_other AS pos_is_other,
        
        np.navigable_parent AS navigable_parent,
        np.is_header AS np_is_header,
        np.is_task_scope AS np_is_task_scope,
        np.is_minimum_qualification AS np_is_minimum_qualification,
        np.is_preferred_qualification AS np_is_preferred_qualification,
        np.is_legal_notification AS np_is_legal_notification,
        np.is_job_title AS np_is_job_title,
        np.is_office_location AS np_is_office_location,
        np.is_job_duration AS np_is_job_duration,
        np.is_supplemental_pay AS np_is_supplemental_pay,
        np.is_educational_requirement AS np_is_educational_requirement,
        np.is_interview_procedure AS np_is_interview_procedure,
        np.is_corporate_scope AS np_is_corporate_scope,
        np.is_posting_date AS np_is_posting_date,
        np.is_other AS np_is_other;'''
# Run the statement in a managed *read* transaction: it only MATCHes and
# RETURNs, so a write transaction is not needed
with cu.driver.session() as session:
    row_objs_list = session.read_transaction(cu.do_cypher_tx, cypher_str)

# Stop here with a clear message instead of leaving pos_html_strs_df undefined
# (fresh kernel) or stale (re-run), which would only surface in a later cell
if not row_objs_list:
    raise RuntimeError(
        'The SUMMARIZES query returned no rows; populate the parts-of-speech '
        'relationships before running this cell'
    )

# Build the comparison frame from the returned rows and show its shape
# (the recorded run printed 30 columns, matching the 30 RETURN aliases)
pos_html_strs_df = DataFrame(row_objs_list)
print(pos_html_strs_df.shape)

(46569, 30)


In [6]:

# Eyeball five random rows: the navigable parent, its part-of-speech symbol,
# and the header flag from the PartsOfSpeech node
columns_list = ['navigable_parent', 'pos_symbol', 'pos_is_header']
pos_html_strs_df.loc[:, columns_list].sample(n=5)

Unnamed: 0,navigable_parent,pos_symbol,pos_is_header
27103,"<li>Ghost Inspector, selenium, or other browse...",O-PQ,False
17040,<li>Min. 3 years experience in Python</li>,O-RQ,False
17189,<li>2 - SAP MM Materials Management (P3 - Adva...,O-RQ,False
24455,<li>Knowledge of the energy industry - familia...,O-RQ,False
43730,"<oip>Role Primary Contact: Dang,Gary</oip>",O-IP,False


In [7]:

# Prepare the comparison data
part_of_speech_dict = pos_html_strs_df.set_index('navigable_parent').pos_symbol.to_dict()
y_actual = [(pos_symbol, ) for pos_symbol in part_of_speech_dict.values()]


---
# Parts-of-speech Prediction by Conditional Random Fields Algorithm
## CRF Data Preparation
This stage involves preparing the CRF dataset for evaluation by transforming the data with a multi-label binarizer.

In [9]:

# Check if the crf has built its parts-of-speech classifier
t1 = time.time()
if not hasattr(crf, 'pos_crf_predict_single'):# or crf.is_flask_running()
    crf.build_pos_conditional_random_field_elements(verbose=True)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech CRF elements built in {duration_str}')

I have 46,569 labeled parts of speech in here
Parts-of-speech CRF elements built in 19 minutes and 34 seconds


In [10]:

# Predict one label per navigable parent, in the same order as y_actual, and
# wrap each prediction in a one-element tuple
t0 = time.time()
y_crf_predicted = [
    (crf.pos_crf_predict_single(html_str),)
    for html_str in part_of_speech_dict
]
duration_str = humanize.precisedelta(
    time.time() - t0, minimum_unit='seconds', format='%0.0f'
)
print(f'Predicted labels created in {duration_str}')

Predicted labels created in 34 seconds


In [11]:

# Spot check: the last two CRF-predicted label tuples
y_crf_predicted[-2:]

[('H-RQ',), ('H-TS',)]

In [12]:

# Spot check: the last two actual label tuples, for comparison with the CRF predictions
y_actual[-2:]

[('O-O',), ('O-O',)]

In [13]:

# Fit the binarizer on every label that occurs in either the actual or the
# predicted tuples. Fitting on y_actual alone would make transform() drop any
# label the model predicts but the ground truth never contains, and the
# warning about it is hidden by the notebook's global 'ignore' filter.
mlb = MultiLabelBinarizer()
mlb.fit(y_actual + y_crf_predicted)

# Binarize the actual and the CRF-predicted labels against the shared classes
y_actual_transformed = mlb.transform(y_actual)
y_crf_pred_transformed = mlb.transform(y_crf_predicted)
mlb.classes_

array(['H-CS', 'H-ER', 'H-IP', 'H-JD', 'H-JT', 'H-LN', 'H-O', 'H-OL',
       'H-PD', 'H-PQ', 'H-RQ', 'H-SP', 'H-TS', 'O-CS', 'O-ER', 'O-IP',
       'O-JD', 'O-JT', 'O-LN', 'O-O', 'O-OL', 'O-PD', 'O-PQ', 'O-RQ',
       'O-SP', 'O-TS'], dtype=object)


## CRF Visualization

In [14]:

# Compute the classification report
pos_symbol_crf_df = DataFrame.from_dict(classification_report(y_actual_transformed, y_crf_pred_transformed, target_names=mlb.classes_, zero_division=0, output_dict=True),
                                        orient='index')
pos_symbol_crf_df.index.name = 'pos_symbol'
pos_symbol_crf_df.sort_values('recall', ascending=False)

Unnamed: 0_level_0,precision,recall,f1-score,support
pos_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
O-JT,0.998077,0.997572,0.997824,9883
O-PD,0.958199,0.990033,0.973856,301
O-RQ,0.933278,0.981129,0.956605,17222
O-IP,0.995183,0.977175,0.986097,4863
O-JD,0.984694,0.966611,0.975569,599
O-TS,0.984749,0.963588,0.974053,6701
weighted avg,0.948983,0.947669,0.946299,46569
micro avg,0.947893,0.947669,0.947781,46569
samples avg,0.947669,0.947669,0.947669,46569
O-CS,0.977901,0.897338,0.935889,789



---
# Parts-of-speech Prediction by Logistic Regression Algorithm
## LR Data Preparation
This stage involves preparing the LR dataset for evaluation by transforming the data with a multi-label binarizer.

In [16]:

# Build the logistic regression parts-of-speech elements only when the
# utilities object does not yet have its POS_PREDICT_PERCENT_FIT_DICT.
# With sampling_strategy_limit=None an earlier run over 47,686 labeled parts
# of speech took 2 hours, 3 minutes and 10 seconds; pass a limit (6_400 was
# tried before) to keep the total creation time under one hour.
t1 = time.time()
if not hasattr(lru, 'POS_PREDICT_PERCENT_FIT_DICT'):
    lru.build_pos_logistic_regression_elements(
        sampling_strategy_limit=None, verbose=True
    )
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
print(f'Parts-of-speech logistic regression elements built in {duration_str}')

I have 47,946 labeled parts of speech in here
Parts-of-speech logistic regression elements built in 1 hour, 59 minutes and 54 seconds


In [17]:

# Predict one label per navigable parent, in the same order as y_actual, and
# wrap each prediction in a one-element tuple
t0 = time.time()
y_lr_predicted = [
    (lru.pos_lr_predict_single(html_str),)
    for html_str in part_of_speech_dict
]
duration_str = humanize.precisedelta(
    time.time() - t0, minimum_unit='seconds', format='%0.0f'
)
print(f'Predicted labels created in {duration_str}')

Predicted labels created in 9 hours, 5 minutes and 5 seconds


In [18]:

# Spot check: the last two LR-predicted label tuples
y_lr_predicted[-2:]

[('O-O',), ('O-RQ',)]

In [19]:

# Spot check: the last two actual label tuples, for comparison with the LR predictions
y_actual[-2:]

[('O-O',), ('O-O',)]

In [20]:

# Fit the binarizer on every label that occurs in either the actual or the
# predicted tuples. Fitting on y_actual alone would make transform() drop any
# label the model predicts but the ground truth never contains, and the
# warning about it is hidden by the notebook's global 'ignore' filter.
mlb = MultiLabelBinarizer()
mlb.fit(y_actual + y_lr_predicted)

# Binarize the actual and the LR-predicted labels against the shared classes
y_actual_transformed = mlb.transform(y_actual)
y_lr_pred_transformed = mlb.transform(y_lr_predicted)
mlb.classes_

array(['H-CS', 'H-ER', 'H-IP', 'H-JD', 'H-JT', 'H-LN', 'H-O', 'H-OL',
       'H-PD', 'H-PQ', 'H-RQ', 'H-SP', 'H-TS', 'O-CS', 'O-ER', 'O-IP',
       'O-JD', 'O-JT', 'O-LN', 'O-O', 'O-OL', 'O-PD', 'O-PQ', 'O-RQ',
       'O-SP', 'O-TS'], dtype=object)


## LR Visualization

In [None]:

# Compute the classification report
pos_symbol_lr_df = DataFrame.from_dict(classification_report(y_actual_transformed, y_lr_pred_transformed, target_names=mlb.classes_, zero_division=0, output_dict=True),
                                        orient='index')
pos_symbol_lr_df.index.name = 'pos_symbol'
pos_symbol_lr_df.sort_values('recall', ascending=False)