In [1]:

%pprint

Pretty printing has been turned OFF


In [2]:

import sys

# Insert at 1, 0 is the script path (or '' in REPL).
# '../py' is a relative entry, so it is resolved against the kernel's working
# directory; presumably it holds the project modules imported by the next
# cell (storage, ha_utils, ...) - TODO confirm.
sys.path.insert(1, '../py')

In [3]:

from pandas import DataFrame

# Shared storage helper; handed to the header-analysis and Cypher utilities below
from storage import Storage
s = Storage()

# Header analysis utilities
from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

# Web scraping utilities; used here only as the source of the Neo4j secrets
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities()
neo4j_secrets = wsu.secrets_json['neo4j']
uri = neo4j_secrets['connect_url']
user = neo4j_secrets['username']
password = neo4j_secrets['password']

# Cypher utilities, connected with the credentials read above
from cypher_utils import CypherUtilities
cu = CypherUtilities(
    uri=uri,
    user=user,
    password=password,
    driver=None,
    s=s,
    ha=ha,
)

# Header categories, built on top of the Cypher utilities
from hc_utils import HeaderCategories
hc = HeaderCategories(cu=cu, verbose=False)

# Logistic regression utilities
from lr_utils import LrUtilities
lru = LrUtilities(ha=ha, cu=cu, hc=hc, verbose=False)

# Conditional random fields utilities (the only helper created with verbose on)
from crf_utils import CrfUtilities
crf = CrfUtilities(ha=ha, hc=hc, cu=cu, verbose=True)

In [4]:

import time
import humanize
from datetime import datetime

# winsound is only available on Windows. On any other platform fall back to a
# silent stand-in so the winsound.Beep(freq, duration) calls in later cells
# do not stop the notebook with an ImportError/NameError.
try:
    import winsound
except ImportError:
    from types import SimpleNamespace
    winsound = SimpleNamespace(Beep=lambda frequency, duration: None)

# Arguments passed to winsound.Beep when a long-running cell finishes
duration = 1000  # milliseconds
freq = 880  # Hz

# Figure and histogram settings; not read by any cell visible in this section
width_inches = 18.0
height_inches = 3.0
bin_count = 12

# Timestamp the run so the saved output shows when the notebook was last executed
print(f'Last run on {datetime.now()}')

Last run on 2023-02-12 07:19:01.740213



----
# Create the Child Strings List Dictionary

In [None]:

# Cypher that deletes every SUMMARIZES relationship running from a
# PartsOfSpeech node to a NavigableParents node. This cell only assigns the
# string; nothing in the cell sends it to the database.
cypher_str = """
    MATCH (:PartsOfSpeech)-[r:SUMMARIZES]->(:NavigableParents)
    DELETE r;"""

In [5]:

# Rebuild the child strings list dictionary and store it only when it has grown
t0 = time.time()

# Size of the dictionary as it was last stored
old_child_strs_list_dict = cu.s.load_object('CHILD_STRS_LIST_DICT')
old_length = len(old_child_strs_list_dict)

# Size of the freshly rebuilt dictionary
new_child_strs_list_dict = cu.get_rebuilt_child_strs_list_dictionary(verbose=True)
new_length = len(new_child_strs_list_dict)

# Swap in and save the rebuilt dictionary only if it gained entries
strings_added = new_length - old_length
if strings_added > 0:
    cu.CHILD_STRS_LIST_DICT = new_child_strs_list_dict
    cu.s.store_objects(CHILD_STRS_LIST_DICT=cu.CHILD_STRS_LIST_DICT, verbose=False)

# Report the elapsed time and beep to signal completion
elapsed_seconds = time.time() - t0
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
winsound.Beep(freq, duration)
print(f'{strings_added} file names added to the list dictionary in {duration_str}, for a total of {new_length:,}')


            MATCH (np:NavigableParents)-[r:NEXT]->(:NavigableParents)
            RETURN
                np.navigable_parent AS navigable_parent,
                np.is_header AS is_header,
                r.sequence_order AS sequence_order,
                r.file_name AS file_name
            ORDER BY
                r.file_name,
                r.sequence_order;
364 file names added to the list dictionary in 2 minutes and 48 seconds, for a total of 1,427


In [6]:

# Rebuild the parts-of-speech logistic regression elements, then retrain the
# parts-of-speech classifier (the recorded run below took over two hours)
t0 = time.time()
lru.build_pos_logistic_regression_elements(verbose=False)
crf.retrain_pos_classifier(verbose=True)

# Report the elapsed time and beep to signal completion
elapsed_seconds = time.time() - t0
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
winsound.Beep(freq, duration)
print(f'Parts-of-speech classifier retrained in {duration_str}')

MATCH (np:NavigableParents {navigable_parent: "<p>Job Type: Full-time</p>"})<-[s:SUMMARIZES]-(ht:HeaderTags) RETURN ht.header_tag AS header_tag;
Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\HEADER_PATTERN_DICT.pkl
I have 1,424 hand-labeled parts-of-speech patterns in here
Training the Conditional Random Fields model with 1,424 parts-of-speech labels
Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\CRF.pkl
Retraining complete
Parts-of-speech classifier retrained in 2 hours, 14 minutes and 23 seconds


In [38]:

# Try to find a part-of-speech for every orphan child string
# (t0 starts the timer that is reported by the print at the end of this cell)
t0 = time.time()
# pandas and numpy are not used in this cell; the following cell reads them as pd and np
import pandas as pd
import numpy as np

def do_cypher_tx(tx, verbose=False):
    """Fetch the NavigableParents nodes that no PartsOfSpeech node summarizes.

    The query keeps a node only when it has no incoming SUMMARIZES
    relationship from a PartsOfSpeech node and at least one of its fourteen
    is_* properties is not null.

    Parameters
    ----------
    tx
        Transaction-like object; only ``tx.run(query=..., parameters=...)``
        is called on it.
    verbose : bool, optional
        When True, print the Cypher text before running it (clearing the
        cell output first if IPython is importable).

    Returns
    -------
    list of dict
        One dictionary per returned record, keyed by the aliases in the
        RETURN clause (``navigable_parent`` plus the ``np_is_*`` flags).
    """
    cypher_str = """
        MATCH (np:NavigableParents)
        WHERE
           NOT EXISTS {
                MATCH (:PartsOfSpeech)-[:SUMMARIZES]->(np:NavigableParents)
            }
            AND NOT (
                np.is_header IS NULL
                AND np.is_task_scope IS NULL
                AND np.is_minimum_qualification IS NULL
                AND np.is_preferred_qualification IS NULL
                AND np.is_educational_requirement IS NULL
                AND np.is_legal_notification IS NULL
                AND np.is_other IS NULL
                AND np.is_corporate_scope IS NULL
                AND np.is_job_title IS NULL
                AND np.is_office_location IS NULL
                AND np.is_job_duration IS NULL
                AND np.is_supplemental_pay IS NULL
                AND np.is_interview_procedure IS NULL
                AND np.is_posting_date IS NULL
            )
        RETURN
            np.navigable_parent AS navigable_parent,
            np.is_header AS np_is_header,
            np.is_task_scope AS np_is_task_scope,
            np.is_minimum_qualification AS np_is_minimum_qualification,
            np.is_preferred_qualification AS np_is_preferred_qualification,
            np.is_educational_requirement AS np_is_educational_requirement,
            np.is_legal_notification AS np_is_legal_notification,
            np.is_other AS np_is_other,
            np.is_corporate_scope AS np_is_corporate_scope,
            np.is_job_title AS np_is_job_title,
            np.is_office_location AS np_is_office_location,
            np.is_job_duration AS np_is_job_duration,
            np.is_supplemental_pay AS np_is_supplemental_pay,
            np.is_interview_procedure AS np_is_interview_procedure,
            np.is_posting_date AS np_is_posting_date;"""
    if verbose:

        # clear_output was never imported in this notebook, so verbose=True
        # used to raise NameError; import it lazily and skip the clearing
        # step when IPython is not installed
        try:
            from IPython.display import clear_output
        except ImportError:
            pass
        else:
            clear_output(wait=True)
        print(cypher_str)
    results_list = tx.run(query=cypher_str, parameters=None)

    # Materialize each record as a plain dict while the transaction is still open
    values_list = [dict(record.items()) for record in results_list]

    return values_list
# Run the orphan query in a read transaction; row_objs_list is consumed by the next cell
with cu.driver.session() as session:
    row_objs_list = session.read_transaction(do_cypher_tx, verbose=False)

# Report how long the lookup took (the completion beep is disabled for this cell)
elapsed_seconds = time.time() - t0
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
# winsound.Beep(freq, duration)
print(f'Orphan child strings found in {duration_str}')

In [50]:

# Link each PartsOfSpeech node to the NavigableParents nodes that carry the same
# combination of is_* flag values, issuing one MERGE per distinct combination
t0 = time.time()
columns_list = ['np_is_header', 'np_is_task_scope', 'np_is_minimum_qualification', 'np_is_preferred_qualification', 'np_is_educational_requirement', 'np_is_legal_notification',
                'np_is_other', 'np_is_corporate_scope', 'np_is_job_title', 'np_is_office_location', 'np_is_job_duration', 'np_is_supplemental_pay', 'np_is_interview_procedure',
                'np_is_posting_date']

# Count the orphan rows per flag combination, most common first. NOTE: groupby
# drops rows with a missing value in any key column by default, so only
# combinations with every flag populated are iterated below.
df = pd.DataFrame(row_objs_list).replace([None], np.nan).groupby(columns_list).count().sort_values('navigable_parent', ascending=False)
for row_index in df.index:
    
    # Render the combination as the body of a Cypher property map. The values
    # are formatted directly instead of being routed through exec(), which ran
    # database-derived text as Python and created fourteen globals on the fly.
    flag_lines = []
    for var_name, value in zip(columns_list, row_index):
        property_name = var_name[len('np_'):]
        
        # Escape backslashes and single quotes so a value cannot end the quoted literal early
        escaped_value = str(value).replace('\\', '\\\\').replace("'", "\\'")
        flag_lines.append(f"                {property_name}: '{escaped_value}'")
    flags_str = ',\n'.join(flag_lines)
    
    # The same property map selects both the PartsOfSpeech and the NavigableParents nodes
    cypher_str = f"""
        MATCH
            (pos:PartsOfSpeech {{
{flags_str}
            }}),
            (np:NavigableParents {{
{flags_str}
            }})
        MERGE (pos)-[r: SUMMARIZES]->(np);"""
    with cu.driver.session() as session:
        session.write_transaction(cu.do_cypher_tx, cypher_str)
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
# winsound.Beep(freq, duration)
print(f'Parts-of-speech resummarized in {duration_str}')