In [1]:

# Toggle IPython's pretty printing; %pprint flips the current setting each time it
# runs (this run switched it OFF, per the cell output).
%pprint

Pretty printing has been turned OFF



---
# Load needed libraries and functions

In [4]:

from datetime import datetime
import humanize
import os
import sys
import time

# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

In [5]:

# Start the wall-clock timers; t1 is read at the bottom of this cell to report
# how long the setup took
t0 = t1 = time.time()

# Get the Storage object
from storage import Storage
s = Storage(
    data_folder_path=os.path.abspath('../data'),
    saves_folder_path=os.path.abspath('../saves')
)

# Get the HeaderAnalysis object
from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

# Get the WebScrapingUtilities object
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(
    s=s,
    secrets_json_path=os.path.abspath('../data/secrets/jh_secrets.json')
)

# The Neo4j connection settings are read from the secrets JSON rather than
# being hard-coded in the notebook
uri = wsu.secrets_json['neo4j']['connect_url']
user =  wsu.secrets_json['neo4j']['username']
password = wsu.secrets_json['neo4j']['password']

# Get the CypherUtilities object and Neo4j driver
from cypher_utils import CypherUtilities
cu = CypherUtilities(
    uri=uri, user=user, password=password, driver=None, s=s, ha=ha
)

# Report the elapsed setup time (whole seconds) and when this cell was last run
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Utility libraries created in {duration_str}')
print(f'Last run on {datetime.now()}')

Utility libraries created in 0 seconds
Last run on 2023-04-04 14:21:31.109452


In [6]:

def generate_child_strs(verbose=False):
    """
    Yield the navigable_parent string of every task-scope, non-header node.

    Queries the graph for NavigableParents nodes where is_header is 'False',
    is_task_scope is 'True', and none of the other classification flags
    listed in the query is 'True'.

    Because this is a generator, the query is not sent until the first item
    is requested.

    NOTE(review): another cell in this notebook defines a function with this
    same name; whichever definition was executed last is the one in effect.

    Parameters
    ----------
    verbose : bool, optional
        If True, print the number of rows the query returned.

    Yields
    ------
    The navigable_parent value of each returned row.
    """
    def do_cypher_tx(tx):
        """Run the query in transaction tx and return the rows as dicts."""
        cypher_str = '''
            // Find all NavigableParents nodes in the graph
            MATCH (np:NavigableParents)

            // That do not have any parts-of-speech symbol but O-TS
            WHERE
                (np.is_header = 'False')
                AND (np.is_task_scope = 'True')
                AND NOT (
                    (np.is_minimum_qualification = 'True')
                    OR (np.is_preferred_qualification = 'True')
                    OR (np.is_educational_requirement = 'True')
                    OR (np.is_legal_notification = 'True')
                    OR (np.is_job_title = 'True')
                    OR (np.is_office_location = 'True')
                    OR (np.is_job_duration = 'True')
                    OR (np.is_supplemental_pay = 'True')
                    OR (np.is_interview_procedure = 'True')
                    OR (np.is_corporate_scope = 'True')
                    OR (np.is_posting_date = 'True')
                    )

            // Return the navigable parent
            RETURN np.navigable_parent AS navigable_parent;'''
        results_list = tx.run(query=cypher_str, parameters={})

        # Materialize inside the transaction; the result cannot be consumed
        # after the transaction function returns
        return [dict(record.items()) for record in results_list]

    # The query only reads (MATCH ... RETURN), so run it in a read transaction
    # rather than a write transaction
    with cu.driver.session() as session:
        row_objs_list = session.read_transaction(do_cypher_tx)
    if verbose:
        print(len(row_objs_list))
    for row_obj in row_objs_list:
        if 'navigable_parent' in row_obj:
            yield row_obj['navigable_parent']

In [268]:

def generate_child_strs(verbose=False):
    """
    Yield navigable_parent strings that mention education but not experience.

    Queries the graph for NavigableParents nodes whose navigable_parent
    contains 'Education' or 'EDUCATION', does not contain 'Experience' or
    'EXPERIENCE', and is not already flagged as both a header and an
    educational requirement.

    Because this is a generator, the query is not sent until the first item
    is requested.

    NOTE(review): another cell in this notebook defines a function with this
    same name; whichever definition was executed last is the one in effect.

    Parameters
    ----------
    verbose : bool, optional
        If True, print the number of rows the query returned.

    Yields
    ------
    The navigable_parent value of each returned row.
    """
    def do_cypher_tx(tx):
        """Run the query in transaction tx and return the rows as dicts."""
        cypher_str = '''
            // Find all NavigableParents nodes in the graph
            MATCH (np:NavigableParents)

            // That are assumed to have parts-of-speech symbol that are O-RQ
            WHERE
                ((np.navigable_parent CONTAINS 'Education')
                OR (np.navigable_parent CONTAINS 'EDUCATION'))
                AND NOT ((np.navigable_parent CONTAINS 'Experience')
                OR (np.navigable_parent CONTAINS 'EXPERIENCE'))
                AND NOT ((np.is_header = 'True')
                AND (np.is_educational_requirement = 'True'))

            // Return the navigable parent
            RETURN np.navigable_parent AS navigable_parent;'''
        results_list = tx.run(query=cypher_str, parameters={})

        # Materialize inside the transaction; the result cannot be consumed
        # after the transaction function returns
        return [dict(record.items()) for record in results_list]

    # The query only reads (MATCH ... RETURN), so run it in a read transaction
    # rather than a write transaction
    with cu.driver.session() as session:
        row_objs_list = session.read_transaction(do_cypher_tx)
    if verbose:
        print(len(row_objs_list))
    for row_obj in row_objs_list:
        if 'navigable_parent' in row_obj:
            yield row_obj['navigable_parent']

In [7]:

# Materialize the generator into a list so the strings can be counted here and
# popped off one at a time by get_cypher_code()
CHILD_STRS_LIST = list(generate_child_strs(verbose=False))
len(CHILD_STRS_LIST)

6980

In [270]:

# Peek at the first ten strings only, rather than dumping the whole list
CHILD_STRS_LIST[:10]

['<li>$250 Annual Education Stipend + StackerU continuous learning curriculum</li>', '<p>Bird Conservancy of the Rockies is a 501 (c)(3) non-profit headquartered at the Environmental Learning Center at Barr Lake State Park with a satellite office in Fort Collins, CO. Bird Conservancy of the Rockies conserves birds and their habitats through an integrated approach of Science, Education, and Stewardship. Our work radiates from the Rockies to the Great Plains, Mexico and beyond.</p>', '. Education completed in foreign colleges or universities may be used to meet the requirements. Please refer to http://www.opm.gov/qualifications/policy/ApplicationOfStds-04.asp for more information.', '<b>Basic Education Requirement:</b>', '<b>Education cannot be substituted for experience at this grade level.</b>', '<b>In addition to the Basic Education Requirement above, to qualify for this position at the GS-14 level, you must meet the following:</b>', '<b>Qualifications and Education Requirements:</b>'

In [8]:

# Handle to the running IPython shell; a later cell uses its set_next_input()
# to fill a cell with generated code
ZMQInteractiveShell_obj = get_ipython()
def get_cypher_code():
    """
    Pop the next string off CHILD_STRS_LIST and build a labelling cell for it.

    The returned text is Python source that assigns the popped string to
    child_str, defines a do_cypher_tx transaction function that SETs the
    classification flags on the matching NavigableParents node, runs it in a
    write transaction, and passes the first returned row to
    ihu.retrain_classifier. The flag values in the template are starting
    values that can be edited in the generated cell before it is run.

    Each call removes one item from the module-level CHILD_STRS_LIST, so
    calling it twice skips a string.

    NOTE(review): the generated code refers to cu and ihu; ihu is not created
    in the part of the notebook visible here -- confirm it exists before
    running a generated cell.

    Returns
    -------
    str
        The source code for the next labelling cell.

    Raises
    ------
    IndexError
        If CHILD_STRS_LIST is empty.
    """
    if not CHILD_STRS_LIST:
        raise IndexError('CHILD_STRS_LIST is empty; there are no more strings to label')
    tag_str = CHILD_STRS_LIST.pop()

    code_lines = [
        # A leading empty entry keeps the blank first line of the generated cell
        '',
        f'# {len(CHILD_STRS_LIST):,} to go',
        # repr() always yields a valid one-line Python literal: it picks the
        # quote style and escapes quotes, backslashes and newlines, which
        # manual quote-wrapping does not
        f'child_str = {tag_str!r}',
        'def do_cypher_tx(tx, navigable_parent, verbose=False):',
        "    cypher_str = '''",
        '        MATCH (np:NavigableParents {navigable_parent: $navigable_parent})',
        '        SET',
        "            np.is_header = 'True',",
        "            np.is_task_scope = 'False',",
        "            np.is_minimum_qualification = 'False',",
        "            np.is_preferred_qualification = 'False',",
        "            np.is_educational_requirement = 'True',",
        "            np.is_legal_notification = 'False',",
        "            np.is_other = 'False',",
        "            np.is_corporate_scope = 'False',",
        "            np.is_job_title = 'False',",
        "            np.is_office_location = 'False',",
        "            np.is_job_duration = 'False',",
        "            np.is_supplemental_pay = 'False',",
        "            np.is_interview_procedure = 'False',",
        "            np.is_posting_date = 'False'",
        "        ''' + cu.return_everything_str + ';'",
        "    results_list = tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})",
        '    ',
        '    return [dict(record.items()) for record in results_list]',
        'with cu.driver.session() as session:',
        '    row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)',
        "ihu.retrain_classifier(row_objs_list[0]['navigable_parent'], row_objs_list[0]['is_header'], verbose=True)",
        'row_objs_list',
    ]

    return '\n'.join(code_lines)

In [None]:

# Fill a cell with freshly generated labelling code (replace=True is passed to
# set_next_input). Each run pops one string off CHILD_STRS_LIST, so running
# this again moves on to the next string.
ZMQInteractiveShell_obj.set_next_input(text=get_cypher_code(), replace=True)

In [9]:

# 6,979 to go
# NOTE(review): this cell matches the template built by get_cypher_code(); the
# flag values in the SET clause look like the template's defaults -- confirm
# they are the intended labels for this particular string.
child_str = 'Contribute to the development of analytics libraries and frameworks.'
def do_cypher_tx(tx, navigable_parent, verbose=False):
    """
    Set the classification flags on the node matching navigable_parent.

    Runs the query in transaction tx and returns the resulting rows as a list
    of dicts. The verbose parameter is accepted but not used in the body.
    """
    # The tail of the query (after the SET clause) comes from
    # cu.return_everything_str
    cypher_str = '''
        MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
        SET
            np.is_header = 'True',
            np.is_task_scope = 'False',
            np.is_minimum_qualification = 'False',
            np.is_preferred_qualification = 'False',
            np.is_educational_requirement = 'True',
            np.is_legal_notification = 'False',
            np.is_other = 'False',
            np.is_corporate_scope = 'False',
            np.is_job_title = 'False',
            np.is_office_location = 'False',
            np.is_job_duration = 'False',
            np.is_supplemental_pay = 'False',
            np.is_interview_procedure = 'False',
            np.is_posting_date = 'False'
        ''' + cu.return_everything_str + ';'
    results_list = tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})
    
    return [dict(record.items()) for record in results_list]
# The query modifies node properties, so it runs in a write transaction
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)
# NOTE(review): ihu is not defined in any cell visible here -- confirm it is
# created before this cell runs. Indexing row_objs_list[0] raises IndexError
# if the query returned no rows.
ihu.retrain_classifier(row_objs_list[0]['navigable_parent'], row_objs_list[0]['is_header'], verbose=True)
row_objs_list