In [1]:

%pprint

Pretty printing has been turned OFF



---
# Load needed libraries and functions

In [2]:

from datetime import datetime
import humanize
import os
import sys
import time

# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

In [3]:

t0 = t1 = time.time()

# Get the Storage object
from storage import Storage
s = Storage(
    data_folder_path=os.path.abspath('../data'),
    saves_folder_path=os.path.abspath('../saves')
)

# Get the HeaderAnalysis object
from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

# Get the WebScrapingUtilities object
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(
    s=s,
    secrets_json_path=os.path.abspath('../data/secrets/jh_secrets.json')
)
uri = wsu.secrets_json['neo4j']['connect_url']
user =  wsu.secrets_json['neo4j']['username']
password = wsu.secrets_json['neo4j']['password']

# Get the CypherUtilities object and Neo4j driver
from cypher_utils import CypherUtilities
cu = CypherUtilities(
    uri=uri, user=user, password=password, driver=None, s=s, ha=ha
)

duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Utility libraries created in {duration_str}')
print(f'Last run on {datetime.now()}')

Utility libraries created in 2 seconds
Last run on 2023-04-05 15:53:23.666561


In [4]:

t1 = time.time()
cu.populate_pos_relationships(verbose=False)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech relationships repopulated in {duration_str}')

Parts-of-speech relationships repopulated in 11 seconds


In [9]:

def generate_child_strs(verbose=False):
    def do_cypher_tx(tx):
        cypher_str = '''
            // Filter for NavigableParents nodes with an unambiguous SUMMARIZES relationship
            MATCH (np:NavigableParents)
            WHERE size((np)<-[:SUMMARIZES]-(:PartsOfSpeech)) >= 1
            WITH np
            
            // Find all NavigableParents nodes in the graph with an incoming SUMMARIZES
            // relationship to a PartsOfSpeech node
            MATCH (np)<-[r:SUMMARIZES]-(pos:PartsOfSpeech)
            WHERE
                (np.navigable_parent STARTS WITH "<orq>Ability to ")
                //AND (pos.pos_symbol = "O-RQ")
                AND (size(pos.navigable_parent) < 50)

            // Return the navigable parent
            RETURN np.navigable_parent AS navigable_parent
            ORDER BY size(navigable_parent) ASC;'''
        results_list = tx.run(query=cypher_str, parameters={})

        return [dict(record.items()) for record in results_list]
    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(do_cypher_tx)
    if verbose:
        print(len(row_objs_list))
    for row_obj in row_objs_list:
        if 'navigable_parent' in row_obj:
            yield row_obj['navigable_parent']

In [10]:

CHILD_STRS_LIST = list(generate_child_strs(verbose=False))
len(CHILD_STRS_LIST)

0

In [383]:

import re

base_dict = {
    'header': 'False',
    'task_scope': 'False',
    'minimum_qualification': 'False',
    'preferred_qualification': 'False',
    'educational_requirement': 'False',
    'legal_notification': 'False',
    'other': 'False',
    'corporate_scope': 'False',
    'job_title': 'False',
    'office_location': 'False',
    'job_duration': 'False',
    'supplemental_pay': 'False',
    'interview_procedure': 'False',
    'posting_date': 'False'
}
features_list = ['is_' + cypher_suffix for cypher_suffix in base_dict.keys() if cypher_suffix != 'header']
html_suffixes_list = ['ts', 'rq', 'pq', 'er', 'ln', 'o', 'cs', 'jt', 'ol', 'jd', 'sp', 'ip', 'pd']
tag_regex = re.compile(f"<(/?)[ho]({'|'.join(html_suffixes_list)})>")
for child_str in CHILD_STRS_LIST:
    def do_cypher_tx(tx, navigable_parent, verbose=False):
        cypher_str = '''
            MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
            ''' + cu.return_everything_str + ';'
        results_list = tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})

        return [dict(record.items()) for record in results_list]
    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)
    if row_objs_list:
        child_dict = row_objs_list[0]
        del child_dict['navigable_parent']
        html_prefix = ['h', 'o'][['True', 'False'].index(child_dict['is_header'])]
        del child_dict['is_header']
        true_key = [k for k, v in child_dict.items() if v == 'True'][0]
        html_suffix = html_suffixes_list[features_list.index(true_key)]
        new_child_str = tag_regex.sub(rf'<\g<1>{html_prefix}{html_suffix}>', child_str)
        def do_cypher_tx(tx, old_child_str, new_child_str):
            cypher_str = '''
                MATCH (np:NavigableParents {navigable_parent: $old_child_str})
                SET np.navigable_parent = $new_child_str
                ''' + cu.return_everything_str + ';'
            results_list = tx.run(query=cypher_str, parameters={'old_child_str': old_child_str, 'new_child_str': new_child_str})

            return [dict(record.items()) for record in results_list]
        with cu.driver.session() as session:
            row_objs_list = session.write_transaction(do_cypher_tx, old_child_str=child_str, new_child_str=new_child_str)

In [342]:

def generate_child_strs(verbose=False):
    def do_cypher_tx(tx):
        cypher_str = '''
            // Filter for NavigableParents nodes with an unambiguous SUMMARIZES relationship
            MATCH (np:NavigableParents)
            WHERE size((np)<-[:SUMMARIZES]-(:PartsOfSpeech)) >= 1
            WITH np

            // Find all NavigableParents nodes in the graph with an incoming SUMMARIZES relationship to a PartsOfSpeech node
            MATCH (np)<-[r:SUMMARIZES]-(pos:PartsOfSpeech)
            WHERE
                (np.navigable_parent STARTS WITH "<orq>Ability to ")
                AND (size(np.navigable_parent) < 40)
                AND (pos.pos_symbol = "O-RQ")

            // Return the navigable parent
            RETURN np.navigable_parent AS navigable_parent
            ORDER BY size(navigable_parent) ASC;'''
        results_list = tx.run(query=cypher_str, parameters={})

        return [dict(record.items()) for record in results_list]
    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(do_cypher_tx)
    if verbose:
        print(len(row_objs_list))
    for row_obj in row_objs_list:
        if 'navigable_parent' in row_obj:
            yield row_obj['navigable_parent']

In [343]:

CHILD_STRS_LIST = list(generate_child_strs(verbose=False))
len(CHILD_STRS_LIST)

46

In [344]:

CHILD_STRS_LIST[:10]

['<orq>Ability to lead</orq>', '<orq>Ability to jIRA).</orq>', '<orq>Ability to elysian.</orq>', '<orq>Ability to pMO Lead</orq>', '<orq>Ability to pVC Lead</orq>', '<orq>Ability to data Lead</orq>', '<orq>Ability to data Lake</orq>', '<orq>Ability to no travel.</orq>', '<orq>Ability to oracle HCM</orq>', '<orq>Ability to no Travel.</orq>']

In [234]:

ZMQInteractiveShell_obj = get_ipython()
def get_cypher_code():
    output_str = ''
    tag_str = CHILD_STRS_LIST.pop()
    output_str += f'\n# {len(CHILD_STRS_LIST):,} to go\n'
    if "'" in tag_str:
        tag_str = tag_str.replace('"', '\\"')
        output_str += f'child_str = "{tag_str}"\n'
    else:
        output_str += f"child_str = '{tag_str}'\n"
    output_str += "def do_cypher_tx(tx, navigable_parent, verbose=False):\n"
    output_str += "    cypher_str = '''\n"
    output_str += "        MATCH (np:NavigableParents {navigable_parent: $navigable_parent})\n"
    output_str += "        SET\n"
    output_str += "            np.is_header = 'False',\n"
    output_str += "            np.is_task_scope = 'False',\n"
    output_str += "            np.is_minimum_qualification = 'False',\n"
    output_str += "            np.is_preferred_qualification = 'False',\n"
    output_str += "            np.is_educational_requirement = 'False',\n"
    output_str += "            np.is_legal_notification = 'False',\n"
    output_str += "            np.is_other = 'True',\n"
    output_str += "            np.is_corporate_scope = 'False',\n"
    output_str += "            np.is_job_title = 'False',\n"
    output_str += "            np.is_office_location = 'False',\n"
    output_str += "            np.is_job_duration = 'False',\n"
    output_str += "            np.is_supplemental_pay = 'False',\n"
    output_str += "            np.is_interview_procedure = 'False',\n"
    output_str += "            np.is_posting_date = 'False'\n"
    output_str += "        ''' + cu.return_everything_str + ';'\n"
    output_str += "    results_list = tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})\n"
    output_str += "    \n"
    output_str += "    return [dict(record.items()) for record in results_list]\n"
    output_str += "with cu.driver.session() as session:\n"
    output_str += "    row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)\n"
    output_str += "row_objs_list"
    
    return output_str

In [None]:

ZMQInteractiveShell_obj.set_next_input(text=get_cypher_code(), replace=True)

In [None]:

# 0 to go
child_str = '<orq>Ability to lead</orq>'
def do_cypher_tx(tx, navigable_parent, verbose=False):
    cypher_str = '''
        MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
        SET
            np.is_header = 'False',
            np.is_task_scope = 'False',
            np.is_minimum_qualification = 'False',
            np.is_preferred_qualification = 'False',
            np.is_educational_requirement = 'False',
            np.is_legal_notification = 'False',
            np.is_other = 'True',
            np.is_corporate_scope = 'False',
            np.is_job_title = 'False',
            np.is_office_location = 'False',
            np.is_job_duration = 'False',
            np.is_supplemental_pay = 'False',
            np.is_interview_procedure = 'False',
            np.is_posting_date = 'False'
        ''' + cu.return_everything_str + ';'
    results_list = tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})
    
    return [dict(record.items()) for record in results_list]
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)
row_objs_list