In [1]:

%pprint

Pretty printing has been turned OFF


In [2]:

import sys

# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

In [3]:

# Get the Neo4j driver
from storage import Storage
s = Storage()

from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(s=s)
uri = wsu.secrets_json['neo4j']['connect_url']
user =  wsu.secrets_json['neo4j']['username']
password = wsu.secrets_json['neo4j']['password']

from cypher_utils import CypherUtilities
cu = CypherUtilities(uri=uri, user=user, password=password, driver=None, s=s, ha=ha)

In [4]:

from neo4j.exceptions import ServiceUnavailable

try:
    version_str = cu.driver.verify_connectivity()
    print(f'======== {version_str} ========')
    
    from hc_utils import HeaderCategories
    hc = HeaderCategories(cu=cu, verbose=False)
    
    from section_utils import SectionUtilities
    su = SectionUtilities(s=s, ha=ha, cu=cu, verbose=False)
    
    from lr_utils import LrUtilities
    lru = LrUtilities(ha=ha, cu=cu, hc=hc, verbose=False)
    
    from crf_utils import CrfUtilities
    crf = CrfUtilities(ha=ha, hc=hc, cu=cu, verbose=False)
    
    import warnings
    warnings.filterwarnings('ignore')
except ServiceUnavailable as e:
    # print(str(e).strip())
    raise ServiceUnavailable('You need to start Neo4j as a console')
except Exception as e:
    print(e.__class__)



In [5]:

import time
import humanize
from pandas import DataFrame
import os
from datetime import datetime
import winsound

duration = 1000  # milliseconds
freq = 440  # Hz
print(f'Last run on {datetime.now()}')

Last run on 2022-12-07 12:52:00.094285



----

In [6]:

t0 = time.time()
lru.build_isheader_logistic_regression_elements(verbose=False)
lru.retrain_isheader_classifier(verbose=False)
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
winsound.Beep(freq, duration)
print(f'Is-header classifier retrained in {duration_str}')

Is-header classifier retrained in 9 seconds



----

In [11]:

cypher_str = f'''
    MATCH (np:NavigableParents)
    WHERE np.is_header IS NULL
    RETURN
        np.navigable_parent AS navigable_parent;'''
row_objs_list = []
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
navigable_parents_list = []
if row_objs_list:
    navigable_parents_list = DataFrame(row_objs_list).navigable_parent.tolist()
print(f'Only {len(navigable_parents_list):,} more un-predicted navigable parents to go!')

Only 0 more un-predicted navigable parents to go!


In [12]:

for child_str in navigable_parents_list:
    probs_list = lru.ISHEADER_PREDICT_PERCENT_FIT(child_str)
    idx = probs_list.index(max(probs_list))
    is_header = [True, False][idx]
    cypher_str = f'''
        MERGE (np:NavigableParents {{navigable_parent: "{cu.escape_text(child_str)}"}})
        SET np.is_header = '{is_header}';'''
    # print(cypher_str)
    try:
        with cu.driver.session() as session:
            session.write_transaction(cu.do_cypher_tx, cypher_str)
    except Exception as e:
        print(f'{e.__class__} error: {str(e).strip()}')
        print(cypher_str)
        break

In [38]:

import numpy as np

cypher_str = f'''
    MATCH (np:NavigableParents)
    RETURN
        np.is_header AS is_header,
        np.navigable_parent AS navigable_parent;'''
row_objs_list = []
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
isheaders_df = DataFrame(row_objs_list)
isheaders_df['predicted_is_header'] = np.nan
for row_index, row_series in isheaders_df.iterrows():
    actual_is_header = bool(row_series.is_header)
    child_str = str(row_series.navigable_parent)
    probs_list = lru.ISHEADER_PREDICT_PERCENT_FIT(child_str)
    idx = probs_list.index(max(probs_list))
    predicted_is_header = [True, False][idx]
    isheaders_df.loc[row_index, 'predicted_is_header'] = predicted_is_header

In [39]:

isheaders_df.is_header = isheaders_df.is_header.map(lambda x: bool(x))
isheaders_df.predicted_is_header = isheaders_df.predicted_is_header.map(lambda x: bool(x))
mask_series = (isheaders_df.is_header ^ isheaders_df.predicted_is_header)
isheaders_df[mask_series]

Unnamed: 0,is_header,navigable_parent,predicted_is_header
0,True,<b>Key Qualifications:</b>,False
3,True,• Conduct ad hoc reporting with insightful ...,False
10,True,<b>Education/Experience</b>,False
16,True,• Knowledge of process automation with a sc...,False
18,True,• Familiar with various instrumentation map...,False
...,...,...,...
41508,True,<li>Automotive industry experience</li>,False
41509,True,<li>Experience in testing Android applications...,False
41510,True,<li>Familiarity with Cucumber and Gherkin is a...,False
41513,True,Company covers 100% of premiums for Life Insur...,False


In [136]:

# Define the is header business logic for the interactive widget app
def get_isheader(**kwargs):
    # {'index': '18617', 'html_widget': 'COVID-19 considerations:', 'toggle_button': True}
    idx = int(kwargs['index'])
    child_str = str(kwargs['html_widget'])
    is_header = bool(kwargs['toggle_button'])
    mask_series = (isheaders_df.index == idx)
    isheaders_df.loc[mask_series, 'is_header'] = is_header
    isheaders_df.loc[mask_series, 'predicted_is_header'] = is_header
    cypher_str = f'''
        MERGE (np:NavigableParents {{navigable_parent: "{cu.escape_text(child_str)}"}})
        SET np.is_header = '{is_header}';'''
    try:
        with cu.driver.session() as session:
            session.write_transaction(cu.do_cypher_tx, cypher_str)
    except Exception as e:
        print(f'{e.__class__} error: {str(e).strip()}')
        print(cypher_str)
        raise
    
    mask_series = (isheaders_df.is_header ^ isheaders_df.predicted_is_header)
    display(isheaders_df[mask_series].shape[0], kwargs)
    if isheaders_df[mask_series].shape[0]:
        row_df = isheaders_df[mask_series].sample(1)
        is_header = bool(row_df.is_header.squeeze())
        navigable_parent = str(row_df.navigable_parent.squeeze())
        predicted_is_header = bool(row_df.predicted_is_header.squeeze())
        row_index = row_df.index.max()
    else:
        is_header = True
        navigable_parent = 'Finished'
        predicted_is_header = True
        row_index = -1
    for i, widget in enumerate(ip.children):
        if type(widget) == ipywidgets.widgets.widget_string.HTML:
            html_widget = ip.children[i]
            html_widget.value = str(navigable_parent)
        elif type(widget) == ipywidgets.widgets.widget_string.Label:
            hidden_label = ip.children[i]
            hidden_label.value = str(row_index)

In [137]:

# Prep the widget app for first use
mask_series = (isheaders_df.is_header ^ isheaders_df.predicted_is_header)
if isheaders_df[mask_series].shape[0]:
    row_df = isheaders_df[mask_series].sample(1)
    is_header = bool(row_df.is_header.squeeze())
    navigable_parent = str(row_df.navigable_parent.squeeze())
    predicted_is_header = bool(row_df.predicted_is_header.squeeze())
    row_index = row_df.index.max()
else:
    is_header = True
    navigable_parent = 'Finished'
    predicted_is_header = True
    row_index = -1

In [138]:

# Prep the widgets themselves for first use
import ipywidgets
from ipywidgets import HTML, ToggleButtons, Label, Layout, interactive
from IPython.display import display

kwargs = {}
kwargs['index'] = Label(value=str(row_index), layout=Layout(visibility='hidden'))
kwargs['html_widget'] = HTML(
    value=str(navigable_parent),
    description=''
    )
kwargs['toggle_button'] = ToggleButtons(
    options=[True, False],
    disabled=False,
    button_style='',
    description=''
    )
ip = interactive(get_isheader, {'manual': True}, **kwargs)
for i, widget in enumerate(ip.children):
    if type(widget) == ipywidgets.widgets.widget_string.HTML:
        html_widget = ip.children[i]
    elif type(widget) == ipywidgets.widgets.widget_string.Label:
        hidden_label = ip.children[i]
    elif type(widget) == ipywidgets.widgets.ToggleButtons:
        toggle_button = ip.children[i]
    elif type(widget) == ipywidgets.widgets.widget_button.Button:
        submit_button = ip.children[i]
        submit_button.description = 'Submit'
    elif type(widget) == ipywidgets.widgets.widget_output.Output:
        output_widget = ip.children[i]

In [149]:

# Display the app
from ipywidgets import VBox, HBox

display(VBox([HBox([toggle_button, submit_button]), html_widget, output_widget]))

VBox(children=(HBox(children=(ToggleButtons(description='toggle_button', index=1, options=(True, False), value…

In [175]:

import re

html_regex = re.compile('<[^<]+?>')
mask_series = isheaders_df.navigable_parent.map(lambda x: html_regex.sub('', str(x)).endswith(':'))
isheaders_df.loc[mask_series, 'is_header'] = True
isheaders_df.loc[mask_series, 'predicted_is_header'] = True

In [171]:

mask_series = isheaders_df.navigable_parent.map(lambda x: (str(x).startswith('<div>') and str(x).endswith(':</div>')))
isheaders_df.loc[mask_series, 'is_header'] = True
isheaders_df.loc[mask_series, 'predicted_is_header'] = True

In [None]:

mask_series = isheaders_df.navigable_parent.map(lambda x: (str(x).startswith('<b>') and str(x).endswith('</b>') and (str(x).upper() == str(x))))
isheaders_df.loc[mask_series, 'is_header'] = True
isheaders_df.loc[mask_series, 'predicted_is_header'] = True

In [164]:

import re

html_regex = re.compile('<[^<]+?>')
mask_series = isheaders_df.navigable_parent.map(lambda x: (':' in html_regex.sub('', str(x))) and not html_regex.sub('', str(x)).endswith(':'))
isheaders_df.loc[mask_series, 'is_header'] = False
isheaders_df.loc[mask_series, 'predicted_is_header'] = False

In [176]:

for row_index, row_series in isheaders_df.iterrows():
    actual_is_header = bool(row_series.is_header)
    child_str = str(row_series.navigable_parent)
    cypher_str = f'''
        MERGE (np:NavigableParents {{navigable_parent: "{cu.escape_text(child_str)}"}})
        SET np.is_header = '{actual_is_header}';'''
    try:
        with cu.driver.session() as session:
            session.write_transaction(cu.do_cypher_tx, cypher_str)
    except Exception as e:
        print(f'{e.__class__} error: {str(e).strip()}')
        print(cypher_str)
        raise