In [1]:

# Toggle IPython's pretty printing of cell results (the recorded output shows this run turned it OFF).
# NOTE(review): %pprint flips the current state, so re-running this cell turns pretty printing back ON.
%pprint

Pretty printing has been turned OFF



---
# Load needed libraries and functions

In [2]:

%matplotlib inline
from datetime import datetime
from neo4j.exceptions import ServiceUnavailable
import humanize
import os
import sys
import time
import warnings
import winsound

# Suppress every warning category for the rest of the session
warnings.filterwarnings('ignore')

# Tone parameters; presumably consumed by winsound in a later cell -- TODO confirm
duration = 1000  # milliseconds
freq = 880  # Hz

# Make the modules in ../py importable from this notebook.
# Insert at 1, 0 is the script path (or '' in REPL).
# os.path is spelled out because this notebook never imports an `osp` alias;
# the previous `osp.join(...)` raised NameError on a fresh kernel.
py_folder = os.path.join('..', 'py')
if py_folder not in sys.path:
    sys.path.insert(1, py_folder)

In [3]:

# Start the stopwatch; t0 and t1 both hold this cell's start time
t1 = time.time()
t0 = t1

# Storage helper pointed at the project's data and saves folders
from storage import Storage
s = Storage(data_folder_path=os.path.abspath('../data'), saves_folder_path=os.path.abspath('../saves'))

# Web-scraping helper; its secrets_json attribute supplies the Neo4j connection settings
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(s=s, secrets_json_path=os.path.abspath('../data/secrets/jh_secrets.json'))
uri, user, password = (
    wsu.secrets_json['neo4j'][secret_key]
    for secret_key in ('connect_url', 'username', 'password')
)

# Header analysis helper
from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

# Cypher helper built from the connection settings above (no pre-built driver is passed in)
from cypher_utils import CypherUtilities
cu = CypherUtilities(uri=uri, user=user, password=password, driver=None, s=s, ha=ha)

# Is-header SGD classifier
from is_header_sgd_classifier import IsHeaderSgdClassifier
ihu = IsHeaderSgdClassifier(ha=ha, cu=cu, verbose=False)

# Ask the server for its agent string as a connectivity check:
# an unavailable service is fatal (re-raised); any other error is only reported
try:
    version_str = cu.driver.get_server_info().agent
    print(f'======== {version_str} ========')
except ServiceUnavailable as e:
    print('You need to start Neo4j as a console')
    raise
except Exception as e:
    print(f'{e.__class__}: {str(e).strip()}')

# Header categories helper
from hc_utils import HeaderCategories
hc = HeaderCategories(cu=cu, verbose=False)

# Logistic regression helper
from lr_utils import LrUtilities
lru = LrUtilities(ha=ha, cu=cu, hc=hc, verbose=False)

# The three section classifier helpers (LR, SGD and CRF)
from section_classifier_utils import (
    SectionLRClassifierUtilities,
    SectionSGDClassifierUtilities,
    SectionCRFClassifierUtilities,
)
slrcu = SectionLRClassifierUtilities(ha=ha, cu=cu, verbose=False)
ssgdcu = SectionSGDClassifierUtilities(ha=ha, cu=cu, verbose=False)
scrfcu = SectionCRFClassifierUtilities(cu=cu, ha=ha, verbose=False)

# CRF helper, wired to the helpers created above (the only verbose one)
from crf_utils import CrfUtilities
crf = CrfUtilities(
    ha=ha,
    hc=hc,
    cu=cu,
    lru=lru,
    slrcu=slrcu,
    scrfcu=scrfcu,
    ssgdcu=ssgdcu,
    verbose=True,
)

# Section helper, wired to the scraper and the classifiers
from section_utils import SectionUtilities
su = SectionUtilities(
    wsu=wsu,
    ihu=ihu,
    hc=hc,
    crf=crf,
    slrcu=slrcu,
    scrfcu=scrfcu,
    ssgdcu=ssgdcu,
    verbose=False,
)

# Report how long the whole setup took
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
print(f'Utility libraries created in {duration_str}')

Utility libraries created in 4 seconds


In [26]:

# Compare the feature dictionary produced by the cypher utilities (cu) with the
# one produced by the header categories (hc) for the same child string, and
# render both side by side as an HTML table.
from html import escape
from IPython.display import HTML

test_child_strs_list = ["<oip>Role Is Sold: Yes</oip>"]
test_child_tags_list = cu.get_child_tags_list(test_child_strs_list)
test_is_header_list = cu.get_is_header_list(test_child_strs_list)
cu_dict = cu.get_feature_dict_list(test_child_tags_list, test_child_strs_list)[0]
hc_dict = hc.get_feature_dict_list(test_child_tags_list, test_is_header_list, test_child_strs_list)[0]

sorted_cu_dict = dict(sorted(cu_dict.items()))
sorted_hc_dict = dict(sorted(hc_dict.items()))

# Align rows on the union of keys rather than zipping the two sorted key lists:
# zip pairs keys by position, so a key present in only one dictionary would
# shift every later row out of alignment and silently drop the trailing rows.
rows_list = []
for feature_key in sorted(set(sorted_cu_dict) | set(sorted_hc_dict)):
    cells_list = []
    for feature_dict in (sorted_cu_dict, sorted_hc_dict):
        if feature_key in feature_dict:

            # Escape keys and values so markup inside a feature value (the
            # test string itself is tagged) is shown literally, not rendered
            cells_list.extend([escape(str(feature_key)), escape(str(feature_dict[feature_key]))])
        else:

            # Key is missing from this dictionary: leave its two cells blank
            cells_list.extend(['', ''])
    rows_list.append('<tr><td>' + '</td><td>'.join(cells_list) + '</td></tr>')

# The last expression is the cell's rich output
HTML('<table><tr><th>CU Key</th><th>CU Value</th><th>HC Key</th><th>HC Value</th></tr>' + ''.join(rows_list) + '</table>')

CU Key,CU Value,HC Key,HC Value
child_str,Role Is Sold: Yes,child_str,Role Is Sold: Yes
initial_tag,oip,initial_tag,oip
is_corporate_scope,False,is_corporate_scope,
is_educational_requirement,False,is_educational_requirement,
is_header,False,is_header,False
is_interview_procedure,True,is_interview_procedure,
is_job_duration,False,is_job_duration,
is_job_title,False,is_job_title,
is_legal_notification,False,is_legal_notification,
is_minimum_qualification,False,is_minimum_qualification,
