In [1]:

%pprint

Pretty printing has been turned OFF



---
# Load needed libraries and functions

In [2]:

from datetime import datetime
import humanize
import os
import re
import shutil
import sys
import time
import warnings
import winsound

# Suppress every warning for the rest of the kernel session
warnings.filterwarnings('ignore')
# Tone settings for the winsound.Beep(freq, duration) call made later in this notebook
duration = 1000  # milliseconds
freq = 880  # Hz

# Put the sibling ../py folder on the import path (presumably where the project
# modules imported below live -- TODO confirm).
# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

In [4]:

# Start the clock so the total setup time can be reported at the end of the cell
t0 = time.time()

# Create the storage helper, pointed at the project's data and saves folders
from storage import Storage
s = Storage(
    data_folder_path=os.path.abspath('../data'),
    saves_folder_path=os.path.abspath('../saves')
)

# Header analysis helper; it is handed the storage object
from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

# Web scraping helper; it is given the path of the secrets JSON file and
# exposes the parsed content as wsu.secrets_json (read just below)
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(
    s=s,
    secrets_json_path=os.path.abspath('../data/secrets/jh_secrets.json')
)
# Neo4j connection settings come from the secrets file, not from literals in the notebook
uri = wsu.secrets_json['neo4j']['connect_url']
user =  wsu.secrets_json['neo4j']['username']
password = wsu.secrets_json['neo4j']['password']

# Get the neo4j object
# NOTE(review): driver=None presumably makes CypherUtilities open its own driver
# from uri/user/password -- confirm in cypher_utils
from cypher_utils import CypherUtilities
cu = CypherUtilities(
    uri=uri, user=user, password=password, driver=None, s=s, ha=ha
)

# The remaining helpers are built in dependency order: each one receives
# objects created earlier in this cell
from hc_utils import HeaderCategories
hc = HeaderCategories(cu=cu, verbose=False)

from is_header_sgd_classifier import IsHeaderSgdClassifier
ihu = IsHeaderSgdClassifier(ha=ha, cu=cu, verbose=False)

from lr_utils import LrUtilities
lru = LrUtilities(ha=ha, cu=cu, hc=hc, verbose=False)

# This is the only helper created with verbose=True
from crf_utils import CrfUtilities
crf = CrfUtilities(ha=ha, hc=hc, cu=cu, lru=lru, verbose=True)

from section_classifier_utils import SectionLRClassifierUtilities, SectionCRFClassifierUtilities
slrcu = SectionLRClassifierUtilities(ha=ha, cu=cu, verbose=False)
scrfcu = SectionCRFClassifierUtilities(cu=cu, ha=ha, verbose=False)

# NOTE(review): ihu and crf are passed as None here even though both objects
# were created above -- confirm that this is intentional
from section_utils import SectionUtilities
su = SectionUtilities(wsu=wsu, ihu=None, hc=hc, crf=None, slrcu=slrcu, verbose=False)

# Report how long the setup took and when this cell was last run
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
print(f'Utility libraries created in {duration_str}')
print(f'Last run on {datetime.now()}')

Utility libraries created in 3 seconds
Last run on 2023-03-22 17:02:30.502738


In [7]:

# Fit the is-header classifier and report the wall-clock time it took
t1 = time.time()
ihu.build_pos_stochastic_gradient_descent_elements(verbose=True)
elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(
    elapsed_seconds, minimum_unit='seconds', format='%0.0f'
)
print(f'Is-header classifier trained in {duration_str}')

I have 48,226 hand-labeled header htmls prepared
7 iterations seen during training fit for a total of 96,452 records trained
Is-header classifier trained in 9 seconds


In [8]:

# Build the parts-of-speech logistic regression elements, unless slrcu already has
# them or crf.is_flask_running() reports True.
# Goal: keep the total creation time to less than one hour by adjusting the
# sampling strategy limit (currently None).
# Notes that appear to be pasted from an earlier run:
#   I have 47,946 labeled parts of speech in here
#   pos_lr_predict_single is now available
#   Parts-of-speech logistic regression elements built in 2 hours, 18 minutes and 55 seconds
t1 = time.time()
if not hasattr(slrcu, 'pos_predict_percent_fit_dict') and not crf.is_flask_running():
    slrcu.build_pos_logistic_regression_elements(
        sampling_strategy_limit=None, verbose=True
    )

# Re-check the attribute, since the build above may just have created it
availability_message = (
    'predict_single is now available'
    if hasattr(slrcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)
print(availability_message)

elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(
    elapsed_seconds, minimum_unit='seconds', format='%0.0f'
)
print(f'Parts-of-speech logistic regression elements built in {duration_str}')

I have 48,758 labeled parts of speech in here
predict_single is now available
Parts-of-speech logistic regression elements built in 1 hour, 58 minutes and 56 seconds


In [9]:

# Build the parts-of-speech CRF elements, unless scrfcu already has them or
# crf.is_flask_running() reports True.
# Notes that appear to be pasted from an earlier run:
#   I have 46,569 labeled parts of speech in here
#   Parts-of-speech CRF elements built in 20 minutes and 35 seconds
t1 = time.time()
if not hasattr(scrfcu, 'pos_predict_percent_fit_dict') and not crf.is_flask_running():
    scrfcu.build_pos_conditional_random_field_elements(verbose=True)

# Re-check the attribute, since the build above may just have created it
availability_message = (
    'predict_single is now available'
    if hasattr(scrfcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)
print(availability_message)

elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(
    elapsed_seconds, minimum_unit='seconds', format='%0.0f'
)
print(f'Parts-of-speech CRF elements built in {duration_str}')

I have 48,758 labeled parts of speech in here
predict_single is now available
Parts-of-speech CRF elements built in 26 minutes and 21 seconds



----

In [None]:

# Run this if you haven't already created the file

# Resolve the source HTML relative to the notebook folder (the same '../data'
# convention this notebook uses when it creates the Storage object) instead of
# a user-specific absolute path, so the cell also works on other checkouts
file_path = os.path.abspath('../data/html/other_email.html')

# Turn the posting's title lines into a file name: every run of
# non-alphanumeric characters becomes a single underscore
file_name = re.sub(r'[^A-Za-z0-9]+', ' ', '''
    Senior Data Scientist (NLP)
    Storm3 United States Remote
    ''').strip().replace(' ', '_') + '.html'
new_file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)

# A file of that name already exists: prefix a timestamp to get a unique name
if os.path.isfile(new_file_path):
    file_name = datetime.now().strftime('%Y%m%d%H%M%S%f') + f'_{file_name}'
    new_file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)

if not os.path.isfile(new_file_path):

    # Parse and validate the source BEFORE copying it, so that a page without
    # the expected div does not leave an orphaned copy in the saves folder
    page_soup = wsu.get_page_soup(file_path)
    div_soups_list = page_soup.find_all(name='div', id='jobDescriptionText')
    if not div_soups_list:
        raise ValueError(f'No <div id="jobDescriptionText"> found in {file_path}')
    div_soup = div_soups_list[0]
    child_strs_list = ha.get_navigable_children(div_soup, [])

    # Copy to the path computed above, then register the file and its child strings
    shutil.copy(file_path, new_file_path)
    cu.ensure_filename(file_name, verbose=False)
    cu.populate_from_child_strings(child_strs_list, file_name, verbose=False)
    print(file_name)

In [None]:

# Add the posting URL to the file name only if you have one
posting_url = ''


def set_posting_url_tx(tx, file_name, posting_url):
    """
    Set the posting_url property on the FileNames node with the given file name.

    The values are sent as query parameters instead of being interpolated into
    the Cypher text, so quotes or backslashes in the file name or URL can
    neither break the statement nor inject Cypher.

    Parameters:
        tx: the transaction object supplied by session.write_transaction.
        file_name: value matched against FileNames.file_name.
        posting_url: value stored in fn.posting_url.

    Returns:
        A list with one dictionary per returned record.
    """
    cypher_str = '''
        MATCH (fn:FileNames {file_name: $file_name})
        SET fn.posting_url = $posting_url
        RETURN fn;'''
    results_list = tx.run(
        query=cypher_str,
        parameters={'file_name': file_name, 'posting_url': posting_url}
    )

    return [dict(record.items()) for record in results_list]


if posting_url:
    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(
            set_posting_url_tx, file_name=file_name, posting_url=posting_url
        )
    display(row_objs_list)

In [13]:

# Derive the tags for the child strings, then the per-string feature dictionaries
child_tags_list = ha.get_child_tags_list(child_strs_list)
feature_dict_list = cu.get_feature_dict_list(child_tags_list, child_strs_list)

# Convert every feature dictionary into a feature tuple, handing over both
# parts-of-speech predictors (logistic regression and CRF)
feature_tuple_list = [
    hc.get_feature_tuple(
        feature_dict,
        pos_lr_predict_single=slrcu.predict_single,
        pos_crf_predict_single=scrfcu.predict_single
    )
    for feature_dict in feature_dict_list
]

# Label the whole sequence with the CRF model
sequence_features = crf.sent2features(feature_tuple_list)
crf_list = crf.CRF.predict_single(sequence_features)

In [116]:

# Accumulate the parts-of-speech list, one child string at a time
db_pos_list = []
for navigable_parent in child_strs_list:
    db_pos_list = cu.append_parts_of_speech_list(
        navigable_parent, pos_list=db_pos_list
    )

# Visualize the section and keep both returned lists for the cells below
visualization_results = su.visualize_basic_quals_section(
    crf_list, child_strs_list, db_pos_list=db_pos_list, verbose=True
)
pos_list, indices_list = visualization_results

# Audible signal that the cell has finished
winsound.Beep(freq, duration)

['O-JT', 'O-JT', 'O-OL', 'O-SP', 'O-TS', 'O-TS', 'H-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'H-RQ', 'H-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'H-PQ', 'H-PQ', 'O-PQ', 'O-PQ', 'O-PQ', 'O-PQ', 'O-PQ', 'O-PQ', 'O-PQ', 'H-SP', 'O-SP', 'O-SP', 'O-SP', 'O-SP', 'O-SP', 'O-SP', 'O-IP', 'O-IP', 'O-CS', 'O-CS']
[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]


[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]



----

In [None]:
# Deliberate stop so that "Run All" halts at this cell (the cells below look
# like manual, one-at-a-time hand-labelling steps). A bare `raise` with no active
# exception also produces a RuntimeError, but with the confusing message
# "No active exception to reraise"; raise the same type with an explicit reason.
raise RuntimeError('Stopping here on purpose: run the cells below manually, one at a time')

In [114]:

# Display the context of an individual child string
idx = 46
print(indices_list)

# Look up the string and its parts-of-speech symbol at the chosen index
child_str = child_strs_list[idx]
pos_symbol = pos_list[idx]

# Show the stored label first, but only when this string already has one
basic_quals_dict = s.load_object('basic_quals_dict')
existing_label_str = ''
if child_str in basic_quals_dict:
    existing_label_str = str(basic_quals_dict[child_str]) + '\n'
print(existing_label_str, end='')

print(f'{idx} {pos_symbol}) {child_str}')

[3, 7, 11, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 42]
46 H-RQ) el.


In [96]:

# Hand-label this particular child string in the quals dictionary
# (the label 0 presumably means "not a basic qualification" -- TODO confirm)
basic_quals_dict = s.load_object('basic_quals_dict')
basic_quals_dict[child_str] = 0

# Persist the updated dictionary, then echo the label that was stored
s.store_objects(basic_quals_dict=basic_quals_dict)
stored_label = basic_quals_dict[child_str]
print(f'"{child_str}" in basic_quals_dict: {stored_label}')

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\basic_quals_dict.pkl
"<li>Presented your work at a reputable scientific or technical conference</li>" in basic_quals_dict: 0


In [115]:

def do_cypher_tx(tx, navigable_parent, verbose=False):
    """
    Flag the matching NavigableParents node(s) as corporate scope and nothing else.

    Sets is_corporate_scope to 'True' and the other thirteen is_* properties
    listed in the statement to 'False', then returns whatever
    cu.return_everything_str selects.

    Parameters:
        tx: transaction object; its run(query=..., parameters=...) is called once.
        navigable_parent: value bound to the $navigable_parent query parameter.
        verbose: accepted but not used by this function.

    Returns:
        A list with one dictionary per returned record.
    """
    match_and_set_clause = """MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
        SET
            np.is_header = 'False',
            np.is_task_scope = 'False',
            np.is_minimum_qualification = 'False',
            np.is_preferred_qualification = 'False',
            np.is_educational_requirement = 'False',
            np.is_legal_notification = 'False',
            np.is_other = 'False',
            np.is_corporate_scope = 'True',
            np.is_job_title = 'False',
            np.is_office_location = 'False',
            np.is_job_duration = 'False',
            np.is_supplemental_pay = 'False',
            np.is_interview_procedure = 'False',
            np.is_posting_date = 'False'
        """

    # The RETURN clause is shared project-wide through the cu object
    query = match_and_set_clause + cu.return_everything_str + ';'
    query_parameters = {'navigable_parent': navigable_parent}

    rows = []
    for record in tx.run(query=query, parameters=query_parameters):
        rows.append(dict(record.items()))

    return rows
# Apply the update inside a write transaction
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str)

# Retrain only when a node was actually matched; indexing [0] on an empty
# result would otherwise stop the cell with an opaque IndexError
if row_objs_list:
    updated_row = row_objs_list[0]
    ihu.retrain_classifier(
        updated_row['navigable_parent'], updated_row['is_header'], verbose=True
    )
else:
    print(f'No NavigableParents node matched {child_str!r}; nothing was updated or retrained')

# Last expression: display the returned rows as the cell output
row_objs_list

1 iterations seen during updating fit for a total of 96,490 records trained


[{'navigable_parent': 'el.', 'is_header': 'False', 'is_task_scope': 'False', 'is_qualification': None, 'is_minimum_qualification': 'False', 'is_preferred_qualification': 'False', 'is_legal_notification': 'False', 'is_job_title': 'False', 'is_office_location': 'False', 'is_job_duration': 'False', 'is_supplemental_pay': 'False', 'is_educational_requirement': 'False', 'is_interview_procedure': 'False', 'is_corporate_scope': 'True', 'is_posting_date': 'False', 'is_other': 'False'}]

In [99]:

# Show what's in the database already for this html string
def do_cypher_tx(tx, navigable_parent, verbose=False):
    """
    Fetch the stored properties of the matching NavigableParents node(s).

    Parameters:
        tx: transaction object; its run(query=..., parameters=...) is called once.
        navigable_parent: value bound to the $navigable_parent query parameter.
        verbose: accepted but not used by this function.

    Returns:
        A list with one dictionary per returned record.
    """
    match_clause = '''
        MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
        '''

    # The RETURN clause is shared project-wide through the cu object
    query = match_clause + cu.return_everything_str + ';'
    query_parameters = {'navigable_parent': navigable_parent}

    rows = []
    for record in tx.run(query=query, parameters=query_parameters):
        rows.append(dict(record.items()))

    return rows
# The query only reads (MATCH ... RETURN), so run it through a read transaction
# rather than a write transaction
with cu.driver.session() as session:
    row_objs_list = session.read_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)

# Last expression: display the returned rows as the cell output
row_objs_list

[{'navigable_parent': '<p>Benefits:</p>', 'is_header': 'True', 'is_task_scope': 'False', 'is_qualification': 'False', 'is_minimum_qualification': 'False', 'is_preferred_qualification': 'False', 'is_legal_notification': 'False', 'is_job_title': 'False', 'is_office_location': 'False', 'is_job_duration': 'False', 'is_supplemental_pay': 'True', 'is_educational_requirement': 'False', 'is_interview_procedure': 'False', 'is_corporate_scope': 'False', 'is_posting_date': 'False', 'is_other': 'False'}]

In [None]:

# Remove this particular child string from the quals dictionary
basic_quals_dict = s.load_object('basic_quals_dict')
child_str = child_strs_list[idx]

# Delete and re-save only when the string is actually present, so that
# re-running the cell (or pointing it at an unlabeled string) does not raise
# a KeyError or rewrite an unchanged dictionary
if child_str in basic_quals_dict:
    del basic_quals_dict[child_str]
    s.store_objects(basic_quals_dict=basic_quals_dict)
print(f'"{child_str}" in basic_quals_dict: {child_str in basic_quals_dict}')


----