In [1]:

%pprint
%matplotlib inline
import os
import re
import sys

# Put the project's ../py folder on the module search path before importing jobpostlib.
# os.path is used directly here because the osp alias is only bound by the jobpostlib
# import below; referencing it first raises a NameError on a fresh kernel.
# NOTE(review): assumes jobpostlib's osp is os.path -- confirm.
if (os.path.join('..', 'py') not in sys.path): sys.path.insert(1, os.path.join('..', 'py'))
from jobpostlib import (crf, cu, datetime, duration, hau, hc, humanize, ihu, lru, nu, osp, scrfcu, slrcu, ssgdcu, su, t0, time, wsu, speech_engine)
from pandas import DataFrame

Pretty printing has been turned OFF
Utility libraries created in 5 seconds



---
# Load needed libraries and functions

In [2]:

# Build the slrcu parts-of-speech logistic regression elements, but only when
# slrcu does not already carry a pos_predict_percent_fit_dict attribute
# (a full build normally takes 1 hour, 27 minutes and 21 seconds)
t1 = time.time()
if not hasattr(slrcu, 'pos_predict_percent_fit_dict'):
    slrcu.build_pos_logistic_regression_elements(verbose=True, sampling_strategy_limit=None)

# Report whether the attribute is present after the (possible) build
print(
    'predict_single is available'
    if hasattr(slrcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)

# Print the elapsed wall-clock time for this cell
duration_str = humanize.precisedelta(time.time() - t1, format='%0.0f', minimum_unit='seconds')
speech_str = f'Parts-of-speech logistic regression elements built in {duration_str}'
print(speech_str)

I have 48,129 labeled parts of speech in here


Train the POS Classifiers: 100%|███████████████| 25/25 [00:00<00:00, 259.02it/s]

predict_single is available
Parts-of-speech logistic regression elements built in 7 seconds





In [3]:

# Build the scrfcu parts-of-speech conditional random field elements, but only
# when scrfcu does not already carry a pos_symbol_crf attribute
t1 = time.time()
if not hasattr(scrfcu, 'pos_symbol_crf'):
    scrfcu.build_pos_conditional_random_field_elements(verbose=True)

# NOTE(review): the build guard above tests pos_symbol_crf while this report tests
# pos_predict_percent_fit_dict -- confirm both attributes are meant to be checked
print(
    'predict_single is now available'
    if hasattr(scrfcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)

# Print the elapsed wall-clock time for this cell
duration_str = humanize.precisedelta(time.time() - t1, format='%0.0f', minimum_unit='seconds')
speech_str = f'Parts-of-speech conditional random field elements built in {duration_str}'
print(speech_str)

predict_single is now available
Parts-of-speech conditional random field elements built in 2 seconds


In [4]:

# Build the crf parts-of-speech classifier, but only when crf has no CRF attribute yet
# (training normally takes 15 hours, 42 minutes and 41 seconds)
t1 = time.time()
if not hasattr(crf, 'CRF'):
    crf.build_pos_conditional_random_field_elements(verbose=True)

# Report whether the CRF attribute is present after the (possible) build
print(
    'predict_single is now available'
    if hasattr(crf, 'CRF')
    else 'predict_single is not available'
)

# Print the elapsed wall-clock time for this cell
duration_str = humanize.precisedelta(time.time() - t1, format='%0.0f', minimum_unit='seconds')
speech_str = f'POS classifier trained in {duration_str}'
print(speech_str)

predict_single is now available
POS classifier trained in 0 seconds


In [5]:

# Build the ssgdcu parts-of-speech stochastic gradient descent elements, but only
# when ssgdcu does not already carry a pos_predict_percent_fit_dict attribute
t1 = time.time()
if not hasattr(ssgdcu, 'pos_predict_percent_fit_dict'):
    ssgdcu.build_pos_stochastic_gradient_descent_elements(verbose=True, sampling_strategy_limit=None)

# Report whether the attribute is present after the (possible) build
print(
    'predict_single is now available'
    if hasattr(ssgdcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)

# Print the elapsed wall-clock time for this cell
duration_str = humanize.precisedelta(time.time() - t1, format='%0.0f', minimum_unit='seconds')
speech_str = f'Parts-of-speech stochastic gradient descent elements built in {duration_str}'
print(speech_str)

I have 48,129 labeled parts of speech in here
predict_single is now available
Parts-of-speech stochastic gradient descent elements built in 10 seconds


In [6]:

# Train the is-header classifier unconditionally (no hasattr guard in this cell)
t1 = time.time()
ihu.build_pos_stochastic_gradient_descent_elements(verbose=True)

# Print the elapsed wall-clock time for this cell
duration_str = humanize.precisedelta(time.time() - t1, format='%0.0f', minimum_unit='seconds')
speech_str = f'Is-header classifier trained in {duration_str}'
print(speech_str)

I have 53,652 hand-labeled header htmls prepared
7 iterations seen during training fit for a total of 53,652 records trained
Is-header classifier trained in 8 seconds


In [7]:

# Build the lru is-qualified logistic regression elements when either the
# ISQUALIFIED_LR or the ISQUALIFIED_CV attribute is missing
t1 = time.time()
if not hasattr(lru, 'ISQUALIFIED_LR') or not hasattr(lru, 'ISQUALIFIED_CV'):
    lru.build_isqualified_logistic_regression_elements(verbose=True, sampling_strategy_limit=None)

# Print the elapsed wall-clock time for this cell
duration_str = humanize.precisedelta(time.time() - t1, format='%0.0f', minimum_unit='seconds')
speech_str = f'Is-qualified LR elements built in {duration_str}'
print(speech_str)

I have 532,546 is-qualified vocabulary tokens in here
Is-qualified LR elements built in 10 seconds



----

In [8]:

# Run this if you haven't already created the file
import re
import shutil

file_name = ''
if file_name: file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)
else:
    file_path = os.path.abspath('../data/html/other_email.html')
    command_str = fr'"C:\Program Files\Notepad++\notepad++.exe" {file_path}'
    !{command_str}

In [9]:

# Clean up and prettify the HTML at file_path
wsu.clean_job_posting(file_path)

# Read the cleaned file back in; other_email.html must have been hand-edited
# beforehand so that it contains the job description container
with open(file_path, mode='r', encoding=nu.encoding_type) as f: html_text = f.read()

# Stop here if the required container div is missing
assert '<div id="jobDescriptionText"' in html_text, 'Fix the HTML to include <div id="jobDescriptionText" and run this cell again'

In [10]:

# When no file name was supplied, derive one from the posting title below and
# copy the edited HTML into cu.SAVES_HTML_FOLDER under that name
if not file_name:

    # Every run of non-alphanumeric characters in the title becomes a single underscore
    file_name = '_'.join(re.sub(r'[^A-Za-z0-9]+', ' ', '''
        Electrical Engineer / Parts Engineering, Chelmsford, MA
        ''').split()) + '.html'
    new_file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)

    # A name that is already taken gets a timestamp prefix so the existing file is kept
    if os.path.isfile(new_file_path):
        file_name = '_'.join((datetime.now().strftime('%Y%m%d%H%M%S%f'), file_name))
        new_file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)
    print(new_file_path)

    # Copy only when nothing exists at the destination
    if not os.path.isfile(new_file_path):
        shutil.copy(file_path, new_file_path)
        print(file_name)

# Parse the source HTML and take the first div with id jobDescriptionText
page_soup = wsu.get_page_soup(file_path)
div_soup = page_soup.find_all(name='div', id='jobDescriptionText')[0]

# Collect the div's navigable children and hand them to cu under this file name
child_strs_list = hau.get_navigable_children(div_soup, [])
cu.ensure_filename(file_name, verbose=False)
cu.populate_from_child_strings(child_strs_list, file_name, verbose=False)

In [11]:

# Attach the posting URL to the FileNames node for file_name; leave posting_url
# empty to skip this cell's database write entirely
posting_url = ''
if posting_url:

    # NOTE(review): file_name and posting_url are interpolated straight into the query
    # text, so a double quote in either value would break the statement -- consider
    # query parameters if cu.do_cypher_tx accepts them
    cypher_str = f'''
        MATCH (fn:FileNames {{file_name: "{file_name}"}})
        SET fn.posting_url = "{posting_url}"
        RETURN fn;'''

    # Run the statement in a write transaction and show the returned rows
    with cu.driver.session() as session: row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
    display(row_objs_list)

[{'fn': <Node element_id='4:b43cc890-b807-4db1-9204-e49abf75c484:1089966' labels=frozenset({'FileNames'}) properties={'file_name': 'pj7e4x_ai_engineer_Tellius_Hybrid_San_Francisco_CA_USA_SquarePegHires_com.html', 'posting_url': 'https://www.squarepeghires.com/jobs/pj7e4x/ai-engineer?utm_source=apollo&utm_medium=email&utm_campaign=general'}>}]