In [1]:

# %pprint toggles IPython pretty printing (the recorded run reports it turned OFF)
%pprint
# Draw matplotlib figures inline in the notebook output
%matplotlib inline
import sys
# Put the sibling ../py folder on the import path (presumably where jobpostlib lives -- confirm)
if ('../py' not in sys.path): sys.path.insert(1, '../py')
# The short names below (cu, lru, slrcu, ...) are the project utilities used throughout this notebook
from jobpostlib import (crf, cu, datetime, duration, hau, hc, humanize, ihu, lru, nu, osp, scrfcu, slrcu, ssgdcu, su, t0, time, wsu, speech_engine)
from pandas import DataFrame
import re
import os

Pretty printing has been turned OFF
Utility libraries created in 5 seconds



---
# Load needed libraries and functions

In [8]:

# Make sure the slrcu has its parts-of-speech logistic regression elements, building them only if absent
# (a full build normally takes about 1 hour, 27 minutes and 21 seconds)
t1 = time.time()
if not hasattr(slrcu, 'pos_predict_percent_fit_dict'):
    slrcu.build_pos_logistic_regression_elements(sampling_strategy_limit=None, verbose=True)

# Report whether the elements needed by predict_single exist after the optional build
print(
    'predict_single is available'
    if hasattr(slrcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)

# Report the elapsed time in whole seconds
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
speech_str = f'Parts-of-speech logistic regression elements built in {duration_str}'
print(speech_str)

I have 49,102 labeled parts of speech in here
predict_single is available
Parts-of-speech logistic regression elements built in 7 seconds


In [2]:

# Make sure the scrfcu has its parts-of-speech conditional random field elements, building them only if absent
# NOTE(review): the build guard tests 'pos_symbol_crf' while the availability message tests
# 'pos_predict_percent_fit_dict' -- confirm that checking two different attributes is intended
t1 = time.time()
if not hasattr(scrfcu, 'pos_symbol_crf'):
    scrfcu.build_pos_conditional_random_field_elements(verbose=True)

# Report availability after the optional build
print(
    'predict_single is now available'
    if hasattr(scrfcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)

# Report the elapsed time in whole seconds
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
speech_str = f'Parts-of-speech conditional random field elements built in {duration_str}'
print(speech_str)

predict_single is now available
Parts-of-speech conditional random field elements built in 1 second


In [3]:

# Make sure the crf has its parts-of-speech classifier, training it only if absent
# (training normally takes about 15 hours, 42 minutes and 41 seconds)
t1 = time.time()
if not hasattr(crf, 'CRF'):
    crf.build_pos_conditional_random_field_elements(verbose=True)

# Report availability after the optional build
print(
    'predict_single is now available'
    if hasattr(crf, 'CRF')
    else 'predict_single is not available'
)

# Report the elapsed time in whole seconds
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
speech_str = f'POS classifier trained in {duration_str}'
print(speech_str)

predict_single is now available
POS classifier trained in 0 seconds


In [4]:

# Make sure the ssgdcu has its parts-of-speech stochastic gradient descent elements, building them only if absent
t1 = time.time()
if not hasattr(ssgdcu, 'pos_predict_percent_fit_dict'):
    ssgdcu.build_pos_stochastic_gradient_descent_elements(sampling_strategy_limit=None, verbose=True)

# Report availability after the optional build
print(
    'predict_single is now available'
    if hasattr(ssgdcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)

# Report the elapsed time in whole seconds
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
speech_str = f'Parts-of-speech stochastic gradient descent elements built in {duration_str}'
print(speech_str)

I have 49,102 labeled parts of speech in here
predict_single is now available
Parts-of-speech stochastic gradient descent elements built in 9 seconds


In [5]:

# Train the is-header classifier (this cell always rebuilds; there is no already-built guard)
t1 = time.time()
ihu.build_pos_stochastic_gradient_descent_elements(verbose=True)

# Report the elapsed time in whole seconds
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
speech_str = f'Is-header classifier trained in {duration_str}'
print(speech_str)

I have 50,395 hand-labeled header htmls prepared
7 iterations seen during training fit for a total of 50,395 records trained
Is-header classifier trained in 6 seconds


In [20]:

# Make sure the lru has its is-qualified classifier; build it if either element is missing
t1 = time.time()
if not hasattr(lru, 'ISQUALIFIED_LR') or not hasattr(lru, 'ISQUALIFIED_CV'):
    lru.build_isqualified_logistic_regression_elements(sampling_strategy_limit=None, verbose=True)

# Report the elapsed time in whole seconds
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
speech_str = f'Is-qualified LR elements built in {duration_str}'
print(speech_str)

I have 424,879 is-qualified vocabulary tokens in here
Is-qualified LR elements built in 6 seconds



----

In [6]:

# Run this if you haven't already created the file, but need to edit other_email.html first
import re
import shutil

file_name = ''
if file_name: file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)
else:
    file_path = os.path.abspath('../data/html/other_email.html')
    command_str = fr'"C:\Program Files\Notepad++\notepad++.exe" {file_path}'
    print(command_str)
    !{command_str}
    file_name = re.sub(r'[^A-Za-z0-9]+', ' ', '''
        Machine Learning Engineer
        Emplenter Solutions
        ''').strip().replace(' ', '_') + '.html'
    new_file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)
    if os.path.isfile(new_file_path):
        file_name = datetime.now().strftime('%Y%m%d%H%M%S%f') + f'_{file_name}'
        new_file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)
    if not os.path.isfile(new_file_path):
        shutil.copy(file_path, os.path.join(cu.SAVES_HTML_FOLDER, file_name))
        print(file_name)
page_soup = wsu.get_page_soup(file_path)
div_soup = page_soup.find_all(name='div', id='jobDescriptionText')[0]
child_strs_list = hau.get_navigable_children(div_soup, [])
cu.ensure_filename(file_name, verbose=False)
cu.populate_from_child_strings(child_strs_list, file_name, verbose=False)

"C:\Program Files\Notepad++\notepad++.exe" C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\data\html\other_email.html
Machine_Learning_Engineer_Emplenter_Solutions.html


In [16]:

# Add the posting URL to the file name only if you have one
posting_url = ''
if posting_url:

    # Pass the values as query parameters instead of formatting them into the
    # Cypher text, so quotes or backslashes in either value cannot break the query
    cypher_str = '''
        MATCH (fn:FileNames {file_name: $file_name})
        SET fn.posting_url = $posting_url
        RETURN fn;'''

    def set_posting_url_tx(tx, file_name, posting_url):
        """Run the parameterized SET query in a transaction and return one dict per record."""
        results_list = tx.run(
            query=cypher_str,
            parameters={'file_name': file_name, 'posting_url': posting_url}
        )
        return [dict(record.items()) for record in results_list]

    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(
            set_posting_url_tx, file_name=file_name, posting_url=posting_url
        )
    display(row_objs_list)


---
# Training

In [21]:

# Re-run this cell whenever the qualification dictionary has been changed,
# whether further down in this notebook or in another one
t1 = time.time()

# Adjust the sampling strategy limit to keep the total retraining time under two minutes
lru.sync_basic_quals_dict(sampling_strategy_limit=None, verbose=False)
lru.retrain_isqualified_classifier(verbose=True)

# Report the elapsed time in whole seconds
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
speech_str = f'Is-qualified classifer retrained in {duration_str}'
print(speech_str)

I have 18,695 hand-labeled qualification strings in here
I have 595,061 is-qualified vocabulary tokens in here
Is-qualified classifer retrained in 1 minute and 46 seconds


In [22]:

# Rebuild the feature tuples for the current posting's child strings and
# run them through the CRF to get one predicted label per child string
t1 = time.time()
print(file_name)
child_tags_list = hau.get_child_tags_list(child_strs_list)
feature_dict_list = cu.get_feature_dict_list(child_tags_list, child_strs_list)

# Each feature dictionary is converted using the three parts-of-speech predictors
feature_tuple_list = []
for feature_dict in feature_dict_list:
    feature_tuple = hc.get_feature_tuple(
        feature_dict,
        pos_lr_predict_single=slrcu.predict_single,
        pos_crf_predict_single=scrfcu.predict_single,
        pos_sgd_predict_single=ssgdcu.predict_single
    )
    feature_tuple_list.append(feature_tuple)
crf_list = crf.CRF.predict_single(crf.sent2features(feature_tuple_list))

# Report the elapsed time in whole seconds
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
speech_str = f'CRF and child strings list recreated in {duration_str}'
print(speech_str)

Machine_Learning_Engineer_Emplenter_Solutions.html
CRF and child strings list recreated in 28 seconds


In [23]:

# Collect the parts of speech for every child string via cu, then
# visualize the basic qualifications section alongside the CRF labels
t1 = time.time()
db_pos_list = []
for navigable_parent in child_strs_list:
    db_pos_list = cu.append_parts_of_speech_list(navigable_parent, pos_list=db_pos_list)
pos_list, indices_list = su.visualize_basic_quals_section(
    crf_list, child_strs_list, db_pos_list=db_pos_list, verbose=True
)

# Report the elapsed time in whole seconds, both printed and spoken
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
speech_str = f'Parts-of-speech displayed in {duration_str}'
print(speech_str)
speech_engine.say(speech_str)
speech_engine.runAndWait()

['H-JT', 'O-JT', 'H-OL', 'O-OL', 'H-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'H-ER', 'O-ER', 'H-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'H-PQ', 'O-PQ', 'O-PQ', 'O-PQ', 'O-PQ', 'O-PQ', 'O-PQ', 'O-TS']
[12, 14, 15, 16, 17, 18, 19, 20, 21, 22]


[12, 14, 15, 16, 17, 18, 19, 20, 21, 22]
Parts-of-speech displayed in 3 seconds


In [None]:
# A bare raise with no active exception fails with a RuntimeError.
# NOTE(review): presumably a deliberate barrier so that "Run All" stops before
# the hand-labeling cells below -- confirm
raise

In [12]:

# Hand-label a run of child strings in the graph database: the flag named by
# column_name is set to true and every other section flag is set to false
basic_quals_dict = nu.load_object('basic_quals_dict'); column_name = 'is_task_scope'

# Every section flag this cell writes on a NavigableParents node
section_flags_list = [
    'is_job_title', 'is_corporate_scope', 'is_task_scope', 'is_educational_requirement',
    'is_minimum_qualification', 'is_preferred_qualification', 'is_supplemental_pay',
    'is_office_location', 'is_job_duration', 'is_interview_procedure',
    'is_legal_notification', 'is_other', 'is_posting_date'
]

# A misspelled column_name would otherwise silently set every flag to false
if column_name not in section_flags_list:
    raise ValueError(f'Unknown section flag {column_name!r}; expected one of {section_flags_list}')

# Build the query once; only the navigable parent changes between iterations
set_clause_str = ',\n                '.join(
    f'np.{flag_name} = {str(flag_name == column_name).lower()}' for flag_name in section_flags_list
)
set_flags_cypher_str = f'''
            MATCH (np:NavigableParents {{navigable_parent: $navigable_parent}})
            SET
                {set_clause_str}
            ''' + cu.return_everything_str + ';'

def do_cypher_tx(tx, navigable_parent, verbose=False):
    """Apply the section flags to one navigable parent and return one dict per record.

    The verbose parameter is accepted for call compatibility and is not used.
    """
    results_list = tx.run(query=set_flags_cypher_str, parameters={'navigable_parent': navigable_parent})
    return [dict(record.items()) for record in results_list]

for idx in range(4, 11):
    child_str = child_strs_list[idx]; pos_symbol = pos_list[idx]

    # Show the existing hand label, if there is one, then the string being labeled
    print(str(basic_quals_dict[child_str]) + '\n' if(child_str in basic_quals_dict) else '', end='')
    print(f'{idx} {pos_symbol}) {child_str}')
    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)

4 H-TS) <h3 class="jobSectionHeader">Task Scope:</h3>
5 O-TS) <li>Responsibilities: Develop and improve machine learning models through all stages, including design, training, validation, and implementation.</li>
6 O-TS) <li>Responsibilities: Analyze large-scale numerical and textual data to uncover insights and identify trends.</li>
7 O-TS) <li>Responsibilities: Collaborate with a cross-functional team of data engineers, data scientists, and data visualization experts to deliver impactful projects.</li>
8 H-TS) <li>Responsibilities: Research and evaluate new and emerging technologies.</li>
9 O-TS) <li>Responsibilities: Develop data science solutions utilizing various tools and cloud computing infrastructure.</li>
10 O-TS) <li>Responsibilities: Perform additional duties as assigned.</li>


In [41]:

# Show one child string with its parts-of-speech symbol and, if it has one, its hand label;
# idx must be a valid position in both child_strs_list and pos_list
idx = 32
print(indices_list)
child_str = child_strs_list[idx]
pos_symbol = pos_list[idx]
basic_quals_dict = nu.load_object('basic_quals_dict')

# Print the stored label on its own line only when this string has been labeled
print(str(basic_quals_dict[child_str]) + '\n' if(child_str in basic_quals_dict) else '', end='')
print(f'{idx} {pos_symbol}) {child_str}')

[10, 14, 15, 16, 17, 18, 19, 21, 22, 23]
32 O-PQ) <li>Qualifications: Experience working with utility asset information is desirable.</li>


In [42]:

# Hand-label the currently selected child string in the quals dictionary and save it
# (presumably 0 marks the string as one you do not qualify for -- confirm)
basic_quals_dict = nu.load_object('basic_quals_dict')
basic_quals_dict[child_str] = 0
nu.store_objects(basic_quals_dict=basic_quals_dict)

# Echo the stored label back as confirmation
print(f'"{child_str}" in basic_quals_dict: {basic_quals_dict[child_str]}')

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\basic_quals_dict.pkl
"<li>Qualifications: Experience working with utility asset information is desirable.</li>" in basic_quals_dict: 0


In [None]:

# Fix Headers: mark the listed child strings as headers in the graph database
basic_quals_dict = nu.load_object('basic_quals_dict')

def do_cypher_tx(tx, navigable_parent, verbose=False):
    """Set is_header to true on the matching NavigableParents node and return one dict per record."""
    cypher_str = f'''
            MATCH (np:NavigableParents {{navigable_parent: $navigable_parent}})
            SET
                np.is_header = true
            ''' + cu.return_everything_str + ';'
    results_list = tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})
    return [dict(record.items()) for record in results_list]

for idx in [18]:
    child_str = child_strs_list[idx]
    pos_symbol = pos_list[idx]

    # Show the existing hand label, if there is one, then the string being fixed
    print(str(basic_quals_dict[child_str]) + '\n' if(child_str in basic_quals_dict) else '', end='')
    print(f'{idx} {pos_symbol}) {child_str}')
    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)

In [13]:

# Fix Non-headers: mark the listed child strings as not being headers in the graph database
basic_quals_dict = nu.load_object('basic_quals_dict')

def do_cypher_tx(tx, navigable_parent, verbose=False):
    """Set is_header to false on the matching NavigableParents node and return one dict per record."""
    cypher_str = f'''
            MATCH (np:NavigableParents {{navigable_parent: $navigable_parent}})
            SET
                np.is_header = false
            ''' + cu.return_everything_str + ';'
    results_list = tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})
    return [dict(record.items()) for record in results_list]

for idx in range(5, 11):
    child_str = child_strs_list[idx]
    pos_symbol = pos_list[idx]

    # Show the existing hand label, if there is one, then the string being fixed
    print(str(basic_quals_dict[child_str]) + '\n' if(child_str in basic_quals_dict) else '', end='')
    print(f'{idx} {pos_symbol}) {child_str}')
    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)

5 O-TS) <li>Responsibilities: Develop and improve machine learning models through all stages, including design, training, validation, and implementation.</li>
6 O-TS) <li>Responsibilities: Analyze large-scale numerical and textual data to uncover insights and identify trends.</li>
7 O-TS) <li>Responsibilities: Collaborate with a cross-functional team of data engineers, data scientists, and data visualization experts to deliver impactful projects.</li>
8 H-TS) <li>Responsibilities: Research and evaluate new and emerging technologies.</li>
9 O-TS) <li>Responsibilities: Develop data science solutions utilizing various tools and cloud computing infrastructure.</li>
10 O-TS) <li>Responsibilities: Perform additional duties as assigned.</li>



----
## Maintenance

In [14]:

# List the open postings with a percent fit of at least 0.8 that have not been
# applied to yet, and print Cypher templates for closing / marking them as applied
import pyperclip

cypher_str = f'''
    // Get job application links
    MATCH (fn:FileNames)
    WHERE
        fn.percent_fit >= 0.8 AND
        ((fn.is_closed IS NULL) OR (fn.is_closed = false)) AND
        ((fn.is_opportunity_application_emailed IS NULL) OR (fn.is_opportunity_application_emailed = false))
    RETURN
        fn.percent_fit AS percent_fit,
        fn.file_name AS filename,
        fn.posting_url AS posting_url
    ORDER BY fn.percent_fit DESC;'''

# The query itself goes to the clipboard so it can be pasted elsewhere
pyperclip.copy(cypher_str)
row_objs_list = []
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
if row_objs_list:
    df = DataFrame(row_objs_list)
    display(df)

    # Only the first file name is used: print the "close" template and then
    # the "application emailed" template for it, and stop
    for filename in df.filename:
        print(f"""
MATCH (fn:FileNames)
WHERE fn.file_name IN ["{filename}"]
SET fn.is_closed = true
RETURN fn;""")
        print(f"""
MATCH (fn:FileNames)
WHERE fn.file_name IN ["{filename}"]
SET fn.is_opportunity_application_emailed = true, fn.opportunity_application_email_date = date()
RETURN fn;""")
        break

# Announce completion aloud
speech_str = 'Job application cypher code copied to clipboard'
speech_engine.say(speech_str)
speech_engine.runAndWait()

In [24]:

# Break up overly-long O-RQs:
# Ensure you have already run the "Fix Parts-of-Speech and Quals for
# this posting" cells above or displayed the context of an
# individual child string above. Don't close the Notepad++ window
# until you have replaced the child string
def display_file_in_text_editor(file_name):
    """Open the named file from hau.SAVES_HTML_FOLDER in Notepad++.

    Parameters:
        file_name: name of the HTML file, joined onto hau.SAVES_HTML_FOLDER.
    """
    # Hard-coded Windows install location of the editor
    text_editor_path = r'C:\Program Files\Notepad++\notepad++.exe'
    # NOTE(review): other cells use cu.SAVES_HTML_FOLDER -- confirm hau.SAVES_HTML_FOLDER is the same folder
    file_path = osp.abspath(osp.join(hau.SAVES_HTML_FOLDER, file_name))
    # IPython shell escape; both paths are quoted so spaces do not split the arguments
    !"{text_editor_path}" "{file_path}"
display_file_in_text_editor(file_name)
# Rebuild the file name's node through cu once the editor step has returned
cu.rebuild_filename_node(file_name, navigable_parent=None, verbose=True)
speech_str = 'File name node rebuild completed'; speech_engine.say(speech_str); speech_engine.runAndWait()


                MATCH
                    (np:NavigableParents {navigable_parent: "<li>Preferred: Knowledge of Machine Learning Operations (MLOps) and CI/CD tools for automating the build, test, and deployment of models in production environments.</li>"}),
                    (ht:HeaderTags {header_tag: "li"})
                MERGE (ht)-[r:SUMMARIZES]->(np);



----