In [None]:

%pprint
%matplotlib inline
import sys
import os.path as osp

executable_path = sys.executable; scripts_folder = osp.join(osp.dirname(executable_path), 'Scripts')
py_folder = osp.abspath('../py'); ffmpeg_folder = r'C:\ffmpeg\bin'
if (scripts_folder not in sys.path): sys.path.insert(1, scripts_folder)
if (py_folder not in sys.path): sys.path.insert(1, py_folder)
if (ffmpeg_folder not in sys.path): sys.path.insert(1, ffmpeg_folder)
from jobpostlib import (crf, cu, datetime, duration, hau, hc, humanize, ihu, lru, nu, osp, scrfcu, slrcu, ssgdcu, su, t0, time, wsu, speech_engine)
from pandas import DataFrame
import re
import pyperclip
import ipywidgets as widgets
from IPython.display import display

In [2]:

# Check if the slrcu has built its parts-of-speech logistic regression elements
# Parts-of-speech logistic regression elements is normally built in 1 hour, 55 minutes and 45 seconds
t1 = time.time()
if not hasattr(slrcu, 'pos_predict_percent_fit_dict'):
    slrcu.build_pos_logistic_regression_elements(sampling_strategy_limit=None, verbose=True)
if hasattr(slrcu, 'pos_predict_percent_fit_dict'): print('predict_single is available')
else: print('predict_single is not available')
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'Parts-of-speech logistic regression elements built in {duration_str}'; print(speech_str)

I have 48,129 labeled parts of speech in here


Train the POS Classifiers: 100%|█████████████████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 302.98it/s]

predict_single is available
Parts-of-speech logistic regression elements built in 8 seconds





In [3]:

# Check if the scrfcu has built its parts-of-speech conditional random field elements
# Parts-of-speech CRF elements normally built in 29 minutes and 57 seconds
t1 = time.time()
if not hasattr(scrfcu, 'pos_symbol_crf'):
    scrfcu.build_pos_conditional_random_field_elements(verbose=True)
if hasattr(scrfcu, 'pos_predict_percent_fit_dict'): print('predict_single is now available')
else: print('predict_single is not available')
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'Parts-of-speech conditional random field elements built in {duration_str}'; print(speech_str)

predict_single is now available
Parts-of-speech conditional random field elements built in 1 second


In [4]:

# Check if the ssgdcu has built its parts-of-speech stochastic gradient decent elements
t1 = time.time()
if not hasattr(ssgdcu, 'pos_predict_percent_fit_dict'):
    ssgdcu.build_pos_stochastic_gradient_descent_elements(sampling_strategy_limit=None, verbose=True)
if hasattr(ssgdcu, 'pos_predict_percent_fit_dict'): print('predict_single is now available')
else: print('predict_single is not available')
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'Parts-of-speech stochastic gradient descent elements built in {duration_str}'; print(speech_str)

I have 48,129 labeled parts of speech in here
predict_single is now available
Parts-of-speech stochastic gradient descent elements built in 11 seconds


In [5]:

# Check if the crf has built its parts-of-speech classifier
# POS classifier normally trained in 15 hours, 42 minutes and 41 seconds
t1 = time.time()
if not hasattr(crf, 'CRF'): crf.build_pos_conditional_random_field_elements(verbose=True)
if hasattr(crf, 'CRF'): print('predict_single is now available')
else: print('predict_single is not available')
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'POS classifier trained in {duration_str}'; print(speech_str)

predict_single is now available
POS classifier trained in 0 seconds


In [6]:

# Check if the Logistic Regression utilities class has built its is-qualified classifier
t1 = time.time()
if not (hasattr(lru, 'ISQUALIFIED_LR') and hasattr(lru, 'ISQUALIFIED_CV')):
    lru.build_isqualified_logistic_regression_elements(sampling_strategy_limit=None, verbose=True)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'Is-qualified LR elements built in {duration_str}'; print(speech_str)

I have 532,546 is-qualified vocabulary tokens in here
Is-qualified LR elements built in 9 seconds


In [7]:

# Train the isheader classifier
t1 = time.time()
ihu.build_pos_stochastic_gradient_descent_elements(verbose=True)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'Is-header classifier trained in {duration_str}'; print(speech_str)

I have 52,199 hand-labeled header htmls prepared
7 iterations seen during training fit for a total of 52,199 records trained
Is-header classifier trained in 10 seconds


In [8]:

speech_str = f'Last run on {datetime.now()}'; print(speech_str)

Last run on 2024-11-02 15:43:39.899685



---
# Training

In [20]:

# You need to run this again if you changed the qualification dictionary below or in another notebook
t1 = time.time()

# Keep the total retraining time to less than two minutes by adjusting the sampling strategy limit
lru.sync_basic_quals_dict(sampling_strategy_limit=None, verbose=False)

lru.retrain_isqualified_classifier(verbose=True)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'Is-qualified classifer retrained in {duration_str}'; print(speech_str)

I have 22,617 hand-labeled qualification strings in here
I have 624,842 is-qualified vocabulary tokens in here
Is-qualified classifer retrained in 25 seconds



# Inference

In [21]:

t1 = time.time()

# Loop through all the unset %fit values, set them if you can, break for help if you can't
quals_list, file_name = lru.infer_from_hunting_dataframe(fitness_threshold=3/4, verbose=True)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'Inference completed in {duration_str}'; print(speech_str); speech_engine.say(speech_str); speech_engine.runAndWait()

Basic Qualifications for Customs and Border Protection Officer United States:
*quals_list[0] = "Education Substitution: A bachelor's degree or successful completion of a full four (4)-year course of study in any field leading to a bachelor's degree from an accredited college or university; OR will receive a bachelor's degree from an accredited college or university within nine (9) months from the closing date of this announcement and will upload all official or unofficial transcripts before the closing date of this announcement; OR" (1.0)
*quals_list[1] = "Successful completion of the Academy is required for this position" (1.0)
*quals_list[2] = "This will be calculated using your resume and official or unofficial college transcripts submitted with your application" (1.0)
*quals_list[3] = "Experience: A minimum of three (3) years full-time general work experience that demonstrates the ability to meet and deal with people and the ability to learn and be able to apply what you have learn

In [None]:

d = time.time() - t0; duration_str = humanize.precisedelta(d, minimum_unit='minutes', format='%0.0f'); mpp = (d//60)/lru.max_togo if lru.max_togo else "N/A"
raise Exception(f'Postings from the previous email ingestion processed in {duration_str} taking about {mpp} minutes per post')

In [32]:

# Manually label the unscored qualification string
qualification_str = quals_list[1]
print(qualification_str); basic_quals_dict = nu.load_object('basic_quals_dict')
print(str(basic_quals_dict[qualification_str]) + '\n' if(qualification_str in basic_quals_dict) else '', end='')
basic_quals_dict[qualification_str] = 0
nu.store_objects(basic_quals_dict=basic_quals_dict)

United States: Relocate before starting work (Required)
Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\basic_quals_dict.pkl



----
## Fix Parts-of-Speech and Quals for this posting

In [33]:

t1 = time.time()
file_path = osp.join(cu.SAVES_HTML_FOLDER, file_name)
if osp.isfile(file_path):
    child_strs_list = hau.get_child_strs_from_file(file_name=file_name)
    cu.ensure_filename(file_name, verbose=False)
    cu.populate_from_child_strings(child_strs_list, file_name, verbose=False)
    print(file_name)
    assert hasattr(slrcu, 'pos_predict_percent_fit_dict'), 'slrcu.predict_single needs to be available'
    pos_symbol_predictions_list = [slrcu.predict_single(sent_str) for sent_str in child_strs_list]
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'CRF and child strings list recreated in {duration_str}'; print(speech_str)

66e7ffd59939f6f7_Customs_and_Border_Protection_Officer_United_States_Indeed_com.html
CRF and child strings list recreated in 1 minute and 22 seconds


In [34]:

db_pos_list = []
for navigable_parent in child_strs_list:
    db_pos_list = cu.append_parts_of_speech_list(navigable_parent, pos_list=db_pos_list)
pos_list, indices_list = su.visualize_basic_quals_section(pos_symbol_predictions_list, child_strs_list, db_pos_list=db_pos_list, verbose=True)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'Parts-of-speech displayed in {duration_str}'; print(speech_str); speech_engine.say(speech_str); speech_engine.runAndWait()

['H-CS', 'O-SP', 'O-CS', 'O-TS', 'O-LN', 'H-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-SP', 'O-OL', 'O-SP', 'O-SP', 'O-SP', 'O-SP', 'O-IP', 'O-SP', 'O-TS', 'O-TS', 'O-TS', 'O-SP', 'O-SP', 'O-IP', 'O-IP', 'O-SP', 'H-OL', 'O-SP', 'H-RQ', 'H-RQ', 'O-RQ', 'O-RQ', 'O-IP', 'O-IP', 'H-RQ', 'O-RQ', 'O-RQ', 'O-LN', 'O-IP', 'O-LN', 'H-IP', 'H-RQ', 'O-IP', 'O-LN', 'O-LN', 'O-RQ', 'O-LN', 'O-IP', 'O-IP', 'O-IP', 'O-TS', 'O-LN', 'O-IP', 'O-IP', 'H-IP', 'O-TS', 'H-TS', 'O-TS', 'O-TS', 'O-IP', 'O-IP', 'O-OL', 'O-JD', 'O-SP', 'H-SP', 'O-SP', 'O-SP', 'O-SP', 'O-SP', 'O-SP', 'O-SP', 'H-JD', 'O-JD', 'H-RQ', 'O-PQ', 'O-PQ', 'H-RQ', 'O-RQ', 'O-OL']
[31, 32, 36, 37, 46, 78]


[31, 32, 36, 37, 46, 78]
Parts-of-speech displayed in 1 minute and 33 seconds


In [None]:
raise

In [37]:

basic_quals_dict = nu.load_object('basic_quals_dict'); column_name = 'is_interview_procedure'
for idx in list(range(39, 42)):
    child_str = child_strs_list[idx]; pos_symbol = pos_list[idx]
    def do_cypher_tx(tx, navigable_parent, verbose=False):
        cypher_str = '''\n            MATCH (np:NavigableParents {navigable_parent: $navigable_parent})\n            ''' + cu.return_every_np_str + ';'
        results_list = tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})
        return [dict(record.items()) for record in results_list]
    with cu.driver.session() as session: row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)
    print(str(basic_quals_dict[child_str]) + '\n' if(child_str in basic_quals_dict) else '', end='')
    print(f'{idx} {pos_symbol}) {child_str}')
    def do_cypher_tx(tx, navigable_parent, verbose=False):
        cypher_str = f'''
            MATCH (np:NavigableParents {{navigable_parent: $navigable_parent}})
            SET
                np.is_job_title = {str(column_name == 'is_job_title').lower()},
                np.is_corporate_scope = {str(column_name == 'is_corporate_scope').lower()},
                np.is_task_scope = {str(column_name == 'is_task_scope').lower()},
                np.is_minimum_qualification = {str(column_name == 'is_minimum_qualification').lower()},
                np.is_preferred_qualification = {str(column_name == 'is_preferred_qualification').lower()},
                np.is_supplemental_pay = {str(column_name == 'is_supplemental_pay').lower()},
                np.is_office_location = {str(column_name == 'is_office_location').lower()},
                np.is_job_duration = {str(column_name == 'is_job_duration').lower()},
                np.is_interview_procedure = {str(column_name == 'is_interview_procedure').lower()},
                np.is_legal_notification = {str(column_name == 'is_legal_notification').lower()},
                np.is_other = {str(column_name == 'is_other').lower()},
                np.is_posting_date = {str(column_name == 'is_posting_date').lower()}
            ''' + cu.return_every_np_str + ';'
        return [dict(record.items()) for record in tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})]
    with cu.driver.session() as session: row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)

39 O-IP) <li>This will be calculated using your resume and official or unofficial transcripts submitted with your application.</li>
40 O-LN) <li>If you have previous or current law enforcement or military law enforcement experience, you may qualify at the GS-9 grade level.</li>
41 H-IP) <li>Refer to How to Apply section below for links to the GS-9 Job Opening Announcements at USAJOBS.</li>


In [30]:

# Display the context of an individual child string
idx = 31
print(indices_list); child_str = child_strs_list[idx]; pos_symbol = pos_list[idx]; basic_quals_dict = nu.load_object('basic_quals_dict')
print(str(basic_quals_dict[child_str]) + '\n' if(child_str in basic_quals_dict) else '', end='')
print(f'{idx} {pos_symbol}) {child_str}')

[13]
31 H-OL) <h3 class="jobSectionHeader" pos="H-OL">Office Location:</h3>


In [21]:

# Hand-label this particular child string in the quals dictionary
basic_quals_dict = nu.load_object('basic_quals_dict')
basic_quals_dict[child_str] = 0
nu.store_objects(basic_quals_dict=basic_quals_dict); print(f'"{child_str}" in basic_quals_dict: {basic_quals_dict[child_str]}')

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\basic_quals_dict.pkl
"<li>Bachelor's degree in computer science, Computer Engineering, or a related field is preferred.</li>" in basic_quals_dict: 0


In [16]:

# Fix Non-headers
basic_quals_dict = nu.load_object('basic_quals_dict')
for idx in range(30, 33):
    child_str = child_strs_list[idx]; pos_symbol = pos_list[idx]
    print(str(basic_quals_dict[child_str]) + '\n' if(child_str in basic_quals_dict) else '', end='')
    print(f'{idx} {pos_symbol}) {child_str}')
    def do_cypher_tx(tx, navigable_parent, verbose=False):
        cypher_str = f'''
            MATCH (np:NavigableParents {{navigable_parent: $navigable_parent}})
            SET np.is_header = false
            ''' + cu.return_every_np_str + ';'
        return [dict(record.items()) for record in tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})]
    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)
        ihu.retrain_classifier(row_objs_list[0]['navigable_parent'], row_objs_list[0]['is_header'], verbose=False)

30 H-RQ) <p>Experience: A minimum of one (1) year of specialized full-time work experience equivalent to at least the next lower grade level that entails performance of duties in inspections work at borders, seaports, airports, or other ports of entry and/or work involving preliminary screening of persons for entry and immigration status, or compliance/regulatory work; OR</p>
31 O-LN) <p>Education Substitution: A bachelor's degree with Superior Academic Achievement based on (1) class standing, (2) grade-point average (3.0 or higher), or (3) honor society membership; OR one (1) full year of graduate level education in a field of study related to law enforcement (e.g., criminology, criminal justice, law enforcement, courts and judicial systems, corrections and rehabilitation, justice studies, homeland security, forensic technology and forensic psychology) from an accredited college or university; OR will receive a bachelor's degree with Superior Academic Achievement or one (1) full year 

In [65]:

# Fix Headers
basic_quals_dict = nu.load_object('basic_quals_dict')
for idx in [1, 3, 12, 22, 38, 40, 43]:
    child_str = child_strs_list[idx]; pos_symbol = pos_list[idx]
    print(str(basic_quals_dict[child_str]) + '\n' if(child_str in basic_quals_dict) else '', end='')
    print(f'{idx} {pos_symbol}) {child_str}')
    def do_cypher_tx(tx, navigable_parent, verbose=False):
        cypher_str = f'''
            MATCH (np:NavigableParents {{navigable_parent: $navigable_parent}})
            SET np.is_header = true
            ''' + cu.return_every_np_str + ';'
        return [dict(record.items()) for record in tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})]
    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)
        ihu.retrain_classifier(row_objs_list[0]['navigable_parent'], row_objs_list[0]['is_header'], verbose=False)

1 H-CS) <b>About Our Opportunity:</b>
3 H-TS) <b>Things You’ll Do Here:</b>
12 O-RQ) <b>What You’ll Bring to the Team:</b>
22 H-PQ) <h3 class="jobSectionHeader" pos="H-PQ">Preferred Qualifications:</h3>
38 H-TS) <b>Where You’ll Work:</b>
40 H-SP) <b>Why You Will Love Working Here</b>
43 H-SP) <b>Perks and Benefits</b>



----
## Maintenance

In [172]:

# Display cypher necessary to apply for all the jobs you qualify for that you haven't applied for
pyperclip.copy(cu.get_job_application_links)
row_objs_list = []
with cu.driver.session() as session: row_objs_list = session.write_transaction(cu.do_cypher_tx, cu.get_job_application_links)
if row_objs_list:
    df = DataFrame(row_objs_list)
    display(df)
    for filename in df.filename:
        print(f"""
MATCH (fn:FileNames)
WHERE fn.file_name IN ["{filename}"]
SET fn.is_closed = true
RETURN fn;""")
        break
    for filename in df.filename:
        print(f"""
MATCH (fn:FileNames)
WHERE fn.file_name IN ["{filename}"]
SET fn.is_opportunity_application_emailed = true, fn.opportunity_application_email_date = date()
RETURN fn;""")
        break
speech_str = 'Job application cypher code copied to clipboard'; speech_engine.say(speech_str); speech_engine.runAndWait()

In [27]:

# Break up overly-long O-RQs:
# Ensure you have already run the "Fix Parts-of-Speech and Quals for 
# this posting" cells above or displayed the context of an 
# individual child string above. Don't close the Notepad++ window 
# until you have replaced the child string
# file_name = '476d6e5bd50da5a6_Data_Scientist_Remote_Indeed_com.html'
text_editor_path = r'C:\Program Files\Notepad++\notepad++.exe'
file_path = osp.abspath(osp.join(hau.SAVES_HTML_FOLDER, file_name))
wsu.clean_job_posting(file_path)
try: pyperclip.copy(re.sub("((?:<li>([^><]+)</li>\n)+)", "<ul>\n\\1</ul>\n", '\n'.join(child_strs_list), 0, re.MULTILINE))
except: pass
!"{text_editor_path}" "{file_path}"

In [None]:

cu.rebuild_filename_node(file_name, navigable_parent=None, verbose=True)
speech_str = f'{su.get_job_title_from_file_name(file_name)} node rebuild completed'; speech_engine.say(speech_str); speech_engine.runAndWait()


                MATCH
                    (np:NavigableParents {navigable_parent: "<h3 class="jobSectionHeader" pos="H-TS">Experience:</h3>"}),
                    (ht:HeaderTags {header_tag: "h3"})
                MERGE (ht)-[r:SUMMARIZES]->(np);



----

In [None]:

# Show what's in the database already for this html string
def do_cypher_tx(tx, navigable_parent, verbose=False):
    cypher_str = '''
        MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
        ''' + cu.return_every_np_str + ';'
    results_list = tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})
    return [dict(record.items()) for record in results_list]
with cu.driver.session() as session: row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)
row_objs_list

In [213]:

# Remove this particular qualification string from the quals dictionary
qualification_str = quals_list[2]
basic_quals_dict = nu.load_object('basic_quals_dict')
basic_quals_dict.pop(qualification_str, None)
nu.store_objects(basic_quals_dict=basic_quals_dict)
print(f'"{qualification_str}" in basic_quals_dict: {qualification_str in basic_quals_dict}')

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\basic_quals_dict.pkl
"Occasionally" in basic_quals_dict: False


In [214]:

# Remove this particular qualification string from the database
def do_cypher_tx(tx, qualification_str, verbose=False):
    cypher_str = '''
        MATCH (qs:QualificationStrings {qualification_str: $qualification_str})
        DETACH DELETE qs;
        '''
    results_list = tx.run(query=cypher_str, parameters={'qualification_str': qualification_str})

    return [dict(record.items()) for record in results_list]
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, qualification_str=qualification_str, verbose=False)

In [None]:

# Manually set each feature
def do_cypher_tx(tx, navigable_parent, verbose=False):
    cypher_str = '''
        MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
        SET
            np.is_header = true,
            np.is_task_scope = false,
            np.is_minimum_qualification = false,
            np.is_preferred_qualification = false,
            np.is_educational_requirement = true,
            np.is_legal_notification = false,
            np.is_other = false,
            np.is_corporate_scope = false,
            np.is_job_title = false,
            np.is_office_location = false,
            np.is_job_duration = false,
            np.is_supplemental_pay = false,
            np.is_interview_procedure = false,
            np.is_posting_date = false
        ''' + cu.return_every_np_str + ';'
    return [dict(record.items()) for record in tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})]
with cu.driver.session() as session: row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)
ihu.retrain_classifier(row_objs_list[0]['navigable_parent'], row_objs_list[0]['is_header'], verbose=False); row_objs_list

In [229]:

# Remove file name from database
for file_name in ['']:
    
    # Lose the node features and folder storage
    cu.delete_filename_node(
        file_name, remove_node=True, remove_file=True, verbose=True
    )


                    MATCH (fn:FileNames {file_name: ""})
                    DETACH DELETE fn;
C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\html\ removed
