In [1]:

%pprint
%matplotlib inline
import sys
import os.path as osp

# Work out the interpreter's Scripts folder, the sibling ../py folder and the ffmpeg binaries folder
executable_path = sys.executable; scripts_folder = osp.join(osp.dirname(executable_path), 'Scripts')
py_folder = osp.abspath('../py'); ffmpeg_folder = r'C:\ffmpeg\bin'
# Insert each folder at index 1 of the module search path, skipping any that are already there
# so that re-running this cell adds no duplicates
if (scripts_folder not in sys.path): sys.path.insert(1, scripts_folder)
if (py_folder not in sys.path): sys.path.insert(1, py_folder)
# NOTE(review): sys.path only affects Python imports; if the ffmpeg executables are meant to be
# found by subprocesses, os.environ['PATH'] is the usual place for this folder — confirm intent
if (ffmpeg_folder not in sys.path): sys.path.insert(1, ffmpeg_folder)
from jobpostlib import (crf, cu, datetime, duration, hau, hc, humanize, ihu, lru, nu, osp, scrfcu, slrcu, ssgdcu, su, t0, time, wsu, speech_engine)
from pandas import DataFrame
import re
import pyperclip
import ipywidgets as widgets
from IPython.display import display

Pretty printing has been turned OFF
Utility libraries created in 18 seconds


In [2]:

# Make sure slrcu has its parts-of-speech logistic regression elements, building them only when absent
# (the build normally takes 1 hour, 55 minutes and 45 seconds)
t1 = time.time()
if not hasattr(slrcu, 'pos_predict_percent_fit_dict'):
    slrcu.build_pos_logistic_regression_elements(
        sampling_strategy_limit=None, verbose=True
    )

# Report whether the attribute the build is keyed on is present
print(
    'predict_single is available'
    if hasattr(slrcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)

# Humanize and print the elapsed time for this cell
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
speech_str = f'Parts-of-speech logistic regression elements built in {duration_str}'
print(speech_str)

I have 48,129 labeled parts of speech in here


Train the POS Classifiers: 100%|█████████████████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 265.81it/s]

predict_single is available
Parts-of-speech logistic regression elements built in 9 seconds





In [3]:

# Make sure scrfcu has its parts-of-speech conditional random field elements, building them only when absent
# (the build normally takes 29 minutes and 57 seconds)
t1 = time.time()
if not hasattr(scrfcu, 'pos_symbol_crf'):
    scrfcu.build_pos_conditional_random_field_elements(verbose=True)

# NOTE(review): the build above is guarded by pos_symbol_crf while this report checks
# pos_predict_percent_fit_dict — confirm that both attributes are set by the same build
print(
    'predict_single is now available'
    if hasattr(scrfcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)

# Humanize and print the elapsed time for this cell
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
speech_str = f'Parts-of-speech conditional random field elements built in {duration_str}'
print(speech_str)

predict_single is now available
Parts-of-speech conditional random field elements built in 2 seconds


In [4]:

# Make sure ssgdcu has its parts-of-speech stochastic gradient descent elements, building them only when absent
t1 = time.time()
if not hasattr(ssgdcu, 'pos_predict_percent_fit_dict'):
    ssgdcu.build_pos_stochastic_gradient_descent_elements(
        sampling_strategy_limit=None, verbose=True
    )

# Report whether the attribute the build is keyed on is present
print(
    'predict_single is now available'
    if hasattr(ssgdcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)

# Humanize and print the elapsed time for this cell
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
speech_str = f'Parts-of-speech stochastic gradient descent elements built in {duration_str}'
print(speech_str)

I have 48,129 labeled parts of speech in here
predict_single is now available
Parts-of-speech stochastic gradient descent elements built in 17 seconds


In [5]:

# Make sure crf has its parts-of-speech classifier, building it only when the CRF attribute is absent
# (the classifier is normally trained in 15 hours, 42 minutes and 41 seconds)
t1 = time.time()
if not hasattr(crf, 'CRF'):
    crf.build_pos_conditional_random_field_elements(verbose=True)

# Report whether the CRF attribute is present
print(
    'predict_single is now available'
    if hasattr(crf, 'CRF')
    else 'predict_single is not available'
)

# Humanize and print the elapsed time for this cell
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
speech_str = f'POS classifier trained in {duration_str}'
print(speech_str)

predict_single is now available
POS classifier trained in 0 seconds


In [6]:

# Make sure lru has both its ISQUALIFIED_LR and ISQUALIFIED_CV attributes, building when either is absent
t1 = time.time()
if not hasattr(lru, 'ISQUALIFIED_LR') or not hasattr(lru, 'ISQUALIFIED_CV'):
    lru.build_isqualified_logistic_regression_elements(
        sampling_strategy_limit=None, verbose=True
    )

# Humanize and print the elapsed time for this cell
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
speech_str = f'Is-qualified LR elements built in {duration_str}'
print(speech_str)

I have 532,546 is-qualified vocabulary tokens in here
Is-qualified LR elements built in 12 seconds


In [7]:

# Train the is-header classifier (this cell rebuilds it unconditionally on every run)
t1 = time.time()
ihu.build_pos_stochastic_gradient_descent_elements(verbose=True)

# Humanize and print the elapsed time for this cell
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
speech_str = f'Is-header classifier trained in {duration_str}'
print(speech_str)

I have 51,990 hand-labeled header htmls prepared
7 iterations seen during training fit for a total of 51,990 records trained
Is-header classifier trained in 12 seconds


In [8]:

# Stamp the notebook with the time these setup cells were last run
speech_str = f'Last run on {datetime.now()}'
print(speech_str)

Last run on 2024-10-26 06:46:45.108291



---
# Training

In [79]:

# Re-run this cell whenever the qualification dictionary has been changed, below or in another notebook
t1 = time.time()

# Keep the total retraining time to less than two minutes by adjusting the sampling strategy limit
lru.sync_basic_quals_dict(sampling_strategy_limit=None, verbose=False)

lru.retrain_isqualified_classifier(verbose=True)

# Humanize and print the elapsed time for this cell
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'Is-qualified classifier retrained in {duration_str}'; print(speech_str)

I have 21,826 hand-labeled qualification strings in here
I have 605,026 is-qualified vocabulary tokens in here
Is-qualified classifer retrained in 36 seconds



# Inference

In [80]:

t1 = time.time()

# Work through every unset %fit value, setting each one where possible and stopping for help otherwise
quals_list, file_name = lru.infer_from_hunting_dataframe(fitness_threshold=3/4, verbose=True)

# Print the humanized elapsed time, then announce it aloud
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
speech_str = f'Inference completed in {duration_str}'
print(speech_str)
speech_engine.say(speech_str)
speech_engine.runAndWait()

Basic Qualifications for 1395049 AI Data Architect INNOVATIONS UK LTD:
*quals_list[0] = "Experience with statistical modeling and data analysis" (1.0)
*quals_list[1] = "Experience with tabular data analysis using languages such as SQL, R, and/or Python" (1.0)
*quals_list[2] = "Proficient in data exploration techniques and tools such as Amazon Web Services (AWS)" (1.0)
*quals_list[3] = "2024-10-23" (0.9141)
*quals_list[4] = "Experience with data visualization libraries such as Plotly, Streamlit, and matplotlib." (1.0)
*quals_list[5] = "Strong Python programming fundamentals" (1.0)
quals_list[6] = "Good understanding of machine learning algorithms, tools and platforms" (1.0)
*quals_list[7] = "Great communication skills, able to explain model results to a non-technical audience" (1.0)
*quals_list[8] = "Experience with planning and execution" (0.0)
*quals_list[9] = "Bachelor's degree in Computer Science, Data Science or related field and over 16 years of relevant experience, Masters with 1

In [None]:
# NOTE(review): a bare raise with no active exception fails with a RuntimeError; presumably this is a
# deliberate stop so that "Run All" halts here before the manual-labeling cells below — confirm
raise

In [83]:

# Hand-label the unscored qualification string with the value 1 and persist the dictionary
qualification_str = quals_list[2]
print(qualification_str)
basic_quals_dict = nu.load_object('basic_quals_dict')

# Show the existing label, if there is one, before it is overwritten
if qualification_str in basic_quals_dict:
    print(basic_quals_dict[qualification_str])

basic_quals_dict[qualification_str] = 1
nu.store_objects(basic_quals_dict=basic_quals_dict)

Proficient in data exploration techniques and tools such as Amazon Web Services (AWS)
Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\basic_quals_dict.pkl



----
## Fix Parts-of-Speech and Quals for this posting

In [84]:

t1 = time.time()
file_path = osp.join(cu.SAVES_HTML_FOLDER, file_name)

# Only rebuild when the posting's HTML file exists; otherwise child_strs_list and
# pos_symbol_predictions_list keep whatever values an earlier run left in them
if osp.isfile(file_path):
    child_strs_list = hau.get_child_strs_from_file(file_name=file_name)
    cu.ensure_filename(file_name, verbose=False)
    cu.populate_from_child_strings(child_strs_list, file_name, verbose=False)
    print(file_name)

    # One prediction per child string, in the same order as child_strs_list
    assert hasattr(slrcu, 'pos_predict_percent_fit_dict'), 'slrcu.predict_single needs to be available'
    pos_symbol_predictions_list = list(map(slrcu.predict_single, child_strs_list))

# Humanize and print the elapsed time for this cell
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
speech_str = f'CRF and child strings list recreated in {duration_str}'
print(speech_str)

1395049_AI_Data_Architect_LEIDOS_INNOVATIONS_UK_LTD.html
CRF and child strings list recreated in 1 minute and 9 seconds


In [85]:

# Restart the timer so the reported duration covers this cell only, not the previous cell as well
t1 = time.time()

# Accumulate the parts-of-speech list one child string at a time via cu.append_parts_of_speech_list
db_pos_list = []
for navigable_parent in child_strs_list:
    db_pos_list = cu.append_parts_of_speech_list(navigable_parent, pos_list=db_pos_list)

# Display the basic-quals section; pos_list and indices_list are reused by the labeling cells below
pos_list, indices_list = su.visualize_basic_quals_section(pos_symbol_predictions_list, child_strs_list, db_pos_list=db_pos_list, verbose=True)

# Print the humanized elapsed time, then announce it aloud
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'Parts-of-speech displayed in {duration_str}'; print(speech_str); speech_engine.say(speech_str); speech_engine.runAndWait()

['O-IP', 'O-TS', 'O-TS', 'H-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'H-RQ', 'O-RQ', 'O-RQ', 'O-PQ', 'O-RQ', 'O-PQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-PQ', 'O-RQ', 'O-RQ', 'O-RQ', 'H-PQ', 'O-RQ', 'O-PQ', 'O-PQ', 'O-RQ', 'O-RQ', 'H-PQ', 'O-RQ', 'O-TS', 'H-SP', 'H-SP', 'O-SP']
[13, 14, 16, 18, 19, 20, 21, 22, 24, 25, 26, 28, 31, 32]


[13, 14, 16, 18, 19, 20, 21, 22, 24, 25, 26, 28, 31, 32]
Parts-of-speech displayed in 1 minute and 17 seconds


In [None]:
# NOTE(review): a bare raise with no active exception fails with a RuntimeError; presumably this is a
# deliberate stop so that "Run All" halts here before the hand-editing cells below — confirm
raise

In [65]:

# Flag the first three child strings with column_name as their only true is_* property.
# The two transaction functions are defined once, ahead of the loop, instead of being
# re-created (under one shared name) on every iteration.
basic_quals_dict = nu.load_object('basic_quals_dict'); column_name = 'is_job_title'


def fetch_np_tx(tx, navigable_parent, verbose=False):
    """Match the NavigableParents node for navigable_parent and return each result record as a dict.

    verbose is accepted for signature parity with the other transaction functions and is unused.
    """
    cypher_str = '''\n            MATCH (np:NavigableParents {navigable_parent: $navigable_parent})\n            ''' + cu.return_every_np_str + ';'
    results_list = tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})
    return [dict(record.items()) for record in results_list]


def do_cypher_tx(tx, navigable_parent, verbose=False):
    """Set every listed is_* property on the matched node and return each result record as a dict.

    Each property is written as true when its name equals the notebook-level column_name
    (read when the function is called) and as false otherwise. verbose is unused.
    """
    cypher_str = f'''
            MATCH (np:NavigableParents {{navigable_parent: $navigable_parent}})
            SET
                np.is_job_title = {str(column_name == 'is_job_title').lower()},
                np.is_corporate_scope = {str(column_name == 'is_corporate_scope').lower()},
                np.is_task_scope = {str(column_name == 'is_task_scope').lower()},
                np.is_minimum_qualification = {str(column_name == 'is_minimum_qualification').lower()},
                np.is_preferred_qualification = {str(column_name == 'is_preferred_qualification').lower()},
                np.is_supplemental_pay = {str(column_name == 'is_supplemental_pay').lower()},
                np.is_office_location = {str(column_name == 'is_office_location').lower()},
                np.is_job_duration = {str(column_name == 'is_job_duration').lower()},
                np.is_interview_procedure = {str(column_name == 'is_interview_procedure').lower()},
                np.is_legal_notification = {str(column_name == 'is_legal_notification').lower()},
                np.is_other = {str(column_name == 'is_other').lower()},
                np.is_posting_date = {str(column_name == 'is_posting_date').lower()}
            ''' + cu.return_every_np_str + ';'
    return [dict(record.items()) for record in tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})]


for idx in range(3):
    child_str = child_strs_list[idx]; pos_symbol = pos_list[idx]

    # NOTE(review): the rows fetched here are overwritten below without being read — confirm the lookup is still needed
    with cu.driver.session() as session: row_objs_list = session.write_transaction(fetch_np_tx, navigable_parent=child_str, verbose=False)

    # Show the existing quals label (if any) and the child string being flagged
    print(str(basic_quals_dict[child_str]) + '\n' if(child_str in basic_quals_dict) else '', end='')
    print(f'{idx} {pos_symbol}) {child_str}')

    with cu.driver.session() as session: row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)

36 H-PQ) <p>______________________________________________________________________________</p>
47 H-PQ) <p>______________________________________________________________________________</p>


In [81]:

# Show one child string together with its part-of-speech symbol and any existing quals label
idx = 35
print(indices_list)
child_str = child_strs_list[idx]
pos_symbol = pos_list[idx]
basic_quals_dict = nu.load_object('basic_quals_dict')

# The label line is printed only when the child string is already in the dictionary
if child_str in basic_quals_dict:
    print(basic_quals_dict[child_str])
print(f'{idx} {pos_symbol}) {child_str}')

[23, 24, 26, 27, 28, 29, 31]
35 O-SP) <p>In support of pay transparency and equity, the minimum and maximum full-time annual base salary for this role in New York is $160,000 - $185,000 at the time of posting, with the potential of an incentive or bonus. While this is our reasonable expectation this is not a guarantee of compensation or salary, actual compensation is influenced by a wide range of factors including but not limited to skill set, level of experience, education, certifications, responsibility, and geographic location. Candidates hired to work in other locations will be subject to the pay range associated with that location. We offer a variety of benefits including medical, dental, vision, disability insurance, 401(k), EAP, parental leave, discretionary time off, and company-paid holidays. The specific programs and options available will vary depending on the state, start date, and employment type. Our Talent Acquisition team will be happy to answer any questions you may ha

In [80]:

# Give the current child string the label 1 in the quals dictionary, persist it and echo the stored value
basic_quals_dict = nu.load_object('basic_quals_dict')
basic_quals_dict[child_str] = 1
nu.store_objects(basic_quals_dict=basic_quals_dict)
print(f'"{child_str}" in basic_quals_dict: {basic_quals_dict[child_str]}')

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\basic_quals_dict.pkl
"<li>Familiarity of ACR (Automatic Content Recognition) techniques and their applications in media is a plus.</li>" in basic_quals_dict: 1


In [56]:

# Fix Non-headers: clear is_header on a range of child strings and retrain the is-header classifier on each
basic_quals_dict = nu.load_object('basic_quals_dict')


def do_cypher_tx(tx, navigable_parent, verbose=False):
    """Set is_header to false on the matched NavigableParents node and return each result record as a dict.

    Defined once, ahead of the loop, rather than re-created on every iteration. verbose is unused.
    """
    cypher_str = '''
            MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
            SET np.is_header = false
            ''' + cu.return_every_np_str + ';'
    return [dict(record.items()) for record in tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})]


for idx in range(15, 27):
    child_str = child_strs_list[idx]; pos_symbol = pos_list[idx]

    # Show the existing quals label (if any) and the child string being fixed
    print(str(basic_quals_dict[child_str]) + '\n' if(child_str in basic_quals_dict) else '', end='')
    print(f'{idx} {pos_symbol}) {child_str}')

    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)

        # No rows means the MATCH found no node; say so and move on instead of
        # failing with an IndexError that would abandon the rest of the range
        if not row_objs_list:
            print(f'No NavigableParents node found for index {idx}; classifier not retrained')
            continue
        ihu.retrain_classifier(row_objs_list[0]['navigable_parent'], row_objs_list[0]['is_header'], verbose=False)

15 O-RQ) <p>U.S. Citizenship is required.</p>
16 O-RQ) <p>Bachelors’ Degree or equivalent experience and a minimum of 3-5 years of experience applying ML/AI in a business context.</p>
17 O-RQ) <p>Deep expertise in classical machine learning algorithms and their mathematical foundations.</p>
18 O-RQ) <p>Proven experience applying advanced NLP techniques and working with large language models (e.g., LLMs)</p>
19 O-RQ) <p>Strong programming skills in Python, with extensive use of libraries such as scikit-learn, Pandas, NLTK, and Hugging Face Transformers.</p>
20 O-RQ) <p>Demonstrated ability to develop novel ML/AI solutions that directly impact business outcomes.</p>
21 H-RQ) <p>Experience presenting complex technical concepts to senior executives and translating them into strategic business initiatives.</p>
22 O-RQ) <p>Ability to work in a time critical environment and prioritize workload effectively</p>
23 O-RQ) <p>Ability to interact professionally with a diverse group of executives, m

In [41]:

# Fix Headers: set is_header on selected child strings and retrain the is-header classifier on each
basic_quals_dict = nu.load_object('basic_quals_dict')


def do_cypher_tx(tx, navigable_parent, verbose=False):
    """Set is_header to true on the matched NavigableParents node and return each result record as a dict.

    Defined once, ahead of the loop, rather than re-created on every iteration. verbose is unused.
    """
    cypher_str = '''
            MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
            SET np.is_header = true
            ''' + cu.return_every_np_str + ';'
    return [dict(record.items()) for record in tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})]


for idx in [8, 11, 15, 18, 24, 37, 48]:
    child_str = child_strs_list[idx]; pos_symbol = pos_list[idx]

    # Show the existing quals label (if any) and the child string being fixed
    print(str(basic_quals_dict[child_str]) + '\n' if(child_str in basic_quals_dict) else '', end='')
    print(f'{idx} {pos_symbol}) {child_str}')

    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)

        # No rows means the MATCH found no node; say so and move on instead of
        # failing with an IndexError that would abandon the rest of the list
        if not row_objs_list:
            print(f'No NavigableParents node found for index {idx}; classifier not retrained')
            continue
        ihu.retrain_classifier(row_objs_list[0]['navigable_parent'], row_objs_list[0]['is_header'], verbose=False)

8 H-PQ) <strong>Not your usual app</strong>
11 O-CS) <strong>What makes our ride unique?</strong>
15 O-CS) We have a vision:
18 O-TS) <strong>THE JOURNEY</strong>
24 H-RQ) <strong>WHAT YOU WILL BRING TO THE RIDE</strong>
37 O-CS) <strong>We believe driven talent deserves:</strong>
48 H-TS) <strong>So, ready to take the wheel and make this the ride of your life?</strong>



----
## Maintenance

In [52]:

# Display cypher necessary to apply for all the jobs you qualify for that you haven't applied for
# (the f-string below interpolates nothing; the query text is fixed)
cypher_str = f'''
    // Get job application links for jobs you should apply to
    MATCH (fn:FileNames)
    WHERE
        fn.percent_fit >= 0.8 AND
        ((fn.is_closed IS NULL) OR (fn.is_closed = false)) AND
        ((fn.is_opportunity_application_emailed IS NULL) OR (fn.is_opportunity_application_emailed = false))
    RETURN
        fn.percent_fit AS percent_fit,
        fn.file_name AS filename,
        fn.posting_url AS posting_url
    ORDER BY fn.percent_fit DESC;'''
# Put the query itself on the clipboard before running it
pyperclip.copy(cypher_str)
row_objs_list = []
# NOTE(review): the query only reads, yet it is run through write_transaction — confirm whether a read transaction was intended
with cu.driver.session() as session: row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
if row_objs_list:
    df = DataFrame(row_objs_list)
    display(df)
    # Print a close-this-posting statement for the first file name only (the loop breaks after one pass)
    for filename in df.filename:
        print(f"""
MATCH (fn:FileNames)
WHERE fn.file_name IN ["{filename}"]
SET fn.is_closed = true
RETURN fn;""")
        break
    # Likewise, print a mark-as-applied statement for the first file name only
    for filename in df.filename:
        print(f"""
MATCH (fn:FileNames)
WHERE fn.file_name IN ["{filename}"]
SET fn.is_opportunity_application_emailed = true, fn.opportunity_application_email_date = date()
RETURN fn;""")
        break
# Announce completion aloud; what was copied to the clipboard above is the lookup query, not the printed statements
speech_str = 'Job application cypher code copied to clipboard'; speech_engine.say(speech_str); speech_engine.runAndWait()

In [57]:

# Break up overly-long O-RQs:
# Ensure you have already run the "Fix Parts-of-Speech and Quals for 
# this posting" cells above or displayed the context of an 
# individual child string above. Don't close the Notepad++ window 
# until you have replaced the child string
# file_name = '476d6e5bd50da5a6_Data_Scientist_Remote_Indeed_com.html'
text_editor_path = r'C:\Program Files\Notepad++\notepad++.exe'
file_path = osp.abspath(osp.join(hau.SAVES_HTML_FOLDER, file_name))
wsu.clean_job_posting(file_path)

# Best effort: wrap each run of consecutive <li> lines in a <ul> element and copy the result to
# the clipboard. A failure here (for example child_strs_list not being defined yet) must not stop
# the editor from opening, so it is reported rather than raised. Catching Exception instead of
# using a bare except lets KeyboardInterrupt and SystemExit propagate.
try:
    # count and flags are passed by keyword; passing them positionally is deprecated as of Python 3.13
    wrapped_html_str = re.sub(
        "((?:<li>([^><]+)</li>\n)+)", "<ul>\n\\1</ul>\n", '\n'.join(child_strs_list),
        count=0, flags=re.MULTILINE
    )
    pyperclip.copy(wrapped_html_str)
except Exception as e:
    print(f'Unable to copy the child strings to the clipboard: {e}')
!"{text_editor_path}" "{file_path}"

In [58]:

# Rebuild the file name node for this posting, then announce completion aloud
cu.rebuild_filename_node(file_name, navigable_parent=None, verbose=True)
speech_str = f'{su.get_job_title_from_file_name(file_name)} node rebuild completed'
speech_engine.say(speech_str)
speech_engine.runAndWait()


                MATCH
                    (np:NavigableParents {navigable_parent: "<p>CACI is an Equal Opportunity/Affirmative Action Employer. All qualified applicants will receive consideration for employment without regard to race, color, religion, sex, pregnancy, sexual orientation, gender identity, age, national origin, disability, status as a protected veteran, or any other protected characteristic.</p>"}),
                    (ht:HeaderTags {header_tag: "p"})
                MERGE (ht)-[r:SUMMARIZES]->(np);



----

In [None]:

# Show what the database currently holds for this html string
def do_cypher_tx(tx, navigable_parent, verbose=False):
    """Match the NavigableParents node for navigable_parent and return each result record as a dict.

    verbose is accepted but unused.
    """
    cypher_str = '''
        MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
        ''' + cu.return_every_np_str + ';'
    return [
        dict(record.items())
        for record in tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})
    ]


with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)
row_objs_list

In [None]:

# Drop this particular qualification string from the quals dictionary, persist it and confirm the removal
qualification_str = quals_list[25]
basic_quals_dict = nu.load_object('basic_quals_dict')

# Deleting only when present mirrors pop(key, None): a missing key is not an error
if qualification_str in basic_quals_dict:
    del basic_quals_dict[qualification_str]

nu.store_objects(basic_quals_dict=basic_quals_dict)
print(f'"{qualification_str}" in basic_quals_dict: {qualification_str in basic_quals_dict}')

In [None]:

# Delete this particular qualification string's node from the database
def do_cypher_tx(tx, qualification_str, verbose=False):
    """Detach-delete the QualificationStrings node for qualification_str.

    Returns each result record as a dict; the query has no RETURN clause.
    verbose is accepted but unused.
    """
    cypher_str = '''
        MATCH (qs:QualificationStrings {qualification_str: $qualification_str})
        DETACH DELETE qs;
        '''
    return [
        dict(record.items())
        for record in tx.run(query=cypher_str, parameters={'qualification_str': qualification_str})
    ]


with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, qualification_str=qualification_str, verbose=False)

In [None]:

# Hand-set every feature flag on the current child string's node, then retrain the is-header classifier
def do_cypher_tx(tx, navigable_parent, verbose=False):
    """Write the hard-coded flag values below onto the matched node and return each result record as a dict.

    Edit the true/false literals in the SET clause before running. verbose is accepted but unused.
    """
    cypher_str = '''
        MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
        SET
            np.is_header = true,
            np.is_task_scope = false,
            np.is_minimum_qualification = false,
            np.is_preferred_qualification = false,
            np.is_educational_requirement = true,
            np.is_legal_notification = false,
            np.is_other = false,
            np.is_corporate_scope = false,
            np.is_job_title = false,
            np.is_office_location = false,
            np.is_job_duration = false,
            np.is_supplemental_pay = false,
            np.is_interview_procedure = false,
            np.is_posting_date = false
        ''' + cu.return_every_np_str + ';'
    results_list = tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})
    return [dict(record.items()) for record in results_list]


with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)

# Retrain on the first returned row, then display all returned rows as the cell's output
ihu.retrain_classifier(row_objs_list[0]['navigable_parent'], row_objs_list[0]['is_header'], verbose=False)
row_objs_list

In [None]:

# Remove file name from database
# NOTE(review): the list holds an empty-string placeholder, so as written this calls
# cu.delete_filename_node(''); the loop also rebinds the notebook-level file_name that
# other cells read — fill in the real file names before running
for file_name in ['']:
    cu.delete_filename_node(file_name, verbose=True)