In [1]:

%pprint
%matplotlib inline
import sys
import os.path as osp

executable_path = sys.executable; scripts_folder = osp.join(osp.dirname(executable_path), 'Scripts')
py_folder = osp.abspath(osp.join('..', 'py')); ffmpeg_folder = r'C:\ffmpeg\bin'
if (scripts_folder not in sys.path): sys.path.insert(1, scripts_folder)
if (py_folder not in sys.path): sys.path.insert(1, py_folder)
if (ffmpeg_folder not in sys.path): sys.path.insert(1, ffmpeg_folder)
from jobpostlib import (crf, cu, datetime, duration, hau, hc, humanize, ihu, lru, nu, osp, scrfcu, slrcu, ssgdcu, su, t0, time, wsu, speech_engine)
from pandas import DataFrame
import re
import pyperclip
import ipywidgets as widgets
from IPython.display import display

Pretty printing has been turned OFF
Utility libraries created in 3 seconds


In [2]:

# Check if the Logistic Regression utilities class has built its is-qualified classifier
t1 = time.time()
if not (hasattr(lru, 'ISQUALIFIED_LR') and hasattr(lru, 'ISQUALIFIED_CV')):
    lru.build_isqualified_logistic_regression_elements(sampling_strategy_limit=None, verbose=True)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'Is-qualified LR model built in {duration_str}'; print(speech_str)

I have 532,546 is-qualified vocabulary tokens in here
Is-qualified LR model built in 11 seconds


In [3]:

# Check if the slrcu has built its parts-of-speech logistic regression model
# Parts-of-speech logistic regression model is normally built in 1 hour, 18 minutes, and 19 seconds
t1 = time.time()
if not hasattr(slrcu, 'pos_predict_percent_fit_dict'):
    slrcu.build_pos_logistic_regression_elements(sampling_strategy_limit=None, verbose=True)
if hasattr(slrcu, 'pos_predict_percent_fit_dict'): print('predict_single is available', file=sys.stderr)
else: print('predict_single is not available', file=sys.stderr)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'Parts-of-speech logistic regression model built in {duration_str}'; print(speech_str, file=sys.stderr)

I have 47,849 labeled parts of speech in here
Train the POS Classifiers: 100%|███████████████| 25/25 [00:00<00:00, 349.57it/s]
predict_single is available
Parts-of-speech logistic regression model built in 8 seconds


In [4]:

# Train the isheader classifier
t1 = time.time()
ihu.build_pos_stochastic_gradient_descent_elements(verbose=True)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'Is-header classifier trained in {duration_str}'; print(speech_str)

I have 54,088 hand-labeled header htmls prepared
7 iterations seen during training fit for a total of 54,088 records trained
Is-header classifier trained in 7 seconds



---
# Training

In [21]:

# You need to run this again if you changed the qualification dictionary below or in another notebook
t1 = time.time()

# Keep the total retraining time to less than two minutes by adjusting the sampling strategy limit
lru.sync_basic_quals_dict(sampling_strategy_limit=None, verbose=False)

lru.retrain_isqualified_classifier(verbose=True)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'Is-qualified classifer retrained in {duration_str}'; print(speech_str)

I have 24,606 hand-labeled qualification strings in here
I have 675,765 is-qualified vocabulary tokens in here
Is-qualified classifer retrained in 24 seconds



# Inference

In [22]:

t1 = time.time()

# Loop through all the unset %fit values, set them if you can, break for help if you can't
quals_list, file_name = lru.infer_from_hunting_dataframe(fitness_threshold=3/4, verbose=True)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'Inference completed in {duration_str}'; print(speech_str); speech_engine.say(speech_str); speech_engine.runAndWait()

Basic Qualifications for 1786632 Senior Data Scientist:
*quals_list[0] = "Stay updated with emerging trends in blockchain analytics and data science methodologies." (1.0)
*quals_list[1] = "5-7 years of strong background in building machine learning models and performing statistical analysis." (1.0)
*quals_list[2] = "Experience with big data platforms and query languages (eg, SQL, Spark)." (1.0)
*quals_list[3] = "Expertise in Python and data science libraries such as Pandas, NumPy, and Scikit-learn." (1.0)
100.00%

hunting_df.loc[6181, 'percent_fit'] = (000+000+000+000)/4
15 left to go: 6180/6195 = 99.76% completed (taking about 4.3 minutes per post)
Inference completed in 14 seconds


In [None]:

d = time.time() - t0; duration_str = humanize.precisedelta(d, minimum_unit='minutes', format='%0.0f'); mpp = (d//60)/lru.max_togo if lru.max_togo else "N/A"
raise Exception(f'Postings from the previous email ingestion processed in {duration_str} taking about {mpp} minutes per post')

In [20]:

# Manually label the unscored qualification string
qualification_str = quals_list[10]
print(qualification_str); basic_quals_dict = nu.load_object('basic_quals_dict')
print(str(basic_quals_dict[qualification_str]) + '\n' if (qualification_str in basic_quals_dict) else '', end='')
basic_quals_dict[qualification_str] = 0
nu.store_objects(basic_quals_dict=basic_quals_dict)

Hands-on 3-5 years of relevant work experience as a Machine Learning Engineer
Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\basic_quals_dict.pkl



----
## Fix Parts-of-Speech and Quals for this posting

In [43]:

t1 = time.time()
file_path = osp.join(cu.SAVES_HTML_FOLDER, file_name)
if osp.isfile(file_path):
    child_strs_list = hau.get_child_strs_from_file(file_name=file_name)
    cu.ensure_filename(file_name, verbose=False)
    cu.populate_from_child_strings(child_strs_list, file_name, verbose=False)
    print(file_name)
    assert hasattr(slrcu, 'pos_predict_percent_fit_dict'), 'slrcu.predict_single needs to be available'
    pos_symbol_predictions_list = [slrcu.predict_single(sent_str) for sent_str in child_strs_list]
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'Child strings list recreated in {duration_str}'; print(speech_str)

1776627_Data_Scientist_tbi_bank.html
Child strings list recreated in 1 minute and 6 seconds


In [44]:

db_pos_list = []
for navigable_parent in child_strs_list:
    db_pos_list = cu.append_parts_of_speech_list(navigable_parent, pos_list=db_pos_list)
pos_list, indices_list = su.visualize_basic_quals_section(pos_symbol_predictions_list, child_strs_list, db_pos_list=db_pos_list, verbose=True)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'Parts-of-speech displayed in {duration_str}'; print(speech_str); speech_engine.say(speech_str); speech_engine.runAndWait()

['H-CS', 'H-CS', 'H-JT', 'O-CS', 'H-CS', 'O-CS', 'O-CS', 'O-RQ', 'O-CS', 'H-JT', 'O-CS', 'O-IP', 'O-IP', 'O-JT', 'O-TS', 'H-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'H-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'H-RQ', 'H-RQ', 'O-PQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-PQ', 'O-PQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-PQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-PQ', 'H-SP', 'H-SP', 'O-TS', 'O-RQ', 'O-SP', 'O-SP', 'O-SP', 'O-TS', 'O-RQ', 'O-RQ', 'H-CS', 'O-IP', 'O-IP', 'O-IP', 'O-CS', 'H-OL', 'O-PD', 'O-IP', 'H-JT', 'O-LN', 'O-O', 'O-LN', 'H-JT', 'H-SP', 'O-SP']
[7, 35, 36, 37, 38, 39, 42, 43, 44, 46, 47, 48, 53, 58, 59]


[7, 35, 36, 37, 38, 39, 42, 43, 44, 46, 47, 48, 53, 58, 59]
Parts-of-speech displayed in 1 minute and 17 seconds


In [None]:
raise

In [45]:

basic_quals_dict = nu.load_object('basic_quals_dict'); column_name = 'is_supplemental_pay'; range_obj = range(50, 60)
for idx in range_obj:
    child_str = child_strs_list[idx]; pos_symbol = pos_list[idx]
    def do_cypher_tx(tx, navigable_parent, verbose=False):
        cypher_str = '''\n            MATCH (np:NavigableParents {navigable_parent: $navigable_parent})\n            ''' + cu.return_every_np_str + ';'
        results_list = tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})
        return [dict(record.items()) for record in results_list]
    with cu.driver.session() as session: row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)
    print(str(basic_quals_dict[child_str]) + '\n' if(child_str in basic_quals_dict) else '', end='')
    print(f'{idx} {pos_symbol}) {child_str}')
    def do_cypher_tx(tx, navigable_parent, verbose=False):
        cypher_str = f'''
            MATCH (np:NavigableParents {{navigable_parent: $navigable_parent}})
            SET
                np.is_job_title = {str(column_name == 'is_job_title').lower()},
                np.is_corporate_scope = {str(column_name == 'is_corporate_scope').lower()},
                np.is_task_scope = {str(column_name == 'is_task_scope').lower()},
                np.is_minimum_qualification = {str(column_name == 'is_minimum_qualification').lower()},
                np.is_preferred_qualification = {str(column_name == 'is_preferred_qualification').lower()},
                np.is_supplemental_pay = {str(column_name == 'is_supplemental_pay').lower()},
                np.is_office_location = {str(column_name == 'is_office_location').lower()},
                np.is_job_duration = {str(column_name == 'is_job_duration').lower()},
                np.is_interview_procedure = {str(column_name == 'is_interview_procedure').lower()},
                np.is_legal_notification = {str(column_name == 'is_legal_notification').lower()},
                np.is_other = {str(column_name == 'is_other').lower()},
                np.is_posting_date = {str(column_name == 'is_posting_date').lower()}
            ''' + cu.return_every_np_str + ';'
        return [dict(record.items()) for record in tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})]
    with cu.driver.session() as session: row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)

50 H-SP) <strong>What we offer:</strong>
51 H-SP) <p>Seize the opportunity to grow your career</p>
52 O-TS) <p>Engage in exciting and meaningful work</p>
53 O-RQ) <p>Get recognition for your work and attitude</p>
54 O-SP) <p>Learn new skills and get management training</p>
55 O-SP) <p>Become part of a large, friendly and supportive team</p>
56 O-SP) <p>Get additional private health insurance</p>
57 O-TS) <p>Receive special prices for multisport card and multiple retailers</p>
58 O-RQ) <p>Obtain preferential prices for our banking products</p>
59 O-RQ) <p>Enjoy a great location in Sofia’s city centre near NDK and South Park</p>


In [None]:

# Display the context of an individual child string
idx = 14
print(indices_list); child_str = child_strs_list[idx]; pos_symbol = pos_list[idx]; basic_quals_dict = nu.load_object('basic_quals_dict')
print(str(basic_quals_dict[child_str]) + '\n' if(child_str in basic_quals_dict) else '', end='')
print(f'{idx} {pos_symbol}) {child_str}')

In [None]:

# Hand-label this particular child string in the quals dictionary
basic_quals_dict = nu.load_object('basic_quals_dict')
basic_quals_dict[child_str] = 1
nu.store_objects(basic_quals_dict=basic_quals_dict); print(f'"{child_str}" in basic_quals_dict: {basic_quals_dict[child_str]}')


----

In [46]:

# Fix headers and Non-headers
basic_quals_dict = nu.load_object('basic_quals_dict')
for idx in range_obj:
    child_str = child_strs_list[idx]; pos_symbol = pos_list[idx]
    print(str(basic_quals_dict[child_str]) + '\n' if(child_str in basic_quals_dict) else '', end='')
    print(f'{idx} {pos_symbol}) {child_str}')
    is_header = str(child_str.startswith('<strong')).lower()
    # is_header = str(':</' in child_str).lower()
    def do_cypher_tx(tx, navigable_parent):
        cypher_str = f'''
            MATCH (np:NavigableParents {{navigable_parent: $navigable_parent}})
            SET np.is_header = {is_header}
            ''' + cu.return_every_np_str + ';'
        return [dict(record.items()) for record in tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})]
    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str)
        ihu.retrain_classifier(row_objs_list[0]['navigable_parent'], row_objs_list[0]['is_header'], verbose=False)

50 H-SP) <strong>What we offer:</strong>
51 H-SP) <p>Seize the opportunity to grow your career</p>
52 O-TS) <p>Engage in exciting and meaningful work</p>
53 O-RQ) <p>Get recognition for your work and attitude</p>
54 O-SP) <p>Learn new skills and get management training</p>
55 O-SP) <p>Become part of a large, friendly and supportive team</p>
56 O-SP) <p>Get additional private health insurance</p>
57 O-TS) <p>Receive special prices for multisport card and multiple retailers</p>
58 O-RQ) <p>Obtain preferential prices for our banking products</p>
59 O-RQ) <p>Enjoy a great location in Sofia’s city centre near NDK and South Park</p>


In [None]:

# Fix Non-headers
basic_quals_dict = nu.load_object('basic_quals_dict')
for idx in range(63, 69):
    child_str = child_strs_list[idx]; pos_symbol = pos_list[idx]
    print(str(basic_quals_dict[child_str]) + '\n' if(child_str in basic_quals_dict) else '', end='')
    print(f'{idx} {pos_symbol}) {child_str}')
    def do_cypher_tx(tx, navigable_parent, verbose=False):
        cypher_str = f'''
            MATCH (np:NavigableParents {{navigable_parent: $navigable_parent}})
            SET np.is_header = false
            ''' + cu.return_every_np_str + ';'
        return [dict(record.items()) for record in tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})]
    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)
        ihu.retrain_classifier(row_objs_list[0]['navigable_parent'], row_objs_list[0]['is_header'], verbose=False)

In [None]:

# Fix Headers
basic_quals_dict = nu.load_object('basic_quals_dict')
for idx in [0]:
    child_str = child_strs_list[idx]; pos_symbol = pos_list[idx]
    print(str(basic_quals_dict[child_str]) + '\n' if(child_str in basic_quals_dict) else '', end='')
    print(f'{idx} {pos_symbol}) {child_str}')
    def do_cypher_tx(tx, navigable_parent):
        cypher_str = f'''
            MATCH (np:NavigableParents {{navigable_parent: $navigable_parent}})
            SET np.is_header = true
            ''' + cu.return_every_np_str + ';'
        return [dict(record.items()) for record in tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})]
    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str)
        ihu.retrain_classifier(row_objs_list[0]['navigable_parent'], row_objs_list[0]['is_header'], verbose=False)


----
## Maintenance

In [35]:

# Display cypher necessary to apply for all the jobs you qualify for that you haven't applied for
pyperclip.copy(cu.get_job_application_links)
row_objs_list = []
with cu.driver.session() as session: row_objs_list = session.write_transaction(cu.do_cypher_tx, cu.get_job_application_links)
if row_objs_list:
    df = DataFrame(row_objs_list)
    display(df)
    for filename in df.filename:
        print(f"""
MATCH (fn:FileNames)
WHERE fn.file_name IN ["{filename}"]
SET fn.is_closed = true
RETURN fn;""")
        break
    for filename in df.filename:
        print(f"""
MATCH (fn:FileNames)
WHERE fn.file_name IN ["{filename}"]
SET
    fn.is_opportunity_application_emailed = true,
    fn.opportunity_application_email_date = date(),
    fn.application_url = "xxxxxx"
RETURN fn;""")
        break
speech_str = 'Job application cypher code copied to clipboard'; speech_engine.say(speech_str); speech_engine.runAndWait()

In [38]:

# Break up overly-long O-RQs:
# Ensure you have already run the "Fix Parts-of-Speech and Quals for 
# this posting" cells above or displayed the context of an 
# individual child string above. Don't close the Notepad++ window 
# until you have replaced the child string
# file_name = 'Data_Science_Consulting_Engagement_Manager_-_Remote_-_Indeed.com_9993304a3df214bf.html'
text_editor_path = r'C:\Program Files\Notepad++\notepad++.exe'
file_path = osp.abspath(osp.join(hau.SAVES_HTML_FOLDER, file_name))
wsu.clean_job_posting(file_path)
try: pyperclip.copy(re.sub("((?:<li>([^><]+)</li>\n)+)", "<ul>\n\\1</ul>\n", '\n'.join(child_strs_list), 0, re.MULTILINE))
except: pass
!"{text_editor_path}" "{file_path}"

In [25]:

file_path = osp.abspath(osp.join(hau.SAVES_HTML_FOLDER, file_name))
wsu.convert_p_b_to_h3(file_path, basic_text_set={'h3'}, phrase_elements_set={'b'}, verbose=False)

In [27]:

file_path = osp.abspath(osp.join(hau.SAVES_HTML_FOLDER, file_name))
wsu.replace_phrase_elements_in_block_elements(file_path, block_elements_set={'li'}, phrase_elements_set={'b'})

In [None]:

file_path = osp.abspath(osp.join(hau.SAVES_HTML_FOLDER, file_name))
wsu.replace_single_child_tags_in_li(file_path)

In [39]:

cu.rebuild_filename_node(file_name, navigable_parent=None, verbose=True)
speech_str = f'{su.get_job_title_from_file_name(file_name)} node rebuild completed'; speech_engine.say(speech_str); speech_engine.runAndWait()


                MATCH (fn:FileNames {file_name: "83ff1d9973fe7277_Systems_Engineer_Remote_Indeed_com.html"})
                SET fn.percent_fit = NULL;



----

In [None]:

# Show what's in the database already for this html string
def do_cypher_tx(tx, navigable_parent, verbose=False):
    cypher_str = '''
        MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
        ''' + cu.return_every_np_str + ';'
    results_list = tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})
    return [dict(record.items()) for record in results_list]
with cu.driver.session() as session: row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)
row_objs_list

In [None]:

# Remove this particular qualification string from the quals dictionary
qualification_str = quals_list[2]
basic_quals_dict = nu.load_object('basic_quals_dict')
basic_quals_dict.pop(qualification_str, None)
nu.store_objects(basic_quals_dict=basic_quals_dict)
print(f'"{qualification_str}" in basic_quals_dict: {qualification_str in basic_quals_dict}')

In [None]:

# Remove this particular qualification string from the database
def do_cypher_tx(tx, qualification_str, verbose=False):
    cypher_str = '''
        MATCH (qs:QualificationStrings {qualification_str: $qualification_str})
        DETACH DELETE qs;
        '''
    results_list = tx.run(query=cypher_str, parameters={'qualification_str': qualification_str})

    return [dict(record.items()) for record in results_list]
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, qualification_str=qualification_str, verbose=False)

In [None]:

# Manually set each feature
def do_cypher_tx(tx, navigable_parent, verbose=False):
    cypher_str = '''
        MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
        SET
            np.is_header = true,
            np.is_task_scope = false,
            np.is_minimum_qualification = false,
            np.is_preferred_qualification = false,
            np.is_educational_requirement = true,
            np.is_legal_notification = false,
            np.is_other = false,
            np.is_corporate_scope = false,
            np.is_job_title = false,
            np.is_office_location = false,
            np.is_job_duration = false,
            np.is_supplemental_pay = false,
            np.is_interview_procedure = false,
            np.is_posting_date = false
        ''' + cu.return_every_np_str + ';'
    return [dict(record.items()) for record in tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})]
with cu.driver.session() as session: row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)
ihu.retrain_classifier(row_objs_list[0]['navigable_parent'], row_objs_list[0]['is_header'], verbose=False); row_objs_list

In [None]:

# Remove file name from database
for file_name in ['']:
    
    # Lose the node features and folder storage
    cu.delete_filename_node(
        file_name, remove_node=True, remove_file=True, verbose=True
    )


----

In [None]:

# Check if the scrfcu has built its parts-of-speech conditional random field model
# Parts-of-speech CRF model normally built in 29 minutes and 57 seconds
t1 = time.time()
if not hasattr(scrfcu, 'pos_symbol_crf'):
    scrfcu.build_pos_conditional_random_field_elements(verbose=True)
if hasattr(scrfcu, 'pos_predict_percent_fit_dict'): print('predict_single is now available')
else: print('predict_single is not available')
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'Parts-of-speech conditional random field model built in {duration_str}'; print(speech_str)

In [None]:

# Check if the ssgdcu has built its parts-of-speech stochastic gradient decent model
t1 = time.time()
if not hasattr(ssgdcu, 'pos_predict_percent_fit_dict'):
    ssgdcu.build_pos_stochastic_gradient_descent_elements(sampling_strategy_limit=None, verbose=True)
if hasattr(ssgdcu, 'pos_predict_percent_fit_dict'): print('predict_single is now available')
else: print('predict_single is not available')
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'Parts-of-speech stochastic gradient descent model built in {duration_str}'; print(speech_str)

In [None]:

# Check if the crf has built its parts-of-speech classifier
# POS classifier normally trained in 15 hours, 42 minutes and 41 seconds
t1 = time.time()
if not hasattr(crf, 'CRF'): crf.build_pos_conditional_random_field_elements(verbose=True)
if hasattr(crf, 'CRF'): print('predict_single is now available')
else: print('predict_single is not available')
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'POS classifier trained in {duration_str}'; print(speech_str)

In [None]:

speech_str = f'Last run on {datetime.now()}'; print(speech_str)