In [1]:

%pprint
%matplotlib inline
import sys
sys.path.insert(1, '../py')
from jobpostlib import (crf, cu, datetime, duration, hau, hc, humanize, ihu, lru, nu, osp, scrfcu, slrcu, ssgdcu, su, t0, time, wsu)
import os
from pandas import DataFrame
import re

# First argument handed to nu.beep in later cells (presumably the tone frequency; units not shown here)
freq = 990

Pretty printing has been turned OFF
Utility libraries created in 8 seconds


In [2]:

# Make sure slrcu already holds its parts-of-speech logistic regression elements, building them when absent
# (a full build of these elements normally takes 1 hour, 27 minutes and 21 seconds)
t1 = time.time()
if not hasattr(slrcu, 'pos_predict_percent_fit_dict'):
    slrcu.build_pos_logistic_regression_elements(sampling_strategy_limit=None, verbose=True)

# Re-check the attribute after the optional build and report the outcome
print(
    'predict_single is available'
    if hasattr(slrcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
print(f'Parts-of-speech logistic regression elements built in {duration_str}')

I have 49,102 labeled parts of speech in here
predict_single is available
Parts-of-speech logistic regression elements built in 11 seconds


In [3]:

# Make sure scrfcu already holds its parts-of-speech conditional random field elements, building them when absent
# (a full build of the CRF elements normally takes 29 minutes and 57 seconds)
t1 = time.time()
if not hasattr(scrfcu, 'pos_symbol_crf'):
    scrfcu.build_pos_conditional_random_field_elements(verbose=True)

# NOTE(review): the build guard above looks at 'pos_symbol_crf' while this status check looks at
# 'pos_predict_percent_fit_dict' - confirm that both attributes are meant to be tested
print(
    'predict_single is now available'
    if hasattr(scrfcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
print(f'Parts-of-speech conditional random field elements built in {duration_str}')

predict_single is now available
Parts-of-speech conditional random field elements built in 2 seconds


In [4]:

# Make sure ssgdcu already holds its parts-of-speech stochastic gradient descent elements,
# building them when absent
t1 = time.time()
if not hasattr(ssgdcu, 'pos_predict_percent_fit_dict'):
    ssgdcu.build_pos_stochastic_gradient_descent_elements(sampling_strategy_limit=None, verbose=True)

# Re-check the attribute after the optional build and report the outcome
print(
    'predict_single is now available'
    if hasattr(ssgdcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
print(f'Parts-of-speech stochastic gradient descent elements built in {duration_str}')

I have 49,102 labeled parts of speech in here
predict_single is now available
Parts-of-speech stochastic gradient descent elements built in 14 seconds


In [5]:

# Make sure crf exposes its parts-of-speech classifier as crf.CRF, building it when absent
# (training the POS classifier normally takes 15 hours, 42 minutes and 41 seconds)
t1 = time.time()
if not hasattr(crf, 'CRF'):
    crf.build_pos_conditional_random_field_elements(verbose=True)

# Re-check the attribute after the optional build and report the outcome
print(
    'predict_single is now available'
    if hasattr(crf, 'CRF')
    else 'predict_single is not available'
)
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
print(f'POS classifier trained in {duration_str}')

predict_single is now available
POS classifier trained in 0 seconds


In [6]:

# Make sure lru holds both ISQUALIFIED_LR and ISQUALIFIED_CV, building the
# is-qualified logistic regression elements when either one is missing
t1 = time.time()
if not hasattr(lru, 'ISQUALIFIED_LR') or not hasattr(lru, 'ISQUALIFIED_CV'):
    lru.build_isqualified_logistic_regression_elements(sampling_strategy_limit=None, verbose=True)
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
print(f'Is-qualified LR elements built in {duration_str}')

I have 424,879 is-qualified vocabulary tokens in here
Is-qualified LR elements built in 7 seconds


In [7]:

# Fit the is-header classifier and report how long the fit took
t1 = time.time()
ihu.build_pos_stochastic_gradient_descent_elements(verbose=True)
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
print(f'Is-header classifier trained in {duration_str}')

I have 49,883 hand-labeled header htmls prepared
7 iterations seen during training fit for a total of 49,883 records trained
Is-header classifier trained in 10 seconds


In [8]:

# Call nu.beep with the notebook's freq and the imported duration (presumably an audible
# notice that the cells above have finished - confirm against nu.beep)
nu.beep(freq, duration)
# Print the current timestamp so the saved output records when this point was last reached
print(f'Last run on {datetime.now()}')

Last run on 2024-04-20 15:17:04.736400



---
# Training

In [20]:

# Re-run this cell whenever the qualification dictionary has been changed,
# either further down in this notebook or in another notebook
t1 = time.time()

# Keep the total retraining time to less than two minutes by adjusting the sampling strategy limit
lru.sync_basic_quals_dict(sampling_strategy_limit=None, verbose=False)

lru.retrain_isqualified_classifier(verbose=True)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')

# The message used to misspell "classifier" as "classifer"
print(f'Is-qualified classifier retrained in {duration_str}')

I have 16,160 hand-labeled qualification strings in here
I have 521,254 is-qualified vocabulary tokens in here
Is-qualified classifer retrained in 1 minute and 34 seconds



---
# Inference

In [21]:

t1 = time.time()

# Walk through every unset %fit value, setting the ones that can be set
# and breaking out to ask for help on the ones that can't
quals_list, file_name = lru.infer_from_hunting_dataframe(fitness_threshold=3/4, verbose=True)

# Report the elapsed time, then beep
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
print(f'Inference completed in {duration_str}')
nu.beep(freq, duration)

Basic Qualifications for Net Developer Web Developer Remote:
quals_list[0] = "<li>.NET Core 3.1 or above: 3 years (Required)</li>" (0.0)
*quals_list[1] = "<li>Candidates should be self-motivated and have 3-6 years demonstrable experience with architecting, designing, coding, testing, and deploying new products</li>" (1.0)
*quals_list[2] = "<li>Serving as the primary technical contact for external communications (eg, from customers or stakeholders)</li>" (0.9998)
*quals_list[3] = "<li>Adapt to change quickly and adjust work accordingly in a positive manner</li>" (1.0)
quals_list[4] = "<li>Excellent troubleshooting and communication skills and an eye for details</li>" (1.0)
*quals_list[5] = "<li>Python knowledge and experience with Pandas (Python Data Analysis Library)</li>" (1.0)
*quals_list[6] = "<li>Familiarity with source control, particularly Git Testing and deploying applications and systems</li>" (0.4547)
*quals_list[7] = "<li>Excellent estimating and risk management skills</li>" 

In [None]:
# Deliberate stop: with no exception being handled, a bare raise fails with a RuntimeError,
# which halts a Run All at this point (presumably so the cells below are only run by hand)
raise

In [None]:

# Hand-label one qual from the inference output (the value stored for it here is 0)
qualification_str = quals_list[1]
print(qualification_str)
basic_quals_dict = nu.load_object('basic_quals_dict')

# Show the label already on record, if there is one, before it gets overwritten
if qualification_str in basic_quals_dict:
    print(basic_quals_dict[qualification_str])

basic_quals_dict[qualification_str] = 0
nu.store_objects(basic_quals_dict=basic_quals_dict)


----
## Fix Parts-of-Speech and Quals for this posting

In [22]:

# Rebuild child_strs_list and the CRF predictions (crf_list) for the posting named by file_name
t1 = time.time()
file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)
if os.path.isfile(file_path):
    child_strs_list = hau.get_child_strs_from_file(file_name=file_name)
    cu.ensure_filename(file_name, verbose=False)
    cu.populate_from_child_strings(child_strs_list, file_name, verbose=False)
    print(file_name)
    child_tags_list = hau.get_child_tags_list(child_strs_list)
    feature_dict_list = cu.get_feature_dict_list(child_tags_list, child_strs_list)

    # One feature tuple per feature dictionary, built with the three single-string predictors
    feature_tuple_list = [
        hc.get_feature_tuple(
            feature_dict, pos_lr_predict_single=slrcu.predict_single, pos_crf_predict_single=scrfcu.predict_single,
            pos_sgd_predict_single=ssgdcu.predict_single
        )
        for feature_dict in feature_dict_list
    ]
    crf_list = crf.CRF.predict_single(crf.sent2features(feature_tuple_list))

    # Only claim success when the lists really were rebuilt
    duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
    print(f'CRF and child strings list recreated in {duration_str}')
else:
    # Previously the success message was printed even though nothing had been rebuilt, leaving
    # child_strs_list and crf_list either undefined or left over from an earlier posting
    print(f'Nothing recreated: {file_path} is not a file, so child_strs_list and crf_list are unset or stale')

74f56e94f75f00a5_Net_Developer_Web_Developer_Remote_Indeed_com.html
CRF and child strings list recreated in 3 minutes and 4 seconds


In [None]:

# Restart the timer so the reported duration covers this cell alone; without this the
# elapsed time was measured from whenever an earlier cell last assigned t1
t1 = time.time()

# Accumulate a parts-of-speech list across all child strings
# (presumably read from the database, going by the db_ prefix - confirm against cu)
db_pos_list = []
for navigable_parent in child_strs_list:
    db_pos_list = cu.append_parts_of_speech_list(navigable_parent, pos_list=db_pos_list)

# Display the section and keep the returned parts of speech and indices for the cells below
pos_list, indices_list = su.visualize_basic_quals_section(crf_list, child_strs_list, db_pos_list=db_pos_list, verbose=True)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech displayed in {duration_str}')
nu.beep(freq, duration)

In [None]:
# Deliberate stop: with no exception being handled, a bare raise fails with a RuntimeError,
# which halts a Run All at this point (presumably so the fix-up cells below are only run by hand)
raise

In [None]:

basic_quals_dict = nu.load_object('basic_quals_dict')
column_name = 'is_task_scope'


def fetch_navigable_parent_tx(tx, navigable_parent, verbose=False):
    """Run the read-only MATCH for one navigable parent and return its records as dictionaries."""
    cypher_str = '''
            MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
            ''' + cu.return_everything_str + ';'
    return [
        dict(record.items())
        for record in tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})
    ]


def do_cypher_tx(tx, navigable_parent, verbose=False):
    """
    Set the thirteen listed flags on the matching node: the one named by the
    notebook-level column_name becomes true and the others false (np.is_header
    is not touched). Returns the query's records as dictionaries.
    """
    cypher_str = f'''
            MATCH (np:NavigableParents {{navigable_parent: $navigable_parent}})
            SET
                np.is_task_scope = {str(column_name == 'is_task_scope').lower()},
                np.is_minimum_qualification = {str(column_name == 'is_minimum_qualification').lower()},
                np.is_preferred_qualification = {str(column_name == 'is_preferred_qualification').lower()},
                np.is_educational_requirement = {str(column_name == 'is_educational_requirement').lower()},
                np.is_legal_notification = {str(column_name == 'is_legal_notification').lower()},
                np.is_other = {str(column_name == 'is_other').lower()},
                np.is_corporate_scope = {str(column_name == 'is_corporate_scope').lower()},
                np.is_job_title = {str(column_name == 'is_job_title').lower()},
                np.is_office_location = {str(column_name == 'is_office_location').lower()},
                np.is_job_duration = {str(column_name == 'is_job_duration').lower()},
                np.is_supplemental_pay = {str(column_name == 'is_supplemental_pay').lower()},
                np.is_interview_procedure = {str(column_name == 'is_interview_procedure').lower()},
                np.is_posting_date = {str(column_name == 'is_posting_date').lower()}
            ''' + cu.return_everything_str + ';'
    return [
        dict(record.items())
        for record in tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})
    ]


# Apply the chosen flag to child strings 50 through 60
for idx in range(50, 61):
    child_str = child_strs_list[idx]
    pos_symbol = pos_list[idx]

    # Read the node first; its result only matters when the disabled filter below is switched back on
    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(fetch_navigable_parent_tx, navigable_parent=child_str, verbose=False)
    # if (row_objs_list[0][column_name] != True):

    # Show any stored label, then the string being changed
    if child_str in basic_quals_dict:
        print(basic_quals_dict[child_str])
    print(f'{idx} {pos_symbol}) {child_str}')

    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)

In [None]:

# Show one child string of the posting together with its part-of-speech symbol and any stored label
idx = 43
print(indices_list)
child_str = child_strs_list[idx]
pos_symbol = pos_list[idx]
basic_quals_dict = nu.load_object('basic_quals_dict')

# The stored label is only printed when this string has one
if child_str in basic_quals_dict:
    print(basic_quals_dict[child_str])
print(f'{idx} {pos_symbol}) {child_str}')

In [None]:

# Store a hand-assigned label of 0 for the current child string in the quals dictionary
basic_quals_dict = nu.load_object('basic_quals_dict')
basic_quals_dict[child_str] = 0
nu.store_objects(basic_quals_dict=basic_quals_dict)

# Echo the stored value back as confirmation
print(f'"{child_str}" in basic_quals_dict: {basic_quals_dict[child_str]}')

In [None]:

# Fix Headers: mark the listed child strings as headers in the database
def do_cypher_tx(tx, navigable_parent, verbose=False):
    """Set is_header to true on the matching NavigableParents node and return the query's records."""
    cypher_str = f'''
            MATCH (np:NavigableParents {{navigable_parent: $navigable_parent}})
            SET
                np.is_header = true
            ''' + cu.return_everything_str + ';'
    results_list = tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})
    return [dict(record.items()) for record in results_list]


basic_quals_dict = nu.load_object('basic_quals_dict')
for idx in [25]:
    child_str = child_strs_list[idx]
    pos_symbol = pos_list[idx]

    # Show any stored label, then the string being changed
    if child_str in basic_quals_dict:
        print(basic_quals_dict[child_str])
    print(f'{idx} {pos_symbol}) {child_str}')

    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)

In [None]:

# Fix Non-headers: mark the listed child strings as not being headers in the database
def do_cypher_tx(tx, navigable_parent, verbose=False):
    """Set is_header to false on the matching NavigableParents node and return the query's records."""
    cypher_str = f'''
            MATCH (np:NavigableParents {{navigable_parent: $navigable_parent}})
            SET
                np.is_header = false
            ''' + cu.return_everything_str + ';'
    results_list = tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})
    return [dict(record.items()) for record in results_list]


basic_quals_dict = nu.load_object('basic_quals_dict')
for idx in [37]:
    child_str = child_strs_list[idx]
    pos_symbol = pos_list[idx]

    # Show any stored label, then the string being changed
    if child_str in basic_quals_dict:
        print(basic_quals_dict[child_str])
    print(f'{idx} {pos_symbol}) {child_str}')

    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)

In [None]:

def do_cypher_tx(tx, navigable_parent, verbose=False):
    """
    Flag the matching NavigableParents node as an interview procedure: is_interview_procedure
    is set to true while is_header and the other listed flags are set to false.
    Returns the query's records as dictionaries.
    """
    cypher_str = '''
        MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
        SET
            np.is_header = false,
            np.is_task_scope = false,
            np.is_minimum_qualification = false,
            np.is_preferred_qualification = false,
            np.is_educational_requirement = false,
            np.is_legal_notification = false,
            np.is_other = false,
            np.is_corporate_scope = false,
            np.is_job_title = false,
            np.is_office_location = false,
            np.is_job_duration = false,
            np.is_supplemental_pay = false,
            np.is_interview_procedure = true,
            np.is_posting_date = false
        ''' + cu.return_everything_str + ';'
    results_list = tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})
    return [dict(record.items()) for record in results_list]


# Apply the flags to the current child_str (set by an earlier cell)
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)

# Feed the first returned row's text and header flag to the is-header classifier, then display the rows
ihu.retrain_classifier(row_objs_list[0]['navigable_parent'], row_objs_list[0]['is_header'], verbose=False)
row_objs_list

In [None]:

# Give every child string containing a question mark the flag named by column_name,
# unless the database already has that flag set to true for it
basic_quals_dict = nu.load_object('basic_quals_dict')
column_name = 'is_interview_procedure'


# Both helpers are defined once here instead of being redefined on every loop iteration
def fetch_navigable_parent_tx(tx, navigable_parent, verbose=False):
    """Run the read-only MATCH for one navigable parent and return its records as dictionaries."""
    cypher_str = '''
                MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
                ''' + cu.return_everything_str + ';'
    results_list = tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})
    return [dict(record.items()) for record in results_list]


def do_cypher_tx(tx, navigable_parent, verbose=False):
    """
    Set the thirteen listed flags on the matching node: the one named by the
    notebook-level column_name becomes true and the others false (np.is_header
    is not touched). Returns the query's records as dictionaries.
    """
    cypher_str = f'''
                    MATCH (np:NavigableParents {{navigable_parent: $navigable_parent}})
                    SET
                        np.is_task_scope = {str(column_name == 'is_task_scope').lower()},
                        np.is_minimum_qualification = {str(column_name == 'is_minimum_qualification').lower()},
                        np.is_preferred_qualification = {str(column_name == 'is_preferred_qualification').lower()},
                        np.is_educational_requirement = {str(column_name == 'is_educational_requirement').lower()},
                        np.is_legal_notification = {str(column_name == 'is_legal_notification').lower()},
                        np.is_other = {str(column_name == 'is_other').lower()},
                        np.is_corporate_scope = {str(column_name == 'is_corporate_scope').lower()},
                        np.is_job_title = {str(column_name == 'is_job_title').lower()},
                        np.is_office_location = {str(column_name == 'is_office_location').lower()},
                        np.is_job_duration = {str(column_name == 'is_job_duration').lower()},
                        np.is_supplemental_pay = {str(column_name == 'is_supplemental_pay').lower()},
                        np.is_interview_procedure = {str(column_name == 'is_interview_procedure').lower()},
                        np.is_posting_date = {str(column_name == 'is_posting_date').lower()}
                    ''' + cu.return_everything_str + ';'
    return [dict(record.items()) for record in tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})]


for idx in range(len(child_strs_list)):
    child_str = child_strs_list[idx]
    pos_symbol = pos_list[idx]
    if '?' not in child_str:
        continue

    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(fetch_navigable_parent_tx, navigable_parent=child_str, verbose=False)

    # A string with no node in the database used to crash the whole sweep with an
    # IndexError on row_objs_list[0]; report it and move on to the next string instead
    if not row_objs_list:
        print(f'{idx} {pos_symbol}) no NavigableParents node found for {child_str}')
        continue

    if (row_objs_list[0][column_name] != True):

        # Show any stored label, then the string being changed
        if child_str in basic_quals_dict:
            print(basic_quals_dict[child_str])
        print(f'{idx} {pos_symbol}) {child_str}')

        with cu.driver.session() as session:
            row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)

In [None]:

# Look up what the database currently holds for this html string
def do_cypher_tx(tx, navigable_parent, verbose=False):
    """Run the read-only MATCH for one navigable parent and return its records as dictionaries."""
    cypher_str = '''
        MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
        ''' + cu.return_everything_str + ';'
    return [
        dict(record.items())
        for record in tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})
    ]


with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)

# Leave the rows as the cell's displayed value
row_objs_list


----
## Maintenance

In [None]:

# Query the postings with a percent fit of at least 0.8 that are neither closed nor marked as
# emailed, then print ready-to-paste Cypher for closing, or marking as emailed, the first one
# (the rows are ordered by percent_fit, highest first)
# Alternative histogram view: lru.display_hunting_dataframe_as_histogram()
cypher_str = '''
    MATCH (fn:FileNames)
    WHERE
        (fn.percent_fit >= 0.8) AND
        ((fn.is_closed IS NULL) OR (fn.is_closed = false)) AND
        ((fn.is_opportunity_application_emailed IS NULL) OR (fn.is_opportunity_application_emailed = false))
    RETURN
        fn.percent_fit AS percent_fit,
        fn.file_name AS file_name,
        fn.posting_url AS posting_url
    ORDER BY fn.percent_fit DESC;'''
row_objs_list = []
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
# if row_objs_list: display(DataFrame(row_objs_list))

df = DataFrame(row_objs_list)
if df.empty:
    # A frame built from no rows has no file_name column, so df.file_name would raise AttributeError
    print('No open, un-emailed postings with a percent fit of at least 0.8 were found')
else:
    # Only the first row is used; as before, this rebinds the notebook-level file_name
    file_name = df.file_name.iloc[0]

    # Statement that closes the posting
    print(f"""
MATCH (fn:FileNames)
WHERE fn.file_name IN ["{file_name}"]
SET fn.is_closed = true
RETURN fn;""")

    # Statement that marks the application as emailed today
    print(f"""
MATCH (fn:FileNames)
WHERE fn.file_name IN ["{file_name}"]
SET fn.is_opportunity_application_emailed = true, fn.opportunity_application_email_date = date()
RETURN fn;""")

In [19]:

# Break up overly-long O-RQs:
# Ensure you have already displayed the context of an individual child string above
# Don't close the Notepad++ window until you have replaced the child string
def display_file_in_text_editor(file_name):
    # Open the saved HTML file called file_name in Notepad++ for manual editing
    # Hard-coded Windows install location of Notepad++
    text_editor_path = r'C:\Program Files\Notepad++\notepad++.exe'
    # Absolute path of the file inside hau.SAVES_HTML_FOLDER
    file_path = osp.abspath(osp.join(hau.SAVES_HTML_FOLDER, file_name))
    # IPython shell escape: launches the editor executable with the file path as its argument
    !"{text_editor_path}" "{file_path}"
# Uses the notebook-level file_name set by an earlier cell
display_file_in_text_editor(file_name)
# Runs after the editor command returns (presumably re-reads the edited file - confirm against cu)
cu.rebuild_filename_node(file_name, navigable_parent=None, verbose=True)
nu.beep(freq, duration)


                MATCH
                    (np:NavigableParents {navigable_parent: "<li>Work Location: Remote</li>"}),
                    (ht:HeaderTags {header_tag: "li"})
                MERGE (ht)-[r:SUMMARIZES]->(np);


In [None]:

# Drop the current child string from the quals dictionary, then delete its node from the database
basic_quals_dict = nu.load_object('basic_quals_dict')
child_str = child_strs_list[idx]
if child_str in basic_quals_dict:
    del basic_quals_dict[child_str]
# basic_quals_dict[child_str] = 0
nu.store_objects(basic_quals_dict=basic_quals_dict)
print(f'"{child_str}" in basic_quals_dict: {child_str in basic_quals_dict}')


def do_cypher_tx(tx, qualification_str, verbose=False):
    """Detach and delete the QualificationStrings nodes matching qualification_str; return the query's records."""
    cypher_str = '''
        MATCH (qs:QualificationStrings {qualification_str: $qualification_str})
        DETACH DELETE qs;
        '''
    return [
        dict(record.items())
        for record in tx.run(query=cypher_str, parameters={'qualification_str': qualification_str})
    ]


with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, qualification_str=child_str, verbose=False)

In [None]:

# Delete the node of every listed file name from the database
# (the list ships with a single empty-string entry; edit it before running)
file_names_list = ['']
for file_name in file_names_list:
    cu.delete_filename_node(file_name, verbose=True)

In [None]:

import re

def separate_qualifications(quals_list):
    """
    Break each qualification string into its component phrases.

    Every string is split on ", ", "; " and ":" (a colon needs no following
    whitespace). A piece containing the whole word "and" (case-insensitive)
    is then folded back onto the piece before it, joined by a single space,
    when that earlier piece still contains a comma, period, or semicolon.
    Finally, surrounding whitespace is trimmed and empty pieces are dropped.

    Args:
        quals_list: An iterable of strings, each one a qualification.

    Returns:
        A list of lists; the inner list at position k holds the phrases
        extracted from the k-th string of quals_list.
    """
    separated_quals = []
    for qual in quals_list:
        merged_quals = []
        for sentence in re.split(r", |; |:", qual):
            # Build a new list instead of popping from the list being enumerated:
            # popping shifted the remaining items left, so the piece right after
            # every merge was never examined
            if (
                merged_quals
                and re.search(r"\b(and)\b", sentence, flags=re.IGNORECASE)
                and re.search(r"[,\.;]", merged_quals[-1])
            ):
                merged_quals[-1] += f" {sentence.strip()}"
            else:
                merged_quals.append(sentence)

        # Remove empty strings and leading/trailing whitespace
        separated_quals.append([q.strip() for q in merged_quals if q.strip()])
    return separated_quals

separated_qualifications = separate_qualifications(quals_list)

# Print each group's phrases one per line, with a blank line closing every group
for qual_group in separated_qualifications:
    print(''.join(f'{qual}\n' for qual in qual_group))