In [1]:

%pprint

Pretty printing has been turned OFF



---
# Load needed libraries and functions

In [2]:

import sys

# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

In [3]:

%matplotlib inline
from datetime import datetime
from neo4j.exceptions import ServiceUnavailable
import humanize
import matplotlib.pyplot as plt
import time
import winsound

# Notebook-wide settings.
# NOTE(review): bin_count, height_inches and width_inches are not referenced by any cell
# visible in this section; presumably they size histograms/figures elsewhere - confirm.
bin_count = 12
duration = 1000  # milliseconds; length of the winsound.Beep played when a long cell finishes
freq = 880  # Hz; pitch of that same beep
height_inches = 3.0
width_inches = 18.0

In [4]:

# Build every utility object the rest of the notebook relies on (s, ha, wsu, cu, hc, lru, crf, su)
# and report how long that took
t0 = time.time()

# Storage object; later cells use it to load and pickle objects (s.load_object / s.store_objects)
from storage import Storage
s = Storage()

from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(verbose=False)

# The Neo4j connection settings are read from the scraper's secrets JSON
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(s=s)
uri = wsu.secrets_json['neo4j']['connect_url']
user =  wsu.secrets_json['neo4j']['username']
password = wsu.secrets_json['neo4j']['password']

# Get the neo4j object (cu.driver is the driver used by every query cell below)
from cypher_utils import CypherUtilities
cu = CypherUtilities(uri=uri, user=user, password=password, driver=None, s=s, ha=ha)

try:
    
    # Asking for the server info doubles as a connectivity check
    version_str = cu.driver.get_server_info().agent
    print(f'======== {version_str} ========')
    
    from hc_utils import HeaderCategories
    hc = HeaderCategories(cu=cu, verbose=False)
    
    # Timing notes from earlier runs; presumably the columns are
    # sampling_strategy_limit, labeled parts of speech found, and elapsed time - confirm
    # 400 6,094 37 minutes and 50 seconds
    # 800 6,094 36 minutes and 42 seconds
    # 1,600 8,349 59 minutes and 6 seconds
    # 3_200 9,974 1 hour, 4 minutes and 24 seconds
    # 6_400 10,635 1 hour, 23 minutes and 11 seconds
    from lr_utils import LrUtilities
    lru = LrUtilities(ha=ha, cu=cu, hc=hc, verbose=False)
    lru.build_isheader_logistic_regression_elements(verbose=False)
    lru.build_isqualified_logistic_regression_elements(sampling_strategy_limit=5_000, verbose=False)
    lru.build_pos_logistic_regression_elements(sampling_strategy_limit=6_400, verbose=True)
    
    from crf_utils import CrfUtilities
    crf = CrfUtilities(ha=ha, hc=hc, cu=cu, lru=lru, verbose=True)
    
    from section_utils import SectionUtilities
    su = SectionUtilities(s=s, ha=ha, cu=cu, crf=crf, verbose=False)
    
    # Silences every warning for the rest of the kernel session
    import warnings
    warnings.filterwarnings('ignore')
except ServiceUnavailable as e:
    # The database is not reachable: explain how to fix it and stop the cell
    print('You need to start Neo4j as a console')
    raise
except Exception as e:
    # NOTE(review): any other failure is only printed, so the cell still prints the
    # "Utility libraries created" line below and hc/lru/crf/su may be left undefined
    print(f'{e.__class__}: {str(e).strip()}')
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
# Audible signal that this long-running cell has finished
winsound.Beep(freq, duration)
print(f'Utility libraries created in {duration_str}')
print(f'Last run on {datetime.now()}')

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\basic_quals_dict.pkl
I have 10,635 labeled parts of speech in here
Got this <class 'numpy.core._exceptions._ArrayMemoryError'> error in build_pos_logistic_regression_elements trying  to turn the pos_symbol TF-IDF matrix into a normal array: Unable to allocate 31.7 GiB for an array with shape (10635, 400406) and data type float64
Utility libraries created in 57 minutes and 34 seconds
Last run on 2023-03-07 21:44:15.020697



---
# Training

In [5]:

# You need to run this again if you changed the qualification dictionary in another notebook
t0 = time.time()

# Keep the total retraining time to less than two minutes by adjusting the sampling strategy limit
# sampling_strategy_limit=9_000 gets 11,365 hand-labeled qualification strings and takes 2 minutes and 25 seconds
# NOTE(review): the last recorded run at 8_000 loaded 11,746 strings and took 3 minutes and 4 seconds,
# so the two-minute target is currently missed; lower the limit if that matters
basic_quals_dict = lru.sync_basic_quals_dict(sampling_strategy_limit=8_000, verbose=False)

# Refit the is-qualified classifier and report the elapsed wall-clock time
lru.retrain_isqualified_classifier(verbose=True)
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
print(f'Is-qualified classifier retrained in {duration_str}')

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\basic_quals_dict.pkl
I have 11,746 hand-labeled qualification strings in here
Retraining complete
Is-qualified classifer retrained in 3 minutes and 4 seconds


In [6]:

from pandas import DataFrame

# Open, unverified, not-yet-applied postings whose percent_fit is exactly zero.
# A plain string is enough here: nothing is interpolated into the query.
cypher_str = '''
    MATCH (fn:FileNames)
    WHERE
        fn.percent_fit = 0.0 AND
        ((fn.is_closed IS NULL) OR (fn.is_closed = false)) AND
        ((fn.is_verified IS NULL) OR (fn.is_verified = false)) AND
        ((fn.is_opportunity_application_emailed IS NULL) OR
        (fn.is_opportunity_application_emailed = false))
    RETURN
        fn.percent_fit AS percent_fit,
        fn.file_name AS file_name,
        fn.posting_url AS url
    ORDER BY fn.percent_fit ASC;'''

# The query only reads, so run it in a managed read transaction
# (Session.write_transaction is deprecated in the 5.x driver in favour of execute_read/execute_write)
with cu.driver.session() as session:
    row_objs_list = session.execute_read(cu.do_cypher_tx, cypher_str)

# An empty result has no file_name column to read, so fall back to an empty list
files_list = DataFrame(row_objs_list).file_name.tolist() if row_objs_list else []
print(f'Only {len(files_list)} more mis-estimated minimum-requirements-met percentages to go!')

Only 1427 more mis-estimated minimum-requirements-met percentages to go!


In [7]:

from pandas import DataFrame

# Open, unverified, not-yet-applied postings that have a role title and whose
# file name contains "data_scien" (case-insensitive), lowest percent_fit first.
# A plain string is enough here: nothing is interpolated into the query.
cypher_str = '''
    MATCH (fn:FileNames)
    WHERE
        (toLower(fn.file_name) CONTAINS "data_scien")
        AND (fn.role_title IS NOT NULL)
        AND ((fn.is_closed IS NULL) OR (fn.is_closed = false))
        AND ((fn.is_verified IS NULL) OR (fn.is_verified = false))
        AND ((fn.is_opportunity_application_emailed IS NULL) OR
            (fn.is_opportunity_application_emailed = false))
    RETURN
        fn.percent_fit AS percent_fit,
        fn.file_name AS file_name
    ORDER BY fn.percent_fit ASC;'''

# The query only reads, so run it in a managed read transaction
# (Session.write_transaction is deprecated in the 5.x driver in favour of execute_read/execute_write)
with cu.driver.session() as session:
    row_objs_list = session.execute_read(cu.do_cypher_tx, cypher_str)

# An empty result has no file_name column to read, so fall back to an empty list
files_list = DataFrame(row_objs_list).file_name.tolist() if row_objs_list else []
print(f'Only {len(files_list)} more mis-estimated minimum-requirements-met percentages to go!')

Only 0 more mis-estimated minimum-requirements-met percentages to go!



----
## Fix POS and Quals for this posting

In [10]:

import os

file_name = 'aafda86facc69d43_Experimentation_Data_Scientist_Remote_Indeed_com.html'
# file_name = files_list.pop()
file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)
if os.path.isfile(file_path):

    # Read the posting's child strings and hand them to the database helpers
    # (presumably these create the file-name node and its child-string nodes - confirm)
    child_strs_list = ha.get_child_strs_from_file(file_name=file_name)
    cu.ensure_filename(file_name, verbose=False)
    cu.populate_from_child_strings(child_strs_list, file_name, verbose=False)

    # Decide header/non-header for each child string; when the lookup returns None,
    # fall back to the classifier and take the more probable of (True, False).
    # The loop uses its own names so it no longer overwrites the notebook-level
    # idx and child_str that the labelling cells further down read.
    child_tags_list = ha.get_child_tags_list(child_strs_list)
    is_header_list = []
    for is_header, html_str in zip(ha.get_is_header_list(child_strs_list), child_strs_list):
        if is_header is None:
            probs_list = lru.ISHEADER_PREDICT_PERCENT_FIT(html_str)
            is_header = [True, False][probs_list.index(max(probs_list))]
        is_header_list.append(is_header)

    # Turn the strings into CRF features and predict one label per child string
    feature_dict_list = hc.get_feature_dict_list(child_tags_list, is_header_list, child_strs_list)
    feature_tuple_list = [
        hc.get_feature_tuple(feature_dict, lru.pos_lr_predict_single)
        for feature_dict in feature_dict_list
    ]
    crf_list = crf.CRF.predict_single(crf.sent2features(feature_tuple_list))
    print(file_name)
else:

    # Previously a missing file was skipped without a word, so the following cells
    # silently kept working on the child_strs_list/crf_list of an earlier posting
    print(f'{file_path} is not a file: child_strs_list and crf_list were NOT refreshed')

aafda86facc69d43_Experimentation_Data_Scientist_Remote_Indeed_com.html


In [11]:

# Accumulate one parts-of-speech entry per child string of the current posting.
# NOTE(review): given the db_ prefix this presumably looks each string up in the database - confirm
db_pos_list = []
for navigable_parent in child_strs_list:
    db_pos_list = cu.append_parts_of_speech_list(navigable_parent, pos_list=db_pos_list)
# Show the section; pos_list and indices_list are indexed by the labelling cells below
pos_list, indices_list = su.visualize_basic_quals_section(crf_list, child_strs_list, db_pos_list=db_pos_list, verbose=True)
# Audible signal that the cell has finished
winsound.Beep(freq, duration)

['H-TS', 'O-O', 'O-O', 'O-O', 'H-JT', 'H-OL', 'H-IP', 'O-JD', 'O-OL', 'H-SP', 'H-SP', 'O-SP', 'O-SP', 'H-TS', 'O-CS', 'O-O', 'O-TS', 'O-TS', 'H-TS', 'O-LN', 'O-LN', 'O-TS', 'O-TS', 'O-CS', 'O-CS', 'O-TS', 'H-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-LN', 'H-O', 'H-O', 'O-PD']
[27, 28, 29, 30, 31, 32, 33, 34]


[27, 28, 29, 30, 31, 32, 33, 34]


In [None]:
# Deliberate stop sign: a bare `raise` with no active exception raises RuntimeError,
# presumably to keep "Run All" from reaching the manual labelling cells below
raise

In [20]:

# Display the context of an individual child string
idx = 27
print(indices_list)

# Pick the string and its predicted label, then reload the latest quals dictionary
child_str = child_strs_list[idx]
pos_symbol = pos_list[idx]
basic_quals_dict = s.load_object('basic_quals_dict')

# Show the existing hand label first, but only when the string has one
if child_str in basic_quals_dict:
    print(str(basic_quals_dict[child_str]) + '\n', end='')
print(f'{idx} {pos_symbol}) {child_str}')

[27, 28, 29, 30, 31, 32, 33, 34]
0
27 O-RQ) <li>MS or Ph.D. or equivalent experience in a quantitative field or 6+ years of proven ability as a Data Scientist. Preferably in digital media or product fields.</li>


In [21]:

# Hand-label this particular child string in the quals dictionary
# (reload first so the label is added to the latest pickled copy)
basic_quals_dict = s.load_object('basic_quals_dict')
basic_quals_dict[child_str] = 1

# Pickle the dictionary again, then echo the stored label as confirmation
s.store_objects(basic_quals_dict=basic_quals_dict)
print(f'"{child_str}" in basic_quals_dict: {basic_quals_dict[child_str]}')

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\basic_quals_dict.pkl
"<li>MS or Ph.D. or equivalent experience in a quantitative field or 6+ years of proven ability as a Data Scientist. Preferably in digital media or product fields.</li>" in basic_quals_dict: 1


In [19]:

def do_cypher_tx(tx, navigable_parent, verbose=False):
    """
    Flag the NavigableParents node matching navigable_parent as task scope.

    Sets is_task_scope to the string 'True' and every other flag listed in the
    query to the string 'False', then appends cu.return_everything_str (presumably
    the RETURN clause) and hands back each resulting record as a dictionary.

    Parameters:
        tx: the transaction to run the query in
        navigable_parent: the HTML string identifying the node
        verbose: accepted but not used by this function
    """
    match_and_set_str = """MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
        SET
            np.is_header = 'False',
            np.is_task_scope = 'True',
            np.is_minimum_qualification = 'False',
            np.is_preferred_qualification = 'False',
            np.is_educational_requirement = 'False',
            np.is_legal_notification = 'False',
            np.is_other = 'False',
            np.is_corporate_scope = 'False',
            np.is_job_title = 'False',
            np.is_office_location = 'False',
            np.is_job_duration = 'False',
            np.is_supplemental_pay = 'False',
            np.is_interview_procedure = 'False',
            np.is_posting_date = 'False'
        """
    cypher_str = match_and_set_str + cu.return_everything_str + ';'
    parameter_dict = {'navigable_parent': navigable_parent}

    # Copy each record into a plain dictionary
    row_dicts_list = []
    for record in tx.run(query=cypher_str, parameters=parameter_dict):
        row_dicts_list.append(dict(record.items()))

    return row_dicts_list
# Run the update in a managed write transaction; Session.write_transaction is
# deprecated in the 5.x driver and execute_write takes the same arguments
with cu.driver.session() as session:
    row_objs_list = session.execute_write(do_cypher_tx, navigable_parent=child_str)
row_objs_list

[{'navigable_parent': '<li>Serve as strategic partner with business to size and prioritize tests that could bring high-yield testing results to maximize business impact.</li>', 'is_header': 'False', 'is_task_scope': 'True', 'is_qualification': None, 'is_minimum_qualification': 'False', 'is_preferred_qualification': 'False', 'is_legal_notification': 'False', 'is_job_title': 'False', 'is_office_location': 'False', 'is_job_duration': 'False', 'is_supplemental_pay': 'False', 'is_educational_requirement': 'False', 'is_interview_procedure': 'False', 'is_corporate_scope': 'False', 'is_posting_date': 'False', 'is_other': 'False'}]

In [35]:

# Show what's in the database already for this html string
def do_cypher_tx(tx, navigable_parent, verbose=False):
    """
    Fetch the NavigableParents node matching navigable_parent without changing it.

    The query is the MATCH clause followed by cu.return_everything_str (presumably
    the RETURN clause); each resulting record is handed back as a dictionary.

    Parameters:
        tx: the transaction to run the query in
        navigable_parent: the HTML string identifying the node
        verbose: accepted but not used by this function
    """
    match_str = '''MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
        '''
    cypher_str = match_str + cu.return_everything_str + ';'
    parameter_dict = {'navigable_parent': navigable_parent}

    # Copy each record into a plain dictionary
    row_dicts_list = []
    for record in tx.run(query=cypher_str, parameters=parameter_dict):
        row_dicts_list.append(dict(record.items()))

    return row_dicts_list
# This lookup only reads, so use a managed read transaction; Session.write_transaction
# is deprecated in the 5.x driver and execute_read takes the same arguments
with cu.driver.session() as session:
    row_objs_list = session.execute_read(do_cypher_tx, navigable_parent=child_str)
row_objs_list

[{'navigable_parent': '<orq>1 - Data Science (P4 - Expert)</orq>', 'is_header': 'False', 'is_task_scope': 'False', 'is_qualification': None, 'is_minimum_qualification': 'True', 'is_preferred_qualification': 'False', 'is_legal_notification': 'False', 'is_job_title': 'False', 'is_office_location': 'False', 'is_job_duration': 'False', 'is_supplemental_pay': 'False', 'is_educational_requirement': 'False', 'is_interview_procedure': 'False', 'is_corporate_scope': 'False', 'is_posting_date': 'False', 'is_other': 'False'}]

In [111]:

# Remove this particular child string from the quals dictionary and database
# NOTE(review): idx and child_strs_list are not set in this cell; they are whatever
# an earlier cell last left in the kernel - check them before running
basic_quals_dict = s.load_object('basic_quals_dict')
child_str = child_strs_list[idx]
# pop with a None default so a string that was never labelled is not an error
basic_quals_dict.pop(child_str, None)
# basic_quals_dict[child_str] = 0
s.store_objects(basic_quals_dict=basic_quals_dict)
# Should print False once the string is gone
print(f'"{child_str}" in basic_quals_dict: {child_str in basic_quals_dict}')
def do_cypher_tx(tx, qualification_str, verbose=False):
    """
    Detach and delete the QualificationStrings node(s) matching qualification_str.

    Parameters:
        tx: the transaction to run the query in
        qualification_str: value of the qualification_str property to match
        verbose: accepted but not used by this function

    Returns:
        A list with one dictionary per record produced by the query.
    """
    cypher_str = '''
        MATCH (qs:QualificationStrings {qualification_str: $qualification_str})
        DETACH DELETE qs;
        '''
    parameter_dict = {'qualification_str': qualification_str}

    # Copy each record into a plain dictionary
    row_dicts_list = []
    for record in tx.run(query=cypher_str, parameters=parameter_dict):
        row_dicts_list.append(dict(record.items()))

    return row_dicts_list
# Run the delete in a managed write transaction; Session.write_transaction is
# deprecated in the 5.x driver and execute_write takes the same arguments
with cu.driver.session() as session:
    row_objs_list = session.execute_write(do_cypher_tx, qualification_str=child_str, verbose=False)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\basic_quals_dict.pkl
"<li>Understand the needs and challenges of fast-growth organization and participate in its development</li>" in basic_quals_dict: False



----

In [99]:

# Mark the file name as needing retraining everywhere
import numpy as np
from IPython.display import clear_output

# file_name = '14885afaa7bbd01e_Software_Developer_Engineer_in_Test_SDET_Eagle_ID_83616_Indeed_com.html'
# Select this posting's rows by file name. The mask used to be built from the
# percent_fit column, which never equals a file name, so no row was ever reset.
# NOTE(review): assumes hunting_df has a file_name column, like the FileNames nodes - confirm
mask_series = lru.hunting_df['file_name'].isin([file_name])

# Blank the estimate for those rows, then pickle the data frame again
lru.hunting_df.loc[mask_series, 'percent_fit'] = np.nan
s.store_objects(hunting_df=lru.hunting_df)
def do_cypher_tx(tx, file_name, verbose=False):
    """
    Clear percent_fit and set is_verified to false on the FileNames node for file_name.

    Parameters:
        tx: the transaction to run the query in
        file_name: value of the file_name property to match
        verbose: when true, clear the cell output and print the query with the
            file name filled in

    Returns:
        A list with one dictionary per returned record.
    """
    cypher_str = """
        MATCH (fn:FileNames {file_name: $file_name})
        SET fn.percent_fit = NULL, fn.is_verified = false
        RETURN fn;"""
    if verbose:
        clear_output(wait=True)
        readable_str = cypher_str.replace('$file_name', f'"{file_name}"')
        print(readable_str)
    parameter_dict = {'file_name': file_name}

    # Copy each record into a plain dictionary
    row_dicts_list = []
    for record in tx.run(query=cypher_str, parameters=parameter_dict):
        row_dicts_list.append(dict(record.items()))

    return row_dicts_list
# Run the update in a managed write transaction; Session.write_transaction is
# deprecated in the 5.x driver and execute_write takes the same arguments
with cu.driver.session() as session:
    row_objs_list = session.execute_write(do_cypher_tx, file_name=file_name, verbose=False)
row_objs_list

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\hunting_df.pkl


[{'fn': <Node element_id='968277' labels=frozenset({'FileNames'}) properties={'assigned_role': 'Data Science Practitioner', 'career_level_from_to': '9 to 8', 'file_name': '4698820_0_META_PLATFORMS_INC_Data_Scientist.html', 'role_primary_contact_email_id': 'k.balasubramaniam@accenture.com', 'is_role_sold': 'Yes', 'role_title': 'Data Scientist', 'role_client_supply_contact': 'Navarrete Méndez,Maria Patricia', 'is_verified': False, 'role_id': '4698820.0', 'project_metro_city': 'San Francisco', 'role_end_date': '12/1/2023', 'role_primary_contact': 'Krishnamurthy,Balasubramaniam', 'role_start_date': '3/1/2023', 'client_name': 'META PLATFORMS, INC.'}>}]

In [112]:

# You've made no changes to the qualification dictionary (regardless of parts-of-speech changes)
from IPython.display import clear_output

def do_cypher_tx(tx, file_name, verbose=False):
    """
    Set is_verified to true on the FileNames node for file_name.

    Parameters:
        tx: the transaction to run the query in
        file_name: value of the file_name property to match
        verbose: when true, clear the cell output and print the query with the
            file name filled in

    Returns:
        A list with one dictionary per returned record.
    """
    cypher_str = """
        MATCH (fn:FileNames {file_name: $file_name})
        SET fn.is_verified = true
        RETURN fn;"""
    if verbose:
        clear_output(wait=True)
        readable_str = cypher_str.replace('$file_name', f'"{file_name}"')
        print(readable_str)
    records = tx.run(query=cypher_str, parameters={'file_name': file_name})

    return [dict(record.items()) for record in records]
# Run the update in a managed write transaction; Session.write_transaction is
# deprecated in the 5.x driver and execute_write takes the same arguments
with cu.driver.session() as session:
    row_objs_list = session.execute_write(do_cypher_tx, file_name=file_name, verbose=True)
row_objs_list


        MATCH (fn:FileNames {file_name: "4698820_META_PLATFORMS_INC_Data_Scientist.html"})
        SET fn.is_verified = true
        RETURN fn;


[{'fn': <Node element_id='957809' labels=frozenset({'FileNames'}) properties={'assigned_role': 'Data Science Practitioner', 'career_level_from_to': '9 to 8', 'file_name': '4698820_META_PLATFORMS_INC_Data_Scientist.html', 'role_primary_contact_email_id': 'k.balasubramaniam@accenture.com', 'is_role_sold': 'Yes', 'role_title': 'Data Scientist', 'role_client_supply_contact': 'Navarrete M?ez,Maria Patricia', 'is_verified': True, 'role_id': '4698820', 'project_metro_city': 'San Francisco', 'role_end_date': '12/1/2023', 'percent_fit': 0.16666666666666666, 'role_primary_contact': 'Krishnamurthy,Balasubramaniam', 'role_start_date': '3/1/2023', 'client_name': 'META PLATFORMS, INC.'}>}]

In [None]:

# Mark the file name as closed
# The file name is passed as a query parameter instead of being pasted into the
# Cypher text, so a name containing a double quote can no longer break the query
cypher_str = '''
    MATCH (fn:FileNames {file_name: $file_name})
    SET fn.is_closed = true
    RETURN fn;'''

# Echo the query with the file name filled in, as the other cells do
print(cypher_str.replace('$file_name', f'"{file_name}"'))


def mark_closed_tx(tx, file_name):
    """
    Set is_closed to true on the FileNames node for file_name.

    Returns a list with one dictionary per returned record, the same shape the
    other transaction functions in this notebook return.
    """
    records = tx.run(query=cypher_str, parameters={'file_name': file_name})

    return [dict(record.items()) for record in records]


# Managed write transaction (Session.write_transaction is deprecated in the 5.x driver)
with cu.driver.session() as session:
    row_objs_list = session.execute_write(mark_closed_tx, file_name=file_name)
row_objs_list

In [None]:

# Manually label the unscored qual
# NOTE(review): quals_list is not defined in any cell visible in this section, so this
# cell depends on kernel state from elsewhere; 13 is a hand-picked position in that list
qualification_str = quals_list[13]
print(qualification_str)
# Reload the pickled dictionary, store a 0 label for the string, and pickle it again
basic_quals_dict = s.load_object('basic_quals_dict')
basic_quals_dict[qualification_str] = 0
s.store_objects(basic_quals_dict=basic_quals_dict)

In [None]:

# Remove file name from database
# NOTE(review): file_name is whatever an earlier cell last assigned; uncomment and edit
# the line below to target a specific posting instead
# file_name = '3c031ea6ad293e92_General_Service_Technician_Westborough_MA_01581_Indeed_com.html'
cu.delete_filename_node(file_name, verbose=True)


----

In [195]:

# Take a badly written requirements section and see if you can programmatically parse the qualification string out of it
import re
from nltk.tokenize import sent_tokenize

# sampling_strategy_limit=6_400 gets 10,635 labeled parts of speech and takes 49 minutes and 30 seconds
# sampling_strategy_limit=7_000 gets 10,635 labeled parts of speech and takes 49 minutes and 30 seconds
# NOTE(review): the timings above are for 6_400 and 7_000 but the call passes 70_000 - confirm this is intended
lru.build_pos_logistic_regression_elements(sampling_strategy_limit=70_000, verbose=True)


def _predict_crf_list(strs_list):
    """
    Predict one CRF label per string in strs_list.

    This is the header-detection / feature-building / CRF-prediction sequence that
    used to be pasted twice in this cell. Keeping it in a function also stops its
    loop from overwriting the notebook-level child_str and idx variables.

    Parameters:
        strs_list: the strings to label, in document order

    Returns:
        Whatever crf.CRF.predict_single returns for the features of strs_list
        (indexed below as a sequence of labels such as 'H-RQ').
    """
    tags_list = ha.get_child_tags_list(strs_list)

    # When the header lookup returns None, fall back to the classifier and
    # take the more probable of (True, False)
    headers_list = []
    for is_header, html_str in zip(ha.get_is_header_list(strs_list), strs_list):
        if is_header is None:
            probs_list = lru.ISHEADER_PREDICT_PERCENT_FIT(html_str)
            is_header = [True, False][probs_list.index(max(probs_list))]
        headers_list.append(is_header)

    dicts_list = hc.get_feature_dict_list(tags_list, headers_list, strs_list)
    tuples_list = [
        hc.get_feature_tuple(feature_dict, lru.pos_lr_predict_single)
        for feature_dict in dicts_list
    ]

    return crf.CRF.predict_single(crf.sent2features(tuples_list))


child_str = '<p>REQUIREMENTS: Requires a Bachelor’s degree, or foreign equivalent degree in Computer Science, Computer Engineering, or Electronic Engineering and four (4) years of experience in the job offered,'
child_str += ' or four (4) years of experience in a related occupation driving strategy and approach through solution and enterprise testing; executing automation through Ginger, CI/CD, Agile, Python, Java languages,'
child_str += ' and testing tools such as Selenium; collaborating with cross functional teams to analyze, develop, and implement end-to-end solutions; using existing and modernized tooling such as, Jira Align,'
child_str += ' iTrack, Zephyr, AI/ML, and AQUA; performing the walkthrough and grooming of capabilities and features in cases with ARTs; creating test plans,'
child_str += ' scenarios/use cases and test cases associated with capabilities and features; ensuring that all test cases are in alignment with automation frameworks; writing E2E scenario test cases,'
child_str += ' maximizing test coverage for a feature, and minimizing the impact of disruptive test cases;'
child_str += ' designing and implementing automation tests and frameworks to enable continuous deployment and continuous testing for CTP across all phases;'
child_str += ' and ensuring that all automation scripts have gone through standard code quality checks, incorporating Gerrit Code Review and Cloud Review.</p>'

# Strip the HTML tags. count/flags are no longer passed positionally (deprecated in
# Python 3.13); count=0 is the default and re.MULTILINE had no effect on this pattern
qual_paragraph = re.sub('</?[^<>]+>', '', child_str.strip())

# Only a paragraph that is one long sentence needs to be taken apart by punctuation
if len(sent_tokenize(qual_paragraph)) < 2:

    # First cut at the colons: the leading piece should be the section header
    child_strs_list = re.split(' *: *', qual_paragraph)
    crf_list = _predict_crf_list(child_strs_list)
    if crf_list[0] == 'H-RQ':

        # The header is a requirements header, so drop it, restore any later colons,
        # and cut the remainder at the semicolons into individual qualifications
        child_strs_list = re.split(' *; *', ': '.join(child_strs_list[1:]))
        crf_list = _predict_crf_list(child_strs_list)
        db_pos_list = []
        for navigable_parent in child_strs_list:
            db_pos_list = cu.append_parts_of_speech_list(navigable_parent, pos_list=db_pos_list)
        pos_list, indices_list = su.visualize_basic_quals_section(crf_list, child_strs_list, db_pos_list=db_pos_list, verbose=True)

I have 10,635 labeled parts of speech in here
['O-RQ', 'O-PQ', 'O-TS', 'O-TS', 'O-RQ', 'O-TS', 'O-TS', 'O-TS', 'O-LN', 'O-TS']
[0, 4]


[0, 4]
