In [1]:

# Toggle IPython's pretty printing (the recorded output shows it ending up OFF).
# NOTE(review): this magic is a toggle, so running the cell a second time in the
# same kernel flips the setting back.
%pprint

Pretty printing has been turned OFF


In [2]:

import sys

# Make the modules under ../py importable from this notebook (presumably where
# storage, ha_utils, scrape_utils, etc. live -- confirm).
# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

In [3]:

# Instantiate the shared helpers in the order they depend on each other:
# object storage, header analysis, the scraper (which holds the secrets),
# and finally the Cypher utilities that own the Neo4j driver.
from storage import Storage
s = Storage()

from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(verbose=False)

from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities()

# Read the three Neo4j connection settings from the secrets mapping
uri, user, password = (
    wsu.secrets_json['neo4j'][key] for key in ('connect_url', 'username', 'password')
)

from cypher_utils import CypherUtilities
cu = CypherUtilities(
    uri=uri, user=user, password=password, driver=None, s=s, ha=ha
)

In [4]:

from neo4j.exceptions import ServiceUnavailable

# Confirm the database is reachable, then build the helpers that need it.
# Everything lives in one try block so that an unreachable server stops the
# setup before any database-backed helper is constructed.
try:
    version_str = cu.driver.verify_connectivity()
    print(f'======== {version_str} ========')
    
    from hc_utils import HeaderCategories
    hc = HeaderCategories(cu=cu, verbose=False)
    
    from section_utils import SectionUtilities
    su = SectionUtilities(s=s, ha=ha, cu=cu, verbose=False)
    
    from lr_utils import LrUtilities
    lru = LrUtilities(ha=ha, cu=cu, hc=hc, verbose=False)
    
    from crf_utils import CrfUtilities
    crf = CrfUtilities(ha=ha, hc=hc, cu=cu, verbose=False)
    
    import warnings
    warnings.filterwarnings('ignore')
except ServiceUnavailable as e:
    
    # Keep the friendlier message, but chain the driver's error so the
    # original cause still shows up in the traceback
    raise ServiceUnavailable('You need to start Neo4j as a console') from e
except Exception as e:
    
    # Best effort: do not abort the notebook, but report the message as well as
    # the class so the failure can be diagnosed. Any helper not yet built above
    # stays undefined, and later cells that use it will raise NameError.
    print(e.__class__, str(e).strip())



In [5]:

from datetime import datetime
from urllib.parse import urlparse, parse_qs
from IPython.display import clear_output
import shutil
import time
import humanize
%run ../load_magic/dataframes.py

In [6]:

# Query template for looking up one NavigableParents node by its text.
# The doubled braces are written so they survive as literal braces if the
# template is filled with str.format(); the bare {} is the slot for the
# navigable-parent string.
# NOTE(review): the .format() call itself is not visible here -- confirm usage.
navigable_parent_cypher_str = (
    "\n    MATCH (np:NavigableParents {{navigable_parent: '{}'}})\n    "
    + cu.return_everything_str
    + ';'
)


---
# Training

In [7]:

# Rebuild the parts-of-speech logistic-regression elements, retrain the
# parts-of-speech classifier, and report the wall-clock time for both steps
t0 = time.time()
lru.build_pos_logistic_regression_elements(verbose=False)
crf.retrain_pos_classifier(verbose=False)
elapsed_seconds = time.time() - t0
duration_str = humanize.precisedelta(
    elapsed_seconds, minimum_unit='seconds', format='%0.0f'
)
print(f'Parts-of-speech classifier trained in {duration_str}')

Parts-of-speech classifier trained in 7 minutes and 7 seconds


In [8]:

# Rebuild the is-header logistic-regression elements, retrain that
# classifier, and report the wall-clock time for both steps
t0 = time.time()
lru.build_isheader_logistic_regression_elements(verbose=False)
lru.retrain_isheader_classifier(verbose=False)
elapsed_seconds = time.time() - t0
duration_str = humanize.precisedelta(
    elapsed_seconds, minimum_unit='seconds', format='%0.0f'
)
print(f'Is-header classifier trained in {duration_str}')

Is-header classifier trained in 1 minute and 39 seconds


In [9]:

# Rebuild the classifier from the quals dictionary and report how long the
# rebuild plus retrain took
t0 = time.time()
lru.build_isqualified_logistic_regression_elements(verbose=False)
lru.retrain_isqualified_classifier(verbose=False)
elapsed_seconds = time.time() - t0
duration_str = humanize.precisedelta(
    elapsed_seconds, minimum_unit='seconds', format='%0.0f'
)
print(f'Is-qualified classifer trained in {duration_str}')

Is-qualified classifer trained in 7 seconds



----

In [380]:

# NOTE(review): absolute, machine-specific source path; consider deriving it
# from a configurable data folder so the notebook runs on other machines.
file_path = r'C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\data\html\other_email.html'

# Build a file-system-safe name from the pasted posting header: every run of
# non-alphanumeric characters becomes a single underscore
posting_header = '''
Sr. Data Scientist
UMass Memorial Health
Worcester, MA 01608
'''
file_name = re.sub(r'[^A-Za-z0-9]+', ' ', posting_header).strip().replace(' ', '_') + '.html'
new_file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)

# Never overwrite an earlier save: prefix a timestamp when the name is taken
if os.path.isfile(new_file_path):
    file_name = datetime.now().strftime('%Y%m%d%H%M%S%f') + f'_{file_name}'
    new_file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)

if not os.path.isfile(new_file_path):
    
    # Parse and validate BEFORE copying, so a page without the expected
    # description <div> does not leave an orphaned copy in the saves folder
    # (which would also force a timestamped duplicate on the next run)
    page_soup = get_page_soup(file_path)
    div_soup = page_soup.find(name='div', id='jobDescriptionText')
    if div_soup is None:
        raise ValueError(f'No <div id="jobDescriptionText"> found in {file_path}')
    child_strs_list = ha.get_navigable_children(div_soup, [])
    
    # Only now persist the file and load its child strings into the database
    shutil.copy(file_path, new_file_path)
    cu.ensure_filename(file_name, verbose=False)
    cu.populate_from_child_strings(child_strs_list, file_name, verbose=False)
    print(file_name)

Sr_Data_Scientist_UMass_Memorial_Health_Worcester_MA_01608.html


In [381]:

# Record where the posting came from: set the posting_url property on the
# FileNames node whose file_name matches the current file_name.
# NOTE(review): file_name and the URL are spliced into the query text rather
# than passed as query parameters; a double quote in either would break the
# statement.
cypher_str = f'''
    MATCH (fn:FileNames {{file_name: "{file_name}"}})
    SET fn.posting_url = "https://www.indeed.com/viewjob?jk=78378484a2e75fd6&tk=1g9nsupd7v4et808&from=jobi2a&advn=1593641562529740&adid=395461290&ad=-6NYlbfkN0AMaXa3HsIVUHhGOm8qWa3jps87ppVspeqLYuXaXfwUvTtXN3pcVmE_N7alNJEpzZj3mgotD54zrWPVlfmCXff-woQlMoXgzqETYa5F5ct05VMXgjmEyf0XKSWbSLlLvVRfNZXI1NYWuGTowU6iNq2fuZNaxJ96oipbVurd7ncAqCBr4ieYUdGK1fbMfxkusHVqRvDlgxnEkEPWZlKANlZMvQGDWgrlbgxgSTFQzxfjS1fLytJ3LEsJGhmdJmuAPvQhRMQxlURfFkQfZ9cMknDpbehyipfNja7AWsR8j5hTTaAgLiRIvGv8JCIVdSlmJUb3E2Qw1IosQMCjoZvEGalh2VWjp9_S7U8eihLCKDW0kaQUsgPrPddEH5gdk3q9UrO8EjohgMkCgktl0zKuAeCUxMD5voENPl7wu9WQR5ppAGbaPZZoxsh4Hh8d_gQkrRM_ZaQ0y2ObsEIOhrAEJFlf&pub=21d85ca573e478f5e659e48885c828920cace3277f6b99df&i2af=jobi2a_smbrez_posting_email&xkcb=SoD5-_M3b9pFxlg0WL0PbzkdCdPP"
    RETURN fn;'''
# Echo the exact statement so it can be inspected before/after it runs
print(cypher_str)
# NOTE(review): newer neo4j Python drivers name this call execute_write --
# confirm against the installed driver version.
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
# Bare last expression: display the rows returned by the statement
row_objs_list


    MATCH (fn:FileNames {file_name: "Sr_Data_Scientist_UMass_Memorial_Health_Worcester_MA_01608.html"})
    SET fn.posting_url = "https://www.indeed.com/viewjob?jk=78378484a2e75fd6&tk=1g9nsupd7v4et808&from=jobi2a&advn=1593641562529740&adid=395461290&ad=-6NYlbfkN0AMaXa3HsIVUHhGOm8qWa3jps87ppVspeqLYuXaXfwUvTtXN3pcVmE_N7alNJEpzZj3mgotD54zrWPVlfmCXff-woQlMoXgzqETYa5F5ct05VMXgjmEyf0XKSWbSLlLvVRfNZXI1NYWuGTowU6iNq2fuZNaxJ96oipbVurd7ncAqCBr4ieYUdGK1fbMfxkusHVqRvDlgxnEkEPWZlKANlZMvQGDWgrlbgxgSTFQzxfjS1fLytJ3LEsJGhmdJmuAPvQhRMQxlURfFkQfZ9cMknDpbehyipfNja7AWsR8j5hTTaAgLiRIvGv8JCIVdSlmJUb3E2Qw1IosQMCjoZvEGalh2VWjp9_S7U8eihLCKDW0kaQUsgPrPddEH5gdk3q9UrO8EjohgMkCgktl0zKuAeCUxMD5voENPl7wu9WQR5ppAGbaPZZoxsh4Hh8d_gQkrRM_ZaQ0y2ObsEIOhrAEJFlf&pub=21d85ca573e478f5e659e48885c828920cace3277f6b99df&i2af=jobi2a_smbrez_posting_email&xkcb=SoD5-_M3b9pFxlg0WL0PbzkdCdPP"
    RETURN fn;


[{'fn': <Node id=852753 labels=frozenset({'FileNames'}) properties={'file_name': 'Sr_Data_Scientist_UMass_Memorial_Health_Worcester_MA_01608.html', 'posting_url': 'https://www.indeed.com/viewjob?jk=78378484a2e75fd6&tk=1g9nsupd7v4et808&from=jobi2a&advn=1593641562529740&adid=395461290&ad=-6NYlbfkN0AMaXa3HsIVUHhGOm8qWa3jps87ppVspeqLYuXaXfwUvTtXN3pcVmE_N7alNJEpzZj3mgotD54zrWPVlfmCXff-woQlMoXgzqETYa5F5ct05VMXgjmEyf0XKSWbSLlLvVRfNZXI1NYWuGTowU6iNq2fuZNaxJ96oipbVurd7ncAqCBr4ieYUdGK1fbMfxkusHVqRvDlgxnEkEPWZlKANlZMvQGDWgrlbgxgSTFQzxfjS1fLytJ3LEsJGhmdJmuAPvQhRMQxlURfFkQfZ9cMknDpbehyipfNja7AWsR8j5hTTaAgLiRIvGv8JCIVdSlmJUb3E2Qw1IosQMCjoZvEGalh2VWjp9_S7U8eihLCKDW0kaQUsgPrPddEH5gdk3q9UrO8EjohgMkCgktl0zKuAeCUxMD5voENPl7wu9WQR5ppAGbaPZZoxsh4Hh8d_gQkrRM_ZaQ0y2ObsEIOhrAEJFlf&pub=21d85ca573e478f5e659e48885c828920cace3277f6b99df&i2af=jobi2a_smbrez_posting_email&xkcb=SoD5-_M3b9pFxlg0WL0PbzkdCdPP'}>}]

In [382]:

# Label every child string of the current posting and visualize the result.
# NOTE(review): the loop variables below (is_header, child_str, idx,
# feature_dict, navigable_parent) stay defined in the notebook namespace after
# this cell; later cells read names such as idx and child_str, so running cells
# out of order changes what those cells see.
child_tags_list = ha.get_child_tags_list(child_strs_list)
is_header_list = []
for is_header, child_str in zip(ha.get_is_header_list(child_strs_list), child_strs_list):
    # None presumably means no header decision is available for this string
    # (confirm in ha.get_is_header_list); fall back to the is-header model
    if is_header is None:
        probs_list = lru.ISHEADER_PREDICT_PERCENT_FIT(child_str)
        # Position of the largest score: 0 maps to True, 1 maps to False
        idx = probs_list.index(max(probs_list))
        is_header = [True, False][idx]
    is_header_list.append(is_header)
# Combine tags, header flags and raw strings into one feature dict per child
feature_dict_list = hc.get_feature_dict_list(child_tags_list, is_header_list, child_strs_list)
feature_tuple_list = []
for feature_dict in feature_dict_list:
    feature_tuple_list.append(hc.get_feature_tuple(feature_dict, lru.pos_lr_predict_single))
# Predict one label per child string for the whole sequence with the CRF
crf_list = crf.CRF.predict_single(crf.sent2features(feature_tuple_list))
# Accumulate, string by string, the parts-of-speech list kept in the database
db_pos_list = []
for navigable_parent in child_strs_list:
    db_pos_list = cu.append_parts_of_speech_list(navigable_parent, pos_list=db_pos_list)
# pos_list and indices_list are indexed/printed by the inspection cells that follow
pos_list, indices_list = su.visualize_basic_quals_section(crf_list, child_strs_list, db_pos_list=db_pos_list, verbose=True)

['O-TS', 'O-TS', 'O-OL', 'O-CS', 'O-OL', 'O-CS', 'O-CS', 'O-TS', 'H-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-CS', 'O-CS', 'O-CS', 'O-ER', 'O-RQ', 'O-CS', 'O-RQ', 'O-RQ', 'O-RQ', 'O-TS', 'O-TS', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-TS', 'O-RQ', 'O-RQ', 'O-PQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-TS', 'O-RQ', 'O-LN', 'O-JD', 'O-SP', 'H-JD', 'O-JD', 'O-OL']
[17, 18, 20, 21, 22, 25, 26, 27, 28, 30, 31, 33, 34, 35, 37]


[17, 18, 20, 21, 22, 25, 26, 27, 28, 30, 31, 33, 34, 35, 37]



----

In [None]:
# Stop marker: a bare `raise` with no active exception raises RuntimeError.
# NOTE(review): presumably deliberate, so that "Run All" halts before the
# hand-labeling cells below -- confirm.
raise

In [440]:

# Display the context of an individual child string
idx = 38
print(indices_list)
child_str = child_strs_list[idx]
pos_symbol = pos_list[idx]
basic_quals_dict = s.load_object('basic_quals_dict')

# Show the stored hand label on its own line, when this string has one
if child_str in basic_quals_dict:
    print(str(basic_quals_dict[child_str]) + '\n', end='')

# Then the index, the predicted part-of-speech symbol and the string itself
print(f'{idx} {pos_symbol}) {child_str}')

[17, 18, 20, 21, 22, 25, 26, 27, 28, 30, 31, 33, 34, 35, 37]
38 O-LN) <p>All qualified applicants will receive consideration for employment without regard to race, color, religion, sex, sexual orientation, gender identity, national origin, disability or protected veteran status.</p>


In [438]:

# Hand-label this particular child string in the quals dictionary:
# reload the pickled dictionary, store 0 for the current child_str, and
# write the dictionary back before echoing the stored value
basic_quals_dict = s.load_object('basic_quals_dict')
basic_quals_dict[child_str] = 0
s.store_objects(basic_quals_dict=basic_quals_dict)
print(f'"{child_str}" in basic_quals_dict: {basic_quals_dict[child_str]}')

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\basic_quals_dict.pkl
"<li>Experience working with Optum Performance Analytics, medical claims data.</li>" in basic_quals_dict: 0


In [439]:

def do_cypher_tx(tx, navigable_parent, verbose=False):
    """
    Overwrite the classification flags of one NavigableParents node.
    
    The node is matched on its navigable_parent text, passed as a query
    parameter. is_preferred_qualification is set to the string 'True' and the
    thirteen other is_* flags to the string 'False'.
    
    Parameters:
        tx: object whose run(query=..., parameters=...) executes the statement
        navigable_parent: text of the node to update
        verbose: accepted but not used by this function
    
    Returns:
        A list with one dict per returned record, built from record.items().
    """
    query_str = """MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
        SET
            np.is_header = 'False',
            np.is_task_scope = 'False',
            np.is_minimum_qualification = 'False',
            np.is_preferred_qualification = 'True',
            np.is_educational_requirement = 'False',
            np.is_legal_notification = 'False',
            np.is_other = 'False',
            np.is_corporate_scope = 'False',
            np.is_job_title = 'False',
            np.is_office_location = 'False',
            np.is_job_duration = 'False',
            np.is_supplemental_pay = 'False',
            np.is_interview_procedure = 'False',
            np.is_posting_date = 'False'
        """ + cu.return_everything_str + ';'
    params_dict = {'navigable_parent': navigable_parent}
    
    # Turn every returned record into a plain dict
    rows_list = []
    for record in tx.run(query=query_str, parameters=params_dict):
        rows_list.append(dict(record.items()))

    return rows_list
# Apply the flag update defined above to the currently selected child string
# inside a write transaction; navigable_parent is forwarded to do_cypher_tx.
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str)
# Bare last expression: display the rows the update returned
row_objs_list

[{'navigable_parent': '<li>Experience working with Optum Performance Analytics, medical claims data.</li>', 'is_header': 'False', 'is_task_scope': 'False', 'is_minimum_qualification': 'False', 'is_preferred_qualification': 'True', 'is_legal_notification': 'False', 'is_job_title': 'False', 'is_office_location': 'False', 'is_job_duration': 'False', 'is_supplemental_pay': 'False', 'is_educational_requirement': 'False', 'is_interview_procedure': 'False', 'is_corporate_scope': 'False', 'is_posting_date': 'False', 'is_other': 'False'}]

In [210]:

# Remove this particular child string from the quals dictionary
basic_quals_dict = s.load_object('basic_quals_dict')
child_str = child_strs_list[idx]

# Supplying a default makes the removal tolerate a missing key, so re-running
# this cell (or running it for a string that was never labeled) no longer
# raises KeyError
basic_quals_dict.pop(child_str, None)
s.store_objects(basic_quals_dict=basic_quals_dict)
print(f'"{child_str}" in basic_quals_dict: {child_str in basic_quals_dict}')

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\basic_quals_dict.pkl
"a larger business audience" in basic_quals_dict: False
