In [1]:

# IPython magic controlling pretty printing of outputs; the recorded output of this
# run reports that pretty printing was turned OFF.
# NOTE(review): this looks like a toggle rather than a setter, so re-running the cell
# would presumably switch pretty printing back ON - confirm against the IPython docs.
%pprint

Pretty printing has been turned OFF


In [2]:

import sys

# Add ../py to the module search path.
# Insert at 1, 0 is the script path (or '' in REPL)
# The membership guard keeps the cell idempotent: re-running it will not stack
# duplicate '../py' entries on sys.path.
if '../py' not in sys.path:
    sys.path.insert(1, '../py')

In [3]:

# Get the Neo4j driver
# Each helper is imported and instantiated in turn; the import/instantiation order is
# kept as-is because later helpers receive the earlier ones as arguments.
from storage import Storage
s = Storage()

from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(verbose=False)

from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(s=s)

# Pull the three connection settings out of the 'neo4j' entry of the secrets JSON
neo4j_secrets = wsu.secrets_json['neo4j']
uri, user, password = (
    neo4j_secrets[key] for key in ('connect_url', 'username', 'password')
)

from cypher_utils import CypherUtilities
cu = CypherUtilities(
    uri=uri,
    user=user,
    password=password,
    driver=None,
    s=s,
    ha=ha,
)

In [4]:

from neo4j.exceptions import ServiceUnavailable

try:
    # Check the database connection first, so that nothing below is built when
    # Neo4j cannot be reached.
    # NOTE(review): depending on the installed neo4j driver version,
    # verify_connectivity() may return None, in which case the banner below
    # prints "None" - confirm against the driver docs.
    version_str = cu.driver.verify_connectivity()
    print(f'======== {version_str} ========')

    from hc_utils import HeaderCategories
    hc = HeaderCategories(cu=cu, verbose=False)

    from section_utils import SectionUtilities
    su = SectionUtilities(s=s, ha=ha, cu=cu, verbose=False)

    from lr_utils import LrUtilities
    lru = LrUtilities(ha=ha, cu=cu, hc=hc, verbose=False)

    from crf_utils import CrfUtilities
    crf = CrfUtilities(ha=ha, hc=hc, cu=cu, verbose=False)

    # Silences every warning for the rest of the session
    import warnings
    warnings.filterwarnings('ignore')
except ServiceUnavailable as e:
    # Chain the original exception so the underlying connection error stays
    # visible in the traceback next to the actionable hint.
    raise ServiceUnavailable('You need to start Neo4j as a console') from e
except Exception as e:
    # hc, su, lru and crf are all used by later cells; swallowing the error here
    # would only resurface as a confusing NameError further down, so report the
    # exception class and then re-raise with the full traceback.
    print(e.__class__)
    raise



In [5]:

# Helpers used by the cells below
import time  # timing the classifier rebuilds
import humanize  # turning elapsed seconds into readable text
from IPython.display import HTML, display  # rendering HTML output in the notebook
import enchant  # dictionary check used when deriving the job title from the file name
from datetime import datetime
# Record when this notebook was last executed
print(f'Last run on {datetime.now()}')

Last run on 2022-12-16 15:52:12.601844



---
# Training

In [6]:

# Rebuild the is-header logistic regression elements and report how long it took.
# perf_counter() is a monotonic clock, so the measured duration cannot be skewed
# (or go negative) if the system clock is adjusted while the build is running.
t0 = time.perf_counter()
lru.build_isheader_logistic_regression_elements(verbose=False)
duration_str = humanize.precisedelta(time.perf_counter() - t0, minimum_unit='seconds', format='%0.0f')
print(f'Is-header classifier rebuilt in {duration_str}')

Is-header classifier rebuilt in 0 seconds


In [7]:

# Rebuild the parts-of-speech logistic regression elements and report how long it took.
# perf_counter() is a monotonic clock, so the measured duration cannot be skewed
# (or go negative) if the system clock is adjusted during this long-running build.
t0 = time.perf_counter()
lru.build_pos_logistic_regression_elements(verbose=False)
duration_str = humanize.precisedelta(time.perf_counter() - t0, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech classifier rebuilt in {duration_str}')

Parts-of-speech classifier rebuilt in 1 minute and 30 seconds


In [8]:

# Rebuild the classifier from the quals dictionary and report how long it took.
# perf_counter() is a monotonic clock, so the measured duration cannot be skewed
# (or go negative) if the system clock is adjusted while the build is running.
t0 = time.perf_counter()
lru.build_isqualified_logistic_regression_elements(verbose=False)
duration_str = humanize.precisedelta(time.perf_counter() - t0, minimum_unit='seconds', format='%0.0f')
# The message text is left exactly as it was so it still matches the recorded output
print(f'Is-qualified classifer built in {duration_str}')

Is-qualified classifer built in 0 seconds



----
# Prepare cover sheet

In [14]:

# Show what qualifications you have for this posting
file_name = 'eca2876d9f354127_Security_Software_Engineer_L5_Detection_Engineering_Remote_Indeed_com.html'
child_strs_list = ha.get_child_strs_from_file(file_name=file_name)

# Decide header / non-header for every child string. Where ha gives no answer (None),
# fall back to the is-header classifier and take the more probable class; position 0
# of its result maps to True and position 1 to False.
is_header_list = []
for is_header, child_str in zip(ha.get_is_header_list(child_strs_list), child_strs_list):
    if is_header is None:
        probs_list = lru.ISHEADER_PREDICT_PERCENT_FIT(child_str)
        idx = probs_list.index(max(probs_list))
        is_header = [True, False][idx]
    is_header_list.append(is_header)

# Build one feature tuple per child string and label the sequence with the CRF
feature_tuple_list = [
    hc.get_feature_tuple(feature_dict, lru.pos_lr_predict_single)
    for feature_dict in hc.get_feature_dict_list(ha.get_child_tags_list(child_strs_list), is_header_list, child_strs_list)
]
crf_list = crf.CRF.predict_single(crf.sent2features(feature_tuple_list))

# Keep only the child strings whose positions fall in the basic-qualifications section.
# A set makes each membership test constant-time instead of a scan of indices_list.
indices_list = su.find_basic_quals_section_indexes(child_strs_list=child_strs_list, crf_list=crf_list, file_name=file_name)
indices_set = set(indices_list)
quals_list = [child_str for i, child_str in enumerate(child_strs_list) if i in indices_set]

# Score the qualifications (statement order is kept exactly as it was)
prediction_list = list(lru.predict_job_hunt_percent_fit(quals_list))
basic_quals_dict = s.load_object('basic_quals_dict')
lru.basic_quals_dict = basic_quals_dict
_, qual_count = lru.get_quals_str(prediction_list, quals_list)

# With no qualifications found there is nothing to divide by; report 0% rather than
# failing with ZeroDivisionError.
job_fitness = qual_count / len(prediction_list) if prediction_list else 0.0

# Derive a readable job title from the file name by keeping only the tokens the
# English dictionary accepts. Empty tokens (e.g. from a double underscore) are skipped
# before the dictionary lookup, which rejects empty strings.
d = enchant.Dict('en_US')
job_title = ' '.join([w for w in file_name.replace('.html', '').replace('_Indeed_com', '').split('_') if w and d.check(w)])
display(HTML(f'<p>I only meet {job_fitness:.1%} of the minimum requirements for the {job_title} position, but I can explain:</p>'))

# List the qualifications marked truthy in basic_quals_dict, numbered by their
# position in quals_list
for i, qual_str in enumerate(quals_list):
    if qual_str in basic_quals_dict and basic_quals_dict[qual_str]:
        # Put the number right after the first '>' so it renders inside the leading
        # markup of the string; with no '>' present, simply prefix the number.
        idx = qual_str.find('>')
        if idx == -1:
            display(HTML(f'{i+1}) {qual_str}'))
        else:
            display(HTML(f'{qual_str[:idx+1]}{i+1}) {qual_str[idx+1:]}'))

In [15]:

# List every minimum requirement that is either missing from basic_quals_dict or
# mapped to a falsy value there, numbered by its position in quals_list
display(HTML("<p>The minimum requirements that I don't meet are:</p>"))
for number, qual_str in enumerate(quals_list, start=1):
    if qual_str in basic_quals_dict and basic_quals_dict[qual_str]:
        continue
    # Splice the number in right after the first '>' when there is one;
    # otherwise just prefix the string with it
    head, sep, tail = qual_str.partition('>')
    if sep:
        numbered_html = f'{head}{sep}{number}) {tail}'
    else:
        numbered_html = f'{number}) {qual_str}'
    display(HTML(numbered_html))

In [11]:

# This doesn't work unless you score all the O-PQs
# Accumulate the database's parts-of-speech symbols, one call per child string
db_pos_list = []
for navigable_parent in child_strs_list:
    db_pos_list = cu.append_parts_of_speech_list(navigable_parent, pos_list=db_pos_list)

# Prefer the database symbol; fall back to the CRF symbol when the database
# one is None, 'O' or 'H'
fallback_symbols = (None, 'O', 'H')
pos_list = [
    crf_symbol if db_symbol in fallback_symbols else db_symbol
    for crf_symbol, db_symbol in zip(crf_list, db_pos_list)
]

display(HTML("<p>The preferred requirements that I meet are:</p>"))
pqs_list = [child_str for pos_str, child_str in zip(pos_list, child_strs_list) if pos_str == 'O-PQ']

# Show only the entries marked truthy in basic_quals_dict, numbered by their
# position in pqs_list
for number, qual_str in enumerate(pqs_list, start=1):
    if not (qual_str in basic_quals_dict and basic_quals_dict[qual_str]):
        continue
    # Splice the number in right after the first '>' when there is one;
    # otherwise just prefix the string with it
    head, sep, tail = qual_str.partition('>')
    if sep:
        display(HTML(f'{head}{sep}{number}) {tail}'))
    else:
        display(HTML(f'{number}) {qual_str}'))