In [1]:

# Toggle IPython's pretty printing of cell results; the recorded output of this run
# shows the toggle left it turned OFF.
%pprint

Pretty printing has been turned OFF


In [2]:

import sys

# Put ../py on the module search path before the project-local imports in the next
# cell (presumably that folder holds storage, ha_utils, cypher_utils, etc. -- confirm).
# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

In [3]:

%%time
# Build every helper object the rest of the notebook uses. The objects are passed into
# one another (s and ha go into cu; cu goes into hc and su; ha, cu and hc go into lru
# and crf), so the statements have to stay in this order. The recorded run of this
# cell took roughly three and a half minutes of wall time.
# NOTE(review): DataFrame is not referenced in the cells visible here -- confirm it is still needed.
from pandas import DataFrame

# Object store; later cells call s.load_object(...) and s.store_objects(...) on it
from storage import Storage
s = Storage()

# Header/HTML helper; later cells use it to split a posting into child strings and tags
from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(verbose=False)

# Read the Neo4j connection settings from the secrets JSON held by the scraping helper,
# so no credentials are written into the notebook itself
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities()
uri = wsu.secrets_json['neo4j']['connect_url']
user =  wsu.secrets_json['neo4j']['username']
password = wsu.secrets_json['neo4j']['password']

# Database helper; later cells open sessions through cu.driver and run queries with
# cu.do_cypher_tx (driver=None presumably makes it create its own driver -- confirm)
from cypher_utils import CypherUtilities
cu = CypherUtilities(uri=uri, user=user, password=password, driver=None, s=s, ha=ha)

# Feature-building helper used by the CRF labelling cell
from hc_utils import HeaderCategories
hc = HeaderCategories(cu=cu, verbose=False)

# Section helper used to locate and visualize the basic-qualifications section
from section_utils import SectionUtilities
su = SectionUtilities(s=s, ha=ha, cu=cu, verbose=False)

# Logistic-regression helper; the two build_* calls presumably fit or load the
# is-header and part-of-speech models used later -- confirm against lr_utils
from lr_utils import LrUtilities
lru = LrUtilities(ha=ha, cu=cu, hc=hc, verbose=False)
lru.build_isheader_logistic_regression_elements()
lru.build_pos_logistic_regression_elements()

# CRF helper; later cells call crf.sent2features and crf.CRF.predict_single
from crf_utils import CrfUtilities
crf = CrfUtilities(ha=ha, hc=hc, cu=cu, verbose=False)

# Silence all Python warnings from here on. Because this runs last, warnings raised
# while the objects above were being built are still shown.
import warnings
warnings.filterwarnings('ignore')

CPU times: total: 3min 20s
Wall time: 3min 26s


In [4]:

# datetime is used later to timestamp duplicate file names.
# NOTE(review): urlparse, parse_qs and clear_output are not referenced in the cells
# visible here -- confirm they are still needed.
from datetime import datetime
from urllib.parse import urlparse, parse_qs
from IPython.display import clear_output
# Execute the project script inside this notebook's namespace.
# NOTE(review): later cells use os, re and get_page_soup without importing them in
# this notebook, so those names presumably come from this script -- confirm.
%run ../load_magic/dataframes.py

In [6]:

# Cypher template that looks up one NavigableParents node by its navigable_parent
# text. The doubled braces survive str.format() as literal braces and the single '{}'
# is the slot a later cell fills with the child string.
# NOTE(review): the value is placed between single quotes without escaping here, so a
# child string containing a single quote would break the query unless the caller
# escapes it first -- confirm.
navigable_parent_cypher_str = ''.join([
    '''
    MATCH (np:NavigableParents {{navigable_parent: '{}'}})
    ''',
    cu.return_everything_str,
    ';',
])


----

In [7]:

# Register a manually saved job posting: derive a file name from the posting title,
# parse the job description out of the saved HTML, copy the HTML into the saves
# folder and load its child strings into the database.
#
# Defines file_name and child_strs_list, which later cells rely on.
import os
import re
import shutil

# NOTE(review): absolute, machine-specific path -- consider moving it to a config cell.
file_path = r'C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\data\html\other_email.html'

# Collapse every run of non-alphanumeric characters to a single underscore
file_name = re.sub(r'[^A-Za-z0-9]+', ' ',
                   '''Data Scientist at Array Remote''').strip().replace(' ', '_') + '.html'
new_file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)

# If a posting with this name was already saved, prefix a timestamp to keep both
if os.path.isfile(new_file_path):
    file_name = datetime.now().strftime('%Y%m%d%H%M%S%f') + f'_{file_name}'
    new_file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)

if not os.path.isfile(new_file_path):
    print(file_name)

    # Parse the source HTML before copying anything, so a posting that cannot be
    # parsed does not leave an orphan copy behind in the saves folder
    page_soup = get_page_soup(file_path)
    div_soups_list = page_soup.find_all(name='div', id='jobDescriptionText')
    if not div_soups_list:
        raise ValueError(f'No <div id="jobDescriptionText"> found in {file_path}')
    div_soup = div_soups_list[0]
    child_strs_list = ha.get_navigable_children(div_soup, [])

    # Copy to the destination computed above, then record the posting in the database
    shutil.copy(file_path, new_file_path)
    cu.ensure_filename(file_name, verbose=False)
    cu.populate_from_child_strings(child_strs_list, file_name, verbose=False)

Data_Scientist_at_Array_Remote.html


In [8]:

# Clear the posting URL on this file's FileNames node: the query sets
# fn.posting_url to NULL (the recorded result shows the node with only its file_name
# property). The real URL is written by a separate "Add URL" cell further down.
# NOTE(review): file_name is interpolated straight into the query text between double
# quotes; it is built from [A-Za-z0-9_] characters in the cell above, so that looks
# safe here -- confirm if file_name ever comes from elsewhere.
# NOTE(review): session.write_transaction is deprecated in newer Neo4j Python drivers
# in favour of execute_write -- confirm the driver version in use.
cypher_str = f'''
    MATCH (fn:FileNames {{file_name: "{file_name}"}})
    SET fn.posting_url = NULL
    RETURN fn;'''
print(cypher_str)
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
row_objs_list


    MATCH (fn:FileNames {file_name: "Data_Scientist_at_Array_Remote.html"})
    SET fn.posting_url = NULL
    RETURN fn;


[{'fn': <Node id=800898 labels=frozenset({'FileNames'}) properties={'file_name': 'Data_Scientist_at_Array_Remote.html'}>}]

In [9]:

# Label every child string of the posting with the CRF and show the
# basic-qualifications section that falls out of those labels.
child_tags_list = ha.get_child_tags_list(child_strs_list)

# Header flags: keep the flag that ha already knows; where it returns None, fall back
# to the logistic regression and take whichever of (True, False) scores higher
known_flags_list = ha.get_is_header_list(child_strs_list)
is_header_list = []
for is_header, child_str in zip(known_flags_list, child_strs_list):
    if is_header is None:
        probs_list = lru.ISHEADER_PREDICT_PERCENT_FIT(child_str)
        idx = probs_list.index(max(probs_list))
        is_header = (True, False)[idx]
    is_header_list.append(is_header)

# One feature tuple per child string, then a single CRF prediction over the sequence
feature_dict_list = hc.get_feature_dict_list(child_tags_list, is_header_list, child_strs_list)
feature_tuple_list = [
    hc.get_feature_tuple(feature_dict, lru.pos_lr_predict_single)
    for feature_dict in feature_dict_list
]
crf_features = crf.sent2features(feature_tuple_list)
crf_list = crf.CRF.predict_single(crf_features)

# Thread the accumulating list through the database helper, one child string at a time
db_pos_list = []
for navigable_parent in child_strs_list:
    db_pos_list = cu.append_parts_of_speech_list(navigable_parent, pos_list=db_pos_list)

pos_list, indices_list = su.visualize_basic_quals_section(
    crf_list,
    child_strs_list,
    db_pos_list=db_pos_list,
    verbose=True,
)

['H-O', 'O-SP', 'O-OL', 'O-CS', 'O-LN', 'O-TS', 'H-TS', 'O-CS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-RQ', 'O-TS', 'O-RQ', 'O-ER', 'O-RQ', 'O-TS', 'O-RQ', 'O-RQ', 'O-TS', 'O-RQ', 'O-RQ', 'O-CS', 'O-CS', 'O-LN', 'O-LN', 'O-TS', 'O-TS', 'O-SP', 'O-TS', 'O-OL', 'O-TS', 'O-CS', 'O-LN']
[12, 14, 15, 16, 18, 19, 21, 22]


[12, 14, 15, 16, 18, 19, 21, 22]



----

In [10]:

# NOTE(review): a bare `raise` with no active exception fails with
# "RuntimeError: No active exception to reraise" (see the recorded output). This looks
# like a deliberate stop so that "Run All" does not continue into the hand-labeling
# cells that follow -- confirm.
raise

RuntimeError: No active exception to reraise

In [None]:

# Hand-label individual child strings: pick one by position and show what is already
# known about it.
idx = 12  # position in child_strs_list of the string to inspect; edit by hand
child_str = child_strs_list[idx]
pos_symbol = pos_list[idx]
basic_quals_dict = s.load_object('basic_quals_dict')

# Print the stored label, if any (looked up with the raw, uncleaned string)
if child_str in basic_quals_dict:
    print(basic_quals_dict[child_str])

# From here on child_str holds the cleaned text; the cells below reuse it
child_str = cu.clean_text(child_str)
print(f'{idx} {pos_symbol}) {child_str}')

In [None]:

# Hand-label this particular child string in the quals dictionary
# Uses whatever child_str currently holds; uncomment the next line to label a
# different string instead.
# child_str = 'Spark, Camel, Python, R, Pyspark, Zepplin, Java, Scala'
# Reload the stored dictionary, mark this string with 1 and persist it again
# (a truthy value is what the qualification report cells test for; presumably 1 means
# "I meet this qualification" -- confirm)
basic_quals_dict = s.load_object('basic_quals_dict')
basic_quals_dict[child_str] = 1
s.store_objects(basic_quals_dict=basic_quals_dict)
print(f'"{child_str}" in basic_quals_dict: {basic_quals_dict[child_str]}')

In [None]:

# Label the NavigableParents node for child_str as a minimum qualification: every
# category flag is written as the string 'False' except is_minimum_qualification.
# NOTE(review): child_str is placed between single quotes without escaping in this
# cell; unless it was already escaped upstream, a single quote in the text would break
# the query -- confirm.
flag_names = (
    'is_header',
    'is_task_scope',
    'is_minimum_qualification',
    'is_preferred_qualification',
    'is_educational_requirement',
    'is_legal_notification',
    'is_other',
    'is_corporate_scope',
    'is_job_title',
    'is_office_location',
    'is_job_duration',
    'is_supplemental_pay',
    'is_interview_procedure',
    'is_posting_date',
)
true_flags = {'is_minimum_qualification'}

# One "np.<flag> = '<True|False>'" assignment per flag, laid out one per line
set_clause = ',\n        '.join(
    f"np.{name} = '{name in true_flags}'" for name in flag_names
)
cypher_str = f'''
    MATCH (np:NavigableParents {{navigable_parent: '{child_str}'}})
    SET
        {set_clause}
    {cu.return_everything_str};'''
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
row_objs_list

In [None]:

# Look up the node stored for the current child string and display what the database
# holds for it. The query comes from the navigable_parent_cypher_str template.
lookup_cypher_str = navigable_parent_cypher_str.format(child_str)
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(cu.do_cypher_tx, lookup_cypher_str)
row_objs_list


----

In [None]:

# Show what qualifications you have for this posting: find the basic-qualifications
# lines, score them, and list the ones already hand-labeled as met.
from IPython.display import HTML, display

# file_name = 'Senior_Data_Analyst_Scientist-_Rider_Insights_-_Remote_-_Indeed.com_8e2e678241df1d58.html'
lru.build_isqualified_logistic_regression_elements(verbose=False)
lru.retrain_from_dictionary(verbose=False)
# child_strs_list = ha.get_child_strs_from_file(file_name=file_name)
indices_list = su.find_basic_quals_section_indexes(child_strs_list=child_strs_list, crf_list=crf_list, file_name=file_name)
quals_list = [child_str for i, child_str in enumerate(child_strs_list) if i in indices_list]

# With no qualification lines there is nothing to score; skip the model call
prediction_list = list(lru.predict_job_hunt_percent_fit(quals_list)) if quals_list else []
basic_quals_dict = s.load_object('basic_quals_dict')
lru.basic_quals_dict = basic_quals_dict
quals_str, qual_count = lru.get_quals_str(prediction_list, quals_list)

# Guard the ratio: an empty section used to raise ZeroDivisionError here
job_fitness = qual_count / len(prediction_list) if prediction_list else 0.0
display(HTML(f'I only meet {job_fitness:.1%} of the minimum requirements for {file_name}:'))

# List only the qualifications that carry a truthy hand label
for i, qual_str in enumerate(quals_list):
    if qual_str in basic_quals_dict and basic_quals_dict[qual_str]:
        # Put the item number after the opening tag so the HTML stays well formed
        idx = qual_str.find('>')
        if idx == -1:
            display(HTML(f'{i+1}) {qual_str}'))
        else:
            display(HTML(f'{qual_str[:idx+1]}{i+1}) {qual_str[idx+1:]}'))

In [None]:

# Add URL: store the posting's web address on this file's FileNames node.
posting_url = 'https://boards.greenhouse.io/array/jobs/4098049004?gh_jid=4098049004'
# NOTE(review): the cleaned URL is interpolated between double quotes in the query
# text; a value containing a double quote would break the query unless clean_text
# escapes it -- confirm.
posting_url = cu.clean_text(posting_url)
cypher_str = f'''
    MATCH (fn:FileNames {{file_name: "{file_name}"}})
    SET fn.posting_url = "{posting_url}"
    RETURN fn;'''
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
    print(row_objs_list)

In [None]:

# Show what qualifications you have for this posting, using the row(s) of hunting_df
# whose file_name matches the current file.
# NOTE(review): hunting_df and print_fit_job are not defined in the cells visible
# here -- confirm they are created before this cell runs.
from IPython.display import HTML, display

# file_name = 'Senior_Backend_Engineer_(Data_Science_Software_Engineering_Support)_-_Remote_-_Indeed.com_3e34ac4ae73849ba.html'
mask_series = (hunting_df.file_name != file_name)
match_df = hunting_df[~mask_series]

if match_df.empty:
    # Without a matching row the loop below would never assign quals_list or
    # job_fitness, and the report would show stale values from an earlier cell
    display(HTML(f'No rows in hunting_df match {file_name}.'))
else:
    for row_index, row_series in match_df.iterrows():
        quals_list, job_fitness = print_fit_job(row_index, row_series, basic_quals_dict)
    display(HTML(f'I only meet {job_fitness:.1%} of the minimum requirements:'))

    # List only the qualifications that carry a truthy hand label
    for i, qual_str in enumerate(quals_list):
        if qual_str in basic_quals_dict and basic_quals_dict[qual_str]:
            # Put the item number after the opening tag so the HTML stays well formed
            idx = qual_str.find('>')
            if idx == -1:
                display(HTML(f'{i+1}) {qual_str}'))
            else:
                display(HTML(f'{qual_str[:idx+1]}{i+1}) {qual_str[idx+1:]}'))