In [1]:

# Toggle IPython's pretty printing of cell outputs
%pprint

Pretty printing has been turned OFF


In [2]:

import sys

# Put the sibling ../py folder on the module search path so the imports in the cells below can resolve
# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

In [3]:

%%time
# Build the helper objects the rest of the notebook relies on (the recorded run took over a minute and a half)
from pandas import DataFrame

# Storage helper; later cells use it to load and store pickled objects
from storage import Storage
s = Storage()

# Header analysis helper; later cells use it to walk and classify the child strings
from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(verbose=False)

# Read the Neo4j connection settings from the scraping helper's secrets_json
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities()
uri = wsu.secrets_json['neo4j']['connect_url']
user =  wsu.secrets_json['neo4j']['username']
password = wsu.secrets_json['neo4j']['password']

# Cypher helper; driver=None is passed, so presumably it opens its own driver from the credentials (confirm)
from cypher_utils import CypherUtilities
cu = CypherUtilities(uri=uri, user=user, password=password, driver=None, s=s, ha=ha)

from hc_utils import HeaderCategories
hc = HeaderCategories(cu=cu, verbose=False)

from section_utils import SectionUtilities
su = SectionUtilities(s=s, ha=ha, cu=cu, verbose=False)

# Logistic regression helper; both build calls run before any prediction cell below
from lr_utils import LrUtilities
lru = LrUtilities(ha=ha, cu=cu, hc=hc, verbose=False)
lru.build_isheader_logistic_regression_elements()
lru.build_pos_logistic_regression_elements()

# Silence every warning for the rest of the session
# NOTE(review): this also hides deprecation warnings from the libraries used below
import warnings
warnings.filterwarnings('ignore')

CPU times: total: 1min 39s
Wall time: 1min 42s


In [4]:

# List the unverified postings whose fit is 0.0 or 1.0 and that are still open.
# A posting counts as open when is_closed is explicitly false, or when is_closed
# is unset and no application email is recorded for it.
# The open-posting test sits in its own parentheses: Cypher's AND binds tighter
# than OR, so without them the trailing "OR (fn.is_closed = false)" let any
# posting with is_closed = false through, whatever its fit or verification state.
# NOTE: is_verfied (sic) is the property name the database uses, so it is kept as is.
cypher_str = '''
    MATCH (fn:FileNames)
    WHERE
        fn.percent_fit IN [0.0, 1.0] AND
        ((fn.is_verfied IS NULL) OR
        (fn.is_verfied = false)) AND
        (((fn.is_closed IS NULL) AND ((fn.is_opportunity_application_emailed IS NULL) OR
        (fn.is_opportunity_application_emailed = false))) OR (fn.is_closed = false))
    RETURN fn.file_name AS file_name
    ORDER BY fn.percent_fit DESC;'''
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)

# An empty result has no file_name column, so only build the frame when rows came back
files_list = DataFrame(row_objs_list).file_name.tolist() if row_objs_list else []
files_list

[]


----

In [5]:

# Run the shared setup script in this kernel
# NOTE(review): os and get_page_soup are not imported anywhere in this notebook, so presumably this script provides them (confirm)
%run ../load_magic/dataframes.py

# The posting to work on is pinned by hand; the commented line would take it from the query result instead
# file_name = files_list.pop()
file_name = 'Senior_Backend_Engineer_(Data_Science_Software_Engineering_Support)_-_Remote_-_Indeed.com_3e34ac4ae73849ba.html'
print(file_name)
file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)
page_soup = get_page_soup(file_path)
# Take the first div whose id is jobDescriptionText; indexing with [0] fails if the page has none
div_soup = page_soup.find_all(name='div', id='jobDescriptionText')[0]
child_strs_list = ha.get_navigable_children(div_soup, [])
# Both calls go to the Cypher helper; presumably they record the file and its child strings in the database (confirm)
cu.ensure_filename(file_name, verbose=False)
cu.populate_from_child_strings(child_strs_list, file_name, verbose=False)

Senior_Backend_Engineer_(Data_Science_Software_Engineering_Support)_-_Remote_-_Indeed.com_3e34ac4ae73849ba.html


In [6]:

# Get the tag labels for the child strings and show them
child_tags_list = ha.get_child_tags_list(child_strs_list)
print(child_tags_list)

['plaintext', 'p', 'b', 'p', 'p', 'p', 'p', 'b', 'li', 'li', 'li', 'li', 'li', 'li', 'li', 'li', 'b', 'li', 'li', 'li', 'li', 'li', 'li', 'li', 'li', 'li', 'li', 'p', 'p', 'i']


In [7]:

# Collect one header flag per child string
is_header_list = []
known_flags_list = ha.get_is_header_list(child_strs_list)
for is_header, child_str in zip(known_flags_list, child_strs_list):
    
    # The helper gave no answer for this string, so take the more probable class from the model
    if is_header is None:
        probs_list = lru.ISHEADER_PREDICT_PERCENT_FIT(child_str)
        idx = probs_list.index(max(probs_list))
        
        # Position 0 maps to True and position 1 to False
        is_header = (True, False)[idx]
    
    is_header_list.append(is_header)

In [8]:

# Combine the tags, header flags and child strings into feature dictionaries,
# then display the one at position 4 as a spot check
feature_dict_list = hc.get_feature_dict_list(child_tags_list, is_header_list, child_strs_list)
feature_dict_list[4]

{'initial_tag': 'p', 'is_header': False, 'is_task_scope': None, 'is_minimum_qualification': None, 'is_preferred_qualification': None, 'is_legal_notification': None, 'is_job_title': None, 'is_office_location': None, 'is_job_duration': None, 'is_supplemental_pay': None, 'is_educational_requirement': None, 'is_interview_procedure': None, 'is_corporate_scope': None, 'is_posting_date': None, 'is_other': None, 'child_str': '<p>MEDS Perform/ Edge Study Feasibility: This is an intelligence and analytics product giving our clients better decision making capabilities.</p>'}

In [9]:

# Convert every feature dictionary into its tuple form, handing the converter
# the parts-of-speech logistic regression predictor, then spot check position 4
feature_tuple_list = [
    hc.get_feature_tuple(feature_dict, lru.pos_lr_predict_single)
    for feature_dict in feature_dict_list
]
feature_tuple_list[4]

('p', '<p>MEDS Perform/ Edge Study Feasibility: This is an intelligence and analytics product giving our clients better decision making capabilities.</p>', 'O-CS')

In [10]:

# Predict a parts-of-speech symbol for each feature tuple with the CRF model held by the helper
from crf_utils import CrfUtilities
crf = CrfUtilities(ha=ha, hc=hc, cu=cu, verbose=False)
# sent2features turns the tuple list into the model's input; predict_single labels that one sequence
crf_list = crf.CRF.predict_single(crf.sent2features(feature_tuple_list))
crf_list

['O-CS', 'O-TS', 'O-SP', 'O-TS', 'O-CS', 'O-LN', 'H-TS', 'O-SP', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-CS', 'O-TS', 'O-TS', 'H-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-TS', 'O-RQ', 'O-RQ', 'O-PQ', 'O-CS', 'O-TS', 'O-LN']

In [11]:

# Build the list of symbols the Cypher helper reports for each child string
db_pos_list = []
for navigable_parent in child_strs_list:
    # The helper receives the list so far and returns the list to carry into the next iteration
    db_pos_list = cu.append_parts_of_speech_list(navigable_parent, pos_list=db_pos_list)
db_pos_list

['O', 'O', 'H', 'O', 'O', 'O', 'O', 'H', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'H-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-PQ', 'O-PQ', 'O-CS', 'O', 'O']

In [12]:

from matplotlib.colors import to_hex
from IPython.display import HTML, display

# Make an RGB dictionary of all the parts-of-speech symbols
rgba_dict = su.get_pos_color_dictionary()

# Keep the database symbol unless it is missing or a bare 'O'/'H', in which case use the CRF one
pos_list = [
    crf_symbol if db_symbol in (None, 'O', 'H') else db_symbol
    for crf_symbol, db_symbol in zip(crf_list, db_pos_list)
]
print(pos_list)
indices_list = su.get_section(pos_list)
print(indices_list)

# Work out once where the horizontal rules around the returned indices belong
has_section = bool(len(indices_list))
first_idx = min(indices_list) if has_section else None
last_idx = max(indices_list) if has_section else None

# Emit one colored line per child string, with a rule before the first and after the last index
html_parts_list = []
for i, (child_str, pos_symbol) in enumerate(zip(child_strs_list, pos_list)):
    hex_str = to_hex(rgba_dict[pos_symbol], keep_alpha=True)
    if has_section and (i == first_idx):
        html_parts_list.append('<hr />')
    child_str = su.append_pos_symbol(child_str, pos_symbol, use_explanation=True)
    html_parts_list.append(f'{i} {pos_symbol}) <span style="color:{hex_str};">{child_str}</span><br />')
    if has_section and (i == last_idx):
        html_parts_list.append('<hr />')
html_str = ''.join(html_parts_list)
display(HTML(html_str))
print(indices_list)

['O-CS', 'O-TS', 'O-SP', 'O-TS', 'O-CS', 'O-LN', 'H-TS', 'O-SP', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-TS', 'O-CS', 'O-TS', 'O-TS', 'H-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-PQ', 'O-PQ', 'O-CS', 'O-TS', 'O-LN']
[17, 18, 19, 20, 21, 22, 23, 24]


[17, 18, 19, 20, 21, 22, 23, 24]



----

In [13]:

# Deliberate barrier so that Run All halts here: the cells below write to the pickles and
# to the database, and are presumably meant to be run by hand, one at a time.
# Same exception type as the bare raise produced, but with a message that states the intent.
raise RuntimeError('Stopping here on purpose: run the remaining cells manually')

RuntimeError: No active exception to reraise

In [23]:

# Hand-label individual child strings
# Pick the child string to inspect by its position in child_strs_list
idx = 24
child_str = child_strs_list[idx]
basic_quals_dict = s.load_object('basic_quals_dict')
# Show the label already stored for the raw string, if there is one
if(child_str in basic_quals_dict):
    print(basic_quals_dict[child_str])
# From here on child_str holds the cleaned text, which is what later cells interpolate into their Cypher
child_str = cu.clean_text(child_str)
print(child_str)

1
<li>Experience working with large amounts of data (hundreds of GB+) is desirable</li>


In [15]:

# Hand-label this particular child string in the quals dictionary
# NOTE(review): no line in this cell sets child_str; it is whatever an earlier cell left in the kernel
# child_str = 'Spark, Camel, Python, R, Pyspark, Zepplin, Java, Scala'
basic_quals_dict = s.load_object('basic_quals_dict')
# Store 1 for this string; presumably 1 marks it as a basic qualification (confirm)
basic_quals_dict[child_str] = 1
s.store_objects(basic_quals_dict=basic_quals_dict)
print(f'"{child_str}" in basic_quals_dict: {basic_quals_dict[child_str]}')

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\basic_quals_dict.pkl
"<li>Requires a Bachelor s degree and a minimum of 5 years of related experience in software engineering; or an advanced degree with 3 years of experience; or equivalent work experience</li>" in basic_quals_dict: 1


In [16]:

# Hand-set every label property on the NavigableParents node matching child_str, then return them all.
# The values are the strings 'True'/'False', not Cypher booleans.
# NOTE(review): child_str is interpolated inside single quotes, so a single quote in it would break the
# statement; presumably cu.clean_text has removed them beforehand (confirm)
cypher_str = f'''
    MATCH (np:NavigableParents {{navigable_parent: '{child_str}'}})
    SET
        np.is_header = 'False',
        np.is_task_scope = 'False',
        np.is_minimum_qualification = 'True',
        np.is_preferred_qualification = 'False',
        np.is_legal_notification = 'False',
        np.is_job_title = 'False',
        np.is_office_location = 'False',
        np.is_job_duration = 'False',
        np.is_supplemental_pay = 'False',
        np.is_educational_requirement = 'True',
        np.is_interview_procedure = 'False',
        np.is_corporate_scope = 'False',
        np.is_posting_date = 'False',
        np.is_other = 'False'
    RETURN
        np.navigable_parent AS navigable_parent,
        np.is_header AS is_header,
        np.is_task_scope AS is_task_scope,
        np.is_minimum_qualification AS is_minimum_qualification,
        np.is_preferred_qualification AS is_preferred_qualification,
        np.is_legal_notification AS is_legal_notification,
        np.is_job_title AS is_job_title,
        np.is_office_location AS is_office_location,
        np.is_job_duration AS is_job_duration,
        np.is_supplemental_pay AS is_supplemental_pay,
        np.is_educational_requirement AS is_educational_requirement,
        np.is_interview_procedure AS is_interview_procedure,
        np.is_corporate_scope AS is_corporate_scope,
        np.is_posting_date AS is_posting_date,
        np.is_other AS is_other;'''
# print(cypher_str)
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
row_objs_list

[{'navigable_parent': '<li>Requires a Bachelor s degree and a minimum of 5 years of related experience in software engineering; or an advanced degree with 3 years of experience; or equivalent work experience</li>', 'is_header': 'False', 'is_task_scope': 'False', 'is_minimum_qualification': 'True', 'is_preferred_qualification': 'False', 'is_legal_notification': 'False', 'is_job_title': 'False', 'is_office_location': 'False', 'is_job_duration': 'False', 'is_supplemental_pay': 'False', 'is_educational_requirement': 'True', 'is_interview_procedure': 'False', 'is_corporate_scope': 'False', 'is_posting_date': 'False', 'is_other': 'False'}]

In [None]:

# See if this particular child string is in the database
# The statement only reads; it returns every label property of the node matching child_str
cypher_str = f'''
    MATCH (np:NavigableParents {{navigable_parent: '{child_str}'}})
    RETURN
        np.navigable_parent AS navigable_parent,
        np.is_header AS is_header,
        np.is_task_scope AS is_task_scope,
        np.is_minimum_qualification AS is_minimum_qualification,
        np.is_preferred_qualification AS is_preferred_qualification,
        np.is_legal_notification AS is_legal_notification,
        np.is_job_title AS is_job_title,
        np.is_office_location AS is_office_location,
        np.is_job_duration AS is_job_duration,
        np.is_supplemental_pay AS is_supplemental_pay,
        np.is_educational_requirement AS is_educational_requirement,
        np.is_interview_procedure AS is_interview_procedure,
        np.is_corporate_scope AS is_corporate_scope,
        np.is_posting_date AS is_posting_date,
        np.is_other AS is_other;'''
# print(cypher_str)
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
row_objs_list


----

In [None]:

# You've made no changes to the parts-of-speech symbols because it looks good as is
# Flag the FileNames node for this file as verified (is_verfied, sic, is the property name the candidate query reads)
file_name = cu.clean_text(file_name)
cypher_str = f'''
    MATCH (fn:FileNames {{file_name: "{file_name}"}})
    SET fn.is_verfied = true;'''
with cu.driver.session() as session:
    session.write_transaction(cu.do_cypher_tx, cypher_str)

In [None]:

# Mark the file name as needing retraining everywhere
import numpy as np

# Blank the fit score in the pickled data frame. Rows are selected on the file_name
# column; testing the percent_fit column against a file name selected the wrong column.
hunting_df = s.load_object('hunting_df')
mask_series = hunting_df.file_name.isin([file_name])
hunting_df.loc[mask_series, 'percent_fit'] = np.nan
s.store_objects(hunting_df=hunting_df)

# Blank the same score on the matching FileNames node
file_name = cu.clean_text(file_name)
cypher_str = f'''
    MATCH (fn:FileNames {{file_name: "{file_name}"}})
    SET fn.percent_fit = NULL;'''
print(cypher_str)
with cu.driver.session() as session:
    session.write_transaction(cu.do_cypher_tx, cypher_str)

In [None]:

# Remove this particular child string from the quals dictionary
basic_quals_dict = s.load_object('basic_quals_dict')
child_str = child_strs_list[idx]

# The default keeps the cell re-runnable: a second run no longer stops on a KeyError
basic_quals_dict.pop(child_str, None)
# basic_quals_dict[child_str] = 0
s.store_objects(basic_quals_dict=basic_quals_dict)

# Confirm the string is gone (prints False)
print(f'"{child_str}" in basic_quals_dict: {child_str in basic_quals_dict}')

In [None]:

# Find the longest key in the quals dictionary (its length implying it was run together).
# This cell only locates and prints it; np, hunting_df and basic_quals_dict are loaded
# here because the following cell uses them.
import numpy as np

hunting_df = s.load_object('hunting_df')
basic_quals_dict = s.load_object('basic_quals_dict')

# max() returns the first of the longest keys, the same one a stable descending sort puts first
child_str = max(basic_quals_dict, key=len)
print(child_str)

In [None]:

# Mark the files with the largest qualification (implying it was run together) as needing to be retrained
# Relies on child_str, basic_quals_dict, hunting_df and np already being in the kernel
basic_quals_dict.pop(child_str)
s.store_objects(basic_quals_dict=basic_quals_dict)

# Look up the files recorded on the NEXT relationships leaving this child string
child_str = cu.clean_text(child_str)
cypher_str = f'''
    MATCH (np:NavigableParents {{navigable_parent: "{child_str}"}})-[r:NEXT]->(:NavigableParents)
    RETURN r.file_name AS file_name;'''
print(cypher_str)
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)

# Blank the fit score of every affected file in one pass and pickle the frame once,
# instead of re-pickling the whole frame for each row
file_names_list = [row_obj['file_name'] for row_obj in (row_objs_list or [])]
if file_names_list:
    mask_series = hunting_df.file_name.isin(file_names_list)
    hunting_df.loc[mask_series, 'percent_fit'] = np.nan
    s.store_objects(hunting_df=hunting_df)

# Blank the same score on each matching FileNames node
for file_name in file_names_list:
    file_name = cu.clean_text(file_name)
    cypher_str = f'''
        MATCH (fn:FileNames {{file_name: "{file_name}"}})
        SET fn.percent_fit = NULL;'''
    # print(cypher_str)
    with cu.driver.session() as session:
        session.write_transaction(cu.do_cypher_tx, cypher_str)

In [None]:

# Split the current child string into its individual fragments and print each distinct one
# Imported here because no visible cell of the notebook imports re
import re

# Fragments are delimited by periods and semicolons; the wrapping <div> tags are removed first
sentence_regex = re.compile(r'[\.;]')
concatonated_quals_list = sentence_regex.split(child_str.replace('<div>', '').replace('</div>', '').strip())

# Drop blanks and duplicates while keeping first-seen order, so the printout is the same
# on every run (iterating a set of strings gives an arbitrary order)
stripped_quals_list = [q.strip() for q in concatonated_quals_list]
quals_list = list(dict.fromkeys(q for q in stripped_quals_list if q))
quals_set = set(quals_list)
for q in quals_list:
    print(q)