In [1]:

%pprint

Pretty printing has been turned OFF



---
# Load needed libraries and functions

In [2]:

from datetime import datetime
import humanize
import os
import sys
import time
import warnings
import winsound

warnings.filterwarnings('ignore')
duration = 1000  # milliseconds
freq = 880  # Hz

# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

In [3]:

t0 = time.time()

# Get the Neo4j driver
from storage import Storage
s = Storage(
    data_folder_path=os.path.abspath('../data'),
    saves_folder_path=os.path.abspath('../saves')
)

from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(
    s=s,
    secrets_json_path=os.path.abspath('../data/secrets/jh_secrets.json')
)
uri = wsu.secrets_json['neo4j']['connect_url']
user =  wsu.secrets_json['neo4j']['username']
password = wsu.secrets_json['neo4j']['password']

# Get the neo4j object
from cypher_utils import CypherUtilities
cu = CypherUtilities(
    uri=uri, user=user, password=password, driver=None, s=s, ha=ha
)

duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
print(f'Utility libraries created in {duration_str}')
print(f'Last run on {datetime.now()}')

Utility libraries created in 1 second
Last run on 2023-11-30 14:16:47.839746


In [4]:

from hc_utils import HeaderCategories

hc = HeaderCategories(cu=cu, verbose=False)

In [5]:

# Keep the total creation time to less than one hour by adjusting the sampling strategy limit
from lr_utils import LrUtilities

lru = LrUtilities(ha=ha, cu=cu, hc=hc, verbose=False)

In [6]:

from section_classifier_utils import SectionLRClassifierUtilities, SectionSGDClassifierUtilities, SectionCRFClassifierUtilities

scrfcu = SectionCRFClassifierUtilities(cu=cu, ha=ha, verbose=False)
slrcu = SectionLRClassifierUtilities(ha=ha, cu=cu, verbose=False)
ssgdcu = SectionSGDClassifierUtilities(ha=ha, cu=cu, verbose=False)

In [7]:

from crf_utils import CrfUtilities

crf = CrfUtilities(ha=ha, hc=hc, cu=cu, lru=lru, slrcu=slrcu, scrfcu=scrfcu, ssgdcu=ssgdcu, verbose=True)

In [8]:

from section_utils import SectionUtilities

su = SectionUtilities(wsu=wsu, ihu=None, hc=hc, crf=crf, slrcu=slrcu, verbose=False)

In [9]:

# Parts-of-speech CRF elements built in 28 minutes and 51 seconds
t1 = time.time()
if not (hasattr(scrfcu, 'pos_symbol_crf') or crf.is_flask_running()):
    scrfcu.build_pos_conditional_random_field_elements(verbose=True)
if hasattr(scrfcu, 'pos_symbol_crf'):
    print('predict_single is now available')
else:
    print('predict_single is not available')
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech conditional random field elements built in {duration_str}')

predict_single is now available
Parts-of-speech conditional random field elements built in 5 seconds


In [10]:

# Check if the crf has built its parts-of-speech classifier
# POS classifier trained in 12 hours, 15 minutes and 36 seconds
t1 = time.time()
if not hasattr(crf, 'CRF'):
    if s.pickle_exists('crf_CRF'):
        crf.CRF = s.load_object('crf_CRF')
    else:
        crf.retrain_pos_classifier(header_pattern_dict=s.load_object('HEADER_PATTERN_DICT'), verbose=True)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'POS classifier trained in {duration_str}')

POS classifier trained in 0 seconds


In [11]:

# Parts-of-speech stochastic gradient decent elements built in 17 seconds
t1 = time.time()
if not (hasattr(ssgdcu, 'pos_predict_percent_fit_dict') or crf.is_flask_running()):
    ssgdcu.build_pos_stochastic_gradient_descent_elements(sampling_strategy_limit=None, verbose=True)
if hasattr(ssgdcu, 'pos_predict_percent_fit_dict'):
    print('predict_single is now available')
else:
    print('predict_single is not available')
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech stochastic gradient decent elements built in {duration_str}')

I have 49,102 labeled parts of speech in here
predict_single is now available
Parts-of-speech stochastic gradient decent elements built in 13 seconds


In [12]:

from is_header_sgd_classifier import IsHeaderSgdClassifier

ihu = IsHeaderSgdClassifier(ha=ha, cu=cu, verbose=False)


----

In [15]:

# Run this if you haven't already created the file, but need to edit other_email.html first
import re
import shutil

file_name = ''
if file_name: file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)
else:
    file_path = os.path.abspath('../data/html/other_email.html')
    command_str = fr'"C:\Program Files\Notepad++\notepad++.exe" {file_path}'
    print(command_str)
    !{command_str}
    file_name = re.sub(r'[^A-Za-z0-9]+', ' ', '''
        100% Remote Opportunity for Data Scientist Sr.
        Satyam Kumar Pandey
        ''').strip().replace(' ', '_') + '.html'
    new_file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)
    if os.path.isfile(new_file_path):
        file_name = datetime.now().strftime('%Y%m%d%H%M%S%f') + f'_{file_name}'
        new_file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)
    if not os.path.isfile(new_file_path):
        shutil.copy(file_path, os.path.join(cu.SAVES_HTML_FOLDER, file_name))
        print(file_name)
page_soup = wsu.get_page_soup(file_path)
div_soup = page_soup.find_all(name='div', id='jobDescriptionText')[0]
child_strs_list = ha.get_navigable_children(div_soup, [])
cu.ensure_filename(file_name, verbose=False)
cu.populate_from_child_strings(child_strs_list, file_name, verbose=False)

In [16]:

# Add the posting URL to the file name only if you have one
posting_url = ''
if posting_url:
    cypher_str = f'''
        MATCH (fn:FileNames {{file_name: "{file_name}"}})
        SET fn.posting_url = "{posting_url}"
        RETURN fn;'''
    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
    display(row_objs_list)


----

In [40]:

child_tags_list = ha.get_child_tags_list(child_strs_list)

feature_dict_list = cu.get_feature_dict_list(child_tags_list, child_strs_list)
feature_tuple_list = []
for feature_dict in feature_dict_list:
    feature_tuple_list.append(hc.get_feature_tuple(
        feature_dict, pos_lr_predict_single=slrcu.predict_single, pos_crf_predict_single=scrfcu.predict_single
    ))
crf_list = crf.CRF.predict_single(crf.sent2features(feature_tuple_list))

In [41]:

db_pos_list = []
for navigable_parent in child_strs_list:
    db_pos_list = cu.append_parts_of_speech_list(navigable_parent, pos_list=db_pos_list)
pos_list, indices_list = su.visualize_basic_quals_section(crf_list, child_strs_list, db_pos_list=db_pos_list, verbose=True)
winsound.Beep(freq, duration)

['H-JT', 'O-JT', 'H-OL', 'O-OL', 'O-SP', 'O-O', 'O-RQ', 'H-JD', 'O-JD', 'O-O', 'O-RQ', 'O-O', 'O-RQ', 'O-O', 'O-IP', 'H-RQ', 'O-RQ', 'O-TS', 'O-O', 'O-TS', 'H-TS', 'O-TS', 'O-TS', 'H-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'O-RQ', 'H-PQ', 'O-PQ', 'O-PQ', 'O-O', 'O-RQ']
[6, 10, 12, 16, 24, 25, 26, 27]


[6, 10, 12, 16, 24, 25, 26, 27]



----

In [None]:
raise

In [87]:

# Display the context of an individual child string
idx = 33
print(indices_list); child_str = child_strs_list[idx]; pos_symbol = pos_list[idx]; basic_quals_dict = s.load_object('basic_quals_dict')
print(str(basic_quals_dict[child_str]) + '\n' if(child_str in basic_quals_dict) else '', end='')
print(f'{idx} {pos_symbol}) {child_str}')

[6, 10, 12, 16, 24, 25, 26, 27]


IndexError: list index out of range

In [85]:

# Hand-label this particular child string in the quals dictionary
basic_quals_dict = s.load_object('basic_quals_dict')
basic_quals_dict[child_str] = 1
s.store_objects(basic_quals_dict=basic_quals_dict); print(f'"{child_str}" in basic_quals_dict: {basic_quals_dict[child_str]}')

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\basic_quals_dict.pkl
"<span style="font-family:Century Gothic,sans-serif">Bachelor's degree, an additional 4 years of relevant work experience is required</span>" in basic_quals_dict: 1


In [86]:

def do_cypher_tx(tx, navigable_parent, verbose=False):
    cypher_str = """MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
        SET
            np.is_header = false,
            np.is_task_scope = false,
            np.is_minimum_qualification = false,
            np.is_preferred_qualification = false,
            np.is_educational_requirement = true,
            np.is_legal_notification = false,
            np.is_other = false,
            np.is_corporate_scope = false,
            np.is_job_title = false,
            np.is_office_location = false,
            np.is_job_duration = false,
            np.is_supplemental_pay = false,
            np.is_interview_procedure = false,
            np.is_posting_date = false
        """ + cu.return_everything_str + ';'
    results_list = tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})

    return [dict(record.items()) for record in results_list]
with cu.driver.session() as session: row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str)
# ihu.retrain_classifier(row_objs_list[0]['navigable_parent'], row_objs_list[0]['is_header'], verbose=True); row_objs_list
row_objs_list

[{'navigable_parent': '<span style="font-family:Century Gothic,sans-serif">Bachelor\'s degree, an additional 4 years of relevant work experience is required</span>', 'is_header': False, 'is_task_scope': False, 'is_minimum_qualification': False, 'is_preferred_qualification': False, 'is_legal_notification': False, 'is_job_title': False, 'is_office_location': False, 'is_job_duration': False, 'is_supplemental_pay': False, 'is_educational_requirement': True, 'is_interview_procedure': False, 'is_corporate_scope': False, 'is_posting_date': False, 'is_other': False}]

In [99]:

# Show what's in the database already for this html string
def do_cypher_tx(tx, navigable_parent, verbose=False):
    cypher_str = '''
        MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
        ''' + cu.return_everything_str + ';'
    results_list = tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})

    return [dict(record.items()) for record in results_list]
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)
row_objs_list

[{'navigable_parent': '<p>Benefits:</p>', 'is_header': true, 'is_task_scope': false, 'is_qualification': false, 'is_minimum_qualification': false, 'is_preferred_qualification': false, 'is_legal_notification': false, 'is_job_title': false, 'is_office_location': false, 'is_job_duration': false, 'is_supplemental_pay': true, 'is_educational_requirement': false, 'is_interview_procedure': false, 'is_corporate_scope': false, 'is_posting_date': false, 'is_other': false}]

In [None]:

# Remove this particular child string from the quals dictionary
basic_quals_dict = s.load_object('basic_quals_dict')
child_str = child_strs_list[idx]
basic_quals_dict.pop(child_str)
# basic_quals_dict[child_str] = 0
s.store_objects(basic_quals_dict=basic_quals_dict)
print(f'"{child_str}" in basic_quals_dict: {child_str in basic_quals_dict}')


----