In [1]:

# IPython line magic (not Python): toggles pretty printing of cell results.
%pprint

Pretty printing has been turned OFF


In [2]:

import sys
# Put ../py on the module search path before the imports below run
# (presumably where the project modules live — TODO confirm).
sys.path.insert(1, '../py')

from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(verbose=True)

# Read the Neo4j connection settings from the 'neo4j' entry of the secrets mapping.
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities()
uri = wsu.secrets_json['neo4j']['connect_url']
user =  wsu.secrets_json['neo4j']['username']
password = wsu.secrets_json['neo4j']['password']

from storage import Storage
s = Storage()

# Each helper below receives the instances created above it, so the order matters.
from cypher_utils import CypherUtilities
cu = CypherUtilities(uri=uri, user=user, password=password, driver=None, s=s, ha=ha)

from hc_utils import HeaderCategories
hc = HeaderCategories(cu=cu, verbose=False)

from html_analysis import ElementAnalysis
ea = ElementAnalysis(ha=ha, hc=hc, verbose=False)

# Suppress every warning from this point on.
import warnings
warnings.filterwarnings('ignore')

In [3]:

import os
from IPython.display import HTML, display

# Names of the .html files in the saved-pages folder, in sorted order.
files_list = sorted(filter(lambda fn: fn.endswith('.html'), os.listdir(ha.SAVES_HTML_FOLDER)))

In [14]:

%%time
file_name = files_list.pop()
display(HTML(f'<h1>{file_name}</h1>'))
file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)
child_strs_list = ha.get_child_strs_from_file(file_name)
ea.display_basic_requirements(child_strs_list)

CPU times: total: 1min 26s
Wall time: 1min 25s



# See if all the email links have Job Description Text

In [3]:

# Path to the saved Indeed email HTML (going by its file name).
file_path = '../data/html/indeed_email.html'
# Run the helper script in this namespace. get_page_soup (used below) is not defined
# in the visible cells, so it presumably comes from this script — TODO confirm.
%run ../load_magic/dataframes.py
page_soup = get_page_soup(file_path)
# Select the anchors nested inside the page's table layout.
css_selector = 'body > table > tbody > tr > td > a > table > tbody > tr > td > a'
link_soups_list = page_soup.select(css_selector)

In [4]:

from html_analysis import ElementAnalysis

# Rebind ea to an ElementAnalysis built with default constructor arguments.
ea = ElementAnalysis()
for link_soup in link_soups_list:
    url_str = link_soup['href']
    # NOTE: reuses the name page_soup, replacing the soup the links were selected from.
    page_soup = get_page_soup(url_str)
    row_div_list = page_soup.find_all(name='div', id='jobDescriptionText')
    for div_soup in row_div_list:
        # Each pass overwrites child_strs_list, so only the last matching div's result is kept.
        child_strs_list = ea.ha.get_navigable_children(div_soup, [])
    # Stop after the first link: only one linked page is processed per run.
    break

In [5]:

from cypher_utils import CypherUtilities
from hc_utils import HeaderCategories

# Rebind cu and hc to instances built with default constructor arguments.
cu = CypherUtilities()
hc = HeaderCategories()

In [6]:

# Fresh ElementAnalysis; its ha helper supplies a tag and a header flag per child string.
ea = ElementAnalysis()
child_tags_list = ea.ha.get_child_tags_list(child_strs_list)
is_header_list = ea.ha.get_is_header_list(child_strs_list)

# Membership flags are stored as 1 (member) or None (not a member).
sql_dict = {False: None, True: 1}

# (feature key, name of the hc attribute holding that category's headers),
# listed in the order the keys appear in each feature dictionary.
category_attrs = (
    ('is_task_scope', 'TASK_SCOPE_HEADERS_LIST'),
    ('is_minimum_qualification', 'REQ_QUALS_HEADERS_LIST'),
    ('is_preferred_qualification', 'PREFF_QUALS_HEADERS_LIST'),
    ('is_legal_notification', 'LEGAL_NOTIFS_HEADERS_LIST'),
    ('is_job_title', 'JOB_TITLE_HEADERS_LIST'),
    ('is_office_location', 'OFFICE_LOC_HEADERS_LIST'),
    ('is_job_duration', 'JOB_DURATION_HEADERS_LIST'),
    ('is_supplemental_pay', 'SUPP_PAY_HEADERS_LIST'),
    ('is_educational_requirement', 'EDUC_REQS_HEADERS_LIST'),
    ('is_interview_procedure', 'INTERV_PROC_HEADERS_LIST'),
    ('is_corporate_scope', 'CORP_SCOPE_HEADERS_LIST'),
    ('is_posting_date', 'POST_DATE_HEADERS_LIST'),
    ('is_other', 'OTHER_HEADERS_LIST'),
)

# One dictionary per child string: its tag, its header flag, one membership
# flag per category, and finally the string itself.
feature_dict_list = [
    {
        'initial_tag': tag,
        'is_header': is_header,
        **{key: sql_dict[child_str in getattr(hc, attr_name)] for key, attr_name in category_attrs},
        'child_str': child_str,
    }
    for tag, is_header, child_str in zip(child_tags_list, is_header_list, child_strs_list)
]

In [7]:

# Pass every feature dictionary through hc.get_feature_tuple, keeping the order.
feature_tuple_list = list(map(hc.get_feature_tuple, feature_dict_list))

In [8]:

from crf_utils import CrfUtilities

# Build the CRF helper from the ha, hc and cu objects currently in scope.
crf = CrfUtilities(ha=ea.ha, hc=hc, cu=cu, verbose=False)

In [11]:

from neo4j import GraphDatabase
import logging
from neo4j.exceptions import ServiceUnavailable

In [12]:

class App:
    """Minimal Neo4j client that creates and looks up ``Person`` nodes.

    The driver is created once in ``__init__`` and kept on the instance so that
    every method shares it; call ``close()`` when finished.

    Note: ``create_friendship`` uses Cypher ``CREATE``, which always adds new
    nodes, so calling it again with the same names produces duplicate
    ``Person`` nodes.
    """

    def __init__(self, uri, user, password):
        """Open a driver for ``uri`` authenticated with ``user`` / ``password``."""
        # Store the driver on the instance; a plain local would be discarded
        # when __init__ returns and the other methods could not reach it.
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        """Close the driver connection."""
        # Don't forget to close the driver connection when you are finished with it
        self.driver.close()

    def create_friendship(self, person1_name, person2_name):
        """Create two ``Person`` nodes joined by ``KNOWS`` and print each created pair."""
        with self.driver.session() as session:
            # Write transactions allow the driver to handle retries and transient errors
            # NOTE(review): newer driver releases deprecate write_transaction in favour of
            # execute_write — confirm against the installed driver version.
            result = session.write_transaction(
                self._create_and_return_friendship, person1_name, person2_name)
            for row in result:
                print('Created friendship between: {p1}, {p2}'.format(p1=row['p1'], p2=row['p2']))

    @staticmethod
    def _create_and_return_friendship(tx, person1_name, person2_name):
        """Run the CREATE query in transaction ``tx``.

        Returns a list of ``{'p1': name, 'p2': name}`` dictionaries, one per
        returned row. ``ServiceUnavailable`` raised while reading the rows is
        logged together with the query text and then re-raised.
        """
        # To learn more about the Cypher syntax, see https://neo4j.com/docs/cypher-manual/current/
        # The Reference Card is also a good resource for keywords https://neo4j.com/docs/cypher-refcard/current/
        query = (
            'CREATE (p1:Person { name: $person1_name }) '
            'CREATE (p2:Person { name: $person2_name }) '
            'CREATE (p1)-[:KNOWS]->(p2) '
            'RETURN p1, p2'
        )
        result = tx.run(query, person1_name=person1_name, person2_name=person2_name)
        try:
            # Build the list here, inside the transaction function, so callers
            # receive plain Python data rather than the result object.
            return [{'p1': row['p1']['name'], 'p2': row['p2']['name']}
                    for row in result]
        # Capture any errors along with the query and data for traceability
        except ServiceUnavailable as exception:
            logging.error('{query} raised an error: \n {exception}'.format(
                query=query, exception=exception))
            raise

    def find_person(self, person_name):
        """Print one line for every ``Person`` whose name equals ``person_name``."""
        with self.driver.session() as session:
            result = session.read_transaction(self._find_and_return_person, person_name)
            for row in result:
                print('Found person: {row}'.format(row=row))

    @staticmethod
    def _find_and_return_person(tx, person_name):
        """Run the MATCH query in transaction ``tx`` and return the matching names as a list."""
        query = (
            'MATCH (p:Person) '
            'WHERE p.name = $person_name '
            'RETURN p.name AS name'
        )
        result = tx.run(query, person_name=person_name)
        return [row['name'] for row in result]

In [17]:

# Aura queries use an encrypted connection using the "neo4j+s" URI scheme
import scrape_utils

# NOTE: this cell rebinds wsu, uri, user and password to the
# 'console.neo4j.io' entry of the secrets mapping.
wsu = scrape_utils.WebScrapingUtilities()
uri = wsu.secrets_json['console.neo4j.io']['connect_url']
user =  wsu.secrets_json['console.neo4j.io']['username']
password = wsu.secrets_json['console.neo4j.io']['password']
app = App(uri, user, password)
try:
    app.create_friendship('Alice', 'David')
    app.find_person('Alice')
finally:
    # Close the driver even if one of the calls above raises.
    app.close()

Created friendship between: Alice, David
Found person: Alice
Found person: Alice


In [None]:

# Convert the feature tuples with crf.sent2features, then run the CRF's single-sequence prediction on them.
sent_features = crf.sent2features(feature_tuple_list)
crf_list = crf.CRF.predict_single(sent_features)

In [19]:

file_name = 'Data_Scientist_(Services)_63b23e3bd4672ef3.html'
# Look up the file_name_id of the FileNames node that has this file name.
# NOTE(review): file_name is interpolated straight into the Cypher text, so a name
# containing a single quote would break the query; a query parameter would avoid
# that if cu.get_execution_results accepts one — TODO confirm.
cypher_str = f"""
    MATCH (fn:FileNames {{file_name: '{file_name}'}})
    RETURN fn.file_name_id;"""
row_objs_list = cu.get_execution_results(cypher_str, verbose=True)


    MATCH (fn:FileNames {file_name: 'Data_Scientist_(Services)_63b23e3bd4672ef3.html'})
    RETURN fn.file_name_id;


In [20]:

# Display the rows returned by cu.get_execution_results.
row_objs_list

[{'fn.file_name_id': 93}]

In [24]:

# <div id='jobDescriptionText' class='jobsearch-jobDescriptionText'>
# Print ea's attribute names twice: first those containing "dict", then the public ones.
for keep_name in (lambda fn: 'dict' in fn.lower(), lambda fn: not fn.startswith('_')):
    print([f'ea.{fn}' for fn in dir(ea) if keep_name(fn)])

['ea.__dict__']
['ea.display_basic_requirements', 'ea.display_reqs_from_url', 'su.find_basic_quals_section_indexes', 'ea.get_idx_list', 'ea.ha', 'ea.hc']


In [15]:

# Print ea.ha's attribute names twice: first those containing "dict", then the public ones.
for keep_name in (lambda fn: 'dict' in fn.lower(), lambda fn: not fn.startswith('_')):
    print([f'ea.ha.{fn}' for fn in dir(ea.ha) if keep_name(fn)])

['ea.ha.NAVIGABLE_PARENT_IS_HEADER_DICT', 'ea.ha.NAVIGABLE_PARENT_IS_QUAL_DICT', 'ea.ha.__dict__', 'ea.ha.store_true_or_false_dictionary']
['ea.ha.CHILDLESS_TAGS_LIST', 'ea.ha.CLF_NAME', 'ea.ha.CMAP', 'ea.ha.GT_REGEX', 'ea.ha.HTML_SCANNER_REGEX', 'ea.ha.LT_REGEX', 'ea.ha.NAVIGABLE_PARENT_IS_HEADER_DICT', 'ea.ha.NAVIGABLE_PARENT_IS_QUAL_DICT', 'ea.ha.QUALS_SCANNER_REGEX', 'ea.ha.SAVES_HTML_FOLDER', 'ea.ha.clean_html_str', 'ea.ha.get_body_soup', 'ea.ha.get_child_strs_from_file', 'ea.ha.get_child_tags_list', 'ea.ha.get_is_header_list', 'ea.ha.get_navigable_children', 'ea.ha.html2text', 'ea.ha.html_regex_tokenizer', 'ea.ha.quals_regex_tokenizer', 'ea.ha.store_true_or_false_dictionary', 'ea.ha.store_unique_list']


In [16]:

# Print ea.hc's attribute names twice: first those containing "dict", then the public ones.
for keep_name in (lambda fn: 'dict' in fn.lower(), lambda fn: not fn.startswith('_')):
    print([f'ea.hc.{fn}' for fn in dir(ea.hc) if keep_name(fn)])

['ea.hc.POS_EXPLANATION_DICT', 'ea.hc.__dict__']
['ea.hc.CORP_SCOPE_HEADERS_LIST', 'ea.hc.EDUC_REQS_HEADERS_LIST', 'ea.hc.INTERV_PROC_HEADERS_LIST', 'ea.hc.JOB_DURATION_HEADERS_LIST', 'ea.hc.JOB_TITLE_HEADERS_LIST', 'ea.hc.LEGAL_NOTIFS_HEADERS_LIST', 'ea.hc.OFFICE_LOC_HEADERS_LIST', 'ea.hc.OTHER_HEADERS_LIST', 'ea.hc.POST_DATE_HEADERS_LIST', 'ea.hc.POS_EXPLANATION_DICT', 'ea.hc.PREFF_QUALS_HEADERS_LIST', 'ea.hc.REQ_QUALS_HEADERS_LIST', 'ea.hc.SUPP_PAY_HEADERS_LIST', 'ea.hc.TASK_SCOPE_HEADERS_LIST', 'ea.hc.append_parts_of_speech_list', 'ea.hc.get_feature_tuple']


In [12]:

# NOTE(review): `su` is not defined in any visible cell, and the recorded output
# shows the signature of ea.find_basic_quals_section — confirm the intended target.
su.find_basic_quals_section_indexes?

Signature: ea.find_basic_quals_section(child_strs_list, verbose=False)
Docstring: <no docstring>
File:      c:\users\daveb\onedrive\documents\github\job-hunting\py\html_analysis.py
Type:      method


In [13]:

# IPython help lookup (trailing ?): shows the signature, docstring and source file of this method.
ea.ha.get_child_strs_from_file?

Signature: ea.ha.get_child_strs_from_file(file_name)
Docstring: <no docstring>
File:      c:\users\daveb\onedrive\documents\github\job-hunting\py\html_analysis.py
Type:      method
