In [1]:

# Toggle IPython's pretty printing of outputs (the recorded output below shows this run turned it OFF)
%pprint

Pretty printing has been turned OFF


In [2]:

import sys

# Put the sibling ../py folder on the module search path so the notebook can
# import modules from it (presumably where storage, ha_utils, scrape_utils and
# cypher_utils live -- TODO confirm).
# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

In [6]:

# Import the libraries used by the later cells and build the shared helper
# objects (s, ha, wsu, cu); cu is the object whose .driver talks to Neo4j
from pandas import DataFrame
from urllib.parse import urlparse, parse_qs
from IPython.display import clear_output
import re
from urllib.error import HTTPError, URLError
import os

# Storage helper; it is handed to the HeaderAnalysis and CypherUtilities constructors below
from storage import Storage
s = Storage()

# Header analysis helper (get_navigable_children is called on it in a later cell)
from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

# Web scraping helper; the Neo4j connection settings are read from its
# secrets_json mapping rather than being hard-coded in the notebook
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities()
uri = wsu.secrets_json['neo4j']['connect_url']
user =  wsu.secrets_json['neo4j']['username']
password = wsu.secrets_json['neo4j']['password']

# Cypher helper built from the credentials above
# NOTE(review): driver=None presumably tells CypherUtilities to create its own driver -- confirm
from cypher_utils import CypherUtilities
cu = CypherUtilities(uri=uri, user=user, password=password, driver=None, s=s, ha=ha)

In [4]:

import warnings

from neo4j.exceptions import ServiceUnavailable

# Fail fast if the Neo4j server cannot be reached, before any cell tries to query it
try:
    version_str = cu.driver.verify_connectivity()
except ServiceUnavailable as e:
    # Re-raise with an actionable hint, chaining the driver's own error so the
    # underlying cause (address, refused connection, ...) stays in the traceback
    raise ServiceUnavailable('You need to start Neo4j as a console') from e
except Exception as e:
    # Best effort: report any other failure without stopping the notebook, but
    # include the message so the cause is not reduced to a bare class name
    print(f'{e.__class__}: {e}')
else:
    # Only once the connection is known to work, silence warnings for the rest of the session
    warnings.filterwarnings('ignore')

In [None]:

# Read the saved Indeed "my jobs" page and copy each application's status tag
# onto the matching FileNames node in Neo4j
file_path = '../data/html/indeed_jobs.html'
page_soup = wsu.get_page_soup(file_path)
selectors_list = ['body', 'div', 'div', 'div', 'div.atw-AppCard-mainContainer', 'div.atw-AppCard-jobInfo', 'div',
                  'header', 'div.atw-JobInfo-statusTag', 'div', 'div', 'span']
files_list = []


def find_file_nodes_tx(tx, jk_str, verbose=False):
    """
    Transaction function: fetch the FileNames nodes whose posting_url contains jk_str.

    Parameters:
        tx: the transaction object supplied by the session.
        jk_str (str): the substring to look for in fn.posting_url.
        verbose (bool): if True, clear the cell output and print the query
            with the parameter substituted in.

    Returns:
        list of dict: one {'fn': node} mapping per matching record.
    """
    cypher_str = '''
        MATCH (fn:FileNames)
        WHERE fn.posting_url CONTAINS $jk_str
        RETURN fn;'''
    if verbose:
        clear_output(wait=True)
        print(cypher_str.replace('$jk_str', f'"{jk_str}"'))
    results_list = tx.run(query=cypher_str, parameters={'jk_str': jk_str})

    return [dict(record.items()) for record in results_list]


def set_indeed_status_tx(tx, file_name, indeed_status, verbose=False):
    """
    Transaction function: set indeed_status on the FileNames node with the given file_name.

    Parameters:
        tx: the transaction object supplied by the session.
        file_name (str): value of the file_name property identifying the node.
        indeed_status (str): status text to store in fn.indeed_status.
        verbose (bool): if True, clear the cell output and print the query
            with the parameters substituted in.
    """
    cypher_str = """
        MATCH (fn:FileNames {file_name: $file_name})
        SET fn.indeed_status = $indeed_status;"""
    if verbose:
        clear_output(wait=True)
        print(cypher_str.replace('$file_name', f'"{file_name}"').replace('$indeed_status', f'"{indeed_status}"'))
    tx.run(query=cypher_str, parameters={'file_name': file_name, 'indeed_status': indeed_status})


for span_soup in page_soup.select(' > '.join(selectors_list)):
    status_str = span_soup.text

    # The status span sits four levels below the card's header element
    header_soup = span_soup.parent.parent.parent.parent

    # Take the first link in the header that actually carries an href
    anchors_list = header_soup.select('a[href]')
    if not anchors_list:
        print(f'Skipping status {status_str!r}: no link found in its header')
        continue
    viewjob_url = anchors_list[0]['href']

    # Without a job key the CONTAINS filter would degrade to 'jk=' and match
    # unrelated postings, so the status would be written to the wrong node
    jk_str = parse_qs(urlparse(viewjob_url).query).get('jk', [''])[0]
    if not jk_str:
        print(f'Skipping status {status_str!r}: no jk parameter in {viewjob_url!r}')
        continue

    # Find the posting URL associated with the file name
    with cu.driver.session() as session:
        row_objs_list = session.read_transaction(find_file_nodes_tx, jk_str=f'jk={jk_str}', verbose=False)

    if row_objs_list:

        # Use the properties of the first matching node
        file_node_dict = dict(row_objs_list[0]['fn'].items())

    else:

        # NOTE(review): `su` is not defined in any of the visible cells; confirm
        # which helper object provides load_indeed_posting_url before running
        file_node_dict, files_list = su.load_indeed_posting_url(viewjob_url=viewjob_url, jk_str=jk_str,
                                                                files_list=files_list, verbose=True)

    # Record the status shown on Indeed against the posting's node
    with cu.driver.session() as session:
        session.write_transaction(set_indeed_status_tx, file_name=file_node_dict['file_name'],
                                  indeed_status=status_str, verbose=True)

In [10]:

# Make sure the 'END' navigable parent exists before populating
# NOTE(review): presumably 'END' is a sentinel marking the end of a posting's child strings -- confirm
cu.ensure_navigableparent('END', verbose=False)

# For every file name collected in files_list, parse its saved HTML and feed
# the job description's child strings into the database helper
for file_name in files_list:
    file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)
    page_soup = wsu.get_page_soup(file_path)

    # The job description is expected in div elements with id="jobDescriptionText"
    row_div_list = page_soup.find_all(name='div', id='jobDescriptionText')
    for div_soup in row_div_list:

        # A fresh empty list is passed in as the second argument on every call
        child_strs_list = ha.get_navigable_children(div_soup, [])
        cu.populate_from_child_strings(child_strs_list, file_name, verbose=False)

In [11]:

# Display the file names collected by the status-sync cell (empty in the recorded run)
files_list

[]