In [1]:

# Toggle IPython's pretty printing. This is a toggle, not a setter: the captured
# run switched it OFF, and running the cell again would switch it back ON.
%pprint

Pretty printing has been turned OFF



---
# Load needed libraries and functions

In [2]:

from datetime import datetime
import humanize
import os
import sys
import time
import warnings
import winsound

# Put the project's ../py folder on the module search path. Index 1 is used
# because index 0 holds the script path (or '' in a REPL).
sys.path.insert(1, '../py')

# Silence all warnings for the rest of the session
warnings.filterwarnings('ignore')

# Tone settings passed to winsound.Beep by the cells further down
freq = 880  # Hz
duration = 1000  # milliseconds

In [3]:

# Build the project's utility objects in dependency order and report how long
# the setup took. The start time is taken here for the report at the bottom.
t0 = time.time()

# Create the storage object, giving it the absolute data and saves folder paths
# (the project modules imported in this cell are presumably found through the
# '../py' entry on sys.path -- confirm)
from storage import Storage
s = Storage(
    data_folder_path=os.path.abspath('../data'),
    saves_folder_path=os.path.abspath('../saves')
)

# Create the header-analysis object on top of the storage object
from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

# Create the web-scraping object; it is handed the path of a secrets JSON file
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(
    s=s,
    secrets_json_path=os.path.abspath('../data/secrets/jh_secrets.json')
)

# Read the Neo4j connection settings from the scraper's secrets_json mapping
uri = wsu.secrets_json['neo4j']['connect_url']
user =  wsu.secrets_json['neo4j']['username']
password = wsu.secrets_json['neo4j']['password']

# Create the Cypher utilities object with the Neo4j credentials. driver=None is
# passed explicitly -- presumably the object then opens its own connection from
# uri/user/password; TODO confirm in cypher_utils.
from cypher_utils import CypherUtilities
cu = CypherUtilities(
    uri=uri, user=user, password=password, driver=None, s=s, ha=ha
)

# Create the two section classifier helpers (LR and CRF variants, going by the
# class names)
from section_classifier_utils import SectionLRClassifierUtilities, SectionCRFClassifierUtilities
slrcu = SectionLRClassifierUtilities(ha=ha, cu=cu, verbose=False)
scrfcu = SectionCRFClassifierUtilities(cu=cu, ha=ha, verbose=False)

# Create the section utilities object; only the scraper and the LR classifier
# helper are supplied, the other collaborators are passed as None
from section_utils import SectionUtilities
su = SectionUtilities(wsu=wsu, ihu=None, hc=None, crf=None, slrcu=slrcu, verbose=False)

# Report the elapsed setup time (whole seconds) and when the cell was last run
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
print(f'Utility libraries created in {duration_str}')
print(f'Last run on {datetime.now()}')

Utility libraries created in 8 seconds
Last run on 2023-12-19 06:59:36.616675



----

In [16]:

file_path = os.path.abspath('../data/html/indeed_email.html')
command_str = fr'"C:\Program Files\Notepad++\notepad++.exe" {file_path}'
print(command_str)
!{command_str}

"C:\Program Files\Notepad++\notepad++.exe" C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\data\html\indeed_email.html


In [17]:

# Extract the unique posting URLs from the saved Indeed email. This runs first,
# as ordinary code rather than inside a `finally` clause, because it does not
# use the browser driver: the URL set is still built if the driver below fails
# to start, and a parsing error is reported on its own instead of on top of a
# driver error.
file_path = '../data/html/indeed_email.html'
page_soup = wsu.get_page_soup(file_path)
css_selector = 'table > tbody > tr > td > a > table > tbody > tr > td > a'
link_soups_list = page_soup.select(css_selector)

# A set is used so that repeated links are only counted once
url_strs_set = set()
for link_soup in link_soups_list:
    # .get() returns None for an anchor without an href attribute, so such an
    # anchor is skipped rather than raising KeyError
    url_str = link_soup.get('href')
    if url_str:
        url_strs_set.add(url_str)
display(len(url_strs_set))

# Start the browser driver used by the following cells, then give a short beep
# (half the standard duration) once it is ready. The original call carried a
# commented-out browser_name='Chrome' argument as an alternative.
driver = wsu.get_driver()
winsound.Beep(freq, int(duration/2))

Getting the FireFox driver


12

In [18]:

# Load every posting URL collected from the email through the browser driver.
# Each call receives the running files_list and returns it again, so the list
# is carried forward from one iteration to the next.
files_list = []
for url_str in url_strs_set:
    file_node_dict, files_list = su.load_indeed_posting_url(
        viewjob_url=url_str,
        driver=driver,
        files_list=files_list,
        verbose=False,
    )
print(f'Fileing {len(files_list)} postings complete. Delete the email.')

Fileing 4 postings complete. Delete the email.


In [19]:

import os

# Closing the browser is best-effort: a failure is printed and the cell goes on
try:
    driver.close()
except Exception as e:
    print(f'{e.__class__} error: {str(e).strip()}')

cu.ensure_navigableparent('END', verbose=False)

# For each saved posting, pull the child strings out of every job-description
# div and hand them to the Cypher utilities together with the file name
for file_name in files_list:
    file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)
    page_soup = wsu.get_page_soup(file_path)
    for div_soup in page_soup.find_all(name='div', id='jobDescriptionText'):
        child_strs_list = ha.get_navigable_children(div_soup, [])
        cu.populate_from_child_strings(child_strs_list, file_name, verbose=False)

# Full-length beep to signal completion, then show the processed file names
winsound.Beep(freq, duration)
files_list

['c84a1aaab38819f7_Machine_Learning_Engineer_Remote_Indeed_com.html', 'Gen_AI_Architect_Remote_Philadelphia_PA_Indeed_com.html', '6bd3ffa5a5acbb98_Data_Engineer_Richfield_OH_44286_Indeed_com.html', 'd752e252730033dd_Senior_Data_Analyst_Remote_Indeed_com.html']