In [1]:

# Toggle IPython's pretty printing of outputs; the recorded output of this run shows it was switched OFF
%pprint

Pretty printing has been turned OFF


In [2]:

from datetime import datetime
from urllib.parse import urlparse, parse_qs
import humanize
import os
import re
import sys
import time
import warnings
import winsound

# Suppress every warning for the rest of the session
warnings.filterwarnings('ignore')

# Insert at 1, 0 is the script path (or '' in REPL)
if '../py' not in sys.path:
    sys.path.insert(1, '../py')

# Beep settings
freq = 880  # Hz
duration = 1000  # milliseconds

# Matches each run of characters that are not ASCII letters or digits
ascii_regex = re.compile(r'[^A-Za-z0-9]+')

# Selectors used to locate elements on the job posting pages
job_title_css = 'div.p5 > h1'
job_subtitle_css = (
    'div.jobs-unified-top-card__primary-description'
    ' > span.jobs-unified-top-card__subtitle-primary-grouping'
)
article_css = '.jobs-description__container'
details_css = '#job-details'
nav_search_css = '.global-nav__content'
seemore_button_css = 'button.t-14'
seemore_button_xpath = '/html/body//footer/button'

In [3]:

# Start the clock first so the reported time covers the imports as well
t0 = time.time()

# Storage helper, pointed at the data and saves folders
from storage import Storage
data_folder = os.path.abspath('../data')
saves_folder = os.path.abspath('../saves')
s = Storage(data_folder_path=data_folder, saves_folder_path=saves_folder)

# Header analysis helper
from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

# Web scraping helper; it exposes the contents of the secrets file as secrets_json
from scrape_utils import WebScrapingUtilities
secrets_path = os.path.abspath('../data/secrets/jh_secrets.json')
wsu = WebScrapingUtilities(s=s, secrets_json_path=secrets_path)

# Neo4j connection settings come from the secrets file
neo4j_secrets = wsu.secrets_json['neo4j']
uri = neo4j_secrets['connect_url']
user = neo4j_secrets['username']
password = neo4j_secrets['password']

# Cypher helper; no driver is supplied here, only the connection settings
from cypher_utils import CypherUtilities
cu = CypherUtilities(uri=uri, user=user, password=password, driver=None, s=s, ha=ha)

# Report how long the setup took and when it ran
elapsed_seconds = time.time() - t0
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
print(f'Utility libraries created in {duration_str}')
print(f'Last run on {datetime.now()}')

Utility libraries created in 1 second
Last run on 2023-03-23 11:17:44.196651



----

In [7]:

try:
    # Open the browser, sound a half-length beep, then sign in to LinkedIn
    driver = wsu.get_driver()
    winsound.Beep(freq, int(duration / 2))
    wsu.log_into_linkedin(driver)
finally:
    # The saved email is parsed whether or not the steps above succeeded
    page_soup = wsu.get_page_soup('../data/html/linkedin_email.html')
    css_selector = (
        '#jobDescriptionText'
        ' > table > tbody > tr > td'
        ' > table > tbody > tr > td > a'
        ' > table > tbody > tr > td > a'
    )
    link_soups_list = page_soup.select(css_selector)

    # Collecting the hrefs into a set drops the duplicate URLs
    url_strs_set = {anchor_soup['href'] for anchor_soup in link_soups_list}

    display(len(url_strs_set))

Getting the FireFox driver
Getting URL: https://www.linkedin.com/home
Filling in the session_key field with dave.babbitt@gmail.com
Clicking /html/body/main/section[1]/div/div/form/button
Waiting for the web element to be visible: Message: (280, 628) is out of bounds of viewport width (1280) and height (587)
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:180:5
MoveTargetOutOfBoundsError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:370:5
assertInViewPort@chrome://remote/content/marionette/action.sys.mjs:2113:11
dispatch@chrome://remote/content/marionette/action.sys.mjs:1011:21
dispatch/pendingEvents<@chrome://remote/content/marionette/action.sys.mjs:1827:14
dispatch@chrome://remote/content/marionette/action.sys.mjs:1826:39
dispatch/chainEvents<@chrome://remote/content/marionette/action.sys.mjs:1753:27
dispatch@chrome://remote/content/marionette/action.sys.mjs:1755:7
performAc

2

In [8]:

from selenium.webdriver.common.by import By

# Visit every job URL collected from the email, save each posting's description
# as a small HTML file in cu.SAVES_HTML_FOLDER, and record the file name and its
# URL through the Cypher utilities.

# Assumes this is the first time you've run this cell
files_list = []

for url_str in url_strs_set:
    wsu.driver_get_url(driver, url_str, verbose=False)

    # Fixed pause after navigating, before the page is queried
    time.sleep(4)

    # Look for the job title, trying the fallback selector if the first finds nothing
    titles_list = driver.find_elements(By.CSS_SELECTOR, job_title_css)
    if not titles_list:
        titles_list = driver.find_elements(By.CSS_SELECTOR, '#ember130 > h2:nth-child(1)')
    if not titles_list:

        # Report the skip so the posting is not lost unnoticed when the email is deleted
        print(f'No job title found at {url_str}; skipping')
        continue

    job_title_str = titles_list[0].text

    # A missing subtitle should not abort the whole run; fall back to an empty one
    subtitles_list = driver.find_elements(By.CSS_SELECTOR, job_subtitle_css)
    job_subtitle_str = subtitles_list[0].text if subtitles_list else ''
    page_title = f'{job_title_str} {job_subtitle_str}'

    # Build a file name from the title, prefixed with the URL's trackingId when it has one
    file_name = ascii_regex.sub(' ', page_title).strip().replace(' ', '_')
    trackingId = ascii_regex.sub(' ', parse_qs(urlparse(url_str).query).get('trackingId', [''])[0]).strip().replace(' ', '_')
    if len(trackingId):
        file_name = f'{trackingId}_{file_name}.html'
    else:
        file_name = f'{file_name}.html'

        # Assumes this is the first time you've run this cell
        file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)
        if os.path.isfile(file_path):
            file_name = datetime.now().strftime('%Y%m%d%H%M%S%f') + f'_{file_name}'

    file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)
    if not os.path.isfile(file_path):

        # Scrape the description BEFORE opening the file: if any step below raises,
        # no truncated file is left behind to be mistaken for a saved posting on
        # the next run (the isfile guard above would otherwise skip it forever)
        overlay_tags_list = driver.find_elements(By.CSS_SELECTOR, nav_search_css)
        if overlay_tags_list:

            # Hide the navigation bar; presumably it can cover the button clicked next
            driver.execute_script("arguments[0].setAttribute('style','display:none;');", overlay_tags_list[0])

        wsu.click_web_element(driver, seemore_button_xpath, verbose=False)

        # Still raises IndexError when the details element is absent, so a posting
        # that could not be scraped fails loudly instead of being saved empty
        web_obj = driver.find_elements(By.CSS_SELECTOR, details_css)[0]
        article_str = web_obj.get_attribute('innerHTML').strip()

        print(f'Saving to {file_path}')
        with open(file_path, 'w', encoding=s.encoding_type) as f:
            f.write('<html><head><title>')
            f.write(page_title)
            f.write('</title></head><body><div id="jobDescriptionText">')
            f.write(article_str)
            f.write('</div></body></html>')
        files_list.append(file_name)

    # Recorded whether or not the file was newly written
    cu.ensure_filename(file_name, verbose=False)
    cu.set_posting_url(file_name, url_str, verbose=False)
print(f'Fileing {len(files_list)} postings complete. Delete the email.')

Saving to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\html\wJ1dhSD9f1UGBFUsOCIy8w_Senior_Principal_Data_Scientist_UKG_Lowell_MA_On_site.html
Fileing 1 postings complete. Delete the email.


In [9]:

# End the browser session. quit() closes every window and shuts down the driver
# process, whereas close() only closes the current window and can leave the
# driver process running in the background.
try:
    driver.quit()
except Exception as e:
    # Best effort: report the problem and carry on with the database work
    print(f'{e.__class__.__name__} error: {str(e).strip()}')

cu.ensure_navigableparent('END', verbose=False)

# Feed the child strings of every <article> element in each newly saved file to the database
for file_name in files_list:
    file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)
    page_soup = wsu.get_page_soup(file_path)
    row_div_list = page_soup.find_all(name='article')
    for div_soup in row_div_list:
        child_strs_list = ha.get_navigable_children(div_soup, [])

        # An article that yields no child strings indicates a bad save
        assert child_strs_list, f'Something is wrong with {file_name}'
        cu.populate_from_child_strings(child_strs_list, file_name, verbose=False)

# Audible signal that the cell has finished
wsu.beep(freq, duration)
files_list

['wJ1dhSD9f1UGBFUsOCIy8w_Senior_Principal_Data_Scientist_UKG_Lowell_MA_On_site.html']