In [1]:

# Toggle IPython's pretty printing of results (the recorded output shows this run switched it OFF)
%pprint

Pretty printing has been turned OFF


In [2]:

import sys

# Make the ../py folder importable; presumably the local modules imported by
# later cells live there (TODO confirm).
# Index 0 is the script path (or '' in a REPL), so the folder goes in at index 1.
# The membership test keeps the cell idempotent: re-running it no longer piles
# duplicate '../py' entries onto sys.path.
if '../py' not in sys.path:
    sys.path.insert(1, '../py')

In [3]:

# Storage instance; it is handed to the other helper objects created in this cell
from storage import Storage
s = Storage()

# HeaderAnalysis instance, created with verbose=False
from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(verbose=False)

# Web-scraping helper; its secrets_json also carries the Neo4j connection settings
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(s=s)
neo4j_secrets = wsu.secrets_json['neo4j']
uri, user, password = (
    neo4j_secrets[key] for key in ('connect_url', 'username', 'password')
)

# Cypher helper built from the Neo4j settings and the helpers above
from cypher_utils import CypherUtilities
cu = CypherUtilities(
    uri=uri,
    user=user,
    password=password,
    driver=None,
    s=s,
    ha=ha,
)

In [4]:

from neo4j.exceptions import ServiceUnavailable

import warnings

# Check that the Neo4j server behind cu.driver is reachable before any cell
# tries to use it.
try:
    version_str = cu.driver.verify_connectivity()
except ServiceUnavailable as e:
    # Re-raise with an actionable hint; "from e" keeps the underlying
    # connection error attached as the explicit cause in the traceback.
    raise ServiceUnavailable('You need to start Neo4j as a console') from e
except Exception as e:
    # Best-effort: any other failure is reported but does not stop the notebook.
    # The message is printed along with the class so the failure can be diagnosed.
    print(e.__class__, str(e).strip())
else:
    # Only silence warnings once connectivity has been confirmed.
    # NOTE(review): this suppresses *all* warnings for the rest of the session.
    warnings.filterwarnings('ignore')

In [5]:

from datetime import datetime
from urllib.parse import urlparse, parse_qs
from urllib.error import HTTPError, URLError
from IPython.display import clear_output
import re
import os
# Stamp the notebook output with the time of this run
run_timestamp = datetime.now()
print(f'Last run on {run_timestamp}')

Last run on 2022-07-19 10:28:31.367373



----

In [6]:

# Selector for the nested <a> tags inside the #jobDescriptionText element
css_selector = '#jobDescriptionText > table > tbody > tr > td > table > tbody > tr > td > a > table > tbody > tr > td > a'

# Parse the saved HTML file and collect every tag the selector matches
file_path = '../data/html/linkedin_email.html'
page_soup = wsu.get_page_soup(file_path)
link_soups_list = page_soup.select(css_selector)

# Show how many links were found
len(link_soups_list)

6

In [7]:

# Pull the href attribute out of each selected anchor, keeping the original order
url_strs_list = []
for anchor_soup in link_soups_list:
    url_strs_list.append(anchor_soup['href'])

# Show how many URLs were collected
len(url_strs_list)

6

In [8]:

# Locators for the parts of a job-posting page
job_title_css = 'div.p5 > h1'
job_subtitle_css = 'div.jobs-unified-top-card__primary-description > span.jobs-unified-top-card__subtitle-primary-grouping'
article_css = '.jobs-description__container'
details_css = '#job-details'
nav_search_css = '.global-nav__content'
seemore_button_css = 'button.t-14'
seemore_button_xpath = '/html/body//footer/button'

# Matches every run of characters that is not an ASCII letter or digit
ascii_regex = re.compile('[^A-Za-z0-9]+')

# Start the browser and log into LinkedIn with it
driver = wsu.get_driver()
wsu.log_into_linkedin(driver)

Getting the FireFox driver
Getting URL: https://www.linkedin.com/home
Filling in the session_key field with dave.babbitt@gmail.com
Clicking /html/body/main/section[1]/div/div/form/button


In [9]:

def _first_element(css_selector):
    """Return the first element on the driver's current page matching css_selector.

    Raises:
        IndexError: if nothing matches; the message names the selector and the
            page URL (a bare ``[0]`` on the empty result gave no such context).
    """
    # The locator strategy is given as the plain string 'css selector' (the value
    # of Selenium's By.CSS_SELECTOR). find_elements(by, value) accepts it in both
    # Selenium 3 and 4, whereas find_elements_by_css_selector was removed in
    # Selenium 4.3.
    elements = driver.find_elements('css selector', css_selector)
    if not elements:
        raise IndexError(f'No element matches {css_selector!r} on {driver.current_url}')
    return elements[0]


# Visit every posting URL, save its job details as a small HTML file (unless the
# file already exists) and register the file name and URL through cu.
# files_list collects only the files written by this run.
files_list = []
for url_str in url_strs_list:
    wsu.driver_get_url(driver, url_str, verbose=False)

    # Build a file name from the posting's title/subtitle, replacing every run
    # of non-alphanumeric characters with a single underscore
    job_title_str = _first_element(job_title_css).text
    job_subtitle_str = _first_element(job_subtitle_css).text
    page_title = f'{job_title_str} {job_subtitle_str}'
    file_name = ascii_regex.sub(' ', page_title).strip().replace(' ', '_')

    # Prefix the name with the URL's trackingId query parameter when it has one
    tracking_values = parse_qs(urlparse(url_str).query).get('trackingId', [''])
    trackingId = ascii_regex.sub(' ', tracking_values[0]).strip().replace(' ', '_')
    if trackingId:
        file_name = f'{trackingId}_{file_name}.html'
    else:
        file_name = f'{file_name}.html'

        # Without a trackingId, a name clash is resolved with a timestamp prefix
        if os.path.isfile(os.path.join(cu.SAVES_HTML_FOLDER, file_name)):
            file_name = datetime.now().strftime('%Y%m%d%H%M%S%f') + f'_{file_name}'
    file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)

    if not os.path.isfile(file_path):

        # Scrape BEFORE opening the output file: if any lookup or click below
        # fails, no truncated file is left behind to make the isfile() check
        # above skip this posting on every later run.
        # The element matched by nav_search_css is hidden first, presumably so
        # it cannot intercept the button click -- TODO confirm.
        overlay_tag = _first_element(nav_search_css)
        driver.execute_script("arguments[0].setAttribute('style','display:none;');", overlay_tag)
        wsu.click_web_element(driver, seemore_button_xpath, verbose=False)
        web_obj = _first_element(details_css)
        article_str = web_obj.get_attribute('innerHTML').strip()

        print(f'Saving to {file_path}')
        with open(file_path, 'w', encoding=s.encoding_type) as f:
            f.write('<html><head><title>')
            f.write(page_title)
            f.write('</title></head><body><div id="jobDescriptionText">')
            f.write(article_str)
            f.write('</div></body></html>')
        files_list.append(file_name)

    # Registered for every URL, whether or not the file was written by this run
    cu.ensure_filename(file_name, verbose=False)
    cu.set_posting_url(file_name, url_str, verbose=False)
print(f'Fileing {len(files_list)} postings complete')

Saving to ../saves/html\4C9b9Y5ZywIoIGTFDVZUBQ_Senior_Applied_ML_Engineer_EvolutionIQ_Philadelphia_PA_Remote.html
Saving to ../saves/html\raedzJ33eIpFhdurQZ8CYA_Senior_Applied_ML_Engineer_EvolutionIQ_New_York_City_Metropolitan_Area_Remote.html
Saving to ../saves/html\sJGPuabtyiut4pTqh64vAQ_Senior_Applied_ML_Engineer_EvolutionIQ_Chicago_IL_Remote.html
Saving to ../saves/html\LF9QAxC0PMQNENXJ19mHiw_Senior_Applied_ML_Engineer_EvolutionIQ_Palo_Alto_CA_Remote.html
Saving to ../saves/html\7uh5feA3NNyj7vsoLK9HRg_API_Engineer_SVAM_International_Inc_Boston_MA_Remote.html
Saving to ../saves/html\Uqm87llsXAIjJ2JX_exP5g_Senior_Applied_ML_Engineer_EvolutionIQ_Washington_DC_Baltimore_Area_Remote.html
Fileing 6 postings complete


In [10]:

# Scraping is finished: quit() ends the whole WebDriver session (all windows and
# the driver process), whereas close() only closes the current window.
driver.quit()

cu.ensure_navigableparent('END', verbose=False)

# Feed the navigable children of every <article> tag in each newly saved file to cu
for file_name in files_list:
    file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)
    page_soup = wsu.get_page_soup(file_path)

    # NOTE(review): a saved file that contains no <article> tag yields an empty
    # list here and is skipped without any message -- confirm that is intended
    row_div_list = page_soup.find_all(name='article')
    for div_soup in row_div_list:
        child_strs_list = ha.get_navigable_children(div_soup, [])

        # An article that produces no child strings is treated as an error
        assert child_strs_list, f'Something is wrong with {file_name}'
        cu.populate_from_child_strings(child_strs_list, file_name, verbose=False)

# Display the names of the files processed by this run
files_list

['4C9b9Y5ZywIoIGTFDVZUBQ_Senior_Applied_ML_Engineer_EvolutionIQ_Philadelphia_PA_Remote.html', 'raedzJ33eIpFhdurQZ8CYA_Senior_Applied_ML_Engineer_EvolutionIQ_New_York_City_Metropolitan_Area_Remote.html', 'sJGPuabtyiut4pTqh64vAQ_Senior_Applied_ML_Engineer_EvolutionIQ_Chicago_IL_Remote.html', 'LF9QAxC0PMQNENXJ19mHiw_Senior_Applied_ML_Engineer_EvolutionIQ_Palo_Alto_CA_Remote.html', '7uh5feA3NNyj7vsoLK9HRg_API_Engineer_SVAM_International_Inc_Boston_MA_Remote.html', 'Uqm87llsXAIjJ2JX_exP5g_Senior_Applied_ML_Engineer_EvolutionIQ_Washington_DC_Baltimore_Area_Remote.html']