In [1]:

# Toggle IPython's pretty printing of cell results (this run switched it OFF)
%pprint

Pretty printing has been turned OFF


In [2]:

import sys

# Make modules in ../py importable from this notebook. Inserting at index 1
# leaves index 0, the script path (or '' in REPL), as the first lookup location.
sys.path.insert(1, '../py')

In [3]:

from pandas import DataFrame

from storage import Storage
from ha_utils import HeaderAnalysis
from scrape_utils import WebScrapingUtilities
from cypher_utils import CypherUtilities

# Helper objects shared by the rest of the notebook
s = Storage()
ha = HeaderAnalysis(verbose=False)
wsu = WebScrapingUtilities(s=s)

# Neo4j connection settings are read from the 'neo4j' entry of wsu.secrets_json
uri, user, password = (
    wsu.secrets_json['neo4j'][key] for key in ('connect_url', 'username', 'password')
)
cu = CypherUtilities(uri=uri, user=user, password=password, driver=None, s=s, ha=ha)

In [4]:

from neo4j.exceptions import ServiceUnavailable

# Check that the database is reachable, then build the objects that depend on it.
# hc, su and lru are only defined when every step in the try body succeeds.
try:
    # Raises ServiceUnavailable when the database cannot be reached
    version_str = cu.driver.verify_connectivity()

    from hc_utils import HeaderCategories
    hc = HeaderCategories(cu=cu, verbose=False)

    from section_utils import SectionUtilities
    su = SectionUtilities(s=s, ha=ha, cu=cu, verbose=False)

    from lr_utils import LrUtilities
    lru = LrUtilities(ha=ha, cu=cu, hc=hc, verbose=False)
    lru.build_isheader_logistic_regression_elements()
    lru.build_pos_logistic_regression_elements()

    # Silence library warnings for the remainder of the session
    import warnings
    warnings.filterwarnings('ignore')
except ServiceUnavailable as e:
    # Re-raise with an actionable hint, chaining the driver's original error
    raise ServiceUnavailable('You need to start Neo4j as a console') from e
except Exception as e:
    # Best effort: keep the notebook running, but report the message as well as
    # the class so a later NameError on hc/su/lru can be traced back to its cause
    print(f'{e.__class__}: {str(e).strip()}')

In [5]:

# Run the shared setup script; names it defines become available in this notebook.
# NOTE(review): pd, os and re are used in later cells without a visible import,
# so they presumably come from this script -- confirm.
%run ../load_magic/dataframes.py
from datetime import datetime
from urllib.parse import urlparse, parse_qs
from urllib.error import HTTPError, URLError
from IPython.display import clear_output
import random
import requests
import time
# Record when the notebook was last executed
print(f'Last run on {datetime.now()}')

Last run on 2022-07-25 08:25:16.183121



----

In [44]:

# Parse the saved email HTML
file_path = '../data/html/dice_email.html'
page_soup = wsu.get_page_soup(file_path)

# Child-combinator path from the description container down to the anchors:
# an outer table cell, nine nested divs, two nested first-row table cells, then <a>
tags_list = (
    ['#jobDescriptionText', 'table', 'tbody', 'tr', 'td']
    + ['div'] * 9
    + ['table', 'tbody', 'tr:nth-child(1)', 'td'] * 2
    + ['a']
)
css_selector = ' > '.join(tags_list)
link_soups_list = page_soup.select(css_selector)
len(link_soups_list)

10

In [45]:

# Build one record per link: the first value of each query-string parameter,
# plus the full URL under 'url_str' (which wins over any same-named parameter)
rows_list = []
for link_soup in link_soups_list:
    url_str = link_soup['href']
    row_dict = {k: v[0] for k, v in parse_qs(urlparse(url_str).query).items()}
    row_dict['url_str'] = url_str
    rows_list.append(row_dict)
df = pd.DataFrame(rows_list)

# De-duplicate the URLs in first-seen order. Working from rows_list rather than
# df.url_str keeps this cell from raising AttributeError when no links were
# found, because an empty DataFrame has no url_str column.
url_strs_list = list(dict.fromkeys(row_dict['url_str'] for row_dict in rows_list))
len(url_strs_list)

5

In [46]:

# CSS selectors for the page elements read while scraping each posting
job_title_css = '#jt'
job_org_css = '#hiringOrganizationName'
job_location_css = '.location'
article_css = '#jobdescSec'
# NOTE(review): presumably the heading shown when a posting is unavailable -- confirm
sorry_css = 'h1.pull-left'

# Matches each run of characters that are not ASCII letters or digits
ascii_regex = re.compile('[^A-Za-z0-9]+')

# Start the browser session and sign in
driver = wsu.get_driver()
wsu.log_into_dice(driver, verbose=False)

Getting the FireFox driver


In [47]:

# Visit each posting URL, save its description as a small HTML file, and collect
# the names of the files written in files_list.

# Locator strategy string (the value of Selenium's By.CSS_SELECTOR). Calling
# find_elements(by, value) works on Selenium 3 and 4, whereas the
# find_elements_by_css_selector shortcut was removed in Selenium 4.3.
css_by = 'css selector'

files_list = []
for url_str in url_strs_list:
    wsu.driver_get_url(driver, url_str, verbose=False)
    time.sleep(3)  # Fixed pause after navigation, before the page is queried

    # Any match on sorry_css means this URL is skipped
    tags_list = driver.find_elements(css_by, sorry_css)
    if tags_list:
        continue

    # Look up every element needed before touching the disk, so a page with an
    # unexpected layout is skipped instead of raising IndexError mid-write
    title_tags = driver.find_elements(css_by, job_title_css)
    org_tags = driver.find_elements(css_by, job_org_css)
    location_tags = driver.find_elements(css_by, job_location_css)
    article_tags = driver.find_elements(css_by, article_css)
    if not (title_tags and org_tags and location_tags and article_tags):
        print(f'Skipping {url_str}: expected page elements not found')
        continue

    job_title_str = title_tags[0].text
    job_org_str = org_tags[0].text
    job_location_str = location_tags[0].text
    page_title = f'{job_title_str} {job_org_str} {job_location_str}'

    # File name: non-alphanumeric runs collapsed, words joined by underscores
    file_name = ascii_regex.sub(' ', page_title).strip().replace(' ', '_')
    file_name = f'{file_name}.html'
    file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)
    if os.path.isfile(file_path):
        # Name already taken: prefix a timestamp AND recompute the path, so the
        # posting is saved under the new name rather than silently dropped
        file_name = datetime.now().strftime('%Y%m%d%H%M%S%f') + f'_{file_name}'
        file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)

    # Register the final name, which is the one appended to files_list
    cu.ensure_filename(file_name, verbose=False)
    cu.set_posting_url(file_name, url_str, verbose=False)

    if not os.path.isfile(file_path):
        # Read the article before opening the file so a failure here cannot
        # leave a truncated file behind
        article_str = article_tags[0].get_attribute('innerHTML').strip()
        with open(file_path, 'w', encoding=s.encoding_type) as f:
            print(f'Saving to {file_path}')
            f.write('<html><head><title>')
            f.write(page_title)
            f.write('</title></head><body><div id="jobDescriptionText">')
            f.write(article_str)
            f.write('</div></body></html>')
        files_list.append(file_name)
print(f'Fileing {len(files_list)} postings complete')

Saving to ../saves/html\DataiKu_Avista_Tech_Remote.html
Saving to ../saves/html\Data_Scientist_InfoSmart_Technologies_Inc_Remote_or_Atlanta_GA.html
Saving to ../saves/html\OPB_Data_Scientist_Codeforce_360_Remote_or_100_Remote_GA.html
Saving to ../saves/html\Senior_Data_Scientist_Esteem_IT_Remote.html
Fileing 4 postings complete


In [48]:

# The browser is no longer needed once the postings are on disk
driver.close()
cu.ensure_navigableparent('END', verbose=False)

# Feed each saved posting's description into the database
for file_name in files_list:
    file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)
    if not os.path.isfile(file_path):
        continue
    page_soup = wsu.get_page_soup(file_path)
    description_divs = page_soup.find_all(name='div', attrs={'id': 'jobDescriptionText'})
    for div_soup in description_divs:
        child_strs_list = ha.get_navigable_children(div_soup, [])
        # An empty result stops the run so the file can be inspected
        assert child_strs_list, f'Something is wrong with {file_name}'
        cu.populate_from_child_strings(child_strs_list, file_name, verbose=False)
files_list

['DataiKu_Avista_Tech_Remote.html', 'Data_Scientist_InfoSmart_Technologies_Inc_Remote_or_Atlanta_GA.html', 'OPB_Data_Scientist_Codeforce_360_Remote_or_100_Remote_GA.html', 'Senior_Data_Scientist_Esteem_IT_Remote.html']