In [1]:

# %pprint is a toggle, not a setter: the recorded output shows this run turned
# pretty printing OFF, and running the cell a second time would flip it back.
%pprint

Pretty printing has been turned OFF



---
# Load needed libraries and functions

In [2]:

%matplotlib inline
from datetime import datetime
from pandas import DataFrame
import humanize
import os
import sys
import time
import warnings
import winsound

# Suppress every warning for the rest of the session (this also hides
# deprecation notices coming from the libraries used below)
warnings.filterwarnings('ignore')
# Tone settings; presumably meant for winsound.Beep in a cell not shown here -- TODO confirm
duration = 1000  # milliseconds
freq = 880  # Hz

# Make the modules in ../py importable.
# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

In [3]:

# Create the storage helper, pointed at the data and saves folders
# that sit next to this notebook's folder
from storage import Storage
s = Storage(
    data_folder_path=os.path.abspath('../data'),
    saves_folder_path=os.path.abspath('../saves')
)

# Create the header-analysis helper on top of the storage object
from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

# Create the web-scraping helper; it is given the path to the secrets JSON
# and exposes the parsed content as wsu.secrets_json
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(
    s=s,
    secrets_json_path=os.path.abspath('../data/secrets/jh_secrets.json')
)
# Read the Neo4j connection settings from the secrets file so that
# no credentials are written into the notebook itself
uri = wsu.secrets_json['neo4j']['connect_url']
user =  wsu.secrets_json['neo4j']['username']
password = wsu.secrets_json['neo4j']['password']

# Get the neo4j object; its .driver attribute is what later cells open sessions on
from cypher_utils import CypherUtilities
cu = CypherUtilities(
    uri=uri, user=user, password=password, driver=None, s=s, ha=ha
)


---
# Tabulate Job Hunting Progress

In [16]:

date_range = 'Sunday, 03/12/2023 - Saturday, 03/18/2023'
def do_cypher_tx(tx, date_range, verbose=False):
    """
    Fetch the FileNames nodes whose application email date lies in a date range.

    Parameters
    ----------
    tx : transaction-like object
        Must expose ``run(query=..., parameters=...)`` and yield records that
        hold the matched node under the key ``fn``.
    date_range : str
        Inclusive range written as
        ``'<weekday>, MM/DD/YYYY - <weekday>, MM/DD/YYYY'``. The query parses
        only the numeric parts; the weekday names are ignored.
    verbose : bool, optional
        When True, print the query with the parameter substituted in. The
        cell output is cleared first if IPython is available.

    Returns
    -------
    pandas.DataFrame
        One row per matched node and one column per node property. Empty
        when nothing matches.
    """
    cypher_str = '''
        // Define the input date range as a string
        WITH $date_range AS date_range

        // Split the input string into two parts, one for the start date and one for the end date
        WITH split(date_range, " - ") AS dates

        // Split the start and end dates into their components
        WITH
            split(dates[0], ", ") AS start_components,
            split(dates[1], ", ") AS end_components

        // Reassemble the start date components into a
        // format that the date() function can recognize
        WITH
            [item in split(start_components[1], "/") | toInteger(item)] AS start_components,
            [item in split(end_components[1], "/") | toInteger(item)] AS end_components

        // Convert the integer date partss into Neo4j date objects using the date() function
        WITH
            date({
                day: start_components[1],
                month: start_components[0],
                year: start_components[2]
                }) AS date_start,
            date({
                day: end_components[1],
                month: end_components[0],
                year: end_components[2]
                }) AS date_end

        // Find all FileNames nodes and filter them by opportunity_application_email_date property
        MATCH (fn:FileNames)
        WHERE
            (fn.opportunity_application_email_date >= date_start) AND
            (fn.opportunity_application_email_date <= date_end)

        // Return the filtered nodes
        RETURN fn;'''
    if verbose:
        # Import locally: nothing at notebook level brings clear_output into
        # scope, so the bare call raised NameError. Clearing is cosmetic, so
        # skip it quietly when IPython is not installed.
        try:
            from IPython.display import clear_output
        except ImportError:
            pass
        else:
            clear_output(wait=True)
        print(cypher_str.replace('$date_range', f'"{date_range}"'))
    parameter_dict = {'date_range': date_range}

    # Each record carries a single node under 'fn'; copy its properties into
    # a plain dict so the frame gets one column per property
    rows_list = [
        dict(record['fn'].items())
        for record in tx.run(query=cypher_str, parameters=parameter_dict)
    ]

    return DataFrame(rows_list)

# The query only reads from the graph, so run it in a read transaction
# rather than a write transaction
with cu.driver.session() as session:
    progress_dates_df = session.read_transaction(do_cypher_tx, date_range=date_range, verbose=False)

# Reindex to the report columns first: an empty result, or nodes that lack one
# of these properties, then yields NaN columns instead of an AttributeError
columns_list = ['opportunity_application_email_date', 'file_name', 'posting_url']
report_df = progress_dates_df.reindex(columns=columns_list)

# Show only the applications that have a posting URL, keyed by row index
mask_series = report_df.posting_url.isnull()
report_df[~mask_series].to_dict(orient='index')

{1: {'opportunity_application_email_date': neo4j.time.Date(2023, 3, 13), 'file_name': '1fea112bf6198419_Data_Scientist_Chicago_IL_Indeed_com.html', 'posting_url': 'https://www.indeed.com/rc/clk/dl?jk=1fea112bf6198419&from=ja&qd=RnZhMybXSk4M3QtTVGXWocPDA-jVn_f73KUcK2QrGXxWzxuTTZnceBTcgT1wk7VUhH6vRsR2kLpXgXBggkmABvNDuymhiEN80F4AmgvDj8k&rd=Xz_BPd8uwD7p5SZJpoZayl_MKnaSAFGAsD6kfERFt3g&tk=1grb91v2e2sr5001&alid=63b02dca1ef86228dd5d5128'}, 2: {'opportunity_application_email_date': neo4j.time.Date(2023, 3, 13), 'file_name': '8fc0ed7481ff426b_Data_Scientist_Gainesville_FL_Indeed_com.html', 'posting_url': 'https://www.indeed.com/rc/clk/dl?jk=8fc0ed7481ff426b&from=ja&qd=RnZhMybXSk4M3QtTVGXWocPDA-jVn_f73KUcK2QrGXzsP3Tt6Lw41Z1c6_ojf6PopcJcD0BHPEiWCwntHyTAx2T7g2ScrVXuQZy7QCMpQh4&rd=Xwq-0IM72XK4fIJ12CGj5l_MKnaSAFGAsD6kfERFt3g&tk=1grg51gnskmao800&alid=63b02dca1ef86228dd5d5128'}, 3: {'opportunity_application_email_date': neo4j.time.Date(2023, 3, 13), 'file_name': '8eda6f13ecba3a8c_Senior_Associate_Manag