In [1]:

%pprint

Pretty printing has been turned OFF



---
# Load needed libraries and functions

In [2]:

%matplotlib inline
from IPython.display import HTML
from datetime import datetime
from neo4j.exceptions import ServiceUnavailable
import enchant
import humanize
import os
import sys
import time
import warnings
import winsound

warnings.filterwarnings('ignore')
duration = 1000  # milliseconds
freq = 880  # Hz

# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

In [3]:

t0 = time.time()

# Get the Neo4j driver
from storage import Storage
s = Storage(
    data_folder_path=os.path.abspath('../data'),
    saves_folder_path=os.path.abspath('../saves')
)

from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(
    s=s,
    secrets_json_path=os.path.abspath('../data/secrets/jh_secrets.json')
)
uri = wsu.secrets_json['neo4j']['connect_url']
user =  wsu.secrets_json['neo4j']['username']
password = wsu.secrets_json['neo4j']['password']

# Get the neo4j object
from cypher_utils import CypherUtilities
cu = CypherUtilities(
    uri=uri, user=user, password=password, driver=None, s=s, ha=ha
)

try:
    version_str = cu.driver.get_server_info().agent
    print(f'======== {version_str} ========')
except ServiceUnavailable as e:
    print('You need to start Neo4j as a console')
    raise
except Exception as e:
    print(f'{e.__class__}: {str(e).strip()}')

from hc_utils import HeaderCategories
hc = HeaderCategories(cu=cu, verbose=False)

from lr_utils import LrUtilities
lru = LrUtilities(ha=ha, cu=cu, hc=hc, verbose=False)

from section_classifier_utils import SectionLRClassifierUtilities, SectionSGDClassifierUtilities, SectionCRFClassifierUtilities
slrcu = SectionLRClassifierUtilities(ha=ha, cu=cu, verbose=False)
ssgdcu = SectionSGDClassifierUtilities(ha=ha, cu=cu, verbose=False)
scrfcu = SectionCRFClassifierUtilities(cu=cu, ha=ha, verbose=False)

from crf_utils import CrfUtilities
crf = CrfUtilities(ha=ha, hc=hc, cu=cu, lru=lru, slrcu=slrcu, verbose=True)

from section_utils import SectionUtilities
su = SectionUtilities(wsu=wsu, crf=crf, verbose=False)

duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
print(f'Utility libraries created in {duration_str}')

Utility libraries created in 5 seconds


In [4]:

# Train the isheader classifier
t1 = time.time()
from is_header_sgd_classifier import IsHeaderSgdClassifier
ihu = IsHeaderSgdClassifier(ha=ha, cu=cu, verbose=False)
ihu.build_pos_stochastic_gradient_descent_elements(verbose=True)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Is-header classifier trained in {duration_str}')

I have 49,087 hand-labeled header htmls prepared
7 iterations seen during training fit for a total of 49,087 records trained
Is-header classifier trained in 6 seconds


In [5]:

# Train the POS SGD classifier
t1 = time.time()
if not hasattr(ssgdcu, 'pos_predict_percent_fit_dict'):
    ssgdcu.build_pos_stochastic_gradient_descent_elements(verbose=True)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech SGD classifier trained in {duration_str}')

I have 49,102 labeled parts of speech in here
Parts-of-speech SGD classifier trained in 8 seconds


In [6]:

# Check if the lru has built its is-qualified classifier
t0 = time.time()
if not hasattr(lru, 'ISQUALIFIED_LR'):
    lru.build_isqualified_logistic_regression_elements(sampling_strategy_limit=5_000, verbose=True)
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
print(f'Is-qualified LR elements built in {duration_str}')

I have 424,879 is-qualified vocabulary tokens in here
Is-qualified LR elements built in 4 seconds


In [7]:

# Train the POS CRF classifier
t1 = time.time()
if not hasattr(scrfcu, 'pos_symbol_crf'):
    scrfcu.build_pos_conditional_random_field_elements(verbose=True)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech CRF classifier trained in {duration_str}')

Parts-of-speech CRF classifier trained in 1 second


In [8]:

# Seek a SectionLRClassifierUtilities object
if hasattr(slrcu, 'pos_predict_percent_fit_dict'):
    pos_lr_predict_single = slrcu.predict_single
elif crf.is_flask_running():
    pos_lr_predict_single = crf.get_pos_lr_predict_single_from_api
else:
    pos_lr_predict_single = None

# Seek a SectionCRFClassifierUtilities object
if hasattr(scrfcu, 'pos_symbol_crf'):
    pos_crf_predict_single = scrfcu.predict_single
elif crf.is_flask_running():
    pos_crf_predict_single = crf.get_pos_crf_predict_single_from_api
else:
    pos_crf_predict_single = None

# Seek a SectionSGDClassifierUtilities object
if hasattr(ssgdcu, 'pos_predict_percent_fit_dict'):
    pos_sgd_predict_single = ssgdcu.predict_single
elif crf.is_flask_running():
    pos_sgd_predict_single = crf.get_pos_sgd_predict_single_from_api
else:
    pos_sgd_predict_single = None

In [9]:

# Check if the crf has built its parts-of-speech classifier
# POS classifier trained in 12 hours, 15 minutes and 36 seconds
t1 = time.time()
if not hasattr(crf, 'CRF'):
    crf.build_pos_conditional_random_field_elements(verbose=True)
if hasattr(crf, 'CRF'):
    print('predict_single is now available')
else:
    print('predict_single is not available')
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'POS classifier trained in {duration_str}')

predict_single is now available
POS classifier trained in 0 seconds


In [10]:

file_name = '20231130141017677230_100_Remote_Opportunity_for_Data_Scientist_Sr_Satyam_Kumar_Pandey.html'


---
# Training

In [11]:

# You need to run this again if you changed the qualification dictionary in another notebook
t0 = time.time()
lru.sync_basic_quals_dict(sampling_strategy_limit=None, verbose=False)
lru.retrain_isqualified_classifier(verbose=True)
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
print(f'Is-qualified classifer retrained in {duration_str}')

I have 13,281 hand-labeled qualification strings in here
I have 440,750 is-qualified vocabulary tokens in here
Is-qualified classifer retrained in 7 seconds



----
# Prepare cover sheet

In [12]:

# Show what qualifications you have for this posting
t0 = time.time()
ask_str = ''
child_strs_list = ha.get_child_strs_from_file(file_name=file_name)

feature_tuple_list = []
for feature_dict in cu.get_feature_dict_list(ha.get_child_tags_list(child_strs_list), child_strs_list):
    feature_tuple_list.append(hc.get_feature_tuple(
        feature_dict, pos_lr_predict_single=pos_lr_predict_single, pos_crf_predict_single=pos_crf_predict_single,
        pos_sgd_predict_single=pos_sgd_predict_single
    ))

crf_list = crf.CRF.predict_single(crf.sent2features(feature_tuple_list))
indices_list = su.find_basic_quals_section_indexes(child_strs_list=child_strs_list, crf_list=crf_list, file_name=file_name)
quals_list = [child_str for i, child_str in enumerate(child_strs_list) if i in indices_list]
prediction_list = list(lru.predict_job_hunt_percent_fit(quals_list))
_, qual_count = lru.get_quals_str(prediction_list, quals_list)
job_fitness = qual_count/len(prediction_list)
d = enchant.Dict('en_US')
job_title = ' '.join([w for w in file_name.replace('.html', '').replace('_Indeed_com', '').split('_') if d.check(w)])
met_str = f'<p>I only meet {job_fitness:.1%} of the minimum requirements for the {job_title} position, but I can explain:</p>'
ask_str += met_str
display(HTML(met_str))
for i, qual_str in enumerate(quals_list):
    if qual_str in lru.basic_quals_dict:
        if lru.basic_quals_dict[qual_str]:
            met_str = f'{i+1}) {qual_str}'
            ask_str += ' ' + met_str
            idx = qual_str.find('>')
            if idx == -1:
                display(HTML(met_str))
            else:
                display(HTML(f'{qual_str[:idx+1]}{i+1}) {qual_str[idx+1:]}'))
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
print(f'Qualifications shown in {duration_str}')

Qualifications shown in 4 seconds


In [13]:

unmet_str = "<p>The minimum requirements that I don't meet are:</p>"
display(HTML(unmet_str))
for i, qual_str in enumerate(quals_list):
    if (qual_str not in lru.basic_quals_dict) or not lru.basic_quals_dict[qual_str]:
        met_str = f'{i+1}) {qual_str}'
        unmet_str += ' ' + met_str
        idx = qual_str.find('>')
        if idx == -1:
            display(HTML(met_str))
        else:
            display(HTML(f'{qual_str[:idx+1]}{i+1}) {qual_str[idx+1:]}'))

In [14]:

# This doesn't work unless you score all the O-PQs
db_pos_list = []
for navigable_parent in child_strs_list:
    db_pos_list = cu.append_parts_of_speech_list(navigable_parent, pos_list=db_pos_list)
pos_list = []
for i, (crf_symbol, db_symbol) in enumerate(zip(crf_list, db_pos_list)):
    if db_symbol in [None, 'O', 'H']:
        pos_list.append(crf_symbol)
    else:
        pos_list.append(db_symbol)
met_str = f"<p>The preferred requirements that I meet are:</p>"
display(HTML(met_str))
min_str = ''
pqs_list = [child_str for pos_str, child_str in zip(pos_list, child_strs_list) if (pos_str in ['O-PQ'])]
for i, qual_str in enumerate(pqs_list):
    if qual_str in lru.basic_quals_dict:
        if lru.basic_quals_dict[qual_str]:
            pref_str = f'{i+1}) {qual_str}'
            min_str += ' ' + pref_str
            idx = qual_str.find('>')
            if idx == -1:
                display(HTML(pref_str))
            else:
                display(HTML(f'{qual_str[:idx+1]}{i+1}) {qual_str[idx+1:]}'))
if min_str:
    ask_str += met_str + min_str
winsound.Beep(freq, duration)


----
# Write cover sheet

In [15]:

topic = 'interested'
if topic == 'recruiter':
    recruiter_name = 'Joseph St.Denis'
    youchat_str = f"Reply to the recruiter email that you don't meet {1-job_fitness:.1%} of the requirements"
    youchat_str += f" ({unmet_str}), though you do meet these criterion: {ask_str} and am interested in applying for the job."
    #and have applied for the job.
    youchat_str += f" (Replace [Your Name] with Dave Babbitt, Replace [Recruiter] with {recruiter_name})"
elif topic == 'cover':
    import pandas as pd
    cypher_str = f"""
        MATCH (fn:FileNames {{file_name: "{file_name}"}})
        RETURN
            fn.role_primary_contact AS role_primary_contact,
            fn.role_primary_contact_email_id AS role_primary_contact_email_id,
            fn.role_title AS role_title
        ORDER BY fn.percent_fit DESC;"""
    cover_df = pd.DataFrame(cu.get_execution_results(cypher_str, verbose=False))
    recruiter_name = cover_df.role_primary_contact.squeeze()
    email_address = cover_df.role_primary_contact_email_id.squeeze()
    role_title = cover_df.role_title.squeeze()
    if (recruiter_name is None) or (email_address is None):
        suffix_str = ''
    else:
        suffix_str = f' to "{recruiter_name}" <{email_address}>'
    youchat_str = f"Write a cover letter email{suffix_str}, complete with subject, using this information:"
    youchat_str += f" {ask_str} Replace [Your Name] with Dave Babbitt"
elif topic == 'zoom':
    interviewer_name = 'Dan, David, Alex, and Melinda'
    company_name = '3GIMBALS'
    youchat_str = f"Write a follow up thank you note for an interview using this information: a) Interviewer Name: {interviewer_name}, b)"
    youchat_str += f" Position: {job_title}, c) Company Name {company_name}, d) relevant skills: {ask_str}, e) Your Name: Dave Babbitt."
    youchat_str += f" Ask about going over the programming exercise."
elif topic == 'phone':
    interviewer_name = 'Dan, David, Alex, and Melinda'
    company_name = '3GIMBALS'
    interviewer_title = 'interview team'
    youchat_str = f"Write an email, complete with subject, to {interviewer_name} about what a pleasure it was to talk to them, the"
    youchat_str += f" {interviewer_title}, on the phone about the {job_title} position with {company_name}. Replace [Your Name]"
    youchat_str += " with Dave Babbitt"
elif topic == 'interested':
    file_path = '../data/txt/resume.txt'
    with open(file_path, 'r') as file:
        resume_str = file.read().rstrip()
    task_strs_list = []
    for task_str in [child_str for pos_str, child_str in zip(pos_list, child_strs_list) if (pos_str in ['O-TS'])]:
        task_strs_list.append(task_str)
    company_name = child_strs_list[1]
    youchat_str = f"Explain in first person singular why I would be interested in this role at {company_name}, given\n\n1)"
    youchat_str += f" this information about the task scope:\n\n{' '.join(task_strs_list)}\n\nand, 2) my resume:\n\n{resume_str}"
elif topic == 'question':
    file_path = '../data/txt/resume.txt'
    with open(file_path, 'r') as file:
        resume_str = file.read().rstrip()
    task_strs_list = []
    for task_str in [child_str for pos_str, child_str in zip(pos_list, child_strs_list) if (pos_str in ['O-TS'])]:
        task_strs_list.append(task_str)
    company_name = child_strs_list[1]
    youchat_str = f"Pretend you have the competencies and experience listed on the resume. Explain in first person singular"
    youchat_str += " how you manage projects, communicate with clients, discover their needs, make recommendations, stay in budget,"
    youchat_str += f" manage changing requirements, and produce results, given this resume:\n\n{resume_str[75:]}"
elif topic == 'rejected':
    job_title = 'Senior Data Engineering Analyst, Platform Engineering (Remote, Anywhere in US)'
    recruiting_team_name = 'Humana Recruiting Team'
    company_name = 'Humana'
    youchat_str = f"Write a reply to the {recruiting_team_name} rejection letter for the {job_title} position at {company_name},"
    youchat_str += " persuading the recruiting team to explain in more detail why I was rejected for the role. Replace [Name] with"
    youchat_str += " Dave Babbitt"
print(youchat_str)

Explain in first person singular why I would be interested in this role at <span style="font-family:Century Gothic,sans-serif">Data Scientist Sr</span>, given

1) this information about the task scope:

CareFirst is seeking a Data Scientist to support the Machine Learning component of its AuditIQ Product which helps proactively identify claim overpayments prior to FEPOC’s annual OIG (Office of Inspector General) Audits. C <strong>areFirst is</strong> <span style="font-family:Century Gothic,sans-serif">NOT</span> <span style="font-family:Century Gothic,sans-serif">looking for a data science researcher. They are seeking a resource that can assist with the ingestion of data along with the deployment and production of machine learning scripts.</span> •Identifies and solves business problems by using various numerical techniques, algorithms, and models in statistical modeling, machine learning, operations research, and data mining. •Uses advanced analytical capabilities to support data scie


----

In [60]:

import urllib.parse

driver = wsu.get_driver(verbose=False)
youchat_url = f'https://you.com/search?q={urllib.parse.quote_plus(youchat_str)}&fromSearchBar=true&tbm=youchat'
wsu.driver_get_url(driver, youchat_url, verbose=False)
print(youchat_str)
winsound.Beep(freq, duration); raise

Write a cover letter email, complete with subject, using this information: <p>I only meet 80.0% of the minimum requirements for the Data Scientist London England United Kingdom Remote position, but I can explain:</p> 1) <li>No formal academic, postgraduate or professional qualifications are</li> 2) <li>Strong previous experience as a Data Scientist.</li> 3) <li>Proven knowledge of programming skills and database knowledge.</li> 5) <li>Database experience, ideally AWS tech stack, MongoDB, and PostgreSQL</li> 6) <li>Experience with integrating various other public/proprietary datasets is highly desirable.</li> 7) <li>Strong understanding and implementation experience of predictive modelling algorithms such as regressions, time series, clustering and decision trees.</li> 9) <li>Ability to analyse data to drive actions.</li> 10) <li>Familiarity dealing with trade-offs between model performance and business needs.</li> Replace [Your Name] with Dave Babbitt


RuntimeError: No active exception to reraise


### Check the back FireFox window to make sure the chat writing has stopped before running this next cell.

In [61]:

from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

css_selector = 'div[data-testid="youchat-answer-turn-0"]'
try:
    web_element = driver.find_element(By.CSS_SELECTOR, css_selector)
    print(web_element.text)
except NoSuchElementException as e:
    pass
except Exception as e:
    print(f'{e.__class__.__name__} error: {str(e).strip()}')
finally:
    driver.close()

Dear Hiring Manager,
I am writing to apply for the Data Scientist London England United Kingdom Remote position. Although I only meet 80.0% of the minimum requirements listed, there are several aspects of my experience that I believe make me a strong candidate for this position.
First, while I do not have any formal academic, postgraduate, or professional qualifications, I have extensive experience as a Data Scientist. In my previous roles, I have demonstrated the ability to apply predictive modelling algorithms, such as regressions, time series, and decision trees, to drive actions and improve business outcomes. I am also skilled in programming and database knowledge, with experience in AWS tech stack, MongoDB, and PostgreSQL.
In addition, I have experience integrating multiple public and proprietary datasets, which I believe would be beneficial in this position. I am familiar with the trade-offs between model performance and business needs, and I excel at analyzing data to drive acti