In [1]:

%pprint
%matplotlib inline
import sys
import os.path as osp

executable_path = sys.executable; scripts_folder = osp.join(osp.dirname(executable_path), 'Scripts')
py_folder = osp.abspath('../py'); ffmpeg_folder = r'C:\ffmpeg\bin'
if (scripts_folder not in sys.path): sys.path.insert(1, scripts_folder)
if (py_folder not in sys.path): sys.path.insert(1, py_folder)
if (ffmpeg_folder not in sys.path): sys.path.insert(1, ffmpeg_folder)
from jobpostlib import (crf, cu, datetime, duration, hau, hc, humanize, ihu, lru, nu, osp, scrfcu, slrcu, ssgdcu, su, t0, time, wsu, speech_engine)
from pandas import DataFrame
import re
import pyperclip
import ipywidgets as widgets
from IPython.display import display

Pretty printing has been turned OFF
Utility libraries created in 8 seconds


In [2]:

# Check if the slrcu has built its parts-of-speech logistic regression model
# Parts-of-speech logistic regression model is normally built in 1 hour, 10 minutes and 4 seconds
t1 = time.time()
if not hasattr(slrcu, 'pos_predict_percent_fit_dict'):
    slrcu.build_pos_logistic_regression_elements(sampling_strategy_limit=None, verbose=True)
if hasattr(slrcu, 'pos_predict_percent_fit_dict'): print('predict_single is available')
else: print('predict_single is not available')
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'Parts-of-speech logistic regression model built in {duration_str}'; print(speech_str)

I have 48,199 labeled parts of speech in here


Train the POS Classifiers: 100%|███████████████| 25/25 [00:00<00:00, 354.40it/s]

predict_single is available
Parts-of-speech logistic regression model built in 6 seconds





# Speed/Accuracy Tradeoffs and Synthetic Dataset Generation

## Speed vs. Accuracy Tradeoffs
- To run the job-hunting pipeline efficiently on a laptop, I had to make **speed vs. accuracy tradeoffs**.
- For predicting the category of each `navigable_parent` (HTML navigable text wrapped in its parent tag), I chose **Logistic Regression**:
  - Logistic Regression is relatively fast and interpretable.
  - It sacrifices some accuracy compared to more complex machine learning models like neural networks.


## Training Data Imbalance Challenge
- The training data is highly **imbalanced**. For example, when training on **Post Date Headers (H-PD)**:
  - I have only **5 examples** of the target class.
  - There are **48,123 labeled parts of speech of non-examples**.
- This imbalance makes it difficult for the model to learn meaningful patterns for the specific class (H-PD).
- As a result, the model may perform poorly when classifying H-PD examples.

## Synthetic Dataset Generation with LLMs
- To address the training data imbalance, I am generating a **synthetic dataset** using **Large Language Models (LLMs)**:
  - I provide the few available examples as input to the LLM.
  - I refine the generation process using:
    - **Prompt engineering** before generation.
    - **Regular expressions** to clean and validate the generated examples post-generation.
    - Letting the **Logistic Regression** model pick out the best of the generated examples.
- The LLM generates additional synthetic data that resembles the target class (H-PD) closely.


## Speed and LLM Implementation Issues
- The synthetic dataset generation process is implemented using **Cohere's API**:
  - I haven't exceeded my current quota.
  - The offline open source models I have on my laptop are too slow, preventing fast turnaround.
- Manually fixing some labels in the training dataset forces me to retrain the POS Classifiers, which takes about 2 hours.

## Goals and Future Testing
- By combining **Logistic Regression** with **synthetic data augmentation**, I aim to:
  - Strike a balance between **speed**, **accuracy**, and **practicality**.
  - Improve the overall performance of the machine learning pipeline.
- I am assuming adding labeled synthetic `navigable_parent`s significantly enhances model performance without compromising efficiency. I will test when I can find some more examples in the wild.


## Get some example job posting HTML

In [3]:

# Get all at-least-partially tagged file names
cypher_str = f'''
    // Get the tagged node counts for each file
    MATCH (pos:PartsOfSpeech)-[r1:SUMMARIZES]->(np1:NavigableParents)-[r2:NEXT]->(np2:NavigableParents)
    WITH
        r2.file_name AS file_name,
        COUNT(r1) AS tagged_count,
        COUNT(r2) AS edge_count,
        COUNT(np1) AS np_count
    RETURN np_count, tagged_count, edge_count, file_name
    ORDER BY edge_count DESC;'''
row_objs_list = []
with cu.driver.session() as session: row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
if row_objs_list:
    tagged_node_counts_df = DataFrame(row_objs_list)
    if tagged_node_counts_df.shape[0]:
        display(tagged_node_counts_df.sample(5))

Unnamed: 0,np_count,tagged_count,edge_count,file_name
1178,16,16,16,c0cbfde3d10e39b6_Experimentation_Data_Scientis...
2493,7,7,7,ce2c3bfeb11aebb9_QA_Engineer_Morris_Plains_NJ_...
2265,8,8,8,f8e9caebfcdaec5c_React_Full_Stack_Developer_Re...
3754,3,3,3,1008058068135_Systems_Analyst_III.html
2874,5,5,5,1681373_Sr_Fullstack_Developer_Bertoni_Solutio...


In [4]:

# Get all POS symbols for the at-least-partially tagged files
filenames_list = tagged_node_counts_df.file_name.tolist()
filenames_str = '", "'.join(filenames_list)
cypher_str = f'''
    // Get child string and POS for each at-least-partially tagged file
    MATCH (pos:PartsOfSpeech)-[r1:SUMMARIZES]->(np1:NavigableParents)-[r2:NEXT]->(np2:NavigableParents)
    WHERE
        r2.file_name IN ["{filenames_str}"]
    RETURN
        np1.navigable_parent AS text,
        pos.pos_symbol AS pos_symbol;'''
# print(cypher_str)
row_objs_list = []
with cu.driver.session() as session: row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
if row_objs_list:
    file_tags_df = DataFrame(row_objs_list)
    if file_tags_df.shape[0]:
        display(file_tags_df.sample(5))

Unnamed: 0,text,pos_symbol
43278,"<div class=""css-tvvxwd ecydgvn1"">Paid time off...",O-SP
47051,<li>Health insurance</li>,O-SP
37219,<li>Day shift</li>,O-JD
16418,<li>Experience with any or all of the followin...,O-RQ
34570,<p>Work Location: Remote</p>,O-OL


In [7]:

# Get the most unbalance POS symbol
min_support_pos_symbol = file_tags_df.groupby('pos_symbol').count().reset_index().rename(columns={'text': 'labeled_count'}).sort_values('labeled_count').iloc[0].pos_symbol
min_recall_pos_symbol = nu.load_object('min_recall_pos_symbol')
print(f'The training data is highly imbalanced. For example, when training on {hc.POS_EXPLANATION_DICT[min_recall_pos_symbol]} ({min_recall_pos_symbol}):')

# Get all summarized child strings
cypher_str = f'''
    // Get all summarized child strings
    MATCH (pos:PartsOfSpeech)-[r1:SUMMARIZES]->(np1:NavigableParents)
    WHERE pos.pos_symbol IN ["{min_recall_pos_symbol}"]
    RETURN DISTINCT np1.navigable_parent AS text;'''
# print(cypher_str)
row_objs_list = []
with cu.driver.session() as session: row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
if row_objs_list:
    df = DataFrame(row_objs_list)
    base_headers = df.text.to_list()
    if df.shape[0]:
        print(f'    I have only {df.shape[0]:,} examples of the target class.')
        print(f'    There are {file_tags_df.shape[0]:,} labeled parts of speech of non-examples.')
        display(df.head(5))

The training data is highly imbalanced. For example, when training on Interview Procedures Header (H-IP):
    I have only 140 examples of the target class.
    There are 57,618 labeled parts of speech of non-examples.


Unnamed: 0,text
0,<b>Application Process</b>
1,<b>GS-12</b>
2,<p>HOW TO APPLY</p>
3,"to join our VIRTUAL team, where we accommodate..."
4,<b>(Additional relevant work experience and/or...



## Generate synthetic job posting HTML

In [8]:

import cohere

# Initialize the Cohere client
co_key = wsu.secrets_json['Cohere_API_Key']
co = cohere.Client(co_key)

# Define the base headers for Post Date Headers
prompt = f"Output a python list (not a function that generates a list) of job posting HTML examples similar to these {hc.POS_EXPLANATION_DICT[min_recall_pos_symbol]}s ({min_recall_pos_symbol}):\n" + str(base_headers)
print(prompt)

Output a python list (not a function that generates a list) of job posting HTML examples similar to these Interview Procedures Headers (H-IP):
['<b>Application Process</b>', '<b>GS-12</b>', '<p>HOW TO APPLY</p>', 'to join our VIRTUAL team, where we accommodate all US timezones for collaborative mid-day sessions and activities. The person we are looking for will help us develop and commercialize progressive software solutions to BigData challenges. The successful candidate will:', '<b>(Additional relevant work experience and/or education from an accredited college, university or technical school may be substituted.)</b>', "<p>A qualified candidate's online application and resume must demonstrate at least one year of specialized experience equivalent to the next lower grade level in the Federal service. Specialized experience for these positions are defined as:</p>", 'For immediate consideration, please send your resume to jbeauliere@matlensilver.com', '<div>The physical demands describe

In [23]:

element_strs_set = set([
    ''
])

In [24]:

# Generate synthetic headers
response = co.generate(
    prompt=prompt,
    model='command-xlarge-nightly',
    num_generations=5,
    max_tokens=200,
    temperature=0.7,
)
generation_texts_list = [cohere_generation_obj.text for cohere_generation_obj in response.generations]
print(max(generation_texts_list, key=lambda x: len(str(x))))

unknown field: parameter model is not a valid field


Here is a Python list of job posting HTML examples similar to the provided Post Date Headers (H-PD):

```python
job_posting_html_examples = [
    '<h3>Posted on:</h3>',
    '<strong>Date Posted:</strong>',
    '<p>Job Posted: </p>',
    '<div class="post-date">Date: </div>',
    '<span>Posted: </span>',
    '<label>Post Date:</label>',
    '<b>Job Ad Date:</b>',
    '<i>Posted</i>',
    '<div>Application Deadline: </div>',
    '<p>Application Due: </p>'
]
```


In [26]:

for generation_text in generation_texts_list:
    # display(generation_text)
    element_strs_list = re.findall(r"""['`]([^\\'`\]\[]+)['`],?""", generation_text)
    for element_str in element_strs_list:
        html_str = element_str.strip('\'`').strip()
        if html_str.startswith('<') or html_str.endswith('>'):
            html_str = hau.get_navigable_children(hau.get_body_soup(html_str), [])[0].strip()

            # Disqualify text surrounded by inline elements
            if not any(map(lambda pe: html_str.endswith(f'</{pe}>'), cu.inline_elements_set)):
                element_strs_set.add(html_str)
        else:
            element_strs_set.add(html_str)

In [28]:

# Display the HTML that the classifier thinks looks good
db_pos_list = []
child_strs_list = sorted(element_strs_set)
for navigable_parent in child_strs_list:
    db_pos_list = cu.append_parts_of_speech_list(navigable_parent, pos_list=db_pos_list, verbose=False)
for navigable_parent, db_pos_symbol in zip(child_strs_list, db_pos_list):
    if db_pos_symbol and ('-' in db_pos_symbol):
        # print(f'*{db_pos_symbol}: {navigable_parent}')
        continue
    else:
        assert hasattr(slrcu, 'pos_predict_percent_fit_dict'), 'slrcu.predict_single needs to be available'
        pr_pos_symbol = slrcu.predict_single(navigable_parent)
        if pr_pos_symbol == min_recall_pos_symbol:
            print(f'{pr_pos_symbol}: {navigable_parent}')

H-PD: <b>Announcement Date:</b>
H-PD: <b>Commencement Date:</b>
H-PD: <b>Job Posting Date:</b>
H-PD: <b>Launch Date:</b>
H-PD: <b>Post Date:</b>
H-PD: <b>Update Date:</b>
H-PD: <div>Date Published</div>
H-PD: <div>Job Posting Details</div>
H-PD: <h3>Job Posting Date:</h3>
H-PD: <h4>Job Posting</h4>
H-PD: <label>Job Posting</label>
H-PD: <p>Job Posting Date:</p>



## Add (pos:PartsOfSpeech)-[r:SUMMARIZES]->(np:NavigableParents) pairs to the database

In [34]:

# Take these hand-picked HTML strings and add them to the database
from IPython.display import clear_output

child_strs_list = [
    '<b>Advertisement Date</b>', '<b>Announced On:</b>', '<b>Announcement Date:</b>', '<b>Announcement Date</b>', '<b>Announcement Release Date:</b>',
    '<b>Commencement Date:</b>', '<b>Date Announced:</b>', '<b>Date Posted:</b>', '<b>Date Updated:</b>', '<b>Date of Advertisement:</b>',
    '<b>Date of Posting:</b>', '<b>Date of Publication:</b>', '<b>Effective From:</b>', '<b>Job Ad Date:</b>', '<b>Job Advertisement:</b>',
    '<b>Job Posted On:</b>', '<b>Job Posted:</b>', '<b>Job Posting :</b>', '<b>Job Posting Date:</b>', '<b>Job Posting</b>', '<b>Last Updated:</b>',
    '<b>Launch Date:</b>', '<b>Post Date:</b>', '<b>Post Date</b>', '<b>Posted On:</b>', '<b>Posted on:</b>', '<b>Publication Date:</b>',
    '<b>Published on:</b>', '<b>Recent Posting:</b>', '<b>Recent Update</b>', '<b>Release Date:</b>', '<b>Time of Posting:</b>',
    '<b>Update Date:</b>', '<div class="job-date">Posted on</div>', '<div class="job-meta">Posted on</div>',
    '<div class="job-posting-date">Posted</div>', '<div class="post-date">Date:</div>', '<div class="post-date">Posted on</div>',
    '<div class="post-date">Posted:</div>', '<div class="post-date">Posted</div>', '<div>Date Posted</div>', '<div>Date Published:</div>',
    '<div>Date Published</div>', '<div>Date of Publication</div>', '<div>Job Posted:</div>', '<div>Published</div>', '<div>Update Date</div>',
    '<h3>Date Posted:</h3>', '<h3>Date Posted</h3>', '<h3>Job Posted on:</h3>', '<h3>Job Posting Date:</h3>', '<h3>Post Date</h3>', '<h3>Posted on:</h3>',
    '<h3>Publication</h3>', '<h4>Job Posting</h4>', '<label>Job Listing Date:</label>', '<label>Job Posting</label>', '<label>Post Date:</label>',
    '<label>Posted:</label>', '<label>Published:</label>', '<p>Date Posted:</p>', '<p>Date Published:</p>', '<p>Date of Publication:</p>',
    '<p>Job Posted:</p>', '<p>Job Posting Date:</p>', '<p>Post Date</p>', '<p>Posted Date:</p>', 'Available From:', 'Job Posted:', 'Posted', 'Posted on:',
    'Published on:', 'We posted this job on', 'date-posted', 'post-date'
]

In [40]:

# Take the hand-picked HTML strings and add their characteristics to the database
for child_str in child_strs_list:
    def create_characteristics(tx, navigable_parent, verbose=True):
        cypher_str = '''
            // Set the characteristics of the navigable_parent
            MERGE (np:NavigableParents {navigable_parent: $navigable_parent})
            SET
                np.is_job_title = false,
                np.is_corporate_scope = false,
                np.is_task_scope = false,
                np.is_minimum_qualification = false,
                np.is_preferred_qualification = false,
                np.is_supplemental_pay = false,
                np.is_office_location = false,
                np.is_job_duration = false,
                np.is_interview_procedure = false,
                np.is_legal_notification = false,
                np.is_other = false,
                np.is_posting_date = true,
                np.is_header = true
            RETURN
                np.navigable_parent AS navigable_parent,
                np.is_posting_date AS is_np_posting_date,
                np.is_header AS is_np_header;'''
        if verbose:
            clear_output(wait=True)
            print(cypher_str.replace('$navigable_parent', f'"{navigable_parent}"'))
        return [dict(record.items()) for record in tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})]
    row_objs_list = []
    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(create_characteristics, navigable_parent=child_str)
    if row_objs_list:
        df = DataFrame(row_objs_list)
        if df.shape[0]:
            display(df.head(5))


            // Set the characteristics of the navigable_parent
            MERGE (np:NavigableParents {navigable_parent: "post-date"})
            SET
                np.is_job_title = false,
                np.is_corporate_scope = false,
                np.is_task_scope = false,
                np.is_minimum_qualification = false,
                np.is_preferred_qualification = false,
                np.is_supplemental_pay = false,
                np.is_office_location = false,
                np.is_job_duration = false,
                np.is_interview_procedure = false,
                np.is_legal_notification = false,
                np.is_other = false,
                np.is_posting_date = true,
                np.is_header = true
            RETURN
                np.navigable_parent AS navigable_parent,
                np.is_posting_date AS is_np_posting_date,
                np.is_header AS is_np_header;


Unnamed: 0,navigable_parent,is_np_posting_date,is_np_header
0,post-date,True,True


In [41]:

# Take the hand-picked HTML strings and remove their bad relationships to the database
for child_str in child_strs_list:
    def remove_relationships(tx, navigable_parent, pos_symbol, verbose=True):
        cypher_str = '''
            // Remove the SUMMARIZES relationships that are not pos_symbol
            MATCH (pos:PartsOfSpeech)-[r:SUMMARIZES]->(np:NavigableParents {navigable_parent: $navigable_parent})
            WHERE NOT (pos.pos_symbol = $pos_symbol)
            DELETE r
            RETURN pos, r, np;'''
        if verbose:
            clear_output(wait=True)
            print(cypher_str.replace('$navigable_parent', f'"{navigable_parent}"').replace('$pos_symbol', f'"{pos_symbol}"'))
        return [dict(record.items()) for record in tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent, 'pos_symbol': pos_symbol})]
    row_objs_list = []
    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(remove_relationships, navigable_parent=child_str, pos_symbol=min_recall_pos_symbol)
    if row_objs_list:
        df = DataFrame(row_objs_list)
        if df.shape[0]:
            display(df.head(5))


            // Remove the SUMMARIZES relationships that are not pos_symbol
            MATCH (pos:PartsOfSpeech)-[r:SUMMARIZES]->(np:NavigableParents {navigable_parent: "post-date"})
            WHERE NOT (pos.pos_symbol = "H-PD")
            DELETE r
            RETURN pos, r, np;


In [44]:

# Take the hand-picked HTML strings and add the correct relationships
for child_str in child_strs_list:
    def create_relationship(tx, navigable_parent, pos_symbol, verbose=True):
        cypher_str = '''
            // Create a SUMMARIZES relationship if it doesn't already exist
            MATCH (pos:PartsOfSpeech {pos_symbol: $pos_symbol})
            MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
            MERGE (pos)-[r:SUMMARIZES]->(np)
            RETURN
                pos.pos_explanation AS pos_explanation,
                pos.pos_symbol AS pos_symbol,
                pos.is_posting_date AS is_pos_posting_date,
                pos.is_header AS is_pos_header,
                type(r) AS relationship_type,
                np.navigable_parent AS navigable_parent,
                np.is_posting_date AS is_np_posting_date,
                np.is_header AS is_np_header;'''
        if verbose:
            clear_output(wait=True)
            print(cypher_str.replace('$navigable_parent', f'"{navigable_parent}"').replace('$pos_symbol', f'"{pos_symbol}"'))
        return [dict(record.items()) for record in tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent, 'pos_symbol': pos_symbol})]
    row_objs_list = []
    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(create_relationship, navigable_parent=child_str, pos_symbol=min_recall_pos_symbol)
    if row_objs_list:
        df = DataFrame(row_objs_list)
        if df.shape[0]:
            display(df.head(5).T)


            // Create a SUMMARIZES relationship if it doesn't already exist
            MATCH (pos:PartsOfSpeech {pos_symbol: "H-PD"})
            MATCH (np:NavigableParents {navigable_parent: "post-date"})
            MERGE (pos)-[r:SUMMARIZES]->(np)
            RETURN
                pos.pos_explanation AS pos_explanation,
                pos.pos_symbol AS pos_symbol,
                pos.is_posting_date AS is_pos_posting_date,
                pos.is_header AS is_pos_header,
                type(r) AS relationship_type,
                np.navigable_parent AS navigable_parent,
                np.is_posting_date AS is_np_posting_date,
                np.is_header AS is_np_header;


Unnamed: 0,0
pos_explanation,Post Date Header
pos_symbol,H-PD
is_pos_posting_date,True
is_pos_header,True
relationship_type,SUMMARIZES
navigable_parent,post-date
is_np_posting_date,True
is_np_header,True



----

In [30]:

print(f'''
element_strs_set = set({sorted(element_strs_set)})''')


element_strs_set = set(['<b>Advertisement Date</b>', '<b>Announced On:</b>', '<b>Announcement Date:</b>', '<b>Announcement Date</b>', '<b>Announcement Release Date:</b>', '<b>Commencement Date:</b>', '<b>Date Announced:</b>', '<b>Date Posted:</b>', '<b>Date Updated:</b>', '<b>Date of Advertisement:</b>', '<b>Date of Posting:</b>', '<b>Date of Publication:</b>', '<b>Effective From:</b>', '<b>Job Ad Date:</b>', '<b>Job Advertisement:</b>', '<b>Job Posted On:</b>', '<b>Job Posted:</b>', '<b>Job Posting :</b>', '<b>Job Posting Date:</b>', '<b>Job Posting</b>', '<b>Last Updated:</b>', '<b>Launch Date:</b>', '<b>Post Date:</b>', '<b>Post Date</b>', '<b>Posted On:</b>', '<b>Posted on:</b>', '<b>Publication Date:</b>', '<b>Published on:</b>', '<b>Recent Posting:</b>', '<b>Recent Update</b>', '<b>Release Date:</b>', '<b>Time of Posting:</b>', '<b>Update Date:</b>', '<div class="job-date">Posted on</div>', '<div class="job-meta">Posted on</div>', '<div class="job-posting-date">Posted</div>', '<

In [12]:

# Get all at-least-partially tagged file names
cypher_str = '''
    // Remove the SUMMARIZES relationships that are not "O-PD"
    MATCH (pos:PartsOfSpeech)-[r1:SUMMARIZES]->(np1:NavigableParents)
    WHERE
        np1.navigable_parent IN ['<span class="jobsearch-HiringInsights-entry--text">Posted 7 days ago</span>']
        AND NOT (pos.pos_symbol = "O-PD")
    DELETE r1;'''
pyperclip.copy(cypher_str)