In [84]:

# Load needed libraries and functions
%matplotlib inline
%pprint
import sys
import os.path as osp

executable_path = sys.executable; scripts_folder = osp.join(osp.dirname(executable_path), 'Scripts')
py_folder = osp.abspath('../py'); ffmpeg_folder = r'C:\ffmpeg\bin'
if (scripts_folder not in sys.path): sys.path.insert(1, scripts_folder)
if (py_folder not in sys.path): sys.path.insert(1, py_folder)
if (ffmpeg_folder not in sys.path): sys.path.insert(1, ffmpeg_folder)
from jobpostlib import (cu, datetime, nu, humanize, time, lru)
from nltk.tokenize import sent_tokenize
from pandas import DataFrame
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pyperclip
import re

# Download necessary NLTK data
nltk.download('punkt')

Pretty printing has been turned OFF


[nltk_data] Downloading package punkt to C:\Users\daveb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:

# Check if the lru has built its is-qualified classifier
t1 = time.time()
# lru.basic_quals_dict = None; lru.sync_basic_quals_dict()
if not (hasattr(lru, 'ISQUALIFIED_LR') and hasattr(lru, 'ISQUALIFIED_CV')):
    lru.build_isqualified_logistic_regression_elements(sampling_strategy_limit=None, verbose=True)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'Is-qualified LR elements built in {duration_str}'; print(speech_str)

I have 424,879 is-qualified vocabulary tokens in here
Is-qualified LR elements built in 9 seconds



----

In [426]:

# Get the databased quals
cypher_str = '''
    // Get all qualification strings in the database
    MATCH (qs:QualificationStrings)
    RETURN qs;'''
row_objs_list = []
with cu.driver.session() as session: row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
if row_objs_list:
    basic_quals_df = DataFrame(
        [{k: v for k, v in row_obj['qs'].items()} for row_obj in row_objs_list]
    ).drop_duplicates()
    shape_tuple = basic_quals_df.shape
    pyperclip.copy(str(shape_tuple))
    print(f'basic_quals_df.shape: {shape_tuple}') # (18257, 2)

# Step 1: Preprocess the sentences
flat_sentences = sorted([sent for qualification_str in basic_quals_df.qualification_str for sent in sent_tokenize(qualification_str)], key=lambda x: len(x))

basic_quals_df.shape: (18257, 2)


In [427]:

# Step 2: Convert sentences to numerical representations
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(flat_sentences)

In [429]:

# Step 3: Choose a clustering algorithm (DBSCAN in this example)
t1 = time.time()
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=5)  # You may need to adjust these parameters

# Get cluster assignments
cluster_assignments = dbscan.fit_predict(X)

# Apply PCA to reduce dimensionality to 2
pca = PCA(n_components=2)
X_2d = pca.fit_transform(X.toarray())

# Separate non-noise points
non_noise_mask = cluster_assignments != -1
X_2d_non_noise = X_2d[non_noise_mask]
cluster_assignments_non_noise = cluster_assignments[non_noise_mask]
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
speech_str = f'Clusters assigned in {duration_str}'; print(speech_str)

Clusters assigned in 1 minute and 21 seconds


In [430]:

# Print some statistics
n_clusters = len(set(cluster_assignments)) - (1 if -1 in cluster_assignments else 0)
print(f"Number of clusters found: {n_clusters}")

for cluster in set(cluster_assignments):
    cluster_size = np.sum(cluster_assignments == cluster)
    if cluster == -1:
        print(f"Noise points: {cluster_size}")
    # else:
    #     print(f"Cluster {cluster} size: {cluster_size}")

# Print a few example sentences from each cluster
for cluster in set(cluster_assignments):
    if cluster == -1:
        print(f"\nExample sentences from Noise:")
    else:
        print(f"\nExample sentences from Cluster {cluster}:")
    cluster_sentences = [sent for sent, clust in zip(flat_sentences, cluster_assignments) if clust == cluster]
    for sentence in cluster_sentences[:3]:  # Print first 3 sentences
        print(f"- {sentence}")

Number of clusters found: 60
Noise points: 18744

Example sentences from Cluster 0:
- ;
- ;
- ;

Example sentences from Cluster 1:
- Min.
- Min.
- Min.

Example sentences from Cluster 2:
- </div>
- </div>
- </div>

Example sentences from Cluster 3:
- is a plus
- is a plus
- is a plus.

Example sentences from Cluster 4:
- Master Degree
- Master's degree
- Master’s degree

Example sentences from Cluster 5:
- Bachelor Degree
- Bachelor degree
- Bachelor s degree

Example sentences from Cluster 6:
- SQL - 3 years
- 5+ Years of SQL.
- 3+ years of SQL experience.

Example sentences from Cluster 7:
- Python - 3 years
- Python 5+ years,
- 5+ years in python

Example sentences from Cluster 8:
- Proficiency Python
- Python proficiency
- Proficiency in Python

Example sentences from Cluster 9:
- Must be US Citizen
- Must be US citizen
- Must be US Citizen.

Example sentences from Cluster 10:
- What motivates you?
- What motivates you?
- What motivates you?

Example sentences from Cluster 11:
- Do

In [None]:

# Create a scatter plot
plt.figure(figsize=(12, 8))

# Plot only non-noise points
scatter = plt.scatter(
    X_2d_non_noise[:, 0], X_2d_non_noise[:, 1], c=cluster_assignments_non_noise,
    cmap='viridis', alpha=0.7
)

plt.colorbar(scatter)

# Add labels and title
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('DBSCAN Cluster Visualization of Sentences (Noise Points Hidden)')

# Show the plot
plt.show()

In [None]:

# Step 3: Choose a clustering algorithm (KMeans in this example)
n_clusters = 3  # You can adjust this number
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Step 4: Apply the clustering algorithm
kmeans.fit(X)

# Get cluster assignments for each sentence
cluster_assignments = kmeans.labels_

In [None]:

# Print results
for i, sentence in enumerate(flat_sentences):
    print(f"Sentence: {sentence}")
    print(f"Cluster: {cluster_assignments[i]}")
    print()


----
## Maintenance

In [421]:

# Get the bad databased quals
cypher_str = '''
    // Get all qualification strings in the database
    MATCH (qs:QualificationStrings)
    WHERE qs.qualification_str =~ '^[^A-Za-z0-9].*'
    RETURN qs;'''
row_objs_list = []
with cu.driver.session() as session: row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
if row_objs_list:
    basic_quals_df = DataFrame(
        [{k: v for k, v in row_obj['qs'].items()} for row_obj in row_objs_list]
    ).drop_duplicates()
    shape_tuple = basic_quals_df.shape
    pyperclip.copy(str(shape_tuple))
    print(f'basic_quals_df.shape: {shape_tuple}') # (28, 2)

# Step 1: Preprocess the sentences
flat_sentences = sorted([sent for qualification_str in basic_quals_df.qualification_str for sent in sent_tokenize(qualification_str)], key=lambda x: len(x))

basic_quals_df.shape: (21, 2)


In [422]:

flat_sentences[:10]

['.NET Core (P3 - Advanced)', '.NET: 10 years (Preferred)', '(Nice-to-have) AWS experience', '.NET Core 3.1 or above: 3 years (Required)', '(Optimal) Publication in robotics/ML/CV conference', '+3 years experience with the Go programming language', '(preferred) Experience with Agile software development', '+1 years of professional experience with dbt (cloud or core)', '[Bonus] You might have experience with Snowflake, Jupyter, Pandas.', '(Desired) Experience with ETL tools such as Azure Data Factory or Apache NiFi.']

In [425]:

poppable_list = [qs for qs in basic_quals_df.qualification_str if (len(min(sent_tokenize(qs), key=lambda s: len(s))) < 3000) and all(map(
    lambda x: not qs.startswith(x), ['.NET', '(Nice-to-have) ', '[Bonus] ', '(Optimal) ', '+3 years ', '(preferred) ', '+1 years ', '(Desired) ', '(Optional) ', '&gt; ', '(3-6) years ', '+2 years ']
))]
poppable_list[:10]

[]

In [419]:

child_str = poppable_list.pop()
print(f'{len(poppable_list)}) {child_str}')
pyperclip.copy(f'\nnew_child_str = "{child_str}"')

0) (Google Earth Engine, Amazon Web Services) to obtain and analyze data, and analysis and visualization technologies such as Drupal, Leaflet, R Shiny, Tableau, etc.


In [413]:

# Replace this particular child string in the quals dictionary
new_child_str = "Please note, at this time, remote work in this role cannot be carried out from states on the West coast, within the Pacific Time Zone"
basic_quals_dict = nu.load_object('basic_quals_dict')
if child_str in basic_quals_dict:
    basic_quals_dict[new_child_str] = basic_quals_dict[child_str]
    basic_quals_dict.pop(child_str, None)
    nu.store_objects(basic_quals_dict=basic_quals_dict)
print(f'"{child_str}" in basic_quals_dict: {child_str in basic_quals_dict}')
print(f'"{new_child_str}" in basic_quals_dict: {new_child_str in basic_quals_dict}')

# Replace this particular child string in the database
def do_cypher_tx(tx, old_child_str, new_child_str):
    cypher_str = '''
        MATCH (qs:QualificationStrings {qualification_str: $old_child_str})
        SET qs.qualification_str = $new_child_str;
        '''
    results_list = tx.run(query=cypher_str, parameters={'old_child_str': old_child_str, 'new_child_str': new_child_str})

    return [dict(record.items()) for record in results_list]
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, old_child_str=child_str, new_child_str=new_child_str)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\basic_quals_dict.pkl
"(Please note, at this time, remote work in this role cannot be carried out from states on the West coast, within the Pacific Time Zone" in basic_quals_dict: False
"Please note, at this time, remote work in this role cannot be carried out from states on the West coast, within the Pacific Time Zone" in basic_quals_dict: True


In [420]:

# Remove this particular child string from the quals dictionary
basic_quals_dict = nu.load_object('basic_quals_dict')
basic_quals_dict.pop(child_str, None)
nu.store_objects(basic_quals_dict=basic_quals_dict)
print(f'"{child_str}" in basic_quals_dict: {child_str in basic_quals_dict}')

# Remove this particular child string from the database
def do_cypher_tx(tx, qualification_str, verbose=False):
    cypher_str = '''
        MATCH (qs:QualificationStrings {qualification_str: $qualification_str})
        DETACH DELETE qs;
        '''
    results_list = tx.run(query=cypher_str, parameters={'qualification_str': qualification_str})

    return [dict(record.items()) for record in results_list]
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, qualification_str=child_str, verbose=False)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\basic_quals_dict.pkl
"(Google Earth Engine, Amazon Web Services) to obtain and analyze data, and analysis and visualization technologies such as Drupal, Leaflet, R Shiny, Tableau, etc." in basic_quals_dict: False


In [304]:

# Replace child strings beginning with and ending with an asterisk in the quals dictionary
pattern = r'\*([^<]+)\*'
poppable_list = [child_str for child_str in basic_quals_df.qualification_str if re.search(pattern, child_str)]
progress_bar = tqdm(
    poppable_list, total=len(poppable_list), desc=f"Remove *s"
)
for child_str in progress_bar:
    new_child_str = re.sub(pattern, r'\g<1>', child_str).strip()
    basic_quals_dict = nu.load_object('basic_quals_dict')
    if child_str in basic_quals_dict:
        basic_quals_dict[new_child_str] = basic_quals_dict[child_str]
        basic_quals_dict.pop(child_str, None)
        nu.store_objects(basic_quals_dict=basic_quals_dict, verbose=False)
    
    # Replace this particular child string in the database
    def do_cypher_tx(tx, old_child_str, new_child_str):
        cypher_str = '''
            MATCH (qs:QualificationStrings {qualification_str: $old_child_str})
            SET qs.qualification_str = $new_child_str;
            '''
        results_list = tx.run(query=cypher_str, parameters={'old_child_str': old_child_str, 'new_child_str': new_child_str})
    
        return [dict(record.items()) for record in results_list]
    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(do_cypher_tx, old_child_str=child_str, new_child_str=new_child_str)

Remove *s: 100%|█████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.59it/s]


In [345]:

# Replace child strings beginning with (3) or some other number in the quals dictionary
pattern = r'^\(\d+\) (.+)'
poppable_list = [child_str for child_str in basic_quals_df.qualification_str if re.search(pattern, child_str)]
progress_bar = tqdm(
    poppable_list, total=len(poppable_list), desc=f"Remove <orq>s"
)
for child_str in progress_bar:
    new_child_str = re.sub(pattern, r'\g<1>', child_str).strip()
    basic_quals_dict = nu.load_object('basic_quals_dict')
    if child_str in basic_quals_dict:
        basic_quals_dict[new_child_str] = basic_quals_dict[child_str]
        basic_quals_dict.pop(child_str, None)
        nu.store_objects(basic_quals_dict=basic_quals_dict, verbose=False)
    
    # Replace this particular child string in the database
    def do_cypher_tx(tx, old_child_str, new_child_str):
        cypher_str = '''
            MATCH (qs:QualificationStrings {qualification_str: $old_child_str})
            SET qs.qualification_str = $new_child_str;
            '''
        results_list = tx.run(query=cypher_str, parameters={'old_child_str': old_child_str, 'new_child_str': new_child_str})
    
        return [dict(record.items()) for record in results_list]
    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(do_cypher_tx, old_child_str=child_str, new_child_str=new_child_str)

Remove <orq>s: 100%|█████████████████████████████████████████████████████████████████████| 9/9 [00:01<00:00,  5.54it/s]


In [405]:

for prefix_str in [' ', '- ', '· ', '• ', '•', ', ', '● ', '**', '* ', '— ', '✅ ', '+ ', '" ', '-', ': ', '– ', '\\"', '\ufeff', '|', ')', '*']:
    
    # Replace child strings beginning with prefix_str in the quals dictionary
    poppable_list = [child_str for child_str in basic_quals_df.qualification_str if child_str.startswith(prefix_str)]
    progress_bar = tqdm(
        poppable_list, total=len(poppable_list), desc=f"Remove {prefix_str}s"
    )
    for child_str in progress_bar:
        new_child_str = child_str.replace(prefix_str, '', ).strip()
        basic_quals_dict = nu.load_object('basic_quals_dict')
        if child_str in basic_quals_dict:
            basic_quals_dict[new_child_str] = basic_quals_dict[child_str]
            basic_quals_dict.pop(child_str, None)
            nu.store_objects(basic_quals_dict=basic_quals_dict, verbose=False)
        
        # Replace this particular child string in the database
        def do_cypher_tx(tx, old_child_str, new_child_str):
            cypher_str = '''
                MATCH (qs:QualificationStrings {qualification_str: $old_child_str})
                SET qs.qualification_str = $new_child_str;
                '''
            results_list = tx.run(query=cypher_str, parameters={'old_child_str': old_child_str, 'new_child_str': new_child_str})
        
            return [dict(record.items()) for record in results_list]
        with cu.driver.session() as session:
            row_objs_list = session.write_transaction(do_cypher_tx, old_child_str=child_str, new_child_str=new_child_str)

Remove  s: 0it [00:00, ?it/s]
Remove - s: 0it [00:00, ?it/s]
Remove · s: 0it [00:00, ?it/s]
Remove • s: 0it [00:00, ?it/s]
Remove •s: 0it [00:00, ?it/s]
Remove , s: 0it [00:00, ?it/s]
Remove ● s: 0it [00:00, ?it/s]
Remove **s: 0it [00:00, ?it/s]
Remove * s: 0it [00:00, ?it/s]
Remove — s: 0it [00:00, ?it/s]
Remove ✅ s: 0it [00:00, ?it/s]
Remove + s: 0it [00:00, ?it/s]
Remove " s: 0it [00:00, ?it/s]
Remove -s: 0it [00:00, ?it/s]
Remove : s: 0it [00:00, ?it/s]
Remove – s: 0it [00:00, ?it/s]
Remove \"s: 0it [00:00, ?it/s]
Remove ﻿s: 0it [00:00, ?it/s]
Remove |s: 0it [00:00, ?it/s]
Remove )s: 0it [00:00, ?it/s]
Remove *s: 100%|█████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.97it/s]


In [303]:

# Replace child strings beginning with <orq> and a number and ending in </orq> in the quals dictionary
pattern = r'<orq>\d+ - ([^<]+)</orq>'
poppable_list = [child_str for child_str in basic_quals_df.qualification_str if re.search(pattern, child_str)]
progress_bar = tqdm(
    poppable_list, total=len(poppable_list), desc=f"Remove <orq>s"
)
for child_str in progress_bar:
    new_child_str = re.sub(pattern, r'\g<1>', child_str).strip()
    basic_quals_dict = nu.load_object('basic_quals_dict')
    if child_str in basic_quals_dict:
        basic_quals_dict[new_child_str] = basic_quals_dict[child_str]
        basic_quals_dict.pop(child_str, None)
        nu.store_objects(basic_quals_dict=basic_quals_dict, verbose=False)
    
    # Replace this particular child string in the database
    def do_cypher_tx(tx, old_child_str, new_child_str):
        cypher_str = '''
            MATCH (qs:QualificationStrings {qualification_str: $old_child_str})
            SET qs.qualification_str = $new_child_str;
            '''
        results_list = tx.run(query=cypher_str, parameters={'old_child_str': old_child_str, 'new_child_str': new_child_str})
    
        return [dict(record.items()) for record in results_list]
    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(do_cypher_tx, old_child_str=child_str, new_child_str=new_child_str)

Remove <orq>s: 0it [00:00, ?it/s]


In [279]:

for tag in ['li', 'div', 'p', 'b', 'i', 'span', 'em', 'orq', 'strong']:
    
    # Replace child strings beginning with <tag> and (possibly) ending with </tag> in the quals dictionary
    pattern = f'<{tag}[^>]*>(.+)(?:</{tag}>)?'
    poppable_list = [child_str for child_str in basic_quals_df.qualification_str if re.search(pattern, child_str)]
    progress_bar = tqdm(
        poppable_list, total=len(poppable_list), desc=f"Remove <{tag}>s"
    )
    for child_str in progress_bar:
        new_child_str = re.sub(pattern, r'\g<1>', child_str).strip()
        basic_quals_dict = nu.load_object('basic_quals_dict')
        if child_str in basic_quals_dict:
            basic_quals_dict[new_child_str] = basic_quals_dict[child_str]
            basic_quals_dict.pop(child_str, None)
            nu.store_objects(basic_quals_dict=basic_quals_dict, verbose=False)
        
        # Replace this particular child string in the database
        def do_cypher_tx(tx, old_child_str, new_child_str):
            cypher_str = '''
                MATCH (qs:QualificationStrings {qualification_str: $old_child_str})
                SET qs.qualification_str = $new_child_str;
                '''
            results_list = tx.run(query=cypher_str, parameters={'old_child_str': old_child_str, 'new_child_str': new_child_str})
        
            return [dict(record.items()) for record in results_list]
        with cu.driver.session() as session:
            row_objs_list = session.write_transaction(do_cypher_tx, old_child_str=child_str, new_child_str=new_child_str)

Remove <li>s: 0it [00:00, ?it/s]
Remove <div>s: 0it [00:00, ?it/s]
Remove <p>s: 0it [00:00, ?it/s]
Remove <b>s: 0it [00:00, ?it/s]
Remove <i>s: 0it [00:00, ?it/s]
Remove <span>s: 0it [00:00, ?it/s]
Remove <em>s: 0it [00:00, ?it/s]
Remove <orq>s: 0it [00:00, ?it/s]
Remove <strong>s: 100%|████████████████████████████████████████████████████████████████| 14/14 [00:02<00:00,  6.83it/s]


In [29]:

# Remove child strings in the quals dictionary that are only one character long
poppable_list = [child_str for child_str in flat_sentences if len(child_str) < 2]
progress_bar = tqdm(
    poppable_list, total=len(poppable_list), desc="Remove short quals"
)
for child_str in progress_bar:
    basic_quals_dict = nu.load_object('basic_quals_dict')
    if child_str in basic_quals_dict:
        basic_quals_dict.pop(child_str, None)
        nu.store_objects(basic_quals_dict=basic_quals_dict, verbose=False)

# Remove these particular child strings from the database
def do_cypher_tx(tx, verbose=False):
    cypher_str = '''
        // Delete qualification strings that are one character long
        MATCH (qs:QualificationStrings)
        WHERE SIZE(qs.qualification_str) = 1
        DETACH DELETE qs;
        '''
    results_list = tx.run(query=cypher_str)

    return [dict(record.items()) for record in results_list]
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, verbose=False)

Remove short quals: 100%|██████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 82.62it/s]


In [33]:

flat_sentences[:10]

['.', '.', '?', '?', '?', '?', '?', '?', '?', '?']

In [35]:

# Get the one-character databased quals
cypher_str = '''
    // Get all qualification strings in the database
    MATCH (qs:QualificationStrings)
    WHERE SIZE(qs.qualification_str) = 1
    RETURN qs;'''
row_objs_list = []
with cu.driver.session() as session: row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
if row_objs_list:
    df = DataFrame(
        [{k: v for k, v in row_obj['qs'].items()} for row_obj in row_objs_list]
    ).drop_duplicates()
    print(f'df.shape: {df.shape}')

In [None]:

# Identify and remove duplicate qualification_str entries
def do_cypher_tx(tx):
    cypher_str = '''
        // Identify and remove duplicate qualification_str entries
        MATCH (qs:QualificationStrings)
        WITH qs.qualification_str AS qual, COLLECT(qs) AS nodes
        WHERE SIZE(nodes) > 1
        WITH qual, nodes[0] AS keepNode, nodes[1..] AS duplicateNodes
        DETACH DELETE duplicateNodes
        RETURN COUNT(duplicateNodes) AS removedDuplicates;
        '''
    results_list = tx.run(query=cypher_str)

    return [dict(record.items()) for record in results_list]
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx)