In [1]:

%pprint

Pretty printing has been turned OFF



---
# Load needed libraries and functions

In [2]:

from IPython.display import clear_output
from datetime import datetime
from neo4j.exceptions import ServiceUnavailable
from pandas import DataFrame
import humanize
import numpy as np
import os
import sys
import time
import warnings
import winsound

# Silence every warning for the rest of the session
# NOTE(review): this blanket filter also hides deprecation warnings raised by the libraries used below
warnings.filterwarnings('ignore')
# Tone parameters passed to winsound.Beep when a long-running cell finishes
duration = 1000  # milliseconds
freq = 880  # Hz

# Make the modules in ../py importable by the cells below
# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

In [3]:

# Start the clock; t0 is read again by the later "getting to this cell took" cell
t0 = time.time()

# Create the storage object, pointed at the data and saves folders
from storage import Storage
s = Storage(
    data_folder_path=os.path.abspath('../data'),
    saves_folder_path=os.path.abspath('../saves')
)

# Create the header analysis object
from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

# Create the web scraping object and read the Neo4j connection settings from its secrets JSON
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(
    s=s,
    secrets_json_path=os.path.abspath('../data/secrets/jh_secrets.json')
)
uri = wsu.secrets_json['neo4j']['connect_url']
user =  wsu.secrets_json['neo4j']['username']
password = wsu.secrets_json['neo4j']['password']

# Get the neo4j object
from cypher_utils import CypherUtilities
cu = CypherUtilities(
    uri=uri, user=user, password=password, driver=None, s=s, ha=ha
)

# Create the is-header classifier object
from is_header_sgd_classifier import IsHeaderSgdClassifier
ihu = IsHeaderSgdClassifier(ha=ha, cu=cu, verbose=False)

# Confirm the database is reachable by asking the driver for the server's agent string
try:
    # A ServiceUnavailable here stops the cell; any other exception is only printed
    version_str = cu.driver.get_server_info().agent
    print(f'======== {version_str} ========')
except ServiceUnavailable as e:
    print('You need to start Neo4j as a console')
    raise
except Exception as e:
    print(f'{e.__class__}: {str(e).strip()}')

# Create the header categories object
from hc_utils import HeaderCategories
hc = HeaderCategories(cu=cu, verbose=False)

# Keep the total creation time to less than one hour by adjusting the sampling strategy limit
from lr_utils import LrUtilities
lru = LrUtilities(ha=ha, cu=cu, hc=hc, verbose=False)

# Create the three section classifier objects (logistic regression, SGD, and CRF)
from section_classifier_utils import SectionLRClassifierUtilities, SectionSGDClassifierUtilities, SectionCRFClassifierUtilities
slrcu = SectionLRClassifierUtilities(ha=ha, cu=cu, verbose=False)
ssgdcu = SectionSGDClassifierUtilities(ha=ha, cu=cu, verbose=False)
scrfcu = SectionCRFClassifierUtilities(cu=cu, ha=ha, verbose=False)

# Create the CRF object from the objects built above
from crf_utils import CrfUtilities
crf = CrfUtilities(ha=ha, hc=hc, cu=cu, lru=lru, slrcu=slrcu, scrfcu=scrfcu, ssgdcu=ssgdcu, verbose=True)

# Create the section object used by the visualization cell
from section_utils import SectionUtilities
su = SectionUtilities(wsu=wsu, ihu=ihu, hc=hc, crf=crf, slrcu=slrcu, scrfcu=scrfcu, ssgdcu=ssgdcu, verbose=False)

# Beep and report how long the construction took
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
winsound.Beep(freq, duration)
print(f'Utility libraries created in {duration_str}')
print(f'Last run on {datetime.now()}')

Utility libraries created in 7 seconds
Last run on 2023-11-16 13:53:21.285788


In [4]:

# Build the parts-of-speech CRF elements unless scrfcu already has them or
# crf.is_flask_running() is true (a build has taken 28 minutes and 51 seconds)
t1 = time.time()
if not hasattr(scrfcu, 'pos_symbol_crf') and not crf.is_flask_running():
    scrfcu.build_pos_conditional_random_field_elements(verbose=True)

# Report whether the attribute exists now
print(
    'predict_single is now available'
    if hasattr(scrfcu, 'pos_symbol_crf')
    else 'predict_single is not available'
)
elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech conditional random field elements built in {duration_str}')

predict_single is now available
Parts-of-speech conditional random field elements built in 5 seconds


In [5]:

# Build the parts-of-speech stochastic gradient descent elements unless ssgdcu already
# has them or crf.is_flask_running() is true (a build has taken 17 seconds)
t1 = time.time()
if not hasattr(ssgdcu, 'pos_predict_percent_fit_dict') and not crf.is_flask_running():
    ssgdcu.build_pos_stochastic_gradient_descent_elements(
        sampling_strategy_limit=None, verbose=True
    )

# Report whether the attribute exists now
print(
    'predict_single is now available'
    if hasattr(ssgdcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)
elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech stochastic gradient decent elements built in {duration_str}')

I have 49,102 labeled parts of speech in here
predict_single is now available
Parts-of-speech stochastic gradient decent elements built in 15 seconds


In [6]:

# Make sure the crf object has its CRF attribute: keep it if present, otherwise load the
# crf_CRF pickle when one exists, otherwise retrain
# (a retrain has taken 12 hours, 15 minutes and 36 seconds)
t1 = time.time()
if hasattr(crf, 'CRF'):
    pass  # Already attached; nothing to do
elif s.pickle_exists('crf_CRF'):
    crf.CRF = s.load_object('crf_CRF')
else:
    crf.retrain_pos_classifier(
        header_pattern_dict=s.load_object('HEADER_PATTERN_DICT'), verbose=True
    )
elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
print(f'POS classifier trained in {duration_str}')

POS classifier trained in 0 seconds


In [7]:

# Build the parts-of-speech logistic regression elements unless slrcu already has them or
# crf.is_flask_running() is true (a build has taken 1 hour, 59 minutes and 41 seconds)
t1 = time.time()
if not hasattr(slrcu, 'pos_predict_percent_fit_dict') and not crf.is_flask_running():
    slrcu.build_pos_logistic_regression_elements(
        sampling_strategy_limit=None, verbose=True
    )

# Report whether the attribute exists now
print(
    'predict_single is now available'
    if hasattr(slrcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)
elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech logistic regression elements built in {duration_str}')

I have 49,102 labeled parts of speech in here
predict_single is now available
Parts-of-speech logistic regression elements built in 10 seconds


In [8]:

# Build the is-qualified logistic regression elements (a build has taken 5 seconds)
t1 = time.time()
lru.build_isqualified_logistic_regression_elements(
    sampling_strategy_limit=5_000, verbose=False
)
elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
print(f'Is-qualified LR classifier built in {duration_str}')

Is-qualified LR classifier built in 6 seconds


In [9]:

# Build the is-header SGD classifier elements (a build has taken 9 seconds)
t1 = time.time()
ihu.build_pos_stochastic_gradient_descent_elements(verbose=True)
elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
print(f'Is-header SGD classifer built in {duration_str}')

I have 49,070 hand-labeled header htmls prepared
7 iterations seen during training fit for a total of 49,070 records trained
Is-header SGD classifer built in 8 seconds



---
# Training

In [10]:

# Re-run this cell whenever the qualification dictionary was changed in another notebook
t1 = time.time()

# Adjust the sampling strategy limit to keep the total retraining time under two minutes
lru.sync_basic_quals_dict(sampling_strategy_limit=8_000, verbose=False)
lru.retrain_isqualified_classifier(verbose=True)

elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
print(f'Is-qualified classifer retrained in {duration_str}')

I have 13,272 hand-labeled qualification strings in here
I have 440,487 is-qualified vocabulary tokens in here
Is-qualified classifer retrained in 10 seconds


In [11]:

from pandas import DataFrame

# Select the FileNames nodes with a percent fit of exactly zero that are not closed,
# not verified, and have no emailed application
cypher_str = '''
    MATCH (fn:FileNames)
    WHERE
        fn.percent_fit = 0.0 AND
        ((fn.is_closed IS NULL) OR (fn.is_closed = false)) AND
        ((fn.is_verified IS NULL) OR (fn.is_verified = false)) AND
        ((fn.is_opportunity_application_emailed IS NULL) OR
        (fn.is_opportunity_application_emailed = false))
    RETURN
        fn.percent_fit AS percent_fit,
        fn.file_name AS file_name,
        fn.posting_url AS url
    ORDER BY fn.percent_fit ASC;'''
row_objs_list = []
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)

# Keep just the file names; an empty result yields an empty work list
files_list = DataFrame(row_objs_list).file_name.tolist() if row_objs_list else []
remaining_count = len(files_list)
print(f'Only {remaining_count} more mis-estimated minimum-requirements-met percentages to go!')

Only 10 more mis-estimated minimum-requirements-met percentages to go!



----
## Fix POS and Quals for a job posting

In [12]:

# Take the next posting off the work list and rebuild its child strings and CRF labels
t1 = time.time()
# file_name = 'f8b158d98ceff1ef_Machine_Learning_Algorithm_Developer_Assistant_Staff_Lexington_MA_Indeed_com.html'

# Fail with an actionable message instead of "pop from empty list"
if not files_list:
    raise IndexError('files_list is empty; re-run the query cell above to refill it')
file_name = files_list.pop()
file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)
if os.path.isfile(file_path):

    # Register the file and its child strings with the database
    child_strs_list = ha.get_child_strs_from_file(file_name=file_name)
    cu.ensure_filename(file_name, verbose=False)
    cu.populate_from_child_strings(child_strs_list, file_name, verbose=False)

    # Build one feature tuple per child string using all three parts-of-speech predictors
    child_tags_list = ha.get_child_tags_list(child_strs_list)
    feature_dict_list = cu.get_feature_dict_list(child_tags_list, child_strs_list)
    feature_tuple_list = [
        hc.get_feature_tuple(
            feature_dict, pos_lr_predict_single=slrcu.predict_single, pos_crf_predict_single=scrfcu.predict_single,
            pos_sgd_predict_single=ssgdcu.predict_single
        )
        for feature_dict in feature_dict_list
    ]
    crf_list = crf.CRF.predict_single(crf.sent2features(feature_tuple_list))
    print(file_name)
    duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
    print(f'CRF and child strings list recreated in {duration_str}')
else:

    # Nothing was rebuilt, so say so rather than claim success; any child_strs_list and
    # crf_list left in memory do not belong to this file_name
    print(f'{file_path} does not exist: child_strs_list and crf_list were NOT recreated for {file_name}')

eb92a1192904e96e_Data_and_Analytics_Summer_Help_Schofield_WI_54476_Indeed_com.html
CRF and child strings list recreated in 1 minute and 12 seconds


In [None]:

# Accumulate the parts-of-speech list for every child string, one call per string
db_pos_list = []
for html_str in child_strs_list:
    db_pos_list = cu.append_parts_of_speech_list(html_str, pos_list=db_pos_list)

# Render the section and keep the returned symbols and indices for the cells below
pos_list, indices_list = su.visualize_basic_quals_section(
    crf_list, child_strs_list, db_pos_list=db_pos_list, verbose=True
)
winsound.Beep(freq, duration)

In [None]:

# Deliberate stopping point for "Run All": report the elapsed time since t0, then halt
# (restarting the kernel and getting to this cell has taken 2 minutes and 3 seconds)
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
print(f'Restarting the kernel and getting to this cell took {duration_str}')

# Raise explicitly so the traceback explains itself; the cells below are meant to be run by hand
raise RuntimeError('Intentional stop: run the remaining cells manually')

In [27]:

# Display the context of an individual child string
idx = 10
print(indices_list)
child_str = child_strs_list[idx]
pos_symbol = pos_list[idx]
basic_quals_dict = s.load_object('basic_quals_dict')

# Show the existing hand label on its own line first, when there is one
if child_str in basic_quals_dict:
    print(basic_quals_dict[child_str])
print(f'{idx} {pos_symbol}) {child_str}')

[4, 6, 8, 10]
1
10 O-RQ) <p>Selected candidate will be subject to a pre-employment background investigation and must be able to obtain and maintain a Secret level DoD security clearance.</p>


In [23]:

# Hand-label this particular child string as qualified (1) in the quals dictionary
basic_quals_dict = s.load_object('basic_quals_dict')
basic_quals_dict[child_str] = 1

# Persist the change, then echo the stored label as confirmation
s.store_objects(basic_quals_dict=basic_quals_dict)
print(f'"{child_str}" in basic_quals_dict: {basic_quals_dict[child_str]}')

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\basic_quals_dict.pkl
"<p>The candidate must possess a Bachelor of Science degree in computer science, mathematics, electrical engineering, or an equivalent field. Prior exposure to machine learning algorithms, through coursework or research, is required. This includes deep learning techniques in addition to more general machine learning methods (e.g., Bayesian inference). Proficiency with Python is required, while familiarity with other development platforms like Matlab or C/C++ is considered an advantage. The candidate should also have some experience with common computer vision, natural language processing, or machine learning toolboxes (PyTorch, Pandas, OpenCV, TensorRT, libtorch, etc).</p>" in basic_quals_dict: 1


In [28]:

def do_cypher_tx(tx, navigable_parent, verbose=False):
    """
    Flag one NavigableParents node as a minimum qualification and clear its other flags.

    Parameters:
        tx: the transaction object supplied by session.write_transaction
        navigable_parent (str): the HTML string identifying the node to relabel
        verbose (bool): accepted for parity with the other transaction functions; unused here

    Returns:
        list of dict: one dictionary per record returned by the query
    """
    cypher_str = """MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
        SET
            np.is_header = false,
            np.is_task_scope = false,
            np.is_minimum_qualification = true,
            np.is_preferred_qualification = false,
            np.is_educational_requirement = false,
            np.is_legal_notification = false,
            np.is_other = false,
            np.is_corporate_scope = false,
            np.is_job_title = false,
            np.is_office_location = false,
            np.is_job_duration = false,
            np.is_supplemental_pay = false,
            np.is_interview_procedure = false,
            np.is_posting_date = false
        """ + cu.return_everything_str + ';'
    results_list = tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})

    return [dict(record.items()) for record in results_list]


with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str)

# Only update the is-header classifier when the node was actually found; indexing an
# empty result would otherwise fail with an IndexError that hides the real problem
if row_objs_list:
    ihu.retrain_classifier(row_objs_list[0]['navigable_parent'], row_objs_list[0]['is_header'], verbose=True)
else:
    print(f'No NavigableParents node matches "{child_str}", so nothing was relabeled')
row_objs_list

1 iterations seen during updating fit for a total of 49,070 records trained


[{'navigable_parent': '<p>Selected candidate will be subject to a pre-employment background investigation and must be able to obtain and maintain a Secret level DoD security clearance.</p>', 'is_header': False, 'is_task_scope': False, 'is_minimum_qualification': True, 'is_preferred_qualification': False, 'is_legal_notification': False, 'is_job_title': False, 'is_office_location': False, 'is_job_duration': False, 'is_supplemental_pay': False, 'is_educational_requirement': False, 'is_interview_procedure': False, 'is_corporate_scope': False, 'is_posting_date': False, 'is_other': False}]

In [83]:

# Show what's in the database already for this html string
def do_cypher_tx(tx, navigable_parent, verbose=False):
    """
    Fetch the stored flags of one NavigableParents node without changing anything.

    Parameters:
        tx: the transaction object supplied by the session
        navigable_parent (str): the HTML string identifying the node to look up
        verbose (bool): accepted for parity with the other transaction functions; unused here

    Returns:
        list of dict: one dictionary per record returned by the query
    """
    cypher_str = '''MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
        ''' + cu.return_everything_str + ';'
    results_list = tx.run(query=cypher_str, parameters={'navigable_parent': navigable_parent})

    return [dict(record.items()) for record in results_list]


# The query only reads, so run it in a read transaction rather than a write transaction
with cu.driver.session() as session:
    row_objs_list = session.read_transaction(do_cypher_tx, navigable_parent=child_str)
row_objs_list

[{'navigable_parent': '<b>Who makes a DatOps Engineer?</b>', 'is_header': None, 'is_task_scope': None, 'is_minimum_qualification': None, 'is_preferred_qualification': None, 'is_legal_notification': None, 'is_job_title': None, 'is_office_location': None, 'is_job_duration': None, 'is_supplemental_pay': None, 'is_educational_requirement': None, 'is_interview_procedure': None, 'is_corporate_scope': None, 'is_posting_date': None, 'is_other': None}]

In [335]:

# Refresh the qualification strings, then export them ordered by string length
lru.sync_basic_quals_dict(sampling_strategy_limit=8_000, verbose=False)
df = lru.basic_quals_df.assign(
    length=lambda quals_df: quals_df['qualification_str'].apply(len)
).sort_values('length')
df.to_csv('../saves/csv/basic_quals_dict.csv')

# Show the last five of the first 1,000 length-sorted rows
df.head(1000).tail(5)

Unnamed: 0,qualification_str,is_qualified,length
1353,<li>Hadoop and Machine Learning</li>,0,36
1573,<p>· Experience with API testing</p>,0,36
4903,<orq>3 - MySQL (P3 - Advanced)</orq>,0,36
2823,<li>An understanding of rating.</li>,0,36
6492,<orq>1 - MySQL (P1 - Beginner)</orq>,1,36


In [330]:

# Remove the child string by idx from the quals dictionary and database
basic_quals_dict = s.load_object('basic_quals_dict')
child_str = df.loc[13195].qualification_str

# Drop the string from the dictionary (a missing key is not an error) and persist the result
basic_quals_dict.pop(child_str, None)
s.store_objects(basic_quals_dict=basic_quals_dict)
print(f'"{child_str}" in basic_quals_dict: {child_str in basic_quals_dict}')


def do_cypher_tx(tx, qualification_str, verbose=False):
    """
    Detach and delete the QualificationStrings node holding the given string.

    Parameters:
        tx: the transaction object supplied by session.write_transaction
        qualification_str (str): the qualification string whose node is deleted
        verbose (bool): accepted for parity with the other transaction functions; unused here

    Returns:
        list of dict: one dictionary per record returned by the query
    """
    cypher_str = '''
        MATCH (qs:QualificationStrings {qualification_str: $qualification_str})
        DETACH DELETE qs;
        '''
    query_parameters = {'qualification_str': qualification_str}
    rows = []
    for record in tx.run(query=cypher_str, parameters=query_parameters):
        rows.append(dict(record.items()))

    return rows


with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, qualification_str=child_str, verbose=False)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\basic_quals_dict.pkl
"<orq>Ability to nice to have).</orq>" in basic_quals_dict: False


In [None]:
# Safety catch: this cell deletes data, so it must never run as part of "Run All".
# Comment out the next line to run it on purpose.
raise RuntimeError('Intentional stop: this cell deletes a qualification string; run it deliberately')

# Remove this particular child string from the quals dictionary and database
basic_quals_dict = s.load_object('basic_quals_dict')
child_str = child_strs_list[idx]
basic_quals_dict.pop(child_str, None)
# basic_quals_dict[child_str] = 0
s.store_objects(basic_quals_dict=basic_quals_dict)
print(f'"{child_str}" in basic_quals_dict: {child_str in basic_quals_dict}')


def do_cypher_tx(tx, qualification_str, verbose=False):
    """
    Detach and delete the QualificationStrings node holding the given string.

    Parameters:
        tx: the transaction object supplied by session.write_transaction
        qualification_str (str): the qualification string whose node is deleted
        verbose (bool): accepted for parity with the other transaction functions; unused here

    Returns:
        list of dict: one dictionary per record returned by the query
    """
    cypher_str = '''
        MATCH (qs:QualificationStrings {qualification_str: $qualification_str})
        DETACH DELETE qs;
        '''
    results_list = tx.run(query=cypher_str, parameters={'qualification_str': qualification_str})

    return [dict(record.items()) for record in results_list]


with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, qualification_str=child_str, verbose=False)


----
## Maintenance

In [29]:

# Mark the file name as needing retraining everywhere
# file_name = 'b4e994e1d282ffa9_Digital_Platform_Services_Data_Analytics_and_Insights_Senior_Manager_Salt_Lake_City_UT_84111_Indeed_com.html'

# Check if the lru has retrained its isqualified classifier
if not hasattr(lru, 'hunting_df'):
    lru.retrain_isqualified_classifier(verbose=True)

# Blank out the stored percent fit on this posting's rows. The rows are selected by file
# name: testing the percent_fit column against a file name could never match anything
# NOTE(review): assumes hunting_df has a file_name column - confirm against lr_utils
mask_series = lru.hunting_df.file_name.isin([file_name])
lru.hunting_df.loc[mask_series, 'percent_fit'] = np.nan
s.store_objects(hunting_df=lru.hunting_df)


def do_cypher_tx(tx, file_name, verbose=False):
    """
    Clear percent_fit and set is_verified on the FileNames node for one file.

    Parameters:
        tx: the transaction object supplied by session.write_transaction
        file_name (str): the file name identifying the node to update
        verbose (bool): when True, clear the cell output and print the query with the
            file name substituted in

    Returns:
        list of dict: one dictionary per record returned by the query
    """
    cypher_str = """
        MATCH (fn:FileNames {file_name: $file_name})
        SET fn.percent_fit = NULL, fn.is_verified = true
        RETURN fn;"""
    if verbose:
        clear_output(wait=True)
        print(cypher_str.replace('$file_name', f'"{file_name}"'))
    results_list = tx.run(query=cypher_str, parameters={'file_name': file_name})

    return [dict(record.items()) for record in results_list]


with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, file_name=file_name, verbose=False)
row_objs_list

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\hunting_df.pkl


[{'fn': <Node element_id='988302' labels=frozenset({'FileNames'}) properties={'file_name': 'f8b158d98ceff1ef_Machine_Learning_Algorithm_Developer_Assistant_Staff_Lexington_MA_Indeed_com.html', 'posting_url': 'https://www.indeed.com/rc/clk/dl?jk=f8b158d98ceff1ef&from=ja&qd=RnZhMybXSk4M3QtTVGXWocPDA-jVn_f73KUcK2QrGXzs5Dt3MAdRluWXCJcV97lyuB6k-jOww-IM6KWrTzOElDsCsVSx1_HPhTTU3XPOEs8&rd=FzlTg_mIv5FOCOtUGrVy1MoNBk--8gR6CvhRrl83wHU&tk=1gs4rka4ip2em805&alid=63b02da7edaed13019025096', 'is_verified': True}>}]

In [None]:

# Use this cell when the qualification dictionary is unchanged (regardless of
# parts-of-speech changes): it only marks the file name as verified
def do_cypher_tx(tx, file_name, verbose=False):
    """
    Set is_verified on the FileNames node for one file.

    Parameters:
        tx: the transaction object supplied by session.write_transaction
        file_name (str): the file name identifying the node to update
        verbose (bool): when True, clear the cell output and print the query with the
            file name substituted in

    Returns:
        list of dict: one dictionary per record returned by the query
    """
    cypher_str = """
        MATCH (fn:FileNames {file_name: $file_name})
        SET fn.is_verified = true
        RETURN fn;"""
    if verbose:
        clear_output(wait=True)
        print(cypher_str.replace('$file_name', f'"{file_name}"'))
    records = tx.run(query=cypher_str, parameters={'file_name': file_name})

    return [dict(record.items()) for record in records]


with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, file_name=file_name, verbose=True)
row_objs_list

In [None]:

# Mark the file name as closed
def do_cypher_tx(tx, file_name, verbose=False):
    """
    Set is_closed on the FileNames node for one file.

    The file name travels as a query parameter, like the other transaction functions in
    this notebook, so quotes or backslashes in it cannot break or alter the query.

    Parameters:
        tx: the transaction object supplied by session.write_transaction
        file_name (str): the file name identifying the node to update
        verbose (bool): when True, print the query with the file name substituted in

    Returns:
        list of dict: one dictionary per record returned by the query
    """
    cypher_str = '''
    MATCH (fn:FileNames {file_name: $file_name})
    SET fn.is_closed = true
    RETURN fn;'''
    if verbose:
        print(cypher_str.replace('$file_name', f'"{file_name}"'))
    results_list = tx.run(query=cypher_str, parameters={'file_name': file_name})

    return [dict(record.items()) for record in results_list]


with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, file_name=file_name, verbose=True)
row_objs_list

In [None]:

# Manually label the unscored qual
# NOTE(review): quals_list is not defined in any cell visible in this notebook; it must come from elsewhere in the kernel
qualification_str = quals_list[13]
print(qualification_str)
# Record the string as not qualified (0) and persist the dictionary
basic_quals_dict = s.load_object('basic_quals_dict')
basic_quals_dict[qualification_str] = 0
s.store_objects(basic_quals_dict=basic_quals_dict)

In [None]:

# Remove file name from database
# Uses whatever file_name is currently set in the kernel; uncomment the next line to target a specific file
# file_name = '3c031ea6ad293e92_General_Service_Technician_Westborough_MA_01581_Indeed_com.html'
cu.delete_filename_node(file_name, verbose=True)

In [None]:

# Fix the unhashable mess you made
# Drop the last two rows of the qualifications frame and persist it
# WARNING: not idempotent - every run of this cell removes two more rows
lru.basic_quals_df = lru.basic_quals_df.iloc[:-2]
s.store_objects(basic_quals_df=lru.basic_quals_df)
# Rebuild the string-to-label dictionary from the trimmed frame and persist it too
lru.basic_quals_dict = lru.basic_quals_df.set_index(
    'qualification_str'
).is_qualified.to_dict()
lru.s.store_objects(basic_quals_dict=lru.basic_quals_dict, verbose=True)

In [None]:

# Fix the unhashable mess you made
def do_cypher_tx(tx, verbose=False):
    """
    Detach and delete the QualificationStrings nodes matched by
    `NOT qs.is_qualified IN [0, 1]`.

    Parameters:
        tx: the transaction object supplied by session.write_transaction
        verbose (bool): accepted for parity with the other transaction functions; unused here

    Returns:
        list of dict: one dictionary per record returned by the query
    """
    cypher_str = '''
        MATCH (qs:QualificationStrings)
        WHERE NOT qs.is_qualified IN [0, 1]
        DETACH DELETE qs;
        '''
    purge_results = tx.run(query=cypher_str, parameters={})

    return [dict(record.items()) for record in purge_results]


with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, verbose=False)

# Re-read the surviving nodes and show the last five as a table of their properties
cypher_str = '''
    MATCH (qs:QualificationStrings)
    RETURN qs;'''
row_objs_list = cu.get_execution_results(cypher_str, verbose=False)
DataFrame([dict(row_obj['qs'].items()) for row_obj in row_objs_list]).tail(5)


----
# Take a badly written requirements section and see if you can programmatically parse the qualification string out of it

In [16]:

def combine_adjacent(split_strs_list):
    """
    Glue each string onto its predecessor whenever the predecessor ends in ' and'.

    Parameters:
        split_strs_list (list of str): the fragments, in reading order

    Returns:
        list of str: a new list in which any fragment that follows one ending in ' and'
            (case-insensitive) has been appended to it after a single space
    """
    merged_list = []
    for fragment in split_strs_list:

        # The check is made against the already-merged tail, so chains keep growing
        if merged_list and merged_list[-1].lower().endswith(' and'):
            merged_list[-1] = merged_list[-1] + ' ' + fragment
            continue
        merged_list.append(fragment)

    return merged_list

In [17]:

# Break the long HTML string into sentences and check if each is a qualification string
t1 = time.time()
from langchain.text_splitter import SpacyTextSplitter
import re

# Abbreviations whose periods (or entity semicolon) would otherwise be taken for sentence
# ends, paired by position with their period-free replacements
fake_stops_list = ['e.g.', 'etc.', 'M.S.', 'B.S.', 'Ph.D.', '(ex.', '(Ex.',
                   'U.S.', 'i.e.', '&amp;', 'E.g.', 'Bsc.', 'MSc.', 'incl.']
replacements_list = ['eg', 'etc', 'MS', 'BS', 'PhD', '(eg', '(eg', 'US',
                     'ie', '&', 'eg', 'BS', 'MS', 'include']
text_splitter = SpacyTextSplitter()
tag_regex = re.compile('<([a-z][a-z0-9]*)[^<>]*>')

# Strip the markup, neutralize the abbreviations, and split what is left into sentences
unhtml_str = re.sub('</?[^><]+>', '', child_str)
for fake_stop, replacement in zip(fake_stops_list, replacements_list):
    unhtml_str = unhtml_str.replace(fake_stop, replacement)
split_strs_list = combine_adjacent([str(split_str) for split_str in text_splitter._tokenizer(unhtml_str).sents])

# The opening tag belongs to child_str, not to any one sentence, so look it up once.
# group(1) is the bare tag name; group() would be the entire tag (e.g. '<p>') and
# would wrap the sentences as '<<p>>...</<p>>'
match_obj = tag_regex.search(child_str)
tag_name = match_obj.group(1) if match_obj else 'plaintext'

# Each score is the product of the three parts-of-speech classifiers' predictions
pos_classifiers_list = [slrcu, scrfcu, ssgdcu]
rows_list = []
for split_str in split_strs_list:

    # Trim trailing punctuation before measuring and scoring
    split_str = re.sub(r'\s*[:;.*]+\s*$', '', split_str)
    row_dict = {'split_str': split_str, 'char_count': len(split_str), 'tag_name': tag_name}

    # Score the sentence inside the same tag the original HTML string used
    if match_obj:
        split_str = f'<{tag_name}>{split_str}</{tag_name}>'
    for column_name, pos_symbol in [('orq_score', 'O-RQ'), ('opq_score', 'O-PQ')]:
        score = 1.0
        for pos_classifier in pos_classifiers_list:
            score *= pos_classifier.pos_predict_percent_fit_dict[pos_symbol](split_str)
        row_dict[column_name] = score
    rows_list.append(row_dict)
split_orqs_df = DataFrame(rows_list)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Split O-RQs DataFrame built in {duration_str}')

Split O-RQs DataFrame built in 2 seconds


In [19]:

# Rank the split sentences from highest to lowest O-RQ score
split_orqs_df.sort_values('orq_score', ascending=False)

Unnamed: 0,split_str,char_count,tag_name,orq_score,opq_score
23,3-5 years of experience in data science and da...,78,plaintext,0.3818604,1.23719e-06
26,Experience using AI/ML python libraries –PyTor...,259,plaintext,0.3729124,0.001369416
27,Experience working with MLOps infrastructure s...,128,plaintext,0.3484962,0.0001842955
25,Proficient in Python programming,32,plaintext,0.2616722,1.276113e-08
10,This position requires a motivated individual ...,280,plaintext,0.1635554,1.455531e-07
28,Legally authorized to work in the US,36,plaintext,0.07631722,1.789053e-06
24,Knowledge and experience in some of these: Dee...,219,plaintext,0.06289732,0.0004190706
16,"Architect, develop, deploy, and maintain scala...",219,plaintext,0.05318624,3.283486e-10
18,Work with cross-functional teams to derive the...,159,plaintext,0.04171702,9.168556e-09
21,Develop tools in analyzing diverse sets of imp...,222,plaintext,0.03348758,7.542002e-11


In [None]:

# Take a badly written requirements section and see if you can programmatically parse the qualification string out of it
import re
from nltk.tokenize import sent_tokenize

# sampling_strategy_limit=6_400 gets 10,635 labeled parts of speech and takes 49 minutes and 30 seconds
# sampling_strategy_limit=7_000 gets 10,635 labeled parts of speech and takes 49 minutes and 30 seconds
slrcu.build_pos_logistic_regression_elements(sampling_strategy_limit=70_000, verbose=True)

# count, flags and maxsplit are passed by keyword: passing them positionally is deprecated
qual_paragraph = re.sub('</?[^<>]+>', '', child_str.strip(), count=0, flags=re.MULTILINE)

# Only a paragraph that does not split into two or more sentences is treated as "header: item; item; ..."
if len(sent_tokenize(qual_paragraph)) < 2:

    # First pass: split on colons and label the pieces (this overwrites child_strs_list)
    child_strs_list = re.split(' *: *', qual_paragraph, maxsplit=0)
    child_tags_list = ha.get_child_tags_list(child_strs_list)
    feature_dict_list = cu.get_feature_dict_list(child_tags_list, child_strs_list)
    feature_tuple_list = [
        hc.get_feature_tuple(feature_dict, pos_lr_predict_single=slrcu.predict_single, pos_crf_predict_single=None, pos_sgd_predict_single=None)
        for feature_dict in feature_dict_list
    ]
    crf_list = crf.CRF.predict_single(crf.sent2features(feature_tuple_list))

    # Second pass: when the first piece is labeled H-RQ, rejoin the rest and split it on semicolons
    if crf_list[0] == 'H-RQ':
        child_strs_list = re.split(' *; *', ': '.join(child_strs_list[1:]), maxsplit=0)
        child_tags_list = ha.get_child_tags_list(child_strs_list)
        feature_dict_list = cu.get_feature_dict_list(child_tags_list, child_strs_list)
        feature_tuple_list = [
            hc.get_feature_tuple(feature_dict, pos_lr_predict_single=slrcu.predict_single, pos_crf_predict_single=None, pos_sgd_predict_single=None)
            for feature_dict in feature_dict_list
        ]
        crf_list = crf.CRF.predict_single(crf.sent2features(feature_tuple_list))

        # Accumulate the parts-of-speech list for the new pieces and render them
        db_pos_list = []
        for navigable_parent in child_strs_list:
            db_pos_list = cu.append_parts_of_speech_list(navigable_parent, pos_list=db_pos_list)
        pos_list, indices_list = su.visualize_basic_quals_section(crf_list, child_strs_list, db_pos_list=db_pos_list, verbose=True)

In [None]:

import nltk.tokenize

# List every name exposed by the nltk.tokenize package
dir(nltk.tokenize)

In [None]:

# Qualified names in nltk.tokenize that contain 'Tokenize' (capitalized)
[f'nltk.tokenize.{fn}' for fn in dir(nltk.tokenize) if 'Tokenize' in fn]

In [None]:

nltk.tokenize.TweetTokenizer?

In [None]:

# Qualified names in nltk.tokenize that contain 'tokenize' (lowercase)
[f'nltk.tokenize.{fn}' for fn in dir(nltk.tokenize) if 'tokenize' in fn]

In [None]:

# Try wordpunct_tokenize on the current child string
nltk.tokenize.wordpunct_tokenize(child_str)

In [None]:

# Try word_tokenize with preserve_line=True on the current child string
nltk.tokenize.word_tokenize(child_str, preserve_line=True)

In [None]:

# Try string_span_tokenize on the current child string, passing r';\s*' as the separator argument
list(nltk.tokenize.string_span_tokenize(child_str, r';\s*'))

In [None]:

# Try sent_tokenize on the current child string (markup included)
nltk.tokenize.sent_tokenize(child_str)

In [None]:

# Try regexp_tokenize on the current child string with the pattern r'\w+'
list(nltk.tokenize.regexp_tokenize(child_str, r'\w+'))

In [None]:

# Try regexp_span_tokenize on the current child string with the pattern r'\s\s+'
list(nltk.tokenize.regexp_span_tokenize(child_str, r'\s\s+'))

In [None]:

# Try line_tokenize on the current child string
nltk.tokenize.line_tokenize(child_str)

In [None]:

# Try casual_tokenize on the current child string
nltk.tokenize.casual_tokenize(child_str)

In [None]:

# Try blankline_tokenize on the current child string
nltk.tokenize.blankline_tokenize(child_str)