In [1]:

# Toggle IPython's pretty printing (%pprint flips the current setting; this run's output reports it turned OFF)
%pprint

Pretty printing has been turned OFF


In [16]:

%matplotlib inline
from IPython.display import clear_output, display
from cohere.error import CohereAPIError
from langchain.chains import LLMChain
from langchain.llms import Cohere, OpenAI
from langchain.prompts import PromptTemplate
from langchain.text_splitter import SpacyTextSplitter
from openai.error import RateLimitError
from pandas import DataFrame
from ratelimit import limits, sleep_and_retry
from tqdm.notebook import tqdm
import humanize
import os
import re
import sys
import time
import warnings
import winsound

# Silence library warnings so they do not clutter the cell outputs
warnings.filterwarnings('ignore')

# Tone settings handed to the beep at the end of the long-running loop
duration = 1000  # milliseconds
freq = 880  # Hz

# Make the project's modules importable; slot 0 holds the script path
# (or '' in a REPL), so the project folder goes in at slot 1
if '../py' not in sys.path:
    sys.path.insert(1, '../py')

In [3]:

# Build the Storage helper, pointed at the project's data and saves folders
from storage import Storage
s = Storage(
    data_folder_path=os.path.abspath('../data'),
    saves_folder_path=os.path.abspath('../saves'),
)

# Build the WebScrapingUtilities helper; its secrets_json attribute is read below
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(
    s=s,
    secrets_json_path=os.path.abspath('../data/secrets/jh_secrets.json'),
)

# Export the API keys from the secrets JSON as environment variables.
# Where to get each key:
#   SERPAPI_API_KEY -> https://serpapi.com/dashboard
#   COHERE_API_KEY  -> https://dashboard.cohere.ai/api-keys
#   STABILITY_KEY   -> https://beta.dreamstudio.ai/membership
for env_name, secret_name in (
    ('SERPAPI_API_KEY', 'SERPAPI_API_KEY'),
    ('COHERE_API_KEY', 'Cohere_API_Key'),
    ('STABILITY_KEY', 'Dream_Studio_API_Key'),
):
    os.environ[env_name] = wsu.secrets_json[secret_name]

In [4]:

# Instantiate the HeaderAnalysis helper
from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

# Instantiate CypherUtilities from the Neo4j connection details in the secrets JSON
from cypher_utils import CypherUtilities
neo4j_secrets = wsu.secrets_json['neo4j']
uri = neo4j_secrets['connect_url']
user = neo4j_secrets['username']
password = neo4j_secrets['password']
cu = CypherUtilities(
    uri=uri, user=user, password=password, driver=None, s=s, ha=ha
)

# Instantiate the SectionLRClassifierUtilities helper
from section_classifier_utils import SectionLRClassifierUtilities
slrcu = SectionLRClassifierUtilities(ha=ha, cu=cu, verbose=False)

# Build the parts-of-speech logistic regression elements only if they are
# missing, then report whether predict_single can be used and how long it took
t1 = time.time()
if not hasattr(slrcu, 'pos_predict_percent_fit_dict'):
    slrcu.build_pos_logistic_regression_elements(sampling_strategy_limit=None, verbose=True)
print(
    'predict_single is now available'
    if hasattr(slrcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)
elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech logistic regression elements built in {duration_str}')

I have 49,163 labeled parts of speech in here
predict_single is now available
Parts-of-speech logistic regression elements built in 9 seconds


In [5]:

# Instantiate the SectionCRFClassifierUtilities helper
from section_classifier_utils import SectionCRFClassifierUtilities
scrfcu = SectionCRFClassifierUtilities(cu=cu, ha=ha, verbose=False)

# Build the parts-of-speech conditional random field elements only if the
# pos_symbol_crf attribute is missing, then report availability and timing.
# NOTE(review): the build guard tests 'pos_symbol_crf' while the availability
# check reads 'pos_predict_percent_fit_dict' -- confirm that having the first
# always implies having the second.
t1 = time.time()
crf_is_missing = not hasattr(scrfcu, 'pos_symbol_crf')
if crf_is_missing:
    scrfcu.build_pos_conditional_random_field_elements(verbose=True)
print(
    'predict_single is now available'
    if hasattr(scrfcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)
elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech conditional random field elements built in {duration_str}')

predict_single is now available
Parts-of-speech conditional random field elements built in 1 second


In [6]:

# Instantiate the SectionSGDClassifierUtilities helper
from section_classifier_utils import SectionSGDClassifierUtilities
ssgdcu = SectionSGDClassifierUtilities(ha=ha, cu=cu, verbose=False)

# Build the parts-of-speech stochastic gradient descent elements only if they
# are missing, then report whether predict_single can be used and how long it took
t1 = time.time()
if not hasattr(ssgdcu, 'pos_predict_percent_fit_dict'):
    ssgdcu.build_pos_stochastic_gradient_descent_elements(sampling_strategy_limit=None, verbose=True)
print(
    'predict_single is now available'
    if hasattr(ssgdcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)
elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech stochastic gradient descent elements built in {duration_str}')

I have 49,163 labeled parts of speech in here
predict_single is now available
Parts-of-speech stochastic gradient descent elements built in 11 seconds


In [7]:

def combine_adjacent(split_strs_list):
    """
    Re-join strings that were split right after a trailing ' and'.

    Walks the list in order; whenever the most recently kept string ends
    with ' and' (case-insensitive), the current string is appended to it
    with a single space instead of being kept as its own item.

    Parameters:
        split_strs_list: iterable of strings in their original order.

    Returns:
        A new list with the merged strings; the input is not modified.
    """
    merged_strs = []
    for fragment in split_strs_list:
        if merged_strs and merged_strs[-1].lower().endswith(' and'):
            # The previous piece was cut off mid-conjunction, so glue this one on
            merged_strs[-1] = ' '.join((merged_strs[-1], fragment))
        else:
            merged_strs.append(fragment)

    return merged_strs

In [20]:

# Define the Trial key rate limit - 5 API calls / minute
@sleep_and_retry
@limits(calls=5, period=60)
def get_suggestion(chain, navigable_parent):
    """
    Run the chain on one navigable parent and return its stripped output.

    The decorators allow at most 5 calls per 60-second period; per the
    ratelimit package, sleep_and_retry waits and retries instead of raising
    once that limit is hit.

    Parameters:
        chain: object exposing run(text), here the LLMChain built in this notebook.
        navigable_parent: the HTML string to rephrase.

    Returns:
        The chain's response with leading and trailing whitespace removed.
    """
    return chain.run(navigable_parent).strip()

In [8]:

# Cypher query returning every navigable parent summarized as "O-RQ", longest first
orq_cypher_str = '''
    // Filter for NavigableParents nodes with an ambiguous SUMMARIZES relationship
    MATCH (np:NavigableParents)
    WHERE size((np)<-[:SUMMARIZES]-(:PartsOfSpeech)) >= 1
    WITH np

    // Find all NavigableParents nodes in the graph with an incoming SUMMARIZES relationship to a PartsOfSpeech node
    MATCH (np)<-[r:SUMMARIZES]-(pos:PartsOfSpeech)
    WHERE pos.pos_symbol = "O-RQ"

    // Return the navigable parent
    RETURN np.navigable_parent AS navigable_parent
    ORDER BY size(navigable_parent) DESC;'''

# Matches an "<orq>Ability to (NN%) X" prefix and captures the capital letter X
cleanup_regex = re.compile(r'^<orq>Ability to \(\d+%\) ([A-Z])')

# Cypher statement that renames one navigable parent in place
cleanup_cypher_str = '''
    MATCH (np:NavigableParents)
    WHERE np.navigable_parent = $old_child_str
    SET np.navigable_parent = $new_child_str;'''

# Abbreviations whose periods (or entity) are replaced before sentence
# splitting, each paired with its replacement text
_fake_stop_pairs = (
    ('e.g.', 'eg'), ('etc.', 'etc'), ('M.S.', 'MS'), ('B.S.', 'BS'),
    ('Ph.D.', 'PhD'), ('(ex.', '(eg'), ('(Ex.', '(eg'), ('U.S.', 'US'),
    ('i.e.', 'ie'), ('&amp;', '&'), ('E.g.', 'eg'), ('Bsc.', 'BS'),
    ('MSc.', 'MS'), ('incl.', 'include'),
)
fake_stops_list = [fake_stop for fake_stop, _ in _fake_stop_pairs]
replacements_list = [replacement for _, replacement in _fake_stop_pairs]

# Matches an opening HTML tag and captures its lower-case element name
tag_regex = re.compile(r'<([a-z][a-z0-9]*)[^<>]*>')


----
# Make this Work for Job Hunting

In [9]:

# Break the long HTML string into sentences and check if each is a qualification string.
# The result is cached as the 'split_orqs_df' pickle; when the pickle already exists it is
# loaded and the rows scoring at least as high as the best "Ability to " row are displayed.
# NOTE: a pickle built before the tag-wrapping fix below holds scores computed on
# malformed "<<orq>>...</<orq>>" strings; delete it to rebuild with corrected scores.
if not s.pickle_exists('split_orqs_df'):
    t1 = time.time()

    def do_cypher_tx(tx, old_child_str, new_child_str):
        """Rename the NavigableParents node(s) whose text is old_child_str to new_child_str."""
        tx.run(query=cleanup_cypher_str, parameters={'old_child_str': old_child_str, 'new_child_str': new_child_str})

    with cu.driver.session() as session:
        df = DataFrame(session.write_transaction(cu.do_cypher_tx, orq_cypher_str))

    # Clean up the "Ability to (5%)" stuff: drop the percentage and lower-case
    # the letter that follows it, both in the database
    mask_series = df.navigable_parent.map(lambda x: bool(cleanup_regex.search(x)))
    for navigable_parent in df[mask_series].navigable_parent:
        new_child_str = cleanup_regex.sub(r'<orq>Ability to \g<1>', navigable_parent)

        # 16 == len('<orq>Ability to '), so index 16 is the first character after the prefix
        new_child_str = new_child_str[:16] + new_child_str[16:17].lower() + new_child_str[17:]
        with cu.driver.session() as session:
            session.write_transaction(
                do_cypher_tx, old_child_str=navigable_parent, new_child_str=new_child_str
            )

    # Query again so the DataFrame reflects the cleaned-up strings
    with cu.driver.session() as session:
        df = DataFrame(session.write_transaction(cu.do_cypher_tx, orq_cypher_str))
    text_splitter = SpacyTextSplitter()
    rows_list = []
    for html_str in df.navigable_parent:

        # Strip the tags and neutralize abbreviations that would look like sentence ends
        unhtml_str = re.sub('</?[^><]+>', '', html_str)
        for fake_stop, replacement in zip(fake_stops_list, replacements_list):
            unhtml_str = unhtml_str.replace(fake_stop, replacement)
        split_strs_list = combine_adjacent([str(split_str) for split_str in text_splitter._tokenizer(unhtml_str).sents])

        # The opening tag depends only on html_str, so look it up once per string
        match_obj = tag_regex.search(html_str)
        for split_str in split_strs_list:
            row_dict = {}

            # Trim trailing punctuation before measuring and scoring
            split_str = re.sub(r'\s*[:;.*]+\s*$', '', split_str)
            row_dict['split_str'] = split_str
            row_dict['char_count'] = len(split_str)
            if match_obj:

                # tag_name keeps the whole opening tag (e.g. '<orq>') because the cached
                # DataFrame and the else-branch below compare against that form
                tag_name = match_obj.group()

                # Wrap with the bare element name; wrapping with the whole tag produced
                # malformed "<<orq>>...</<orq>>" input for the classifiers
                element_name = match_obj.group(1)
                split_str = f'<{element_name}>{split_str}</{element_name}>'
            else:
                tag_name = 'plaintext'
            row_dict['tag_name'] = tag_name

            # Each score is the product of the three classifiers' outputs for that symbol
            for pos_symbol, column_name in (('O-RQ', 'orq_score'), ('O-PQ', 'opq_score')):
                score = 1.0
                for scu_obj in (slrcu, scrfcu, ssgdcu):
                    score *= scu_obj.pos_predict_percent_fit_dict[pos_symbol](split_str)
                row_dict[column_name] = score
            rows_list.append(row_dict)
    split_orqs_df = DataFrame(rows_list)
    duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
    print(f'Split O-RQs DataFrame built in {duration_str}')
    s.store_objects(split_orqs_df=split_orqs_df)
else:
    split_orqs_df = s.load_object('split_orqs_df')
    df = split_orqs_df.copy()

    # Avoid the "<orq>Ability to " hack: use the best-scoring "Ability to " row as
    # the floor and show the rows that score at least that high
    mask_series = (split_orqs_df.tag_name == '<orq>') & split_orqs_df.split_str.map(lambda x: x.startswith('Ability to '))
    display(split_orqs_df[mask_series].sort_values('orq_score').tail())
    min_split_score = split_orqs_df[mask_series].orq_score.max()
    mask_series = (split_orqs_df.orq_score >= min_split_score)
    display(split_orqs_df[mask_series].sort_values('orq_score').head())

Unnamed: 0,split_str,char_count,tag_name,orq_score,opq_score
4096,Ability to jAVA,15,<orq>,0.885057,9.855220999999999e-44
4533,Ability to o,12,<orq>,0.88779,1.38622e-47
5297,Ability to o,12,<orq>,0.88779,1.38622e-47
9262,Ability to wORKDAY,18,<orq>,0.890161,3.8488349999999997e-44
18623,Ability to lead,15,<orq>,0.905593,2.914283e-48


Unnamed: 0,split_str,char_count,tag_name,orq_score,opq_score
18623,Ability to lead,15,<orq>,0.905593,2.914283e-48
16333,Programming (P3 - Advanced),27,<orq>,0.908975,4.871548e-37
16239,Programming (P3 - Advanced),27,<orq>,0.908975,4.871548e-37
16074,Programming (P3 - Advanced),27,<orq>,0.908975,4.871548e-37
14427,Architecture (P2 - Intermediate),32,<orq>,0.920499,1.6341230000000002e-43



----
## Get GPT's help to rephrase badly-concatenated O-RQs

In [10]:

# Build (or load from the 'ability_to_df' pickle) the table of "<orq>Ability to " navigable
# parents, seeded with the rephrasings that have already been reviewed
if not s.pickle_exists('ability_to_df'):
    def do_cypher_tx(tx):
        """Return one dict per result row: navigable_parent and pos_symbol, shortest parent first."""
        cypher_str = '''
            // Filter for NavigableParents nodes with an ambiguous SUMMARIZES relationship
            MATCH (np:NavigableParents)
            WHERE size((np)<-[:SUMMARIZES]-(:PartsOfSpeech)) >= 1
            WITH np

            // Find all NavigableParents nodes in the graph with an
            // incoming SUMMARIZES relationship to a PartsOfSpeech node
            MATCH (np)<-[r:SUMMARIZES]-(pos:PartsOfSpeech)
            WHERE (np.navigable_parent STARTS WITH "<orq>Ability to ")

            // Return the navigable parent
            RETURN
                np.navigable_parent AS navigable_parent,
                pos.pos_symbol AS pos_symbol
            ORDER BY size(navigable_parent) ASC;'''

        return [dict(record.items()) for record in tx.run(query=cypher_str, parameters={})]
    row_objs_list = []
    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(do_cypher_tx)
    if row_objs_list:
        ability_to_df = DataFrame(row_objs_list)
        print(ability_to_df.shape) # (4159, 2)

        # (navigable parent, rephrasing) pairs, applied in this order
        labeled_pairs = [
            # Hand-labeled examples
            ('<orq>Ability to data Lead</orq>', '<orq>Skills as a Data Lead are required.</orq>'),
            ('<orq>Ability to data Lake</orq>', '<orq>Experience working with a Data Lake are required.</orq>'),
            ('<orq>Ability to no travel.</orq>', '<orq>Ability to not have to travel is required.</orq>'),
            ('<orq>Ability to oracle HCM</orq>', '<orq>Experience working with Oracle HCM is required.</orq>'),
            ('<orq>Ability to no Travel.</orq>', '<orq>Ability to not have to travel is required.</orq>'),

            # Suggested and fed back in
            ('<orq>Ability to lead</orq>', '<orq>Leadership skills are required.</orq>'),
            ('<orq>Ability to jIRA).</orq>', '<orq>Proficiency in JIRA is necessary.</orq>'),
            ('<orq>Ability to elysian.</orq>', '<orq>Proficiency in Elysian is required.</orq>'),
            ('<orq>Ability to pMO Lead</orq>', '<orq>Leadership skills in project management are necessary.</orq>'),
            ('<orq>Ability to pVC Lead</orq>', '<orq>Leadership skills in venture capital are necessary.</orq>'),
            ('<orq>Ability to qA Director</orq>', '<orq>Experience as a QA Director is required.</orq>'),
            ('<orq>Ability to tax reports</orq>', '<orq>Knowledge of tax reports is required.</orq>'),
            ('<orq>Ability to pMO Analyst</orq>', '<orq>Experience as a PMO Analyst is required.</orq>'),
            ('<orq>Ability to team Support</orq>', '<orq>Ability to provide team support is required.</orq>'),
            ('<orq>Ability to data Synapse</orq>', '<orq>Experience working with Data Synapse is required.</orq>'),
            ('<orq>Ability to data Support</orq>', '<orq>Skills in data support are required.</orq>'),
            ('<orq>Ability to oCM Advisory</orq>', '<orq>Experience in OCM Advisory is required.</orq>'),
            ('<orq>Ability to training Lead</orq>', '<orq>Leadership in training is required.</orq>'),
            ('<orq>Ability to security Lead</orq>', '<orq>Experience as a Security Lead is required.</orq>'),
        ]
        for navigable_parent_str, suggestion_str in labeled_pairs:
            mask_series = (ability_to_df.navigable_parent == navigable_parent_str)
            ability_to_df.loc[mask_series, 'llm_suggestion'] = suggestion_str

        # Wrap any suggestion that is present but not yet enclosed in <orq> tags
        has_suggestion_series = ~ability_to_df.llm_suggestion.isnull()
        is_wrapped_series = ability_to_df.llm_suggestion.map(lambda x: str(x).startswith('<orq>'))
        mask_series = has_suggestion_series & ~is_wrapped_series
        ability_to_df.loc[mask_series, 'llm_suggestion'] = ability_to_df[mask_series].llm_suggestion.map(lambda x: '<orq>' + x + '</orq>')

        s.store_objects(ability_to_df=ability_to_df)
else:
    ability_to_df = s.load_object('ability_to_df')

    # Report how many rows still lack a suggestion and show the five shortest suggestions
    df = ability_to_df.copy()
    df['suggestion_size'] = df.llm_suggestion.map(lambda x: len(str(x)))
    mask_series = df.llm_suggestion.isnull()
    print(f'We have {df[mask_series].shape[0]} more "Ability to" requirements to fix')
    columns_list = ['navigable_parent', 'pos_symbol', 'llm_suggestion']
    suggested_df = df[~mask_series].sort_values('suggestion_size')
    display(suggested_df[columns_list].head(5))

We have 1388 more "Ability to" requirements to fix


Unnamed: 0,navigable_parent,pos_symbol,llm_suggestion
52,<orq>Ability to aWS Certification.</orq>,O-RQ,<orq>AWS Certification is required.</orq>
0,<orq>Ability to lead</orq>,O-RQ,<orq>Leadership skills are required.</orq>
51,<orq>Ability to ; Average FTE:91%.</orq>,O-RQ,<orq>Average FTE of 91% is required.</orq>
55,<orq>Ability to ; Average FTE:63%.</orq>,O-RQ,<orq>Average FTE of 63% is required.</orq>
513,<orq>Ability to someone with strong excel skil...,O-RQ,<orq>Strong Excel skills are a must.</orq>


In [50]:

# Assemble a few-shot prompt: one instruction line, then a series of worked
# examples in the form
#     HTML STRING: "<orq>...</orq>"
#     =========
#     MINIMUM REQUIREMENT:
#     <rephrased text>
# and finally an open slot for the string to rephrase ({html_str}).
template = 'Rephrase the following HTML STRING so that it reads like a MINIMUM REQUIREMENT.'

# Hand-labeled examples
template += '\n\nHTML STRING: "<orq>Ability to data Lead</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "Skills as a Data Lead are required."
template += '\n\nHTML STRING: "<orq>Ability to data Lake</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "Experience working with a Data Lake are required."
template += '\n\nHTML STRING: "<orq>Ability to no travel.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "Ability to not have to travel is required."
template += '\n\nHTML STRING: "<orq>Ability to oracle HCM</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "Experience working with Oracle HCM is required."
template += '\n\nHTML STRING: "<orq>Ability to no Travel.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "Ability to not have to travel is required."
template += '\n\nHTML STRING: "<orq>Ability to this role is for International Paper (IP) which is on Microsoft Azure Platform, It is a'
template += ' telecom / network engineer role.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "The ability to work with International Paper (IP), on Microsoft Azure Platform, in a telecom / network engineer role."
template += '\n\nHTML STRING: "<orq>Ability to the platform will need to support thousands of applications globally, and real time'
template += ' onboarding into secure environments.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "The ability to get the platform to support thousands of applications globally, and real time onboarding into secure environments is required."
template += '\n\nHTML STRING: "<orq>Ability to support the Learning/Training Lead by designing, developing and deploying the training and'
template += ' performance support materials.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "The ability to support the Learning/Training Lead by designing, developing and deploying the training and performance support materials is required."
template += '\n\nHTML STRING: "<orq>Ability to experience building Ab initio Real-time service SOAP, Restful and queue based streaming system using'
template += ' IBM MQ,Rabbit Queue.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "We are looking for a candidate with experience building Ab initio with real-time service SOAP, Restful and queue based streaming system"
template += " using IBM MQ or Rabbit Queue."
template += '\n\nHTML STRING: "<orq>Ability to the client is looking to automate FRR processes and will require several Informatica/ETL developers'
template += ' to support this work.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "The ability to be a Informatica/ETL developer is required."
template += '\n\nHTML STRING: "<orq>Ability to we?re looking more for an app arch that can help support the Insights Marketplace web application'
template += ' from an arch perspective.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "The ability to be an Application Architect who can help support the Insights Marketplace web application from an architectural perspective is required."
template += '\n\nHTML STRING: "<orq>Ability to ***US PERSONS ONLY*** Client team is in need of an experienced SAP PP resource to support'
template += ' an ongoing SAP PEO implementation.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "US Persons Only: we are looking for an experienced SAP PP resource to support an ongoing SAP PEO implementation."
template += '\n\nHTML STRING: "<orq>Ability to responsible for the configuration and testing of functions in DMS applications/ DERMS based on'
template += ' requirements from Client team.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "We are looking for a candidate who is responsible for the configuration and testing of functions in DMS applications/"
template += "DERMS based on requirements from Client team."
template += '\n\nHTML STRING: "<orq>Ability to the PeopleSoft Finances Functional Lead will work with the client to analyze impacts from the new'
template += ' PUM and lead implementation.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "The ability to be a PeopleSoft Finances Functional Lead who will work with the client to analyze impacts from the new PUM and lead implementation."
template += '\n\nHTML STRING: "<orq>Ability to need some one who has strong PCF expertise , eCommerce domain expertise and LS industry expertise'
template += ' ...specially LS Distribution .</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "We are looking for a candidate with strong PCF expertise, eCommerce domain expertise, and LS industry expertise. Especially LS Distribution."
template += '\n\nHTML STRING: "<orq>Ability to lead the effort to design, build and configure Zscaler\'s Tunnel 2.0 network security capabilities,'
template += ' acting as the primary point of contact.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "Ability lead the effort to design, build and configure Zscaler's Tunnel 2.0 network security capabilities, acting as the primary point of contact."
template += '\n\nHTML STRING: "<orq>Ability to act as the primary point of contact and coordinator for client\'s IT stakeholders, vendors'
template += ' interfacing with CDW, and the offshore support team.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "Ability to act as the primary point of contact and coordinator for client\'s IT stakeholders, vendors interfacing with CDW,"
template += " and the offshore support team."
template += '\n\nHTML STRING: "<orq>Ability to the goal of this Data analytics solution is to give the client pre-emptive information,'
template += ' so they can take corrective action before the month ends.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "Ability to give the client pre-emptive information so they can take corrective action before the month ends."

# Suggested examples fed back in here
template += '\n\nHTML STRING: "<orq>Ability to responsible for development of required custom solution on top of out-of-box ORMB features to suit'
template += ' client business needs.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "We are looking for a candidate who is responsible for development of required custom solution on top of out-of-box ORMB features to suit"
template += " client business needs."
template += '\n\nHTML STRING: "<orq>Ability to as a Developer, these will be your key responsibilities: Coding and developing by applying and'
template += ' design/code best practices.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "We are looking for a Developer who can code and develop by applying and design/code best practices."
template += '\n\nHTML STRING: "<orq>Ability to required Skills: SuccessFactors Compensation certification, Workstream Lead of 3+ SuccessFactors'
template += ' Compensation implementations</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "We are looking for a candidate with SuccessFactors Compensation certification and Workstream Lead of 3+ SuccessFactors Compensation implementations."
template += '\n\nHTML STRING: "<orq>Ability to lead the effort to design, build and configure applications, acting as the primary point of contact'
template += ' for all the legacy systems at Wendys.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += 'Ability to lead the effort to design, build and configure applications, acting as the primary point of contact'
template += ' for all the legacy systems at Wendys.'

# Open slot for the string to rephrase; {html_str} is the template's only input variable.
# NOTE(review): every example above wraps its HTML string in double quotes but this
# slot does not -- confirm whether the slot should be quoted for consistency.
# NOTE(review): presumably PromptTemplate treats braces as placeholders, so any
# literal brace added to an example would need escaping -- verify before adding one.
template += "\n\nHTML STRING: {html_str}\n=========\nMINIMUM REQUIREMENT:"
prompt = PromptTemplate(
    input_variables=['html_str'],
    template=template,
)

In [51]:

# Previously-used OpenAI alternative, kept for reference:
# llm = OpenAI(model_name='text-davinci-003', temperature=1.0, max_retries=1)
# Open question: which model-name strings (pre-trained models made available
# through the `langchain` package) are accepted by the `model=` parameter of
# `langchain.llms.Cohere`?

# Pair the Cohere model (temperature 0) with the few-shot prompt
llm = Cohere(temperature=0, model='command-xlarge-nightly')
chain = LLMChain(prompt=prompt, llm=llm)

In [None]:

# Ask the LLM for a rephrasing of every row that has no suggestion yet. Accepted
# suggestions are written back and pickled after each row; the loop stops at the
# first unrecoverable error or low-scoring suggestion and prints copy-paste snippets.
ability_to_df = s.load_object('ability_to_df')
mask_series = ability_to_df.llm_suggestion.isnull()
unlabeled_df = ability_to_df[mask_series]
for (row_index, row_series) in tqdm(unlabeled_df.iterrows(), total=unlabeled_df.shape[0]):
    navigable_parent = row_series.navigable_parent
    try:
        llm_suggestion = get_suggestion(chain, navigable_parent)
        time.sleep(1) # Sleep for 1 second between each call
    except RateLimitError:
        print("You've already spent 10 bucks on this; your wife will be pissed")
        break
    except CohereAPIError as e:
        message_str = str(e).strip()

        # An over-long prompt cannot be fixed by waiting, so stop
        if 'too many tokens' in message_str:
            print(message_str)
            break

        # Any other Cohere error: wait a minute and try this row once more
        time.sleep(60)
        llm_suggestion = get_suggestion(chain, navigable_parent)
        time.sleep(1) # Sleep for 1 second between each call
    except Exception as e:

        # Unknown failure: print the error and a template/mask stub for this row, then stop
        print(f'{e.__class__.__name__} error: {str(e).strip()}')
        print()
        print(f"""template += '\\n\\nHTML STRING: "{navigable_parent}"\\n=========\\n'""")
        print("""template += 'MINIMUM REQUIREMENT:\\n'""")
        # print(f'template += "{llm_suggestion}"')
        print(f"mask_series = (ability_to_df.navigable_parent == '{navigable_parent}')")
        # print(f"ability_to_df.loc[mask_series, 'llm_suggestion'] = '<orq>{llm_suggestion}</orq>'")
        break

    # Best O-RQ score any of the three classifiers gives the suggestion (floor 0.0)
    score = max(
        0.0,
        *(scu_obj.pos_predict_percent_fit_dict['O-RQ'](llm_suggestion) for scu_obj in [slrcu, scrfcu, ssgdcu])
    )
    if score < 0.38346451108039514:

        # Suggestion scores too low: print snippets for manual review and stop
        clear_output(wait=True)
        print(score)
        print(f"""template += '\\n\\nHTML STRING: "{navigable_parent}"\\n=========\\n'""")
        print("""template += 'MINIMUM REQUIREMENT:\\n'""")
        print(f'template += "{llm_suggestion}"')
        print(f"mask_series = (ability_to_df.navigable_parent == '{navigable_parent}')")
        print(f"ability_to_df.loc[mask_series, 'llm_suggestion'] = '<opq>{llm_suggestion}</opq>'")
        print('s.store_objects(ability_to_df=ability_to_df, verbose=True)')
        break

    # Suggestion accepted: record it and checkpoint the DataFrame
    ability_to_df.loc[row_index, 'llm_suggestion'] = '<orq>' + llm_suggestion + '</orq>'
    s.store_objects(ability_to_df=ability_to_df, verbose=False)

# Audible signal that the loop has ended
wsu.beep(freq, duration)

In [57]:

# Character count of the fully rendered prompt for the current navigable_parent
prompt_str = prompt.format(html_str=navigable_parent)
print(len(prompt_str))

6583


In [58]:

# Show each classifier's O-RQ score for the current llm_suggestion on one line
for classifier in (slrcu, scrfcu, ssgdcu):
    orq_percent = classifier.pos_predict_percent_fit_dict['O-RQ'](llm_suggestion)
    print(orq_percent, end=', ')

0.5789182751664326, 0.7504786553845737, 0.3941916696824917, 

In [59]:

# Go ahead and update what you can in the database: every O-RQ row that has a
# suggestion gets its navigable parent replaced by that suggestion
def do_cypher_tx(tx, old_child_str, new_child_str):
    """
    Rename the NavigableParents node(s) whose text equals old_child_str.

    Parameters:
        tx: the transaction handed in by session.write_transaction.
        old_child_str: current navigable_parent text to match.
        new_child_str: text to store in its place.
    """
    cypher_str = '''
            MATCH (np:NavigableParents)
            WHERE np.navigable_parent = $old_child_str
            SET np.navigable_parent = $new_child_str;'''
    tx.run(query=cypher_str, parameters={'old_child_str': old_child_str, 'new_child_str': new_child_str})

mask_series = ~ability_to_df.llm_suggestion.isnull() & (ability_to_df.pos_symbol == 'O-RQ')
for row_index, row_series in ability_to_df[mask_series].iterrows():
    old_child_str = row_series.navigable_parent
    new_child_str = row_series.llm_suggestion

    # Match on this row's own text; the original passed the stale global
    # navigable_parent left by an earlier cell, so every row hit the same node
    with cu.driver.session() as session:
        session.write_transaction(do_cypher_tx, old_child_str=old_child_str, new_child_str=new_child_str)