In [1]:

%pprint

Pretty printing has been turned OFF


In [2]:

%matplotlib inline
from IPython.display import display
from PIL import Image
from datetime import datetime
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings.cohere import CohereEmbeddings
from langchain.llms import Cohere
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import SpacyTextSplitter
from langchain.vectorstores import Chroma
from neo4j.exceptions import ServiceUnavailable
from openai.error import RateLimitError
from pandas import DataFrame
from stability_sdk import client as stability_client
import cohere
import getpass
import humanize
import io
import os
import pandas as pd
import random
import re
import stability_sdk.interfaces.gooseai.generation.generation_pb2 as stability_generation
import sys
import time
import warnings
import winsound

# Silence every warning category for the rest of the session
warnings.filterwarnings(action='ignore')

# Tone settings: length in milliseconds and pitch in Hz
# NOTE(review): presumably meant for winsound.Beep; nothing in the visible cells uses them
duration, freq = 1000, 880

# Put the sibling ../py folder right behind slot 0, which holds the
# script path (or '' in a REPL), so modules stored there can be imported
sys.path.insert(1, '../py')

In [3]:

# Get the Storage object
# (imported in this cell rather than the import cell; the module presumably lives
# in ../py, which is only added to sys.path at the end of the previous cell)
from storage import Storage
s = Storage(
    data_folder_path=os.path.abspath('../data'),
    saves_folder_path=os.path.abspath('../saves')
)

# Get the WebScrapingUtilities object; it is handed the path of the secrets JSON file
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(
    s=s,
    secrets_json_path=os.path.abspath('../data/secrets/jh_secrets.json')
)

# Export the SerpAPI key through the environment
os.environ['SERPAPI_API_KEY'] = wsu.secrets_json['SERPAPI_API_KEY']

# The Cohere key is read from the secrets JSON (never paste a key inline or share it publicly),
# exported through the environment, and used to build a Cohere client
co_key = wsu.secrets_json['Cohere_API_Key']
os.environ['COHERE_API_KEY'] = co_key
co = cohere.Client(co_key)

# To get your API key, visit https://beta.dreamstudio.ai/membership
# The Stability key is likewise taken from the secrets JSON and exported before the client is built
os.environ['STABILITY_KEY'] = wsu.secrets_json['Dream_Studio_API_Key']
stability_api = stability_client.StabilityInference(
    key=os.environ['STABILITY_KEY'], 
    verbose=True,
)

In [4]:

# Get the HeaderAnalysis object
from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

# Get the CypherUtilities object and Neo4j driver
# (the connection settings come from the "neo4j" entry of the secrets JSON)
from cypher_utils import CypherUtilities
uri = wsu.secrets_json['neo4j']['connect_url']
user =  wsu.secrets_json['neo4j']['username']
password = wsu.secrets_json['neo4j']['password']
cu = CypherUtilities(
    uri=uri, user=user, password=password, driver=None, s=s, ha=ha
)

# Get the SectionLRClassifierUtilities object
from section_classifier_utils import SectionLRClassifierUtilities
slrcu = SectionLRClassifierUtilities(ha=ha, cu=cu, verbose=False)

# Check if the slrcu has built its parts-of-speech logistic regression elements;
# the build is only requested when the pos_predict_percent_fit_dict attribute is missing
t1 = time.time()
if not hasattr(slrcu, 'pos_predict_percent_fit_dict'):
    slrcu.build_pos_logistic_regression_elements(sampling_strategy_limit=None, verbose=True)

# Report whether the attribute is present after the (possible) build
if hasattr(slrcu, 'pos_predict_percent_fit_dict'):
    print('predict_single is now available')
else:
    print('predict_single is not available')

# Report the elapsed wall-clock time since t1 (printed even when no build was needed)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech logistic regression elements built in {duration_str}')

I have 49,163 labeled parts of speech in here
predict_single is now available
Parts-of-speech logistic regression elements built in 16 seconds


In [5]:

# Get the SectionCRFClassifierUtilities object
from section_classifier_utils import SectionCRFClassifierUtilities
scrfcu = SectionCRFClassifierUtilities(cu=cu, ha=ha, verbose=False)

# Check if the scrfcu has built its parts-of-speech conditional random field elements;
# the build is only requested when the pos_symbol_crf attribute is missing
# NOTE(review): the build guard looks at pos_symbol_crf while the report below looks at
# pos_predict_percent_fit_dict (the sibling classifier cells use the latter for both) - confirm this is intended
t1 = time.time()
if not hasattr(scrfcu, 'pos_symbol_crf'):
    scrfcu.build_pos_conditional_random_field_elements(verbose=True)

# Report whether the prediction dictionary is present after the (possible) build
if hasattr(scrfcu, 'pos_predict_percent_fit_dict'):
    print('predict_single is now available')
else:
    print('predict_single is not available')

# Report the elapsed wall-clock time since t1 (printed even when no build was needed)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech conditional random field elements built in {duration_str}')

predict_single is now available
Parts-of-speech conditional random field elements built in 2 seconds


In [6]:

# Get the SectionSGDClassifierUtilities object
from section_classifier_utils import SectionSGDClassifierUtilities
ssgdcu = SectionSGDClassifierUtilities(ha=ha, cu=cu, verbose=False)

# Check if the ssgdcu has built its parts-of-speech stochastic gradient descent elements;
# the build is only requested when the pos_predict_percent_fit_dict attribute is missing
t1 = time.time()
if not hasattr(ssgdcu, 'pos_predict_percent_fit_dict'):
    ssgdcu.build_pos_stochastic_gradient_descent_elements(sampling_strategy_limit=None, verbose=True)

# Report whether the attribute is present after the (possible) build
if hasattr(ssgdcu, 'pos_predict_percent_fit_dict'):
    print('predict_single is now available')
else:
    print('predict_single is not available')

# Report the elapsed wall-clock time since t1 (printed even when no build was needed)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech stochastic gradient descent elements built in {duration_str}')

I have 49,163 labeled parts of speech in here
predict_single is now available
Parts-of-speech stochastic gradient descent elements built in 15 seconds


In [7]:

def combine_adjacent(split_strs_list):
    """
    Glue a fragment onto its predecessor whenever the predecessor ends with ' and'.

    The check is case-insensitive and is made against the already-merged
    predecessor, so a run of fragments that each end in ' and' collapses into
    a single string. Fragments are joined with one space.

    Parameters:
        split_strs_list: iterable of strings, in reading order.

    Returns:
        A new list of strings; the input is not modified.
    """
    merged_list = []
    for fragment in split_strs_list:
        dangling_and = bool(merged_list) and merged_list[-1].lower().endswith(' and')
        if dangling_and:
            merged_list[-1] += ' ' + fragment
        else:
            merged_list.append(fragment)

    return merged_list

In [8]:

# Cypher query returning the navigable_parent text of the NavigableParents nodes that are
# summarized by a PartsOfSpeech node whose pos_symbol is "O-RQ", longest text first
orq_cypher_str = '''
    // Filter for NavigableParents nodes with an ambiguous SUMMARIZES relationship
    MATCH (np:NavigableParents)
    WHERE size((np)<-[:SUMMARIZES]-(:PartsOfSpeech)) >= 1
    WITH np

    // Find all NavigableParents nodes in the graph with an incoming SUMMARIZES relationship to a PartsOfSpeech node
    MATCH (np)<-[r:SUMMARIZES]-(pos:PartsOfSpeech)
    WHERE pos.pos_symbol = "O-RQ"

    // Return the navigable parent
    RETURN np.navigable_parent AS navigable_parent
    ORDER BY size(navigable_parent) DESC;'''

# Matches text that starts with '<orq>Ability to (NN%) ' followed by a capital letter;
# group 1 captures that capital letter
cleanup_regex = re.compile(r'^<orq>Ability to \(\d+%\) ([A-Z])')

# Parameterized Cypher statement that replaces the navigable_parent text of the nodes
# matching $old_child_str with $new_child_str
cleanup_cypher_str = '''
    MATCH (np:NavigableParents)
    WHERE np.navigable_parent = $old_child_str
    SET np.navigable_parent = $new_child_str;'''

# Parallel lists, paired by position: each abbreviation or entity in fake_stops_list has its
# substitute at the same index of replacements_list (the substitutes contain no periods)
fake_stops_list = ['e.g.', 'etc.', 'M.S.', 'B.S.', 'Ph.D.', '(ex.', '(Ex.',
                   'U.S.', 'i.e.', '&amp;', 'E.g.', 'Bsc.', 'MSc.', 'incl.']
replacements_list = ['eg', 'etc', 'MS', 'BS', 'PhD', '(eg', '(eg', 'US',
                     'ie', '&', 'eg', 'BS', 'MS', 'include']

# Matches a lowercase opening tag such as '<orq>' (attributes allowed); group 1 is the element name
tag_regex = re.compile('<([a-z][a-z0-9]*)[^<>]*>')


----
# Make this Work for Job Hunting

In [9]:

# Break the long HTML string into sentences and check if each is a qualification string
if not s.pickle_exists('split_orqs_df'):
    t1 = time.time()
    with cu.driver.session() as session:
        df = DataFrame(session.write_transaction(cu.do_cypher_tx, orq_cypher_str))

    # Clean up the "Ability to (5%)" stuff: drop the percentage and lower-case the
    # first character that follows the 16-character '<orq>Ability to ' prefix
    def do_cypher_tx(tx, old_child_str, new_child_str):
        """Replace the navigable_parent text old_child_str with new_child_str in the graph."""
        tx.run(query=cleanup_cypher_str, parameters={'old_child_str': old_child_str, 'new_child_str': new_child_str})

    mask_series = df.navigable_parent.map(lambda x: bool(cleanup_regex.search(x)))
    for navigable_parent in df[mask_series].navigable_parent:
        new_child_str = cleanup_regex.sub(r'<orq>Ability to \g<1>', navigable_parent)
        new_child_str = new_child_str[:16] + new_child_str[16:17].lower() + new_child_str[17:]
        with cu.driver.session() as session:
            session.write_transaction(do_cypher_tx, old_child_str=navigable_parent, new_child_str=new_child_str)

    # Read the O-RQ strings again so the cleaned-up versions are the ones that get split
    with cu.driver.session() as session:
        df = DataFrame(session.write_transaction(cu.do_cypher_tx, orq_cypher_str))
    text_splitter = SpacyTextSplitter()
    classifiers_list = [slrcu, scrfcu, ssgdcu]
    rows_list = []
    for html_str in df.navigable_parent:

        # Strip the tags, then neutralize abbreviations whose periods could be read as sentence ends
        unhtml_str = re.sub('</?[^><]+>', '', html_str)
        for fake_stop, replacement in zip(fake_stops_list, replacements_list):
            unhtml_str = unhtml_str.replace(fake_stop, replacement)

        # The enclosing tag depends only on html_str, so look it up once per string
        match_obj = tag_regex.search(html_str)

        split_strs_list = combine_adjacent([str(split_str) for split_str in text_splitter._tokenizer(unhtml_str).sents])
        for split_str in split_strs_list:

            # Trim trailing punctuation before measuring and scoring the sentence
            split_str = re.sub(r'\s*[:;.*]+\s*$', '', split_str)
            row_dict = {'split_str': split_str, 'char_count': len(split_str)}
            if match_obj:

                # tag_name keeps the whole opening tag (the cached branch below filters on '<orq>'),
                # but the sentence is re-wrapped with the bare element name; wrapping with the
                # whole match used to produce '<<orq>>...</<orq>>'
                tag_name = match_obj.group()
                element_name = match_obj.group(1)
                split_str = f'<{element_name}>{split_str}</{element_name}>'
            else:
                tag_name = 'plaintext'
            row_dict['tag_name'] = tag_name

            # Each score is the product of the three classifiers' outputs for that symbol
            for pos_symbol, column_name in [('O-RQ', 'orq_score'), ('O-PQ', 'opq_score')]:
                score = 1.0
                for scu_obj in classifiers_list:
                    score *= scu_obj.pos_predict_percent_fit_dict[pos_symbol](split_str)
                row_dict[column_name] = score
            rows_list.append(row_dict)
    split_orqs_df = DataFrame(rows_list)
    duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
    print(f'Split O-RQs DataFrame built in {duration_str}')
    s.store_objects(split_orqs_df=split_orqs_df)
else:
    split_orqs_df = s.load_object('split_orqs_df')
    df = split_orqs_df.copy()

    # Avoid the "<orq>Ability to " hack: take the best score among the hacked sentences
    # as the floor and show what scores at or above it
    mask_series = (split_orqs_df.tag_name == '<orq>') & split_orqs_df.split_str.map(lambda x: x.startswith('Ability to '))
    display(split_orqs_df[mask_series].sort_values('orq_score').tail())
    min_split_score = split_orqs_df[mask_series].orq_score.max()
    mask_series = (split_orqs_df.orq_score >= min_split_score)
    display(split_orqs_df[mask_series].sort_values('orq_score').head())

Unnamed: 0,split_str,char_count,tag_name,orq_score,opq_score
4096,Ability to jAVA,15,<orq>,0.885057,9.855220999999999e-44
4533,Ability to o,12,<orq>,0.88779,1.38622e-47
5297,Ability to o,12,<orq>,0.88779,1.38622e-47
9262,Ability to wORKDAY,18,<orq>,0.890161,3.8488349999999997e-44
18623,Ability to lead,15,<orq>,0.905593,2.914283e-48


Unnamed: 0,split_str,char_count,tag_name,orq_score,opq_score
18623,Ability to lead,15,<orq>,0.905593,2.914283e-48
16333,Programming (P3 - Advanced),27,<orq>,0.908975,4.871548e-37
16239,Programming (P3 - Advanced),27,<orq>,0.908975,4.871548e-37
16074,Programming (P3 - Advanced),27,<orq>,0.908975,4.871548e-37
14427,Architecture (P2 - Intermediate),32,<orq>,0.920499,1.6341230000000002e-43



----
## Get GPT's help to rephrase badly-concatenated O-RQs

In [10]:

if s.pickle_exists('ability_to_df'):

    # Cached: load the frame and summarize how much rephrasing is still outstanding
    ability_to_df = s.load_object('ability_to_df')
    df = ability_to_df.copy()
    df['suggestion_size'] = df.llm_suggestion.map(lambda x: len(str(x)))
    mask_series = df.llm_suggestion.isnull()
    print(f'We have {df[mask_series].shape[0]} more "Ability to" requirements to fix')
    columns_list = ['navigable_parent', 'pos_symbol', 'llm_suggestion']
    display(df[~mask_series].sort_values('suggestion_size')[columns_list].head(5))
else:
    def do_cypher_tx(tx):
        """Return one dict per navigable parent that starts with '<orq>Ability to ', shortest first."""
        cypher_str = '''
            // Filter for NavigableParents nodes with an ambiguous SUMMARIZES relationship
            MATCH (np:NavigableParents)
            WHERE size((np)<-[:SUMMARIZES]-(:PartsOfSpeech)) >= 1
            WITH np

            // Find all NavigableParents nodes in the graph with an
            // incoming SUMMARIZES relationship to a PartsOfSpeech node
            MATCH (np)<-[r:SUMMARIZES]-(pos:PartsOfSpeech)
            WHERE (np.navigable_parent STARTS WITH "<orq>Ability to ")

            // Return the navigable parent
            RETURN
                np.navigable_parent AS navigable_parent,
                pos.pos_symbol AS pos_symbol
            ORDER BY size(navigable_parent) ASC;'''

        return [dict(record.items()) for record in tx.run(query=cypher_str, parameters={})]

    row_objs_list = []
    with cu.driver.session() as session:
        row_objs_list = session.write_transaction(do_cypher_tx)
    if row_objs_list:
        ability_to_df = DataFrame(row_objs_list)
        print(ability_to_df.shape) # (4159, 2)

        # Known rephrasings, keyed by the exact navigable parent they replace
        suggestions_dict = {

            # Hand-labeled examples
            '<orq>Ability to data Lead</orq>': '<orq>Skills as a Data Lead are required.</orq>',
            '<orq>Ability to data Lake</orq>': '<orq>Experience working with a Data Lake are required.</orq>',
            '<orq>Ability to no travel.</orq>': '<orq>Ability to not have to travel is required.</orq>',
            '<orq>Ability to oracle HCM</orq>': '<orq>Experience working with Oracle HCM is required.</orq>',
            '<orq>Ability to no Travel.</orq>': '<orq>Ability to not have to travel is required.</orq>',

            # Suggested and fed back in
            '<orq>Ability to lead</orq>': '<orq>Leadership skills are required.</orq>',
            '<orq>Ability to jIRA).</orq>': '<orq>Proficiency in JIRA is necessary.</orq>',
            '<orq>Ability to elysian.</orq>': '<orq>Proficiency in Elysian is required.</orq>',
            '<orq>Ability to pMO Lead</orq>': '<orq>Leadership skills in project management are necessary.</orq>',
            '<orq>Ability to pVC Lead</orq>': '<orq>Leadership skills in venture capital are necessary.</orq>',
            '<orq>Ability to qA Director</orq>': '<orq>Experience as a QA Director is required.</orq>',
            '<orq>Ability to tax reports</orq>': '<orq>Knowledge of tax reports is required.</orq>',
            '<orq>Ability to pMO Analyst</orq>': '<orq>Experience as a PMO Analyst is required.</orq>',
            '<orq>Ability to team Support</orq>': '<orq>Ability to provide team support is required.</orq>',
            '<orq>Ability to data Synapse</orq>': '<orq>Experience working with Data Synapse is required.</orq>',
            '<orq>Ability to data Support</orq>': '<orq>Skills in data support are required.</orq>',
            '<orq>Ability to oCM Advisory</orq>': '<orq>Experience in OCM Advisory is required.</orq>',
            '<orq>Ability to training Lead</orq>': '<orq>Leadership in training is required.</orq>',
            '<orq>Ability to security Lead</orq>': '<orq>Experience as a Security Lead is required.</orq>',
        }
        for original_str, suggestion_str in suggestions_dict.items():
            mask_series = (ability_to_df.navigable_parent == original_str)
            ability_to_df.loc[mask_series, 'llm_suggestion'] = suggestion_str

        # Wrap any suggestion that is present but does not yet start with the <orq> tag
        has_suggestion_series = ~ability_to_df.llm_suggestion.isnull()
        is_untagged_series = ~ability_to_df.llm_suggestion.map(lambda x: str(x).startswith('<orq>'))
        mask_series = has_suggestion_series & is_untagged_series
        ability_to_df.loc[mask_series, 'llm_suggestion'] = ability_to_df[mask_series].llm_suggestion.map(lambda x: '<orq>' + x + '</orq>')

        s.store_objects(ability_to_df=ability_to_df)

We have 2725 more "Ability to" requirements to fix


Unnamed: 0,navigable_parent,pos_symbol,llm_suggestion
52,<orq>Ability to aWS Certification.</orq>,O-RQ,<orq>AWS Certification is required.</orq>
0,<orq>Ability to lead</orq>,O-RQ,<orq>Leadership skills are required.</orq>
51,<orq>Ability to ; Average FTE:91%.</orq>,O-RQ,<orq>Average FTE of 91% is required.</orq>
55,<orq>Ability to ; Average FTE:63%.</orq>,O-RQ,<orq>Average FTE of 63% is required.</orq>
513,<orq>Ability to someone with strong excel skil...,O-RQ,<orq>Strong Excel skills are a must.</orq>


In [129]:

# Few-shot prompt: one instruction line followed by worked examples. Every example is
# appended as three pieces - the quoted HTML string plus a '=========' divider, the
# 'MINIMUM REQUIREMENT:' label, and the rephrased text (long strings are split over
# several += lines only to keep the source narrow)
template = 'Rephrase the following HTML STRING so that it reads like a MINIMUM REQUIREMENT.'

# Hand-labeled examples
template += '\n\nHTML STRING: "<orq>Ability to data Lead</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "Skills as a Data Lead are required."
template += '\n\nHTML STRING: "<orq>Ability to data Lake</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "Experience working with a Data Lake are required."
template += '\n\nHTML STRING: "<orq>Ability to no travel.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "Ability to not have to travel is required."
template += '\n\nHTML STRING: "<orq>Ability to oracle HCM</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "Experience working with Oracle HCM is required."
template += '\n\nHTML STRING: "<orq>Ability to no Travel.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "Ability to not have to travel is required."
template += '\n\nHTML STRING: "<orq>Ability to this role is for International Paper (IP) which is on Microsoft Azure Platform, It is a'
template += ' telecom / network engineer role.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "The ability to work with International Paper (IP), on Microsoft Azure Platform, in a telecom / network engineer role."
template += '\n\nHTML STRING: "<orq>Ability to the platform will need to support thousands of applications globally, and real time'
template += ' onboarding into secure environments.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "The ability to get the platform to support thousands of applications globally, and real time onboarding into secure environments is required."
template += '\n\nHTML STRING: "<orq>Ability to support the Learning/Training Lead by designing, developing and deploying the training and'
template += ' performance support materials.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "The ability to support the Learning/Training Lead by designing, developing and deploying the training and performance support materials is required."
template += '\n\nHTML STRING: "<orq>Ability to experience building Ab initio Real-time service SOAP, Restful and queue based streaming system using'
template += ' IBM MQ,Rabbit Queue.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "We are looking for a candidate with experience building Ab initio with real-time service SOAP, Restful and queue based streaming system"
template += " using IBM MQ or Rabbit Queue."
template += '\n\nHTML STRING: "<orq>Ability to the client is looking to automate FRR processes and will require several Informatica/ETL developers'
template += ' to support this work.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "The ability to be a Informatica/ETL developer is required."
template += '\n\nHTML STRING: "<orq>Ability to we?re looking more for an app arch that can help support the Insights Marketplace web application'
template += ' from an arch perspective.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "The ability to be an Application Architect who can help support the Insights Marketplace web application from an architectural perspective is required."
template += '\n\nHTML STRING: "<orq>Ability to ***US PERSONS ONLY*** Client team is in need of an experienced SAP PP resource to support'
template += ' an ongoing SAP PEO implementation.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "US Persons Only: we are looking for an experienced SAP PP resource to support an ongoing SAP PEO implementation."
template += '\n\nHTML STRING: "<orq>Ability to responsible for the configuration and testing of functions in DMS applications/ DERMS based on'
template += ' requirements from Client team.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "We are looking for a candidate who is responsible for the configuration and testing of functions in DMS applications/"
template += "DERMS based on requirements from Client team."
template += '\n\nHTML STRING: "<orq>Ability to the PeopleSoft Finances Functional Lead will work with the client to analyze impacts from the new'
template += ' PUM and lead implementation.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "The ability to be a PeopleSoft Finances Functional Lead who will work with the client to analyze impacts from the new PUM and lead implementation."
template += '\n\nHTML STRING: "<orq>Ability to need some one who has strong PCF expertise , eCommerce domain expertise and LS industry expertise'
template += ' ...specially LS Distribution .</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "We are looking for a candidate with strong PCF expertise, eCommerce domain expertise, and LS industry expertise. Especially LS Distribution."

# Suggested and fed back in
# template += '\n\nHTML STRING: "<orq>Ability to lead</orq>"\n=========\n'
# template += 'MINIMUM REQUIREMENT:\n'
# template += "Leadership skills are required."
# template += '\n\nHTML STRING: "<orq>Ability to jIRA).</orq>"\n=========\n'
# template += 'MINIMUM REQUIREMENT:\n'
# template += "Proficiency in JIRA is necessary."
# template += '\n\nHTML STRING: "<orq>Ability to elysian.</orq>"\n=========\n'
# template += 'MINIMUM REQUIREMENT:\n'
# template += "Proficiency in Elysian is required."
# template += '\n\nHTML STRING: "<orq>Ability to pMO Lead</orq>"\n=========\n'
# template += 'MINIMUM REQUIREMENT:\n'
# template += "Leadership skills in project management are necessary."
# template += '\n\nHTML STRING: "<orq>Ability to pVC Lead</orq>"\n=========\n'
# template += 'MINIMUM REQUIREMENT:\n'
# template += "Leadership skills in venture capital are necessary."
# template += '\n\nHTML STRING: "<orq>Ability to qA Director</orq>"\n=========\n'
# template += 'MINIMUM REQUIREMENT:\n'
# template += "Experience as a QA Director is required."
# template += '\n\nHTML STRING: "<orq>Ability to tax reports</orq>"\n=========\n'
# template += 'MINIMUM REQUIREMENT:\n'
# template += "Knowledge of tax reports is required."
# template += '\n\nHTML STRING: "<orq>Ability to pMO Analyst</orq>"\n=========\n'
# template += 'MINIMUM REQUIREMENT:\n'
# template += "Experience as a PMO Analyst is required."
# template += '\n\nHTML STRING: "<orq>Ability to team Support</orq>"\n=========\n'
# template += 'MINIMUM REQUIREMENT:\n'
# template += "Ability to provide team support is required."
# template += '\n\nHTML STRING: "<orq>Ability to data Synapse</orq>"\n=========\n'
# template += 'MINIMUM REQUIREMENT:\n'
# template += "Experience working with Data Synapse is required."
# More worked examples appended to the prompt text, each in the same three-piece layout:
# quoted HTML string plus divider, 'MINIMUM REQUIREMENT:' label, rephrased text
template += '\n\nHTML STRING: "<orq>Ability to data Support</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "Skills in data support are required."
template += '\n\nHTML STRING: "<orq>Ability to oCM Advisory</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "Experience in OCM Advisory is required."
template += '\n\nHTML STRING: "<orq>Ability to training Lead</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "Leadership in training is required."
template += '\n\nHTML STRING: "<orq>Ability to security Lead</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "Experience as a Security Lead is required."
template += '\n\nHTML STRING: "<orq>Ability to interact with business and works with dev team to incorporate new features.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "A candidate who is able to interact with business and works with dev team to incorporate new features is required."
template += '\n\nHTML STRING: "<orq>Ability to work with other architects to model, design, and lead the data modeling, application, interface'
template += ' and database activities.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "We are looking for a Data Architect who can work with other architects to model, design, and lead the data modeling, application,"
template += " interface and database activities."
template += '\n\nHTML STRING: "<orq>Ability to resource must understand end-to-end Treasury process dependencies, integration with other'
template += ' Finance and cross-team modules.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "We are looking for a resource who understands end-to-end Treasury process dependencies, integration with other Finance and cross-team modules."
template += '\n\nHTML STRING: "<orq>Ability to responsible for development of required custom solution on top of out-of-box ORMB features to suit'
template += ' client business needs.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "We are looking for a candidate who is responsible for development of required custom solution on top of out-of-box ORMB features to suit"
template += " client business needs."
template += '\n\nHTML STRING: "<orq>Ability to as a Developer, these will be your key responsibilities: Coding and developing by applying and'
template += ' design/code best practices.</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "We are looking for a Developer who can code and develop by applying and design/code best practices."
template += '\n\nHTML STRING: "<orq>Ability to required Skills: SuccessFactors Compensation certification, Workstream Lead of 3+ SuccessFactors'
template += ' Compensation implementations</orq>"\n=========\n'
template += 'MINIMUM REQUIREMENT:\n'
template += "We are looking for a candidate with SuccessFactors Compensation certification and Workstream Lead of 3+ SuccessFactors Compensation implementations."

# Final, open-ended slot: the string to rephrase is substituted for {html_str} and the
# text stops at the label so the model supplies the requirement
# NOTE(review): the worked examples wrap the HTML string in double quotes but this slot does not - confirm intended
template += "\n\nHTML STRING: {html_str}\n=========\nMINIMUM REQUIREMENT:"
prompt = PromptTemplate(
    input_variables=['html_str'],
    template=template,
)

In [130]:

# Earlier OpenAI configuration, kept for reference:
# llm = OpenAI(model_name='text-davinci-003', temperature=1.0, max_retries=1)
# TODO: find out which model-name strings `langchain.llms.Cohere`
# accepts for its `model=` parameter besides the one used here

# Cohere model at temperature 0, chained to the few-shot rephrasing prompt
llm = Cohere(model='command-xlarge-nightly', temperature=0)
chain = LLMChain(llm=llm, prompt=prompt)

In [133]:

from tqdm.notebook import tqdm
from cohere.error import CohereAPIError
from ratelimit import limits, sleep_and_retry
from IPython.display import clear_output

# Reload the saved frame and flag the rows that still have no suggestion
ability_to_df = s.load_object('ability_to_df')
mask_series = ability_to_df.llm_suggestion.isnull()

# Throttle to the trial-key quota of 5 API calls per 60 seconds
@sleep_and_retry
@limits(calls=5, period=60)
def get_suggestion(chain, navigable_parent):
    """
    Run the chain on one HTML string and return its completion.

    Parameters:
        chain: object whose run() method accepts the HTML string.
        navigable_parent: the HTML string to rephrase.

    Returns:
        The chain's output with leading and trailing whitespace removed.
    """
    return chain.run(navigable_parent).strip()

# Lowest O-RQ score (best of the three classifiers) at which a suggestion is saved without review
min_orq_score = 0.38576917248603376

# Filter the rows that still need a suggestion once, for both the iterator and the progress total
pending_df = ability_to_df[mask_series]
for (row_index, row_series) in tqdm(pending_df.iterrows(), total=pending_df.shape[0]):
    navigable_parent = row_series.navigable_parent
    try:
        llm_suggestion = get_suggestion(chain, navigable_parent)
        time.sleep(1) # Sleep for 1 second between each call
    except RateLimitError:
        print("You've already spent 10 bucks on this; your wife will be pissed")
        break
    except CohereAPIError as e:
        message_str = str(e).strip()
        if 'too many tokens' in message_str:

            # The prompt itself is too long, so retrying cannot help
            print(message_str)
            break

        # Back off for a minute and give this row one more try; a second API
        # error ends the run with a message instead of an unhandled traceback
        time.sleep(60)
        try:
            llm_suggestion = get_suggestion(chain, navigable_parent)
        except CohereAPIError as retry_error:
            print(f'{retry_error.__class__} error: {str(retry_error).strip()}')
            break
        time.sleep(1) # Sleep for 1 second between each call
    except Exception as e:

        # Anything else: report it with a snippet identifying the failing row, then stop
        # (no suggestion exists at this point, so only the lines that do not need one are printed)
        print(f'{e.__class__} error: {str(e).strip()}')
        print()
        print(f"""template += '\\n\\nHTML STRING: "{navigable_parent}"\\n=========\\n'""")
        print("""template += 'MINIMUM REQUIREMENT:\\n'""")
        print(f"mask_series = (ability_to_df.navigable_parent == '{navigable_parent}')")
        break

    # Score the suggestion with each classifier and keep the most favorable result
    score = 0.0
    for scu_obj in [slrcu, scrfcu, ssgdcu]:
        score = max(score, scu_obj.pos_predict_percent_fit_dict['O-RQ'](llm_suggestion))
    if score < min_orq_score:

        # Too weak to save automatically: print ready-to-paste code for manual review and stop
        clear_output(wait=True)
        print(score)
        print(f"""template += '\\n\\nHTML STRING: "{navigable_parent}"\\n=========\\n'""")
        print("""template += 'MINIMUM REQUIREMENT:\\n'""")
        print(f'template += "{llm_suggestion}"')
        print(f"mask_series = (ability_to_df.navigable_parent == '{navigable_parent}')")
        print(f"ability_to_df.loc[mask_series, 'llm_suggestion'] = '<opq>{llm_suggestion}</opq>'")
        print('s.store_objects(ability_to_df=ability_to_df, verbose=True)')
        break
    else:

        # Good enough: record the tagged suggestion and save after every row so progress survives a stop
        ability_to_df.loc[row_index, 'llm_suggestion'] = '<orq>' + llm_suggestion + '</orq>'
        s.store_objects(ability_to_df=ability_to_df, verbose=False)

  0%|          | 0/1469 [00:00<?, ?it/s]

<class 'NameError'> error: name 'CohereConnectionError' is not defined

template += 'MINIMUM REQUIREMENT:\n'
mask_series = (ability_to_df.navigable_parent == '<orq>Ability to this resource will work with client to manage the development and support of the Risk adjustment unified data repository application.</orq>')


In [134]:

# Retry by hand; navigable_parent is whatever the loop in the previous cell last assigned
llm_suggestion = get_suggestion(chain, navigable_parent)

In [135]:

# Display the current suggestion
llm_suggestion

'We are looking for a resource who will work with the client to manage the development and support of the Risk adjustment unified data repository application.'

In [115]:

# Length in characters of the fully rendered prompt for the current navigable_parent
print(len(prompt.format(html_str=navigable_parent)))

6753


In [55]:

# Print each classifier's O-RQ score for the current suggestion on one line
# (order: logistic regression, conditional random field, stochastic gradient descent)
for scu_obj in [slrcu, scrfcu, ssgdcu]:
    print(scu_obj.pos_predict_percent_fit_dict['O-RQ'](llm_suggestion), end=', ')

0.0013165224780264965, 0.03767744998335852, 0.37688375253859563, 

In [None]:

# Go ahead and update what you can in the database
def do_cypher_tx(tx, old_child_str, new_child_str):
    """Replace the navigable_parent text old_child_str with new_child_str in the graph."""
    cypher_str = '''
        MATCH (np:NavigableParents)
        WHERE np.navigable_parent = $old_child_str
        SET np.navigable_parent = $new_child_str;'''
    tx.run(query=cypher_str, parameters={'old_child_str': old_child_str, 'new_child_str': new_child_str})

# Only rows that have a suggestion and are labeled O-RQ are written back
mask_series = ~ability_to_df.llm_suggestion.isnull() & (ability_to_df.pos_symbol == 'O-RQ')
with cu.driver.session() as session:
    for row_index, row_series in ability_to_df[mask_series].iterrows():
        old_child_str = row_series.navigable_parent
        new_child_str = row_series.llm_suggestion

        # Match on this row's own text; the leftover navigable_parent global from the
        # suggestion loop was passed here before, so every write hit the same node
        session.write_transaction(do_cypher_tx, old_child_str=old_child_str, new_child_str=new_child_str)


----

In [None]:

# Smoke test: send a bare prompt to whichever llm object is currently bound and print the reply
print(llm('Tell me a joke'))

In [None]:

# One-shot prompt: a role instruction, one worked example (Mahler's Symphony No. 3),
# and a final question with a {composition} placeholder.
# The example paragraph is assembled from several pieces only to keep source lines short.
template = '''
I want you to act as a classical music historian that knows the history of a composition when given the name of it.

Here are some examples of historical descriptions:

The Symphony No. 3 in D minor by Gustav Mahler was written in sketch beginning in 1893, composed primarily in 1895,'''
template += " and took final form in 1896. Consisting of six movements, it is Mahler's longest composition and is the"
template += ' longest symphony in the standard repertoire, with a typical performance lasting around 95 to 110 minutes.'
template += ' It was voted one of the ten greatest symphonies of all time in a survey of conductors carried out by'
# Grammar fix in the instruction below: "should detailed" -> "should be detailed"
template += ''' the BBC Music Magazine.

The answer should be detailed as if written by an expert.

What is the history of {composition}?
'''

prompt = PromptTemplate(
    input_variables=['composition'],
    template=template,
)

In [None]:

# Render the prompt with a sample value to check how the placeholder is filled in
print(prompt.format(composition='colorful socks'))

In [None]:

# Bind the current llm and prompt globals into a runnable chain
chain = LLMChain(llm=llm, prompt=prompt)

In [None]:

# Run the chain, passing only the value for its single input variable, and print the reply
print(chain.run('Alpine Symphony'))

In [None]:

# Self-contained example: one-variable prompt -> LLMChain -> printed answer.
# Note: this rebinds the notebook-wide llm, prompt and chain names.
llm = OpenAI(temperature=0.9)
prompt = PromptTemplate(
    input_variables=['product'],
    template='What is a good name for a company that makes {product}?',
)
chain = LLMChain(llm=llm, prompt=prompt)
print(chain.run('colorful socks').strip())

In [None]:

from langchain.agents import load_tools
from langchain.agents import initialize_agent

# First, load the language model that will control the agent (rebinds the global llm)
llm = OpenAI(temperature=0)

# Next, load the tools. The `llm-math` tool itself uses an LLM, so the model is passed in.
# The setup cell at the top of the notebook sets SERPAPI_API_KEY in os.environ.
tools = load_tools(['serpapi', 'llm-math'], llm=llm)

# Finally, initialize an agent from the tools, the language model, and the agent type
agent = initialize_agent(tools, llm, agent='zero-shot-react-description', verbose=True)

# Try it on a two-part question (a lookup followed by arithmetic)
agent.run('What was the high temperature in SF yesterday in Fahrenheit? What is that number raised to the .023 power?')

In [None]:

from langchain import ConversationChain

# Build a conversation chain around a fresh OpenAI model (rebinds the global llm);
# verbose=True is passed so the chain reports what it is doing
llm = OpenAI(temperature=0)
conversation = ConversationChain(llm=llm, verbose=True)

# First turn of the conversation
conversation.predict(input='Hi there!')

In [None]:

# Second turn, sent to the same conversation object as the previous cell
conversation.predict(input="I'm doing well! Just having a conversation with an AI.")

In [None]:

from langchain.chat_models import ChatOpenAI
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

# Chat-model object shared by the following cells
chat = ChatOpenAI(temperature=0)

In [None]:

# Call the chat model with a single human message
chat([HumanMessage(content='Translate this sentence from English to French. I love programming.')])

In [None]:

# Same request as the previous cell, now preceded by a system message that sets the assistant's role
messages = [
    SystemMessage(content='You are a helpful assistant that translates English to French.'),
    HumanMessage(content='Translate this sentence from English to French. I love programming.')
]
chat(messages)

In [None]:

# Build one [system, human] message pair per sentence, then send all pairs in one generate() call
batch_messages = [
    [
        SystemMessage(content='You are a helpful assistant that translates English to French.'),
        HumanMessage(content='Translate this sentence from English to French. ' + sentence)
    ]
    for sentence in ('I love programming.', 'I love artificial intelligence.')
]
result = chat.generate(batch_messages)
result

In [None]:

from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

chat = ChatOpenAI(temperature=0)

# System message template with two placeholders; the human message is just the text to translate
template = 'You are a helpful assistant that translates {input_language} to {output_language}.'
system_message_prompt = SystemMessagePromptTemplate.from_template(template)
human_template='{text}'
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

# Combine both message templates into one chat prompt
chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])

# Fill in the three placeholders, convert to messages, and get a chat completion
chat(chat_prompt.format_prompt(input_language='English', output_language='Navajo', text='I love programming.').to_messages())

In [None]:

chat = ChatOpenAI(temperature=0)

# Template strings first, then the prompt objects built from them
template = 'You are a helpful assistant that translates {input_language} to {output_language}.'
human_template = '{text}'
system_message_prompt = SystemMessagePromptTemplate.from_template(template)
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)
chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])

# Wrap the chat model and prompt in a chain and supply the three template variables
chain = LLMChain(llm=chat, prompt=chat_prompt)
translation_inputs = {
    'input_language': 'English',
    'output_language': 'Navajo',
    'text': 'I love programming.',
}
chain.run(**translation_inputs)

In [None]:

# First, load the chat model that will control the agent
chat = ChatOpenAI(temperature=0)

# Next, load the tools. The `llm-math` tool uses an LLM, so a separate OpenAI model is passed in
# (this rebinds the global llm).
llm = OpenAI(temperature=0)
tools = load_tools(['serpapi', 'llm-math'], llm=llm)


# Finally, initialize an agent from the tools, the chat model, and the chat-specific agent type
agent = initialize_agent(tools, chat, agent='chat-zero-shot-react-description', verbose=True)

# Try it on a two-part question (a lookup followed by arithmetic)
agent.run("Who is Olivia Wilde's boyfriend? What is his current age raised to the 0.23 power?")

In [None]:

from langchain.prompts import MessagesPlaceholder
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory

# System instruction, assembled in pieces only to keep source lines short
template_str = 'The following is a friendly conversation between a human and an AI. The AI is talkative and provides'
template_str += ' lots of specific details from its context. If the AI does not know the answer to a question,'
template_str += ' it truthfully says it does not know.'

# Prompt layout: system instruction, then a 'history' placeholder, then the new human input
prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(template_str),
    MessagesPlaceholder(variable_name='history'),
    HumanMessagePromptTemplate.from_template('{input}')
])

# Note: rebinds the global llm (to a chat model), prompt and conversation names
llm = ChatOpenAI(temperature=0)
memory = ConversationBufferMemory(return_messages=True)
conversation = ConversationChain(memory=memory, prompt=prompt, llm=llm)

# First turn of the conversation
conversation.predict(input='Hi there!')

In [None]:

# Second turn, sent to the same conversation object
conversation.predict(input="I'm doing well! Just having a conversation with an AI.")

In [None]:

# Third turn, sent to the same conversation object
conversation.predict(input='Tell me about yourself.')

In [None]:

from langchain.utilities import PythonREPL

# Nested demo: the outer REPL runs code that creates an inner REPL, which in turn runs print(1+1);
# the outer result is stripped of surrounding whitespace before printing
python_repl = PythonREPL()
print(python_repl.run("""from langchain.utilities import PythonREPL;pr=PythonREPL();print(pr.run('print(1+1)'))""").strip())

In [None]:

import textwrap

# Ask the model directly, then wrap the stripped answer with textwrap's default width for readable output
# (this rebinds the global llm)
llm = OpenAI(model_name='text-davinci-003', temperature=0)
print('\n'.join(textwrap.wrap(llm('What is LangChain?').strip())))

In [None]:

# Fetch the documentation landing page with the notebook's scraping helper and show its markup;
# prettify() is called on the result, so it is presumably a BeautifulSoup object — confirm in scrape_utils
toplevel = 'https://langchain.readthedocs.io/en/latest'
soup = wsu.get_page_soup(toplevel)
print(soup.prettify())

In [None]:

# Collect the attribute dictionary of every <a> tag on the landing page
anchors_attrs = [anchor.attrs for anchor in soup.find_all('a')]

In [None]:

# Keep only anchors that point into the documentation:
#   - 'reference internal' links are kept as they are
#   - 'reference external' links are kept only when relative ('./...'), with the './' prefix removed
paths = []
for anchor_attrs in anchors_attrs:
    try:
        classes = anchor_attrs['class']
        link = anchor_attrs['href']
    except KeyError:
        # Anchor lacks a class list or an href
        print('No classes or href:', anchor_attrs)
        continue

    if 'reference' not in classes:
        continue  # Not a reference

    if 'internal' in classes:
        paths.append(link)
    elif 'external' in classes and link.startswith('./'):
        paths.append(link[len('./'):])
    # Anything else is either not a link to the docs or an unrecognised kind of reference

In [None]:

import requests

# Prepend the landing page once; the guard keeps this cell idempotent when it is re-run
if paths[:1] != ['index.html']:
    paths = ['index.html'] + paths

# Download every page, keeping only the ones that were fetched successfully
pages = []
for path in paths:
    url = '/'.join([toplevel, path])
    try:
        # A timeout prevents one unresponsive URL from hanging the whole cell
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
    except requests.RequestException as error:
        # Report the failing URL and move on; nothing is stored for it
        print(url, repr(error))
    else:
        # Append only on success. The previous `finally` clause appended after failures too,
        # which raised NameError on a first-iteration failure or stored the previous page's
        # content under this URL.
        pages.append({'content': resp.content, 'url': url})

In [None]:

import nltk

# Download the two NLTK resources by name
# (presumably required by the HTML partitioning step in the next cell — confirm)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [None]:

from unstructured.partition.html import partition_html

# Partition each downloaded page into a sequence of elements, one sequence per page
# NOTE(review): page['content'] comes from resp.content (bytes) but is passed as text= — confirm this is intended
parsed_docs = [partition_html(text=page['content']) for page in pages]

In [None]:

# Flatten every parsed page into a single cleaned string
texts = []
for doc in parsed_docs:
    # One stripped string per element, separated by blank lines
    element_strs = [str(el).strip() for el in doc]
    doc_text = '\n\n'.join(element_strs).strip()

    # Remove literal backslash-n sequences (two characters, not real newlines)
    doc_text = doc_text.replace('\\n', '')

    # Replace the literal escaped byte text \xe2\x80\x99 with a plain apostrophe
    doc_text = doc_text.replace(r'\xe2\x80\x99', "'")

    texts.append(doc_text)

In [None]:

# Print the first page's text wrapped with textwrap's default width, one wrapped line per output line
print(*textwrap.wrap(texts[0]), sep='\n')

In [None]:

# Store each cleaned text on its page dictionary (pages and texts are paired by position)
for page, text in zip(pages, texts):
    page['text'] = text

In [None]:

# Check which keys the first page dictionary now holds
pages[0].keys()

In [None]:

import pandas as pd

# Show a random sample of up to 10 pages; capping at len(pages) avoids the ValueError
# that DataFrame.sample raises when asked for more rows than exist
pd.DataFrame(pages).sample(min(10, len(pages)))


## Chunk the text for use inside LLM prompts

In [None]:

from langchain.text_splitter import CharacterTextSplitter

# Splitter built via from_tiktoken_encoder with chunk_size=1024, chunk_overlap=128, splitting on spaces
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1024, chunk_overlap=128, separator=' '
)

# Build documents from the page texts, recording each page's URL as the document's 'source' metadata
documents = text_splitter.create_documents(
    [page['text'] for page in pages], metadatas=[{'source': page['url']} for page in pages]
)

# Preview the first document: its source URL followed by its wrapped content
print(documents[0].metadata['source'], *textwrap.wrap(documents[0].page_content), sep='\n')

In [None]:

from langchain.embeddings.openai import OpenAIEmbeddings

# Embeddings object used to build the vector index in the next cell
embeddings = OpenAIEmbeddings()

In [None]:

from langchain.vectorstores import FAISS

# Build a FAISS vector store from the chunked documents using the embeddings object
docsearch = FAISS.from_documents(documents, embeddings)

In [None]:

from langchain.chains.qa_with_sources import load_qa_with_sources_chain

# Question-answering chain of type 'stuff', built on whatever the global llm currently is
# (this rebinds the global chain)
chain = load_qa_with_sources_chain(llm, chain_type='stuff')

In [None]:

import langchain

# Exploration aid: list the names available in langchain.chains
dir(langchain.chains)

In [None]:

from langchain.chains import sequential

# Exploration aid: list the names available in langchain.chains.sequential
dir(sequential)

In [None]:

# Retrieve the documents most similar to the query and answer from them
query = 'What is LangChain?'
docs = docsearch.similarity_search(query)
result = chain({'input_documents': docs, 'question': query})

# Wrap the answer for readability, then put the SOURCES section under its own heading
wrapped_lines = textwrap.wrap(result['output_text'])
text = '\n'.join(wrapped_lines)
text = '\n\nSOURCES:\n'.join(part.strip() for part in text.split('SOURCES:'))
print(text)

In [None]:

# Print each retrieved document, preceded by a blank line
for doc in docs:
    print()
    print(doc)

In [None]:

# Show the prompt template text that the question-answering chain sends to the model
print(chain.llm_chain.prompt.template)

In [None]:

# Guard: the 'O-RQ' predictor of scrfcu must score the string '*' as exactly 0.0 before continuing;
# the assertion message tells the reader to rerun otherwise
assert scrfcu.pos_predict_percent_fit_dict['O-RQ']('*') == 0.0, "You need to rerun this"

In [None]:

# Time the call to cu.populate_pos_relationships and report the elapsed time,
# formatted by humanize with seconds as the smallest unit
t1 = time.time()
cu.populate_pos_relationships(verbose=False)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech relationships repopulated in {duration_str}')

In [None]:

# Take the masked row with the lowest orq_score directly from the sorted frame.
# The previous code fed the index LABEL from .index[0] to the positional .iloc,
# which selects the wrong row whenever the index is not the default 0..n-1 range.
lowest_row = split_orqs_df[mask_series].sort_values('orq_score').iloc[0]
idx = lowest_row.name  # index label of that row, kept for later cells

# Rebuild the tagged string: opening tag + text + matching closing tag ('<' -> '</')
child_str = lowest_row.tag_name + lowest_row.split_str + lowest_row.tag_name.replace('<', '</')
child_str

In [None]:

def do_cypher_tx(tx, navigable_parent, verbose=False):
    """Flag every NavigableParents node containing the given text as task scope.

    All matching nodes get is_task_scope set to 'True' and every other is_*
    property listed in the query set to 'False'. The query ends with the
    clause held in cu.return_everything_str.

    Parameters
    ----------
    tx : transaction object handed in by session.write_transaction
    navigable_parent : str
        Substring matched with CONTAINS against np.navigable_parent.
    verbose : bool
        Accepted but not used by this function.

    Returns
    -------
    list of dict
        One dictionary per record returned by the query.
    """
    set_and_return_query = '''
        MATCH (np:NavigableParents)
        WHERE np.navigable_parent CONTAINS $navigable_parent
        SET
            np.is_header = 'False',
            np.is_task_scope = 'True',
            np.is_minimum_qualification = 'False',
            np.is_preferred_qualification = 'False',
            np.is_educational_requirement = 'False',
            np.is_legal_notification = 'False',
            np.is_other = 'False',
            np.is_corporate_scope = 'False',
            np.is_job_title = 'False',
            np.is_office_location = 'False',
            np.is_job_duration = 'False',
            np.is_supplemental_pay = 'False',
            np.is_interview_procedure = 'False',
            np.is_posting_date = 'False'
        ''' + cu.return_everything_str + ';'
    query_params = {'navigable_parent': navigable_parent}
    records = tx.run(query=set_and_return_query, parameters=query_params)

    # Convert every record into a plain dictionary
    row_dicts = []
    for record in records:
        row_dicts.append(dict(record.items()))

    return row_dicts


# Apply the update for the current child_str and display the rows that came back
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx, navigable_parent=child_str, verbose=False)
row_objs_list

In [None]:

# Display the string that was just used to match nodes in the database
child_str

In [None]:

# Build a few-shot prompt: an instruction, three worked examples (HTML string followed by its
# MINIMUM REQUIREMENTS list) taken from html_strs_list[0..2], and a final open slot for {html_str}.
# The chain is then run on html_strs_list[4] and each non-empty line of the reply is printed.
# NOTE(review): html_strs_list entries are concatenated into the template verbatim. PromptTemplate
# treats {...} as variables, so a literal brace in a posting would presumably break formatting —
# confirm the inputs are brace-free.
template = 'Given the following HTML strings, create a MINIMUM REQUIREMENTS section which parses out only the minimum requirements.\n\n'

# --- Example 1 ---
# NOTE(review): this example wraps the HTML string in double quotes; examples 2, 3 and the final
# slot do not — confirm whether the difference is intended.
template += 'HTML STRING: "'
template += html_strs_list[0]
template += '"\n=========\n'
template += 'MINIMUM REQUIREMENTS:'
template += "\nMust possess a Bachelor's degree or foreign equivalent in Statistics, Mathematics, or a closely related field"
template += "\nAcademic coursework or professional experience must include descriptive and predictive statistics"
template += "\nAcademic coursework or professional experience must include experience working with statistical programming tools,"
template += " such as R or Python"
template += "\nAcademic coursework or professional experience must include experience with Jupyter Notebooks, Matplotlib, Seaborn,"
template += " Dask, Docker, or BigQuery"
template += "\nAcademic coursework or professional experience must include experience with Google Cloud Platform"
template += "\nAcademic coursework or professional experience must include development and deployment of code (version control and CI/CD)"
# NOTE(review): the next two lines end with "\n" and the following line starts with "\n",
# which puts blank lines inside this example's list — confirm whether that is intended.
template += "\nBachelor's / Master's degree in computer science or equivalent\n"
template += "\n2+ years of experience as a data analyst / engineer in a data focused environment and role\n"
template += "\nKnowledge of best practices with data wrangling/mapping/cleansing,"
template += " and other state-of-the-art relevant computational techniques and processes"
template += "\nStrong Python data science skill set"
template += "\nStrong SQL skills"
template += "\nStrong knowledge of engineering practices for development and deployment of code (version control + CI/CD)"
template += "\nSolid foundational knowledge of descriptive and predictive statistics"
template += "\nPassionate about data quality and able to clearly articulate its importance to internal and external partners in specific"
template += " contexts"
template += "\nCollaborative self-starter"
template += "\nClear and strong written and verbal communication skills"
template += "\nClear and strong presentation skills"
template += "\nClear and strong documentation (technical and non-technical) skills"
template += "\nAbility to collaborate with other peers and senior / principal engineers"
template += "\nEffective time management based on priorities dictated by the business"
template += "\nSelf-assessment and situational assessment skills"
template += "\nService orientation toward customers (both internal and external)"
template += "\nResilient in the face of adversity"
template += "\nAbility to solve technical and business problems with help from other team members collaboratively"

# --- Example 2 ---
template += "\n\nHTML STRING: " + html_strs_list[1] + "\n=========\nMINIMUM REQUIREMENTS:"
template += "\nBachelors degree in Computer Science, Engineering (any) or a related field"
template += "\nOne (1) year of related work experience involving working across all phases of the Agile methodology"
template += " project delivery lifecycle for designing, developing, testing and implementing big data solutions,"
template += " data analytics, data visualization, data management, and OFSAA solutions for enterprise-level clients"
template += "\nOne (1) year of related work experience involving interacting directly with stakeholders in gathering"
template += " functional and technical requirements, analyzing client requirements, and translating requirements"
template += " into project designs to deliver solutions that satisfy business needs using Confluence wiki, Atlassian"
template += " Jira, Git, Microsoft (MS) Powerpoint, Visio And Sharepoint"
template += "\nOne (1) year of related work experience involving utilizing Cloudera Hadoop, Confluent KAFKA, OFSAA,"
template += " Arcadia Data, Unix, procedural language (PL/SQL) and SQL for the design and development of data solutions"
template += "\nOne (1) year of related work experience involving tracking and reporting on project and milestone"
template += " deliverable status to ensure timely project delivery and communicating updates to internal and external"
template += " stakeholders using Atlassian Jira, Confluence, Sharepoint, Powerpoint And Visio"
template += "\nOne (1) year of related work experience involving utilizing Bash shell, Intellij integrated development"
template += " environment (IDE), Eclipse integrated development environment (IDE), Toad and SQL developer in the"
template += " process of project-related responsibilities that include writing code in SQL and debugging code to solve"
template += " defects"
template += "\nOne (1) year of related work experience involving analyzing and documenting current-state systems,"
template += " processes, and environments to support project design of future-state systems, processes, and"
template += " environments using Confluent KAFKA, Cloudera Hadoop, Ofsaa, Arcadia Data, Python, Unix, PL/SQL, and SQL"
template += "\nOne (1) year of related work experience involving supporting and performing unit testing to ensure"
template += " successful design and implementation of project solutions"
template += "\nOne (1) year of related work experience involving designing and delivering user manuals and training"
template += " for solutions to ensure successful implementation, adoption, and maintenance of data solutions in big"
template += " data/Hive, OFSAA and using concepts in data warehouse, data lake, and data visualization"
template += "\nOne (1) year of related work experience involving supporting business development activities, including"
template += " proposal development and responses to requests for proposals"
template += "\nOne (1) year of related work experience involving utilizing Confluent (KAFKA) KSQL/SQL, Tableau,"
template += " Jupyter Notebook, Oracle and Apache Tools Impala, Spark, Livy, Zookeeper and Scoop for the completion"
template += " of tasks that include writing code in query language to build data pipelines, creating test cases to"
template += " evaluate products, data analysis, data migration, data modeling, data mapping and data reporting"

# --- Example 3 ---
template += "\n\nHTML STRING: " + html_strs_list[2] + "\n=========\nMINIMUM REQUIREMENTS:"
template += "\nStrong Python experience especially in data engineering/ML for ML based product development"
template += "\nKnowledge on different algorithms and corresponding Python packages e.g. fuzzy match of strings,"
template += " graph algorithm to create connected lists, etc."
template += "\nStrong coding skills in Pandas, Numpy"
template += "\nGood understanding of Pandas groupby, sort, merge, append, assignment, filters, map, apply"
template += "\nStrong knowledge in Python objects, tuples, list, dict, generators, lambda, etc."

# --- Example 4 (disabled): built from html_strs_list[3], currently commented out ---
# template += "\n\nHTML STRING: " + html_strs_list[3] + "\n=========\nMINIMUM REQUIREMENTS:"
# template += "\nEnthusiasm for troubleshooting, analyzing,and resolving complex problems"
# template += "\nDemonstrable strong problem-solving and communication skills"
# template += "\nPrepared to be an expert performance engineering resource on multiple initiatives of diverse scopes"
# template += "\nHands-on experience in designing, developing and implementing state of the art test simulation, analysis"
# template += " tools and technologies to ensure platforms deliver industry-leading performance for high availability and"
# template += " great performance for achieving targeting revenues to the clients"
# template += "\nExperience with load testing using JMeter, API, and Microservice testing using RestAssure"
# template += "\nDemonstrable ability to design and delivered performance Testing and Engineering frameworks for complex"
# template += " enterprise applications."
# template += "\nHas played an architect-level role in handling end-to-end (frontend, Middleware and backend systems)"
# template += " performance tuning and optimization of the platform for at least 2 to 3 large engagements"

# --- Open slot: the string to analyse is substituted for {html_str} at run time ---
template += "\n\nHTML STRING: {html_str}\n=========\nMINIMUM REQUIREMENTS:"
prompt = PromptTemplate(
    input_variables=['html_str'],
    template=template,
)

# Uses whatever the global llm currently is; rebinds the global prompt and chain
chain = LLMChain(llm=llm, prompt=prompt)

# Show the input string, then print every non-empty line of the model's reply, each preceded by a blank line
print(f'"{html_strs_list[4]}"')
reqs_list = chain.run(html_strs_list[4]).split('\n')
for req_str in reqs_list:
    if req_str: print('\n'+req_str)

In [None]:

# Code-generation helper: split html_strs_list[3] on ' * ' and print one ready-to-paste
# `template += "\n..."` source line per piece
print()
for html_str in html_strs_list[3].split(' * '):
    print('template += "\\n{}"'.format(html_str))