In [1]:

# %pprint is a toggle, not a setter: this run switched pretty printing OFF,
# and running the cell again in the same kernel would switch it back ON
%pprint

Pretty printing has been turned OFF


In [2]:

%matplotlib inline
from IPython.display import clear_output, display
from cohere.error import CohereAPIError
from langchain.chains import LLMChain
from langchain.llms import Cohere, OpenAI
from langchain.prompts import PromptTemplate
from openai.error import RateLimitError
from pandas import DataFrame
from ratelimit import limits, sleep_and_retry
from tqdm.notebook import tqdm
import humanize
import numpy as np
import os
import re
import sys
import time
import warnings
import winsound

# Silence every warning for the rest of the session (this also hides deprecation notices)
warnings.filterwarnings('ignore')
# Tone used by the winsound.Beep(freq, duration) calls further down
duration = 1000  # milliseconds
freq = 880  # Hz

# Insert at 1, 0 is the script path (or '' in REPL)
# presumably ../py is where the project modules imported below (storage, scrape_utils, ...) live
sys.path.insert(1, '../py')

In [3]:

# Build the Storage helper, pointing it at the data and saves folders
from storage import Storage
s = Storage(data_folder_path=os.path.abspath('../data'), saves_folder_path=os.path.abspath('../saves'))

# Build the WebScrapingUtilities helper, handing it the path of the secrets JSON file
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(s=s, secrets_json_path=os.path.abspath('../data/secrets/jh_secrets.json'))

# Export the API keys held in wsu.secrets_json as environment variables.
# To get your own keys, visit:
#   SERPAPI_API_KEY - https://serpapi.com/dashboard
#   COHERE_API_KEY  - https://dashboard.cohere.ai/api-keys
#   STABILITY_KEY   - https://beta.dreamstudio.ai/membership
os.environ.update({
    'SERPAPI_API_KEY': wsu.secrets_json['SERPAPI_API_KEY'],
    'COHERE_API_KEY': wsu.secrets_json['Cohere_API_Key'],
    'STABILITY_KEY': wsu.secrets_json['Dream_Studio_API_Key'],
})

In [4]:

# Create the HeaderAnalysis helper
from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

# Create the CypherUtilities helper from the Neo4j settings stored in the secrets JSON
from cypher_utils import CypherUtilities
uri, user, password = (
    wsu.secrets_json['neo4j'][setting_name] for setting_name in ('connect_url', 'username', 'password')
)
cu = CypherUtilities(uri=uri, user=user, password=password, driver=None, s=s, ha=ha)

# Create the SectionLRClassifierUtilities helper
from section_classifier_utils import SectionLRClassifierUtilities
slrcu = SectionLRClassifierUtilities(ha=ha, cu=cu, verbose=False)

# Build the parts-of-speech logistic regression elements only if slrcu
# does not carry them yet, then report availability and the elapsed time
t1 = time.time()
if not hasattr(slrcu, 'pos_predict_percent_fit_dict'):
    slrcu.build_pos_logistic_regression_elements(sampling_strategy_limit=None, verbose=True)
print(
    'predict_single is now available'
    if hasattr(slrcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech logistic regression elements built in {duration_str}')

I have 49,116 labeled parts of speech in here
predict_single is now available
Parts-of-speech logistic regression elements built in 8 seconds


In [5]:

# Create the SectionCRFClassifierUtilities helper
from section_classifier_utils import SectionCRFClassifierUtilities
scrfcu = SectionCRFClassifierUtilities(cu=cu, ha=ha, verbose=False)

# Build the parts-of-speech conditional random field elements only if scrfcu
# does not carry them yet, then report availability and the elapsed time
# NOTE(review): the build guard tests for pos_symbol_crf while the availability
# message tests for pos_predict_percent_fit_dict - confirm which attribute
# predict_single really depends on for the CRF classifier
t1 = time.time()
if not hasattr(scrfcu, 'pos_symbol_crf'):
    scrfcu.build_pos_conditional_random_field_elements(verbose=True)
print(
    'predict_single is now available'
    if hasattr(scrfcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech conditional random field elements built in {duration_str}')

predict_single is now available
Parts-of-speech conditional random field elements built in 1 second


In [6]:

# Create the SectionSGDClassifierUtilities helper
from section_classifier_utils import SectionSGDClassifierUtilities
ssgdcu = SectionSGDClassifierUtilities(ha=ha, cu=cu, verbose=False)

# Build the parts-of-speech stochastic gradient descent elements only if ssgdcu
# does not carry them yet, then report availability and the elapsed time
t1 = time.time()
if not hasattr(ssgdcu, 'pos_predict_percent_fit_dict'):
    ssgdcu.build_pos_stochastic_gradient_descent_elements(sampling_strategy_limit=None, verbose=True)
print(
    'predict_single is now available'
    if hasattr(ssgdcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech stochastic gradient descent elements built in {duration_str}')

I have 49,116 labeled parts of speech in here
predict_single is now available
Parts-of-speech stochastic gradient descent elements built in 9 seconds


In [7]:

# Trial key rate limit: at most 5 API calls per 60-second period
@sleep_and_retry
@limits(calls=5, period=60)
def get_suggestion(chain, navigable_parent):
    """
    Run the chain on one navigable parent and return the stripped result.

    Args:
        chain: object whose run() method is called with navigable_parent.
        navigable_parent: the HTML string handed to chain.run().

    Returns:
        The value returned by chain.run(navigable_parent), with leading and
        trailing whitespace removed.
    """
    return chain.run(navigable_parent).strip()

In [8]:

def display_file_in_text_editor(file_name):
    """
    Open one of the saved HTML files in Notepad++.

    Args:
        file_name: name of a file located in ha.SAVES_HTML_FOLDER.
    """
    # Hard-coded Windows location of the Notepad++ executable
    text_editor_path = r"C:\Program Files\Notepad++\notepad++.exe"
    file_path = os.path.join(ha.SAVES_HTML_FOLDER, file_name)
    # IPython shell escape: the {...} expressions are interpolated into the command line,
    # so this function only works inside IPython/Jupyter, not in a plain .py module
    !"{text_editor_path}" "{os.path.abspath(file_path)}"


----
## Get GPT's help to break up overly-long O-RQs

In [19]:

# Load the cached table of long O-RQ navigable parents if a pickle exists;
# otherwise query Neo4j for it, add the helper columns and cache the result
if s.pickle_exists('tldr_df'):
    tldr_df = s.load_object('tldr_df')
else:

    # The query returns the distinct (navigable_parent, file_name) pairs of every
    # NavigableParents node that is summarized by a PartsOfSpeech node whose
    # pos_symbol is "O-RQ", does not already start with "<orq>", and has a NEXT
    # relationship carrying a file_name property - longest navigable parents first
    tldr_cypher_str = """
        // Filter for NavigableParents nodes with an ambiguous SUMMARIZES relationship
        MATCH (np:NavigableParents)
        WHERE size((np)<-[:SUMMARIZES]-(:PartsOfSpeech)) >= 1
        WITH np

        // Find all NavigableParents nodes in the graph with
        // an incoming SUMMARIZES relationship to a PartsOfSpeech node
        MATCH (np)<-[r:SUMMARIZES]-(pos:PartsOfSpeech)
        WHERE
            pos.pos_symbol = "O-RQ"
            AND NOT (np.navigable_parent STARTS WITH "<orq>")
        WITH np

        // Find all the relationship types in the graph with a file_name property
        MATCH (np)-[r:NEXT]-(:NavigableParents)
        WHERE exists(r.file_name)
        WITH np, r

        // Return the navigable parent
        RETURN DISTINCT
            np.navigable_parent AS navigable_parent,
            r.file_name AS file_name
        ORDER BY size(navigable_parent) DESC;"""

    # NOTE(review): the query only reads, yet it runs in a write transaction;
    # a read transaction would presumably do - confirm against cu.do_cypher_tx
    with cu.driver.session() as session:
        tldr_df = DataFrame(session.write_transaction(cu.do_cypher_tx, tldr_cypher_str))

    # Record the length of each navigable parent
    tldr_df['char_count'] = tldr_df.navigable_parent.map(len)

    # Add the empty suggestion column with object dtype: an all-NaN column is
    # float64 by default, and writing the LLM's strings into a float64 column
    # with .loc is an incompatible-dtype assignment (deprecated in pandas 2.1)
    if 'llm_suggestion' not in tldr_df.columns:
        tldr_df['llm_suggestion'] = np.nan
        tldr_df['llm_suggestion'] = tldr_df['llm_suggestion'].astype(object)

    s.store_objects(tldr_df=tldr_df, verbose=False)
display(tldr_df.head(20))

Unnamed: 0,navigable_parent,file_name,char_count,llm_suggestion
0,Experience in all phases of SDLC like Requirem...,1c57f00badeec1a2_Principal_Full_Stack_Engineer...,381,<p>We are seeking a candidate with the followi...
1,: No one is perfect. You’ve made some mistakes...,c5998e2b1c9b877e_Senior_Software_Engineer_Remo...,381,<p>No one is perfect. You’ve made some mistake...
2,We’re looking for a QA Engineer to be a key me...,e35bac775dc584b1_QA_Engineer_Remote_Indeed_com...,380,<p>We are seeking a candidate with the followi...
3,<li>Familiarity with statistical methods and r...,40f70f73663d2525_Data_Scientist_II_Remote_Inde...,379,
4,<li>A minimum of 8 years beyond a bachelor’s d...,b18474e2fa35c2f2_Senior_Research_Officer_Washi...,378,
5,<li>Advanced user of Microsoft Office (includi...,b18474e2fa35c2f2_Senior_Research_Officer_Washi...,378,
6,<li>Analytical Skills: A candidate for this po...,Senior_Data_Analyst_-_Washington_State_-_Indee...,377,
7,Our ideal teammate is someone who is a continu...,Junior_Data_Scientist_Data_Science_Development...,375,
8,<li>Ability to run standard model algorithms i...,Lead_Data_Scientist_World_Wide_Technology_Remo...,374,
9,<li>Ability to run standard model algorithms i...,Data_Scientist_World_Wide_Technology_Remote_or...,374,


In [10]:

# Few-shot prompt: an instruction, one hand-labeled example (input string plus the
# desired reformatted HTML), then an open slot for the string to be reformatted
# NOTE(review): the instruction asks that only tags be added and nothing be rephrased,
# but the hand-labeled answer below adds sentences that are not in its input (e.g.
# "We are seeking a candidate with the following qualifications:") and rewords others;
# that may steer the model toward rephrasing - confirm this is intended
template = 'Break up the following HTML STRING into paragraphs and bullet points so that it is easier to read.'
template += ' Ensure that the only thing added in the REFORMATTED HTML is tags, and nothing is rephrased.'

# Hand-labeled examples
template += '\n\nHTML STRING: "with bachelor’s degree in Computer Science, Engineering Any, Technology or'
template += ' related and 5 yrs. of exp to design & Implement test Automation Frameworks and suites using'
template += ' the automation tools and technologies like HP UFT, Selenium Web Driver, Java, Python, Junit,'
template += ' Macros, TestNG & Cucumber. Develop, integrate, and maintain Automation scripts. Work on'
template += ' Continuous Integration tools (CI/CD) like Jenkins for continuous integration and deployment of'
template += ' Automation builds. Have knowledge on understanding of API integration. Automate Database and'
template += ' have SQL knowledge. Should be well versed in using various highly specialized tools and'
template += ' technologies like Jira, ALM, Appium, Protractor, Jasmine, Maven, ANT, Jenkins, AWS, DevOps,'
template += ' CVCD Toolset, TOSCA, Gherkin, Groovy, Rest Assured, Postman, HTML and SoapUI. Should have good'
template += ' communication skills and experience in agile/scrum, implementing overall testing strategy,'
template += ' estimates and able to manage a team."\n=========\n'
template += 'REFORMATTED HTML:\n'
# The hand-labeled answer for the example above
template += """
<p>We are seeking a candidate with the following qualifications:</p>
<ul>
    <li>Bachelor’s degree in Computer Science, Engineering Any, Technology or related field</li>
    <li>5 years of experience in designing and implementing test automation frameworks and suites</li>
</ul>
<p>The ideal candidate should have expertise in the following areas:</p>
<ul>
    <li>
        Automation tools and technologies, such as:
        <ul>
            <li>HP UFT</li>
            <li>Selenium Web Driver</li>
            <li>Java</li>
            <li>Python</li>
            <li>Junit</li>
            <li>Macros</li>
            <li>TestNG</li>
            <li>Cucumber</li>
        </ul>
    </li>
    <li>Developing, integrating, and maintaining automation scripts</li>
    <li>
        Working with Continuous Integration tools (CI/CD), such as:
        <ul>
            <li>Jenkins, for continuous integration and deployment of automation builds</li>
        </ul>
    </li>
    <li>Understanding of API integration</li>
    <li>Automating databases and having SQL knowledge</li>
    <li>
        Using various highly specialized tools and technologies, such as:
        <ul>
            <li>Jira</li>
            <li>ALM</li>
            <li>Appium</li>
            <li>Protractor</li>
            <li>Jasmine</li>
            <li>Maven</li>
            <li>ANT</li>
            <li>Jenkins</li>
            <li>AWS</li>
            <li>DevOps</li>
            <li>CVCD Toolset</li>
            <li>TOSCA</li>
            <li>Gherkin</li>
            <li>Groovy</li>
            <li>Rest Assured</li>
            <li>Postman</li>
            <li>HTML</li>
            <li>SoapUI</li>
        </ul>
    </li>
</ul>
<p>In addition, the candidate should:</p>
<ul>
    <li>Have good communication skills</li>
    <li>Have experience in agile/scrum methodologies</li>
    <li>Be able to implement an overall testing strategy and provide estimates</li>
    <li>Be able to manage a team.</li>
</ul>"""

# Open slot: {html_str} is the template's only placeholder
template += '\n\nHTML STRING: "{html_str}"\n=========\nREFORMATTED HTML:'

In [11]:

# Choose the LLM: Cohere's command-xlarge-nightly model at temperature 0
# (the OpenAI alternative is kept below for reference)
# llm = OpenAI(model_name='text-davinci-003', temperature=1.0, max_retries=1)
llm = Cohere(model='command-xlarge-nightly', temperature=0)

# Wrap the few-shot template, whose single variable is html_str, and chain it to the LLM
prompt = PromptTemplate(template=template, input_variables=['html_str'])
chain = LLMChain(prompt=prompt, llm=llm)

In [None]:

# Preview the fully rendered prompt, using the navigable parent of the row labeled 0
print(prompt.format(html_str=tldr_df.loc[0, 'navigable_parent']))

In [44]:

# Ask the LLM for a reformatting suggestion for a row that does not have one yet,
# save it into tldr_df, re-pickle the frame, and beep when done.
# NOTE: the unconditional break at the bottom of the loop body means at most ONE
# row is processed per run; row_index is left pointing at that row.
# NOTE(review): if no row is missing a suggestion the loop body never runs and
# row_index is not (re)assigned here.
tldr_df = s.load_object('tldr_df')
mask_series = tldr_df.llm_suggestion.isnull()
for row_index, row_series in tldr_df[mask_series].iterrows():
    navigable_parent = row_series.navigable_parent
    try:
        llm_suggestion = get_suggestion(chain, navigable_parent)
        time.sleep(1) # Sleep for 1 second between each call
    except RateLimitError:
        # openai's rate-limit error: give up without storing anything
        print("You've already spent 10 bucks on this; your wife will be pissed")
        break
    except CohereAPIError as e:
        message_str = str(e).strip()
        # A prompt that is too long cannot be fixed by retrying, so report it and stop
        if 'too many tokens' in message_str:
            print(message_str)
            break
        # Any other Cohere error: wait a minute and retry once
        # (this retry is outside the try block, so a second failure propagates)
        time.sleep(60)
        llm_suggestion = get_suggestion(chain, navigable_parent)
        time.sleep(1) # Sleep for 1 second between each call
    except Exception as e:
        # Unexpected failure: print the error plus ready-to-paste snippets for adding
        # this string as another hand-labeled example and for selecting its row
        print(f'{e.__class__} error: {str(e).strip()}')
        print()
        print(f"""template += '\\n\\nHTML STRING: "{navigable_parent}"\\n=========\\n'""")
        print("""template += 'REFORMATTED HTML:\\n'""")
        print(f"mask_series = (tldr_df.navigable_parent == '{navigable_parent}')")
        break
    # Only reached when a suggestion was obtained: record it and persist the frame
    tldr_df.loc[row_index, 'llm_suggestion'] = llm_suggestion
    s.store_objects(tldr_df=tldr_df, verbose=True)
    break
# Audible signal that the cell has finished (winsound is Windows-only)
winsound.Beep(freq, duration)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\tldr_df.pkl


In [45]:

# Break up overly-long O-RQs:
# Don't close the Notepad++ window until you have replaced the child string
# (row_index is whatever the suggestion loop in the previous cell left behind)
file_name = tldr_df.loc[row_index, 'file_name']
child_str = tldr_df.loc[row_index, 'navigable_parent']

# Show the original string and the LLM's suggestion, each preceded by a blank line
print('', child_str, '', tldr_df.loc[row_index, 'llm_suggestion'], sep='\n')

# Beep, open the file for manual editing, then rebuild its node in the database
winsound.Beep(freq, duration)
display_file_in_text_editor(file_name)
cu.rebuild_filename_node(file_name, wsu, navigable_parent=child_str, verbose=False)


<p>· Masters or PhD Degree with a minimum of 2 years experience in at least one of the following: Machine Learning (Supervised, Unsupervised), Bayesian Statistics, Statistical Modeling (e.g. Survival Analysis, Generalized Linear models), Advanced Machine Learning (e.g. Reinforcement Learning, Deep Learning) , NLP, Dynamic Programming &amp; Optimal Control Theory</p>

<p>We are seeking a candidate with the following qualifications:</p>
<ul>
    <li>Masters or PhD degree with a minimum of 2 years experience in at least one of the following: Machine Learning (Supervised, Unsupervised), Bayesian Statistics, Statistical Modeling (e.g. Survival Analysis, Generalized Linear models), Advanced Machine Learning (e.g. Reinforcement Learning, Deep Learning) , NLP, Dynamic Programming &amp; Optimal Control Theory</li>
</ul>
<p>The ideal candidate should have expertise in the following areas:</p>
<ul>
    <li>
        Machine Learning (Supervised, Unsupervised), Bayesian Statistics, Statistical Mod

In [None]:

# Go ahead and update what you can in the database:
# for every row that has an LLM suggestion, substitute the suggestion for the
# original string inside the saved HTML file and rebuild that file's node
# NOTE(review): the substitution is applied blindly - a suggestion that was
# truncated or rephrased by the LLM is written to the file as-is, and the
# file is rewritten and the node rebuilt even when nothing was replaced
tldr_df = s.load_object('tldr_df')
mask_series = tldr_df.llm_suggestion.notnull()
suggestions_count = int(mask_series.sum())
for (row_index, row_series) in tqdm(tldr_df[mask_series].iterrows(), total=suggestions_count):
    file_name = row_series.file_name
    file_path = os.path.join(ha.SAVES_HTML_FOLDER, file_name)
    old_child_str = row_series.navigable_parent
    new_child_str = row_series.llm_suggestion

    # Read the current content of the file
    with open(file_path, 'r', encoding=s.encoding_type) as file:
        file_content = file.read()

    # Swap every occurrence of the original string for the suggestion
    file_content = file_content.replace(old_child_str, new_child_str)

    # Write the modified content back over the file
    with open(file_path, 'w', encoding=s.encoding_type) as file:
        file.write(file_content)

    # Rebuild the node in the database
    cu.rebuild_filename_node(file_name, wsu, verbose=False)