In [1]:

%pprint

Pretty printing has been turned OFF



---
# Load needed libraries and functions

In [2]:

from IPython.display import clear_output
from datetime import datetime
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
import humanize
import os
import re
import sys
import time
import winsound

# Make the sibling ../py folder importable. Insert at 1 because 0 is the
# script path (or '' in REPL). os.path is used directly: no `osp` alias is
# imported in this notebook, so the old spelling failed on a fresh kernel.
py_folder = os.path.join(os.pardir, 'py')
if py_folder not in sys.path:
    sys.path.insert(1, py_folder)

# Tone passed to wsu.beep(freq, duration) when a long-running cell finishes
duration = 1000  # milliseconds
freq = 880  # Hz

In [3]:

# Start timing so the setup duration can be reported at the end of the cell
t0 = time.time()

# Create the storage helper that is handed to every utility class below
from storage import Storage
s = Storage()

# Create the scraping utilities and read the Neo4j connection settings
# from the secrets they expose
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(s=s)
uri = wsu.secrets_json['neo4j']['connect_url']
user =  wsu.secrets_json['neo4j']['username']
password = wsu.secrets_json['neo4j']['password']

# Create the header-analysis helper (also passed to CypherUtilities below)
from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

# Get the neo4j object; driver=None presumably lets it build its own driver
# from uri/user/password - TODO confirm against cypher_utils
from cypher_utils import CypherUtilities
cu = CypherUtilities(uri=uri, user=user, password=password, driver=None, s=s, ha=ha)

# Beep and report how long the setup took and when it was last run
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
wsu.beep(freq, duration)
print(f'Utility libraries created in {duration_str}')
print(f'Last run on {datetime.now()}')

Utility libraries created in 0 seconds
Last run on 2023-03-06 10:25:57.358185


In [13]:

def create_accenture_file(file_path, row, verbose=False):
    """
    Write one open-role row to ``file_path`` as a small HTML job posting.

    The posting is a single ``<div id="jobDescriptionText">`` whose children
    are wrapped in short custom tags (``<hrq>``, ``<orq>``, ``<opq>``,
    ``<hts>``, ``<ots>``, ``<ojt>``, ``<ocs>``, ``<ool>``, ``<osp>``,
    ``<ojd>``, ``<oip>``), one child per line.

    Parameters
    ----------
    file_path : str
        Destination path; opened for writing as UTF-8, so an existing file is
        overwritten.
    row : object
        Record exposing the role fields as attributes: ``role_id``,
        ``client_name``, ``role_title``, ``role_description``,
        ``role_primary_skill``, ``assigned_role``, ``project_metro_city``,
        ``career_level_from_to``, ``role_start_date``, ``role_end_date``,
        ``role_client_supply_contact``, ``role_primary_contact`` and
        ``role_primary_contact_email_id``. ``role_id`` is passed through
        ``str()``; every other field must support ``.strip()``.
    verbose : bool, optional
        Accepted for signature compatibility; not used.

    Returns
    -------
    tuple
        ``(assigned_role, career_level_from_to, project_metro_city,
        role_client_supply_contact, role_end_date, role_primary_contact,
        role_primary_contact_email_id, role_start_date)``, all stripped.

    Raises
    ------
    AttributeError
        If a field is missing or is not a string (for example a NaN cell).
        This is raised before the file is opened, so no partial file is left
        behind for an exists-check to mistake for a finished one.

    Notes
    -----
    Reads the notebook-level ``fake_stops_list`` and ``replacements_list``
    and the imported ``sent_tokenize``.
    """
    # Read every field up front, from the row itself rather than from
    # whatever loop variables happen to be in the kernel
    role_id = str(row.role_id).strip()
    client_name = row.client_name.strip()
    role_title = row.role_title.strip()
    role_description = row.role_description.strip()
    role_primary_skill = row.role_primary_skill.strip()
    assigned_role = row.assigned_role.strip()
    project_metro_city = row.project_metro_city.strip()
    career_level_from_to = row.career_level_from_to.strip()
    role_start_date = row.role_start_date.strip()
    role_end_date = row.role_end_date.strip()
    role_client_supply_contact = row.role_client_supply_contact.strip()
    role_primary_contact = row.role_primary_contact.strip()
    role_primary_contact_email_id = row.role_primary_contact_email_id.strip()

    # Tab-led text (optionally preceded by a bullet character) becomes a list item
    paragraph_str = re.sub(r'[·\*\-]?\t([^\r\n]+)', r'<li>\g<1></li>', role_description)

    # A line ending in a colon becomes a bold heading
    paragraph_str = re.sub(r'([^\r\n]+):\s*[\r\n]+', r'<b>\g<1>:</b>\n', paragraph_str)

    # Collapse all whitespace, then put each tag boundary on its own line
    paragraph_str = re.sub(r'\s+', ' ', paragraph_str).strip()
    paragraph_str = re.sub(r'>\s*<', '>\n<', paragraph_str).strip()

    # Swap out abbreviations before sentence tokenizing (presumably so their
    # periods are not treated as sentence ends)
    for fake_stop, replacement in zip(fake_stops_list, replacements_list):
        paragraph_str = paragraph_str.replace(fake_stop, replacement)
    child_strs_list = [string for sublist in sent_tokenize(paragraph_str) for string in sublist.split('\n')]

    # No maxsplit argument: splitting on every | or / is the default
    skills_list = [skill.strip() for skill in re.split('[|/]', role_primary_skill) if skill.strip()]

    with open(file_path, 'w', encoding='utf-8') as f:
        print('<html><body><div id="jobDescriptionText" class="jobsearch-jobDescriptionText">', file=f)

        # Role Primary Skill
        print('<hrq>Role Primary Skill:</hrq>', file=f)
        for role_str in child_strs_list:
            if ('<b>' in role_str) and ('</b>' in role_str):
                role_str = role_str.replace('<b>', '').replace('</b>', '')
                print(f'<hrq>{role_str}</hrq>', file=f)
            elif (' plus' in role_str) or ('preferred' in role_str):
                print(f'<opq>{role_str}</opq>', file=f)
            else:
                if ('<li>' in role_str) and ('</li>' in role_str):
                    role_str = role_str.replace('<li>', '').replace('</li>', '').lstrip('·*- \t')

                # Guard both the list-item and the plain case: indexing an
                # empty string would raise IndexError
                if role_str:
                    print(f'<orq>Ability to {role_str[0].lower()}{role_str[1:]}</orq>', file=f)
        for skill_str in skills_list:
            print(f'<orq>{skill_str}</orq>', file=f)

        # Role Description
        print('<hts>Role Description:</hts>', file=f)
        for role_str in child_strs_list:
            role_str = role_str.replace('<li>', '').replace('</li>', '').lstrip('·*- \t')
            if role_str:
                print(f'<ots>{role_str}</ots>', file=f)

        # Role ID, Client and Role Title
        print(f'<ojt>Role ID: {role_id}</ojt>', file=f)
        print(f'<ocs>Client: {client_name}</ocs>', file=f)
        print(f'<ojt>Role Title: {role_title}</ojt>', file=f)

        # Remaining single-value fields
        print(f'<ots>Assigned Role: {assigned_role}</ots>', file=f)
        print(f'<ool>Project Metro City: {project_metro_city}</ool>', file=f)
        print(f'<osp>Career Level From - To: {career_level_from_to}</osp>', file=f)
        print(f'<ojd>Role Start Date: {role_start_date}</ojd>', file=f)
        print(f'<ojd>Role End Date: {role_end_date}</ojd>', file=f)
        print(f'<oip>Role Client Supply Contact: {role_client_supply_contact}</oip>', file=f)
        print(f'<oip>Role Primary Contact: {role_primary_contact}</oip>', file=f)
        print(f'<oip>Role Primary Contact (Email ID): {role_primary_contact_email_id}</oip>', file=f)

        print('</div></body></html>', file=f)

    return (
        assigned_role, career_level_from_to, project_metro_city, role_client_supply_contact, role_end_date, role_primary_contact,
        role_primary_contact_email_id, role_start_date
    )

In [35]:

def reset_parts_of_speech(verbose=False):
    """
    Rewrite the is_* flags on NavigableParents nodes according to their tag.

    For each of the twenty tag names formed from a prefix ('h' or 'o') and a
    two-letter suffix, one write transaction matches the nodes whose
    navigable_parent starts with that opening tag and sets every flag listed
    in base_dict, with is_header taken from the prefix and the flag belonging
    to the suffix switched on.

    Relies on notebook-level names: base_dict, STARTSWITH_STR, COMMA_STR, cu,
    and true / false. NOTE(review): true and false are not defined in any
    visible cell - confirm where they come from and what values they hold.
    """
    header_pairs = [(true, 'h'), (false, 'o')]
    suffix_pairs = [
        ('corporate_scope', 'cs'), ('interview_procedure', 'ip'), ('job_duration', 'jd'),
        ('job_title', 'jt'), ('office_location', 'ol'), ('posting_date', 'pd'),
        ('preferred_qualification', 'pq'), ('minimum_qualification', 'rq'),
        ('supplemental_pay', 'sp'), ('task_scope', 'ts'),
    ]
    for is_header, html_prefix in header_pairs:
        for cypher_suffix, html_suffix in suffix_pairs:

            # Start from the all-off template, then switch on this tag's flags
            working_dict = {**base_dict, 'header': is_header, cypher_suffix: true}

            def do_cypher_tx(tx, verbose=False):
                # Each flag value is written into the query text inside single quotes
                set_clause = COMMA_STR.join(f"np.is_{k} = '{v}'" for k, v in working_dict.items())
                cypher_str = STARTSWITH_STR.format(html_prefix, html_suffix) + set_clause
                if verbose:
                    clear_output(wait=True)
                    print(cypher_str)
                tx.run(query=cypher_str, parameters={})

            with cu.driver.session() as session:
                session.write_transaction(do_cypher_tx, verbose=verbose)

In [37]:

# Load the open-roles sheet; the Client column is renamed before the
# column names are normalized below
roles_df = s.load_csv(
    csv_name='Accenture_Technology_Open_Roles_03.06.2023', folder_path='../saves'
).rename(columns={'Client': 'Client Name'})
print(roles_df.shape)

# Snake-case the column names so they can be read as attributes
roles_df.columns = [
    re.sub(r'[^A-Za-z0-9]+', ' ', cn).strip().replace(' ', '_').lower()
    for cn in roles_df.columns
]

# Keep only the sold roles when the sheet has that column
if 'role_is_sold' not in roles_df.columns:
    df = roles_df
else:
    mask_series = (roles_df.role_is_sold == 'Yes')
    df = roles_df[mask_series]

# Template with every part-of-speech flag switched off.
# NOTE(review): `false` is not defined in any visible cell - confirm its source.
base_dict = dict.fromkeys(
    [
        'header', 'task_scope', 'minimum_qualification', 'preferred_qualification',
        'educational_requirement', 'legal_notification', 'other', 'corporate_scope',
        'job_title', 'office_location', 'job_duration', 'supplemental_pay',
        'interview_procedure', 'posting_date',
    ],
    false
)

# Abbreviations and the text that replaces each one, kept as parallel lists
fake_stop_pairs = [
    ('e.g.', 'eg'), ('etc.', 'etc'), ('M.S.', 'MS'), ('B.S.', 'BS'), ('Ph.D.', 'PhD'), ('(ex.', '(eg'),
    ('(Ex.', '(eg'), ('U.S.', 'US'), ('i.e.', 'ie'), ('&amp;', '&'), ('E.g.', 'eg'),
]
fake_stops_list = [fake_stop for fake_stop, _replacement in fake_stop_pairs]
replacements_list = [replacement for _fake_stop, replacement in fake_stop_pairs]

# Cypher fragments: the MATCH/WHERE/SET opening (formatted with the tag
# prefix and suffix) and the separator placed between SET assignments
STARTSWITH_STR = '''
    MATCH (np:NavigableParents)
    WHERE (np.navigable_parent STARTS WITH "<{}{}>")
    SET
        '''
COMMA_STR = ''',
        '''

(1232, 13)



---

In [38]:

for row in tqdm(df.itertuples(index=False), total=df.shape[0]):

    # Rows without a role ID are skipped
    if str(row.role_id) == 'nan':
        continue

    # Create the file path
    role_id = str(row.role_id).strip()
    client_name = row.client_name.strip()
    role_title = row.role_title.strip()
    file_stem = re.sub(r'[^A-Za-z0-9]+', ' ', f'{role_id} {client_name} {role_title}').strip().replace(' ', '_')
    file_name = file_stem + '.html'
    file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)

    # Roles that already have an HTML file are left untouched
    if os.path.exists(file_path):
        continue

    # Create the file using HTML
    (
        assigned_role, career_level_from_to, project_metro_city, role_client_supply_contact, role_end_date,
        role_primary_contact, role_primary_contact_email_id, role_start_date
    ) = create_accenture_file(file_path, row, verbose=True)

    # Populate the database with the file info
    page_soup = wsu.get_page_soup(file_path)
    div_soup = page_soup.find_all(name='div', id='jobDescriptionText')[0]
    child_strs_list = ha.get_navigable_children(div_soup, [])
    cu.ensure_filename(file_name, verbose=False)
    cu.populate_from_child_strings(child_strs_list, file_name, verbose=False)
    cu.set_accenture_data(
        file_name=file_name,
        assigned_role=assigned_role,
        career_level_from_to=career_level_from_to,
        client_name=client_name,
        project_metro_city=project_metro_city,
        role_client_supply_contact=role_client_supply_contact,
        role_end_date=role_end_date,
        role_id=role_id,
        role_primary_contact=role_primary_contact,
        role_primary_contact_email_id=role_primary_contact_email_id,
        role_start_date=role_start_date,
        role_title=role_title,
        verbose=False
    )

    # NOTE(review): this takes no per-row input yet runs for every new file;
    # consider running it once after the loop if nothing in between reads the flags
    reset_parts_of_speech(verbose=False)
    clear_output(wait=True)
    print(file_name)

100%|█████████████████████████████████| 1232/1232 [1:44:30<00:00,  5.09s/it]

4729451_ULTIMATE_KRONOS_GROUP_Application_Developer.html





In [None]:
# NOTE(review): a bare `raise` with no active exception fails with
# RuntimeError. Presumably this is a deliberate barrier so that "Run All"
# stops here instead of continuing into the cells below - confirm.
raise


----

In [None]:

# Alphabetical list of the .html file names in the saved-HTML folder
html_file_names = (fn for fn in os.listdir(cu.SAVES_HTML_FOLDER) if fn.endswith('.html'))
files_list = sorted(html_file_names)

In [None]:

# Re-register every saved HTML file and its child strings in the database
t0 = time.time()
for file_name in files_list:
    cu.ensure_filename(file_name, verbose=True)
    file_path = os.path.join(cu.SAVES_HTML_FOLDER, file_name)
    page_soup = wsu.get_page_soup(file_path)

    # A page may hold more than one job-description div; each is inserted
    for div_soup in page_soup.find_all(name='div', id='jobDescriptionText'):
        child_strs_list = ha.get_navigable_children(div_soup, [])
        cu.populate_from_child_strings(child_strs_list, file_name, verbose=False)

# Beep and report the elapsed time
elapsed_seconds = time.time() - t0
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
wsu.beep(freq, duration)
print(f'{len(files_list):,} file names reinserted in {duration_str}')


----
## Unless you have written consent from the Generative AI and LLM CoE, you may not use generative AI tools while coding and cannot upload Accenture, ecosystem or client content or data to these tools.

In [None]:

import pandas as pd
from selenium.common.exceptions import TimeoutException

# Opening text shared by the strings to be rephrased, and its length so the
# prefix can be sliced off later
PREFIX_STR = '<orq>Ability to '
strip_count = len(PREFIX_STR)
def do_cypher_tx(tx, verbose=False):
    """
    Return every navigable_parent string that starts with PREFIX_STR.

    Parameters
    ----------
    tx : transaction object
        Must provide run(query=..., parameters=...) returning an iterable of
        records that expose .items().
    verbose : bool, optional
        If True, clear the cell output and print the query with the prefix
        substituted in.

    Returns
    -------
    list of dict
        One {'navigable_parent': ...} dictionary per matching node.
    """
    # The prefix travels as a query parameter instead of being pasted into
    # the query text, matching how the update query in this notebook is run
    cypher_str = '''
    MATCH (np:NavigableParents)
    WHERE
        (np.navigable_parent STARTS WITH $prefix)
        //AND (np.navigable_parent CONTAINS "preferred")
    RETURN np.navigable_parent AS navigable_parent; '''
    if verbose:
        clear_output(wait=True)
        print(cypher_str.replace('$prefix', f'"{PREFIX_STR}"'))
    results_list = tx.run(query=cypher_str, parameters={'prefix': PREFIX_STR})

    return [dict(record.items()) for record in results_list]

# Run the prefix query and collect the matching strings
with cu.driver.session() as session:
    rows_list = session.write_transaction(do_cypher_tx, verbose=True)

# Naming the column up front keeps the attribute access below valid even
# when the query returns no rows (an empty frame would otherwise have no
# navigable_parent column)
df = pd.DataFrame(rows_list, columns=['navigable_parent'])
ability2_list = df.navigable_parent.tolist()
df.shape

In [None]:

def do_cypher_tx(tx, navigable_parent, youchat_html, verbose=False):
    """
    Replace one node's navigable_parent text with its rephrased version.

    Defined once, ahead of the loop, instead of being rebuilt on every
    iteration. Both strings are passed as query parameters.
    """
    cypher_str = '''
                MATCH (np:NavigableParents {navigable_parent: $navigable_parent})
                SET np.navigable_parent = $youchat_html;'''
    if verbose:
        clear_output(wait=True)
        print(cypher_str.replace('$navigable_parent', f'"{navigable_parent}"').replace('$youchat_html', f'"{youchat_html}"'))
    parameter_dict = {'navigable_parent': navigable_parent, 'youchat_html': youchat_html}
    tx.run(query=cypher_str, parameters=parameter_dict)

driver = wsu.get_driver(verbose=False)
try:
    for navigable_parent in tqdm(ability2_list):

        # Drop the prefix, capitalize the first letter and remove the closing tag;
        # the [:1] slice tolerates a string that is nothing but the prefix
        orq_str = navigable_parent[strip_count:]
        orq_str = orq_str[:1].upper() + orq_str[1:]
        orq_str = orq_str.replace('</orq>', '')
        if not orq_str.strip():
            continue

        try:
            youchat_text = wsu.get_chatgpt_rephrasing(driver, orq_str, part_of_speech='minimum requirement', verbose=False)
        except TimeoutException as e:

            # Restart the browser and retry once
            clear_output(wait=True)
            driver.close()
            driver = None
            wsu.wait_for(1000, verbose=False)
            driver = wsu.get_driver(verbose=False)
            youchat_text = wsu.get_chatgpt_rephrasing(driver, orq_str, part_of_speech='minimum requirement', verbose=False)

        if youchat_text:
            youchat_text = '<orq>' + youchat_text + '</orq>'
            with cu.driver.session() as session:
                session.write_transaction(
                    do_cypher_tx, navigable_parent=navigable_parent, youchat_html=youchat_text, verbose=False
                )
finally:

    # Close the browser even when the loop stops on an error or interrupt
    if driver is not None:
        driver.close()

In [7]:

# Close the browser held by `driver`. NOTE(review): the rephrasing cell
# above already closes it when it finishes; presumably this cell is for
# manual cleanup after an interrupted run - confirm.
driver.close()


----

In [16]:

# Show what's in the database already for this html string
import pandas as pd

def do_cypher_tx(tx):
    """
    Fetch up to three non-header task-scope nodes whose navigable_parent is
    wrapped in one matching tag pair.

    The RETURN clause comes from cu.return_every_np_str. Each record is
    returned as a plain dictionary.
    """
    match_clause = '''
        MATCH (np:NavigableParents)
        WHERE
            (np.navigable_parent =~ "^<([^><]+)>.+</\\1>$")
            AND (np.is_header = false)
            AND (np.is_task_scope = true)
        '''
    limit_clause = '''
        LIMIT 3;'''
    cypher_str = match_clause + cu.return_every_np_str + limit_clause

    row_dicts = []
    for record in tx.run(query=cypher_str):
        row_dicts.append(dict(record.items()))

    return row_dicts
# Fetch the sample rows
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(do_cypher_tx)
df = pd.DataFrame(row_objs_list[:3])

# Pull the text from between each matching tag pair
task_regex = re.compile(r'^<([^><]+)>([^><]+)</\1>$')
task_strs_list = [
    match_obj.group(2)
    for html_str in df.navigable_parent.to_list()
    for match_obj in task_regex.finditer(html_str)
]

# Rephrase each task as a minimum requirement and show it under the original
driver = wsu.get_driver(verbose=False)
for task_str in task_strs_list:
    youchat_text = wsu.get_chatgpt_rephrasing(
        driver, 'Ability to ' + task_str, part_of_speech='minimum requirement', verbose=False
    )
    print('', task_str, youchat_text, sep='\n')


Prescient Edge is seeking a Backend Distributed ML Engineer to support a Federal government client. As a Backend Distributed ML Engineer, you will contribute to the development of innovative breakthroughs in machine learning, generating insights from private data silos faster than ever before while preserving the privacy of the underlying data. The technology is developed by a team of covert intelligence operatives as well as government and financial services executives all working together to build a platform that is ready to cope with the agility, efficiency, and relentless push required to plow through the regulatory and compliance that these organizations require.
We are seeking a Backend Distributed ML Engineer with the ability to contribute to the development of breakthroughs in machine learning and generate insights from private data silos. The ideal candidate must have experience in developing platforms that can handle the agility, efficiency, and compliance required by govern

In [17]:

# Close the browser that the sample-rephrasing cell opened with
# wsu.get_driver(); that cell does not close it itself
driver.close()