
---
# Load needed libraries and functions

In [1]:

%pprint

Pretty printing has been turned OFF


In [2]:

%%time
%run ../py/html_analysis.py
hc = HeaderCategories()
ha = HeaderAnalysis()
ea = ElementAnalysis()

Pickling to C:\Users\dev\Documents\Repositories\job-hunting\saves\pickle\CHILD_STR_CLF.pickle
Wall time: 32.8 s


In [3]:

def get_pos_and_consecutives(file_name):
    child_strs_list = ha.get_child_strs_from_file(file_name)
    child_tags_list = ha.get_child_tags_list(child_strs_list)
    is_header_list = ha.get_is_header_list(child_strs_list)
    feature_dict_list = hc.get_feature_dict_list(child_tags_list, is_header_list, child_strs_list)
    feature_tuple_list = [hc.get_feature_tuple(feature_dict) for feature_dict in feature_dict_list]
    crf_list = ea.CRF.predict_single(ea.sent2features(feature_tuple_list))
    pos_list = []
    for pos, feature_tuple in zip(crf_list, feature_tuple_list):
        navigable_parent = feature_tuple[1]
        if navigable_parent in ha.NAVIGABLE_PARENT_IS_HEADER_DICT:
            pos_list = hc.append_parts_of_speech_list(navigable_parent, pos_list)
        else:
            pos_list.append(pos)
    consecutives_list = []
    for pos, v in groupby(pos_list):
        consecutives_count = len(list(v))
        consecutives_tuple = (pos, consecutives_count)
        consecutives_list.append(consecutives_tuple)
    
    return consecutives_list, pos_list, child_strs_list

In [4]:

TITLE_SEPARATOR = ' - '
PARENTH_REGEX = re.compile(r'[\(\[][^\)\]]+[\)\]]')
def get_posting_location(file_name):
    title_str = ' '.join(file_name.split('_')[:-1])
    title_str = re.sub(' - *| *- ', TITLE_SEPARATOR, title_str)
    matched_text_list = []
    for match_obj in PARENTH_REGEX.finditer(title_str):
        matched_text = match_obj.group()
        matched_text_list.append(matched_text)
    for matched_text in matched_text_list:
        replacement_text = matched_text.replace(TITLE_SEPARATOR, '-')
        replacement_text = replacement_text[1:-1].strip()
        replacement_text = replacement_text.replace('. ', TITLE_SEPARATOR).replace(' + ', TITLE_SEPARATOR)
        replacement_text = TITLE_SEPARATOR + replacement_text + TITLE_SEPARATOR
        title_str = title_str.replace(matched_text, replacement_text)
    posting_title_list = title_str.split(TITLE_SEPARATOR)
    posting_title_list = [posting_title_str.strip() for posting_title_str in posting_title_list]
    posting_title_list = list(filter(len, posting_title_list))
    posting_location = posting_title_list.pop()
    if posting_location == 'Indeed.com':
        posting_location = posting_title_list.pop()
    
    return posting_location, posting_title_list

In [5]:

REMOTE_STRS_LIST = ['remote', 'telecommute', 'home-based', 'wfh', 'work from home']
QUAL_STRS_LIST = ['citizen', 'clearance', 'usc', 'gc', 'ead']
US_STRS_LIST = ['united states', 'us', 'usa', 'us citizens only']
PAY_STRS_LIST = ['b2b', 'up to $']
DURATION_STRS_LIST = ['direct hire']
def create_deatails_lists(posting_title_list):
    LOCATION_DETAILS_LIST = []
    REQ_QUALS_LIST = []
    job_title_list = []
    PAY_DETAILS_LIST = []
    duration_details = []
    while len(posting_title_list):
        posting_title_str = posting_title_list.pop()
        if any(map(lambda x: x in posting_title_str.lower(), REMOTE_STRS_LIST)):
            LOCATION_DETAILS_LIST.append(posting_title_str)
        elif any(map(lambda x: re.search(f'\\b{x}\\b', posting_title_str.lower()), QUAL_STRS_LIST)):
            REQ_QUALS_LIST.append(posting_title_str)
        elif any(map(lambda x: x == posting_title_str.lower(), US_STRS_LIST)):
            LOCATION_DETAILS_LIST.append(posting_title_str)
        elif any(map(lambda x: x in posting_title_str.lower(), PAY_STRS_LIST)):
            PAY_DETAILS_LIST.append(posting_title_str)
        elif any(map(lambda x: x in posting_title_str.lower(), DURATION_STRS_LIST)):
            duration_details.append(posting_title_str)
        else:
            posting_title_str = posting_title_str.strip()
            if posting_title_str.startswith(','):
                posting_title_str = posting_title_str[1:].strip()
            job_title_list.append(posting_title_str)
    
    return LOCATION_DETAILS_LIST, REQ_QUALS_LIST, PAY_DETAILS_LIST, duration_details, job_title_list

In [6]:

COMMAS_REGEX = re.compile(r', ([^,]+),')
REPLACEMENT_STR = f'{TITLE_SEPARATOR}\\1{TITLE_SEPARATOR}'
def extract_job_title_info(file_name):
    posting_location, posting_title_list = get_posting_location(file_name)
    commas_list = []
    while len(posting_title_list):
        posting_title_str = posting_title_list.pop()
        posting_title_str = re.sub(COMMAS_REGEX, REPLACEMENT_STR, posting_title_str)
        if any(map(lambda x: x in posting_title_str.lower(), REMOTE_STRS_LIST)):
            posting_title_str = re.sub(f"-({'|'.join(REMOTE_STRS_LIST)})", f'{TITLE_SEPARATOR}\\1', posting_title_str, 0, re.IGNORECASE)
        commas_list.append(posting_title_str)
    title_str = TITLE_SEPARATOR.join(commas_list)
    posting_title_list = title_str.split(TITLE_SEPARATOR)
    assert len(posting_title_list), f'{title_str}: {title_str.split(TITLE_SEPARATOR)}'
    LOCATION_DETAILS_LIST, REQ_QUALS_LIST, PAY_DETAILS_LIST, duration_details, job_title_list = create_deatails_lists(posting_title_list)
    
    return posting_location, LOCATION_DETAILS_LIST, REQ_QUALS_LIST, PAY_DETAILS_LIST, duration_details, job_title_list

In [7]:

def get_section_list(pos):
    if pos == 'H-JD':
        return JOB_DURATION_LIST
    elif pos == 'H-ER':
        return EDUCATION_REQUIREMENTS_LIST
    elif pos == 'H-SP':
        return PAY_DETAILS_LIST
    elif pos == 'H-PQ':
        return PREFF_QUALS_LIST
    elif pos == 'H-TS':
        return TASK_SCOPE_LIST
    elif pos == 'H-OL':
        return LOCATION_DETAILS_LIST
    elif pos == 'H-IP':
        return INTERVIEW_DETAILS_LIST
    elif pos == 'H-LN':
        return LEGAL_NOTIFS_LIST
    elif pos == 'H-RQ':
        return REQ_QUALS_LIST
    elif pos == 'H-CS':
        return CORP_SCOPE_LIST
    elif pos == 'H-O':
        return OTHER_LIST
    
    return []

In [8]:

import random

files_list = os.listdir(ha.SAVES_HTML_FOLDER)
file_name = random.choice(files_list)

In [11]:

EDUCATION_REQUIREMENTS_LIST = []
PREFF_QUALS_LIST = []
TASK_SCOPE_LIST = []
INTERVIEW_DETAILS_LIST = []
LEGAL_NOTIFS_LIST = []
CORP_SCOPE_LIST = []
OTHER_LIST = []
posting_location, LOCATION_DETAILS_LIST, REQ_QUALS_LIST, PAY_DETAILS_LIST, JOB_DURATION_LIST, job_title_list = extract_job_title_info(file_name)
def get_pos_list(file_name):
    child_strs_list = ha.get_child_strs_from_file(file_name)
    child_tags_list = ha.get_child_tags_list(child_strs_list)
    is_header_list = ha.get_is_header_list(child_strs_list)
    feature_dict_list = hc.get_feature_dict_list(child_tags_list, is_header_list, child_strs_list)
    feature_tuple_list = [hc.get_feature_tuple(feature_dict) for feature_dict in feature_dict_list]
    crf_list = ea.CRF.predict_single(ea.sent2features(feature_tuple_list))
    pos_list = []
    for pos, feature_tuple in zip(crf_list, feature_tuple_list):
        navigable_parent = feature_tuple[1]
        if navigable_parent in ha.NAVIGABLE_PARENT_IS_HEADER_DICT:
            pos_list = hc.append_parts_of_speech_list(navigable_parent, pos_list)
        else:
            pos_list.append(pos)
    
    return pos_list, child_strs_list

In [12]:

from itertools import groupby

#for file_name in files_list:
pos_list, child_strs_list = get_pos_list(file_name)
consecutives_list = []
for key_pos, consecutive_value in groupby(pos_list):
    value_list = list(consecutive_value)
    consecutives_count = len(value_list)
    consecutives_tuple = (key_pos, consecutives_count)
    consecutives_list.append(consecutives_tuple)
for pos in list(set(pos_list) - set(['O'])):
    section_list = get_section_list(pos)
    speech_parts_idx_list = ea.get_idx_list(pos_list, pos)
    consecutives_idx_list = ea.get_idx_list(consecutives_list, (pos, 1))
    for speech_part_idx, consecutives_idx in zip(speech_parts_idx_list, consecutives_idx_list):
        if consecutives_idx+1 < len(consecutives_list):
            o_count = consecutives_list[consecutives_idx+1][1]
            if pos == 'H-RQ':
                o_list = []
                for child_str in child_strs_list[speech_part_idx:speech_part_idx+o_count+1]:
                    if child_str not in ha.NAVIGABLE_PARENT_IS_QUAL_DICT:
                        o_list.append(child_str)
                    elif ha.NAVIGABLE_PARENT_IS_QUAL_DICT[child_str]:
                        o_list.append(child_str)
            else:
                 o_list = child_strs_list[speech_part_idx:speech_part_idx+o_count+1]
            section_list.extend(o_list)
print(consecutives_list)
#break

[('H-TS', 1), ('O', 17), ('H-LN', 1), ('O', 20), ('H-LN', 1), ('O', 1)]


In [13]:

for pos in list(set(pos_list) - set(['O'])):
    print(pos, hc.POS_EXPLANATION_DICT[pos], ea.get_idx_list(pos_list, pos))

H-LN Legal Notifications Header [18, 39]
H-TS Task Scope Header [0]


In [14]:

ZMQInteractiveShell_obj = get_ipython()
NAVIGABLE_PARENT_IS_QUAL_DICT = s.load_object('NAVIGABLE_PARENT_IS_QUAL_DICT')
def get_pos_code():
    output_str = ''
    for pos in list(set(pos_list) - set(['O'])):
        output_str += '\n'
        output_str += '# '
        output_str += pos
        output_str += ' '
        output_str += hc.POS_EXPLANATION_DICT[pos]
        output_str += '\n'
        section_list = get_section_list(pos)
        output_str += 'for tag_str in [\n'
        for child_str in section_list:
            if "'" in child_str:
                output_str += f'        "{child_str}",\n'
            else:
                output_str += f"        '{child_str}',\n"
        output_str += '    ]:\n'
        if pos in ['H-RQ', 'H-PQ']:
            output_str += '    NAVIGABLE_PARENT_IS_QUAL_DICT[tag_str] = True\n'
        else:
            output_str += '    NAVIGABLE_PARENT_IS_QUAL_DICT[tag_str] = False\n'
        output_str += '''print(len(NAVIGABLE_PARENT_IS_QUAL_DICT.keys()))
s.store_objects(NAVIGABLE_PARENT_IS_QUAL_DICT=NAVIGABLE_PARENT_IS_QUAL_DICT)\n'''
    
    return output_str


---
# Training

In [None]:

ZMQInteractiveShell_obj.set_next_input(text=get_pos_code(), replace=True)

In [15]:

# H-LN Legal Notifications Header
for tag_str in [
        '<b>Desirable Skills</b>',
        '<li>Demonstrated ability and problem solving skills to develop innovative machine learning applications</li>',
        '<b>About AIS</b>',
        '<b>AIS, Dedicated to Our People</b>',
        "AIS employees can spend their entire career at AIS doing challenging, rewarding work and reach their desired level of achievement and responsibility. We offer the opportunity to move up, without the obligation to move out of a position where one excels. We are committed to our employee's success; however, they define it.",
        "It's our dedication to our employees that inspired our leadership to invest in our future and become partially employee-owned through an Employee Stock Ownership Program (ESOP).",
        'Our employees are our greatest strength, and we do all that we can to serve them. We invest in technology as early adopters, allowing us to create transformative and innovative solutions for our customers while exposing our team to cutting edge technology.',
        'We hire outstanding individuals who are committed to curiosity, passionate about emerging technology, and who are excited to find innovative solutions for the biggest tech challenges facing international brands and government agencies today.',
        '<b>We Invest in Individuals Committed to Innovation</b>',
        'AIS is seeking professionals of a certain character and level of excellence. People that we can learn from and that we can help grow to achieve their personal career goals. We are looking for:',
        '<li>Smart people with a passion for technology</li>',
        '<li>Strong technical capabilities with a consultancy mindset</li>',
        '<li>Close involvement with local technical communities</li>',
        '<li>A willingness to think outside of the box to provide innovative solutions to clients</li>',
        '<li>Ability to solve challenging technical business problems</li>',
        '<li>Self-directed professionals</li>',
        '<b>Our Core Values</b>',
        '<li>Client Success</li>',
        '<li>Continued Learning and Technical Excellence</li>',
        '<li>Strong Client Relationships</li>',
        '<li>Citizenship and Community</li>',
        '<b>AIS is an Equal Opportunity Employer</b>',
        'Applied Information Sciences is an Equal Opportunity Employer and does not discriminate on the basis of race, national origin, religion, color, gender, sexual orientation, age, disability, protected veteran status or any other basis covered by law. Employment decisions are based solely on qualifications merit, and business need.',
    ]:
    NAVIGABLE_PARENT_IS_QUAL_DICT[tag_str] = False
print(len(NAVIGABLE_PARENT_IS_QUAL_DICT.keys()))
s.store_objects(NAVIGABLE_PARENT_IS_QUAL_DICT=NAVIGABLE_PARENT_IS_QUAL_DICT)

# H-TS Task Scope Header
for tag_str in [
        '<b>Now Hiring an AI/ML Engineer</b>',
        'As an',
        '<b>AI/ML Engineer,</b>',
        'you will get hands on with translating the vision from solution architects and work with data scientists, date engineers, loT specialists, and software developers to build complete end-to-end solutions. This is a remote position.',
        '<li>Knowledge and Experience in designing and implementing AI applications using Microsoft Azure Cognitive Services, Azure Bot Service and Azure Machine Learning</li>',
        '<li>Analyzing requirements for AI solutions</li>',
        '<li>Experience recommending the appropriate tools and technologies for AI solutions</li>',
        '<li>Experience in designing and implementing AI solutions that meet scalability and performance requirements</li>',
        '<li>Understanding when a custom model API should be developed to meet specific requirements</li>',
        '<li>Experience in understanding of programming language such as Python</li>',
        '<li>Ability to work autonomously and collaboratively as part of a team to both teach and learn every day</li>',
        '<li>Continuously looking for opportunities to learn, build skills and share learning.</li>',
        '<li>Excellent written and verbal communication skills</li>',
        '<li>Must have experience in application development</li>',
        '<b>Profile of Success</b>',
        '<li>Understanding of ML frameworks</li>',
        '<li>Proven experience as a Machine Learning Engineer or similar role</li>',
        '<li>Experience with setting up DevOps for Machine Learning to enable data science teams and IT teams to collaborate</li>',
    ]:
    NAVIGABLE_PARENT_IS_QUAL_DICT[tag_str] = False
print(len(NAVIGABLE_PARENT_IS_QUAL_DICT.keys()))
s.store_objects(NAVIGABLE_PARENT_IS_QUAL_DICT=NAVIGABLE_PARENT_IS_QUAL_DICT)



---
# Inference


----
# Create the Header Pattern Dictionary

In [None]:

child_strs_list

In [16]:

child_str = '<b>About AIS</b>'
child_str in ha.NAVIGABLE_PARENT_IS_HEADER_DICT

False

In [17]:

ha.NAVIGABLE_PARENT_IS_HEADER_DICT[child_str] = True
s.store_objects(NAVIGABLE_PARENT_IS_HEADER_DICT=ha.NAVIGABLE_PARENT_IS_HEADER_DICT)

Pickling to C:\Users\dev\Documents\Repositories\job-hunting\saves\pickle\NAVIGABLE_PARENT_IS_HEADER_DICT.pickle


In [58]:

child_str in hc.TASK_SCOPE_HEADERS_LIST

False

In [59]:

hc.TASK_SCOPE_HEADERS_LIST.append(child_str)
s.store_objects(TASK_SCOPE_HEADERS_LIST=hc.TASK_SCOPE_HEADERS_LIST)

Pickling to C:\Users\dev\Documents\Repositories\job-hunting\saves\pickle\TASK_SCOPE_HEADERS_LIST.pickle


In [60]:

files_list = os.listdir(ha.SAVES_HTML_FOLDER)
HEADER_PATTERN_DICT = {}
for file_name in files_list:
    if file_name in ea.CHILD_STRS_LIST_DICT:
        child_strs_list = ea.CHILD_STRS_LIST_DICT[file_name]
    else:
        file_path = os.path.join(ha.SAVES_HTML_FOLDER, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            html_str = f.read()
            body_soup = ha.get_body_soup(html_str)
            child_strs_list = ha.get_navigable_children(body_soup, [])
    if not len(child_strs_list):
            file_path = os.path.join(ha.SAVES_HTML_FOLDER, file_name)
            os.remove(file_path)
            continue
    navigable_parent = child_strs_list[0]
    if navigable_parent not in ha.NAVIGABLE_PARENT_IS_HEADER_DICT:
        continue
    child_tags_list = []
    is_header_list = []
    for navigable_parent in child_strs_list:
        if navigable_parent not in ha.NAVIGABLE_PARENT_IS_HEADER_DICT:
            break
        tokenized_sent = ha.html_regex_tokenizer(navigable_parent)
        try:
            first_token = tokenized_sent[0]
            if first_token[0] == '<':
                child_tags_list.append(first_token[1:])
            else:
                child_tags_list.append('plaintext')
        except:
            child_tags_list.append('plaintext')
        is_header = ha.NAVIGABLE_PARENT_IS_HEADER_DICT[navigable_parent]
        is_header_list.append(is_header)
    if len(child_tags_list) == len(child_strs_list):
        if file_name not in ea.CHILD_STRS_LIST_DICT:
            ea.CHILD_STRS_LIST_DICT[file_name] = child_strs_list
            s.store_objects(CHILD_STRS_LIST_DICT=ea.CHILD_STRS_LIST_DICT)
        if file_name not in HEADER_PATTERN_DICT:
            item_sequence = hc.get_feature_dict_list(child_tags_list, is_header_list, child_strs_list)
            HEADER_PATTERN_DICT[file_name] = item_sequence
            s.store_objects(HEADER_PATTERN_DICT=HEADER_PATTERN_DICT)

Pickling to C:\Users\dev\Documents\Repositories\job-hunting\saves\pickle\HEADER_PATTERN_DICT.pickle
Pickling to C:\Users\dev\Documents\Repositories\job-hunting\saves\pickle\HEADER_PATTERN_DICT.pickle
Pickling to C:\Users\dev\Documents\Repositories\job-hunting\saves\pickle\HEADER_PATTERN_DICT.pickle
Pickling to C:\Users\dev\Documents\Repositories\job-hunting\saves\pickle\HEADER_PATTERN_DICT.pickle
Pickling to C:\Users\dev\Documents\Repositories\job-hunting\saves\pickle\HEADER_PATTERN_DICT.pickle
Pickling to C:\Users\dev\Documents\Repositories\job-hunting\saves\pickle\HEADER_PATTERN_DICT.pickle
Pickling to C:\Users\dev\Documents\Repositories\job-hunting\saves\pickle\HEADER_PATTERN_DICT.pickle
Pickling to C:\Users\dev\Documents\Repositories\job-hunting\saves\pickle\HEADER_PATTERN_DICT.pickle
Pickling to C:\Users\dev\Documents\Repositories\job-hunting\saves\pickle\HEADER_PATTERN_DICT.pickle
Pickling to C:\Users\dev\Documents\Repositories\job-hunting\saves\pickle\HEADER_PATTERN_DICT.pickle
