In [1]:

%pprint

Pretty printing has been turned OFF


In [70]:

import os
import random
import re
from urllib.parse import urlparse, parse_qs

import numpy as np
import pandas as pd
import requests
from IPython.display import clear_output
from IPython.display import display
from bs4 import BeautifulSoup
from bs4.element import NavigableString
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression

%run ../load_magic/storage.py
s = Storage()

LT_REGEX = re.compile(r'\s+<')
GT_REGEX = re.compile(r'>\s+')
SAVES_HTML_FOLDER = os.path.join(s.saves_folder, 'html')
try:
    BASIC_TAGS_DICT = s.load_object('BASIC_TAGS_DICT')
except:
    BASIC_TAGS_DICT = {}
    s.store_objects(BASIC_TAGS_DICT=BASIC_TAGS_DICT)
try:
    FIT_ESTIMATORS_DICT = s.load_object('FIT_ESTIMATORS_DICT')
except:
    FIT_ESTIMATORS_DICT = {}
    FIT_ESTIMATORS_DICT['LogisticRegression'] = LogisticRegression(**{'C': 85.0, 'class_weight': 'balanced', 'dual': False,
                                                                      'fit_intercept': True, 'max_iter': 6, 'penalty': 'l2', 'solver': 'sag',
                                                                      'tol': 1e-08})
    s.store_objects(FIT_ESTIMATORS_DICT=FIT_ESTIMATORS_DICT)

In [3]:

def clean_html_str(html_obj):
    """
    Convert an object to its string form and tidy the whitespace around tags.

    The string is stripped at both ends, whitespace immediately before a '<'
    is removed (LT_REGEX) and whitespace immediately after a '>' is removed
    (GT_REGEX).

    Parameters
    ----------
    html_obj : any object whose str() is the HTML/text to clean

    Returns
    -------
    str
        The cleaned string.
    """
    stripped_str = str(html_obj).strip()
    without_space_before_lt = LT_REGEX.sub('<', stripped_str)

    return GT_REGEX.sub('>', without_space_before_lt)

In [4]:

def get_navigable_children(tag, result_list=None):
    """
    Recursively collect the text-bearing strings found under a parsed tag.

    For every non-empty text node, the cleaned HTML of its parent tag is
    collected; when that parent HTML contains more than two '<' characters
    (i.e. it wraps further tags) only the cleaned text itself is collected.

    Parameters
    ----------
    tag : a BeautifulSoup tag or string node
    result_list : list, optional
        List to append to. A new list is created when omitted.

    Returns
    -------
    list of str
        `result_list` (the same object when one was passed in).
    """
    # A mutable default would be shared by every call that omits the argument,
    # so results from earlier calls would leak into later ones.
    if result_list is None:
        result_list = []

    if isinstance(tag, NavigableString):
        # Subclasses such as Comment or CData are markup rather than visible text
        # and have no children to recurse into, so only plain strings are kept.
        if type(tag) is NavigableString:
            base_str = clean_html_str(tag)
            if base_str:
                tag_str = clean_html_str(tag.parent)
                # More than an opening and a closing tag: fall back to the bare text
                if tag_str.count('<') > 2:
                    tag_str = base_str
                result_list.append(tag_str)
    else:
        for child_tag in tag.children:
            get_navigable_children(child_tag, result_list)

    return result_list

In [5]:

def update_child_strs_list_dictionary(child_strs_list_dict=None, basic_tags_dict=None):
    """
    Build or refresh the {file name: unlabelled HTML strings} dictionary.

    Parameters
    ----------
    child_strs_list_dict : dict, optional
        Existing {file name: list of HTML strings} mapping. When omitted, the
        mapping is built from every file in SAVES_HTML_FOLDER.
    basic_tags_dict : dict, optional
        Already-labelled HTML strings (only its keys are used). Loaded from
        storage as 'BASIC_TAGS_DICT' when omitted.

    Returns
    -------
    dict
        The mapping with every already-labelled string removed and every file
        that has nothing left to label dropped. A dict passed in is updated in
        place and returned.

    Side effects
    ------------
    When building from disk, a saved file that yields no strings at all is
    deleted from SAVES_HTML_FOLDER.
    """
    if basic_tags_dict is None:
        # The labels are stored under the upper-case name everywhere else
        basic_tags_dict = s.load_object('BASIC_TAGS_DICT')

    if child_strs_list_dict is None:
        child_strs_list_dict = {}
        for file_name in os.listdir(SAVES_HTML_FOLDER):
            file_path = os.path.join(SAVES_HTML_FOLDER, file_name)
            with open(file_path, 'r', encoding='utf-8') as f:
                html_str = f.read()
            job_soup = BeautifulSoup(html_str, 'lxml')
            body_soup = job_soup.find(name='body')

            # A document without a body is treated like one without any strings
            child_strs_list = [] if body_soup is None else get_navigable_children(body_soup, [])
            if not child_strs_list:
                os.remove(file_path)
                continue
            unlabelled_list = [child_str for child_str in child_strs_list if child_str not in basic_tags_dict]
            if unlabelled_list:
                child_strs_list_dict[file_name] = unlabelled_list

        return child_strs_list_dict

    # Iterate over a snapshot so entries can be deleted while looping; every
    # file is processed instead of stopping at the first empty one.
    for file_name, child_strs_list in list(child_strs_list_dict.items()):
        unlabelled_list = [child_str for child_str in child_strs_list if child_str not in basic_tags_dict]
        if unlabelled_list:
            child_strs_list_dict[file_name] = unlabelled_list
        else:
            del child_strs_list_dict[file_name]

    return child_strs_list_dict

In [6]:

# Reuse the stored {file name: unlabelled strings} mapping when it can be loaded;
# otherwise build it from the saved HTML files and persist it.
try:
    CHILD_STRS_LIST_DICT = s.load_object('CHILD_STRS_LIST_DICT')
except Exception:
    # `except Exception` rather than a bare `except` so Ctrl-C still interrupts
    CHILD_STRS_LIST_DICT = update_child_strs_list_dictionary()
    s.store_objects(CHILD_STRS_LIST_DICT=CHILD_STRS_LIST_DICT)

In [83]:

# Earlier pattern, kept for reference:
#SCANNER_REGEX = re.compile(r'(</?|\b|:)[1-9a-zA-Z][0-9a-zA-Z]*( *[#\+]{1,2}|>|\.\b|\b)')
# Tokens are: an opening/closing tag name without its '>', a word optionally
# followed by '#'/'+' characters, a colon, or a full stop.
SCANNER_REGEX = re.compile(r'</?\w+|\w+[#\+]*|:|\.')
def regex_tokenizer(corpus):
    """
    Split a string into the tokens recognised by SCANNER_REGEX.

    Parameters
    ----------
    corpus : str
        Text (possibly containing HTML tags) to tokenize.

    Returns
    -------
    list of str
        The matched tokens in order of appearance; everything else is dropped.
    """
    # The pattern has no capture groups, so findall yields the full matches
    tokens_list = SCANNER_REGEX.findall(corpus)

    return tokens_list

In [300]:

# Spot-check the tokenizer on a sample list item: tag names keep their '<' or '</'
# prefix, the commas are dropped and the full stop becomes its own token.
regex_tokenizer('<li>MS or PhD in Applied Mathematics, Physics, Computer Science, Statistics or related technical field.</li>')

['<li', 'MS', 'or', 'PhD', 'in', 'Applied', 'Mathematics', 'Physics', 'Computer', 'Science', 'Statistics', 'or', 'related', 'technical', 'field', '.', '</li']


----

In [164]:

def create_dictionary_code():
    """
    Print a ready-to-paste code snippet for labelling the next unlabelled string.

    Reloads the stored labels, refreshes CHILD_STRS_LIST_DICT against them,
    stores the refreshed mapping, and prints assignment code for the first
    string of the first file that still has something to label. Nothing is
    printed when no file has unlabelled strings left.
    """
    basic_tags_dict = s.load_object('BASIC_TAGS_DICT')
    child_strs_list_dict = update_child_strs_list_dictionary(CHILD_STRS_LIST_DICT, basic_tags_dict)
    s.store_objects(CHILD_STRS_LIST_DICT=child_strs_list_dict)
    for child_strs_list in child_strs_list_dict.values():

        # A file with nothing left to label must not end the search
        if not child_strs_list:
            continue
        tag_str = child_strs_list[0]
        print()
        print()

        # repr() always yields a valid string literal, including when the text
        # contains both quote characters, backslashes or newlines
        print(f'BASIC_TAGS_DICT[{tag_str!r}] = False')
        print('print(len(BASIC_TAGS_DICT.keys()))\ns.store_objects(BASIC_TAGS_DICT=BASIC_TAGS_DICT)')
        break

In [388]:

# Print the labelling snippet for the next unlabelled string; paste it into the
# next cell and set the value to True if the string is a header.
create_dictionary_code()

Pickling to C:\Users\dev\Documents\Repositories\job-hunting\saves\pickle\CHILD_STRS_LIST_DICT.pickle


BASIC_TAGS_DICT['<p>Your first three months will be spent getting up-to-speed on our team’s tools and processes, learning the details of the business problems we’re working on, contributing new ideas for solutions to those problems based on your experience</p>'] = False
print(len(BASIC_TAGS_DICT.keys()))
s.store_objects(BASIC_TAGS_DICT=BASIC_TAGS_DICT)


In [389]:

# Manual label for one string (True marks it as a header), then report the
# number of labelled strings and persist the labels.
BASIC_TAGS_DICT['<b>What does your success look like in the first 90 days?</b>'] = True
print(len(BASIC_TAGS_DICT))
s.store_objects(BASIC_TAGS_DICT=BASIC_TAGS_DICT)

216
Pickling to C:\Users\dev\Documents\Repositories\job-hunting\saves\pickle\BASIC_TAGS_DICT.pickle


In [390]:

# Re-transform the bag-of-words and tf-idf from the new manual scores
BASIC_TAGS_DICT = s.load_object('BASIC_TAGS_DICT')
rows_list = [{'navigable_parent': navigable_parent, 'is_header': is_header} for navigable_parent, is_header in BASIC_TAGS_DICT.items()]
child_str_df = pd.DataFrame(rows_list)

# Vectorizer settings shared by training and inference. The inference
# vectorizer must tokenize exactly like the fitted one (custom tokenizer, no
# lower-casing, 1- to 5-grams); otherwise the stored n-gram vocabulary cannot
# be matched when predicting.
CS_CV_PARAMS = {'analyzer': 'word', 'binary': False, 'decode_error': 'strict', 'lowercase': False, 'max_df': 1.0,
                'max_features': None, 'min_df': 0.0, 'ngram_range': (1, 5), 'stop_words': None, 'strip_accents': 'ascii',
                'tokenizer': regex_tokenizer}

if child_str_df.shape[0]:
    sents_list = child_str_df.navigable_parent.tolist()
    
    # Bag-of-words
    cv = CountVectorizer(**CS_CV_PARAMS)
    bow_matrix = cv.fit_transform(sents_list)
    s.store_objects(cs_cv_vocab=cv.vocabulary_)
    
    # Tf-idf must get from Bag-of-words first
    tt = TfidfTransformer(**{'norm': 'l1', 'smooth_idf': True, 'sublinear_tf': False, 'use_idf': True})
    tfidf_matrix = tt.fit_transform(bow_matrix)
    s.store_objects(CS_TT=tt)
    
    # Re-train the classifier; the newly fitted estimator replaces the stored
    # FIT_ESTIMATORS_DICT['LogisticRegression'] entry
    X = tfidf_matrix.toarray()
    y = child_str_df.is_header.to_numpy()
    FIT_ESTIMATORS_DICT = s.load_object('FIT_ESTIMATORS_DICT')
    child_str_clf = LogisticRegression(**{'C': 375.0, 'class_weight': 'balanced', 'dual': False, 'fit_intercept': True, 'max_iter': 4,
                                          'penalty': 'l1', 'solver': 'liblinear', 'tol': 7e-07})
    child_str_clf.fit(X, y)
    FIT_ESTIMATORS_DICT['LogisticRegression'] = child_str_clf
    s.store_objects(child_str_clf=child_str_clf, FIT_ESTIMATORS_DICT=FIT_ESTIMATORS_DICT)
    
    # Re-calibrate the inference engine: same analysis settings as the fitted
    # vectorizer, with the vocabulary fixed to the stored one
    cs_cv_vocab = s.load_object('cs_cv_vocab')
    CS_CV = CountVectorizer(vocabulary=cs_cv_vocab, **CS_CV_PARAMS)
    CS_CV._validate_vocabulary()
    CS_TT = s.load_object('CS_TT')
    def predict_percent_fit(navigable_parents_list):
        """
        Return the classifier's class probabilities for a list of HTML strings.

        Parameters
        ----------
        navigable_parents_list : list of str

        Returns
        -------
        numpy.ndarray
            One row of `predict_proba` output per input string, or an empty
            array when the input list is empty.
        """
        y_predict_proba = np.array([])
        if len(navigable_parents_list):
            X_test = CS_TT.transform(CS_CV.transform(navigable_parents_list)).toarray()
            y_predict_proba = child_str_clf.predict_proba(X_test)

        return y_predict_proba
print('Retraining complete')

Pickling to C:\Users\dev\Documents\Repositories\job-hunting\saves\pickle\cs_cv_vocab.pickle
Pickling to C:\Users\dev\Documents\Repositories\job-hunting\saves\pickle\CS_TT.pickle
Pickling to C:\Users\dev\Documents\Repositories\job-hunting\saves\pickle\child_str_clf.pickle
Pickling to C:\Users\dev\Documents\Repositories\job-hunting\saves\pickle\FIT_ESTIMATORS_DICT.pickle
Retraining complete




In [391]:

# Show the idf weight of up to 20 randomly chosen vocabulary entries that contain a tag
sample_list = []
try:
    cs_cv_vocab = s.load_object('cs_cv_vocab')
except Exception:
    # Only a failed load resets the stored vocabulary
    cs_cv_vocab = {}
    s.store_objects(cs_cv_vocab=cs_cv_vocab)

# Cap the sample size: random.sample raises ValueError when asked for more
# items than are available, which previously triggered the reset above and
# replaced the fitted vocabulary with an empty one.
tag_tokens_list = [(w, i) for w, i in cs_cv_vocab.items() if '<' in w]
sample_list = random.sample(tag_tokens_list, min(20, len(tag_tokens_list)))
sorted([(w, round(CS_TT.idf_[i], 2)) for w, i in sample_list], key=lambda x: x[1], reverse=False)

[('Preferred </li', 4.43), ('<p We are', 5.28), ('<li Good understanding of', 5.28), ('<li Retirement plan', 5.69), ('<li Bonus pay', 5.69), ('Liberty </b', 5.69), ('<p COVID 19', 5.69), ('<b Position Requirements : </b', 5.69), ('Retirement plan </li', 5.69), ('<li Partner with', 5.69), ('circuit design . </p', 5.69), ('always </li', 5.69), ('969 8488 . </i', 5.69), ('<li Oil', 5.69), ('<b Primary Responsibilities : </b', 5.69), ('learn on the job </li', 5.69), ('<p We are looking for', 5.69), ('<li Bachelor s Preferred', 5.69), ('<b Experience directly managing a', 5.69), ('on the job </li', 5.69)]

In [392]:

# Pick one saved posting at random, extract its strings and print each one,
# preceded by 'Header' when the second probability column exceeds 0.05.
files_list = os.listdir(SAVES_HTML_FOLDER)
file_name = random.choice(files_list)
file_path = os.path.join(SAVES_HTML_FOLDER, file_name)
with open(file_path, 'r', encoding='utf-8') as f:
    html_str = f.read()
    job_soup = BeautifulSoup(html_str, 'lxml')
    body_soup = job_soup.find_all(name='body')[0]
    child_strs_list = get_navigable_children(body_soup, [])
for child_str, y_predict_proba in zip(child_strs_list, predict_percent_fit(child_strs_list)):
    header_prefix = 'Header\n' if y_predict_proba[1] > 0.05 else ''
    print(f'\n{header_prefix}{child_str}')


<p>Tiger Analytics is looking for experienced Data Scientists to join our fast-growing advanced analytics consulting firm. Our consultants bring deep expertise in Data Science, Machine Learning and AI. We are the trusted analytics partner for multiple Fortune 500 companies, enabling them to generate business value from data. Our business value and leadership has been recognized by various market research firms, including Forrester and Gartner. We are looking for top-notch talent as we continue to build the best global analytics consulting team in the world.</p>

As a Data Scientist, you will apply strong expertise in AI through the use of machine learning, data mining, and information retrieval to design, prototype, and build next generation advanced analytics engines and services. You will collaborate with cross-functional teams and business partners to define the technical problem statement and hypotheses to test. You will develop efficient and accurate analytical models which mimic

In [168]:

# Names on the vectorizer object that do not start with an underscore
[attr_name for attr_name in dir(cv) if attr_name[:1] != '_']

['analyzer', 'binary', 'build_analyzer', 'build_preprocessor', 'build_tokenizer', 'decode', 'decode_error', 'dtype', 'encoding', 'fit', 'fit_transform', 'fixed_vocabulary_', 'get_feature_names', 'get_params', 'get_stop_words', 'input', 'inverse_transform', 'lowercase', 'max_df', 'max_features', 'min_df', 'ngram_range', 'preprocessor', 'set_params', 'stop_words', 'stop_words_', 'strip_accents', 'token_pattern', 'tokenizer', 'transform', 'vocabulary', 'vocabulary_']

In [87]:

# Twenty randomly chosen entries from the fitted vectorizer's feature names
random.sample(list(cv.get_feature_names()), 20)

['capability for', 'Come In', 'You', 'example', 'Experience', '80', 'language', 'learning to', 'and the', 'safe and', 'graduate', 'Schedule :', 'Yelp', 'Bachelor s', 'experience </li', 'Supplemental Pay', 'automate', 'work', 'persistence', 'full time']


----

In [6]:

# Build a labelling frame from the strings of the most recently parsed posting:
# every row starts as "not a header"; put the row indices of the headers into
# id_list before running the cell.
if len(child_strs_list):
    df = pd.DataFrame(child_strs_list, columns=['navigable_parent'])
    df['is_header'] = False
    id_list = []
    df.loc[id_list, 'is_header'] = True

    # Jupyter only auto-displays a top-level trailing expression, so the frame
    # has to be displayed explicitly from inside the `if`
    display(df.tail(60))

In [7]:

# Copy the labels from the labelling frame into BASIC_TAGS_DICT, persist the
# dictionary, then rebuild and persist the frame of all labelled strings.
# Any error (for example, no labelling frame has been built) is printed
# instead of raised.
try:
    new_labels_dict = dict(zip(df.navigable_parent, df.is_header))
    BASIC_TAGS_DICT.update(new_labels_dict)
    s.store_objects(BASIC_TAGS_DICT=BASIC_TAGS_DICT)
    rows_list = [
        {'navigable_parent': html_str_key, 'is_header': header_flag}
        for html_str_key, header_flag in BASIC_TAGS_DICT.items()
    ]
    child_str_df = pd.DataFrame(rows_list)
    s.store_objects(child_str_df=child_str_df)
except Exception as e:
    error_message = str(e).strip()
    print(error_message)

name 'df' is not defined



----

In [14]:

# Preferred path: rebuild the labelled-strings frame from the stored label
# dictionary and persist it.
try:
    BASIC_TAGS_DICT = s.load_object('BASIC_TAGS_DICT')
    rows_list = [{'navigable_parent': navigable_parent, 'is_header': is_header} for navigable_parent, is_header in BASIC_TAGS_DICT.items()]
    child_str_df = pd.DataFrame(rows_list)
    s.store_objects(child_str_df=child_str_df)
except Exception:
    # Fallback: recover the dictionary from the stored frame instead.
    # `except Exception` rather than a bare `except` so Ctrl-C still interrupts.
    child_str_df = s.load_object('child_str_df')
    try:
        child_str_df = pd.concat([child_str_df, df])
    except Exception:
        # Best effort: the labelling frame `df` may not exist in this session
        pass
    BASIC_TAGS_DICT = child_str_df.set_index('navigable_parent').to_dict()['is_header']
    s.store_objects(BASIC_TAGS_DICT=BASIC_TAGS_DICT)

Pickling to C:\Users\dev\Documents\Repositories\job-hunting\saves\pickle\child_str_df.pickle



----
# Download Job HTML

In [9]:

%run ../load_magic/storage.py
s = Storage()
base_url = 'https://www.indeed.com'
site_url = base_url + '/jobs'
site_url = '?'.join([site_url, 'q=data+scientist'])
site_url = '&'.join([site_url, 'jt=fulltime'])
site_url = '&'.join([site_url, 'remotejob=032b3046-06a3-4876-8dfd-474eb5e7ed11'])
site_url = '&'.join([site_url, 'vjk=ca16b63c03e40c57'])
#site_url = '&'.join([site_url, 'pp=gQAPAAABdY7RMKwAAAABkQdgZAAkAQBEKPpaoZstIag3f-UtQXXG_HFSo1gfBp9OQ0B0TvZ4yMp4AAA'])
start_num = 0
try:
    job_urls_list = s.load_object('job_urls_list')
except:
    job_urls_list = []
    s.store_objects(job_urls_list=job_urls_list)
space_regex = re.compile(r'[\s<>:"/\\\|\?\*_]+')
print_regex = re.compile(r'[\x9c-\x9d\uf0b7\u200b\ufb02]+')
s.encoding_type = ['latin1', 'iso8859-1', 'utf-8'][2]

In [21]:

# Download every collected job URL whose '<fccid>.html' suffix is not yet among
# the saved files, and save the description part of the page as a small
# stand-alone HTML file named '<sanitised title>_<fccid>.html'.
fccid_htmls_list = [fn.split('_')[-1] for fn in os.listdir(SAVES_HTML_FOLDER)]
row_count = len(job_urls_list)
for i, job_url in enumerate(job_urls_list):
    qs = urlparse(job_url).query
    query_dict = parse_qs(qs)

    # A URL without the identifier cannot be mapped to a file name
    if 'fccid' not in query_dict:
        continue
    fccid_str = query_dict['fccid'][0]
    file_name = f'{fccid_str}.html'
    if file_name not in fccid_htmls_list:
        job_page = requests.get(url=job_url)
        job_soup = BeautifulSoup(job_page.content, 'lxml')

        # Stop when the site returns an empty page
        if not len(job_soup.text):
            break
        title_str = job_soup.find_all(name='title')[0].text.strip()
        clear_output(wait=True)
        print(f'{title_str}')
        print(f'{i}/{row_count}: {job_url}')

        # Stop when the site answers with a CAPTCHA page
        if 'CAPTCHA' in title_str:
            break
        file_name = space_regex.sub('_', title_str)
        file_name = f'{file_name}_{fccid_str}.html'
        file_path = os.path.join(SAVES_HTML_FOLDER, file_name)
        body_soup = job_soup.find_all(name='body')[0]
        html_str = '<html><head><title>' + title_str + '</title></head><body>'
        with open(file_path, 'w', encoding='utf-8') as f:
            print(html_str, file=f)
            for div_tag in body_soup.find_all(name='div', class_='jobsearch-JobComponent-description'):

                # Drop template and script elements before saving. The loop
                # variable must not be called `s`: that would rebind the
                # notebook-wide Storage instance to a tag.
                for selector in ('template', 'script'):
                    for unwanted_tag in div_tag.select(selector):
                        unwanted_tag.extract()
                div_str = div_tag.prettify(formatter='html')
                div_str = print_regex.sub('', div_str)
                print(div_str, file=f)
            print('</body></html>', file=f)

        # Remember the new file so a URL repeated later in the list is not fetched again
        fccid_htmls_list.append(f'{fccid_str}.html')
print('Complete.')

Complete.


In [6]:

# Walk the search-result pages ten results at a time (up to offset 3000) and
# collect every job URL that carries an 'fccid' query parameter.
while start_num < 3000:
    page_url = '&'.join([site_url, f'start={start_num}'])
    start_num += 10
    site_page = requests.get(url=page_url)
    page_soup = BeautifulSoup(site_page.content, 'lxml')
    row_div_list = page_soup.find_all(name='div', class_=['row', 'result'])
    row_count = len(row_div_list)
    if row_count == 0:
        print('Nothing left')
        break
    for i, row_div in enumerate(row_div_list):

        # A result row without any anchor is skipped instead of raising IndexError
        link = row_div.find(name='a')
        if link is None or 'title' not in link.attrs or 'href' not in link.attrs:
            continue
        job_url = base_url + link['href']
        qs = urlparse(job_url).query
        query_dict = parse_qs(qs)
        if 'fccid' not in query_dict:
            continue

        # Re-running the crawl must not append the same URL again
        if job_url in job_urls_list:
            continue
        job_urls_list.append(job_url)
        s.store_objects(verbose=False, job_urls_list=job_urls_list)
        clear_output(wait=True)
        print(f'{page_url}')
        print(f'{i}/{row_count}: {job_url}')

In [22]:

# Number of saved HTML files (each reduced to the part after its last underscore)
fccid_htmls_list = [saved_name.rsplit('_', 1)[-1] for saved_name in os.listdir(SAVES_HTML_FOLDER)]
len(fccid_htmls_list)

273

In [22]:

import sys

command_str = f'{sys.executable} -m pip install --upgrade lxml'
print(command_str)
!{command_str}

C:\Users\dev\Anaconda3\envs\jh\python.exe -m pip install --upgrade lxml
Collecting lxml
  Downloading lxml-4.6.1-cp39-cp39-win_amd64.whl (3.5 MB)
Installing collected packages: lxml
Successfully installed lxml-4.6.1



----

In [2]:

import os

from selenium import webdriver
from base64 import b64encode
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
import selenium.webdriver.support.ui as ui

In [4]:

# Retrieve the page with tag results and set it up to be scraped:
# request site_url and parse the response body with the lxml parser
sitePage = requests.get(url=site_url)
sitePageSoup = BeautifulSoup(sitePage.content, 'lxml')

In [5]:

# Create the 'txt' subfolder of the Storage saves folder if it does not exist yet
saves_txt_folder = os.path.join(s.saves_folder, 'txt')
os.makedirs(name=saves_txt_folder, exist_ok=True)

In [6]:

# Divs of the parsed search page selected by the classes 'row' / 'result'
row_div_list = sitePageSoup.find_all(name='div', class_=['row', 'result'])
# Matches the fragments 'ducat', 'xperience' or 'equire'
# (first letters presumably left off so capitalisation does not matter — confirm)
reqs_regex = re.compile('ducat|xperience|equire')
# Matches '<', one or more characters other than '>', then '>'
html_regex = re.compile('<[^>]+>')

In [201]:

# Inspect the HTML string stored in the row labelled 0 of the labelling frame
df.loc[0, 'navigable_parent']

'<div class="jobsearch-JobDescriptionSection-jobDescriptionTitle icl-u-xs-my--md" id="jobDescriptionTitle">Full Job Description</div>'

In [None]:

import pandas as pd

# Build a labelling frame from the strings of one description div.
# NOTE(review): div_tag_list is not defined in any visible cell, and 11 is a
# hand-picked index — confirm both before running.
div_tag = div_tag_list[11]
child_str_list = get_navigable_children(div_tag, [])
df = pd.DataFrame(child_str_list, columns=['navigable_parent'])
# Every string starts as "not a header"; the commented line below shows how
# selected rows were flagged by index
df['is_header'] = False
#df.loc[[0, 2, 4, 17, 19], 'is_header'] = True
df

In [None]:

# Phrases that introduce a requirements section. Each entry is a regular
# expression fragment, hence the raw strings.
text_list = [r'What we are looking for', r'Key skills and Experience', r'Minimum qualifications', r'The Essentials',
             r'Qualifications', r'Skills and experience', r'Required Qualifications', r'What your background looks like',
             r"We(?:&rsquo;|')re looking for \w+ who have", r'Experience/Minimum Qualifications', r'Requirements',
             r'We are looking for someone with']

# Joined outside the f-string: nesting the same quote character inside an
# f-string expression is a syntax error before Python 3.12
headers_pattern = '|'.join(text_list)

# A line consisting only of one of the phrases, optionally followed by a colon
reqs_regex = re.compile(rf'^\s*({headers_pattern}):?\s*$', re.IGNORECASE | re.MULTILINE)

# NOTE(review): text_str is not defined in any visible cell — confirm it holds
# the posting text before running.
for match_obj in reqs_regex.finditer(text_str):

    # Start offset, exclusive end offset and matched text of each header line
    print(match_obj.start(), match_obj.end(), match_obj.group())

In [39]:

# Attributes of div_tag whose name mentions 'child' (case-insensitive)
[f'div_tag.{attr_name}' for attr_name in dir(div_tag) if 'child' in attr_name.lower()]

['div_tag._lastRecursiveChild', 'div_tag.childGenerator', 'div_tag.children', 'div_tag.findChild', 'div_tag.findChildren', 'div_tag.recursiveChildGenerator', 'div_tag.replaceWithChildren', 'div_tag.replace_with_children']

In [117]:

# Chain of direct-child CSS selectors, from the body element down to the
# div with class 'jobsearch-JobComponent-description'
selector_list = ['body', 'div', 'div.jobsearch-ViewJobLayout-fluidContainer',
                 'div.jobsearch-ViewJobLayout-content.jobsearch-ViewJobLayout-mainContent', 'div', 'div',
                 'div.jobsearch-ViewJobLayout-jobDisplay', 'div.jobsearch-JobComponent', 'div.jobsearch-JobComponent-description']
# Joined with ' > ' so each step must be a direct child of the previous one
content_selector = ' > '.join(selector_list)

def has_role_attr(tag):
    """
    Tell whether a tag carries a 'role' attribute.

    Parameters
    ----------
    tag : object exposing has_attr(name)

    Returns
    -------
    The result of tag.has_attr('role').
    """
    role_present = tag.has_attr('role')

    return role_present

In [None]:

def has_class_and_id(tag):
    """
    Tell whether a tag carries both a 'class' and an 'id' attribute.

    Parameters
    ----------
    tag : object exposing has_attr(name)

    Returns
    -------
    The result of tag.has_attr('class') when that is falsy, otherwise the
    result of tag.has_attr('id').
    """
    if not tag.has_attr('class'):
        return tag.has_attr('class')

    return tag.has_attr('id')

# <span class="summary">We need a <b>data</b> <b>scientist</b> and <b>data</b> wrangler. Maybe survey <b>data</b>. 
# Better still if you have some demonstrable experience with more advanced machine learning methods...</span>
# Print the stripped text of every span with class 'summary', each followed by a blank line
summary_list = sitePageSoup.find_all(name='span', class_='summary')
for summary in summary_list:
    print(summary.text.strip(), end='\n\n')

In [None]:

# <a target="_blank" id="sja5" data-tn-element="jobTitle" class="jobtitle turnstileLink" 
# href="https://www.indeed.com/pagead/clk?mo=r&amp;ad=-...-...-...-...-...-...-...&
# amp;p=5&amp;sk=&amp;fvj=1&amp;tk=1c06s8995av53coj&amp;jsa=6565" 
# title="Perception Scientist for Marine Autonomy" rel="noopener nofollow" 
# onmousedown="sjomd('sja5'); clk('sja5');" onclick="setRefineByCookie([]); sjoc('sja5',0); convCtr('SJ')"
# >Perception <b>Scientist</b> for Marine Autonomy</a>
# Select the element with id 'sja5' and show what was found
siteCss = '#sja5'
siteLinks = sitePageSoup.select(siteCss)
print(siteLinks)

# Page count taken from the first match's href: last '/'-separated segment,
# then the second '-'-separated piece, then the part before the first comma.
# Stays 0 when nothing was selected.
max_page = 0
if siteLinks:
    last_path_segment = siteLinks[0]["href"].split('/')[-1]
    second_dash_piece = last_path_segment.split('-')[1]
    max_page = int(second_dash_piece.split(',')[0])
print(max_page)

In [6]:
#<h1 class="srp-header">25,589 Used Vehicles for sale</h1>

# Parse page_html with each parser in turn and print how many 'h1.srp-header'
# elements each one finds.
# NOTE(review): page_html is not defined in any visible cell, and the loop
# rebinds sitePageSoup, so cells run afterwards see the 'html.parser' tree —
# confirm that is intended.
for parser in ['lxml', 'html5lib', 'html.parser']:
    sitePageSoup = BeautifulSoup(page_html, parser)
    print(parser, len(sitePageSoup.select('h1.srp-header')))

#max_page = int(sitePageSoup.find_all("a", class_="js-last-page")[0].text)
#print(max_page)

lxml 1
html5lib 0
html.parser 1
