In [1]:

# Toggle IPython's pretty printing of cell results (the recorded output below reports it as OFF)
%pprint

Pretty printing has been turned OFF



---
# Load needed libraries and functions

In [2]:

# All imports live in this first code cell so that Restart Kernel -> Run All works:
# later cells use time, humanize, re and DataFrame before their own import lines appear.

# Standard library
import os
import re
import sys
import time

# Third-party
import humanize
from pandas import DataFrame

# The project modules live in ../py, so that folder must be on sys.path before they are imported
sys.path.insert(1, '../py')

from storage import Storage
from ha_utils import HeaderAnalysis
from scrape_utils import WebScrapingUtilities
from cypher_utils import CypherUtilities

# Storage helper pointed at the data and saves folders next to this notebook's folder
s = Storage(
    data_folder_path=os.path.abspath('../data'),
    saves_folder_path=os.path.abspath('../saves')
)

ha = HeaderAnalysis(s=s, verbose=False)

# Neo4j connection settings are read from wsu.secrets_json rather than written as literals here
wsu = WebScrapingUtilities()
neo4j_secrets = wsu.secrets_json['neo4j']
uri = neo4j_secrets['connect_url']
user = neo4j_secrets['username']
password = neo4j_secrets['password']

# Cypher helper shared by every later cell; driver=None is passed through unchanged
cu = CypherUtilities(
    uri=uri, user=user, password=password, driver=None, s=s, ha=ha
)


---
# Data Preparation
This stage involves preparing the dataset for comparison by cleaning and pre-processing the data.

In [23]:

# Repopulate the parts-of-speech relationships and report the wall-clock time taken
t0 = time.time()
cu.populate_pos_relationships(verbose=False)
elapsed_seconds = time.time() - t0
duration_str = humanize.precisedelta(
    elapsed_seconds, minimum_unit='seconds', format='%0.0f'
)
print(f'Parts-of-speech relationships repopulated in {duration_str}')

Parts-of-speech relationships repopulated in 12 seconds


In [24]:

# Query text: one row per NavigableParents node that has exactly one incoming SUMMARIZES
# relationship from a PartsOfSpeech node, returning 15 pos_* columns and 15 np_* columns
cypher_str = '''
    // Filter for NavigableParents nodes with an unambiguous SUMMARIZES relationship
    MATCH (np:NavigableParents)
    WHERE size((np)<-[:SUMMARIZES]-(:PartsOfSpeech)) = 1
    
    // Find all NavigableParents nodes in the graph with an incoming SUMMARIZES relationship to a PartsOfSpeech node
    WITH np
    MATCH (np)<-[r:SUMMARIZES]-(pos:PartsOfSpeech)
    
    // Return the navigable parent and important properties
    RETURN
        pos.pos_symbol AS pos_symbol,
        pos.is_header AS pos_is_header,
        pos.is_task_scope AS pos_is_task_scope,
        pos.is_minimum_qualification AS pos_is_minimum_qualification,
        pos.is_preferred_qualification AS pos_is_preferred_qualification,
        pos.is_legal_notification AS pos_is_legal_notification,
        pos.is_job_title AS pos_is_job_title,
        pos.is_office_location AS pos_is_office_location,
        pos.is_job_duration AS pos_is_job_duration,
        pos.is_supplemental_pay AS pos_is_supplemental_pay,
        pos.is_educational_requirement AS pos_is_educational_requirement,
        pos.is_interview_procedure AS pos_is_interview_procedure,
        pos.is_corporate_scope AS pos_is_corporate_scope,
        pos.is_posting_date AS pos_is_posting_date,
        pos.is_other AS pos_is_other,
        
        np.navigable_parent AS navigable_parent,
        np.is_header AS np_is_header,
        np.is_task_scope AS np_is_task_scope,
        np.is_minimum_qualification AS np_is_minimum_qualification,
        np.is_preferred_qualification AS np_is_preferred_qualification,
        np.is_legal_notification AS np_is_legal_notification,
        np.is_job_title AS np_is_job_title,
        np.is_office_location AS np_is_office_location,
        np.is_job_duration AS np_is_job_duration,
        np.is_supplemental_pay AS np_is_supplemental_pay,
        np.is_educational_requirement AS np_is_educational_requirement,
        np.is_interview_procedure AS np_is_interview_procedure,
        np.is_corporate_scope AS np_is_corporate_scope,
        np.is_posting_date AS np_is_posting_date,
        np.is_other AS np_is_other;'''

# Start from an empty list, then run the query through cu.do_cypher_tx in a session
# NOTE(review): the query only reads, yet it runs via write_transaction - confirm this is intentional
row_objs_list = []
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)

# pos_html_strs_df is only (re)assigned when the query returned at least one row
if row_objs_list:
    pos_html_strs_df = DataFrame(row_objs_list)
    
    # Show (row count, column count); the query above returns 30 columns
    print(pos_html_strs_df.shape)

(47299, 30)


In [25]:

# Spot-check five random rows, limited to the text column and two of its part-of-speech columns
columns_list = ['navigable_parent', 'pos_symbol', 'pos_is_header']
pos_html_strs_df.loc[:, columns_list].sample(n=5)

Unnamed: 0,navigable_parent,pos_symbol,pos_is_header
23079,<li>Test-and-learn mentality – you pivot quick...,O-RQ,False
31048,<p>Role Title: Salesforce Functional Consultan...,O-JT,False
42590,"<oip>Role Client Supply Contact: Haring,Bruno ...",O-IP,False
35862,<p>Role ID: 4714565</p>,O-JT,False
32315,<ojt>Role ID: 4718457</ojt>,O-JT,False


In [26]:

# Prepare the comparison data: map each navigable parent string to its part-of-speech symbol
pos_symbol_series = pos_html_strs_df.set_index('navigable_parent')['pos_symbol']
part_of_speech_dict = pos_symbol_series.to_dict()

# Wrap every actual symbol in a 1-tuple so each sample carries a collection of labels
y_actual = [(actual_symbol,) for actual_symbol in part_of_speech_dict.values()]


---
# Parts-of-speech Prediction by Stochastic Gradient Descent Algorithm
## SGD Data Preparation
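This stage builds one SGD classifier per category, predicts a parts-of-speech symbol for each navigable parent, and prepares the actual and predicted labels for evaluation by transforming them with a multi-label binarizer.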

In [22]:

import time
t0 = t1 = time.time()


def build_sgd_classifier(classifier_class):
    """Instantiate one classifier with the shared ha/cu helpers, build its SGD elements, and return it."""
    sgd_classifier = classifier_class(ha=ha, cu=cu, verbose=False)
    sgd_classifier.build_pos_stochastic_gradient_descent_elements(verbose=True)

    return sgd_classifier


# Each classifier is imported, constructed and built in turn, in the same order as before
from is_corporate_scope_sgd_classifier import IsCorporateScopeSgdClassifier
csu = build_sgd_classifier(IsCorporateScopeSgdClassifier)

from is_educational_requirement_sgd_classifier import IsEducationalRequirementSgdClassifier
eru = build_sgd_classifier(IsEducationalRequirementSgdClassifier)

from is_header_sgd_classifier import IsHeaderSgdClassifier
ihu = build_sgd_classifier(IsHeaderSgdClassifier)

from is_interview_procedure_sgd_classifier import IsInterviewProcedureSgdClassifier
ipu = build_sgd_classifier(IsInterviewProcedureSgdClassifier)

from is_job_duration_sgd_classifier import IsJobDurationSgdClassifier
jdu = build_sgd_classifier(IsJobDurationSgdClassifier)

from is_job_title_sgd_classifier import IsJobTitleSgdClassifier
jtu = build_sgd_classifier(IsJobTitleSgdClassifier)

from is_legal_notification_sgd_classifier import IsLegalNotificationSgdClassifier
lnu = build_sgd_classifier(IsLegalNotificationSgdClassifier)

from is_office_location_sgd_classifier import IsOfficeLocationSgdClassifier
olu = build_sgd_classifier(IsOfficeLocationSgdClassifier)

from is_other_sgd_classifier import IsOtherSgdClassifier
ou = build_sgd_classifier(IsOtherSgdClassifier)

from is_posting_date_sgd_classifier import IsPostingDateSgdClassifier
pdu = build_sgd_classifier(IsPostingDateSgdClassifier)

from is_preferred_qualification_sgd_classifier import IsPreferredQualificationSgdClassifier
pqu = build_sgd_classifier(IsPreferredQualificationSgdClassifier)

from is_qualification_sgd_classifier import IsQualificationSgdClassifier
qu = build_sgd_classifier(IsQualificationSgdClassifier)

# The required-qualification class is imported from the is_minimum_qualification module
from is_minimum_qualification_sgd_classifier import IsRequiredQualificationSgdClassifier
rqu = build_sgd_classifier(IsRequiredQualificationSgdClassifier)

from is_supplemental_pay_sgd_classifier import IsSupplementalPaySgdClassifier
spu = build_sgd_classifier(IsSupplementalPaySgdClassifier)

from is_task_scope_sgd_classifier import IsTaskScopeSgdClassifier
tsu = build_sgd_classifier(IsTaskScopeSgdClassifier)

# Report the total build time measured from the top of this cell
import humanize
elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech Stochastic Gradient Descent elements built in {duration_str}')

I have 47,989 hand-labeled corporate scope htmls prepared
6 iterations seen during training fit for a total of 47,989 records trained
I have 47,991 hand-labeled educational requirement htmls prepared
7 iterations seen during training fit for a total of 47,991 records trained
I have 48,106 hand-labeled header htmls prepared
7 iterations seen during training fit for a total of 48,106 records trained
I have 47,988 hand-labeled interview procedure htmls prepared
7 iterations seen during training fit for a total of 47,988 records trained
I have 47,987 hand-labeled job duration htmls prepared
7 iterations seen during training fit for a total of 47,987 records trained
I have 47,989 hand-labeled job title htmls prepared
7 iterations seen during training fit for a total of 47,989 records trained
I have 47,988 hand-labeled legal notification htmls prepared
7 iterations seen during training fit for a total of 47,988 records trained
I have 47,988 hand-labeled office location htmls prepared
7 itera

In [20]:

def predict_single(child_str):
    """
    Predict the part-of-speech symbol for one HTML string.

    The symbol is a prefix followed by a suffix. The prefix is 'H-' when the
    header classifier (ihu) reports a percent fit of at least 0.5, otherwise
    'O-'. The suffix is the capital letters of the best-fitting category
    classifier's name, so IsTaskScopeSgdClassifier contributes 'TS'.

    Parameters
    ----------
    child_str : str
        The HTML string to classify.

    Returns
    -------
    str
        The predicted symbol, for example 'O-RQ' or 'H-TS'.
    """
    import re  # local import so the function does not depend on a later cell having run

    # The header classifier alone decides the prefix
    pos_symbol = 'H-' if ihu.predict_percent_fit(child_str) >= 0.5 else 'O-'

    tuples_list = []
    for sgd_obj in [csu, eru, ipu, jdu, jtu, lnu, olu, ou, pdu, pqu, rqu, spu, tsu]:

        # Work from the bare class name and strip the 'Is' prefix plus either naming suffix;
        # the previous pattern only matched '...Utilities' classes, so for the current
        # '...SgdClassifier' classes the suffix was built from the whole "<class '...'>" repr
        class_name = re.sub(
            r'^Is([a-zA-Z]+?)(?:SgdClassifier|Utilities)$', r'\g<1>', type(sgd_obj).__name__
        )

        # Keep only the capital letters, e.g. 'CorporateScope' becomes 'CS'
        pos_symbol_suffix = re.sub('[a-z]+', '', class_name)
        percent_fit = sgd_obj.predict_percent_fit(child_str)
        tuples_list.append((pos_symbol_suffix, percent_fit))

    # max() returns the first of any tied entries, the same pick as a stable descending sort
    pos_symbol += max(tuples_list, key=lambda x: x[1])[0]

    return pos_symbol

In [27]:

# Predict a symbol for every navigable parent, wrapped in a 1-tuple like the entries of y_actual
t0 = time.time()
y_sgd_predicted = [(predict_single(html_str),) for html_str in part_of_speech_dict]
elapsed_seconds = time.time() - t0
duration_str = humanize.precisedelta(
    elapsed_seconds, minimum_unit='seconds', format='%0.0f'
)
print(f'Predicted labels created in {duration_str}')

Predicted labels created in 1 hour, 54 minutes and 8 seconds


In [28]:

# Peek at the last two predicted label tuples
y_sgd_predicted[-2:]

[('O-RQ',), ('O-RQ',)]

In [29]:

# Peek at the last two actual label tuples for comparison with the predictions
y_actual[-2:]

[('O-O',), ('O-O',)]

In [30]:

from sklearn.preprocessing import MultiLabelBinarizer

# Create the MultiLabelBinarizer object
mlb = MultiLabelBinarizer()

# Learn the classes from BOTH label sets. transform() ignores (with a warning) any label the
# binarizer was not fitted on, so fitting on y_actual alone would silently drop every
# predicted symbol that never occurs among the actual labels and hide it from the report.
mlb.fit(list(y_actual) + list(y_sgd_predicted))

# Transform the actual and predicted label sequences with the same fitted classes
y_actual_transformed = mlb.transform(y_actual)
y_sgd_pred_transformed = mlb.transform(y_sgd_predicted)
mlb.classes_

array(['H-CS', 'H-ER', 'H-IP', 'H-JD', 'H-JT', 'H-LN', 'H-O', 'H-OL',
       'H-PD', 'H-PQ', 'H-RQ', 'H-SP', 'H-TS', 'O-CS', 'O-ER', 'O-IP',
       'O-JD', 'O-JT', 'O-LN', 'O-O', 'O-OL', 'O-PD', 'O-PQ', 'O-RQ',
       'O-SP', 'O-TS'], dtype=object)


## SGD Visualization

In [32]:

# Compute the classification report
from sklearn.metrics import classification_report

# Ask for the report as a dict so it can be loaded into a data frame, one row per entry
report_dict = classification_report(
    y_actual_transformed,
    y_sgd_pred_transformed,
    target_names=mlb.classes_,
    zero_division=0,
    output_dict=True,
)
pos_symbol_sgd_df = DataFrame.from_dict(report_dict, orient='index').rename_axis('pos_symbol')

# Show the rows ordered from highest to lowest recall
pos_symbol_sgd_df.sort_values('recall', ascending=False)

Unnamed: 0_level_0,precision,recall,f1-score,support
pos_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
O-RQ,0.526292,1.0,0.689635,17445
O-JT,0.998935,0.947847,0.972721,9894
O-IP,0.999557,0.92478,0.960715,4879
weighted avg,0.667572,0.667562,0.56675,47299
micro avg,0.667562,0.667562,0.667562,47299
samples avg,0.667562,0.667562,0.667562,47299
macro avg,0.162223,0.112079,0.103982,47299
O-TS,0.978723,0.033358,0.064516,6895
H-RQ,0.714286,0.008065,0.015949,1240
H-O,0.0,0.0,0.0,123



----

In [9]:

# Print the string form of each classifier's class, one per line
for sgd_obj in [csu, eru, ipu, jdu, jtu, lnu, olu, ou, pdu, pqu, qu, rqu, spu, tsu]:
    print(str(sgd_obj.__class__))

<class 'is_corporate_scope_sgd_classifier.IsCorporateScopeSgdClassifier'>
<class 'is_educational_requirement_sgd_classifier.IsEducationalRequirementSgdClassifier'>
<class 'is_interview_procedure_sgd_classifier.IsInterviewProcedureSgdClassifier'>
<class 'is_job_duration_sgd_classifier.IsJobDurationSgdClassifier'>
<class 'is_job_title_sgd_classifier.IsJobTitleSgdClassifier'>
<class 'is_legal_notification_sgd_classifier.IsLegalNotificationSgdClassifier'>
<class 'is_office_location_sgd_classifier.IsOfficeLocationSgdClassifier'>
<class 'is_other_sgd_classifier.IsOtherSgdClassifier'>
<class 'is_posting_date_sgd_classifier.IsPostingDateSgdClassifier'>
<class 'is_preferred_qualification_sgd_classifier.IsPreferredQualificationSgdClassifier'>
<class 'is_qualification_sgd_classifier.IsQualificationSgdClassifier'>
<class 'is_minimum_qualification_sgd_classifier.IsRequiredQualificationSgdClassifier'>
<class 'is_supplemental_pay_sgd_classifier.IsSupplementalPaySgdClassifier'>
<class 'is_task_scope_s

In [5]:

from pandas import DataFrame

# Score one sample string with every classifier and collect the results as one row per classifier
child_str = '<b>Key Qualifications:</b>'
sgd_objs_list = [csu, eru, ihu, ipu, jdu, jtu, lnu, olu, ou, pdu, pqu, qu, rqu, spu, tsu]
rows_list = [
    {
        # The bare class name is the part after the module name inside "<class 'module.Class'>"
        'class_name': str(sgd_obj.__class__).split("'")[1].split('.')[1],
        'percent_fit': sgd_obj.predict_percent_fit(child_str),
    }
    for sgd_obj in sgd_objs_list
]
df = DataFrame(rows_list)

In [6]:

# Rank the classifiers from highest to lowest percent fit for the sample string
df.sort_values('percent_fit', ascending=False)

Unnamed: 0,class_name,percent_fit
11,IsQualificationSgdClassifier,0.975795
12,IsRequiredQualificationSgdClassifier,0.410475
2,IsHeaderSgdClassifier,0.311674
14,IsTaskScopeSgdClassifier,0.165191
5,IsJobTitleSgdClassifier,0.103399
3,IsInterviewProcedureSgdClassifier,0.07856
10,IsPreferredQualificationSgdClassifier,0.070057
0,IsCorporateScopeSgdClassifier,0.030944
13,IsSupplementalPaySgdClassifier,0.01839
4,IsJobDurationSgdClassifier,0.013411


In [7]:

import re

# Print the symbol suffix implied by each classifier class name, e.g. 'IsJobTitleSgdClassifier' gives 'JT'.
# The pattern accepts both the '...SgdClassifier' names held in df.class_name and the older
# '...Utilities' names; matching 'Utilities' alone left the names untouched, so the
# 'I' of 'Is' and the 'SC' of 'SgdClassifier' would end up in the initials.
for class_name in df.class_name:
    category_name = re.sub(r'^Is([a-zA-Z]+?)(?:SgdClassifier|Utilities)$', r'\g<1>', class_name)

    # Dropping the lowercase letters leaves only the initials
    print(re.sub('[a-z]+', '', category_name))

CS
ER
H
IP
JD
JT
LN
OL
O
PD
PQ
Q
RQ
SP
TS


In [21]:

# Try predict_single on one sample HTML string and display the predicted symbol
child_str = '<b>Key Qualifications:</b>'
predict_single(child_str)

'O-RQ'