In [1]:

%pprint

Pretty printing has been turned OFF



---
# Load needed libraries and functions

In [2]:

from neo4j.exceptions import ServiceUnavailable
from nltk import pos_tag
from pandas import DataFrame
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
import humanize
import nltk
import os
import re
import sys
import time
import warnings

# Silence every warning category so library notices do not clutter the cell outputs
warnings.filterwarnings('ignore')

# NOTE(review): duration and freq are not read anywhere in the visible part of this
# notebook; they look like settings for an audible completion signal - confirm or remove
duration = 1000  # milliseconds
freq = 880  # Hz

# Insert at 1, 0 is the script path (or '' in REPL)
# The path is relative, so the project modules imported below are only found
# when the notebook runs from a folder that has ../py beside it
sys.path.insert(1, '../py')

In [4]:

# Build every project utility object used by the rest of the notebook and report
# how long that took. t0 and t1 start at the same instant; only t1 is read at the
# end of this cell.
t0 = t1 = time.time()

# Get the Storage object
from storage import Storage
s = Storage(
    data_folder_path=os.path.abspath('../data'),
    saves_folder_path=os.path.abspath('../saves')
)

# Get the HeaderAnalysis object
from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

# Get the WebScrapingUtilities object
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(
    s=s,
    secrets_json_path=os.path.abspath('../data/secrets/jh_secrets.json')
)

# The Neo4j connection settings come from the secrets file, not from literals in the notebook
uri = wsu.secrets_json['neo4j']['connect_url']
user =  wsu.secrets_json['neo4j']['username']
password = wsu.secrets_json['neo4j']['password']

# Get the CypherUtilities object and Neo4j driver
from cypher_utils import CypherUtilities
cu = CypherUtilities(
    uri=uri, user=user, password=password, driver=None, s=s, ha=ha
)

# Ask the server for its agent string so that connection problems surface here.
# ServiceUnavailable is re-raised after printing a hint; any other exception is
# only printed and the cell carries on.
# NOTE(review): the name e bound in the ServiceUnavailable handler is not used
try:
    version_str = cu.driver.get_server_info().agent
    print(f'======== {version_str} ========')
except ServiceUnavailable as e:
    print('You need to start Neo4j as a console')
    raise
except Exception as e:
    print(f'{e.__class__}: {str(e).strip()}')

# Get the HeaderCategories object
from hc_utils import HeaderCategories
hc = HeaderCategories(cu=cu, verbose=False)

# Get the LrUtilities object
from lr_utils import LrUtilities
lru = LrUtilities(ha=ha, cu=cu, hc=hc, verbose=False)

# Get the SectionLRClassifierUtilities object
from section_classifier_utils import SectionLRClassifierUtilities
slrcu = SectionLRClassifierUtilities(ha=ha, cu=cu, verbose=False)

# Get the SectionSGDClassifierUtilities object
from section_classifier_utils import SectionSGDClassifierUtilities
ssgdcu = SectionSGDClassifierUtilities(ha=ha, cu=cu, verbose=False)

# Get the SectionCRFClassifierUtilities object
from section_classifier_utils import SectionCRFClassifierUtilities
scrfcu = SectionCRFClassifierUtilities(cu=cu, ha=ha, verbose=False)

# Get the CrfUtilities object (the only utility created with verbose=True)
from crf_utils import CrfUtilities
crf = CrfUtilities(
    ha=ha, hc=hc, cu=cu, lru=lru, slrcu=slrcu, scrfcu=scrfcu, ssgdcu=ssgdcu, verbose=True
)

# Report the elapsed time for the whole cell, rounded to whole seconds
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Utility libraries created in {duration_str}')

Utility libraries created in 3 seconds



---
# Data Preparation
This stage involves preparing the dataset for comparison by cleaning and pre-processing the data.

In [5]:

# Call cu.populate_pos_relationships and time it; the message printed below
# describes this step as repopulating the parts-of-speech relationships
t1 = time.time()
cu.populate_pos_relationships(verbose=False)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech relationships repopulated in {duration_str}')

Parts-of-speech relationships repopulated in 7 seconds


In [6]:

# Fetch every NavigableParents node that is summarized by exactly one PartsOfSpeech
# node, together with the boolean label properties stored on both nodes
cypher_str = '''
    // Filter for NavigableParents nodes with an unambiguous SUMMARIZES relationship
    MATCH (np:NavigableParents)
    WHERE size((np)<-[:SUMMARIZES]-(:PartsOfSpeech)) = 1
    
    // Find all NavigableParents nodes in the graph with an incoming SUMMARIZES relationship to a PartsOfSpeech node
    WITH np
    MATCH (np)<-[r:SUMMARIZES]-(pos:PartsOfSpeech)
    
    // Return the navigable parent and important properties
    RETURN
        pos.pos_symbol AS pos_symbol,
        pos.is_header AS pos_is_header,
        pos.is_task_scope AS pos_is_task_scope,
        pos.is_minimum_qualification AS pos_is_minimum_qualification,
        pos.is_preferred_qualification AS pos_is_preferred_qualification,
        pos.is_legal_notification AS pos_is_legal_notification,
        pos.is_job_title AS pos_is_job_title,
        pos.is_office_location AS pos_is_office_location,
        pos.is_job_duration AS pos_is_job_duration,
        pos.is_supplemental_pay AS pos_is_supplemental_pay,
        pos.is_educational_requirement AS pos_is_educational_requirement,
        pos.is_interview_procedure AS pos_is_interview_procedure,
        pos.is_corporate_scope AS pos_is_corporate_scope,
        pos.is_posting_date AS pos_is_posting_date,
        pos.is_other AS pos_is_other,
        
        np.navigable_parent AS navigable_parent,
        np.is_header AS np_is_header,
        np.is_task_scope AS np_is_task_scope,
        np.is_minimum_qualification AS np_is_minimum_qualification,
        np.is_preferred_qualification AS np_is_preferred_qualification,
        np.is_legal_notification AS np_is_legal_notification,
        np.is_job_title AS np_is_job_title,
        np.is_office_location AS np_is_office_location,
        np.is_job_duration AS np_is_job_duration,
        np.is_supplemental_pay AS np_is_supplemental_pay,
        np.is_educational_requirement AS np_is_educational_requirement,
        np.is_interview_procedure AS np_is_interview_procedure,
        np.is_corporate_scope AS np_is_corporate_scope,
        np.is_posting_date AS np_is_posting_date,
        np.is_other AS np_is_other;'''

# The statement only reads from the graph, so it runs in a read transaction;
# read_transaction accepts the same transaction function as write_transaction
with cu.driver.session() as session:
    row_objs_list = session.read_transaction(cu.do_cypher_tx, cypher_str)

# Always (re)bind the frame: an empty result then shows up as an empty frame
# instead of leaving a missing or stale pos_html_strs_df for the cells below
pos_html_strs_df = DataFrame(row_objs_list)
print(pos_html_strs_df.shape)

(47670, 30)


In [7]:

# Display five randomly drawn rows, limited to the text and its label columns
# NOTE(review): sample() is called without random_state, so a re-run shows different rows
columns_list = ['navigable_parent', 'pos_symbol', 'pos_is_header']
pos_html_strs_df[columns_list].sample(5)

Unnamed: 0,navigable_parent,pos_symbol,pos_is_header
21246,"<orq>Ability to business, tech and compliance ...",O-RQ,False
37136,<ojt>Role ID: 4712056.0</ojt>,O-JT,False
40512,<osp>Career Level From - To: 11 to 7</osp>,O-SP,False
11705,<li>6 - Advance Front End Development NodeJs (...,O-RQ,False
13374,<orq>Ability to deployment-2 Lead</orq>,O-RQ,False


In [8]:

# Build the ground truth: map each navigable parent string to its labeled symbol,
# then wrap every symbol in a 1-tuple
part_of_speech_dict = (
    pos_html_strs_df
    .set_index('navigable_parent')['pos_symbol']
    .to_dict()
)
y_actual = list(zip(part_of_speech_dict.values()))


---
# Parts-of-speech Prediction by Conditional Random Fields Algorithm
## CRF Data Preparation
This stage involves preparing the CRF dataset for evaluation by transforming the data with a multi-label binarizer.

In [16]:

# The pickle name is taken from the word2features docstring: the text before its
# first period, then the last space-separated word of that text
# NOTE(review): this raises AttributeError when word2features has no docstring - confirm it always has one
crf_name = crf.word2features.__doc__.split('.')[0].split(' ')[-1]

# Attach the stored object to the utilities object when a pickle of that name exists
if s and s.pickle_exists(crf_name):
    crf.CRF = s.load_object(crf_name)

# Availability is judged purely by whether crf now has a CRF attribute
if hasattr(crf, 'CRF'):
    print('predict_single is now available')
else:
    print('predict_single is not available')

predict_single is now available


In [23]:

[f'crf.CRF.{fn}' for fn in dir(crf.CRF) if 'predict' in fn.lower()]

['crf.CRF.predict', 'crf.CRF.predict_marginals', 'crf.CRF.predict_marginals_single', 'crf.CRF.predict_single']

In [79]:

import random

# Load the stored dictionary that is indexed by file name below
if s and s.pickle_exists('HEADER_PATTERN_DICT'):
    header_pattern_dict = s.load_object('HEADER_PATTERN_DICT')
SAVES_HTML_FOLDER = os.path.join(s.saves_folder, 'html')
files_list = os.listdir(SAVES_HTML_FOLDER)

# Pick uniformly among the saved files that have an entry in the dictionary.
# Filtering first gives the same distribution as retrying random picks, but it
# cannot loop forever when no file qualifies.
eligible_files_list = [fn for fn in files_list if fn in header_pattern_dict]
if not eligible_files_list:
    raise ValueError(f'No file in {SAVES_HTML_FOLDER} has an entry in HEADER_PATTERN_DICT')
file_name = random.choice(eligible_files_list)
feature_dict_list = header_pattern_dict[file_name]

In [81]:

str(feature_dict_list)[:200]

"[{'initial_tag': 'hrq', 'is_header': True, 'is_task_scope': False, 'is_minimum_qualification': True, 'is_preferred_qualification': False, 'is_legal_notification': False, 'is_job_title': False, 'is_off"

In [82]:

crf.CRF.predict([feature_dict_list])

[['O-IP', 'O-IP', 'O-IP', 'O-IP', 'O-IP', 'O-IP', 'O-IP', 'O-IP', 'O-IP', 'O-IP', 'O-IP', 'O-IP', 'O-IP', 'O-IP', 'O-IP', 'O-IP', 'O-IP', 'O-IP', 'O-IP', 'O-IP', 'O-IP', 'O-IP', 'O-IP', 'O-IP', 'O-IP']]

In [91]:

# For every element of the sequence keep the symbol whose marginal probability is
# highest (the first such symbol in dictionary order when several tie)
marginals_list = crf.CRF.predict_marginals([feature_dict_list])[0]
symbols_list = [max(probs_dict, key=probs_dict.get) for probs_dict in marginals_list]
[symbols_list]

[['O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS', 'O-CS']]

In [52]:

t1 = time.time()
from collections import Counter

# Series is not needed below any more; the import is kept only in case a later
# cell relies on this cell having put the name into the namespace
from pandas import Series

# NOTE(review): each navigable_parent is a raw HTML string (a key of
# part_of_speech_dict), whereas the earlier crf.CRF.predict call in this notebook
# is given a list of feature dictionaries. The recorded report shows nearly every
# prediction coming out as 'O-IP' - confirm that predict_single should be fed
# strings rather than extracted features.
y_crf_predicted = []
for navigable_parent in part_of_speech_dict.keys():
    predictions_list = crf.CRF.predict_single(navigable_parent)

    # Fall back to the catch-all symbol when nothing was predicted
    pos_symbol = 'O-O'
    if predictions_list:

        # Take the most frequent label; when several labels tie for first place
        # use the first prediction instead
        label_counts = Counter(predictions_list)
        top_count = max(label_counts.values())
        modes_list = [label for label, count in label_counts.items() if count == top_count]
        pos_symbol = modes_list[0] if (len(modes_list) == 1) else predictions_list[0]

    # Labels are stored as 1-tuples, the same shape as y_actual
    y_crf_predicted.append((pos_symbol, ))
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Predicted labels created in {duration_str}')

Predicted labels created in 13 seconds


In [53]:

y_crf_predicted[-2:]

[('O-IP',), ('O-IP',)]

In [51]:

y_actual[-2:]

[('O-O',), ('O-O',)]

In [54]:

from sklearn.preprocessing import MultiLabelBinarizer

# A fresh binarizer learns its classes from the actual labels only
mlb = MultiLabelBinarizer()

# Turn the 1-tuples of actual and CRF-predicted symbols into indicator matrices
# that share the same column order
y_actual_transformed = mlb.fit_transform(y_actual)
y_crf_pred_transformed = mlb.transform(y_crf_predicted)
mlb.classes_

array(['H-CS', 'H-ER', 'H-IP', 'H-JD', 'H-JT', 'H-LN', 'H-O', 'H-OL',
       'H-PD', 'H-PQ', 'H-RQ', 'H-SP', 'H-TS', 'O-CS', 'O-ER', 'O-IP',
       'O-JD', 'O-JT', 'O-LN', 'O-O', 'O-OL', 'O-PD', 'O-PQ', 'O-RQ',
       'O-SP', 'O-TS'], dtype=object)


## CRF Visualization

In [55]:

# Tabulate precision, recall, f1-score and support per symbol for the CRF
# predictions, one row per class or average
pos_symbol_crf_df = DataFrame.from_dict(
    classification_report(
        y_actual_transformed,
        y_crf_pred_transformed,
        target_names=mlb.classes_,
        zero_division=0,
        output_dict=True
    ),
    orient='index'
).rename_axis('pos_symbol')

# Show the best-recalled symbols first
pos_symbol_crf_df.sort_values('recall', ascending=False)

Unnamed: 0_level_0,precision,recall,f1-score,support
pos_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
O-IP,0.10241,0.999385,0.185782,4882
micro avg,0.102349,0.102349,0.102349,47670
weighted avg,0.010488,0.102349,0.019026,47670
samples avg,0.102349,0.102349,0.102349,47670
macro avg,0.003939,0.038438,0.007145,47670
H-JT,0.0,0.0,0.0,77
H-LN,0.0,0.0,0.0,68
H-IP,0.0,0.0,0.0,103
H-JD,0.0,0.0,0.0,31
O-TS,0.0,0.0,0.0,6983


In [76]:

# Check if the scrfcu has built its parts-of-speech conditional random field elements
# Parts-of-speech CRF elements normally built in 29 minutes and 57 seconds
# NOTE(review): the build is guarded by the 'pos_symbol_crf' attribute while availability
# is reported from 'pos_predict_percent_fit_dict'; the SGD cell in this notebook uses the
# latter attribute for both checks - confirm which one is intended here
t1 = time.time()
if not hasattr(scrfcu, 'pos_symbol_crf'):
    scrfcu.build_pos_conditional_random_field_elements(verbose=True)
if hasattr(scrfcu, 'pos_predict_percent_fit_dict'):
    print('predict_single is now available')
else:
    print('predict_single is not available')

# The elapsed time is printed as a build time even when the build call above was skipped
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech conditional random field elements built in {duration_str}')

predict_single is now available
Parts-of-speech conditional random field elements built in 2 seconds


In [None]:

# Re-derive the feature dictionaries of the chosen file from its child strings and tags
child_strs_list = ha.get_child_strs_from_file(file_name=file_name)
child_tags_list = ha.get_child_tags_list(child_strs_list)
feature_dict_list = cu.get_feature_dict_list(child_tags_list, child_strs_list)

# Convert each feature dictionary to a feature tuple, handing over the
# single-item predictors of the three classifiers
feature_tuple_list = [
    hc.get_feature_tuple(
        feature_dict,
        pos_lr_predict_single=slrcu.predict_single,
        pos_crf_predict_single=scrfcu.predict_single,
        pos_sgd_predict_single=ssgdcu.predict_single
    )
    for feature_dict in feature_dict_list
]

# Display one of the tuples as a spot check
random.choice(feature_tuple_list)

In [71]:

# Build the parts-of-speech stochastic gradient descent elements of ssgdcu unless
# they are already present, then report availability and elapsed time
t1 = time.time()
if not hasattr(ssgdcu, 'pos_predict_percent_fit_dict'):
    ssgdcu.build_pos_stochastic_gradient_descent_elements(
        sampling_strategy_limit=None, verbose=True
    )
print(
    'predict_single is now available'
    if hasattr(ssgdcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
print(f'Parts-of-speech stochastic gradient descent elements built in {duration_str}')

I have 49,163 labeled parts of speech in here
predict_single is now available
Parts-of-speech stochastic gradient descent elements built in 12 seconds


In [75]:

feature_tuple = random.choice(feature_tuple_list)
sorted(crf.word2features([feature_tuple], 0).keys(), key=lambda x: len(x))

['BOS', 'EOS', 'postag', 'position', '+0:postag', 'tag.span_set', 'tag.lists_set', 'tag.forms_set', 'tag.anchor_set', 'tag.tables_set', 'tag.frames_set', 'tag.general_set', '+0:tag.span_set', 'tag.null_element', '+0:previous==tag', '+0:tag.lists_set', '+0:tag.forms_set', '+0:tag.anchor_set', '+0:tag.tables_set', '+0:tag.frames_set', 'tag.basic_text_set', '+0:tag.general_set', '+0:tag.null_element', 'tag.presentation_set', '+0:tag.basic_text_set', 'tag.block_elements_set', 'tag.inline_elements_set', 'tag.phrase_elements_set', '+0:tag.presentation_set', 'tag.section_headings_set', 'tag.historic_elements_set', '+0:tag.block_elements_set', 'tag.images_and_objects_set', '+0:tag.inline_elements_set', '+0:tag.phrase_elements_set', 'tag.pos_symbol_elements_set', '+0:tag.section_headings_set', 'tag.other_block_elements_set', '+0:tag.historic_elements_set', 'tag.other_inline_elements_set', 'tag.non_standard_elements_set', 'tag.consecutive_previous_tags', '+0:tag.images_and_objects_set', 'tag.doc


---
# Parts-of-speech Prediction by Logistic Regression Algorithm
## LR Data Preparation
This stage involves preparing the LR dataset for evaluation by transforming the data with a multi-label binarizer.

In [16]:

# Building with no sampling limit is slow: an earlier run over 47,686 labeled parts of
# speech took 2 hours, 3 minutes and 10 seconds. Lower sampling_strategy_limit to keep
# the total creation time under one hour (the original cell noted 6_400 as a value).
# The original cell also noted crf.is_flask_running() as a possible extra condition.
t1 = time.time()
if not hasattr(slrcu, 'pos_predict_percent_fit_dict'):
    slrcu.build_pos_logistic_regression_elements(
        sampling_strategy_limit=None, verbose=True
    )
duration_str = humanize.precisedelta(
    time.time() - t1, minimum_unit='seconds', format='%0.0f'
)
print(f'Parts-of-speech logistic regression elements built in {duration_str}')

I have 47,946 labeled parts of speech in here
Parts-of-speech logistic regression elements built in 1 hour, 59 minutes and 54 seconds


In [17]:

# Time this cell with t1, as every other timed cell does; t0 was set once in the
# setup cell and is no longer overwritten here
# NOTE(review): the recorded run of this cell took over nine hours - consider caching y_lr_predicted
t1 = time.time()

# One 1-tuple per navigable parent, the same shape as y_actual
y_lr_predicted = [(slrcu.predict_single(navigable_parent), ) for navigable_parent in part_of_speech_dict.keys()]
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Predicted labels created in {duration_str}')

Predicted labels created in 9 hours, 5 minutes and 5 seconds


In [18]:

y_lr_predicted[-2:]

[('O-O',), ('O-RQ',)]

In [19]:

y_actual[-2:]

[('O-O',), ('O-O',)]

In [20]:

from sklearn.preprocessing import MultiLabelBinarizer

# Create the MultiLabelBinarizer object (this rebinds mlb, replacing the one used for the CRF results)
mlb = MultiLabelBinarizer()

# Learn the classes from the actual labels, then encode the actual and the
# LR-predicted 1-tuples with that same class order
y_actual_transformed = mlb.fit_transform(y_actual)
y_lr_pred_transformed = mlb.transform(y_lr_predicted)
mlb.classes_

array(['H-CS', 'H-ER', 'H-IP', 'H-JD', 'H-JT', 'H-LN', 'H-O', 'H-OL',
       'H-PD', 'H-PQ', 'H-RQ', 'H-SP', 'H-TS', 'O-CS', 'O-ER', 'O-IP',
       'O-JD', 'O-JT', 'O-LN', 'O-O', 'O-OL', 'O-PD', 'O-PQ', 'O-RQ',
       'O-SP', 'O-TS'], dtype=object)


## LR Visualization

In [None]:

# Tabulate precision, recall, f1-score and support per symbol for the logistic
# regression predictions, one row per class or average
pos_symbol_lr_df = DataFrame.from_dict(
    classification_report(
        y_actual_transformed,
        y_lr_pred_transformed,
        target_names=mlb.classes_,
        zero_division=0,
        output_dict=True
    ),
    orient='index'
).rename_axis('pos_symbol')

# Show the best-recalled symbols first
pos_symbol_lr_df.sort_values('recall', ascending=False)


----

In [19]:

sents_list = [
    "<div>At PRA, we don't make our 17000+ people great. It's the other way around. As we have grown to a top-5 CRO, we have maintained the feel of a small company, dedicated to collaboration and passion for what we do.</div>",
    "<div>We always have a desire to keep seeking new and better ways to operate. We don't settle for the same old ways. Our passion for improving patient lives worldwide permeates all that we do. Put simply, we care.</div>",
    '<b>Responsibilities</b>',
    '<div>Due to growth we are currently seeking a Central Data Scientist, to work as part of an established global group that supports project teams in analyzing clinical data to identify risks and data issues, using advanced analytical techniques.</div>',
    '<div>Acting as a Key Functional Lead on projects; the Central Data Scientist will assess trial compliance, perform trending analysis, exploratory data reviews and report findings to internal and external stakeholders.</div>',
    '<div>Key responsibilities in this role include:</div>',
    '<li>Developing and maintaining study documents specifying strategy, approach and procedures on assigned protocols/projects.</li>',
    '<li>Providing input to applications, databases and systems used to assess study data quality.</li>',
    '<li>Reviewing clinical data at aggregate level regularly throughout assigned studies using analytical reporting tools to support the identification of risks and data patterns/trends.</li>',
    '<li>Creating analytical reports and presentations to facilitate review and data-driven decision making during team meetings</li>',
    '<li>Performs analytical reviews and collaborates with assigned project teams to address data-related questions and recommend potential solutions.</li>',
    '<li>Provides input during adaptive monitoring assessment process.</li>',
    '<li>Documents review findings utilizing applicable systems, according to standard procedures.</li>',
    '<li>Develops analytical reports using programming knowledge and data modeling techniques, e.g., SQL, SAS.</li>',
    '<li>Leads Analysis of Findings meetings on assigned projects.</li>',
    '<li>Escalates project concerns such as outof-scope tasks, at-risk project deliverables and project team relationship issues to functional and project managers in a timely fashion.</li>',
    '<b>Qualifications</b>',
    '<p>You are...</p>',
    '<p>Analytically-minded, a problem-solver</p>',
    '<p>Here at PRA we want our employees to succeed and ensure that they are set up for this success through constant training, development and support. To enable success in this position you will have:</p>',
    "<li>Bachelor's degree (or equivalent) in a Scientific or Healthcare discipline</li>",
    '<li>Previous experience, ideally in a similar role, but we are also open to considering people with other relevant clinical trials experience, including those working as Lead Data Managers, Programmers, Clinical Research Associates, Clinical Team Managers and Project Managers.</li>',
    'Familiarity with',
    '<b>risk-based monitoring</b>',
    '<li>Technical ability: use of JReview specifically or other analytical/visualization tool (e.g Spotfire, SAS JMP Clinical, SAS, R) or at a very minimum significant experience in using excel (including pivot tables, graphics and data exploration).</li>',
    '<li>Analytical thinker: ability to break down issues into manageable components</li>',
    '<b>SQL experience</b>',
    'Skills in aggregating data review and interpretation using visualization/analysis software e.g.',
    'JReview, Tableau, SAS',
    '<div>You will be frequently collaborating within multi-cultural global teams, so will need to demonstrate excellent written and oral communication skills, exhibit pro-active teamwork alongside a positive attitude, and maintain up-to-date industry awareness and understanding of regulation/standards.</div>',
    '<b>PRA is an EEO/AA employer and is committed to providing opportunities to minorities, women, veterans and individuals with disabilities.</b>',
    '<b>Options</b>',
    '<div>Apply for this job onlineApply</div>',
    '<div>Share</div>',
    '<div>Sorry the Share function is not working properly at this moment. Please refresh the page and try again later.</div>',
    '<div>Share on your newsfeed</div>',
    '<b>Connect With Us!</b>'
]
pos_symbols_list = [
    'O-CS',
    'O-CS',
    'H-TS',
    'O-IP',
    'O-TS',
    'H-TS',
    'O-TS',
    'O-TS',
    'O-TS',
    'O-TS',
    'O-TS',
    'O-TS',
    'O-TS',
    'O-TS',
    'O-TS',
    'O-TS',
    'H-RQ',
    'H-RQ',
    'O-RQ',
    'O-IP',
    'O-ER',
    'O-RQ',
    'H-RQ',
    'O-RQ',
    'O-RQ',
    'O-RQ',
    'O-RQ',
    'H-RQ',
    'O-RQ',
    'O-RQ',
    'O-LN',
    'H-O',
    'O-O',
    'O-O',
    'O-O',
    'O-O',
    'O-IP'
]
# Tokenize every example sentence, then tag the tokens of each sentence with NLTK
tokens_list = list(map(ha.html_regex_tokenizer, sents_list))
from nltk import pos_tag
pos_tags_list = list(map(pos_tag, tokens_list))
def word2features(sent, i):
    """Describe the token at position i of a tagged sentence as a feature dict.

    Args:
        sent: Indexable sequence of (word, part-of-speech tag) pairs.
        i: Position in sent of the token to describe.

    Returns:
        A dict holding the token text under 'word' and its tag under 'postag'.
    """
    token_pair = sent[i]

    return {'word': token_pair[0], 'postag': token_pair[1]}
def sent2features(sent):
    """Return the word2features dict of every position in a tagged sentence, in order.

    Args:
        sent: Sequence of (word, part-of-speech tag) pairs.

    Returns:
        A list with one feature dict per element of sent.
    """
    features_list = []
    for i in range(len(sent)):
        features_list.append(word2features(sent, i))

    return features_list
def sent2labels(pos_symbol, sent):
    """Repeat one label once for every element of a sentence.

    Args:
        pos_symbol: The label to assign to every element.
        sent: Sized sequence whose length fixes how many labels are returned.

    Returns:
        A list containing pos_symbol len(sent) times.
    """
    labels_count = len(sent)

    return [pos_symbol for _ in range(labels_count)]
# zip() would silently drop the unmatched tail, so insist on one label per sentence
assert len(pos_tags_list) == len(pos_symbols_list), 'Each sentence needs exactly one parts-of-speech symbol'

# Per-token features for every sentence; every token of a sentence carries that
# sentence's symbol as its label. The loop variable is not called pos_tag so it
# does not read like the NLTK function of that name.
X = [sent2features(tagged_tokens) for tagged_tokens in pos_tags_list]
y = [
    sent2labels(pos_symbol, tagged_tokens)
    for tagged_tokens, pos_symbol in zip(pos_tags_list, pos_symbols_list)
]
import sklearn_crfsuite
pos_symbol_crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=True)
try:
    pos_symbol_crf.fit(X, y)
except AttributeError as e:

    # NOTE(review): presumably this tolerates an AttributeError that fit() raises even
    # though training went through (library version mismatch?) - confirm. The error
    # is now reported instead of being dropped silently.
    print(f'{e.__class__}: {str(e).strip()}')

In [28]:

X[0][0]

{'word': '<div', 'postag': 'NN'}

In [33]:

import numpy as np

# Mean marginal probability of the 'O-CS' label over the tokens of the first sentence
pos_symbol_dicts_list = pos_symbol_crf.predict_marginals_single(X[0])
np.mean([token_probs_dict['O-CS'] for token_probs_dict in pos_symbol_dicts_list])

0.930723015065452

In [22]:

[f'pos_symbol_crf.{fn}' for fn in dir(pos_symbol_crf) if 'predict' in fn.lower()]

['pos_symbol_crf.predict', 'pos_symbol_crf.predict_marginals', 'pos_symbol_crf.predict_marginals_single', 'pos_symbol_crf.predict_single']