In [1]:

# Load needed libraries and functions
%pprint
%matplotlib inline
import os
import os.path as osp
import sys
if (osp.join(os.pardir, 'py') not in sys.path): sys.path.insert(1, osp.join(os.pardir, 'py'))
from jobpostlib import (cu, datetime, nu, humanize, time)
from pandas import DataFrame
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
import torch

Pretty printing has been turned OFF
Utility libraries created in 4 seconds



---
# Model Creation

In [2]:

# Locations of the saved sequence-classification model and its tokenizer
tokenizer_path = '../saves/tokenizers/sequence_classification.tokenizer'
model_path = '../saves/models/sequence_classification.model'

# Restore the label mapping and the tagged HTML strings through the notebook utilities
file_tags_mapping, pos_html_strs_df = (
    nu.load_object('file_tags_mapping'),
    nu.load_object('file_tags_df'),
)

In [3]:

# Use the GPU when torch can see one, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Restore the tokenizer and the classifier from their saved locations
tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Put the classifier on the chosen device
model.to(device)

# Switch to evaluation mode; as the last expression this also displays the model
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [4]:

# 2. I create a function to predict the label for a given text:
# The mapping is inverted once, when this cell runs, so a class index can be
# looked up as a symbol; re-run this cell if file_tags_mapping is replaced.
reverse_mapping = {v: k for k, v in file_tags_mapping.items()}
def predict_label(text, max_length=128):
    """
    Predict the symbol for a single piece of text.

    Uses the notebook-level ``tokenizer``, ``model``, ``device`` and
    ``reverse_mapping`` objects.

    Parameters
    ----------
    text : str
        The text to classify.
    max_length : int, optional
        Length, in tokens, that the input is padded or truncated to.

    Returns
    -------
    The key of ``file_tags_mapping`` whose value equals the index of the
    largest logit.

    Raises
    ------
    KeyError
        If the predicted class index is not a value of ``file_tags_mapping``.
    """

    # Tokenize by calling the tokenizer directly; encode_plus is documented
    # as deprecated in favour of __call__, which takes the same arguments
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    # Move inputs to the same device as the model
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Get the model's prediction without tracking gradients
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    # Get the predicted class (the input is a batch of one, hence .item())
    predicted_class = torch.argmax(logits, dim=1).item()

    # Map the predicted class back to its label
    predicted_label = reverse_mapping[predicted_class]

    return predicted_label


----
## Why are Task Scopes so badly predicted?

In [5]:

# Restrict the parts of speech dictionary to the task scope (O-TS) examples,
# keyed by text with the symbol as the value
mask_series = pos_html_strs_df.pos_symbol.eq('O-TS')
task_scope_df = pos_html_strs_df.loc[mask_series]
part_of_speech_dict = task_scope_df.set_index('text')['pos_symbol'].to_dict()

In [6]:

# Build the actual and predicted label lists, timing the run and showing progress with tqdm
t1 = time.time()

# Each actual symbol becomes a 1-tuple, in the dictionary's own order
y_actual = list(zip(part_of_speech_dict.values()))

# The texts are predicted in that same order
navigable_parents_list = list(part_of_speech_dict)
num_navigable_parents = len(navigable_parents_list)
progress_bar = tqdm(
    navigable_parents_list, desc="Predict Label", total=num_navigable_parents
)
y_predicted = [
    (predicted_symbol, ) for predicted_symbol in map(predict_label, progress_bar)
]

# Report how long the predictions took
elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
print(f'Predicted labels created in {duration_str}')

Predict Label: 100%|███████████████████████████████████████████████████████████████| 2234/2234 [12:56<00:00,  2.88it/s]

Predicted labels created in 12 minutes and 56 seconds





In [8]:

# Display every row whose actual symbol is task scope (O-TS)
mask_series = pos_html_strs_df.pos_symbol.eq('O-TS')
pos_html_strs_df.loc[mask_series]

Unnamed: 0,text,pos_symbol,label
3513,"<li>Do data engineering, data analysis, data v...",O-TS,25
3514,<li>Will also use Tableau Prep for proof-of-co...,O-TS,25
3515,<li>Will also use Tableau Prep for proof-of-co...,O-TS,25
3516,<li>Will also use Tableau Prep for proof-of-co...,O-TS,25
3517,<li>Interact with other team members and clien...,O-TS,25
...,...,...,...
7549,"<li>Work on high-impact global projects, such ...",O-TS,25
7550,<div>Help grow efforts for a performance autom...,O-TS,25
7551,<span>Optimize machine learning models for per...,O-TS,25
7552,to join our Built for Contractors team. This h...,O-TS,25


In [14]:

from numpy import nan  # no longer used here; kept in case a later cell relies on the name

# Attach each prediction to the rows holding the text it was made for.
# A single dictionary lookup per row replaces one full-frame comparison per
# text, and avoids writing strings into a column first created as float NaN.
predicted_symbol_dict = {
    navigable_parent: predicted_symbol_tuple[0]
    for predicted_symbol_tuple, navigable_parent in zip(y_predicted, navigable_parents_list)
}

# Texts without a prediction are absent from the dictionary and come out as NaN
pos_html_strs_df['predicted_symbol'] = pos_html_strs_df.text.map(predicted_symbol_dict)

# Show the task scope rows next to what was predicted for them
mask_series = (pos_html_strs_df.pos_symbol == 'O-TS')
pos_html_strs_df[mask_series]

Unnamed: 0,text,pos_symbol,label,predicted_symbol
3513,"<li>Do data engineering, data analysis, data v...",O-TS,25,O-RQ
3514,<li>Will also use Tableau Prep for proof-of-co...,O-TS,25,O-RQ
3515,<li>Will also use Tableau Prep for proof-of-co...,O-TS,25,O-RQ
3516,<li>Will also use Tableau Prep for proof-of-co...,O-TS,25,O-RQ
3517,<li>Interact with other team members and clien...,O-TS,25,O-RQ
...,...,...,...,...
7549,"<li>Work on high-impact global projects, such ...",O-TS,25,O-RQ
7550,<div>Help grow efforts for a performance autom...,O-TS,25,O-RQ
7551,<span>Optimize machine learning models for per...,O-TS,25,O-RQ
7552,to join our Built for Contractors team. This h...,O-TS,25,H-TS


In [15]:

# Task scope rows that were also predicted as task scope
is_actual_task_scope = pos_html_strs_df.pos_symbol.eq('O-TS')
is_predicted_task_scope = pos_html_strs_df.predicted_symbol.eq('O-TS')
mask_series = is_actual_task_scope & is_predicted_task_scope
pos_html_strs_df.loc[mask_series]

Unnamed: 0,text,pos_symbol,label,predicted_symbol



----
## Create a Classification Report for all symbols

In [None]:

# Populate the parts of speech dictionary so that each symbol has an equal number of examples
SAMPLE_SEED = 42  # fixed seed so that re-running the cell draws the same examples
part_of_speech_dict = {}

# Every symbol is sampled down to the size of the rarest one
# (value_counts ignores missing symbols, and min() does not depend on sort order)
max_examples = pos_html_strs_df.pos_symbol.value_counts().min()

# Missing symbols are dropped first: they can never match the equality mask,
# and sampling from the resulting empty frame would raise
for pos_symbol in pos_html_strs_df.pos_symbol.dropna().unique():
    mask_series = (pos_html_strs_df.pos_symbol == pos_symbol)
    sampled_df = pos_html_strs_df[mask_series].sample(max_examples, random_state=SAMPLE_SEED)

    # The dictionary is keyed by text, so a text sampled under two symbols
    # keeps only the symbol that is written last
    part_of_speech_dict.update(sampled_df.set_index('text').pos_symbol.to_dict())

In [None]:

# Build the actual and predicted label lists, timing the run and showing progress with tqdm
t1 = time.time()

# Each actual symbol becomes a 1-tuple, in the dictionary's own order
y_actual = list(zip(part_of_speech_dict.values()))

# The texts are predicted in that same order
navigable_parents_list = list(part_of_speech_dict)
num_navigable_parents = len(navigable_parents_list)
progress_bar = tqdm(
    navigable_parents_list, desc="Predict Label", total=num_navigable_parents
)
y_predicted = [
    (predicted_symbol, ) for predicted_symbol in map(predict_label, progress_bar)
]

# Report how long the predictions took
elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
print(f'Predicted labels created in {duration_str}')

In [17]:

# Binarize both label sequences; the class list is learned from the actual labels only
mlb = MultiLabelBinarizer()
y_test_transformed = mlb.fit_transform(y_actual)
y_pred_transformed = mlb.transform(y_predicted)
print(f'mlb.classes_: {mlb.classes_}')

# Build the report as a dictionary first, then tabulate it with one row per symbol
report_dict = classification_report(
    y_test_transformed, y_pred_transformed, target_names=mlb.classes_,
    zero_division=0, output_dict=True
)
classification_report_df = DataFrame.from_dict(report_dict, orient='index')
classification_report_df.index.name = 'pos_symbol'

# Best-recalled symbols first
display(classification_report_df.sort_values(by='recall', ascending=False))

mlb.classes_: ['H-CS' 'H-ER' 'H-IP' 'H-JD' 'H-JT' 'H-LN' 'H-O' 'H-OL' 'H-PD' 'H-PQ'
 'H-RQ' 'H-SP' 'H-TS' 'O-CS' 'O-ER' 'O-IP' 'O-JD' 'O-JT' 'O-LN' 'O-O'
 'O-OL' 'O-PD' 'O-PQ' 'O-RQ' 'O-SP' 'O-TS']


Unnamed: 0_level_0,precision,recall,f1-score,support
pos_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
H-CS,0.777778,0.4375,0.56,16
H-ER,0.857143,0.428571,0.571429,14
H-JD,1.0,0.333333,0.5,3
H-JT,0.428571,0.3,0.352941,10
H-SP,0.090909,0.2,0.125,10
H-LN,0.666667,0.166667,0.266667,12
O-O,0.08,0.153846,0.105263,13
H-O,1.0,0.125,0.222222,8
H-OL,0.333333,0.1,0.153846,10
macro avg,0.244662,0.095362,0.12141,305



----

In [28]:

# Get tagged nodes data frame
# NOTE(review): both queries in this cell only read from the graph, yet they go
# through session.write_transaction — confirm whether a read transaction is intended.
cypher_str = '''
    // Get the tagged node counts for each file
    MATCH (pos:PartsOfSpeech)-[r1:SUMMARIZES]->(np1:NavigableParents)-[r2:NEXT]->(np2:NavigableParents)
    WITH
        r2.file_name AS file_name,
        COUNT(r1) AS tagged_count,
        COUNT(r2) AS edge_count,
        COUNT(np1) AS np_count
    RETURN np_count, tagged_count, edge_count, file_name
    ORDER BY edge_count DESC;'''
row_objs_list = []
with cu.driver.session() as session: row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
if not row_objs_list:

    # Everything below depends on the file names, so stop here instead of
    # failing on (or silently reusing) a tagged_node_counts_df from an earlier run
    print('No tagged nodes were returned; pos_html_strs_df and file_tags_mapping are left unchanged')
else:
    tagged_node_counts_df = DataFrame(row_objs_list)
    print(f'tagged_node_counts_df.shape: {tagged_node_counts_df.shape}') # (3972, 4)

    # Get all tagged HTML child strings
    filenames_list = tagged_node_counts_df.file_name.unique().tolist()

    # Escape backslashes and double quotes so that each file name stays
    # inside its double-quoted Cypher string literal
    escaped_filenames_list = [
        file_name.replace('\\', '\\\\').replace('"', '\\"') for file_name in filenames_list
    ]
    filenames_str = '", "'.join(escaped_filenames_list)
    cypher_str = f'''
    // Get child string and POS for each at-least-partially tagged file
    MATCH (pos:PartsOfSpeech)-[r1:SUMMARIZES]->(np1:NavigableParents)-[r2:NEXT]->(np2:NavigableParents)
    WHERE
        r2.file_name IN ["{filenames_str}"]
    RETURN
        np1.navigable_parent AS text,
        pos.pos_symbol AS pos_symbol;'''
    row_objs_list = []
    with cu.driver.session() as session: row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
    if row_objs_list:

        # This replaces the pos_html_strs_df loaded earlier in the notebook
        pos_html_strs_df = DataFrame(row_objs_list).drop_duplicates()
        print(f'pos_html_strs_df.shape: {pos_html_strs_df.shape}') # (16236, 2) at the last recorded run

        # Convert labels to numbers and get mapping
        # NOTE(review): this also replaces the file_tags_mapping loaded earlier; the
        # integers presumably need to match the class ids of the saved model — confirm.
        sequence = pos_html_strs_df.pos_symbol.tolist()
        new_sequence, file_tags_mapping = nu.convert_strings_to_integers(sequence)
        pos_html_strs_df['label'] = new_sequence

tagged_node_counts_df.shape: (3972, 4)
pos_html_strs_df.shape: (16236, 2)


In [29]:

# Show the most frequently repeated text together with its count
pos_html_strs_df['text'].value_counts().iloc[:1]

text
This position is eligible for remote worksite. Work is performed fully in a remote capacity. An employee who is working remotely must reside within the United States in order to comply with all federal and state laws, filings, or tax requirements.    3
Name: count, dtype: int64

In [30]:

# Examine ambiguous relationships
# The most frequent text is the one tagged with more than one symbol
navigable_parent = pos_html_strs_df.text.value_counts().head(1).index[0]

# Escape backslashes and double quotes so that the text cannot end the
# double-quoted Cypher string literal early; navigable_parent itself is left raw
escaped_navigable_parent = navigable_parent.replace('\\', '\\\\').replace('"', '\\"')
cypher_str = f'''
    // Examine the SUMMARIZES relationships
    MATCH (pos:PartsOfSpeech)-[r1:SUMMARIZES]->(np1:NavigableParents)-[r2:NEXT]->(np2:NavigableParents)
    WHERE
        np1.navigable_parent IN ["{escaped_navigable_parent}"]
    RETURN
        pos,
        np1;'''

# Copy the query to the clipboard; it is not executed from this cell
# NOTE(review): pyperclip is not imported in the visible cells — confirm it is in scope.
pyperclip.copy(cypher_str)

In [27]:

# Fix ambiguous relationships
# The most frequent text is the one tagged with more than one symbol
navigable_parent = pos_html_strs_df.text.value_counts().head(1).index[0]

# Escape backslashes and double quotes so that the text cannot end the
# double-quoted Cypher string literal early — this matters here because the
# query deletes relationships; navigable_parent itself is left raw
escaped_navigable_parent = navigable_parent.replace('\\', '\\\\').replace('"', '\\"')
cypher_str = f'''
    // Remove the SUMMARIZES relationships that are not "O-ER"
    MATCH (pos:PartsOfSpeech)-[r1:SUMMARIZES]->(np1:NavigableParents)-[r2:NEXT]->(np2:NavigableParents)
    WHERE
        np1.navigable_parent IN ["{escaped_navigable_parent}"]
        AND NOT (pos.pos_symbol = "O-ER")
    DELETE r1;'''

# Copy the query to the clipboard; it is not executed from this cell
# NOTE(review): pyperclip is not imported in the visible cells — confirm it is in scope.
pyperclip.copy(cypher_str)

In [31]:

# Peek at five random rows, limited to the text and its symbol
columns_list = ['text', 'pos_symbol']
pos_html_strs_df.loc[:, columns_list].sample(n=5)

Unnamed: 0,text,pos_symbol
16302,<li>Experience using Jupyter notebooks or simi...,O-RQ
49061,<p>We Support Justice + Equality</p>,O-CS
48816,We are proud to be recognized by Forbes as one...,O-CS
47574,You are also eligible to participate in an ann...,O-IP
28571,"<li>Familiarity with Google Cloud, Docker, Agi...",O-PQ



---
# Data Preparation
This stage involves preparing the dataset for training by transforming, cleaning, and pre-processing the data. Data preparation may include tasks such as feature selection, normalization, and data augmentation.

In [32]:

# Build the text-to-symbol dictionary and the list of actual labels, timing the step
t1 = time.time()

# One entry per distinct text
part_of_speech_dict = pos_html_strs_df.set_index('text')['pos_symbol'].to_dict()

# Each actual symbol becomes a 1-tuple, in the dictionary's own order
y_actual = list(zip(part_of_speech_dict.values()))

# Report how long the step took
elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
print(f'Actual POS symbol list created in {duration_str}')

Actual POS symbol list created in 0 seconds


In [35]:

# 1. First, I load the saved model and tokenizer:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Where the saved model and tokenizer live
model_path = '../saves/models/sequence_classification.model'
tokenizer_path = '../saves/tokenizers/sequence_classification.tokenizer'

# Use the GPU when torch can see one, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Restore the classifier and the tokenizer from disk
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

# Put the classifier on the chosen device and switch it to evaluation mode
model.to(device)
model.eval()


# 2. I create a function to predict the label for a given text:
def predict_label(text, max_length=128):
    """
    Predict the symbol for a single piece of text.

    Uses the notebook-level ``tokenizer``, ``model``, ``device`` and
    ``file_tags_mapping`` objects.

    Parameters
    ----------
    text : str
        The text to classify.
    max_length : int, optional
        Length, in tokens, that the input is padded or truncated to.

    Returns
    -------
    The key of ``file_tags_mapping`` whose value equals the index of the
    largest logit.

    Raises
    ------
    KeyError
        If the predicted class index is not a value of ``file_tags_mapping``.
    """

    # Tokenize by calling the tokenizer directly; encode_plus is documented
    # as deprecated in favour of __call__, which takes the same arguments
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    # Move inputs to the same device as the model
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Get the model's prediction without tracking gradients
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    # Get the predicted class (the input is a batch of one, hence .item())
    predicted_class = torch.argmax(logits, dim=1).item()

    # Map the predicted class back to its label; the mapping is inverted at
    # call time so that it reflects whatever file_tags_mapping currently holds
    reverse_mapping = {v: k for k, v in file_tags_mapping.items()}
    predicted_label = reverse_mapping[predicted_class]

    return predicted_label

# Example usage
predicted_label = predict_label(navigable_parent)
print(f"Navigable Parent: {navigable_parent}\nPredicted label: {predicted_label}")

Navigable Parent: This position is eligible for remote worksite. Work is performed fully in a remote capacity. An employee who is working remotely must reside within the United States in order to comply with all federal and state laws, filings, or tax requirements.
Predicted label: O-PQ


In [36]:

# 3. I want to process multiple texts at once, so I create a batch prediction function:
def predict_batch(texts, max_length=128, batch_size=32):
    """
    Predict the symbol for each of several texts.

    Uses the notebook-level ``tokenizer``, ``model``, ``device`` and
    ``file_tags_mapping`` objects. The texts are sent through the model in
    chunks of ``batch_size`` so that a large collection does not have to fit
    in a single forward pass.

    Parameters
    ----------
    texts : iterable of str
        The texts to classify; any iterable is accepted (list, dictionary
        key view, generator) and it is consumed once.
    max_length : int, optional
        Length, in tokens, that every text is padded or truncated to.
    batch_size : int or None, optional
        Number of texts per forward pass; None sends them all at once.

    Returns
    -------
    list
        One label per input text, in input order; empty for empty input.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    KeyError
        If a predicted class index is not a value of ``file_tags_mapping``.
    """
    if (batch_size is not None) and (batch_size < 1):
        raise ValueError(f'batch_size must be at least 1, got {batch_size}')

    # Materialize once so that views and generators can be measured and sliced
    texts = list(texts)

    # Nothing to tokenize; return before touching the tokenizer or the model
    if not texts:
        return []
    if batch_size is None:
        batch_size = len(texts)

    # Invert the mapping once per call rather than once per chunk
    reverse_mapping = {v: k for k, v in file_tags_mapping.items()}

    predicted_labels = []
    for chunk_start in range(0, len(texts), batch_size):
        chunk_texts = texts[chunk_start:chunk_start + batch_size]

        # Tokenize by calling the tokenizer directly; batch_encode_plus is
        # documented as deprecated in favour of __call__
        inputs = tokenizer(
            chunk_texts,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # Move inputs to the same device as the model
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        # Get the model's predictions without tracking gradients
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

        # One class index per text in the chunk
        predicted_classes = torch.argmax(logits, dim=1).tolist()

        # Map the predicted classes back to their labels
        predicted_labels.extend(reverse_mapping[cls] for cls in predicted_classes)

    return predicted_labels

# Example usage: three hand-written HTML snippets
texts = [
    "<li>Troubleshooting and triaging issues with multiple teams to drive towards root cause identification and resolution.</li>",
    "<h2>Job Requirements</h2>",
    "<p>Bachelor's degree in Computer Science or related field required.</p>"
]

# Predict all of them in one call
predicted_labels = predict_batch(texts)

# Print each snippet next to the label predicted for it
text_label_pairs = zip(texts, predicted_labels)
for text, label in text_label_pairs:
    print(f"Text: {text}")
    print(f"Predicted label: {label}\n")

Text: <li>Troubleshooting and triaging issues with multiple teams to drive towards root cause identification and resolution.</li>
Predicted label: O-RQ

Text: <h2>Job Requirements</h2>
Predicted label: H-PQ

Text: <p>Bachelor's degree in Computer Science or related field required.</p>
Predicted label: O-CS



In [None]:

# Predict every text in part_of_speech_dict, timing the run.
# predict_batch pads each text to max_length, so one call over all the keys
# would build a single tensor with a row per text; the keys are therefore
# turned into a list and fed through in fixed-size chunks.
PREDICT_CHUNK_SIZE = 32
t1 = time.time()
texts_list = list(part_of_speech_dict.keys())
y_predicted = []
for chunk_start in range(0, len(texts_list), PREDICT_CHUNK_SIZE):
    chunk_labels = predict_batch(texts_list[chunk_start:chunk_start + PREDICT_CHUNK_SIZE])

    # Each label becomes a 1-tuple, matching the form of y_actual
    y_predicted.extend((label, ) for label in chunk_labels)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'Predicted labels created in {duration_str}')

In [17]:

# Peek at the last two predicted label tuples
y_predicted[-2:]

[('O-IP',), ('O-IP',)]

In [18]:

# Peek at the last two actual label tuples, for comparison with the predictions
y_actual[-2:]

[('O-O',), ('O-O',)]

In [19]:

from sklearn.preprocessing import MultiLabelBinarizer

# Binarize both label sequences; the class list is learned from the actual labels only
mlb = MultiLabelBinarizer()
y_test_transformed = mlb.fit_transform(y_actual)
y_pred_transformed = mlb.transform(y_predicted)

# Display the classes the binarizer learned
mlb.classes_

array(['H-CS', 'H-ER', 'H-IP', 'H-JD', 'H-JT', 'H-LN', 'H-O', 'H-OL',
       'H-PD', 'H-PQ', 'H-RQ', 'H-SP', 'H-TS', 'O-CS', 'O-ER', 'O-IP',
       'O-JD', 'O-JT', 'O-LN', 'O-O', 'O-OL', 'O-PD', 'O-PQ', 'O-RQ',
       'O-SP', 'O-TS'], dtype=object)

In [20]:

# Compute the classification report as a dictionary, then tabulate it with one row per symbol
report_dict = classification_report(
    y_test_transformed, y_pred_transformed, target_names=mlb.classes_,
    zero_division=0, output_dict=True
)
pos_symbol_crf_df = DataFrame.from_dict(report_dict, orient='index')
pos_symbol_crf_df.index.name = 'pos_symbol'

# Best-recalled symbols first
pos_symbol_crf_df.sort_values(by='recall', ascending=False)

Unnamed: 0_level_0,precision,recall,f1-score,support
pos_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
O-IP,0.10241,0.999385,0.185782,4882
micro avg,0.102349,0.102349,0.102349,47670
weighted avg,0.010488,0.102349,0.019026,47670
samples avg,0.102349,0.102349,0.102349,47670
macro avg,0.003939,0.038438,0.007145,47670
H-JT,0.0,0.0,0.0,77
H-LN,0.0,0.0,0.0,68
H-IP,0.0,0.0,0.0,103
H-JD,0.0,0.0,0.0,31
O-TS,0.0,0.0,0.0,6983
