In [1]:

%pprint
%matplotlib inline
import sys
if (osp.join(os.pardir, 'py') not in sys.path): sys.path.insert(1, osp.join(os.pardir, 'py'))
from jobpostlib import (crf, cu, datetime, duration, hau, hc, humanize, ihu, lru, nu, osp, scrfcu, slrcu, ssgdcu, su, t0, time, wsu, speech_engine)
from pandas import DataFrame
import re

Pretty printing has been turned OFF
Utility libraries created in 6 seconds



How do I pretrain an encoder model that has for its labeled training data HTML text surrounded by its parent HTML tag (if any) labeled with one of these categories: Job Title, Corporate Scope, Task Scope, Educational Requirements, Minimum Qualifications, Preferred Qualifications, Supplemental Pay, Legal Notifications, Office Location, Job Duration, Interview Procedures, Other, and Posting Date (Header or Non-header)? For instance, the navigable_parent "&lt;li>Troubleshooting and triaging issues with multiple teams to drive towards root cause identification and resolution.&lt;/li>" would be labeled "O-TS" (Task Scope Non-header).


To pretrain an encoder model for this task, you can follow these steps:

1. Data Preparation:
   - Ensure your dataset consists of HTML text snippets with their corresponding parent tags.
   - Create a label mapping for your categories. For example:

In [2]:

label_mapping = {
     'Job Title': 'JT',
     'Corporate Scope': 'CS',
     'Task Scope': 'TS',
     'Educational Requirements': 'ER',
     'Minimum Qualifications': 'RQ',
     'Preferred Qualifications': 'PQ',
     'Supplemental Pay': 'SP',
     'Legal Notifications': 'LN',
     'Office Location': 'OL',
     'Job Duration': 'JD',
     'Interview Procedures': 'IP',
     'Other': 'OT',
     'Posting Date': 'PD'
 }


1. Data Preparation (continued):
   - Add a prefix 'H-' for header and 'O-' for non-header items.

2. Tokenization:
   - Use a tokenizer that can handle HTML tags (e.g., a custom tokenizer or adapt an existing one like BERT's tokenizer).
   - Ensure the tokenizer preserves the HTML structure.

3. Model Architecture:
   - Choose a suitable encoder architecture (e.g., BERT, RoBERTa, or a custom transformer-based model).
   - Modify the final layer to output predictions for your specific categories.

4. Pretraining Objective:
   - Use a combination of masked language modeling (MLM) and sequence classification.
   - For MLM, randomly mask some tokens and train the model to predict them.
   - For sequence classification, use the labeled data to train the model to predict the correct category.

5. Training Process:
   - Implement the training loop, including:
     - Batching the data
     - Applying MLM
     - Computing loss for both MLM and classification tasks
     - Backpropagation and optimization

Here's a basic outline of how you might implement this in PyTorch:

In [3]:

cypher_str = f'''
    // Get the edge and node counts for each file, tag-agnostic
    MATCH (np1:NavigableParents)-[r:NEXT]->(np2:NavigableParents)
    WITH
        r.file_name AS file_name,
        COUNT(r) AS edge_count,
        COUNT(np1) AS np_count
    RETURN np_count, edge_count, file_name
    ORDER BY edge_count DESC;'''
row_objs_list = []
with cu.driver.session() as session: row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
if row_objs_list:
    node_counts_df = DataFrame(row_objs_list)
    if node_counts_df.shape[0]:
        display(node_counts_df)

Unnamed: 0,np_count,edge_count,file_name
0,264,264,temp_Statistician_(Data_Scientist)_12_month_Ro...
1,242,242,Statistician_(Data_Scientist)_12_month_Roster_...
2,242,242,Statistician_(Data_Scientist)_12_month_Roster_...
3,210,210,Senior_Data_Scientist–Statistics_and_Machine_L...
4,201,201,548ece40cf23703d_Information_Technology_Cybers...
...,...,...,...
6008,5,5,Automation_Engineer_Remote_ISite_Technologies_...
6009,4,4,"Sr._AI_Data_Science_Engineer_-_Minneapolis,_MN..."
6010,3,3,NATURAL_LANGUAGE_UNDERSTANDING_AND_MACHINE_LEA...
6011,3,3,ea38cd78e8491710_Applied_Machine_Learning_Scie...


In [4]:

cypher_str = f'''
    // Get the tagged node counts for each file
    MATCH (pos:PartsOfSpeech)-[r1:SUMMARIZES]->(np1:NavigableParents)-[r2:NEXT]->(np2:NavigableParents)
    WITH
        r2.file_name AS file_name,
        COUNT(r1) AS tagged_count,
        COUNT(r2) AS edge_count,
        COUNT(np1) AS np_count
    RETURN np_count, tagged_count, edge_count, file_name
    ORDER BY edge_count DESC;'''
row_objs_list = []
with cu.driver.session() as session: row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
if row_objs_list:
    tagged_node_counts_df = DataFrame(row_objs_list)
    if tagged_node_counts_df.shape[0]:
        display(tagged_node_counts_df)

Unnamed: 0,np_count,tagged_count,edge_count,file_name
0,104,104,104,f217ae62af41916b_Senior_Developer_Big_Data_Dev...
1,96,96,96,"Statistician_-_Anchorage,_AK_99501_-_Indeed.co..."
2,95,95,95,184b817c6eaf7172_Data_Scientist_Remote_Indeed_...
3,94,94,94,9772f70e3d9f27cd_Python_Web_Developer_Remote_I...
4,92,92,92,14885afaa7bbd01e_Software_Developer_Engineer_i...
...,...,...,...,...
5318,1,1,1,53b28529e19024d9_Geospatial_Analytics_Engineer...
5319,1,1,1,caed7f6362eed7e1_Lead_Software_Engineer_Backen...
5320,1,1,1,7022cb918f8424ec_Senior_Software_Engineer_C_AI...
5321,1,1,1,c0139ec70b6a4800_Data_Scientist_Chicago_IL_Ind...


In [5]:

df = node_counts_df.merge(tagged_node_counts_df, on=['file_name'], suffixes=('_untagged', '_tagged'))
mask_series = (df.np_count_untagged == df.np_count_tagged)
# filenames_list = df[mask_series].file_name.tolist()
df[mask_series]

Unnamed: 0,np_count_untagged,edge_count_untagged,file_name,np_count_tagged,tagged_count,edge_count_tagged
162,92,92,14885afaa7bbd01e_Software_Developer_Engineer_i...,92,92,92
186,89,89,Senior_Cloud_Data_Engineer_Texas_Indeed_com.html,89,89,89
187,89,89,Senior_Cloud_Data_Engineer_Salem_OR_97302_Inde...,89,89,89
188,89,89,Senior_Cloud_Data_Engineer_Oregon_Indeed_com.html,89,89,89
189,89,89,86cf17672d5bf560_Senior_Cloud_Data_Engineer_Sa...,89,89,89
...,...,...,...,...,...,...
5250,15,15,4656009_Accenture_CIO_62812_439400_Data_Scient...,15,15,15
5251,15,15,4656043_Accenture_CIO_62812_439399_Data_Scient...,15,15,15
5252,15,15,4656042_Accenture_CIO_62812_439276_Data_Scient...,15,15,15
5257,14,14,297b0a7515e65134_Big_Data_Developer_Pittsburgh...,14,14,14


In [6]:

filenames_list = tagged_node_counts_df.file_name.tolist()
filenames_str = '", "'.join(filenames_list)
cypher_str = f'''
    // Get child string and POS for each at-least-partially tagged file
    MATCH (pos:PartsOfSpeech)-[r1:SUMMARIZES]->(np1:NavigableParents)-[r2:NEXT]->(np2:NavigableParents)
    WHERE
        r2.file_name IN ["{filenames_str}"]
    RETURN
        np1.navigable_parent AS text,
        pos.pos_symbol AS pos_symbol;'''
# print(cypher_str)
row_objs_list = []
with cu.driver.session() as session: row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
if row_objs_list:
    file_tags_df = DataFrame(row_objs_list)
    if file_tags_df.shape[0]:
        display(file_tags_df)

Unnamed: 0,text,pos_symbol
0,<b>ACCOUNTABILITIES:</b>,H-TS
1,<p>These skills will help you succeed in this ...,H-TS
2,<p>These skills will help you succeed in this ...,H-TS
3,<p>These skills will help you succeed in this ...,H-TS
4,<p>These skills will help you succeed in this ...,H-TS
...,...,...
57582,"<span aria-hidden=""true"">|</span>",O-O
57583,"<span aria-hidden=""true"">|</span>",O-O
57584,"<span aria-hidden=""true"">|</span>",O-O
57585,<i>Learn why</i>,O-O


In [19]:

pos_symbol = file_tags_df.groupby('pos_symbol').count().reset_index().rename(columns={'text': 'labeled_count'}).sort_values('labeled_count').iloc[0].pos_symbol
cypher_str = f'''
    // Get example child strings
    MATCH (pos:PartsOfSpeech)-[r1:SUMMARIZES]->(np1:NavigableParents)-[r2:NEXT]->(np2:NavigableParents)
    WHERE pos.pos_symbol IN ["{pos_symbol}"]
    RETURN DISTINCT np1.navigable_parent AS text;'''
# print(cypher_str)
row_objs_list = []
with cu.driver.session() as session: row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
if row_objs_list:
    df = DataFrame(row_objs_list)
    if df.shape[0]:
        display(df.text.to_list())

['<b>Publication Date:</b>', '<b>Job Posting :</b>', '<b>Job Posting</b>', '<div>Published</div>']

In [7]:

sequence = file_tags_df.pos_symbol.tolist()
new_sequence, mapping = nu.convert_strings_to_integers(sequence)
file_tags_df['label'] = new_sequence
nu.store_objects(file_tags_df=file_tags_df, file_tags_mapping=mapping)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\file_tags_df.pkl
Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\file_tags_mapping.pkl


In [8]:

from pandas import read_pickle

file_tags_mapping = read_pickle('../saves/pkl/file_tags_mapping.pkl')
file_tags_df = read_pickle('../saves/pkl/file_tags_df.pkl')

In [9]:

import torch
from torch import nn
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig

# Custom dataset class
class HTMLDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        encoding = self.tokenizer.encode_plus(
            item['text'],
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(item['label'], dtype=torch.long)
        }
model_path = '../saves/models/sequence_classification.model'
tokenizer_path = '../saves/tokenizers/sequence_classification.tokenizer'

In [10]:

import os
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def get_latest_epoch(model_path):
    epochs = []
    try:
        epochs = [int(d.split('-')[-1]) for d in os.listdir(model_path) if d.startswith('epoch-')]
    except:
        return 0
    return max(epochs) if epochs else 0

# Model setup
num_labels = len(file_tags_mapping)
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=num_labels)
#model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

# Tokenizer setup
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Dataset and DataLoader setup
dataset = HTMLDataset(file_tags_df, tokenizer, max_length=128)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

# Training loop
#optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
num_epochs = 10
print(f"Starting training for {num_epochs} epochs...")

# Get the latest epoch
latest_epoch = get_latest_epoch(model_path)
start_epoch = latest_epoch + 1

Using device: cpu
Starting training for 10 epochs...


In [18]:

# Run this only once
remaining_epochs_list = list(reversed([epoch for epoch in range(start_epoch, start_epoch + num_epochs)]))
remaining_epochs_list

[10, 9, 8, 7, 6, 5, 4, 3, 2, 1]

In [None]:

epoch = remaining_epochs_list.pop()
print(f"Epoch {epoch}/{start_epoch + num_epochs - 1}")

# Load the model from the latest snapshot or initialize if it's the first epoch
if epoch == start_epoch:
    if latest_epoch > 0:
        model = BertForSequenceClassification.from_pretrained(f"{model_path}/epoch-{latest_epoch}")
        print(f"Loaded model from epoch {latest_epoch}")
    else:
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
        print("Initialized new model")
else:
    model = BertForSequenceClassification.from_pretrained(f"{model_path}/epoch-{epoch-1}")
    print(f"Loaded model from epoch {epoch-1}")

model.to(device)  # Move model to GPU if available
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

epoch_loss = 0.0
num_batches = len(dataloader)

# Use tqdm for a progress bar
progress_bar = tqdm(dataloader, total=num_batches, desc=f"Epoch {epoch}")

model.train()
for batch in progress_bar:
    batch = {k: v.to(device) for k, v in batch.items()}  # Move batch to GPU if available
    outputs = model(**batch)
    loss = outputs.loss
    epoch_loss += loss.item()
    
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    
    # Update progress bar description with current loss
    progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})

# Print average loss for the epoch
avg_loss = epoch_loss / num_batches
print(f"Epoch {epoch} completed. Average loss: {avg_loss:.4f}")

# Save the model after each epoch
epoch_dir = f"{model_path}/epoch-{epoch}"
print(f"Saving model to {epoch_dir}")
model.save_pretrained(epoch_dir)
print("Model saved successfully.")

In [None]:

print("Training completed.")

# Save the tokenizer
print(f"Saving tokenizer to {tokenizer_path}")
tokenizer.save_pretrained(tokenizer_path)
print("Tokenizer saved successfully.")


Now that I've trained and saved my model and tokenizer, I can use them for inference on new data. Here's how I load and use my saved model and tokenizer:

1. First, load the saved model and tokenizer:

In [None]:

from transformers import BertForSequenceClassification, BertTokenizer

# Load the saved model
model = BertForSequenceClassification.from_pretrained(model_path)

# Load the saved tokenizer
tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set the model to evaluation mode
model.eval()


2. Create a function to predict the label for a given text:

In [None]:

import torch

def predict_label(text, max_length=128):
    # Tokenize the input text
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    
    # Move inputs to the same device as the model
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    
    # Get the model's prediction
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
    
    # Get the predicted class
    predicted_class = torch.argmax(logits, dim=1).item()
    
    # Map the predicted class back to its label
    reverse_mapping = {v: k for k, v in mapping.items()}
    predicted_label = reverse_mapping[predicted_class]
    
    return predicted_label

# Example usage
text = "<li>Troubleshooting and triaging issues with multiple teams to drive towards root cause identification and resolution.</li>"
predicted_label = predict_label(text)
print(f"Predicted label: {predicted_label}")


3. I want to process multiple texts at once, so I create a batch prediction function:

In [None]:

def predict_batch(texts, max_length=128):
    # Tokenize the input texts
    inputs = tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    
    # Move inputs to the same device as the model
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    
    # Get the model's predictions
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
    
    # Get the predicted classes
    predicted_classes = torch.argmax(logits, dim=1).tolist()
    
    # Map the predicted classes back to their labels
    reverse_mapping = {v: k for k, v in mapping.items()}
    predicted_labels = [reverse_mapping[cls] for cls in predicted_classes]
    
    return predicted_labels

# Example usage
texts = [
    "<li>Troubleshooting and triaging issues with multiple teams to drive towards root cause identification and resolution.</li>",
    "<h2>Job Requirements</h2>",
    "<p>Bachelor's degree in Computer Science or related field required.</p>"
]
predicted_labels = predict_batch(texts)
for text, label in zip(texts, predicted_labels):
    print(f"Text: {text}")
    print(f"Predicted label: {label}\n")


4. I want to get the probabilities for each class, so I modify the prediction functions:

In [None]:

num_labels = model.num_labels
print(f"Number of labels in the model: {num_labels}")

In [None]:

print("Mapping:")
for k, v in mapping.items():
    print(f"{k}: {v}")

In [27]:

import torch.nn.functional as F

def predict_with_probabilities(text, max_length=128):
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
    
    probabilities = F.softmax(logits, dim=1).squeeze().tolist()
    
    reverse_mapping = {v: k for k, v in mapping.items()}
    label_probs = {reverse_mapping[i]: prob for i, prob in enumerate(probabilities)}
    
    predicted_class = max(label_probs, key=label_probs.get)
    
    return predicted_class, label_probs

# Example usage
text = "<li>Troubleshooting and triaging issues with multiple teams to drive towards root cause identification and resolution.</li>"
predicted_label, probabilities = predict_with_probabilities(text)
print(f"Predicted label: {predicted_label}")
print("Probabilities:")
for label, prob in probabilities.items():
    print(f"{label}: {prob:.4f}")

KeyError: 24

```

These functions allow you to use your trained model for inference on new, unseen data. You can now classify HTML text snippets into your predefined categories.

Remember to preprocess your input data in the same way you did during training. If you applied any specific transformations or cleaning steps to your training data, make sure to apply the same steps to your input data during inference.

Would you like me to explain any part of this process in more detail?