# README

This notebook contains the production code to run the optimized TensorRT models,
as well as production helper functions that convert the output predictions into a useful form.

# Imports

In [1]:
import torch

from transformers import BertTokenizer, DistilBertTokenizer, AutoModelForTokenClassification
import onnx
import tensorrt as trt
import onnxruntime as ort
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import json
import os
import pandas as pd

import contractions
import unicodedata
import re


# Helper Functions

## Tensorrt Helper Functions 
Functions necessary to load a TensorRT model and run inference through it.

In [20]:
def load_engine(trt_engine_path):
    """Deserialize a TensorRT engine from a serialized engine file.

    Args:
        trt_engine_path: Path of the serialized engine file to read.

    Returns:
        The engine returned by ``runtime.deserialize_cuda_engine``.

    Raises:
        OSError: If the file cannot be opened or read.
        RuntimeError: If deserialization produces no engine.
    """
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    with open(trt_engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
        # Fail here with a clear message instead of letting a None engine
        # surface later as an AttributeError on create_execution_context().
        if engine is None:
            raise RuntimeError(
                f"Failed to deserialize TensorRT engine from '{trt_engine_path}'"
            )
        return engine

def deload_engine(engine, context):
    """
    Tear down an execution context and drop this function's references.

    If `context` is truthy its `__del__` method is invoked directly; after
    that the local names `context` and `engine` are unbound.

    NOTE(review): `del` here only removes the names local to this function.
    Any references the caller still holds keep the objects alive, so callers
    presumably need to drop their own references as well - confirm.
    """
    if context:
        # Direct destructor invocation, kept exactly as the original did it.
        context.__del__()
    # Unbind the local names (context first, then engine).
    del context, engine

## Type Job 
def type_output_shape(input_ids):
    """
    Output shape of the job type model for a batch of inputs.

    The result is (first dimension of input_ids, number of entries in the
    module-level type_id2label mapping).
    """
    label_count = len(type_id2label)
    batch_size = input_ids.shape[0]
    return (batch_size, label_count)
    
## Location 
def loc_output_shape(input_ids):
    """
    Output shape of the location model for a batch of inputs.

    The result is (input_ids.shape[0], input_ids.shape[1], number of entries
    in the module-level loc_label2id mapping), i.e. one score vector per
    position of each input row.
    """
    label_count = len(loc_label2id)
    rows, columns = input_ids.shape[0], input_ids.shape[1]
    return (rows, columns, label_count)
    

    
def run_inference(context, input_ids, attention_mask, output_shape_funct):
    """Run one batch through a TensorRT execution context and return the output.

    Args:
        context: Execution context; must provide set_input_shape() and execute_v2().
        input_ids: Host array of token ids (its .nbytes and .shape are used).
        attention_mask: Host array holding the attention mask (same usage).
        output_shape_funct: Callable taking input_ids and returning the shape
            of the output array to allocate.

    Returns:
        A np.float32 array of shape output_shape_funct(input_ids) filled from
        the device output buffer.

    Raises:
        ValueError: If the context rejects either input shape.
    """
    # Allocate device buffers sized exactly like the host input arrays.
    # NOTE(review): the host bytes are copied as-is, so this assumes the arrays'
    # dtype already matches what the engine expects - TODO confirm.
    d_input_ids = cuda.mem_alloc(input_ids.nbytes)
    d_attention_mask = cuda.mem_alloc(attention_mask.nbytes)
    # Tell the context the concrete shapes of this batch; a falsy return is
    # treated as failure.
    if not context.set_input_shape("input_ids", input_ids.shape):
        raise ValueError(f"Failed to set shape for input_ids with shape {input_ids.shape}")
    if not context.set_input_shape("attention_mask", attention_mask.shape):
        raise ValueError(f"Failed to set shape for attention_mask with shape {attention_mask.shape}")
        
    # Size the device output buffer for float32 elements of the computed shape.
    output_shape = output_shape_funct(input_ids)
    d_output = cuda.mem_alloc(int(np.prod(output_shape) * np.dtype(np.float32).itemsize))

    # Copy inputs to device memory
    cuda.memcpy_htod(d_input_ids, input_ids)
    cuda.memcpy_htod(d_attention_mask, attention_mask)

    # Run inference.
    # NOTE(review): assumes the engine's bindings are ordered
    # input_ids, attention_mask, output - TODO confirm against the engine.
    bindings = [int(d_input_ids), int(d_attention_mask), int(d_output)]
    context.execute_v2(bindings)

    # Copy outputs back to host into a float32 array of the expected shape.
    output = np.empty(output_shape, dtype=np.float32)
    cuda.memcpy_dtoh(output, d_output)

    # NOTE(review): the device allocations are not freed explicitly here.
    return output
    
def load_tokenizer_config(tokenizer_dir, param_list):
    """
    Load a BertTokenizer and selected settings from its config file.

    Reads "<tokenizer_dir>/tokenizer_config.json" and picks out the entries
    named in `param_list`.

    Returns a tuple (selected settings dict, tokenizer).
    Raises KeyError if any requested parameter is absent from the JSON file.
    """
    tokenizer = BertTokenizer.from_pretrained(tokenizer_dir)

    config_path = tokenizer_dir + "/tokenizer_config.json"
    with open(config_path, "r") as config_file:
        tokenizer_config = json.load(config_file)

    selected = {}
    for param in param_list:
        if param in tokenizer_config:
            selected[param] = tokenizer_config[param]
            continue
        # Stop at the first requested setting that the file does not contain.
        raise KeyError(f"Parameter '{param}' is missing from the tokenizer config json file.")

    return selected, tokenizer

def tokenize_text(text:list|str, tokenizer_config):
    """Tokenize the text using configs"""
    inputs = tokenizer(
        text,
        return_tensors="np"
        **tokenizer_config
    )
    return inputs

## Job Type Model Helper Functions
The preprocessing and input building functions for the Job Type model. 

In [4]:
def type_preprocess_text(text):
    """Normalize raw job text before it is fed to the job type model.

    Args:
        text: Raw string, or None.

    Returns:
        The cleaned, lower-cased, whitespace-collapsed string; None is
        returned unchanged.

    Raises:
        Exception: With message 'preprocess error' when expanding
            contractions fails; the underlying error is chained as the cause.
    """
    if text is None:
        return text

    # Convert to lowercase
    text = text.lower()

    # Drop every run of non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    try:
        # Fix Contractions
        text = contractions.fix(text)
    except Exception as err:
        # Only catch real errors (not KeyboardInterrupt/SystemExit) and keep
        # the original failure attached as the cause.
        print(text)
        raise Exception('preprocess error') from err  ## TODO investigate when running

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Normalize accented characters to their ASCII equivalents.
    # NOTE(review): non-ASCII characters were already deleted above, so this
    # step no longer sees any accents. The order is left as-is because
    # changing it would alter the text the model receives - confirm against
    # the training preprocessing before reordering.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

    # Remove bullet points at the start of a line (optionally after whitespace)
    text = re.sub(r'(^[\s]*[\u2022\u2023\u25E6\u2043\u2219*•-]\s+)', '', text, flags=re.MULTILINE)

    # Replace URLs with "URL"
    # NOTE(review): the '.' after 'www' is unescaped and matches any character.
    text = re.sub(r'http\S+|www.\S+', 'URL', text)

    # Replace phone numbers with "PHONE_NUMBER"
    text = re.sub(r'\(?\b\d{3}[-.)\s]*\d{3}[-.\s]*\d{4}\b', 'PHONE_NUMBER', text)

    # Replace email addresses with "EMAIL"
    # NOTE(review): '[A-Z|a-z]' also accepts a literal '|' in the TLD.
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', 'EMAIL', text)

    # Collapse three or more repeats of the same punctuation mark into one
    text = re.sub(r'([^\w\s])\1{2,}', r'\1', text)

    # Replace parentheses, curly braces, and square brackets with spaces
    text = re.sub(r'[\(\)\{\}\[\]]', ' ', text)

    # Replace selected special characters with spaces
    text = re.sub(r'[:;@\/\\*#!?]', ' ', text)

    # Collapse whitespace runs and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def build_input_type(title, description):
    """Compose the model input string from a job title and description.

    Both parts are passed through type_preprocess_text and joined using the
    "[TITLE]" and "[DESC]" markers.
    """
    clean_title = type_preprocess_text(title)
    clean_description = type_preprocess_text(description)
    return f"[TITLE] {clean_title} [DESC] {clean_description}"

## Location Model Helpers
The preprocessing and input building functions for the Job Location Named Entity Recognition Model.

Includes functions to take the output from the model and map to the input string to create a dictionary of locations. 

In [5]:
def create_ner_label_mappings(token_labels, debug = False):
    """
    Build the label <-> id lookup tables for a list of entity names.

    'O' always gets id 0. Each entity name then contributes an upper-cased
    'B-<NAME>' label followed by an 'I-<NAME>' label, numbered consecutively
    in the order the names are given.

    Returns (label2id, id2label). With debug=True the mappings are printed.
    """
    ordered_labels = ['O']
    for label in token_labels:
        ordered_labels.extend(f"{prefix}{label.upper()}" for prefix in ['B-', 'I-'])

    label2id = {name: position for position, name in enumerate(ordered_labels)}
    id2label = dict(enumerate(ordered_labels))

    if debug:
        print(f"Num Labels: {len(label2id.keys())}")
        print(f"label2id: {label2id}")
        print(f"id2label: {id2label}")
    return label2id, id2label

def loc_preprocess_text(text):
    """Light clean-up for location strings: lower-case the text, expand
    contractions, then collapse whitespace runs and trim both ends."""
    expanded = contractions.fix(text.lower())
    return re.sub(r'\s+', ' ', expanded).strip()
    
# Lower-case two-letter state abbreviation -> lower-case full state name.
state_abbreviation_to_name = {
    "al": "alabama",
    "ak": "alaska",
    "az": "arizona",
    "ar": "arkansas",
    "ca": "california",
    "co": "colorado",
    "ct": "connecticut",
    "de": "delaware",
    "fl": "florida",
    "ga": "georgia",
    "hi": "hawaii",
    "id": "idaho",
    "il": "illinois",
    "in": "indiana",
    "ia": "iowa",
    "ks": "kansas",
    "ky": "kentucky",
    "la": "louisiana",
    "me": "maine",
    "md": "maryland",
    "ma": "massachusetts",
    "mi": "michigan",
    "mn": "minnesota",
    "ms": "mississippi",
    "mo": "missouri",
    "mt": "montana",
    "ne": "nebraska",
    "nv": "nevada",
    "nh": "new hampshire",
    "nj": "new jersey",
    "nm": "new mexico",
    "ny": "new york",
    "nc": "north carolina",
    "nd": "north dakota",
    "oh": "ohio",
    "ok": "oklahoma",
    "or": "oregon",
    "pa": "pennsylvania",
    "ri": "rhode island",
    "sc": "south carolina",
    "sd": "south dakota",
    "tn": "tennessee",
    "tx": "texas",
    "ut": "utah",
    "vt": "vermont",
    "va": "virginia",
    "wa": "washington",
    "wv": "west virginia",
    "wi": "wisconsin",
    "wy": "wyoming",
}
# Full names, used to recognise states that are already spelled out.
state_names_values = list(state_abbreviation_to_name.values())


def normalize_state(state):
    """Map a state string to its title-cased full name.

    Two-character inputs are looked up as abbreviations; longer inputs must
    already be a known full name. Lookups are exact (case-sensitive against
    the lower-case tables). Anything unrecognised, or shorter than two
    characters, yields None.
    """
    size = len(state)
    if size == 2:
        full_name = state_abbreviation_to_name.get(state, None)
    elif size > 2 and state in state_names_values:
        full_name = state
    else:
        full_name = None
    return full_name.title() if full_name else None
        
# Known lower-case aliases -> canonical country name.
country_abbreviation_to_name = {
    "us": "United States",
    "usa": "United States",
    "americas" : "United States",
    "america": "United States",
    "gb": "United Kingdom",
    "uk": "United Kingdom",
    "england": "United Kingdom",
}


def normalize_country(country):
    """Return a title-cased country name, resolving known aliases first.

    Inputs that are not in the alias table are title-cased as they are.
    """
    resolved = country_abbreviation_to_name.get(country, country)
    return resolved.title()

def normalize_city(city):
    """Normalize a city string for consistency by title-casing it."""
    title_cased = city.title()
    return title_cased

def build_final_dict(tokens, pred_labels):
    """
    Reassemble labelled word-piece tokens into location entity strings.

    Walks tokens and their predicted IOB labels in lockstep:
      * '[CLS]' / '[SEP]' tokens are ignored.
      * A 'B-REMOTE' label just sets the remote flag.
      * 'B-<X>' closes any open entity and opens a new one of type x.
      * 'I-<X>' extends the open entity only when its type matches; '##'
        pieces are glued on directly, whole words are joined with a space.
      * 'O' closes the open entity.

    Returns a dict with 'country_list', 'state_list', 'city_list' (lists of
    strings) and 'remote' (bool).
    """
    found = {"country": [], "state": [], "city": [], "remote": False}
    pieces = []
    active = None

    for tok, tag in zip(tokens, pred_labels):
        if tok in ('[CLS]', '[SEP]'):
            continue

        if tag == 'B-REMOTE':
            found['remote'] = True
            continue

        if tag == 'O':
            # An outside token ends whatever entity was being collected.
            if active and pieces:
                found[active].append(''.join(pieces))
                pieces, active = [], None
            continue

        prefix, kind = tag.split('-')
        if prefix == 'B':
            # Store the previous entity before starting the next one.
            if active and pieces:
                found[active].append(''.join(pieces))
            pieces = [tok.replace('##', '')]
            active = kind.lower()
            continue

        if prefix != 'I' or kind.lower() != active or not pieces:
            # Continuation tag that does not belong to the open entity.
            continue

        pieces.append(tok.replace('##', '') if tok.startswith('##') else ' ' + tok)

    # An entity still open at the end of the sequence is stored as well.
    if active and pieces:
        found[active].append(''.join(pieces))

    return {
        "country_list": found['country'],
        "state_list": found['state'],
        "city_list": found['city'],
        "remote": found['remote'],
    }

def loc_id_to_entity_dict(predicted_tokens, attention_mask, input_ids, id2label, loaded_tokenizer, debug=False):
    """
    Turn per-token label ids into one normalized location dict per example.

    For every row: predictions at positions where the attention mask is not 1
    are replaced by -100, the row is cut after the last position whose mask
    is 1, and the remaining ids are translated through id2label. The input
    ids are converted back to tokens (dropping '[PAD]'), build_final_dict
    assembles the entities, and states / countries / cities are normalized.
    States that normalize_state does not recognise are dropped.

    Returns a list of dicts as produced by build_final_dict.
    """
    results = []

    for row in range(predicted_tokens.shape[0]):
        row_mask = attention_mask[row]
        masked_ids = np.where(row_mask == 1, predicted_tokens[row], -100).flatten()

        # Index of the last position whose attention mask equals 1.
        mask_values = row_mask.flatten().tolist()
        last_real = len(mask_values) - 1 - mask_values[::-1].index(1)
        label_names = [id2label[label_id] for label_id in masked_ids[:last_real + 1]]

        # Recover the string tokens for this row and discard padding.
        row_tokens = loaded_tokenizer.convert_ids_to_tokens(input_ids[row].squeeze().tolist())
        kept_tokens = [tok for tok in row_tokens if tok != '[PAD]']
        if debug:
            print(label_names)
            print(kept_tokens)
            print()

        record = build_final_dict(kept_tokens, label_names)
        record['state_list'] = [
            name for name in map(normalize_state, record['state_list']) if name
        ]
        record['country_list'] = [normalize_country(name) for name in record['country_list']]
        record['city_list'] = [normalize_city(name) for name in record['city_list']]

        results.append(record)

    return results

# Model Load

## Job type classifier model load 
Tensorrt and tokenizer load from file.

In [6]:
# Base path of the job type model artifacts. The tokenizer directory is the
# same path with a "_tokenizer" suffix appended.
type_out_put_dir = "../Job_Type_Model/jobLevel_multi_bert_all_1"
type_tokeninzer_out_put_dir = type_out_put_dir+"_tokenizer"

In [7]:
# Settings read from the tokenizer's tokenizer_config.json; they are later
# forwarded as keyword arguments to the tokenizer call in classify_type.
type_tokenizer_params_list = ['max_length', 'truncation', 'padding']
type_tokenizer_config, type_tokenizer = load_tokenizer_config(type_tokeninzer_out_put_dir, type_tokenizer_params_list)



In [8]:
# Job type class id <-> label name tables. The two dicts are written out by
# hand and must stay inverse to each other; len(type_id2label) also sets the
# output width used by type_output_shape.
type_id2label = {0:  "technology", 1: "medical", 2:"marketing and sales", 3:"law", 4: "service industry", 5:"retail",
                 6:"education", 7:"customer service", 8:"engineering and architecture", 9: "skilled trades", 10: "hr", 11: "finance"}

type_label2id = {"technology": 0, "medical": 1, "marketing and sales": 2, "law": 3, "service industry": 4, "retail": 5, 
                 "education": 6, "customer service": 7, "engineering and architecture": 8, "skilled trades": 9, "hr": 10, "finance": 11}

In [9]:
# Job type model: deserialize the engine file, then create the execution
# context that classify_type passes to run_inference.
type_engine = load_engine("../Job_Type_Model/industry_classifier.trt") # Load the TensorRT engine
type_context = type_engine.create_execution_context()# Create execution context

[11/04/2024-14:53:10] [TRT] [W] Using an engine plan file across different models of devices is not recommended and is likely to affect performance or even cause errors.


## Location NER Model Load
Tensorrt and tokenizer load from file

In [10]:
# Base path of the location model artifacts. The tokenizer directory is the
# same path with a "_tokenizer" suffix appended.
loc_out_put_dir = "../Location_Model/location_bert_v2"
loc_tokeninzer_out_put_dir = loc_out_put_dir+"_tokenizer"

In [11]:
# Settings read from the tokenizer's tokenizer_config.json; they are later
# forwarded as keyword arguments to the tokenizer call in classify_loc.
loc_tokenizer_params_list = ['max_length', 'truncation', 'padding', 'is_split_into_words']
loc_tokenizer_config, loc_tokenizer = load_tokenizer_config(loc_tokeninzer_out_put_dir, loc_tokenizer_params_list)

In [12]:
# Build the IOB label tables for the location model: 'O' plus a B-/I- pair
# for each entity name below, numbered in this order.
# NOTE(review): the order presumably has to match the label ids the model
# was trained with - confirm before reordering.
token_labels = ['city', 'state', 'country', 'remote']
loc_label2id, loc_id2label = create_ner_label_mappings(token_labels)

In [13]:
# Location model: deserialize the engine file, then create the execution
# context that classify_loc passes to run_inference.
loc_engine = load_engine("../Location_Model/location_classifier_2.trt") # Load the TensorRT engine
loc_context = loc_engine.create_execution_context()# Create execution context

[11/04/2024-14:53:15] [TRT] [W] Using an engine plan file across different models of devices is not recommended and is likely to affect performance or even cause errors.


# Data load 

In [14]:
# Load the formatted location dataset and drop the 'id' and
# 'fmt_raw_location' columns in place. The cells below read the 'title',
# 'description' and 'raw_location' fields of each record.
all_df = pd.read_csv('../Location_Model/location_data/all_data_fmt_location.csv')
all_df.drop(axis=1, inplace=True, columns=['id','fmt_raw_location'])
# Preview the first rows.
all_df.head()

Unnamed: 0,title,description,raw_location
0,"Vice President, Software Engineering",Xometry (NASDAQ: XMTR) powers the industries o...,"North Bethesda, MD, Lexington, KY, Remote"
1,VP of Product,"Hi! Its SplitMetrics, a remote-first team of e...",Remote - United Kingdom
2,VP of Product,"Hi! Its SplitMetrics, a remote-first team of e...",Remote
3,"Vice President, Engineering","At Airwallex (airwallex.com), were building th...",US - San Francisco
4,VP Engineering,About Us We believe AI will fundamentally tran...,"Remote, USA"


In [15]:
# Convert the DataFrame to a list with one {column: value} dict per row, the
# form classify_type and classify_loc index into.
test_data = all_df.to_dict(orient="records")

# Model runs 

In [29]:
# Function to test classification on jobs by industry
def classify_type(messages):
    """Predict a job type label for each message.

    Each message is indexed by 'title' and 'description'; both are combined
    via build_input_type, tokenized with the job type tokenizer and run
    through the job type engine. The highest-scoring class id of every row is
    printed and mapped through type_id2label.

    Returns a list of label strings, one per message.
    """
    model_inputs = [
        build_input_type(message['title'], message['description'])
        for message in messages
    ]
    encoded = type_tokenizer(model_inputs, return_tensors="np", **type_tokenizer_config)
    scores = run_inference(
        type_context,
        encoded['input_ids'],
        encoded['attention_mask'],
        type_output_shape,
    )

    predicted_classes_word = []
    for class_id in np.argmax(scores, axis=-1):
        print(class_id)
        predicted_classes_word.append(type_id2label[class_id])

    return predicted_classes_word

In [30]:
# Classify a batch holding only the first record.
classify_type(test_data[0:1])

0


['technology']

In [31]:
# Classify a batch holding only the second record.
classify_type(test_data[1:2])

4


['service industry']

In [32]:
# Classify a batch holding only the fifth record.
classify_type(test_data[4:5])

0


['technology']

In [126]:
# Function to test Named Entity Recognition on Raw locations
def classify_loc(messages):
    """Extract location entities from the raw location of each message.

    Each message is indexed by 'raw_location'; the text is cleaned with
    loc_preprocess_text, tokenized with the location tokenizer and run
    through the location engine. The highest-scoring label id per token is
    handed to loc_id_to_entity_dict (with its debug printing switched on).

    Returns the list of entity dicts produced by loc_id_to_entity_dict.
    """
    cleaned = [loc_preprocess_text(message['raw_location']) for message in messages]
    encoded = loc_tokenizer(cleaned, return_tensors="np", **loc_tokenizer_config)
    ids, mask = encoded['input_ids'], encoded['attention_mask']

    scores = run_inference(loc_context, ids, mask, loc_output_shape)
    label_ids = np.argmax(scores, axis=-1)

    return loc_id_to_entity_dict(label_ids, mask, ids, loc_id2label, loc_tokenizer, True)

In [127]:
# Run location extraction on the first 15 records as a single batch.
classify_loc(test_data[0:15])

['O', 'B-CITY', 'I-CITY', 'I-CITY', 'I-CITY', 'O', 'B-STATE', 'O', 'B-CITY', 'O', 'B-STATE', 'O', 'B-REMOTE', 'O']
['[CLS]', 'north', 'beth', '##es', '##da', ',', 'md', ',', 'lexington', ',', 'ky', ',', 'remote', '[SEP]']

['O', 'B-REMOTE', 'O', 'B-COUNTRY', 'I-COUNTRY', 'O']
['[CLS]', 'remote', '-', 'united', 'kingdom', '[SEP]']

['O', 'B-REMOTE', 'O']
['[CLS]', 'remote', '[SEP]']

['O', 'B-COUNTRY', 'O', 'B-CITY', 'I-CITY', 'I-STATE']
['[CLS]', 'us', '-', 'san', 'francisco', '[SEP]']

['O', 'B-REMOTE', 'O', 'B-COUNTRY', 'I-STATE']
['[CLS]', 'remote', ',', 'usa', '[SEP]']

['O', 'B-CITY', 'I-CITY', 'O', 'B-STATE', 'I-STATE']
['[CLS]', 'new', 'york', ',', 'ny', '[SEP]']

['O', 'B-CITY', 'O']
['[CLS]', 'berlin', '[SEP]']

['O', 'O', 'O', 'O']
['[CLS]', '7', 'locations', '[SEP]']

['O', 'B-CITY', 'I-CITY', 'O', 'B-STATE', 'I-STATE']
['[CLS]', 'new', 'york', ',', 'ny', '[SEP]']

['O', 'O', 'O', 'O']
['[CLS]', '5', 'locations', '[SEP]']

['O', 'O', 'O', 'O']
['[CLS]', '6', 'locations', '[S

[{'country_list': [],
  'state_list': ['Maryland', 'Kentucky'],
  'city_list': ['North Bethesda', 'Lexington'],
  'remote': True},
 {'country_list': ['United Kingdom'],
  'state_list': [],
  'city_list': [],
  'remote': True},
 {'country_list': [], 'state_list': [], 'city_list': [], 'remote': True},
 {'country_list': ['United States'],
  'state_list': [],
  'city_list': ['San Francisco'],
  'remote': False},
 {'country_list': ['United States'],
  'state_list': [],
  'city_list': [],
  'remote': True},
 {'country_list': [],
  'state_list': ['New York'],
  'city_list': ['New York'],
  'remote': False},
 {'country_list': [],
  'state_list': [],
  'city_list': ['Berlin'],
  'remote': False},
 {'country_list': [], 'state_list': [], 'city_list': [], 'remote': False},
 {'country_list': [],
  'state_list': ['New York'],
  'city_list': ['New York'],
  'remote': False},
 {'country_list': [], 'state_list': [], 'city_list': [], 'remote': False},
 {'country_list': [], 'state_list': [], 'city_list':

In [146]:
# Run both models on the first 100 records, one record per batch, printing
# the predicted job type next to the title followed by the location entities.
# NOTE: the count is hard-coded; test_data[idx] raises IndexError if the
# dataset has fewer than 100 records.
for idx in range(100):
    print(f"{classify_type([test_data[idx]])} Job Title: {test_data[idx]['title']}")
    print(classify_loc([test_data[idx]]))
    # Two blank lines separate the records in the output.
    print()
    print()

['technology'] Job Title: Vice President, Software Engineering 
['O', 'B-CITY', 'I-CITY', 'I-CITY', 'I-CITY', 'O', 'B-STATE', 'O', 'B-CITY', 'O', 'B-STATE', 'O', 'B-REMOTE', 'O']
['[CLS]', 'north', 'beth', '##es', '##da', ',', 'md', ',', 'lexington', ',', 'ky', ',', 'remote', '[SEP]']

[{'country_list': [], 'state_list': ['Maryland', 'Kentucky'], 'city_list': ['North Bethesda', 'Lexington'], 'remote': True}]


['service industry'] Job Title: VP of Product
['O', 'B-REMOTE', 'O', 'B-COUNTRY', 'I-COUNTRY', 'O']
['[CLS]', 'remote', '-', 'united', 'kingdom', '[SEP]']

[{'country_list': ['United Kingdom'], 'state_list': [], 'city_list': [], 'remote': True}]


['service industry'] Job Title: VP of Product
['O', 'B-REMOTE', 'O']
['[CLS]', 'remote', '[SEP]']

[{'country_list': [], 'state_list': [], 'city_list': [], 'remote': True}]


['technology'] Job Title: Vice President, Engineering
['O', 'B-COUNTRY', 'O', 'B-CITY', 'I-CITY', 'I-STATE']
['[CLS]', 'us', '-', 'san', 'francisco', '[SEP]']

[{'

## Model Cleanup

In [147]:
# Cleanup Models: tear down each execution context via deload_engine.
# NOTE(review): the module-level names type_engine/type_context and
# loc_engine/loc_context remain bound after these calls - confirm whether
# they should also be deleted here.
deload_engine(type_engine,type_context)
deload_engine(loc_engine,loc_context)

  context.__del__()  # Explicitly call context destructor
