# README

This notebook converts the Location NER PyTorch model to an ONNX model, and then to an optimized TensorRT engine.

# Imports

In [2]:
import torch

from transformers import BertTokenizer, DistilBertTokenizer, AutoModelForTokenClassification
import onnx
import tensorrt as trt
import onnxruntime as ort
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import json
import os
import pandas as pd

In [3]:
# Sanity check: print the current working directory, because the model,
# tokenizer and data paths used below are all relative paths.
print(os.getcwd())

/home/connor/Documents/jobs/jobs/Location_Model


# Helper Function

In [35]:
def create_ner_label_mappings(token_labels):
    """
    Build BIO label <-> id lookup tables for a list of entity type names.

    Id 0 is always 'O'. Each entity type then contributes two consecutive ids,
    'B-<TYPE>' followed by 'I-<TYPE>', in the order the types are given. The
    type names are upper-cased.

    Args:
        token_labels: Iterable of entity type names (e.g. ['city', 'state']).

    Returns:
        (label2id, id2label): dict mapping label string -> id and
        dict mapping id -> label string.
    """
    # Ordered list of every tag; a tag's position in the list is its id.
    ordered_tags = ['O']
    for entity_type in token_labels:
        ordered_tags.extend(f"{prefix}{entity_type.upper()}" for prefix in ['B-', 'I-'])

    label2id = {tag: tag_id for tag_id, tag in enumerate(ordered_tags)}
    id2label = dict(enumerate(ordered_tags))

    print(f"Num Labels: {len(label2id)}")
    print(f"label2id: {label2id}")
    print(f"id2label: {id2label}")
    return label2id, id2label

In [36]:
# Create the label mappings for the four location entity types.
# Each type yields a B-/I- pair, plus the shared 'O' label at id 0.
token_labels = ['city', 'state', 'country', 'remote']
label2id, id2label = create_ner_label_mappings(token_labels)

Num Labels: 9
label2id: {'O': 0, 'B-CITY': 1, 'I-CITY': 2, 'B-STATE': 3, 'I-STATE': 4, 'B-COUNTRY': 5, 'I-COUNTRY': 6, 'B-REMOTE': 7, 'I-REMOTE': 8}
id2label: {0: 'O', 1: 'B-CITY', 2: 'I-CITY', 3: 'B-STATE', 4: 'I-STATE', 5: 'B-COUNTRY', 6: 'I-COUNTRY', 7: 'B-REMOTE', 8: 'I-REMOTE'}


In [5]:
# Directory paths (relative to the working directory):
# out_put_dir is where the saved model is loaded from; the tokenizer lives in
# a sibling directory with the "_tokenizer" suffix.
out_put_dir = "./location_bert_v2"
tokenizer_out_put_dir = out_put_dir+"_tokenizer"

# Create production model

## Tokenizer and Model Load 

In [6]:
# Load the saved tokenizer and the tokenization settings stored alongside it.
loaded_tokenizer = BertTokenizer.from_pretrained(tokenizer_out_put_dir)
with open(tokenizer_out_put_dir + "/tokenizer_config.json", "r") as f:
    tokenizer_config = json.load(f)

# Fall-back values are used only when a key is missing from tokenizer_config.json.
# They match the test section further down and the 16x256 shapes given to
# trtexec: the ONNX export below declares only the batch axis as dynamic, so the
# example input must already be padded to the full sequence length (256).
max_length = tokenizer_config.get('max_length', 256)
truncation = tokenizer_config.get('truncation', True)
padding = tokenizer_config.get('padding', 'max_length')
return_offsets_mapping = tokenizer_config.get('return_offsets_mapping', True)
is_split_into_words = tokenizer_config.get('is_split_into_words', False)



In [7]:
# Load the saved token-classification model and put it in evaluation mode
# (this turns off the Dropout layers shown in the repr below).
loaded_model = AutoModelForTokenClassification.from_pretrained(out_put_dir)
loaded_model.eval()

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
    

## Data Load 

In [8]:
# Read the formatted location data and discard the id / title / description
# columns, which are not used in this notebook.
all_df = pd.read_csv('location_data/all_data_fmt_location.csv').drop(
    columns=['id', 'title', 'description']
)
all_df.head()

Unnamed: 0,raw_location,fmt_raw_location
0,"North Bethesda, MD, Lexington, KY, Remote","north bethesda, md, lexington, ky, remote"
1,Remote - United Kingdom,remote - united kingdom
2,Remote,remote
3,US - San Francisco,us - san francisco
4,"Remote, USA","remote, usa"


## Create Input Example

In [9]:
# Tokenize the first formatted location string as a single example.
# The resulting PyTorch tensors ("pt") are the example inputs passed to
# torch.onnx.export in the next section.
text = all_df.iloc[0]['fmt_raw_location']
inputs = loaded_tokenizer(
    text,
    max_length=max_length,
    truncation=truncation,
    padding=padding,
    return_tensors="pt",
    is_split_into_words=is_split_into_words,
)

## Create Onnx model

In [10]:
# Export the loaded PyTorch model to an ONNX file, using the tokenized example
# above as the example inputs.
onnx_model_name= "location_classifier.onnx"
torch.onnx.export(
    loaded_model,
    (inputs["input_ids"], inputs["attention_mask"]),
    onnx_model_name,
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    # Only axis 0 (batch) is declared dynamic. The sequence axis is not listed
    # here, so it is not marked as variable-length in the exported model.
    dynamic_axes={
        "input_ids": {0: "batch_size"}, 
        "attention_mask": {0: "batch_size"}, 
        "logits": {0: "batch_size"}
    },
    opset_version=11  # ONNX opset version, chosen for compatibility
)

  mask, torch.tensor(torch.finfo(scores.dtype).min)


## Create the .trt model using terminal
```
trtexec --onnx=location_classifier.onnx --saveEngine=location_classifier.trt 
```
Run the above command without the `--fp16` flag: with 16-bit precision the model no longer predicted the correct entities.
```
trtexec --onnx=location_classifier.onnx --saveEngine=location_classifier_2.trt \
        --minShapes=input_ids:16x256,attention_mask:16x256 \
        --optShapes=input_ids:16x256,attention_mask:16x256 \
        --maxShapes=input_ids:16x256,attention_mask:16x256
```
For better memory performance, build the engine with the second command above, which fixes the input shapes to 16x256.

See README for instructions on installing `trtexec`

# Onnx and Tensorrt Model tests

## Load tokenizer and tokenizer config

In [5]:
# Load the tokenizer and its settings again for the test section. This repeats
# the earlier load cell (presumably so this section can be run without the
# export section -- it still needs the imports and path cells).
loaded_tokenizer = BertTokenizer.from_pretrained(tokenizer_out_put_dir)
with open(tokenizer_out_put_dir + "/tokenizer_config.json", "r") as f:
    tokenizer_config = json.load(f)

# Fall-back values apply only when the key is absent from tokenizer_config.json.
# 256 / 'max_length' match the 16x256 shapes used in the trtexec command above.
max_length = tokenizer_config.get('max_length', 256)
truncation = tokenizer_config.get('truncation', True)
padding = tokenizer_config.get('padding', 'max_length')
is_split_into_words = tokenizer_config.get('is_split_into_words', False)



## Test Data Load

In [6]:
# Read the formatted location data and discard the id / title / description
# columns, which are not used in the tests below.
all_df = pd.read_csv('location_data/all_data_fmt_location.csv').drop(
    columns=['id', 'title', 'description']
)
all_df.head()

Unnamed: 0,raw_location,fmt_raw_location
0,"North Bethesda, MD, Lexington, KY, Remote","north bethesda, md, lexington, ky, remote"
1,Remote - United Kingdom,remote - united kingdom
2,Remote,remote
3,US - San Francisco,us - san francisco
4,"Remote, USA","remote, usa"


In [99]:
# Tokenize a single location string (row 1) as NumPy arrays ("np"), which is
# the form fed to the ONNX Runtime session and to run_inference below.
text = all_df.iloc[1]['fmt_raw_location']
inputs = loaded_tokenizer(
    text,
    max_length=max_length,
    truncation=truncation,
    padding=padding,
    return_tensors="np",
    is_split_into_words=is_split_into_words,
)

In [101]:
# Show the raw string that was just tokenized.
text

'remote - united kingdom'

In [102]:
# Pull out the two arrays the exported model takes as inputs
# (names match input_names in the ONNX export).
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

## Onnx test

In [53]:
# Load the exported ONNX model into an ONNX Runtime session
ort_session = ort.InferenceSession("location_classifier.onnx")

# Run inference; passing None as the first argument returns all model outputs.
# The feed keys are the input names chosen at export time.
onnx_outputs = ort_session.run(None, {
    'input_ids': input_ids,
    'attention_mask': attention_mask
})

# The first (and only named) output is the logits
logits = onnx_outputs[0]  
predictions_np = np.argmax(logits, axis=-1)  # arg-max over the last (label) axis -> one class id per token position

print("ONNX predicted class:", predictions_np)

ONNX predicted class: [[0 1 2 2 2 0 3 0 1 0 3 0 7 0 0 0 0 0 7 0 7 0 1 0 7 0 0 0 0 2 0 3 3 3 3 0
  1 0 3 0 5 0 0 0 0 7 0 0 0 1 0 7 0 0 0 0 1 2 2 2 0 3 0 1 0 3 0 0 0 0 0 0
  0 0 5 0 3 0 7 0 7 0 0 0 0 1 2 2 2 0 3 0 1 0 3 5 0 0 0 0 0 0 0 7 0 7 0 1
  0 7 0 0 2 2 2 0 3 0 1 0 1 0 0 3 0 5 0 0 2 0 1 2 2 2 0 3 0 1 0 7 0 0 0 7
  0 0 2 2 3 2 0 3 0 1 1 0 3 0 0 0 7 7 0 0 0 0 0 7 0 0 0 0 0 2 2 1 1 2 2 2
  0 3 1 0 1 1 7 2 3 5 5 0 0 0 0 0 0 0 7 7 7 7 1 7 7 0 0 0 7 0 0 0 1 2 2 2
  3 0 2 0 5 0 0 0 0 0 2 7 3 3 7 0 1 0 3 0 0 0 0 0 0 0 0 2 3 2 0 3 0 0 0 0
  0 0 2 2]]


In [54]:
# Keep predictions only where attention_mask == 1 (real tokens); every other
# position is replaced with -100 so padded positions can be dropped later.
adjusted_predictions = np.where(attention_mask == 1, predictions_np, -100) 
adjusted_predictions

array([[   0,    1,    2,    2,    2,    0,    3,    0,    1,    0,    3,
           0,    7,    0, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, 

In [55]:
# NOTE: the flattening below is only valid for a single example (batch of one);
# with more rows it would concatenate them into one sequence.
flat_predictions = adjusted_predictions.flatten() # collapse the 2-D (one-row) prediction array to 1-D
attention_mask_flat = attention_mask.flatten() # same flattening for the attention mask
attention_mask_list = attention_mask_flat.tolist() # plain Python list so that .index() can be used
last_real_token_index = len(attention_mask_list) - 1 - attention_mask_list[::-1].index(1) # index of the last position whose mask is 1 (found by searching from the end)
filtered_predictions_ids = flat_predictions[:last_real_token_index+1] # keep everything up to and including the last real token, dropping the trailing -100s
predicted_token_labels = [id2label[id] for id in filtered_predictions_ids] # convert label ids to label strings
predicted_token_labels

['O',
 'B-CITY',
 'I-CITY',
 'I-CITY',
 'I-CITY',
 'O',
 'B-STATE',
 'O',
 'B-CITY',
 'O',
 'B-STATE',
 'O',
 'B-REMOTE',
 'O']

In [30]:
# Show the input string so it can be compared with the predicted labels above.
text

'north bethesda, md, lexington, ky, remote'

In [56]:
input_ids_list = inputs['input_ids'].squeeze().tolist()  # drop the batch dimension and convert the array to a plain list

# Convert the token ids back to their string tokens, then remove the padding
# tokens so the list lines up with predicted_token_labels.
tokens = loaded_tokenizer.convert_ids_to_tokens(input_ids_list) 
tokens = [token for token in tokens if token != '[PAD]']
tokens

['[CLS]',
 'north',
 'beth',
 '##es',
 '##da',
 ',',
 'md',
 ',',
 'lexington',
 ',',
 'ky',
 ',',
 'remote',
 '[SEP]']

## Tensorrt Usage single example

In [126]:
# Load the TensorRT engine
def load_engine(trt_engine_path):
    """
    Deserialize a TensorRT engine from a serialized engine file.

    Args:
        trt_engine_path: Path to the serialized engine file (for example one
            written by ``trtexec --saveEngine=...``).

    Returns:
        The deserialized TensorRT engine.

    Raises:
        RuntimeError: If TensorRT could not deserialize the file (for example
            because it was built with a different TensorRT version or GPU).
    """
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    with open(trt_engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())

    # deserialize_cuda_engine signals failure by returning None. Fail here with
    # a clear message instead of a later AttributeError on
    # engine.create_execution_context().
    if engine is None:
        raise RuntimeError(f"Failed to deserialize TensorRT engine from {trt_engine_path!r}")
    return engine
        
def deload_engine(engine, context):
    """
    Clean up a TensorRT engine and its execution context.

    Args:
        engine: The engine object returned by load_engine.
        context: The execution context created from that engine (may be falsy,
            in which case the destructor call is skipped).

    NOTE(review): the ``del`` statements below only remove this function's
    local names. Any references the caller still holds (e.g. the notebook-level
    ``engine`` / ``context`` variables) keep the objects alive, so the caller
    must delete those as well for the memory to actually be released.
    """
    if context:
        context.__del__()  # Explicitly invoke the context's __del__ hook -- NOTE(review): confirm the installed TensorRT version supports calling this directly
    del context  # Drops only the local reference to the context
    del engine  # Drops only the local reference to the engine
    
def run_inference(context, input_ids, attention_mask, num_labels=9):
    """
    Run one batch through the TensorRT execution context and return the logits.

    Args:
        context: TensorRT execution context created from the loaded engine.
        input_ids: NumPy array of token ids; its first two dimensions are
            used as (batch, sequence_length).
        attention_mask: NumPy array with the same layout as ``input_ids``.
        num_labels: Size of the last axis of the output logits. Defaults to 9,
            the number of labels in this notebook's label mapping.

    Returns:
        float32 NumPy array of shape (batch, sequence_length, num_labels).

    Raises:
        RuntimeError: If TensorRT reports that execution failed.

    NOTE(review): the arrays are copied to the device as-is, so their dtypes
    and shapes must match what the engine's bindings expect, and the binding
    order is assumed to be [input_ids, attention_mask, logits] -- confirm
    against the engine.
    """
    # Host -> device copies need contiguous host buffers.
    input_ids = np.ascontiguousarray(input_ids)
    attention_mask = np.ascontiguousarray(attention_mask)

    sequence_length = input_ids.shape[1]  # number of token positions per example
    output_shape = (input_ids.shape[0], sequence_length, num_labels)
    output = np.empty(output_shape, dtype=np.float32)

    device_buffers = []
    try:
        # Allocate one device buffer per binding, sized from the host arrays.
        for host_array in (input_ids, attention_mask, output):
            device_buffers.append(cuda.mem_alloc(host_array.nbytes))
        d_input_ids, d_attention_mask, d_output = device_buffers

        # Copy inputs to device memory
        cuda.memcpy_htod(d_input_ids, input_ids)
        cuda.memcpy_htod(d_attention_mask, attention_mask)

        # Run inference; execute_v2 returns False on failure, in which case the
        # output buffer would contain garbage.
        bindings = [int(d_input_ids), int(d_attention_mask), int(d_output)]
        if not context.execute_v2(bindings):
            raise RuntimeError("TensorRT execution failed")

        # Copy outputs back to host
        cuda.memcpy_dtoh(output, d_output)
    finally:
        # Release device memory now rather than waiting for garbage collection,
        # so repeated calls do not pile up GPU allocations.
        for device_buffer in device_buffers:
            device_buffer.free()

    return output

In [137]:
# location_classifier_2.trt is the engine file written by the explicit-shape
# trtexec command shown earlier in the notebook.
engine = load_engine("location_classifier_2.trt") # Load the TensorRT engine
context = engine.create_execution_context()# Create the execution context used by run_inference

In [63]:
# To unload the model, uncomment and run the line below
# deload_engine(engine, context) 

In [138]:
# Run inference
output = run_inference(context, input_ids, attention_mask)
predicted_tokens = np.argmax(output, axis=-1)
print(f"Preds class: {predicted_tokens}")

Preds class: [[0 1 2 ... 0 2 2]
 [0 7 0 ... 7 7 7]
 [0 7 0 ... 7 7 7]
 [0 5 0 ... 0 0 0]
 [0 7 0 ... 1 5 5]]


In [111]:
# Check the shape of the predictions: (number of examples, token positions).
predicted_tokens.shape

(1, 256)

In [107]:
# Same post-processing as in the ONNX test. NOTE: only valid for a single
# example, because flattening a larger batch would merge the rows.
adjusted_predictions = np.where(attention_mask == 1, predicted_tokens, -100) # mark padded positions with -100
flat_predictions = adjusted_predictions.flatten() # collapse the 2-D (one-row) array to 1-D
attention_mask_list = attention_mask.flatten().tolist() # flatten the mask and convert it to a plain list so .index() can be used
last_real_token_index = len(attention_mask_list) - 1 - attention_mask_list[::-1].index(1) # index of the last position whose mask is 1
filtered_predictions_ids = flat_predictions[:last_real_token_index+1] # keep everything up to and including the last real token, dropping the trailing -100s
predicted_token_labels = [id2label[id] for id in filtered_predictions_ids] # convert label ids to label strings
predicted_token_labels

['O', 'B-REMOTE', 'O', 'B-COUNTRY', 'I-COUNTRY', 'O']

In [64]:
input_ids_list = inputs['input_ids'].squeeze().tolist()  # drop the batch dimension and convert the array to a plain list
# Convert the token ids back to their string tokens, then remove the padding
# tokens so the list lines up with predicted_token_labels.
tokens = loaded_tokenizer.convert_ids_to_tokens(input_ids_list) 
tokens = [token for token in tokens if token != '[PAD]']
tokens

['[CLS]',
 'north',
 'beth',
 '##es',
 '##da',
 ',',
 'md',
 ',',
 'lexington',
 ',',
 'ky',
 ',',
 'remote',
 '[SEP]']

In [66]:
# Number of predicted labels; should equal the number of tokens below.
len(predicted_token_labels)

14

In [65]:
# Number of non-padding tokens; should equal the number of predicted labels.
len(tokens)

14

In [68]:
# NOTE: build_final_dict is defined further down, in the "Batch Test Helper
# functions" section. On a fresh kernel that cell must be run before this one,
# otherwise this call raises NameError.
build_final_dict(tokens, predicted_token_labels)

{'country_list': [],
 'state_list': ['md', 'ky'],
 'city_list': ['north bethesda', 'lexington'],
 'remote': True}

## Tensorrt Batch test

In [159]:
# Take the first five formatted location strings as the test batch.
locations_list = all_df['fmt_raw_location'][:5].to_list()

In [160]:
# Tokenize batch
inputs = loaded_tokenizer(
    locations_list,
    max_length=max_length,
    truncation=truncation,
    padding=padding,
    return_tensors="np",
    is_split_into_words=is_split_into_words,
)
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

# Run inference on the batch
output = run_inference(context, input_ids, attention_mask)

# Get predicted classes
predicted_tokens = np.argmax(output, axis=-1)

In [161]:
# Check the batch prediction shape: (number of examples, token positions).
predicted_tokens.shape

(5, 256)

### Batch Test Helper functions

In [197]:
def build_final_dict(tokens, pred_labels):
    """
    Merge word-piece tokens and their BIO labels into location entities.

    Tokens are walked in order together with their predicted labels:
      * '[CLS]' / '[SEP]' tokens are ignored.
      * 'B-REMOTE' sets the remote flag and is otherwise skipped.
      * 'B-<X>' closes any open entity and opens a new one of type <x>.
      * 'I-<X>' extends the open entity when its type matches; '##' pieces are
        glued on directly, whole words are joined with a space.
      * 'O' closes the open entity.

    Args:
        tokens: Token strings for one example.
        pred_labels: Predicted label strings, aligned with ``tokens``.

    Returns:
        dict with 'country_list', 'state_list', 'city_list' (lists of strings)
        and 'remote' (bool).
    """
    found = {
        "country": [],
        "state": [],
        "city": [],
        "remote": False
    }

    pieces = []        # text fragments of the entity currently being built
    open_type = None   # lower-cased type of that entity, or None

    for tok, tag in zip(tokens, pred_labels):
        # Special tokens never contribute to an entity.
        if tok in ('[CLS]', '[SEP]'):
            continue

        # Remote is a flag rather than a span, so it is handled on its own.
        if tag == 'B-REMOTE':
            found['remote'] = True
            continue

        # An outside token terminates whatever entity is open.
        if tag == 'O':
            if open_type and pieces:
                found[open_type].append(''.join(pieces))
                pieces = []
                open_type = None
            continue

        prefix, entity = tag.split('-')
        tag_type = entity.lower()

        if prefix == 'B':
            # Store the previous entity (if any) before starting the next one.
            if open_type and pieces:
                found[open_type].append(''.join(pieces))
            pieces = [tok.replace('##', '')]  # strip the word-piece marker
            open_type = tag_type
        elif prefix == 'I' and tag_type == open_type and pieces:
            # Continuation of the open entity.
            fragment = tok.replace('##', '') if tok.startswith('##') else ' ' + tok
            pieces.append(fragment)

    # An entity that runs to the end of the sequence is still open here.
    if open_type and pieces:
        found[open_type].append(''.join(pieces))

    return {
        "country_list": found['country'],
        "state_list": found['state'],
        "city_list": found['city'],
        "remote": found['remote']
    }
    
# Lower-case two-letter US state abbreviations mapped to lower-case state names.
state_abbreviation_to_name = {
    "al": "alabama", "ak": "alaska", "az": "arizona", "ar": "arkansas", "ca": "california", 
    "co": "colorado", "ct": "connecticut", "de": "delaware", "fl": "florida", "ga": "georgia", 
    "hi": "hawaii", "id": "idaho", "il": "illinois", "in": "indiana", "ia": "iowa", 
    "ks": "kansas", "ky": "kentucky", "la": "louisiana", "me": "maine", "md": "maryland", 
    "ma": "massachusetts", "mi": "michigan", "mn": "minnesota", "ms": "mississippi", 
    "mo": "missouri", "mt": "montana", "ne": "nebraska", "nv": "nevada", "nh": "new hampshire", 
    "nj": "new jersey", "nm": "new mexico", "ny": "new york", "nc": "north carolina", 
    "nd": "north dakota", "oh": "ohio", "ok": "oklahoma", "or": "oregon", "pa": "pennsylvania", 
    "ri": "rhode island", "sc": "south carolina", "sd": "south dakota", "tn": "tennessee", 
    "tx": "texas", "ut": "utah", "vt": "vermont", "va": "virginia", "wa": "washington", 
    "wv": "west virginia", "wi": "wisconsin", "wy": "wyoming"
}

# Full state names accepted by normalize_state. Derived from the table above so
# the two can never drift apart. (Previously this name was referenced but never
# defined, so any state longer than two characters raised NameError.)
us_state_names = frozenset(state_abbreviation_to_name.values())


def normalize_state(state):
    """
    Normalize a predicted state string to a title-cased US state name.

    Args:
        state: Lower-case state text, either a two-letter abbreviation
            (e.g. "md") or a full name (e.g. "new york").

    Returns:
        The title-cased full state name (e.g. "Maryland", "New York"), or
        None when the text is not a known abbreviation or state name.
    """
    if len(state) == 2:
        state_name = state_abbreviation_to_name.get(state)
        return state_name.title() if state_name else None
    if len(state) > 2 and state in us_state_names:
        return state.title()
    # Shorter than two characters, or not a recognized state.
    return None
# Lower-case country abbreviations mapped to their display names.
country_abbreviation_to_name = {
    "us": "United States",
    "usa": "United States",
}


def normalize_country(country):
    """
    Normalize a predicted country string to title case.

    Strings of at most three characters are first looked up in
    ``country_abbreviation_to_name``; if no entry exists the string itself is
    used. The result is always title-cased.

    Args:
        country: Country text as extracted from the tokens.

    Returns:
        The title-cased country name.
    """
    resolved = country
    if len(country) <= 3:
        resolved = country_abbreviation_to_name.get(country, country)
    return resolved.title()

def normalize_city(city):
    """Return the city name with the first letter of each word capitalized."""
    titled_city = city.title()
    return titled_city

def process_batch_predictions(predicted_tokens, attention_mask, input_ids, id2label, tokenizer=None):
    """
    Turn a batch of token-level class ids into normalized location entities.

    For every example the predictions are trimmed to the real (non-padded)
    tokens, converted to label strings, merged into entities with
    ``build_final_dict`` and then normalized with ``normalize_state`` /
    ``normalize_country`` / ``normalize_city``.

    Args:
        predicted_tokens: 2-D array of predicted label ids, one row per example.
        attention_mask: Array with one row per example; 1 marks a real token.
        input_ids: Array of token ids with one row per example.
        id2label: Mapping from integer label id to label string.
        tokenizer: Tokenizer used to convert ids back to tokens. Defaults to
            the notebook-level ``loaded_tokenizer`` when omitted.

    Returns:
        A list with one dict per example, each holding 'country_list',
        'state_list', 'city_list' and 'remote'. States that
        ``normalize_state`` does not recognize are dropped.
    """
    if tokenizer is None:
        tokenizer = loaded_tokenizer

    entities = []

    for i in range(predicted_tokens.shape[0]):
        mask_row = np.asarray(attention_mask[i]).flatten()
        prediction_row = np.asarray(predicted_tokens[i]).flatten()

        # Number of positions up to and including the last real token.
        # An all-padding row yields 0 instead of raising ValueError.
        real_positions = np.flatnonzero(mask_row == 1)
        n_tokens = int(real_positions[-1]) + 1 if real_positions.size else 0

        # Masked positions become -100, then the trailing padding is cut off.
        label_ids = np.where(mask_row == 1, prediction_row, -100)[:n_tokens]
        predicted_token_labels = [id2label[int(label_id)] for label_id in label_ids]

        # Slice the tokens to the same length as the labels so the two lists
        # always stay aligned. This replaces filtering on the '[PAD]' string.
        token_ids = np.asarray(input_ids[i]).flatten().tolist()
        tokens = tokenizer.convert_ids_to_tokens(token_ids)[:n_tokens]

        entity_obj = build_final_dict(tokens, predicted_token_labels)

        # normalize_state returns None for unrecognized states; drop those.
        normalized_states = (normalize_state(state) for state in entity_obj['state_list'])
        entity_obj['state_list'] = [state for state in normalized_states if state]
        entity_obj['country_list'] = [normalize_country(country) for country in entity_obj['country_list']]
        entity_obj['city_list'] = [normalize_city(city) for city in entity_obj['city_list']]

        entities.append(entity_obj)

    return entities

### Batch Test

In [198]:
# Convert the batch predictions into one normalized entity dict per input location.
process_batch_predictions(predicted_tokens, attention_mask, input_ids, id2label)

[{'country_list': [],
  'state_list': ['Maryland', 'Kentucky'],
  'city_list': ['North Bethesda', 'Lexington'],
  'remote': True},
 {'country_list': ['United Kingdom'],
  'state_list': [],
  'city_list': [],
  'remote': True},
 {'country_list': [], 'state_list': [], 'city_list': [], 'remote': True},
 {'country_list': ['United States'],
  'state_list': [],
  'city_list': ['San Francisco'],
  'remote': False},
 {'country_list': ['United States'],
  'state_list': [],
  'city_list': [],
  'remote': True}]