# README

This File is used to take the multi classification job type pytorch model to onnx model and then to optomized tensorrt model.

# Imports

In [1]:
import torch

from transformers import BertTokenizer, DistilBertTokenizer, AutoModelForTokenClassification
import onnx
import tensorrt as trt
import onnxruntime as ort
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import json
import os
import pandas as pd

In [2]:
# Sanity Check 
print(os.getcwd())

/home/connor/Documents/jobs/jobs/Job_Type_Model


# Create Production Model

## Tokenizer and Model load

In [36]:
out_put_dir = "./jobLevel_multi_bert_all_1"
tokeninzer_out_put_dir = out_put_dir+"_tokenizer"
loaded_tokenizer = DistilBertTokenizerFast.from_pretrained(tokeninzer_out_put_dir)
with open(tokeninzer_out_put_dir + "/tokenizer_config.json", "r") as f:
    tokenizer_config = json.load(f)

max_length = tokenizer_config.get('max_length', 256)
truncation = tokenizer_config.get('truncation', True)
padding = tokenizer_config.get('padding', 'max_length')


loaded_model = DistilBertForSequenceClassification.from_pretrained(out_put_dir)
loaded_model.eval()

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizerFast'.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30524, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

## Data Load

In [5]:
quan_df = pd.read_csv('job_type_data/model_labeled.csv', index_col=0)

## Create Input Example

In [14]:
text = quan_df.iloc[0]['input_row']
inputs = loaded_tokenizer(
    text,
    max_length=max_length,
    truncation=truncation,
    padding=padding,
    return_tensors="pt"
)

## Onx model build

In [56]:
onnx_model_name= "industry_classifier.onnx"
torch.onnx.export(
    loaded_model,
    (inputs["input_ids"], inputs["attention_mask"]),
    onnx_model_name,
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes={
        "input_ids": {0: "batch_size"}, 
        "attention_mask": {0: "batch_size"}, 
        "logits": {0: "batch_size"}
    },
    opset_version=11  # ONNX set version for compatibility
)

## Create the .trt model using terminal
Lower precision less memory 
```
trtexec --onnx=industry_classifier.onnx --saveEngine=location_classifier.trt --fp16
```
Lower precision with batch size 1 or 2
```
trtexec --onnx=industry_classifier.onnx --saveEngine=location_classifier_v2.trt --fp16 \
        --minShapes=input_ids:1x512,attention_mask:1x512 \ 
        --optShapes=input_ids:2x512,attention_mask:2x512 \
        --maxShapes=input_ids:2x512,attention_mask:2x512
```
Batch size 1 or 2 
```
trtexec --onnx=industry_classifier.onnx --saveEngine=location_classifier_v3.trt \
        --minShapes=input_ids:1x512,attention_mask:1x512 \
        --optShapes=input_ids:2x512,attention_mask:2x512 \
        --maxShapes=input_ids:2x512,attention_mask:2x512
```

# TensorRT Model test and usage

In [14]:
def load_engine(trt_engine_path):
    """
    Loads the Tensorrt engine
    """
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    with open(trt_engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())
        
def deload_engine(engine, context):
    """Function to clean up the engine and context."""
    if context:
        context.__del__()  # Explicitly call context destructor
    del engine  # Delete the engine object to free memory
    del context  # Delete the context object to free memory

def run_inference(context, input_ids, attention_mask):
    """
    Run Input through tensorrt model
    """    
    # Allocate memory for inputs and outputs on the GPU
    d_input_ids = cuda.mem_alloc(input_ids.nbytes)
    d_attention_mask = cuda.mem_alloc(attention_mask.nbytes)

    # Prepare output buffer 
    output_shape = (input_ids.shape[0], 11)
    d_output = cuda.mem_alloc(int(np.prod(output_shape) * np.dtype(np.float32).itemsize))

    # Copy inputs to device memory
    cuda.memcpy_htod(d_input_ids, input_ids)
    cuda.memcpy_htod(d_attention_mask, attention_mask)

    # Run inference
    bindings = [int(d_input_ids), int(d_attention_mask), int(d_output)]
    context.execute_v2(bindings)

    # Copy outputs back to host
    output = np.empty(output_shape, dtype=np.float32)
    cuda.memcpy_dtoh(output, d_output)

    return output

In [15]:
# Load the TensorRT engine
engine = load_engine("industry_classifier.trt")
context = engine.create_execution_context()

[10/06/2024-17:42:50] [TRT] [W] Using an engine plan file across different models of devices is not recommended and is likely to affect performance or even cause errors.


In [7]:
# To Deload the model 
# deload_engine(engine, context) 

## Single Example Data Load

In [10]:
# Load one example input
text = quan_df.iloc[0]['input_row']
print(text)
inputs = loaded_tokenizer(
    text,
    max_length=max_length,
    truncation=truncation,
    padding=padding,
    return_tensors="np"
)

[TITLE] k-5 elementary teacher for 2024-2025 school year [DESC] elementary teacher reports to school director flsa classification certificated, 190-day full time part time status salary, full time, exempt regular temporary regular salary teacher framework steps 1-32 range $63,280- $112,776. salary is commensurate with experience and education level. developed by a coalition of san diego civic leaders and educators, high tech high opened in september 2000 as a small public charter school with plans to serve approximately 450 students. hth has evolved into an integrated network of sixteen charter schools serving approximately 6,350 students in grades k-12 across four campuses. located in san diego county, california, high tech high hth is guided by four connected design principlesequity, personalization, authentic work, and collaborative designthat set aspirational goals and create a foundation for understanding our approach. if you are motivated by these principles, we encourage you to 

In [11]:
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

In [16]:
# Run inference
output = run_inference(context, input_ids, attention_mask)
predicted_class = np.argmax(output, axis=-1)
print(f"Predicted class: {predicted_class}")

Predicted class: [6]


In [60]:
list(predicted_class)[0]

6

## Batch prediction with tensorrt

In [17]:
def predict_batch(texts, tokenizer, context, batch_size=32):
    """
    This function tokenizes a batch of texts and performs inference on them using the TensorRT engine.
    
    Args:
        texts (list): List of input texts.
        tokenizer: Hugging Face tokenizer.
        context: TensorRT context for inference.
        batch_size (int): Batch size for inference.
    
    Returns:
        List of predicted class labels for each input text.
    """
    predictions = []
    
    # Process in batches
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i + batch_size]
        
        # Tokenize batch
        inputs = tokenizer(
            batch_texts,
            max_length=max_length,
            truncation=truncation,
            padding=padding,
            return_tensors="np"
        )
        
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        
        # Run inference on the batch
        output = run_inference(context, input_ids, attention_mask)
        
        # Get predicted classes
        predicted_classes = np.argmax(output, axis=-1)
        predictions.extend(predicted_classes)
    
    return predictions

def batch_inference(df, input_col, tokenizer, context, batch_size=32):
    """
    Function to perform batch inference on a Pandas DataFrame column using a TensorRT engine.
    
    Args:
        df (pd.DataFrame): DataFrame containing the input texts.
        input_col (str): Name of the column containing input texts.
        tokenizer: Hugging Face tokenizer.
        context: TensorRT context.
        batch_size (int): Batch size for inference.
    
    Returns:
        pd.Series: Series of predicted classes for each row in the DataFrame.
    """
    texts = df[input_col].tolist()
    
    start_time = time.time()

    # Perform the batch prediction
    predicted_classes = predict_batch(texts, tokenizer, context, batch_size=batch_size)

    end_time = time.time()
    total_time = end_time - start_time

    print(f"Batch inference completed in {total_time:.2f} seconds")

    return pd.Series(predicted_classes, index=df.index)

In [18]:
quan_df['batch'] = batch_inference(quan_df, 'input_row', loaded_tokenizer, context, batch_size=256)


100%|██████████| 40/40 [01:07<00:00,  1.68s/it]

Batch inference completed in 67.05 seconds





In [19]:
quan_df['batch']

44533    6
61014    1
7677     0
60161    0
16171    0
        ..
66393    0
29270    0
147      0
12659    0
60326    0
Name: batch, Length: 10000, dtype: int64

In [21]:
trial = quan_df.iloc[:100]
trial

Unnamed: 0,company,title,description,title_processed,description_processed,description_processed_stop,title_processed_stop,input_row,labels,label_str
44533,hightechhigh,K-5 Elementary Teacher for 2024-2025 School Year,Elementary Teacher Reports To: School Direc...,k-5 elementary teacher for 2024-2025 school year,elementary teacher reports to school director ...,elementary teacher reports school director fls...,k-5 elementary teacher 2024-2025 school year,[TITLE] k-5 elementary teacher for 2024-2025 s...,6,education
61014,lowes::wd5::LWS_External_CS,In Home Consultant,What You Will Do All Lowes associates deliver ...,in home consultant,what you will do all lowes associates deliver ...,lowes associates deliver quality customer serv...,home consultant,[TITLE] in home consultant [DESC] what you wil...,4,service industry
7677,carrier::wd5::jobs,Indirect Purchasing,Country: ThailandLocation: Carrier (Thailand) ...,indirect purchasing,country thailandlocation carrier thailand limi...,country thailandlocation carrier thailand limi...,indirect purchasing,[TITLE] indirect purchasing [DESC] country tha...,4,service industry
60161,csl::wd1::CSL_External,Quality Specialist,The OpportunityThis position provides immediat...,quality specialist,the opportunitythis position provides immediat...,opportunitythis position provides immediate su...,quality specialist,[TITLE] quality specialist [DESC] the opportun...,1,medical
16171,myhrhome::wd1::OneMainCareers,Consumer Loan Sales Specialist,"At OneMain, Consumer Loan Sales Specialists em...",consumer loan sales specialist,"at onemain, consumer loan sales specialists em...","onemain , consumer loan sales specialists empo...",consumer loan sales specialist,[TITLE] consumer loan sales specialist [DESC] ...,11,finance
...,...,...,...,...,...,...,...,...,...,...
40882,michaels::wd5::External,Full Time Custom Framing Department Manager,"Store - PALM BEACH-PALM BEACH GARDENS, FLLead ...",full time custom framing department manager,"store - palm beach-palm beach gardens, fllead ...","store - palm beach-palm beach gardens , fllead...",full time custom framing department manager,[TITLE] full time custom framing department ma...,5,retail
18434,labcorp::wd1::External,Lab Assistant,At Labcorp we have a passion in helping people...,lab assistant,at labcorp we have a passion in helping people...,labcorp passion helping people live happy heal...,lab assistant,[TITLE] lab assistant [DESC] at labcorp we hav...,6,education
30970,pwc::wd3::Global_Experienced_Careers,Director,Line of ServiceAdvisoryIndustry/SectorFS X-Sec...,director,line of serviceadvisoryindustry sectorfs x-sec...,line serviceadvisoryindustry sectorfs x-sector...,director,[TITLE] director [DESC] line of serviceadvisor...,10,hr
57966,Jobnath,Required DRAFTSMAN,Required DRAFTSMANa) Establishing liaison with...,required draftsman,required draftsmana establishing liaison with ...,required draftsmana establishing liaison engin...,required draftsman,[TITLE] required draftsman [DESC] required dra...,9,skilled trades


## Compare Tensorrt and Pytorch model speeds
The Goal of these cells was to test the speed up of tensorrt models vs pytorch models. 

In [None]:
text = quan_df.iloc[0]['input_row']
print(text)
inputs = loaded_tokenizer(
    text,
    max_length=max_length,
    truncation=truncation,
    padding=padding,
    return_tensors="np"
)

In [22]:
def predict_fast(text):
    """
    Predict function for tensorrt model
    """
    inputs = loaded_tokenizer(
        text,
        max_length=max_length,
        truncation=truncation,
        padding=padding,
        return_tensors="np"
    )
    output = run_inference(context, inputs['input_ids'], inputs['attention_mask'])
    predicted_class = np.argmax(output, axis=-1)
    return predicted_class

def predict_class(text):
    """
    Predict function for pytorch model
    """
    inputs = loaded_tokenizer(
        text,
        max_length=max_length,
        truncation=truncation,
        padding=padding,
        return_tensors="pt"
    ).to('cuda')
    
    with torch.no_grad():
        outputs = loaded_model(**inputs)
    
    logits = outputs.logits.cpu()
    predicted_class = torch.argmax(logits, dim=-1).item()
    
    return predicted_class

In [42]:
quan_df['slow'] = quan_df['input_row'].progress_apply(predict_class)

100%|██████████| 10000/10000 [01:36<00:00, 103.47it/s]


In [23]:
quan_df['fast'] = quan_df['input_row'].progress_apply(predict_fast)

100%|██████████| 10000/10000 [01:25<00:00, 117.09it/s]


In [24]:
quan_df['fast']

44533    [6]
61014    [4]
7677     [4]
60161    [1]
16171    [7]
        ... 
66393    [4]
29270    [4]
147      [4]
12659    [9]
60326    [4]
Name: fast, Length: 10000, dtype: object

##  Test concurrent preprocessing 

The goal of these cells was to test if concurrent pre processing would see any speed ups in processing time.

In [22]:
def preprocess_text_s( text): 
    """Pre Process Text"""
    if text is None:
        return text

    # Fix Contractions
    text = contractions.fix(text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove non-standard UTF-8 characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Normalize accented characters to their ASCII equivalents
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

    # Remove bullet points (at the start of a line or following whitespace)
    text = re.sub(r'(^[\s]*[\u2022\u2023\u25E6\u2043\u2219*•-]\s+)', '', text, flags=re.MULTILINE)

    # Replace URLs with "URL"
    text = re.sub(r'http\S+|www.\S+', 'URL', text)

    # Replace phone numbers with "PHONE_NUMBER"
    text = re.sub(r'\(?\b\d{3}[-.)\s]*\d{3}[-.\s]*\d{4}\b', 'PHONE_NUMBER', text)

    # Replace email addresses with "EMAIL"
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', 'EMAIL', text)

    # Replace more than three consecutive punctuation marks with a single instance
    text = re.sub(r'([^\w\s])\1{2,}', r'\1', text)

    # Remove parentheses, curly braces, and square brackets
    text = re.sub(r'[\(\)\{\}\[\]]', ' ', text)

    # Remove special characters 
    text = re.sub(r'[:;@\/\\*#!?]', ' ', text)

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def build_input( title, description):
    """"Build Input Row"""
    return f"[TITLE] {preprocess_text_s(title)} [DESC] {preprocess_text_s(description)}"


def process_items_parallel(items):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = list(executor.map(lambda item: build_input(item[0], item[1]), items))
    return results


In [35]:
new_trail = quan_df[['title', 'description']].values.tolist()

In [38]:
import time

In [39]:
start = time.time()
processed_trail = process_items_parallel(new_trail)
end = time.time()
print(end-start)

13.007951736450195


In [40]:
start = time.time()
new_list = []
for item in new_trail:
    new_list.append(build_input(item[0], item[1]))
end = time.time()
print(end-start)

6.360122442245483


In [37]:
processed_trail[0]

'[TITLE] k-5 elementary teacher for 2024-2025 school year [DESC] elementary teacher reports to school director flsa classification certificated, 190-day full time part time status salary, full time, exempt regular temporary regular salary teacher framework steps 1-32 range $63,280- $112,776. salary is commensurate with experience and education level. developed by a coalition of san diego civic leaders and educators, high tech high opened in september 2000 as a small public charter school with plans to serve approximately 450 students. hth has evolved into an integrated network of sixteen charter schools serving approximately 6,350 students in grades k-12 across four campuses. located in san diego county, california, high tech high hth is guided by four connected design principlesequity, personalization, authentic work, and collaborative designthat set aspirational goals and create a foundation for understanding our approach. if you are motivated by these principles, we encourage you to