This notebook takes the labelled entities provided by the user, trains an NER model on them, and applies that model to the full dataset. At the end of this notebook, every item's name (the column given by `itemname_col`) will have been decomposed into its entities.

Expected input:
- Labelled NER tagging data (the user-labelled entities)
- Preprocessed files

Expected output:
- Saved NER model
- Inference output

In [1]:
import pandas as pd
import numpy as np
import utils as ut
import pickle
import hjson as json
from transformers import AutoTokenizer

import importlib
# Re-import the local `utils` module so that edits made to it since the first
# import are picked up without restarting the kernel.
importlib.reload(ut)

  warn_incompatible_dep(
2023-02-22 14:49:18.334799: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-22 14:49:18.969467: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-02-22 14:49:18.969529: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


<module 'utils' from '/home/shared/code/08_protein_attribution/utils.py'>

In [2]:
# Read in our params file. The context manager guarantees the handle is
# closed even if parsing the file raises.
with open('input_params.hjson') as f:
    params = json.load(f)

# Modelling params (all read from the 'core' section)
itemname_col = params['core']['itemname_col']
# training_sets = params['core']['training_sets']
tag_lookup = params['core']['tag_lookup']
model_name = params['core']['ner_model_name']

# Algorithm specific params: model selection comes from 'core', the training
# hyper-parameters from 'nb_two'.
use_pretrained_model = params['core']['use_pretrained_model']
model_architecture = params['core']['model_architecture']
model_path = params['core']['model_path']
batch_size = params['nb_two']['batch_size']
learning_rate = params['nb_two']['learning_rate']

In [3]:
# Train the NER model.
# The tokenizer is selected by `model_architecture` and wrapped in the
# project's BERT preprocessor; the outputs of `ut.loadPreprocess` are then
# handed to `ut.loadTrainModel` together with the configured settings.
tokenizer = AutoTokenizer.from_pretrained(model_architecture)
bertPreproc = ut.BERTPreprocess(tokenizer)

X_train, Y_train, schema = ut.loadPreprocess(
    model_name,
    'item_for_selection',
    "_",
    bertPreproc,
)

model = ut.loadTrainModel(
    schema,
    X_train,
    Y_train,
    use_pretrained_model=use_pretrained_model,
    model_path=model_path,
    model_architecture=model_architecture,
    batch_size=batch_size,
    learning_rate=learning_rate,
)



60it [00:00, 4065.82it/s]
2023-02-22 14:49:21.314902: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-22 14:49:21.315258: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-22 14:49:21.343684: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-22 14:49:21.344023: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-22 14:49:21.344989: I tensorflow/c

Epoch 1/100


2023-02-22 14:49:41.240375: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x595cce60 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-02-22 14:49:41.240409: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Quadro RTX 8000, Compute Capability 7.5
2023-02-22 14:49:41.240414: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (1): Quadro RTX 8000, Compute Capability 7.5
2023-02-22 14:49:41.240417: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (2): Quadro RTX 8000, Compute Capability 7.5
2023-02-22 14:49:41.240421: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (3): Quadro RTX 8000, Compute Capability 7.5
2023-02-22 14:49:41.244842: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-02-22 14:49:41.310448: I tensorfl

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100


In [5]:
# Use our trained model to infer on the full catalog.
item_df = pd.read_csv(f'named_entity_recognition/{model_name}/data/{model_name}_preprocessed.csv')
# Remember the input column order so it can be used to select columns later.
cols = list(item_df.columns)
results = ut.modelInferAndFormat(item_df, itemname_col, bertPreproc, schema, model, tag_lookup)

# Replace the clust_input with the NER results: copy the `protein` column into
# `clust_input` on the results, drop the old `clust_input` from the catalog,
# and left-join the new one back on the item-name column. Missing values in
# the merged frame are filled with the empty string.
results['clust_input'] = results['protein']
ner_clust_input = results[['clust_input', itemname_col]]
item_df = (
    item_df
    .drop(columns=['clust_input'])
    .merge(ner_clust_input, on=itemname_col, how='left')
    .fillna('')
)

Unnamed: 0,lineitem_name,tier_1,tier_2,tier_3,tier_4,sales_amt_gross,item_for_selection,clust_input
0,Grits Large,Food,Breakfast,Breakfast Side,Grits,78552.240000,grits large,grits large
1,Large Boat,Food,Entree,Weighed/Build Your Own,Weighed/Build Your Own,190793.963333,large boat,large boat
2,Kitchen Fresh 1137 Italian Focaccia (7.7oz),Food,Entree,Sandwich/Wrap,Sandwich/Wrap,84461.820000,kitchen fresh 1137 italian focaccia 7 7oz,kitchen fresh 1137 italian focaccia 7 7oz
3,Adobo Chicken Bowl,Food,Entree,Noodle/Grain Bowl,Noodle/Grain Bowl,236852.690000,adobo chicken bowl,adobo chicken bowl
4,8oz Steel Cut Oatmeal (1,Food,Breakfast,Oatmeal/Cereal,Oatmeal,125298.810000,8oz steel cut oatmeal 1,8oz steel cut oatmeal 1
...,...,...,...,...,...,...,...,...
460910,Spicy Pepperjack Burger,Food,Entree,Burger,Burger,5.490000,spicy pepperjack burger,spicy pepperjack burger
460911,Charlotte SP White Egg Salad (6oz),Food,Entree,Other Entree,Other Entree,2.990000,charlotte sp white egg salad 6oz,charlotte sp white egg salad 6oz
460912,Crisper and Waffle Combo,Food,Breakfast,Griddle,Waffles,11.890000,crisper and waffle combo,crisper and waffle combo
460913,BFK - FIT Applewood Bacon Egg & Cheddar Flatbread,Food,Breakfast,Breakfast Sandwiches,Breakfast Sandwich/Wrap,0.000000,bfk fit applewood bacon egg cheddar flatbread,bfk fit applewood bacon egg cheddar flatbread


In [34]:
# Save the results: the catalog (restricted to the columns captured in `cols`)
# as CSV, and the trained model via its `save_pretrained` method.
item_df[cols].to_csv(f"named_entity_recognition/{model_name}/data/{model_name}_round1results.csv",index = False)
model.save_pretrained(f'named_entity_recognition/{model_name}/models/{model_name}')

# Persist the extra values stored alongside the model: the configured model
# path and the schema produced during preprocessing.
out_dict = {
    'vec_modelpath': model_path,
    'schema': schema
}
# Open the pickle file in a context manager so the handle is flushed and
# closed deterministically instead of being left to garbage collection.
with open(f"named_entity_recognition/{model_name}/models/{model_name}_model_params.p", 'wb') as params_file:
    pickle.dump(out_dict, params_file)

#### Validation tests

In [35]:
### This section is for exploring model results

# Count how many result rows fall into each rounded-confidence value.
confidence_counts = results['rounded_confidence'].value_counts()
confidence_counts

0.9    458662
Name: rounded_confidence, dtype: int64

In [36]:
# Display the inference results frame for a visual spot check.
results

Unnamed: 0,lineitem_name,protein,not_protein,rounded_confidence,clust_input
0,Grits Large,,Grits Large,0.9,
1290452,Classic Greek (Dairy),,Classic Greek (Dairy),0.9,
1290448,Soup - Tom Yum,,Soup - Tom Yum,0.9,
1290443,American Asian - Chicken Tenders,,American Asian - Chicken Tenders,0.9,
1290439,Vegan Homestyle Chikn Tenders,,Vegan Homestyle Chikn Tenders,0.9,
...,...,...,...,...,...
620632,Shrimp Chorizo and Bacon Taco 2,,Shrimp Chorizo and Bacon Taco 2,0.9,
620629,Fricasse de Pollo,,Fricasse de Pollo,0.9,
620626,WELL-DELI-Small Soup A,,WELL-DELI-Small Soup A,0.9,
620683,Half Entree (3.50),,Half Entree (3.50),0.9,


In [38]:
# Count the distinct values extracted into the `protein` column.
# NOTE(review): in the output saved with this notebook every row has an empty
# `protein` value — confirm the model and `tag_lookup` are extracting entities
# as intended before relying on the results.
results['protein'].value_counts()

    458662
Name: protein, dtype: int64