In [1]:
import tensorflow as tf
# Turn on memory growth for every GPU TensorFlow reports. Per the TensorFlow
# docs this makes the runtime allocate GPU memory on demand instead of
# reserving the whole device at start-up; it must be set before the GPUs are
# initialised, which is why it sits directly after the tensorflow import.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
  tf.config.experimental.set_memory_growth(gpu, True)

from sklearn.metrics import classification_report

from model import BertNer
import json
import os
from optimization import AdamWeightDecay, WarmUp
from tokenization import FullTokenizer
from utils.foodstyle_utils import get_sentence_len_histogram, convert_logit_to_labels, get_ingredients_and_positions
from prepare_data import NerProcessor, convert_examples_to_features, recipe_to_sentences

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

2022-04-24 19:01:36.044672: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2022-04-24 19:01:36.957096: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2022-04-24 19:01:37.001255: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-24 19:01:37.001486: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 2060 computeCapability: 7.5
coreClock: 1.35GHz coreCount: 30 deviceMemorySize: 5.79GiB deviceMemoryBandwidth: 245.91GiB/s
2022-04-24 19:01:37.001511: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2022-04-24 19:01:37.003367: I tensorflow/stream_executor/pl

In [2]:
from prepare_data import readcsvfile

In [3]:
# Load the recipe CSV; readcsvfile returns a pair that is unpacked here as the
# training data and the evaluation data.
train_data, eval_data = readcsvfile('./data/recipes.csv')

In [None]:
# The model was trained on individual sentences rather than on whole recipes;
# a possible next step is to feed the full recipe into training.
#
# Build one sentence-length histogram per split. The .keys()/.values() calls in
# the plotting cells show these are dict-like objects.
# NOTE: the "sentece" spelling is kept because later cells reference these names.
train_data_sentece_len_dist = get_sentence_len_histogram(train_data)
eval_data_sentece_len_dist = get_sentence_len_histogram(eval_data)


In [None]:
# Bar chart of the training-set sentence-length histogram.
# Uses the explicit Axes interface and labels the figure so it can be read on
# its own; plt.show() keeps the BarContainer repr out of the cell output.
fig, ax = plt.subplots()
ax.bar(
    list(train_data_sentece_len_dist.keys()),
    list(train_data_sentece_len_dist.values()),
)
ax.set(
    title="Training set: sentence length distribution",
    xlabel="Sentence length",
    ylabel="Number of sentences",
)
plt.show()

In [12]:
# Bar chart of the evaluation-set sentence-length histogram.
# Uses the explicit Axes interface and labels the figure so it can be read on
# its own; plt.show() keeps the BarContainer repr out of the cell output.
fig, ax = plt.subplots()
ax.bar(
    list(eval_data_sentece_len_dist.keys()),
    list(eval_data_sentece_len_dist.values()),
)
ax.set(
    title="Evaluation set: sentence length distribution",
    xlabel="Sentence length",
    ylabel="Number of sentences",
)
plt.show()

We can see that the sentence-length distribution is almost uniform, with most sentences having a length between 12 and 24.

In [2]:
# Load the fine-tuned NER model from a saved training run.
# All artefacts (vocabulary, BERT config, weights) live in one run directory,
# so the path is named once instead of being repeated per file.
model_dir = 'output/2022-4-24-14-21-9'
tokenizer = FullTokenizer(os.path.join(model_dir, "vocab.txt"), do_lower_case=True)

max_seq_length = 50
processor = NerProcessor()
label_list = processor.get_labels()
# Label ids are numbered from 1, so one extra class is added for id 0
# (presumably the padding id -- confirm against the training code).
num_labels = len(label_list) + 1
label_map = {i : label for i, label in enumerate(label_list,1)}

# Read the config inside a context manager so the file handle is closed.
with open(os.path.join(model_dir, "bert_config.json")) as config_file:
    config = json.load(config_file)
ner = BertNer(config, tf.float32, num_labels, max_seq_length)

# Run one dummy batch through the model before loading weights. Presumably
# BertNer is a subclassed Keras model whose variables are only created on the
# first call -- confirm. The dummy input is sized from max_seq_length so it
# cannot drift from the value passed to the constructor.
ids = tf.ones((1, max_seq_length), dtype=tf.int32)
_ = ner(ids, ids, ids, ids, training=False)
ner.load_weights(os.path.join(model_dir, "model-8.h5"))

2022-04-24 19:01:49.901049: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-24 19:01:49.901567: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-24 19:01:49.901807: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 2060 computeCapability: 7.5
coreClock: 1.35GHz coreCount: 30 deviceMemorySize: 5.79GiB deviceMemoryBandwidth: 245.91GiB/s
2022-04-24 19:01:49.901866: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), 

In [9]:
# Free-text recipe used to exercise the inference pipeline end to end.
sample_recipe = """In a large bowl, combine flour, baking powder, baking soda, salt, cinnamon, nutmeg, brown sugar, and  oats. Add apple, nuts, raisins, eggs, milk, and oil.  Mix until dry ingredients are moistened.**Bake for 55 to 60 minutes, or until done.  Cool on wire rack.
"""
# Break the recipe into sentence/label pairs; an empty ingredient list is
# passed (presumably because no gold ingredients are known at inference time
# -- confirm against recipe_to_sentences).
sentences_labels_pair = recipe_to_sentences(sample_recipe, ingrs=[])
# NOTE(review): _create_examples is an underscore-prefixed method of
# NerProcessor; calling it from the notebook relies on a non-public API.
sample_example = processor._create_examples(sentences_labels_pair, 'test')
# Tokenize each example and build the model inputs for max_seq_length.
sample_features = convert_examples_to_features(sample_example, label_list, max_seq_length, tokenizer)


0it [00:00, ?it/s]04/24/2022 19:14:52 - INFO - prepare_data -   *** Example ***
04/24/2022 19:14:52 - INFO - prepare_data -   guid: test-0
04/24/2022 19:14:52 - INFO - prepare_data -   tokens: in a large bowl , combine flour , baking powder , baking soda , salt , c ##innamon , nut ##me ##g , brown sugar , and o ##ats .
04/24/2022 19:14:52 - INFO - prepare_data -   input_ids: 101 1107 170 1415 7329 117 12479 15068 117 26377 10794 117 26377 18402 117 6870 117 172 23339 117 22664 3263 1403 117 3058 6656 117 1105 184 9971 119 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
04/24/2022 19:14:52 - INFO - prepare_data -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
04/24/2022 19:14:52 - INFO - prepare_data -   segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
04/24/2022 19:14:52 - INFO - prepare_data -   *** Example ***
04/24/2022 19:14:52 - INFO - prepare_data -   guid: t

In [4]:
def _as_int32_tensor(rows):
    """Convert a list of per-example values to an int32 tensor via NumPy."""
    return tf.convert_to_tensor(np.asarray(rows, dtype=np.int32))


# One tensor per feature field, with one entry per item in sample_features.
all_input_ids = _as_int32_tensor([feat.input_ids for feat in sample_features])
all_input_mask = _as_int32_tensor([feat.input_mask for feat in sample_features])
all_segment_ids = _as_int32_tensor([feat.segment_ids for feat in sample_features])
all_valid_ids = _as_int32_tensor([feat.valid_ids for feat in sample_features])
all_label_ids = _as_int32_tensor([feat.label_id for feat in sample_features])

In [5]:
# Forward pass in inference mode (training=False) over all sample sentences.
# NOTE(review): in the recorded output each row sums to ~1, so these values
# look like per-label probabilities rather than raw logits -- confirm against
# BertNer before relying on the name.
logits = ner(all_input_ids, all_input_mask, all_segment_ids, all_valid_ids, training=False)

In [13]:
# Display the model output tensor for inspection.
logits

<tf.Tensor: shape=(6, 50, 5), dtype=float32, numpy=
array([[[2.9373293e-06, 1.2242394e-04, 1.4227959e-05, 9.9981755e-01,
         4.2871641e-05],
        [1.8990738e-07, 9.9995625e-01, 2.8835315e-05, 1.7505897e-06,
         1.2902101e-05],
        [2.1512477e-07, 9.9996018e-01, 2.4754383e-05, 1.7088914e-06,
         1.3102019e-05],
        ...,
        [2.9373293e-06, 1.2242394e-04, 1.4227959e-05, 9.9981755e-01,
         4.2871641e-05],
        [2.9373293e-06, 1.2242394e-04, 1.4227959e-05, 9.9981755e-01,
         4.2871641e-05],
        [2.9373293e-06, 1.2242394e-04, 1.4227959e-05, 9.9981755e-01,
         4.2871641e-05]],

       [[2.8550187e-06, 1.2468622e-04, 1.4352357e-05, 9.9981564e-01,
         4.2502164e-05],
        [2.0830467e-07, 9.9996555e-01, 2.0662570e-05, 1.6479946e-06,
         1.1912990e-05],
        [4.2691488e-07, 1.9341878e-03, 9.9805647e-01, 3.2976293e-06,
         5.6373215e-06],
        ...,
        [2.8550187e-06, 1.2468622e-04, 1.4352357e-05, 9.9981564e-01,
     

In [6]:
# Decode the model output into label strings and the matching word-piece
# tokens, then print both lists for a quick visual check.
pred, tokens = convert_logit_to_labels(logits, all_label_ids, label_map, all_input_ids, tokenizer)
print(pred, tokens)

['N-ING', 'N-ING', 'N-ING', 'N-ING', 'N-ING', 'ING', 'ING', 'ING', 'ING', 'ING', 'ING', 'ING', 'ING', 'ING', 'ING', 'N-ING', 'ING', '[SEP]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', 'N-ING', 'ING', 'N-ING', 'ING', 'ING', 'ING', 'N-ING', 'ING', '[SEP]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', 'N-ING', 'N-ING', 'N-ING', 'N-ING', 'N-ING', 'N-ING', '[SEP]', '[CLS]', 'N-ING', 'N-ING', 'N-ING', 'N-ING', 'N-ING', 'N-ING', 'N-ING', 'N-ING', 'N-ING', '[SEP]', '[CLS]', '[CLS]', 'N-ING', 'N-ING', 'N-ING', 'N-ING', '[SEP]', '[SEP]'] ['in', 'a', 'large', 'bowl', ',', 'combine', 'flour', ',', 'baking', 'powder', ',', 'baking', 'soda', ',', 'salt', ',', 'c', '##innamon', ',', 'nut', '##me', '##g', ',', 'brown', 'sugar', ',', 'and', 'o', '##ats', '.', 'add', 'apple', ',', 'nuts', ',', 'r', '##ais', '##ins', ',', 'eggs', ',', 'milk', ',', 'and', 'oil', '.', 'mix', 'until', 'dry', 'ingredients', 'are', 'moist', '##

In [8]:
# Number of predicted labels across all sample sentences.
print(len(pred))

72


In [7]:
# Collapse the per-token predictions into ingredient spans. The recorded output
# shows [text, start, end] entries -- presumably character offsets; confirm
# against get_ingredients_and_positions.
result = get_ingredients_and_positions(pred, tokens)

In [8]:
# Display the extracted ingredient spans.
result

[['combine flour', 17, 31],
 ['baking powder', 32, 46],
 ['baking soda', 47, 59],
 ['salt', 60, 65],
 ['cinnamon', 67, 75],
 ['apple', 108, 114],
 ['nuts', 115, 120],
 ['raisins', 122, 129]]