In [1]:
# Menu-item NER with BERT and GloVe embeddings in Spark NLP

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

import sparknlp
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *


ModuleNotFoundError: No module named 'pyspark'

In [2]:
# Root folder (relative to this notebook) that holds the CoNLL-formatted datasets.
data_folder = r"../../Dataset/food/Text/MyData/"

# Start the Spark NLP session with gpu=True and report the library versions in use.
spark = sparknlp.start(gpu=True)
for _label, _value in (
    ("Spark NLP version: ", sparknlp.version()),
    ("Apache Spark version: ", spark.version),
):
    print(_label, _value)

Spark NLP version:  2.6.2
Apache Spark version:  2.4.7


In [3]:
# Peek at the raw CoNLL training file before it is parsed into a Spark DataFrame.
# Judging by the preview, each non-blank line holds four space-separated fields
# (token, POS placeholder, chunk placeholder, NER tag) and blank lines separate sentences.
path = data_folder + "CoNLL_2003_dataset/project_data/eng.train"

# Explicit encoding so the preview does not depend on the machine's locale default.
# NOTE(review): assumes the dataset is stored as UTF-8 - confirm.
with open(path, encoding="utf-8") as f:
    c = f.read()

print(c[:500])

-DOCSTART- -X- -X- O

Ordered _ O O
biryani _ O I-MENU
for _ O O
tonight's _ O O
supper _ O O
. _ O O

I _ O O
wasn't _ O O
expecting _ O O
the _ O O
best _ O O
biryani _ O B-MENU
because _ O O
of _ O O
the _ O O
location _ O O
, _ O O
I _ O O
need _ O O
a _ O O
hot _ O O
plate _ O O
of _ O O
food _ O O
good _ O O
food _ O O
. _ O O

That's _ O O
it _ O O
. _ O O
. _ O O

Biryani _ O B-MENU
was _ O O
too _ O O
dry _ O O
, _ O O
raitha _ O B-MENU
was _ O O
little _ O O
sour _ O O
so _ O O
ended _


In [4]:
from sparknlp.training import CoNLL

# Parse the CoNLL training file into a DataFrame; the preview shows the
# text/document/sentence/token/pos/label columns produced by the reader.
conll_reader = CoNLL()
training_data = conll_reader.readDataset(spark, path)
training_data.show(3)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Ordered biryani f...|[[document, 0, 37...|[[document, 0, 37...|[[token, 0, 6, Or...|[[pos, 0, 6, _, [...|[[named_entity, 0...|
|I wasn't expectin...|[[document, 0, 99...|[[document, 0, 99...|[[token, 0, 0, I,...|[[pos, 0, 0, _, [...|[[named_entity, 0...|
|       That's it . .|[[document, 0, 12...|[[document, 0, 12...|[[token, 0, 5, Th...|[[pos, 0, 5, _, [...|[[named_entity, 0...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



In [5]:
# Number of rows in the parsed training DataFrame.
training_data.count()

9003

In [11]:
import numpy as np

# Inspect the raw "sentence" annotation of the first training row.
# (Despite the variable name, no embeddings exist yet at this point.)
first_sentence_rows = training_data.select("sentence").take(1)
emb_vector = np.array(first_sentence_rows)
emb_vector

array([[[['document', 0, 37, "Ordered biryani for tonight's supper .",
          {'sentence': '0'}, list([])]]]], dtype=object)

In [12]:
# Word-embedding stage: pretrained BERT ('bert_base_cased', English) reads the
# "document" and "token" columns and writes one vector per token to "bert"
# (768 dimensions per word, per the original note).
# NOTE(review): a *cased* checkpoint is combined with setCaseSensitive(False); the
# previews further down show lower-cased tokens in bert.result. If this is changed,
# the inference pipeline and the saved NER model must be changed/retrained with it.
bert_annotator = (
    BertEmbeddings.pretrained('bert_base_cased', 'en')
    .setInputCols(["document", 'token'])
    .setOutputCol("bert")
    .setCaseSensitive(False)
)

bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


In [13]:
from sparknlp.training import CoNLL

# Load the held-out CoNLL file, attach BERT embeddings and persist the result so the
# NER trainer can read it back as its test dataset.
test_data = CoNLL().readDataset(spark, data_folder+"CoNLL_2003_dataset/eng.testa")
test_data = bert_annotator.transform(test_data)
# mode("overwrite"): the default save mode raises when the path already exists,
# which made this cell fail on every re-run of the notebook.
test_data.write.mode("overwrite").parquet("MenuNER/bert_embeddings.parquet")
test_data.show(3)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|                bert|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|It's delicious fo...|[[document, 0, 78...|[[document, 0, 78...|[[token, 0, 3, It...|[[pos, 0, 3, _, [...|[[named_entity, 0...|[[word_embeddings...|
|Food was hot and ...|[[document, 0, 65...|[[document, 0, 65...|[[token, 0, 3, Fo...|[[pos, 0, 3, _, [...|[[named_entity, 0...|[[word_embeddings...|
|Most dishes were ...|[[document, 0, 73...|[[document, 0, 73...|[[token, 0, 3, Mo...|[[pos, 0, 3, _, [...|[[named_entity, 0...|[[word_embeddings...|
+--------------------+--------------------+--------------------+--------------------+--------------------+

In [14]:
# Side-by-side preview: tokens as seen by BERT, their vectors, and the gold NER tags.
bert_preview_columns = ["bert.result", "bert.embeddings", 'label.result']
test_data.select(*bert_preview_columns).show()

+--------------------+--------------------+--------------------+
|              result|          embeddings|              result|
+--------------------+--------------------+--------------------+
|[it's, delicious,...|[[0.028714772, -0...|[O, O, O, O, O, O...|
|[food, was, hot, ...|[[0.6151372, -0.2...|[O, O, O, O, O, O...|
|[most, dishes, we...|[[0.24191713, -0....|[O, O, O, O, O, O...|
|[probably, the, w...|[[0.20407376, -0....|[O, O, O, I-MENU,...|
|[giant, eagle, br...|[[0.40425774, -0....|[O, O, O, B-MENU,...|
| [tea, was, good, .]|[[0.55888885, -0....|   [B-MENU, O, O, O]|
|[scratched, an, i...|[[0.18796587, -0....|[O, O, O, O, O, O...|
|[we, will, defini...|[[0.14424627, -0....|[O, O, O, O, O, O...|
|[best, indian, fo...|[[-0.05868431, -0...|     [O, O, O, O, O]|
|[ordered, chicken...|[[0.46678805, -0....|[O, B-MENU, I-MEN...|
|    [all, delish, !]|[[0.4162415, -0.2...|           [O, O, O]|
|[going, back, aga...|[[0.7048442, -0.1...|        [O, O, O, O]|
|[went, with, my, ...|[[0

In [29]:
import numpy as np

# Print the first token annotation of the first test row from the "bert" column
# (annotator type, offsets, token text, metadata and the embedding values).
bert_rows = test_data.select("bert").take(2)
emb_vector = np.array(bert_rows)
print(emb_vector[0][0][0])

['word_embeddings' 0 3 "it's"
 {'sentence': '0', 'isOOV': 'false', 'isWordStart': 'true', 'pieceId': '1122', 'token': "it's"}
 list([0.028714772313833237, -0.01954510807991028, -0.14898182451725006, -0.0449531190097332, -0.14033105969429016, -0.13059653341770172, -0.04000503569841385, -0.1434706598520279, -0.01618177443742752, -0.5472599267959595, 0.1340707689523697, 0.46343931555747986, -0.15731850266456604, 0.27046892046928406, 0.08926349133253098, 0.022836709395051003, 0.26432257890701294, 0.26798826456069946, 0.22323481738567352, -0.13036906719207764, -0.35764095187187195, 0.08030959963798523, 0.300663024187088, -0.2094012349843979, -0.1701054573059082, -0.13023287057876587, -0.12788549065589905, 0.8047378659248352, -0.33214566111564636, 0.12296024709939957, -0.08900249004364014, -0.4571039378643036, -0.4593285322189331, 0.24178314208984375, 0.018473654985427856, 0.23898540437221527, 0.3477132320404053, 0.20571573078632355, 0.23572644591331482, -0.32364389300346375, 0.1039256080985

In [13]:
# NerDLApproach trainer that consumes the BERT vectors in the "bert" column and
# learns to predict the tags stored in "label"; predictions go to "ner".
# 20% of the training rows are held out for validation, and extended per-label
# metrics are written to the annotator log.
nerTagger = (
    NerDLApproach()
    .setInputCols(["document", "token", "bert"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    .setMaxEpochs(1)
    .setLr(0.001)
    .setPo(0.005)
    .setBatchSize(8)
    .setRandomSeed(0)
    .setVerbose(1)
    .setValidationSplit(0.2)
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(True)
    .setIncludeConfidence(True)
    # Fix: point at the parquet that the BERT test-embedding cell actually writes.
    # The previous path ("MenuNER/test_withEmbeds.parquet") is not produced by any
    # cell, so training only worked when a stale file from an older run was present.
    .setTestDataset("MenuNER/bert_embeddings.parquet")
)

# Training pipeline: compute BERT embeddings, then fit the NER tagger on them.
pipeline = Pipeline(
    stages=[
        bert_annotator,
        nerTagger,
    ]
)

In [14]:
%%time

# Fit the BERT + NerDL pipeline on the training DataFrame; %%time reports the wall time.
ner_model_bert = pipeline.fit(training_data)

CPU times: user 51.7 ms, sys: 8.8 ms, total: 60.5 ms
Wall time: 4min 58s


In [15]:
# Display the fitted model object (a PipelineModel, per the output below).
ner_model_bert

PipelineModel_019df07f5bc1

In [26]:
# Persist only the trained NER stage (index 1; index 0 is the BERT embeddings stage).
# overwrite(): without it, save() raises when the target directory already exists,
# so the notebook could not be re-run after the first successful save.
ner_model_bert.stages[1].write().overwrite().save('NER_bert_20201022')

In [20]:
# Apply the fitted BERT pipeline to the test DataFrame; adds the "ner" prediction column.
# NOTE(review): `test_data` is re-bound to the GloVe-embedded frame later in this
# notebook, so this cell only gives BERT results when run before that cell.
predictions_bert = ner_model_bert.transform(test_data)
predictions_bert.show(3)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|                bert|                 ner|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|It's delicious fo...|[[document, 0, 78...|[[document, 0, 78...|[[token, 0, 3, It...|[[pos, 0, 3, _, [...|[[named_entity, 0...|[[word_embeddings...|[[named_entity, 0...|
|Food was hot and ...|[[document, 0, 65...|[[document, 0, 65...|[[token, 0, 3, Fo...|[[pos, 0, 3, _, [...|[[named_entity, 0...|[[word_embeddings...|[[named_entity, 0...|
|Most dishes were ...|[[document, 0, 73...|[[document, 0, 73...|[[token, 0, 3, Mo...|[[pos, 0, 3, _, [...|[[named_entity, 0...|[[word_embeddings...|[[

In [21]:
# Per-sentence comparison: tokens, gold tags and predicted tags (cells cut at 40 chars).
predictions_bert.select('token.result','label.result','ner.result').show(truncate=40)

+----------------------------------------+----------------------------------------+----------------------------------------+
|                                  result|                                  result|                                  result|
+----------------------------------------+----------------------------------------+----------------------------------------+
|[It's, delicious, food, and, priced, ...|[O, O, O, O, O, O, O, O, O, O, O, O, ...|[O, O, O, O, O, O, O, O, O, O, O, O, ...|
|[Food, was, hot, and, fresh, ,, and, ...|[O, O, O, O, O, O, O, O, O, O, O, O, ...|[O, O, O, O, O, O, O, O, O, O, O, O, ...|
|[Most, dishes, were, quite, average, ...|[O, O, O, O, O, O, O, O, O, O, O, O, ...|[O, O, O, O, O, O, O, O, O, O, O, O, ...|
|[Probably, the, worst, tikka, masala,...|   [O, O, O, I-MENU, I-MENU, O, O, O, O]|   [O, O, O, B-MENU, I-MENU, O, O, O, O]|
|[Giant, eagle, brand, sauce, in, a, j...|  [O, O, O, B-MENU, O, O, O, O, O, O, O]|[B-MENU, I-MENU, O, I-MENU, O, O, O, ...|


In [22]:
# Print the full nested schema of the prediction DataFrame (annotation structs per column).
predictions_bert.printSchema()

root
 |-- text: string (nullable = true)
 |-- document: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentence: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = tr

In [23]:
import pyspark.sql.functions as F

# Flatten the sentence-level arrays into one row per token:
# zip tokens / gold tags / predictions position-wise, explode, then name the fields.
prediction_bert_result = (
    predictions_bert
    .select(
        F.explode(
            F.arrays_zip('token.result', 'label.result', 'ner.result')
        ).alias("cols")
    )
    .select(*[
        F.expr("cols['{}']".format(position)).alias(column_name)
        for position, column_name in enumerate(("token", "ground_truth", "prediction"))
    ])
)

prediction_bert_result.show(20, truncate=False)

+---------+------------+----------+
|token    |ground_truth|prediction|
+---------+------------+----------+
|It's     |O           |O         |
|delicious|O           |O         |
|food     |O           |O         |
|and      |O           |O         |
|priced   |O           |O         |
|very     |O           |O         |
|welDon't |O           |O         |
|really   |O           |O         |
|get      |O           |O         |
|the      |O           |O         |
|very     |O           |O         |
|high     |O           |O         |
|reviews  |O           |O         |
|.        |O           |O         |
|Food     |O           |O         |
|was      |O           |O         |
|hot      |O           |O         |
|and      |O           |O         |
|fresh    |O           |O         |
|,        |O           |O         |
+---------+------------+----------+
only showing top 20 rows



In [24]:
# List the NerDL training logs in ~/annotator_logs, newest first.
!cd ~/annotator_logs && ls -lt

total 4
-rw-rw-r-- 1 muzamil muzamil 1038 10월 22 03:04 NerDLApproach_61224948768b.log


In [25]:
# Show the training log of the BERT-based run (validation and test metrics per label).
# NOTE(review): the log file name is specific to one run and changes on every training.
!cat ~/annotator_logs/NerDLApproach_61224948768b.log

Name of the selected graph: ner-dl/blstm_10_768_128_120.pb
Training started - total epochs: 1 - lr: 0.001 - batch size: 8 - labels: 3 - chars: 74 - training examples: 7203


Epoch 1/1 started, lr: 0.001, dataset size: 7203


Epoch 1/1 - 42.55s - loss: 1132.2854 - batches: 903
Quality on validation dataset (20.0%), validation examples = 1800
time to finish evaluation: 4.02s
label	 tp	 fp	 fn	 prec	 rec	 f1
B-MENU	 643	 165	 126	 0.7957921	 0.8361508	 0.8154724
I-MENU	 520	 132	 65	 0.797546	 0.8888889	 0.8407438
tp: 1163 fp: 297 fn: 191 labels: 2
Macro-average	 prec: 0.79666907, rec: 0.86251986, f1: 0.8282877
Micro-average	 prec: 0.79657537, rec: 0.8589365, f1: 0.8265814
Quality on test dataset: 
time to finish evaluation: 1.48s
label	 tp	 fp	 fn	 prec	 rec	 f1
B-MENU	 242	 35	 28	 0.8736462	 0.8962963	 0.88482636
I-MENU	 178	 14	 46	 0.9270833	 0.79464287	 0.8557692
tp: 420 fp: 49 fn: 74 labels: 2
Macro-average	 prec: 0.90036476, rec: 0.8454696, f1: 0.8720541
Micro-average	 prec: 0.895

In [27]:
# Export the token-level BERT predictions as a single CSV part file.
# mode("overwrite"): the default save mode raises when the output directory already
# exists, which made this cell fail whenever the notebook was run a second time.
prediction_bert_result.coalesce(1).write.mode("overwrite").csv("MenuNER/bert_results.csv")

In [28]:
# Class balance of the test set: number of tokens per gold tag, most frequent first.
(
    predictions_bert
    .select(
        F.explode(
            F.arrays_zip('token.result', 'label.result', 'ner.result')
        ).alias("cols")
    )
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ground_truth"),
    )
    .groupBy('ground_truth')
    .count()
    .orderBy('count', ascending=False)
    .show(100, truncate=False)
)

+------------+-----+
|ground_truth|count|
+------------+-----+
|O           |11587|
|B-MENU      |270  |
|I-MENU      |224  |
+------------+-----+



In [29]:
# Reload the saved NER stage from disk and wire it to the sentence/token/bert columns.
loaded_ner_bert_model = (
    NerDLModel.load("NER_bert_20201022")
    .setInputCols(["sentence", "token", "bert"])
    .setOutputCol("ner")
)

In [32]:
# Inference pipeline for raw text:
#   text -> document -> token -> BERT embeddings -> NER tags -> entity chunks.
# No SentenceDetector stage is used; every annotator reads the whole "document".
documentAssembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

tokenizer = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)

# The embeddings stage has to match what the NER model was trained on. This model was
# trained on 'bert_base_cased' vectors in the "bert" column; a model trained with
# glove_100d (e.g. ner_dl / onto_100) would instead need
# WordEmbeddingsModel.pretrained('glove_100d') here.
bertEmbedding = (
    BertEmbeddings.pretrained(name='bert_base_cased', lang='en')
    .setInputCols(['document', 'token'])
    .setOutputCol('bert')
    .setCaseSensitive(False)
)

# Trained NER stage saved earlier in this notebook.
load_ner_model = (
    NerDLModel.load('NER_bert_20201022')
    .setInputCols(['document', 'token', 'bert'])
    .setOutputCol('ner')
)

# Turns the per-token tags in "ner" into entity chunks in "ner_chunk".
ner_converter = (
    NerConverter()
    .setInputCols(['document', 'token', 'ner'])
    .setOutputCol('ner_chunk')
)

custom_nlp_pipeline = Pipeline(
    stages=[
        documentAssembler,
        tokenizer,
        bertEmbedding,
        load_ner_model,
        ner_converter,
    ]
)

bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


In [34]:
# Run the inference pipeline on one free-text review.
text = "Great selection of curries. Fantastic levels of spices in chicken, fish, meat and veggie dishes. Serving portions are plentiful and tasty. Prices are more than reasonable $$ Quick with orders and take out, no long waiting time. All dishes replenished quickly when serving trays are low. Pleasant servers."
# One-row DataFrame with a single "text" column, as expected by the DocumentAssembler.
prediction_data = spark.createDataFrame([[text]]).toDF("text")
# fit() turns the Pipeline into a PipelineModel; the embedding and NER stages are
# already-trained models loaded in the previous cell.
prediction_model = custom_nlp_pipeline.fit(prediction_data)
preds = prediction_model.transform(prediction_data)
preds.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|               token|                bert|                 ner|           ner_chunk|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Great selection o...|[[document, 0, 30...|[[token, 0, 4, Gr...|[[word_embeddings...|[[named_entity, 0...|[[chunk, 58, 64, ...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+



In [38]:
# Sample reviews for quick, Spark-free inference through a LightPipeline.
# Only text3 is annotated below; text and text2 are kept as alternative inputs.
text = "Amazing goat briyani. Huge portions that can easily last for two times. However, it was so tasty that I over ate the whole briyani in one go. The price is very reasonable and briyani had lot of meat prices. Didn't like their chicken qourma. As it lacked taste and was bland. Staff is decent and service is quick. Will definitely be regular here just and just for briyani."

text2 = "Ordered a Veal Biryani, dal and butter chicken for takeout....Super dissatisfied and disappointed. The biryani had a lot of rice and bones (no meat). Their dal was very watery and their portion for regular size is extremely small for $5 (judging by the size, it should be worth $2). The butter chicken should be called just butter because there was very little chicken. The portion of butter chicken is also very small for $7. Not coming back here again because I feel like I've been cheated."

# The trailing backslashes are line continuations inside the literal: the dish names
# end up on ONE line separated only by spaces (no newlines or punctuation).
text3 = "Among the list of dishes, overall in terms of quality and portion its an easy 4/5. Veal biriyani \
Chicken biriyani \
Chicken korma \
Lamb curry \
Naan"

lp = LightPipeline(prediction_model)
result = lp.annotate(text3)

# Print each (token, predicted tag) pair.
for token_and_tag in zip(result['token'], result['ner']):
    print(token_and_tag)

('Among', 'O')
('the', 'O')
('list', 'O')
('of', 'O')
('dishes', 'O')
(',', 'O')
('overall', 'O')
('in', 'O')
('terms', 'O')
('of', 'O')
('quality', 'O')
('and', 'O')
('portion', 'O')
('its', 'O')
('an', 'O')
('easy', 'O')
('4/5', 'O')
('.', 'O')
('Veal', 'B-MENU')
('biriyani', 'I-MENU')
('Chicken', 'I-MENU')
('biriyani', 'I-MENU')
('Chicken', 'I-MENU')
('korma', 'I-MENU')
('Lamb', 'I-MENU')
('curry', 'I-MENU')
('Naan', 'I-MENU')


In [43]:
# Alternative embedding stage: pretrained GloVe ('glove_100d') word vectors written
# to the "embeddings" column. Any word embeddings (Glove, Elmo, Bert, custom, ...)
# can be plugged in at this point.
glove_embeddings = (
    WordEmbeddingsModel.pretrained('glove_100d')
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
)

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [44]:
# Rebuild the test DataFrame with GloVe vectors and persist it for the GloVe NER trainer.
# NOTE(review): this re-binds `test_data`, replacing the BERT-embedded frame used by
# earlier cells; re-run those cells from the top if BERT results are needed again.
test_data = CoNLL().readDataset(spark, data_folder+"CoNLL_2003_dataset/eng.testa")
test_data = glove_embeddings.transform(test_data)
# mode("overwrite"): the default save mode raises when the path already exists,
# which made this cell fail on every re-run of the notebook.
test_data.write.mode("overwrite").parquet("MenuNER/test_withEmbedsGloVe.parquet")
test_data.show(3)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|          embeddings|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|It's delicious fo...|[[document, 0, 78...|[[document, 0, 78...|[[token, 0, 3, It...|[[pos, 0, 3, _, [...|[[named_entity, 0...|[[word_embeddings...|
|Food was hot and ...|[[document, 0, 65...|[[document, 0, 65...|[[token, 0, 3, Fo...|[[pos, 0, 3, _, [...|[[named_entity, 0...|[[word_embeddings...|
|Most dishes were ...|[[document, 0, 73...|[[document, 0, 73...|[[token, 0, 3, Mo...|[[pos, 0, 3, _, [...|[[named_entity, 0...|[[word_embeddings...|
+--------------------+--------------------+--------------------+--------------------+--------------------+

In [45]:
# Side-by-side preview: tokens, their GloVe vectors, and the gold NER tags.
glove_preview_columns = ["embeddings.result", "embeddings.embeddings", 'label.result']
test_data.select(*glove_preview_columns).show()

+--------------------+--------------------+--------------------+
|              result|          embeddings|              result|
+--------------------+--------------------+--------------------+
|[It's, delicious,...|[[0.0, 0.0, 0.0, ...|[O, O, O, O, O, O...|
|[Food, was, hot, ...|[[-0.16486, 0.919...|[O, O, O, O, O, O...|
|[Most, dishes, we...|[[-0.56127, 0.665...|[O, O, O, O, O, O...|
|[Probably, the, w...|[[0.23341, 0.4872...|[O, O, O, I-MENU,...|
|[Giant, eagle, br...|[[0.46484, 0.7507...|[O, O, O, B-MENU,...|
| [Tea, was, good, .]|[[-1.0121, 1.3398...|   [B-MENU, O, O, O]|
|[Scratched, an, I...|[[-0.32363, 0.245...|[O, O, O, O, O, O...|
|[We, will, defini...|[[-0.17791, 0.626...|[O, O, O, O, O, O...|
|[Best, Indian, fo...|[[0.13888, 0.1970...|     [O, O, O, O, O]|
|[Ordered, chicken...|[[0.080861, -0.85...|[O, B-MENU, I-MEN...|
|    [All, delish, !]|[[-0.21823, 0.691...|           [O, O, O]|
|[Going, back, aga...|[[0.25882, 0.3098...|        [O, O, O, O]|
|[Went, with, my, ...|[[0

In [46]:
import numpy as np

# Materialise the GloVe vectors of the first test row as a NumPy array for inspection.
glove_rows = test_data.select("embeddings.embeddings").take(1)
emb_vector = np.array(glove_rows)
emb_vector

array([[[[ 0.        ,  0.        ,  0.        , ...,  0.        ,
           0.        ,  0.        ],
         [-0.65534002,  0.34033999,  0.30283999, ..., -0.68853003,
           0.089564  ,  0.72299999],
         [-0.16486   ,  0.91996998,  0.22736999, ..., -0.067904  ,
           1.5072    ,  0.60889   ],
         ...,
         [ 0.23191001,  0.12153   ,  0.29385999, ..., -0.69988   ,
           0.98633999, -0.12299   ],
         [-0.87980002,  0.13604   ,  0.43974999, ...,  0.22445001,
           0.41789001,  0.93756002],
         [-0.33978999,  0.20941   ,  0.46348   , ..., -0.23394001,
           0.47297999, -0.028803  ]]]])

In [47]:
# NerDLApproach trainer for the GloVe variant: reads "sentence", "token" and the
# GloVe "embeddings" column, learns the tags in "label" and writes them to "ner".
# Note: this re-binds `nerTagger`, replacing the BERT trainer defined earlier.
nerTagger = (
    NerDLApproach()
    .setInputCols(["sentence", "token", "embeddings"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    .setMaxEpochs(1)
    .setLr(0.003)
    .setPo(0.005)
    .setBatchSize(32)
    .setRandomSeed(0)
    .setVerbose(1)
    .setValidationSplit(0.2)
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(True)
    .setIncludeConfidence(True)
    .setTestDataset("MenuNER/test_withEmbedsGloVe.parquet")
    # .setOutputLogsPath('menu_ner_logs')  # if not set, logs will be written to ~/annotator_logs
    # .setGraphFolder('graphs')  # put your graph file (pb) under this folder if you are
    #                            # using a custom graph generated thru 4.1 NerDL-Graph.ipynb notebook
)

# Training pipeline: look up GloVe vectors, then fit the NER tagger on them.
ner_pipeline = Pipeline(
    stages=[
        glove_embeddings,
        nerTagger,
    ]
)

In [48]:
%%time

# Fit the GloVe + NerDL pipeline on the training DataFrame; %%time reports the wall time.
ner_model = ner_pipeline.fit(training_data)

# 1 epoch takes around 2.5 min with batch size=32 (original author's estimate; the
# recorded run shown below finished in about 1 min wall time).
# If you get an error about an incompatible TF graph, use the 4.1 NerDL-Graph.ipynb
# notebook to create a graph (or see the bottom cell of this notebook).

CPU times: user 13.5 ms, sys: 36 ms, total: 49.5 ms
Wall time: 1min


In [49]:
# Apply the fitted GloVe pipeline to the (GloVe-embedded) test DataFrame; adds "ner".
predictions_glove = ner_model.transform(test_data)
predictions_glove.show(3)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|          embeddings|                 ner|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|It's delicious fo...|[[document, 0, 78...|[[document, 0, 78...|[[token, 0, 3, It...|[[pos, 0, 3, _, [...|[[named_entity, 0...|[[word_embeddings...|[[named_entity, 0...|
|Food was hot and ...|[[document, 0, 65...|[[document, 0, 65...|[[token, 0, 3, Fo...|[[pos, 0, 3, _, [...|[[named_entity, 0...|[[word_embeddings...|[[named_entity, 0...|
|Most dishes were ...|[[document, 0, 73...|[[document, 0, 73...|[[token, 0, 3, Mo...|[[pos, 0, 3, _, [...|[[named_entity, 0...|[[word_embeddings...|[[

In [35]:
# Per-sentence comparison: tokens, gold tags and GloVe-model predictions (cells cut at 40 chars).
predictions_glove.select('token.result','label.result','ner.result').show(truncate=40)

+----------------------------------------+----------------------------------------+----------------------------------------+
|                                  result|                                  result|                                  result|
+----------------------------------------+----------------------------------------+----------------------------------------+
|[It's, delicious, food, and, priced, ...|[O, O, O, O, O, O, O, O, O, O, O, O, ...|[O, O, O, O, O, O, O, O, O, O, O, O, ...|
|[Food, was, hot, and, fresh, ,, and, ...|[O, O, O, O, O, O, O, O, O, O, O, O, ...|[O, O, O, O, O, O, O, O, O, O, O, O, ...|
|[Most, dishes, were, quite, average, ...|[O, O, O, O, O, O, O, O, O, O, O, O, ...|[O, O, O, O, O, O, O, O, O, O, O, O, ...|
|[Probably, the, worst, tikka, masala,...|   [O, O, O, I-MENU, I-MENU, O, O, O, O]|   [O, O, O, B-MENU, I-MENU, O, O, O, O]|
|[Giant, eagle, brand, sauce, in, a, j...|  [O, O, O, B-MENU, O, O, O, O, O, O, O]|  [O, O, O, B-MENU, O, O, O, O, O, O, O]|


In [50]:
import pyspark.sql.functions as F

# Token-level view of the GloVe predictions: zip tokens / gold tags / predictions
# position-wise, explode to one row per token, then name the three fields.
(
    predictions_glove
    .select(
        F.explode(
            F.arrays_zip('token.result', 'label.result', 'ner.result')
        ).alias("cols")
    )
    .select(*[
        F.expr("cols['{}']".format(position)).alias(column_name)
        for position, column_name in enumerate(("token", "ground_truth", "prediction"))
    ])
    .show(truncate=False)
)

+---------+------------+----------+
|token    |ground_truth|prediction|
+---------+------------+----------+
|It's     |O           |O         |
|delicious|O           |O         |
|food     |O           |O         |
|and      |O           |O         |
|priced   |O           |O         |
|very     |O           |O         |
|welDon't |O           |O         |
|really   |O           |O         |
|get      |O           |O         |
|the      |O           |O         |
|very     |O           |O         |
|high     |O           |O         |
|reviews  |O           |O         |
|.        |O           |O         |
|Food     |O           |O         |
|was      |O           |O         |
|hot      |O           |O         |
|and      |O           |O         |
|fresh    |O           |O         |
|,        |O           |O         |
+---------+------------+----------+
only showing top 20 rows



In [51]:
# List the NerDL training logs in ~/annotator_logs, newest first.
!cd ~/annotator_logs && ls -lt

total 12
-rw-rw-r-- 1 muzamil muzamil 1036 10월 22 01:57 NerDLApproach_2ffc6af65502.log
-rw-rw-r-- 1 muzamil muzamil  683 10월 22 01:38 NerDLApproach_04e531bab85e.log
-rw-rw-r-- 1 muzamil muzamil 1044 10월 22 01:23 NerDLApproach_84282fca4c03.log


In [52]:
# Show the training log of the GloVe-based run (validation and test metrics per label).
# NOTE(review): the log file name is specific to one run and changes on every training.
!cat ~/annotator_logs/NerDLApproach_2ffc6af65502.log

Name of the selected graph: ner-dl/blstm_10_100_128_120.pb
Training started - total epochs: 1 - lr: 0.003 - batch size: 32 - labels: 3 - chars: 100 - training examples: 7203


Epoch 1/1 started, lr: 0.003, dataset size: 7203


Epoch 1/1 - 27.79s - loss: 304.71024 - batches: 227
Quality on validation dataset (20.0%), validation examples = 1800
time to finish evaluation: 4.17s
label	 tp	 fp	 fn	 prec	 rec	 f1
B-MENU	 640	 144	 129	 0.81632656	 0.8322497	 0.8242112
I-MENU	 495	 107	 90	 0.8222591	 0.84615386	 0.83403546
tp: 1135 fp: 251 fn: 219 labels: 2
Macro-average	 prec: 0.81929284, rec: 0.8392018, f1: 0.8291278
Micro-average	 prec: 0.8189033, rec: 0.838257, f1: 0.82846713
Quality on test dataset: 
time to finish evaluation: 0.78s
label	 tp	 fp	 fn	 prec	 rec	 f1
B-MENU	 240	 45	 30	 0.84210527	 0.8888889	 0.8648648
I-MENU	 172	 11	 52	 0.9398907	 0.76785713	 0.8452088
tp: 412 fp: 56 fn: 82 labels: 2
Macro-average	 prec: 0.890998, rec: 0.828373, f1: 0.858545
Micro-average	 prec: 0.880

In [57]:
# Tokens of the first test sentence as a NumPy array.
first_token_rows = predictions_glove.select('token.result').take(1)
np.array(first_token_rows)[0][0]

array(["It's", 'delicious', 'food', 'and', 'priced', 'very', "welDon't",
       'really', 'get', 'the', 'very', 'high', 'reviews', '.'],
      dtype='<U9')

In [87]:
import pandas as pd

# Token-level comparison table (pandas) for the FIRST test sentence.
#
# Fix: `tokens` and `ground` were only defined in commented-out lines, so this cell
# raised NameError on a fresh kernel. All three columns are now read from the same
# row in a single Spark job, which also guarantees they line up position-wise.
# (The old code fetched 100 rows with take(100) but only ever used the first one.)
first_row = predictions_glove.selectExpr(
    "token.result AS tokens",
    "label.result AS ground",
    "ner.result AS label_glove",
).first()

tokens = np.array(first_row["tokens"])
ground = np.array(first_row["ground"])
label_glove = np.array(first_row["label_glove"])
# TODO: add a 'label_bert' column from predictions_bert once both prediction frames
# are kept side by side (it was disabled in the original cell).

df = pd.DataFrame({'token': tokens,
                   'ground': ground,
                   'label_glove': label_glove})

In [88]:
# Show up to the last 100 rows of the token-level comparison table.
df.tail(100)

Unnamed: 0,token,ground,label_glove
0,It's,O,O
1,delicious,O,O
2,food,O,O
3,and,O,O
4,priced,O,O
5,very,O,O
6,welDon't,O,O
7,really,O,O
8,get,O,O
9,the,O,O
