<a href="https://colab.research.google.com/github/gokhanturer/JSL_Public/blob/main/Ner_with_graph.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install -q pyspark==3.1.2 spark-nlp

! pip install -q spark-nlp-display

[K     |████████████████████████████████| 212.4 MB 64 kB/s 
[K     |████████████████████████████████| 140 kB 42.7 MB/s 
[K     |████████████████████████████████| 198 kB 53.4 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 95 kB 3.0 MB/s 
[K     |████████████████████████████████| 66 kB 4.7 MB/s 
[?25h

In [None]:
# Libraries used throughout the notebook.
import sparknlp
import pyspark.sql.functions as F
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.training import CoNLL

# Start the Spark session with the GPU flag enabled.
spark = sparknlp.start(gpu=True)

# Report the versions in use.
print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

# Last expression of the cell: display the session object.
spark

Spark NLP version 3.4.0
Apache Spark version: 3.1.2


In [None]:
# Download the CoNLL-2003 files from the Spark NLP repository:
# eng.train is used for training, eng.testa for evaluation.
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/conll2003/eng.train
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/conll2003/eng.testa

In [None]:
# Keep the raw contents of the training file in memory as plain text.
with open("eng.train") as train_file:
    train_txt = train_file.read()

In [None]:
# Load the CoNLL training file into a Spark DataFrame
# (the cells below read its token, pos and label columns).
train_data = CoNLL().readDataset(spark, 'eng.train')

In [None]:
# One row per token: zip the parallel token / POS / label arrays and explode them.
train_token_rows = train_data.select(
    F.explode(F.arrays_zip('token.result', 'pos.result', 'label.result')).alias("cols")
)

# Give the positional struct fields readable names and preview the first rows.
train_token_rows.select(
    F.expr("cols['0']").alias("token"),
    F.expr("cols['1']").alias("pos"),
    F.expr("cols['2']").alias("ner_label"),
).show(truncate=50)

+----------+---+---------+
|     token|pos|ner_label|
+----------+---+---------+
|        EU|NNP|    B-ORG|
|   rejects|VBZ|        O|
|    German| JJ|   B-MISC|
|      call| NN|        O|
|        to| TO|        O|
|   boycott| VB|        O|
|   British| JJ|   B-MISC|
|      lamb| NN|        O|
|         .|  .|        O|
|     Peter|NNP|    B-PER|
| Blackburn|NNP|    I-PER|
|  BRUSSELS|NNP|    B-LOC|
|1996-08-22| CD|        O|
|       The| DT|        O|
|  European|NNP|    B-ORG|
|Commission|NNP|    I-ORG|
|      said|VBD|        O|
|        on| IN|        O|
|  Thursday|NNP|        O|
|        it|PRP|        O|
+----------+---+---------+
only showing top 20 rows



In [None]:
# Flatten the training set to one (token, label) row per token.
train_token_labels = train_data.select(
    F.explode(F.arrays_zip("token.result", "label.result")).alias("cols")
).select(
    F.expr("cols['0']").alias("token"),
    F.expr("cols['1']").alias("ground_truth"),
)

# Label frequency table, most frequent label first.
train_label_counts = train_token_labels.groupBy("ground_truth").count()
train_label_counts.orderBy("count", ascending=False).show(100, truncate=False)

+------------+------+
|ground_truth|count |
+------------+------+
|O           |169578|
|B-LOC       |7140  |
|B-PER       |6600  |
|B-ORG       |6321  |
|I-PER       |4528  |
|I-ORG       |3704  |
|B-MISC      |3438  |
|I-LOC       |1157  |
|I-MISC      |1155  |
+------------+------+



In [None]:
# Raw text of the evaluation file. Stored as `test_txt` (mirroring `train_txt`) so the
# string is not confused with the `test_data` DataFrame built by CoNLL().readDataset.
with open("eng.testa") as f:
    test_txt = f.read()

In [None]:
# Load the CoNLL evaluation file into a Spark DataFrame, the same way as the training file.
test_data = CoNLL().readDataset(spark, 'eng.testa')

In [None]:
# Flatten the evaluation set to one (token, label) row per token.
zipped_test_cols = F.arrays_zip("token.result", "label.result")
test_token_labels = test_data.select(F.explode(zipped_test_cols).alias("cols")).select(
    F.expr("cols['0']").alias("token"),
    F.expr("cols['1']").alias("ground_truth"),
)

# Count tokens per label and list the most frequent labels first.
test_label_counts = test_token_labels.groupBy("ground_truth").count()
test_label_counts.orderBy("count", ascending=False).show(100, truncate=False)

+------------+-----+
|ground_truth|count|
+------------+-----+
|O           |42759|
|B-PER       |1842 |
|B-LOC       |1837 |
|B-ORG       |1341 |
|I-PER       |1307 |
|B-MISC      |922  |
|I-ORG       |751  |
|I-MISC      |346  |
|I-LOC       |257  |
+------------+-----+



In [None]:
# Pretrained GloVe word embeddings. The model is named explicitly (instead of relying on
# the default of pretrained()) because the custom NER graph in this notebook is built for
# 100-dimensional embeddings; the recorded download log shows glove_100d.
glove_embeddings = WordEmbeddingsModel.pretrained("glove_100d", "en")\
    .setInputCols(["sentence", "token"])\
    .setOutputCol("embeddings")

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [None]:
# Pre-compute embeddings for the evaluation set and store them as parquet; the NER trainer
# reads this path through setTestDataset. mode("overwrite") keeps the cell re-runnable:
# without it a second run fails because 'test_data.parquet' already exists.
glove_embeddings.transform(test_data).write.mode("overwrite").parquet('test_data.parquet')

In [None]:
%%capture

!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jupyter/training/english/dl-ner/nerdl-graph/create_graph.py
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jupyter/training/english/dl-ner/nerdl-graph/dataset_encoder.py
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jupyter/training/english/dl-ner/nerdl-graph/ner_model.py
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jupyter/training/english/dl-ner/nerdl-graph/ner_model_saver.py
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jupyter/training/english/dl-ner/nerdl-graph/sentence_grouper.py

! pip -q install tensorflow==1.15.0

import create_graph

ntags = 9 # number of labels
embeddings_dim = 100
nchars =100

create_graph.create_graph(ntags, embeddings_dim, nchars)

# then move the graph to grap folder stated in NerDLApproach

In [None]:
# NER trainer: consumes sentence / token / embeddings annotations, learns from the
# "label" column and is evaluated against the parquet evaluation set after each epoch.
nerTagger = (
    NerDLApproach()
    .setInputCols(["sentence", "token", "embeddings"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    .setMaxEpochs(15)
    .setLr(0.002)
    .setDropout(0.5)
    .setBatchSize(10)
    .setRandomSeed(0)
    .setVerbose(1)
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(True)
    .setIncludeConfidence(True)
    .setEnableMemoryOptimizer(False)
    .setGraphFolder('/content/NER_graph')
    .setTestDataset('test_data.parquet')
)

# Training pipeline: embeddings first, then the NER trainer.
training_stages = [glove_embeddings, nerTagger]
ner_pipeline = Pipeline(stages=training_stages)

In [None]:
%%time

# Train the pipeline on the CoNLL training set; %%time reports how long the fit takes
# (the recorded run took well over an hour of wall time).
ner_model = ner_pipeline.fit(train_data)

CPU times: user 26.6 s, sys: 2.54 s, total: 29.1 s
Wall time: 1h 16min 56s


In [None]:
!cd ~/annotator_logs/ && ls -lt

total 12
-rw-r--r-- 1 root root 12205 Feb  2 14:41 NerDLApproach_8a8df207748f.log


In [None]:
# Print the most recent NerDLApproach training log. The log file name contains a uid that
# changes on every run, so the newest matching file is looked up instead of hard-coding
# the name of one particular run.
import glob
import os

log_paths = glob.glob(os.path.expanduser("~/annotator_logs/NerDLApproach_*.log"))
if log_paths:
    latest_log = max(log_paths, key=os.path.getmtime)
    with open(latest_log) as log_file:
        print(log_file.read())
else:
    print("No NerDLApproach training log found in ~/annotator_logs/")

Name of the selected graph: /content/NER_graph/blstm_9_100_128_100.pb
Training started - total epochs: 15 - lr: 0.002 - batch size: 10 - labels: 9 - chars: 84 - training examples: 14041


Epoch 1/15 started, lr: 0.002, dataset size: 14041


Epoch 1/15 - 301.57s - loss: 3147.6409 - batches: 1407
Quality on test dataset: 
time to finish evaluation: 14.66s
label	 tp	 fp	 fn	 prec	 rec	 f1
B-LOC	 1703	 99	 134	 0.945061	 0.927055	 0.9359714
I-ORG	 572	 80	 179	 0.8773006	 0.76165116	 0.8153956
I-MISC	 273	 145	 73	 0.65311	 0.7890173	 0.71465963
I-LOC	 215	 33	 42	 0.8669355	 0.83657587	 0.8514852
I-PER	 1247	 18	 60	 0.98577076	 0.95409334	 0.9696734
B-MISC	 848	 299	 74	 0.73932	 0.9197397	 0.8197197
B-ORG	 1145	 108	 196	 0.91380686	 0.8538404	 0.8828065
B-PER	 1738	 60	 104	 0.96662956	 0.9435396	 0.9549451
tp: 7741 fp: 842 fn: 862 labels: 8
Macro-average	 prec: 0.86849177, rec: 0.8731891, f1: 0.87083405
Micro-average	 prec: 0.9018991, rec: 0.8998024, f1: 0.90084946


Epoch 2/15 starte

In [None]:
import pyspark.sql.functions as F

# Run the trained pipeline on the evaluation set.
predictions = ner_model.transform(test_data)

# One row per token with the gold label next to the predicted label.
zipped_eval_cols = F.arrays_zip('token.result', 'label.result', 'ner.result')
eval_token_rows = predictions.select(F.explode(zipped_eval_cols).alias("cols"))

eval_token_rows.select(
    F.expr("cols['0']").alias("token"),
    F.expr("cols['1']").alias("ground_truth"),
    F.expr("cols['2']").alias("prediction"),
).show(truncate=False)

+--------------+------------+----------+
|token         |ground_truth|prediction|
+--------------+------------+----------+
|CRICKET       |O           |O         |
|-             |O           |O         |
|LEICESTERSHIRE|B-ORG       |B-ORG     |
|TAKE          |O           |O         |
|OVER          |O           |O         |
|AT            |O           |O         |
|TOP           |O           |O         |
|AFTER         |O           |O         |
|INNINGS       |O           |O         |
|VICTORY       |O           |O         |
|.             |O           |O         |
|LONDON        |B-LOC       |B-LOC     |
|1996-08-30    |O           |O         |
|West          |B-MISC      |B-MISC    |
|Indian        |I-MISC      |I-MISC    |
|all-rounder   |O           |O         |
|Phil          |B-PER       |B-PER     |
|Simmons       |I-PER       |I-PER     |
|took          |O           |O         |
|four          |O           |O         |
+--------------+------------+----------+
only showing top

In [None]:
from sklearn.metrics import classification_report

# Flatten predictions to one row per token and collect them into pandas.
exploded_predictions = predictions.select(
    F.explode(F.arrays_zip('token.result', 'label.result', 'ner.result')).alias("cols")
)
preds_df = exploded_predictions.select(
    F.expr("cols['0']").alias("token"),
    F.expr("cols['1']").alias("ground_truth"),
    F.expr("cols['2']").alias("prediction"),
).toPandas()

# Token-level precision / recall / F1 per label.
report = classification_report(preds_df['ground_truth'], preds_df['prediction'])
print(report)

              precision    recall  f1-score   support

       B-LOC       0.97      0.97      0.97      1837
      B-MISC       0.90      0.92      0.91       922
       B-ORG       0.94      0.93      0.94      1341
       B-PER       0.96      0.96      0.96      1842
       I-LOC       0.93      0.92      0.92       257
      I-MISC       0.86      0.82      0.84       346
       I-ORG       0.92      0.91      0.92       751
       I-PER       0.98      0.97      0.97      1307
           O       1.00      1.00      1.00     42759

    accuracy                           0.99     51362
   macro avg       0.94      0.93      0.94     51362
weighted avg       0.99      0.99      0.99     51362



### Saving the Trained Model

In [None]:
# Inspect the fitted pipeline stages: index 0 is the embeddings model, index 1 the trained NER model.
ner_model.stages

[WORD_EMBEDDINGS_MODEL_48cffc8b9a76, NerDLModel_0fd3b8deb917]

In [None]:
# Persist only the trained NER stage (index 1); overwrite() replaces a previous save at the same path.
ner_model.stages[1].write().overwrite().save("Ner_glove_graph_100d_e15_b10_lr0.002")

# Prediction Pipeline

In [None]:
# Raw text -> "document" annotation.
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# Split each document into sentences.
sentence = SentenceDetector()\
    .setInputCols(['document'])\
    .setOutputCol('sentence')

# Tokenize each sentence.
token = Tokenizer()\
    .setInputCols(['sentence'])\
    .setOutputCol('token')

# Same embeddings as used for training. They read the "sentence" column (not "document")
# so that this stage matches the training setup and the sentence-based NER and converter
# stages below. The model is named explicitly rather than relying on the pretrained() default.
glove_embeddings = WordEmbeddingsModel.pretrained("glove_100d", "en")\
    .setInputCols(["sentence", "token"])\
    .setOutputCol("embeddings")

# The NER model trained and saved earlier in this notebook.
loaded_ner_model = NerDLModel.load("Ner_glove_graph_100d_e15_b10_lr0.002")\
    .setInputCols(["sentence", "token", "embeddings"])\
    .setOutputCol("ner")

# Turns the per-token tags into entity chunks, exposed in the "ner_span" column.
converter = NerConverter()\
    .setInputCols(["sentence", "token", "ner"])\
    .setOutputCol("ner_span")

ner_prediction_pipeline = Pipeline(stages = [
      document,
      sentence,
      token,
      glove_embeddings,
      loaded_ner_model,
      converter
  ])

# Nothing is trained in this cell, so the pipeline is fitted on a DataFrame holding a
# single empty string just to obtain a PipelineModel that can transform new text.
empty_data = spark.createDataFrame([['']]).toDF("text")

prediction_model = ner_prediction_pipeline.fit(empty_data)

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [None]:
# Example sentence to run through the prediction pipeline.
text = "Mustafa Gökhan Türer is a nice guy who likes aviation and lives in Samsun."

# Single-row DataFrame with one column named "text".
sample_rows = [[text]]
sample_data = spark.createDataFrame(sample_rows).toDF("text")

sample_data.show(truncate=110)

+--------------------------------------------------------------------------+
|                                                                      text|
+--------------------------------------------------------------------------+
|Mustafa Gökhan Türer is a nice guy who likes aviation and lives in Samsun.|
+--------------------------------------------------------------------------+



In [None]:
# Annotate the sample sentence.
preds = prediction_model.transform(sample_data)

# Pair each detected chunk with its metadata, one row per chunk.
chunk_texts = preds.ner_span.result
chunk_metadata = preds.ner_span.metadata
entity_rows = preds.select(
    F.explode(F.arrays_zip(chunk_texts, chunk_metadata)).alias("entities")
)

# Show the chunk text next to the entity type stored in its metadata.
entity_rows.select(
    F.expr("entities['0']").alias("chunk"),
    F.expr("entities['1'].entity").alias("entity"),
).show(truncate=False)

+--------------------+------+
|chunk               |entity|
+--------------------+------+
|Mustafa Gökhan Türer|PER   |
|Samsun              |LOC   |
+--------------------+------+



In [None]:
from sparknlp.base import LightPipeline

# Wrap the fitted pipeline in a LightPipeline so plain strings can be annotated directly.
light_model = LightPipeline(prediction_model)
result = light_model.annotate(text)

# Pair every token with its predicted tag.
[(tok, tag) for tok, tag in zip(result['token'], result['ner'])]

[('Mustafa', 'B-PER'),
 ('Gökhan', 'I-PER'),
 ('Türer', 'I-PER'),
 ('is', 'O'),
 ('a', 'O'),
 ('nice', 'O'),
 ('guy', 'O'),
 ('who', 'O'),
 ('likes', 'O'),
 ('aviation', 'O'),
 ('and', 'O'),
 ('lives', 'O'),
 ('in', 'O'),
 ('Samsun', 'B-LOC'),
 ('.', 'O')]

In [None]:
import pandas as pd

# fullAnnotate returns one entry per input text; each entry maps column names to annotations.
result = light_model.fullAnnotate(text)

token_annotations = result[0]["token"]
ner_annotations = result[0]["ner"]

# One record per token: sentence id, token text, character span and predicted tag.
token_records = [
    (int(tok.metadata['sentence']), tok.result, tok.begin, tok.end, tag.result)
    for tok, tag in zip(token_annotations, ner_annotations)
]

ner_df = pd.DataFrame(token_records, columns=['sent_id', 'token', 'start', 'end', 'ner'])
ner_df

Unnamed: 0,sent_id,token,start,end,ner
0,0,Mustafa,0,6,B-PER
1,0,Gökhan,8,13,I-PER
2,0,Türer,15,19,I-PER
3,0,is,21,22,O
4,0,a,24,24,O
5,0,nice,26,29,O
6,0,guy,31,33,O
7,0,who,35,37,O
8,0,likes,39,43,O
9,0,aviation,45,52,O


## Highlight Entities

In [None]:
# Full annotations for the sample text ([0] selects the single input text);
# keys() lists the annotation columns that are available.
ann_text = light_model.fullAnnotate(text)[0]
ann_text.keys()

dict_keys(['document', 'ner_span', 'token', 'ner', 'embeddings', 'sentence'])

In [None]:
from sparknlp_display import NerVisualizer

visualiser = NerVisualizer()

# Render the entities from the 'ner_span' column over the text of the 'document' column.
visualiser.display(ann_text, label_col='ner_span', document_col='document')

# Streamlit