<a href="https://colab.research.google.com/github/gokhanturer/NER_Model_SparkNLP/blob/main/BioNLP09_IOB_NER_MODEL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install -q pyspark==3.1.2 spark-nlp

! pip install -q spark-nlp-display

[K     |████████████████████████████████| 212.4 MB 66 kB/s 
[K     |████████████████████████████████| 140 kB 66.1 MB/s 
[K     |████████████████████████████████| 198 kB 19.0 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 95 kB 4.2 MB/s 
[K     |████████████████████████████████| 66 kB 5.0 MB/s 
[?25h

In [2]:
# Libraries used throughout the notebook.
import sparknlp
import pyspark.sql.functions as F
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.training import CoNLL

# Start the Spark session through Spark NLP, requesting the GPU build.
spark = sparknlp.start(gpu=True)

# Record the library versions alongside the results.
print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

# Last expression of the cell: display the session object.
spark

Spark NLP version 3.4.1
Apache Spark version: 3.1.2


In [3]:
!wget -q https://raw.githubusercontent.com/gokhanturer/NER_Model_SparkNLP/main/BioNLP09_IOB_train.conll
!wget -q https://raw.githubusercontent.com/gokhanturer/NER_Model_SparkNLP/main/BioNLP09_IOB_test.conll

In [None]:
# Load the raw CoNLL text for manual inspection only.
# It is stored under its own name so that running this cell (in any order)
# never replaces the `train_data` DataFrame that the training cells rely on.
with open("BioNLP09_IOB_train.conll", encoding="utf-8") as f:
  train_raw_text = f.read()

In [23]:
# Parse the CoNLL training file into a Spark DataFrame of annotations
# (text, document, sentence, token, pos and label columns).
train_data = CoNLL().readDataset(
    spark,
    "BioNLP09_IOB_train.conll",
)

# Preview the first five rows.
train_data.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Reactive oxygen i...|[{document, 0, 13...|[{document, 0, 13...|[{token, 0, 7, Re...|[{pos, 0, 7, NN, ...|[{named_entity, 0...|
|We previously rep...|[{document, 0, 15...|[{document, 0, 15...|[{token, 0, 1, We...|[{pos, 0, 1, NN, ...|[{named_entity, 0...|
|However , the sou...|[{document, 0, 14...|[{document, 0, 14...|[{token, 0, 6, Ho...|[{pos, 0, 6, NN, ...|[{named_entity, 0...|
|5 - LOX and 5 - L...|[{document, 0, 12...|[{document, 0, 12...|[{token, 0, 0, 5,...|[{pos, 0, 0, NN, ...|[{named_entity, 0...|
|Stimulation of ly...|[{document, 0, 29...|[{document, 0, 29...|[{token, 0, 10, S...|[{pos, 0, 10, NN,..

In [24]:
# One row per token: zip the parallel token / POS / label arrays of each
# sentence, explode them, then pull the zipped fields out by position.
(
    train_data
    .select(
        F.explode(
            F.arrays_zip('token.result', 'pos.result', 'label.result')
        ).alias("cols")
    )
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("pos"),
        F.expr("cols['2']").alias("ner_label"),
    )
    .show(truncate=50)
)

+------------+---+---------+
|       token|pos|ner_label|
+------------+---+---------+
|    Reactive| NN|        O|
|      oxygen| NN|        O|
|intermediate| NN|        O|
|           -| NN|        O|
|   dependent| NN|        O|
|          NF| NN|        O|
|           -| NN|        O|
|      kappaB| NN|        O|
|  activation| NN|        O|
|          by| NN|        O|
| interleukin| NN|B-Protein|
|           -| NN|I-Protein|
|       1beta| NN|I-Protein|
|    requires| NN|        O|
|           5| NN|B-Protein|
|           -| NN|I-Protein|
|lipoxygenase| NN|I-Protein|
|          or| NN|        O|
|       NADPH| NN|        O|
|     oxidase| NN|        O|
+------------+---+---------+
only showing top 20 rows



In [None]:
# Label distribution of the training set: explode to one row per token,
# then count tokens per ground-truth tag, most frequent first.
(
    train_data
    .select(
        F.explode(
            F.arrays_zip("token.result", "label.result")
        ).alias("cols")
    )
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ground_truth"),
    )
    .groupBy("ground_truth")
    .count()
    .orderBy("count", ascending=False)
    .show(100, truncate=False)
)

In [5]:
# Keep only sentences whose labels contain more than one distinct tag, i.e.
# drop sentences in which every token carries the same label.
# The condition is evaluated inline so that no scratch columns are added to
# `train_data` and carried into training.
train_data = train_data.filter(
    F.size(F.array_distinct("label.result")) > 1
)

In [None]:
# Load the raw CoNLL text for manual inspection only.
# It is stored under its own name so that running this cell (in any order)
# never replaces the `test_data` DataFrame used for evaluation.
with open("BioNLP09_IOB_test.conll", encoding="utf-8") as f:
  test_raw_text = f.read()
#print (test_raw_text[:500])

In [6]:
# Parse the CoNLL test file into a Spark DataFrame of annotations.
test_data = CoNLL().readDataset(
    spark,
    "BioNLP09_IOB_test.conll",
)
# test_data.show(5)

In [None]:
# Label distribution of the test set: explode to one row per token,
# then count tokens per ground-truth tag, most frequent first.
(
    test_data
    .select(
        F.explode(
            F.arrays_zip("token.result", "label.result")
        ).alias("cols")
    )
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ground_truth"),
    )
    .groupBy("ground_truth")
    .count()
    .orderBy("count", ascending=False)
    .show(100, truncate=False)
)

In [7]:
# Pretrained word embeddings that feed the NER tagger.
# The model name and language are spelled out (rather than relying on the
# library default) so the embeddings used for training stay the ones the
# saved model is named after, whatever a future release defaults to.
glove_embeddings = (
    WordEmbeddingsModel.pretrained("glove_100d", "en")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [8]:
# Materialise the embedded test set as parquet; NerDLApproach reads it back
# through setTestDataset to report per-epoch metrics.
# mode("overwrite") makes the cell re-runnable: the default write mode raises
# an error when the target path already exists.
# NOTE(review): the path is under /content/drive — assumes Google Drive is
# already mounted; no mount call is visible in this notebook.
glove_embeddings.transform(test_data) \
    .write.mode("overwrite") \
    .parquet('/content/drive/MyDrive/Parquet/testdata_bionlp09.parquet')

In [9]:
# Trainable NER tagger: consumes sentence / token / embeddings annotations and
# learns to predict the tags stored in the `label` column.
nerTagger = (
    NerDLApproach()
    .setInputCols(["sentence", "token", "embeddings"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    # Optimisation hyper-parameters.
    .setMaxEpochs(8)
    .setLr(0.003)
    .setDropout(0.5)
    .setBatchSize(10)
    .setRandomSeed(0)
    # Logging: extended per-label metrics written to the annotator log file.
    .setVerbose(1)
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(True)
    .setIncludeConfidence(True)
    # Evaluate after each epoch on the pre-embedded test parquet.
    .setTestDataset('/content/drive/MyDrive/Parquet/testdata_bionlp09.parquet')
    .setEnableMemoryOptimizer(True)
)

# Training pipeline: embed the tokens, then fit the tagger on them.
ner_pipeline = Pipeline(
    stages=[glove_embeddings, nerTagger]
)

In [10]:
%%time

# Fit the training pipeline (embeddings stage followed by the NER approach)
# on `train_data`; the %%time cell magic reports how long the fit takes.
ner_model = ner_pipeline.fit(train_data)

CPU times: user 11.7 s, sys: 1.27 s, total: 13 s
Wall time: 37min 24s


In [11]:
!cd ~/annotator_logs/ && ls -lt

total 4
-rw-r--r-- 1 root root 4076 Feb 16 20:24 NerDLApproach_b1fcf69f949a.log


In [12]:
!cat ~/annotator_logs/NerDLApproach_b1fcf69f949a.log

Name of the selected graph: ner-dl/blstm_10_100_128_120.pb
Training started - total epochs: 8 - lr: 0.003 - batch size: 10 - labels: 3 - chars: 80 - training examples: 4711


Epoch 1/8 started, lr: 0.003, dataset size: 4711


Epoch 1/8 - 267.50s - loss: 2438.2388 - batches: 472
Quality on test dataset: 
time to finish evaluation: 32.01s
label	 tp	 fp	 fn	 prec	 rec	 f1
I-Protein	 2944	 410	 1134	 0.8777579	 0.7219225	 0.79224974
B-Protein	 2642	 457	 947	 0.8525331	 0.7361382	 0.7900718
tp: 5586 fp: 867 fn: 2081 labels: 2
Macro-average	 prec: 0.8651455, rec: 0.7290304, f1: 0.791277
Micro-average	 prec: 0.86564386, rec: 0.728577, f1: 0.79121816


Epoch 2/8 started, lr: 0.0029850747, dataset size: 4711


Epoch 2/8 - 237.14s - loss: 1023.5493 - batches: 472
Quality on test dataset: 
time to finish evaluation: 32.12s
label	 tp	 fp	 fn	 prec	 rec	 f1
I-Protein	 3382	 816	 696	 0.80562174	 0.8293281	 0.81730306
B-Protein	 2885	 494	 704	 0.8538029	 0.8038451	 0.8280712
tp: 6267 fp: 1310 fn: 

In [13]:
import pyspark.sql.functions as F

# Apply the fitted pipeline to the test sentences.
predictions = ner_model.transform(test_data)

# One row per token: ground-truth tag next to the predicted tag.
(
    predictions
    .select(
        F.explode(
            F.arrays_zip('token.result', 'label.result', 'ner.result')
        ).alias("cols")
    )
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ground_truth"),
        F.expr("cols['2']").alias("prediction"),
    )
    .show(truncate=False)
)

+--------------+------------+----------+
|token         |ground_truth|prediction|
+--------------+------------+----------+
|Daidzein      |O           |O         |
|and           |O           |O         |
|genistein     |O           |O         |
|glucuronides  |O           |O         |
|in            |O           |O         |
|vitro         |O           |O         |
|are           |O           |O         |
|weakly        |O           |O         |
|estrogenic    |O           |O         |
|and           |O           |O         |
|activate      |O           |O         |
|human         |O           |O         |
|natural       |O           |O         |
|killer        |O           |O         |
|cells         |O           |O         |
|at            |O           |O         |
|nutritionally |O           |O         |
|relevant      |O           |O         |
|concentrations|O           |O         |
|.             |O           |O         |
+--------------+------------+----------+
only showing top

In [14]:
from sklearn.metrics import classification_report, accuracy_score

# Collect the token-level (ground truth, prediction) pairs into pandas so
# scikit-learn can score them.
preds_df = (
    predictions
    .select(
        F.explode(
            F.arrays_zip('token.result', 'label.result', 'ner.result')
        ).alias("cols")
    )
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ground_truth"),
        F.expr("cols['2']").alias("prediction"),
    )
    .toPandas()
)

# Per-label precision / recall / F1, followed by overall token accuracy.
print(classification_report(preds_df['ground_truth'], preds_df['prediction']))
print(accuracy_score(preds_df['ground_truth'], preds_df['prediction']))

              precision    recall  f1-score   support

   B-Protein       0.88      0.85      0.86      3589
   I-Protein       0.89      0.86      0.87      4078
           O       0.98      0.99      0.99     66957

    accuracy                           0.97     74624
   macro avg       0.92      0.90      0.91     74624
weighted avg       0.97      0.97      0.97     74624

0.9739493996569468


In [15]:
# Display the stages of the fitted pipeline (embeddings model, then NER model).
ner_model.stages

[WORD_EMBEDDINGS_MODEL_48cffc8b9a76, NerDLModel_6514fb7c849c]

In [16]:
# Persist only the trained NER stage (index 1 of the fitted pipeline),
# overwriting any earlier save at the same path.
# NOTE(review): the directory name ends in "lr0.03" while the tagger was
# configured with setLr(0.003) — confirm the intended name before renaming,
# since the inference cell loads from this exact path.
ner_model.stages[1].write().overwrite().save("/content/drive/MyDrive/BioNLP09_IOB_NER_Model/Ner_glove_100d_e8_b10_lr0.03")

In [17]:
# --- Inference pipeline: raw text -> sentences -> tokens -> embeddings -> NER -> chunks ---

# Wrap the raw `text` column as a document annotation.
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# Split each document into sentences.
sentence = SentenceDetector()\
    .setInputCols(['document'])\
    .setOutputCol('sentence')

# Tokenise sentence by sentence.
token = Tokenizer()\
    .setInputCols(['sentence'])\
    .setOutputCol('token')

# Same pretrained embeddings as used for training. They are computed over
# `sentence` (not `document`) so the embeddings follow the same sentence
# segmentation as the tokens, the NER model and the chunk converter below,
# mirroring the sentence-level input the model was trained on.
glove_embeddings = WordEmbeddingsModel.pretrained("glove_100d", "en")\
    .setInputCols(["sentence", "token"])\
    .setOutputCol("embeddings")

# Trained NER model saved earlier in this notebook.
loaded_ner_model = NerDLModel.load("/content/drive/MyDrive/BioNLP09_IOB_NER_Model/Ner_glove_100d_e8_b10_lr0.03")\
    .setInputCols(["sentence", "token", "embeddings"])\
    .setOutputCol("ner")

# Merge B-/I- token tags into entity chunks.
converter = NerConverter()\
    .setInputCols(["sentence", "token", "ner"])\
    .setOutputCol("ner_span")

ner_prediction_pipeline = Pipeline(stages = [
      document,
      sentence,
      token,
      glove_embeddings,
      loaded_ner_model,
      converter
  ])

# Every stage is already trained or rule-based, so fitting on a one-row
# placeholder DataFrame is enough to obtain a PipelineModel.
empty_data = spark.createDataFrame([['']]).toDF("text")

prediction_model = ner_prediction_pipeline.fit(empty_data)

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [38]:
# Sample biomedical passage used to demonstrate the trained model on free text.
text = '''
MACROPHAGES ARE MONONUCLEAR phagocytes that reside within almost all tissues including adipose tissue, where they are identifiable as distinct populations with tissue-specific morphology, localization, and function (1). During the process of atherosclerosis, monocytes adhere to the endothelium and migrate into the intima, express scavenger receptors, and bind internalized lipoprotein particles resulting in the formation of foam cells (2). In obesity, adipose tissue contains an increased number of resident macrophages (3, 4). Macrophage accumulation in proportion to adipocyte size may increase the adipose tissue production of proinflammatory and acute-phase molecules and thereby contribute to the pathophysiological consequences of obesity (1, 3). These facts indicate that macrophages play an important role in a variety of diseases. When activated, macrophages release stereotypical profiles of cytokines and biological molecules such as nitric oxide TNF-α, IL-6, and IL-1 (5). TNF-α is a potent chemoattractant (6) and originates predominantly from residing mouse peritoneal macrophages (MPM) and mast cells (7). TNF-α induces leukocyte adhesion and degranulation, stimulates nicotinamide adenine dinucleotide phosphate (NADPH) oxidase, and enhances expression of IL-2 receptors and expression of E-selectin and intercellular adhesion molecules on the endothelium (8). TNF-α also stimulates expression of IL-1, IL-2, IL-6, and platelet-activating factor receptor (9). In addition, TNF-α decreases insulin sensitivity and increases lipolysis in adipocytes (10, 11). IL-6 also increase lipolysis and has been implicated in the hypertriglyceridemia and increased serum free fatty acid levels associated with obesity (12). Increased IL-6 signaling induces the expression of C-reactive protein and haptoglubin in liver (13). Recombinant IL-6 treatment increases atherosclerotic lesion size 5-fold (14). 
IL-6 also dose-dependently increases macrophage oxidative low-density lipoprotein (LDL) degradation and CD36 mRNA expression in vitro (15). These data clearly indicate that IL-6 and TNF-α are important pathogenetic factors associated with obesity, insulin resistance, and atherosclerosis. However, the factors regulating gene expression of these cytokines in macrophages have not been fully clarified.
'''

# Wrap the passage in a single-row DataFrame with the `text` column.
sample_data = spark.createDataFrame([[text]]).toDF("text")

# Preview the row, cutting the cell content at 100 characters.
sample_data.show(truncate=100)

+----------------------------------------------------------------------------------------------------+
|                                                                                                text|
+----------------------------------------------------------------------------------------------------+
|
MACROPHAGES ARE MONONUCLEAR phagocytes that reside within almost all tissues including adipose t...|
+----------------------------------------------------------------------------------------------------+



In [39]:
# Run the inference pipeline on the sample passage.
preds = prediction_model.transform(sample_data)

# One row per detected chunk with its entity type (read from chunk metadata).
# The DataFrame is kept in `result_df` and displayed separately: chaining
# `.show()` into the assignment would bind `result_df` to None, because
# `show()` only prints and returns nothing.
result_df = (
    preds
    .select(F.explode(F.arrays_zip("ner_span.result", "ner_span.metadata")).alias("entities"))
    .select(F.expr("entities['0']").alias("chunk"),
            F.expr("entities['1'].entity").alias("entity"))
)

result_df.show(truncate=False)

+--------------------------------------+-------+
|chunk                                 |entity |
+--------------------------------------+-------+
|IL-6                                  |Protein|
|IL-1                                  |Protein|
|dinucleotide phosphate (NADPH) oxidase|Protein|
|IL-2 receptors                        |Protein|
|E-selectin                            |Protein|
|IL-1                                  |Protein|
|IL-2                                  |Protein|
|IL-6                                  |Protein|
|insulin                               |Protein|
|IL-6                                  |Protein|
|haptoglubin                           |Protein|
|IL-6                                  |Protein|
|CD36                                  |Protein|
|IL-6                                  |Protein|
|insulin                               |Protein|
+--------------------------------------+-------+



In [40]:
from sparknlp.base import LightPipeline

# Wrap the fitted pipeline in a LightPipeline so a plain string can be
# annotated directly.
light_model = LightPipeline(prediction_model)
result = light_model.annotate(text)

# Pair every token with its predicted tag.
[(tok, tag) for tok, tag in zip(result['token'], result['ner'])]

[('MACROPHAGES', 'O'),
 ('ARE', 'O'),
 ('MONONUCLEAR', 'O'),
 ('phagocytes', 'O'),
 ('that', 'O'),
 ('reside', 'O'),
 ('within', 'O'),
 ('almost', 'O'),
 ('all', 'O'),
 ('tissues', 'O'),
 ('including', 'O'),
 ('adipose', 'O'),
 ('tissue', 'O'),
 (',', 'O'),
 ('where', 'O'),
 ('they', 'O'),
 ('are', 'O'),
 ('identifiable', 'O'),
 ('as', 'O'),
 ('distinct', 'O'),
 ('populations', 'O'),
 ('with', 'O'),
 ('tissue-specific', 'O'),
 ('morphology', 'O'),
 (',', 'O'),
 ('localization', 'O'),
 (',', 'O'),
 ('and', 'O'),
 ('function', 'O'),
 ('(', 'O'),
 ('1', 'O'),
 (').', 'O'),
 ('During', 'O'),
 ('the', 'O'),
 ('process', 'O'),
 ('of', 'O'),
 ('atherosclerosis', 'O'),
 (',', 'O'),
 ('monocytes', 'O'),
 ('adhere', 'O'),
 ('to', 'O'),
 ('the', 'O'),
 ('endothelium', 'O'),
 ('and', 'O'),
 ('migrate', 'O'),
 ('into', 'O'),
 ('the', 'O'),
 ('intima', 'O'),
 (',', 'O'),
 ('express', 'O'),
 ('scavenger', 'O'),
 ('receptors', 'O'),
 (',', 'O'),
 ('and', 'O'),
 ('bind', 'O'),
 ('internalized', 'O'),
 

In [41]:
import pandas as pd

# fullAnnotate keeps the annotation objects (result, begin, end, metadata)
# rather than only the result strings.
result = light_model.fullAnnotate(text)

# One row per token: sentence id, surface form, character offsets, predicted tag.
ner_df = pd.DataFrame(
    [
        (int(tok.metadata['sentence']), tok.result, tok.begin, tok.end, tag.result)
        for tok, tag in zip(result[0]["token"], result[0]["ner"])
    ],
    columns=['sent_id', 'token', 'start', 'end', 'ner'],
)
ner_df.head(15)

Unnamed: 0,sent_id,token,start,end,ner
0,0,MACROPHAGES,1,11,O
1,0,ARE,13,15,O
2,0,MONONUCLEAR,17,27,O
3,0,phagocytes,29,38,O
4,0,that,40,43,O
5,0,reside,45,50,O
6,0,within,52,57,O
7,0,almost,59,64,O
8,0,all,66,68,O
9,0,tissues,70,76,O


In [42]:
# Take the first (only) entry of the fullAnnotate result for the passage and
# list which annotation columns it contains.
ann_text = light_model.fullAnnotate(text)[0]
ann_text.keys()

dict_keys(['document', 'ner_span', 'token', 'ner', 'embeddings', 'sentence'])

In [43]:
from sparknlp_display import NerVisualizer

# Render the detected chunks over the original document text.
visualiser = NerVisualizer()
print('Standard Output')
visualiser.display(
    ann_text,
    label_col='ner_span',
    document_col='document',
)

Standard Output
