<a href="https://colab.research.google.com/github/gowthambalachandhiran/Spark-NLP-NER/blob/main/SparkNLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NER with BERT in Spark NLP

In [182]:
!python -V

Python 3.6.9


## Installation

In [1]:
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed pyspark==2.4.4

# Install Spark NLP
! pip install --ignore-installed spark-nlp

openjdk version "1.8.0_275"
OpenJDK Runtime Environment (build 1.8.0_275-8u275-b01-0ubuntu1~18.04-b01)
OpenJDK 64-Bit Server VM (build 25.275-b01, mixed mode)
Processing /root/.cache/pip/wheels/ab/09/4d/0d184230058e654eb1b04467dbc1292f00eaa186544604b471/pyspark-2.4.4-py2.py3-none-any.whl
Collecting py4j==0.10.7
  Using cached https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.7 pyspark-2.4.4
Collecting spark-nlp
  Using cached https://files.pythonhosted.org/packages/84/84/3f15673db521fbc4e8e0ec3677a019ba1458b2cb70f0f7738c221511ef32/spark_nlp-2.6.3-py2.py3-none-any.whl
Installing collected packages: spark-nlp
Successfully installed spark-nlp-2.6.3


## Import libraries and download datasets

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

import sparknlp
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

In [3]:
spark = sparknlp.start()
spark = sparknlp.start(gpu=True)

In [4]:
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  2.6.3
Apache Spark version:  2.4.4


In [5]:
def start(gpu=False):
    builder = SparkSession.builder \
        .appName("Spark NLP") \
        .master("local[*]") \
        .config("spark.driver.memory", "8G") \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")\
        .config("spark.kryoserializer.buffer.max", "1000M")
    if gpu:
        builder.config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp-gpu_2.11:2.5.1")
    else:
        builder.config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.5.1")

    return builder.getOrCreate()

  
spark = start(gpu=False)

In [7]:
from urllib.request import urlretrieve

urlretrieve('https://github.com/JohnSnowLabs/spark-nlp/raw/master/src/test/resources/conll2003/eng.train',
           'eng.train')

urlretrieve('https://github.com/JohnSnowLabs/spark-nlp/raw/master/src/test/resources/conll2003/eng.testa',
           'eng.testa')


('eng.testa', <http.client.HTTPMessage at 0x7f2fe47f5278>)

In [8]:
import pandas as pd
ncbi=pd.read_csv('/content/training_dataframe.csv')
ncbi['pos']='NN'

In [9]:
ncbi.columns

Index(['Sentence', 'Token', 'Tag', 'pos'], dtype='object')

This disease data set is open data set and annotaded by body of NCBI data set

In [285]:
conll_lines="-DOCSTART- -X- -X- -O-\n\n"
save=0
file_loc = '/content/test.txt'
for sent, token, pos, label in zip(ncbi['Sentence'],ncbi['Token'],ncbi['pos'],ncbi['Tag']): 
    
# If the sentence ID has changed, that means we are starting a new sentence. We have to add an empty line.
    
    if save!=sent:
        conll_lines+='\n'
    
# Save the conll line
    
    conll_lines += "{} {} {} {}\n".format(token,pos,pos,label)
    
    save=sent
    

# Now print all of the lines to a text file

with open(file_loc,'w') as txtfile:
        
    for line in conll_lines:
        txtfile.write(line)

txtfile.close()

with open(file_loc,'r') as f:
    lines=f.readlines()[0:25]
f.close()
lines


['-DOCSTART- -X- -X- -O-\n',
 '\n',
 '\n',
 'Identification NN NN O\n',
 'of NN NN O\n',
 'APC2 NN NN O\n',
 ', NN NN O\n',
 'a NN NN O\n',
 'homologue NN NN O\n',
 'of NN NN O\n',
 'the NN NN O\n',
 'adenomatous NN NN B-Disease\n',
 'polyposis NN NN I-Disease\n',
 'coli NN NN I-Disease\n',
 'tumour NN NN I-Disease\n',
 'suppressor NN NN O\n',
 '. NN NN O\n',
 '\n',
 'The NN NN O\n',
 'adenomatous NN NN B-Disease\n',
 'polyposis NN NN I-Disease\n',
 'coli NN NN I-Disease\n',
 '( NN NN I-Disease\n',
 'APC NN NN I-Disease\n',
 ') NN NN I-Disease\n']

In [286]:
with open("/content/test.txt") as f:
    c=f.read()

print (c[:500])

-DOCSTART- -X- -X- -O-


Identification NN NN O
of NN NN O
APC2 NN NN O
, NN NN O
a NN NN O
homologue NN NN O
of NN NN O
the NN NN O
adenomatous NN NN B-Disease
polyposis NN NN I-Disease
coli NN NN I-Disease
tumour NN NN I-Disease
suppressor NN NN O
. NN NN O

The NN NN O
adenomatous NN NN B-Disease
polyposis NN NN I-Disease
coli NN NN I-Disease
( NN NN I-Disease
APC NN NN I-Disease
) NN NN I-Disease
tumour NN NN I-Disease
- NN NN O
suppressor NN NN O
protein NN NN O
controls NN NN O
the NN NN O


Creating conll format of data set for training

## Building NER pipeline

In [287]:
from sparknlp.training import CoNLL

training_data = CoNLL().readDataset(spark, '/content/test.txt')
training_data.show(20)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Identification of...|[[document, 0, 89...|[[document, 0, 89...|[[token, 0, 13, I...|[[pos, 0, 13, NN,...|[[named_entity, 0...|
|The adenomatous p...|[[document, 0, 21...|[[document, 0, 21...|[[token, 0, 2, Th...|[[pos, 0, 2, NN, ...|[[named_entity, 0...|
|Complex formation...|[[document, 0, 63...|[[document, 0, 63...|[[token, 0, 6, Co...|[[pos, 0, 6, NN, ...|[[named_entity, 0...|
|In colon carcinom...|[[document, 0, 19...|[[document, 0, 19...|[[token, 0, 1, In...|[[pos, 0, 1, NN, ...|[[named_entity, 0...|
|Here , we report ...|[[document, 0, 76...|[[document, 0, 76...|[[token, 0, 3, He...|[[pos, 0, 3, NN, ..

In [288]:
training_data.count()

3641

In [289]:
from pyspark.sql import functions as F

training_data.select(F.explode(F.arrays_zip('token.result','label.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ground_truth")).groupBy('ground_truth').count()\
        .orderBy('count', ascending=False).show(100,truncate=False)

+------------+-----+
|ground_truth|count|
+------------+-----+
|O           |72040|
|I-Disease   |3375 |
|B-Disease   |2971 |
|nan         |1    |
+------------+-----+



Training and Evaluating NerDL¶
NerDL is a deep learning named entity recognition model in the SparkNLP library which does not require training data to contain parts-of-speech. It is a Bidirectional LSTM-CNN. For a more detailed overview of training a model using NerDL, you can check out this post. We've already loaded the BC5CDR-Chem test and train datasets. Now I can show you how to add Glove embeddings and save the test data as a parquet file before NerDL model training


Neural Network architecture is Char CNNs - BiLSTM - CRF that achieves state-of-the-art in most datasets.
Output type: Named_Entity
Input types: Document, Token, Word_Embeddings
Reference: NerDLApproach | NerDLModel


https://nlp.johnsnowlabs.com/docs/en/annotators

In [290]:
glove_embeddings = WordEmbeddingsModel.pretrained('glove_100d')\
          .setInputCols(["document", "token"])\
          .setOutputCol("embeddings")
nerTagger = NerDLApproach()\
        .setInputCols(["sentence", "token", "embeddings"])\
        .setLabelColumn("label")\
        .setOutputCol("ner")\
        .setMaxEpochs(4)\
        .setLr(0.002)\
        .setPo(0.005)\
        .setBatchSize(16)\
        .setRandomSeed(0)\
        .setVerbose(1)\
        .setValidationSplit(0.2)\
        .setEvaluationLogExtended(True) \
        .setEnableOutputLogs(True)\
        .setIncludeConfidence(True)\
        .setOutputLogsPath('ner_logs')
ner_pipeline_glove = Pipeline(stages=[
    glove_embeddings,
    nerTagger
])

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [291]:
ner_model_glove = ner_pipeline_glove.fit(training_data)
ner_model_glove.stages[1].write().overwrite().save("/content/Tr_NER_DL")

In [293]:
test_data = glove_embeddings.transform(test)
predictions = ner_model_glove.transform(test_data)
from sklearn.metrics import classification_report
import pyspark.sql.functions as F

preds = predictions.select(F.explode(F.arrays_zip('token.result','label.result','ner.result')).alias("cols")) \
        .select(F.col('cols.0').alias("token"),
        F.col('cols.1').alias("label"),
        F.col('cols.2').alias("ner"))





In [294]:
#Convert the Spark dataframe to a Pandas dataframe.
import pandas as pd
preds_df_dl =preds.toPandas()

In [295]:
print (classification_report(preds_df_dl['label'], preds_df_dl['ner']))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

   B-Disease       0.86      0.90      0.88      2971
   I-Disease       0.92      0.83      0.88      3375
           O       0.99      0.99      0.99     72040
         nan       0.00      0.00      0.00         1

    accuracy                           0.98     78387
   macro avg       0.69      0.68      0.69     78387
weighted avg       0.98      0.98      0.98     78387



In [296]:
document = DocumentAssembler()\
  .setInputCol("text")\
  .setOutputCol("document")
sentence = SentenceDetector()\
  .setInputCols(['document'])\
  .setOutputCol('sentence')
token = Tokenizer()\
  .setInputCols(['sentence'])\
  .setOutputCol('token')
glove_embeddings = WordEmbeddingsModel() \
  .pretrained('glove_100d')\
  .setInputCols(["sentence",'token'])\
  .setOutputCol("embeddings")\
  .setCaseSensitive(True)
loaded_ner_model = NerDLModel.load("/content/Tr_NER_DL")\
  .setInputCols(["sentence", "token", "embeddings"])\
  .setOutputCol("ner")
converter = NerConverter()\
  .setInputCols(["document", "token", "ner"])\
  .setOutputCol("ner_span")
ner_prediction_pipeline = Pipeline(
  stages = [document,
            sentence,
            token,
            glove_embeddings,
            loaded_ner_model,
            converter])

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [297]:
empty_data = spark.createDataFrame([['']]).toDF("text")
prediction_model = ner_prediction_pipeline.fit(empty_data)
prediction_model.transform(empty_data)

DataFrame[text: string, document: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, sentence: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, token: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, embeddings: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, ner: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, ner_span: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>]

In [298]:
from sparknlp.base import LightPipeline
light_model = LightPipeline(prediction_model)
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/utils/ner_highlighter.py
import ner_highlighter

In [299]:
text = """Coronavirus disease (COVID-19) is an infectious disease caused adenomatous  by a newly discovered coronavirus.

Most people infected with the COVID-19 virus will experience mild to moderate respiratory illness and recover without requiring special treatment.  Older people, and those with underlying medical problems like cardiovascular disease, diabetes, chronic respiratory disease, and cancer are more likely to develop serious illness.

The best way to prevent and slow down transmission is to be well informed about the COVID-19 virus, the disease it causes and how it spreads. Protect yourself and others from infection by washing your hands or using an alcohol based rub frequently and not touching your face. 
"""

In [300]:
result = light_model.fullAnnotate(text)[0]
ner_highlighter.chunk_highlighter(result, entity_column="ner_span")

Bidirectional Encoder Representations from Transformers is a Transformer-based machine learning technique for natural language processing pre-training developed by Google.

In [301]:
bert = BertEmbeddings.pretrained() \
 .setInputCols(["sentence", "token"])\
 .setOutputCol("bert")\
 .setCaseSensitive(False)

small_bert_L2_768 download started this may take some time.
Approximate size to download 139.6 MB
[OK!]


In [190]:
training_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Identification of...|[[document, 0, 89...|[[document, 0, 89...|[[token, 0, 13, I...|[[pos, 0, 13, NN,...|[[named_entity, 0...|
|The adenomatous p...|[[document, 0, 21...|[[document, 0, 21...|[[token, 0, 2, Th...|[[pos, 0, 2, NN, ...|[[named_entity, 0...|
|Complex formation...|[[document, 0, 63...|[[document, 0, 63...|[[token, 0, 6, Co...|[[pos, 0, 6, NN, ...|[[named_entity, 0...|
|In colon carcinom...|[[document, 0, 19...|[[document, 0, 19...|[[token, 0, 1, In...|[[pos, 0, 1, NN, ...|[[named_entity, 0...|
|Here , we report ...|[[document, 0, 76...|[[document, 0, 76...|[[token, 0, 3, He...|[[pos, 0, 3, NN, ..

In [191]:
%%time
from pathlib import Path


# WARNING: Setting benchmark to true is  slow and might crash your system and is not recommended on standardCollab notebooks-- High end hardware and/or GPU required
## dataframe.cache() does not solve this. Results must be serialized to disk for maximum efficiency
### You might need to restart your driver after this step finishes
benchmark = False 


with_bert_path = "./with_bert.parquet"
if benchmark == True :
  if not Path(with_bert_path).is_dir(): 
    bert.transform(training_data).write.parquet("./with_bert.parquet")
    training_with_bert = spark.read.parquet("./with_bert.parquet").cache()
else : training_with_bert = bert.transform(training_data)


print(training_with_bert.count())
training_with_bert.select("token", "bert").show()

3641
+--------------------+--------------------+
|               token|                bert|
+--------------------+--------------------+
|[[token, 0, 13, I...|[[word_embeddings...|
|[[token, 0, 2, Th...|[[word_embeddings...|
|[[token, 0, 6, Co...|[[word_embeddings...|
|[[token, 0, 1, In...|[[word_embeddings...|
|[[token, 0, 3, He...|[[word_embeddings...|
|[[token, 0, 8, Ma...|[[word_embeddings...|
|[[token, 0, 3, Li...|[[word_embeddings...|
|[[token, 0, 4, Hu...|[[word_embeddings...|
|[[token, 0, 0, 3,...|[[word_embeddings...|
|[[token, 0, 2, AP...|[[word_embeddings...|
|[[token, 0, 0, A,...|[[word_embeddings...|
|[[token, 0, 2, Th...|[[word_embeddings...|
|[[token, 0, 2, Th...|[[word_embeddings...|
|[[token, 0, 7, Al...|[[word_embeddings...|
|[[token, 0, 1, In...|[[word_embeddings...|
|[[token, 0, 1, To...|[[word_embeddings...|
|[[token, 0, 5, Wi...|[[word_embeddings...|
|[[token, 0, 1, In...|[[word_embeddings...|
|[[token, 0, 4, Th...|[[word_embeddings...|
|[[token, 0, 1, We...|[[wor

In [192]:

nerTagger = NerDLApproach()\
  .setInputCols(["sentence", "token", "bert"])\
  .setLabelColumn("label")\
  .setOutputCol("ner")\
  .setMaxEpochs(1)\
  .setRandomSeed(0)\
  .setVerbose(0)

converter = NerConverter()\
  .setInputCols(["document", "token", "ner"])\
  .setOutputCol("ner_span")

pipeline = Pipeline(
    stages = [
    nerTagger,
    converter
  ])

In [194]:
%%time
import time
start = time.time()
print("Start fitting")
#We have to limit the rows in Collab, otherwise we will encounter exceptions because of RAM limitations
model = pipeline.fit(training_with_bert)  
print("Fitting is ended")
print (time.time() - start)

Start fitting
Fitting is ended
415.52539229393005
CPU times: user 114 ms, sys: 17.8 ms, total: 132 ms
Wall time: 6min 55s


In [223]:
from sklearn.metrics import classification_report
import pyspark.sql.functions as F

preds = predictions.select(F.explode(F.arrays_zip('token.result','label.result','ner.result')).alias("cols")) \
        .select(F.col('cols.0').alias("token"),
        F.col('cols.1').alias("label"),
        F.col('cols.2').alias("ner"))

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|          embeddings|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Identification of...|[[document, 0, 89...|[[document, 0, 89...|[[token, 0, 13, I...|[[pos, 0, 13, NN,...|[[named_entity, 0...|[[word_embeddings...|
|The adenomatous p...|[[document, 0, 21...|[[document, 0, 21...|[[token, 0, 2, Th...|[[pos, 0, 2, NN, ...|[[named_entity, 0...|[[word_embeddings...|
|Complex formation...|[[document, 0, 63...|[[document, 0, 63...|[[token, 0, 6, Co...|[[pos, 0, 6, NN, ...|[[named_entity, 0...|[[word_embeddings...|
|In colon carcinom...|[[document, 0, 19...|[[document, 0, 19...|[[token, 0, 1, In...|[[pos, 0, 1, NN, ...|

In [195]:
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

sentence = SentenceDetector()\
    .setInputCols(['document'])\
    .setOutputCol('sentence')

token = Tokenizer()\
    .setInputCols(['sentence'])\
    .setOutputCol('token')

prediction_pipeline = Pipeline(
    stages = [
        document,
        sentence,
        token,
        bert,
        model
    ]
)

In [204]:
prediction_data = spark.createDataFrame([[""]]).toDF("text")
prediction_model_bert = prediction_pipeline.fit(empty_data)
prediction_model.transform(empty_data)

DataFrame[text: string, document: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, sentence: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, token: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, bert: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, ner: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, ner_span: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>]

In [302]:
from sparknlp.base import LightPipeline
light_model = LightPipeline(prediction_model_bert)
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/utils/ner_highlighter.py
import ner_highlighter

In [303]:
result = light_model.fullAnnotate(text)[0]
ner_highlighter.chunk_highlighter(result, entity_column="ner_span")


In [235]:
word_embeddings = WordEmbeddingsModel.pretrained('glove_100d')\
          .setInputCols(["document", "token"])\
          .setOutputCol("embeddings")

nerTagger = NerCrfApproach()\
    .setInputCols(["sentence", "token", "pos","embeddings"])\
    .setLabelColumn("label")\
    .setOutputCol("ner")\
    .setMaxEpochs(9)\
    
ner_pipeline = Pipeline(stages=[
          word_embeddings,
          nerTagger
 ])

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [237]:
ner_model = ner_pipeline.fit(training_data)

In [238]:
from sparknlp.training import CoNLL

file_loc='/content/test.txt'
test = CoNLL().readDataset(spark, file_loc)

test_data = word_embeddings.transform(test)

In [240]:
predictions = ner_model.transform(test_data)


You can see all of your input and output columns in the final "predictions" dataframe, but I'll focus on the 'ner' column which contains the prediction, and the 'label' column which contains the ground truth. You can use sklearn.metrics classification_report to check the accuracy of the predictions using these 2 columns.

In [241]:
from sklearn.metrics import classification_report
import pyspark.sql.functions as F

preds = predictions.select(F.explode(F.arrays_zip('token.result','label.result','ner.result')).alias("cols")) \
        .select(F.col('cols.0').alias("token"),
        F.col('cols.1').alias("label"),
        F.col('cols.2').alias("ner"))

In [242]:
#Convert the Spark dataframe to a Pandas dataframe.
import pandas as pd
preds_df=preds.toPandas()

In [243]:
print(classification_report(preds_df['label'], preds_df['ner']))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

   B-Disease       0.92      0.74      0.82      2971
   I-Disease       0.95      0.74      0.83      3375
           O       0.98      1.00      0.99     72040
         nan       0.00      0.00      0.00         1

    accuracy                           0.98     78387
   macro avg       0.71      0.62      0.66     78387
weighted avg       0.98      0.98      0.98     78387



In [245]:
ner_model.stages[1].write().overwrite().save("crf_model_14_11_2020")

In [247]:
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

sentence = SentenceDetector()\
    .setInputCols(['document'])\
    .setOutputCol('sentence')

token = Tokenizer()\
    .setInputCols(['sentence'])\
    .setOutputCol('token')

word_embeddings = WordEmbeddingsModel.pretrained('glove_100d')\
          .setInputCols(["document", "token"])\
          .setOutputCol("embeddings")

loaded_ner_model = NerCrfModel.load("crf_model_14_11_2020")\
  .setInputCols(["sentence", "token", "embeddings"])\
  .setOutputCol("ner")

ner_prediction_pipeline = Pipeline(
  stages = [document,
            sentence,
            token,
            word_embeddings,
            loaded_ner_model])



glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [271]:
prediction_data = spark.createDataFrame([["""Coronavirus disease (COVID-19) is an infectious disease caused by a newly discovered coronavirus."""]]).toDF("text")
prediction_data.show()

+--------------------+
|                text|
+--------------------+
|Coronavirus disea...|
+--------------------+



In [276]:
prediction_model = ner_prediction_pipeline.fit(prediction_data)

In [277]:

%%time

lp = LightPipeline(prediction_model)
result = lp.annotate("Patient was suffering from heart disease")
for e in list(zip(result['token'], result['ner'])):
    print(e)

('Patient', 'O')
('was', 'O')
('suffering', 'O')
('from', 'O')
('heart', 'B-Disease')
('disease', 'I-Disease')
CPU times: user 51.8 ms, sys: 15 ms, total: 66.8 ms
Wall time: 120 ms
