# NER System Training

## Setup

In [None]:
# Mount Google Drive in the Colab runtime: every dataset and model path used
# below lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Download the John Snow Labs Colab setup script and pipe it straight to bash;
# on the recorded run it installed PySpark 3.0.3 and Spark NLP 3.1.2.
# NOTE(review): no version is pinned here, so a later run may install
# different versions than the ones this notebook was written against — confirm.
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2021-07-07 16:08:16--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2021-07-07 16:08:16--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1608 (1.6K) [text/plain]
Saving to: ‘STDOUT’


2021-07-07 16:08:16 (29.5 MB/s) - written to stdout [1608/1608]

setup Colab for PySpark 3.0.3 and Spark NLP 3.1.2
Get:1 http://security.ubuntu.com/ubuntu 

## Import Moduli

In [None]:
# Spark SQL / ML building blocks used throughout the notebook.
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col, arrays_zip, monotonically_increasing_id, regexp_replace
from pyspark.ml import Pipeline

# Spark NLP: the star imports provide the annotators used below
# (SentenceDetector, PerceptronModel, NerConverter, BertEmbeddings,
# NerDLApproach, NerDLModel).
import sparknlp
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from sparknlp.training import CoNLL

# Start the Spark NLP session with GPU support; the commented line below is
# the CPU-only alternative.
spark = sparknlp.start(gpu=True)
# spark = sparknlp.start()

print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

# Utilities for model naming (datetime), evaluation (pandas, sklearn) and
# plotting (matplotlib).
import datetime
import pandas as pd
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import sys
import numpy as np

Spark NLP version:  3.1.2
Apache Spark version:  3.0.3


In [None]:
# Request a session named "App" with larger executor/driver memory, 12g of
# off-heap memory and a long broadcast timeout.
# NOTE(review): a session was already created by sparknlp.start() in the
# previous cell, so getOrCreate() presumably returns that existing session and
# JVM-level settings such as spark.driver.memory may not take effect — confirm.
spark = SparkSession.builder.appName("App")\
  .config("spark.executor.memory", "12g")\
  .config("spark.driver.memory", "5g")\
  .config("spark.memory.offHeap.enabled",True)\
  .config("spark.memory.offHeap.size","12g")\
  .config("spark.sql.broadcastTimeout", "360000")\
  .getOrCreate()

In [None]:
# Display the active SparkSession as the cell output.
spark

## Dataset

In [None]:
# Load the annotated symptoms/diseases file as plain text: one row per line,
# stored in a single "value" column. Preview the first rows.
symptoms_path = "/content/drive/MyDrive/final-project-BDABI/dataset/symptoms_diseases.txt"
df_sym = spark.read.text(symptoms_path)
df_sym.show(10)

## Preprocessing

In [None]:
# Two header rows to put in front of the data: the CoNLL document marker
# followed by an empty line.
header_rows = [['-DOCSTART- -X- -X- O'], ['']]
intestazione = spark.createDataFrame(header_rows, ['value'])

# Replace every tab with three spaces.
# NOTE(review): presumably this turns a "token<TAB>label" line into four
# space-separated CoNLL fields with two empty middle ones — confirm against
# the raw file.
spaced_lines = df_sym.withColumn("value", regexp_replace("value", '\t', '   '))

# Header first, then the converted lines.
df_sym = intestazione.union(spaced_lines)

In [None]:
# Write the CoNLL-formatted lines to Drive as a single UTF-8 text part file
# (coalesce(1) collapses the data into one partition, hence one output file).
training_dir = "/content/drive/MyDrive/final-project-BDABI/dataset/training_set"
single_part_writer = (
    df_sym.coalesce(1)
    .write
    .format("text")
    .option("header", "false")
    .option("encoding", "UTF-8")
)
single_part_writer.save(training_dir)

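**Nota:** prima di eseguire la cella successiva è necessario rinominare manualmente il file appena salvato nella cartella `training_set` (Spark lo scrive con un nome del tipo `part-*.txt`) in `training_set.txt`, che è il percorso letto da `CoNLL().readDataset`.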

In [None]:
# Parse the (manually renamed) CoNLL file into a Spark NLP training DataFrame
# and preview the first rows.
conll_path = "/content/drive/MyDrive/final-project-BDABI/dataset/training_set/training_set.txt"
conll_reader = CoNLL()
df = conll_reader.readDataset(spark, conll_path)
df.show(10)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Ex fumatore , ipe...|[[document, 0, 17...|[[document, 0, 17...|[[token, 0, 1, Ex...|[[pos, 0, 1, , [w...|[[named_entity, 0...|
|Cardiopatia ische...|[[document, 0, 12...|[[document, 0, 12...|[[token, 0, 10, C...|[[pos, 0, 10, , [...|[[named_entity, 0...|
|EOC : toni puri s...|[[document, 0, 10...|[[document, 0, 10...|[[token, 0, 2, EO...|[[pos, 0, 2, , [w...|[[named_entity, 0...|
|Iperteso in terap...|[[document, 0, 51...|[[document, 0, 51...|[[token, 0, 7, Ip...|[[pos, 0, 7, , [w...|[[named_entity, 0...|
|Attuale buon comp...|[[document, 0, 37...|[[document, 0, 37...|[[token, 0, 6, At...|[[pos, 0, 6, , [w..

Si rilevano le **sentence** e si convertono le **label** associate alle entità (e.g. *B-Symptom* e *I-Symptom*) in *chunk* (e.g. *Symptom*).

In [None]:
# Split each document into sentences.
sentence = SentenceDetector() \
  .setInputCols(["document"]) \
  .setOutputCol("sentence")

# Pretrained Italian part-of-speech tagger ("pos_ud_isdt").
pos = PerceptronModel.pretrained("pos_ud_isdt", "it") \
  .setInputCols(["document", "token"]) \
  .setOutputCol("pos")

# Merge the IOB labels (e.g. B-Symptom / I-Symptom) into entity chunks.
converter = NerConverter() \
  .setInputCols(["document", "token", "label"]) \
  .setOutputCol("chunk")

preproc_pipeline = Pipeline(
  stages = [
    sentence,
    pos,
    converter
  ])

df = preproc_pipeline.fit(df).transform(df)

# On Colab, display() of a Spark DataFrame only prints its schema repr, not
# its rows; use show() to actually preview the data, as the other cells do.
df.show(10)

pos_ud_isdt download started this may take some time.
Approximate size to download 2.3 MB
[OK!]


DataFrame[text: string, document: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, sentence: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, token: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, pos: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, label: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, chunk: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>]

In [None]:
# Compare the total number of texts with the number of distinct texts to
# reveal duplicated entries.
texts = df.select("text")

total_entries = texts.count()
distinct_entries = texts.distinct().count()

print("Numero di entry: ", total_entries)
print("Numero di entry distinte:", distinct_entries)

Numero di entry:  5638
Numero di entry distinte: 5410


Si eliminano le entry duplicate.

In [None]:
# Keep a single row for each distinct text.
df = df.dropDuplicates(["text"])

remaining_entries = df.count()
print("Numero di entry: ", remaining_entries)

Numero di entry:  5410


## Data Analysis

### Sentence

In [None]:
# One row per detected sentence (the "sentence.result" array exploded).
sentences = df.select(explode(col("sentence.result")).alias("sentence"))

n_sentences = sentences.count()
n_distinct_sentences = sentences.distinct().count()

print("Numero di sentence:", n_sentences)
print("Numero di sentence distinte:", n_distinct_sentences)

Numero di sentence: 20654
Numero di sentence distinte: 11568


### Token

In [None]:
# One row per token (the "token.result" array exploded).
tokens = df.select(explode(col("token.result")).alias("token"))

n_tokens = tokens.count()
n_distinct_tokens = tokens.distinct().count()

print("Numero di token:", n_tokens)
print("Numero di token distinti:", n_distinct_tokens)

Numero di token: 218225
Numero di token distinti: 7945


### POS

In [None]:
# Pair every token with its POS tag: zip the two parallel arrays and explode
# the result into one (token, pos) row per token.
token_pos_arrays = df.select(
    col("token.result").alias("token"),
    col("pos.result").alias("pos"),
)
token_pos_pairs = token_pos_arrays.withColumn("tmp", explode(arrays_zip("token", "pos")))
pos = token_pos_pairs.select("tmp.token", "tmp.pos")

n_pairs = pos.count()
n_distinct_pairs = pos.distinct().count()

print("Numero di coppie (token, pos):", n_pairs)
print("Numero di coppie (token, pos) distinte:", n_distinct_pairs)

Numero di coppie (token, pos): 218225
Numero di coppie (token, pos) distinte: 10139


### Label

In [None]:
# Pair every token with its gold IOB label: zip the two parallel arrays and
# explode the result into one (token, label) row per token.
token_label_arrays = df.select(
    col("token.result").alias("token"),
    col("label.result").alias("label"),
)
token_label_pairs = token_label_arrays.withColumn("tmp", explode(arrays_zip("token", "label")))
labels = token_label_pairs.select("tmp.token", "tmp.label")

n_pairs = labels.count()
n_distinct_pairs = labels.distinct().count()

print("Numero di coppie (token, label):", n_pairs)
print("Numero di coppie (token, label) distinte:", n_distinct_pairs)

Numero di coppie (token, label): 218225
Numero di coppie (token, label) distinte: 10295


Ad alcuni token sono associate diverse label (10295 != 7945).

In [None]:
# Label frequencies: first over all (token, label) occurrences, then over the
# distinct (token, label) pairs only.
for pairs in (labels, labels.distinct()):
    pairs.groupBy("label").count().show()

+---------+------+
|    label| count|
+---------+------+
|I-Symptom| 15146|
|        O|184067|
|B-Symptom|  4892|
|B-Disease|  6131|
|I-Disease|  7989|
+---------+------+

+---------+-----+
|    label|count|
+---------+-----+
|I-Symptom|  934|
|        O| 7018|
|B-Symptom|  605|
|B-Disease|  775|
|I-Disease|  963|
+---------+-----+



### Chunk

In [None]:
# One row per entity chunk (the "chunk.result" array exploded).
chunks = df.select(explode(col("chunk.result")).alias("chunk"))

n_chunks = chunks.count()
n_distinct_chunks = chunks.distinct().count()

print("Numero di chunk:", n_chunks)
print("Numero di chunk distinti:", n_distinct_chunks)

Numero di chunk: 11109
Numero di chunk distinti: 3357


### Entity

In [None]:
# Pair every chunk with the entity type stored in its metadata: zip the chunk
# texts with their metadata maps and explode into one row per chunk.
chunk_arrays = df.select("chunk.result", "chunk.metadata")
chunk_rows = chunk_arrays.withColumn("tmp", explode(arrays_zip("result", "metadata")))
entities = chunk_rows.select(
    col("tmp.result").alias("chunk"),
    col("tmp.metadata.entity"),
)

n_pairs = entities.count()
n_distinct_pairs = entities.distinct().count()

print("Numero di coppie (chunk, entity):", n_pairs)
print("Numero di coppie (chunk, entity) distinte:", n_distinct_pairs)

Numero di coppie (chunk, entity): 11109
Numero di coppie (chunk, entity) distinte: 3524


Ad alcuni chunk sono associate diverse entity (3524 != 3357).

In [None]:
# Entity-type frequencies: first over all chunk occurrences, then over the
# distinct (chunk, entity) pairs only.
for pairs in (entities, entities.distinct()):
    pairs.groupBy("entity").count().show()

+-------+-----+
| entity|count|
+-------+-----+
|Disease| 6167|
|Symptom| 4942|
+-------+-----+

+-------+-----+
| entity|count|
+-------+-----+
|Disease| 1736|
|Symptom| 1788|
+-------+-----+



## Data Preparation

In [None]:
# Tag every row with an id so that the test rows can be removed from the
# training set with an anti-join.
# monotonically_increasing_id() is non-deterministic (it depends on how rows
# are partitioned), and `temp` is evaluated twice below: once for the sample
# and once as the left side of the join. Caching pins the ids so both
# evaluations see the same assignment; otherwise test rows could leak into
# the training set or rows could be lost.
temp = df.withColumn("id", monotonically_increasing_id()).cache()

# ~10% of the rows, sampled without replacement with a fixed seed, form the
# test set; everything else is the training set.
df_test = temp.sample(False, 0.1, seed=0)
df_train = temp.join(df_test, on="id", how="left_anti")

In [None]:
# Report the absolute and relative size of the two splits.
dataset_size = df.count()
train_size = df_train.count()
test_size = df_test.count()

train_pct = train_size/dataset_size * 100
test_pct = test_size/dataset_size * 100

print("Training set:", train_size, f"( {train_pct} % )")
print("Test set:", test_size, f"( {test_pct} % )")

Training set: 4835 ( 89.37153419593345 % )
Test set: 575 ( 10.628465804066543 % )


### Training Set

In [None]:
# (chunk, entity) pairs of the training split: zip the chunk texts with their
# metadata maps and explode into one row per chunk.
train_chunk_arrays = df_train.select("chunk.result", "chunk.metadata")
train_chunk_rows = train_chunk_arrays.withColumn("tmp", explode(arrays_zip("result", "metadata")))
train_entities = train_chunk_rows.select(
    col("tmp.result").alias("chunk"),
    col("tmp.metadata.entity"),
)

In [None]:
# Entity-type frequencies in the training split: all occurrences first, then
# distinct (chunk, entity) pairs only.
for pairs in (train_entities, train_entities.distinct()):
    pairs.groupBy("entity").count().show()

+-------+-----+
| entity|count|
+-------+-----+
|Disease| 5506|
|Symptom| 4423|
+-------+-----+

+-------+-----+
| entity|count|
+-------+-----+
|Disease| 1600|
|Symptom| 1648|
+-------+-----+



### Test Set

In [None]:
# (chunk, entity) pairs of the test split: zip the chunk texts with their
# metadata maps and explode into one row per chunk.
test_chunk_arrays = df_test.select("chunk.result", "chunk.metadata")
test_chunk_rows = test_chunk_arrays.withColumn("tmp", explode(arrays_zip("result", "metadata")))
test_entities = test_chunk_rows.select(
    col("tmp.result").alias("chunk"),
    col("tmp.metadata.entity"),
)

In [None]:
# Entity-type frequencies in the test split: all occurrences first, then
# distinct (chunk, entity) pairs only.
for pairs in (test_entities, test_entities.distinct()):
    pairs.groupBy("entity").count().show()

+-------+-----+
| entity|count|
+-------+-----+
|Disease|  661|
|Symptom|  519|
+-------+-----+

+-------+-----+
| entity|count|
+-------+-----+
|Disease|  368|
|Symptom|  325|
+-------+-----+



## Deep Learning Approach

In [None]:
# Pretrained uncased Italian BERT embeddings, computed per token within each
# sentence.
bert = (
    BertEmbeddings.pretrained("bert_base_italian_uncased", lang="it")
    .setInputCols("sentence", "token")
    .setOutputCol("bert")
    .setCaseSensitive(False)
)

# NER tagger trained on the gold "label" column on top of the BERT embeddings.
# With lr=1e-3 and po=0.005 the training log reports the learning rate
# shrinking every epoch (0.001, 9.950249E-4, 9.90099E-4, ...).
DL_ner = (
    NerDLApproach()
    .setInputCols(["sentence", "token", "bert"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    .setMaxEpochs(10)
    .setLr(1e-3)
    .setPo(0.005)
    .setBatchSize(8)
    .setRandomSeed(0)
    .setVerbose(2)
    .setEnableOutputLogs(True)
)

# Embeddings first, then the trainable tagger.
DL_pipeline = Pipeline(stages=[bert, DL_ner])

bert_base_italian_uncased download started this may take some time.
Approximate size to download 393.2 MB
[OK!]


### Training

In [None]:
%%time
# Fit the BERT + NerDL pipeline on the training split; %%time reports how long
# the cell takes (the log below shows roughly 315-335 s per epoch).
DL_model = DL_pipeline.fit(df_train)

In [None]:
! cat /root/annotator_logs/NerDLApproach_a049c8b4816c.log

Name of the selected graph: ner-dl/blstm_10_768_128_120.pb
Training started - total epochs: 10 - lr: 0.001 - batch size: 8 - labels: 5 - chars: 76 - training examples: 18427


Epoch 1/10 started, lr: 0.001, dataset size: 18427


Epoch 1/10 - 315.22s - loss: 6866.2095 - batches: 2305


Epoch 2/10 started, lr: 9.950249E-4, dataset size: 18427


Epoch 2/10 - 316.66s - loss: 3910.9993 - batches: 2305


Epoch 3/10 started, lr: 9.90099E-4, dataset size: 18427


Epoch 3/10 - 335.70s - loss: 3332.9546 - batches: 2305


Epoch 4/10 started, lr: 9.852217E-4, dataset size: 18427


Epoch 4/10 - 328.57s - loss: 3076.4937 - batches: 2305


Epoch 5/10 started, lr: 9.803922E-4, dataset size: 18427


Epoch 5/10 - 320.93s - loss: 2908.4597 - batches: 2305


Epoch 6/10 started, lr: 9.756098E-4, dataset size: 18427


Epoch 6/10 - 324.77s - loss: 2817.1863 - batches: 2305


Epoch 7/10 started, lr: 9.7087387E-4, dataset size: 18427


Epoch 7/10 - 314.74s - loss: 2731.546 - batches: 2305


Epoch 8/10 started,

In [None]:
# Persist only the trained NER stage (index 1 of the fitted pipeline; index 0
# is the pretrained BERT stage) under a timestamped folder name.
timestamp = datetime.datetime.today().strftime("%y-%m-%d-%H-%M")
save_path = '/content/drive/MyDrive/final-project-BDABI/models/NER_DL_' + timestamp

trained_ner = DL_model.stages[1]
trained_ner.write().save(save_path)

### Evaluation

In [None]:
# Same embeddings stage used for training, rebuilt for inference.
bert = (
    BertEmbeddings.pretrained("bert_base_italian_uncased", lang="it")
    .setInputCols("sentence", "token")
    .setOutputCol("bert")
    .setCaseSensitive(False)
)

# Saved NER models on Drive:
#   .../models/NER_DL_21-07-01-14-14   -> 5 epochs
#   .../models/NER_DL_21-07-02-13-02   -> 10 epochs (the one evaluated here)
model_path = "/content/drive/MyDrive/final-project-BDABI/models/NER_DL_21-07-02-13-02"

# Load the trained tagger and wire it to the same columns used in training.
DL_ner_loaded = (
    NerDLModel.load(model_path)
    .setInputCols(["sentence", "token", "bert"])
    .setOutputCol("ner")
)

In [None]:
# Compute the BERT embeddings on the test split, then tag them with the
# loaded NER model (adds the "ner" column).
test_with_embeddings = bert.transform(df_test)
DL_preds = DL_ner_loaded.transform(test_with_embeddings)

In [None]:
# Flatten the predictions to one row per token, carrying the gold label and
# the predicted tag side by side.
pred_arrays = DL_preds.select(
    col("token.result").alias("token"),
    col("label.result").alias("label"),
    col("ner.result").alias("ner"),
)
pred_rows = pred_arrays.withColumn("tmp", explode(arrays_zip("token", "label", "ner")))
preds = pred_rows.select("tmp.token", "tmp.label", "tmp.ner")

In [None]:
# Preview predictions for tokens whose gold label is an entity (not 'O').
entity_tokens = preds.where(col("label") != "O")
entity_tokens.show(10)

In [None]:
# Collect the per-token predictions to pandas and build the token-level
# classification report, one row per label / aggregate.
preds_df = preds.toPandas()

report = classification_report(
    y_true=preds_df["label"],
    y_pred=preds_df["ner"],
    output_dict=True,
)
df_report = pd.DataFrame(report).transpose()

In [None]:
# Render the report as a LaTeX table and print it.
report_latex = df_report.to_latex()
print(report_latex)

#### Curve di Loss

In [None]:
# Location on Drive of the text file holding the training log to be plotted.
log_path = "/content/drive/MyDrive/final-project-BDABI/models/BERT_NER_DL_10epochs.txt"

In [None]:
# Parse the training log into one row per completed epoch.
# Completed-epoch lines look like
#   "Epoch 1/10 - 315.22s - loss: 6866.2095 - batches: 2305"
# so splitting on '-' yields the four named fields. The first two lines of the
# file are skipped (presumably the graph name and the training summary, as in
# the annotator log printed above — confirm for the file on Drive).
# The file must be read from `log_path`: `log` does not exist yet at this point.
log = pd.read_csv(log_path, sep='-', skiprows=2, header=None, names=['epoch', 'time', 'loss', 'batches'])

# Lines such as "Epoch 2/10 started, lr: ..." have no loss field; drop them so
# that every epoch appears once and the loss curve is not broken by NaNs.
log = log.dropna(subset=['loss']).copy()

# str.strip() takes a SET of characters to remove from both ends, not a
# prefix; it works here because the numeric values contain none of them.
log['loss'] = log.loss.str.strip('loss: ')
log['batches'] = log.batches.str.strip('batches: ')
# "Epoch 3/10 " -> "Epoch 3" -> "3"
log['epoch'] = log.epoch.str.split('/').str[0]
log['epoch'] = log.epoch.str.strip('Epoch ')

log = log.astype({'epoch': 'int32', 'loss':float}, errors='ignore')

In [None]:
# Training loss per epoch, drawn with the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(10,6))
ax.plot(log['epoch'], log['loss'], 'r', linewidth=1.5)
ax.set_title('Curva di Loss', size=24)
ax.set_xlabel('Epoch', size=14)
ax.set_ylabel('Loss', size=14)
ax.set_xticks(log.epoch)  # one tick per logged epoch
ax.grid()
plt.show()