# NER System Inference

## Setup

In [None]:
# Mount Google Drive at /content/drive; the dataset, the saved NER model and the
# inference outputs used later in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2021-07-06 07:17:26--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2021-07-06 07:17:26--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1608 (1.6K) [text/plain]
Saving to: ‘STDOUT’


2021-07-06 07:17:26 (28.8 MB/s) - written to stdout [1608/1608]

setup Colab for PySpark 3.0.3 and Spark NLP 3.1.1
Get:1 https://cloud.r-project.org/bin/li

## Import moduli

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col, arrays_zip, monotonically_increasing_id
from pyspark.ml import Pipeline

import sparknlp
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from sparknlp.training import CoNLL

# Start the Spark session through Spark NLP with the GPU option enabled.
spark = sparknlp.start(gpu=True)
# CPU-only alternative: spark = sparknlp.start()

# Print the library versions in use so the run can be traced back to them.
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

import datetime
import pandas as pd
from sklearn.metrics import classification_report

Spark NLP version:  3.1.1
Apache Spark version:  3.0.3


In [None]:
# Request a session named "App" with explicit memory and timeout settings.
# NOTE(review): a session was already started by sparknlp.start(...) earlier in
# this notebook, so getOrCreate() presumably hands back that same session and the
# memory settings below may not take effect -- confirm on a fresh runtime.
spark = (
    SparkSession.builder
    .appName("App")
    .config("spark.executor.memory", "12g")
    .config("spark.driver.memory", "5g")
    .config("spark.memory.offHeap.enabled", True)
    .config("spark.memory.offHeap.size", "12g")
    .config("spark.sql.broadcastTimeout", "360000")
    .getOrCreate()
)

In [None]:
# Show the active session object as the cell output.
spark

## Read Dataset

In [None]:
# Source file and reader settings for the visits dataset.
file_type = "csv"
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","
path = "/content/drive/MyDrive/final-project-BDABI/dataset/DF_ANAM_DIA_COMPLETO.csv"

# Configure the reader in one call: inferred column types, header row,
# multi-line records enabled, comma as separator.
csv_reader = spark.read.format(file_type).options(
    inferSchema=infer_schema,
    header=first_row_is_header,
    multiLine="true",
    sep=delimiter,
)
df = csv_reader.load(path)

In [None]:
# Report how many rows were loaded from the CSV (labelled as number of visits).
print("Numero di visite:" , df.count())

Numero di visite: 147599


In [None]:
# Preview five rows of the loaded frame as a plain-text table.
# NOTE(review): the rendered output contains free-text diagnosis fields --
# consider clearing it before sharing the notebook.
df.limit(5).show()

+---------+--------+--------------------+
|ID_VISITA|ANAMNESI|            DIAGNOSI|
+---------+--------+--------------------+
|        0|    null|Ipertensione Arte...|
|        1|    null|Ipertensione Arte...|
|        2|    null|Prossimo controll...|
|        3|    null|CODICI IMPEGNATIV...|
|        4|    null|VISITA DI CONTROL...|
+---------+--------+--------------------+



## NLP Pipeline

In [None]:
# --- Annotation stages -------------------------------------------------------
# Each stage reads the column(s) produced by the previous ones.

# "text" -> "document"
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# "document" -> "sentence"
sentence = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# "sentence" -> "token"; "." is registered as an additional split character.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
    .setSplitChars(["."])
)

# Pretrained Italian uncased BERT embeddings: ("sentence", "token") -> "bert".
bert = (
    BertEmbeddings.pretrained("bert_base_italian_uncased", lang="it")
    .setInputCols("sentence", "token")
    .setOutputCol("bert")
    .setCaseSensitive(False)
)

# Saved NER model to load from Drive.
# Alternative checkpoint: "NER_DL_21-07-01-14-14" (4 epochs).
model = "NER_DL_21-07-02-13-02"   # 10 epochs

# The same `model` name is reused later to build the inference output paths.
models_dir = "/content/drive/MyDrive/final-project-BDABI/models/"
ner = (
    NerDLModel.load(models_dir + model)
    .setInputCols(["sentence", "token", "bert"])
    .setOutputCol("ner")
)

# ("document", "token", "ner") -> "chunk"
converter = (
    NerConverter()
    .setInputCols(["document", "token", "ner"])
    .setOutputCol("chunk")
)

# --- Assemble the stages in execution order ----------------------------------
pipeline = Pipeline(stages=[document, sentence, tokenizer, bert, ner, converter])

bert_base_italian_uncased download started this may take some time.
Approximate size to download 393.2 MB
[OK!]


## Anamnesi

In [None]:
# Keep only visits that have an anamnesis and expose it as "text",
# the input column name expected by the pipeline's DocumentAssembler.
anamnesi = (
    df.na.drop(subset=["ANAMNESI"])
      .select("ID_VISITA", col("ANAMNESI").alias("text"))
)

# Row count before and after removing exact duplicates.
anamnesi_total = anamnesi.count()
print("Anamnesi totali:", anamnesi_total)
anamnesi_distinct = anamnesi.distinct().count()
print("Anamnesi distinte:", anamnesi_distinct)

Anamnesi totali: 2012
Anamnesi distinte: 2012


In [None]:
# Remove rows that are identical on every column, then report what is left.
anamnesi = anamnesi.dropDuplicates()

anamnesi_total = anamnesi.count()
print("Anamnesi totali:", anamnesi_total)

In [None]:
# Preview five anamnesis rows. `.show()` prints the actual rows (as done for `df`
# above); in Colab, `display()` on a Spark DataFrame only renders its schema repr.
anamnesi.limit(5).show()

In [None]:
# Fit the pipeline on the anamnesis frame, then annotate that same frame.
anamnesi_pipeline_model = pipeline.fit(anamnesi)
res_anamnesi = anamnesi_pipeline_model.transform(anamnesi)

In [None]:
# Output directory is keyed by the model name so runs of different models
# do not share a folder.
path = "/content/drive/MyDrive/final-project-BDABI/inference/" + model + "/anamnesi"

# Keep the visit id, the source text and the extracted chunks; coalesce to a
# single partition before writing JSON.
# NOTE(review): no save mode is set, so Spark's default applies -- presumably the
# write fails if the directory already exists; confirm before re-running.
anamnesi_output = res_anamnesi.select("ID_VISITA", "text", "chunk").coalesce(1)
anamnesi_output.write.json(path)

In [None]:
# Preview five annotated anamnesis rows. `.show()` prints the rows themselves;
# in Colab, `display()` on a Spark DataFrame only renders its schema repr.
res_anamnesi.limit(5).show()

## Diagnosi

In [None]:
# Keep only visits that have a diagnosis and expose it as "text",
# the input column name expected by the pipeline's DocumentAssembler.
diagnosi = (
    df.na.drop(subset=["DIAGNOSI"])
      .select("ID_VISITA", col("DIAGNOSI").alias("text"))
)

# Row count before and after removing exact duplicates.
diagnosi_total = diagnosi.count()
print("Diagnosi totali:", diagnosi_total)
diagnosi_distinct = diagnosi.distinct().count()
print("Diagnosi distinte:", diagnosi_distinct)

Diagnosi totali: 134094
Diagnosi distinte: 134094


In [None]:
# Remove rows that are identical on every column before running inference.
diagnosi = diagnosi.drop_duplicates()

# Label corrected: this cell counts diagnoses, not anamneses.
print("Diagnosi totali:", diagnosi.count())

In [None]:
# Preview five diagnosis rows. `.show()` prints the actual rows (as done for `df`
# above); in Colab, `display()` on a Spark DataFrame only renders its schema repr.
diagnosi.limit(5).show()

In [None]:
# Fit the pipeline on the diagnosis frame, then annotate that same frame.
diagnosi_pipeline_model = pipeline.fit(diagnosi)
res_diagnosi = diagnosi_pipeline_model.transform(diagnosi)

In [None]:
# Output directory is keyed by the model name so runs of different models
# do not share a folder.
path = "/content/drive/MyDrive/final-project-BDABI/inference/" + model + "/diagnosi"

# Keep the visit id, the source text and the extracted chunks; coalesce to a
# single partition before writing JSON.
# NOTE(review): no save mode is set, so Spark's default applies -- presumably the
# write fails if the directory already exists; confirm before re-running.
diagnosi_output = res_diagnosi.select("ID_VISITA", "text", "chunk").coalesce(1)
diagnosi_output.write.json(path)

In [None]:
# Preview five annotated diagnosis rows. `.show()` prints the rows themselves;
# in Colab, `display()` on a Spark DataFrame only renders its schema repr.
res_diagnosi.limit(5).show()