In this notebook we use the spark-nlp library to run named-entity recognition (NER) on the title/body columns of the questions dataset.

* spark-nlp [docs](https://nlp.johnsnowlabs.com/docs/en/quickstart)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import size, col, sum, expr, explode, desc, length
from pyspark.ml import Pipeline

import os
import re


from sparknlp.base import DocumentAssembler
from sparknlp.annotator import SentenceDetector, Tokenizer, BertEmbeddings, NerDLModel, NerConverter

In [None]:
# Build the Spark session for this notebook (getOrCreate reuses an active
# session when there is one). The spark-nlp artifact is requested through
# the spark.jars.packages setting.
session_builder = SparkSession.builder.appName('NLP I')
session_builder = session_builder.config(
    "spark.jars.packages",
    "com.johnsnowlabs.nlp:spark-nlp_2.12:4.3.2",
)
spark = session_builder.getOrCreate()

In [None]:
# Evaluate the Spark version of the session created above; as the last
# expression of the cell, its value is what the notebook shows as output.
spark.version

In [None]:
# Locate the questions dataset relative to where the notebook is running.
# The working directory is expected to sit two levels below the project
# root (the same assumption the original two-component slice encoded).
base_path = os.getcwd()

# Walk up two directory levels with os.path.dirname instead of splitting on
# a literal '/': this honours the platform's path separator and never
# collapses to an empty string (which would silently turn the data path
# into a relative one) when the working directory is fewer than two levels
# deep.
project_path = os.path.dirname(os.path.dirname(base_path))

# Directory holding the questions dataset.
data_input_path = os.path.join(project_path, 'data/questions')

In [None]:
# Load the questions dataset from data_input_path (no explicit format is
# set on the reader) and expose its 'title' column under the name 'Text'.
questions_reader = spark.read.option('path', data_input_path)
dataDF = questions_reader.load().withColumnRenamed('title', 'Text')

In [None]:
# First pipeline stage: reads the raw 'Text' column and writes its output
# to the 'document' column that the later annotators consume.
documentAssembler = DocumentAssembler()
documentAssembler.setInputCol('Text')
documentAssembler.setOutputCol('document')

In [None]:
# Sentence detection stage: consumes 'document' and writes 'sentence'.
sentenceDetector = SentenceDetector()
sentenceDetector.setInputCols('document')
sentenceDetector.setOutputCol('sentence')

In [None]:
# Fit a two-stage pipeline: document assembly followed by sentence detection.
sentence_stages = [documentAssembler, sentenceDetector]
model = Pipeline(stages=sentence_stages).fit(dataDF)

In [None]:
# Total number of detected sentences: take the size of each row's
# 'sentence' column, then add the sizes up over the whole dataset.
# NOTE: `sum` is pyspark.sql.functions.sum (see the imports), not the builtin.
per_row_counts = model.transform(dataDF).withColumn('sentences', size('sentence'))
per_row_counts.agg(sum('sentences')).show()

In [None]:
# Tokenization stage: consumes 'document' directly and writes 'token'.
tokenizer = Tokenizer()
tokenizer.setInputCols(['document'])
tokenizer.setOutputCol('token')

# Refit the pipeline with document assembly + tokenization only.
token_stages = [documentAssembler, tokenizer]
model = Pipeline(stages=token_stages).fit(dataDF)

# Preview the produced tokens, truncating each cell to 100 characters.
tokenized = model.transform(dataDF)
tokenized.select('token').show(truncate=100)

In [None]:
# Embeddings stage: load the pretrained English 'bert_base_cased' model and
# wire it to the 'token' and 'document' columns; output goes to 'embeddings'.
embeddings = BertEmbeddings.pretrained('bert_base_cased', 'en')
embeddings.setInputCols(['token', 'document'])
embeddings.setOutputCol('embeddings')

In [None]:
# Refit the pipeline with the embeddings stage appended after tokenization.
embedding_stages = [documentAssembler, tokenizer, embeddings]
model = Pipeline(stages=embedding_stages).fit(dataDF)

# Transforming and showing the result is left disabled here.
# model.transform(dataDF).show()

In [None]:
# NER stage, currently disabled by wrapping the code in a string literal.
# NOTE(review): the disabled code references `ner` and `df`, neither of which
# is defined in the visible part of this notebook (the DataFrame loaded above
# is named `dataDF`, and NerDLModel is imported but never instantiated).
# Both would need to be fixed before this cell is re-enabled.
"""
ner_converter = (
    NerConverter()
    .setInputCols(['document', 'token', 'ner'])
    .setOutputCol('entities')
)

model = Pipeline().setStages([documentAssembler, tokenizer, embeddings, ner, ner_converter]).fit(dataDF)

result = model.transform(df.filter(length(col('Text')) < 100))


result.select('Text', 'entities').show()
"""