## Setup & start Spark NLP

In [43]:
import pyspark.sql.functions as F
import sparknlp
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

from IPython.display import display, HTML

# spark = SparkSession.builder.master("spark://localhost:7077").appName("HarryPotterAnalysis").getOrCreate()
# spark.catalog.clearCache()

def display_df(df, rows=1):
    display(HTML(df.toPandas().head(rows).to_html()))

spark = sparknlp.start()



## Load some data

In [37]:
data = spark.read.csv("hp_script.csv", header=True, inferSchema=True)
print("data rows:", data.count())
DIALOGUE_COLUMN = 'dialogue'
data.describe()

data rows: 793


DataFrame[summary: string, ID_number: string, scene: string, character_name: string, sentence: string]

In [3]:
# from sparknlp.pretrained import PretrainedPipeline
# # Define a pipeline
# pipeline = PretrainedPipeline('recognize_entities_dl', lang='en')
# ppres = pipeline.fullAnnotate(data)

## Create pipeline
* DocumentAssembler
* Tokenizer
* BERT classifier
* NER converter

In [44]:
document_assembler = DocumentAssembler().setInputCol(DIALOGUE_COLUMN).setOutputCol("document")


tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
# bert_base_token_classifier_conll03
ner_model = BertForTokenClassification \
    .pretrained("bert_token_classifier_base_token_classification_for_atc_english_uwb_atcc", "en") \
    .setInputCols(["document", "token"]) \
    .setOutputCol("ner")


ner_converter = NerConverter() \
    .setInputCols(["document", "token", "ner"]) \
    .setOutputCol("ner_span")

pipeline = Pipeline(stages=[ document_assembler, 
                                 tokenizer,
                                 ner_model,
                                 ner_converter
                                 ])

bert_token_classifier_base_token_classification_for_atc_english_uwb_atcc download started this may take some time.
Approximate size to download 388.3 MB
[OK!]


In [50]:
import pyspark.sql.functions as F


# def dialogue_generator():
#     for row in data.toLocalIterator():
#       yield(row["sentence"])

# dialogue = " ".join(list(dialogue_generator()))

# Create the dataframe
df_empty = spark.createDataFrame([['']]).toDF("text")#spark.createDataFrame([['']], ['Sentence']) #.toDF("text")
df_all = data.agg(F.array_join(F.collect_list("sentence"), ' ', ' ').alias(DIALOGUE_COLUMN))
dialogue = df_all.first()[DIALOGUE_COLUMN]

# Fit the dataframe to the pipeline to get the model
model = pipeline.fit(df_all)

# Convert to LightPipeline Model
light_model = LightPipeline(model)

# Full annotate the light model to get predictions
cpres = light_model.fullAnnotate(dialogue)

49752


In [51]:
cpres[0:10]
# from sparknlp_display import NerVisualizer
# visualiser = NerVisualizer()

# visualiser.display(cpres, label_col='entities', document_col='document', save_path=f"display_result_ner.html")

  'token': [Annotation(token, 0, 0, I, {'sentence': '0'}, []),
   Annotation(token, 2, 7, should, {'sentence': '0'}, []),
   Annotation(token, 9, 12, have, {'sentence': '0'}, []),
   Annotation(token, 14, 18, known, {'sentence': '0'}, []),
   Annotation(token, 20, 23, that, {'sentence': '0'}, []),
   Annotation(token, 25, 27, you, {'sentence': '0'}, []),
   Annotation(token, 29, 33, would, {'sentence': '0'}, []),
   Annotation(token, 35, 36, be, {'sentence': '0'}, []),
   Annotation(token, 38, 41, here, {'sentence': '0'}, []),
   Annotation(token, 42, 42, ,, {'sentence': '0'}, []),
   Annotation(token, 44, 52, Professor, {'sentence': '0'}, []),
   Annotation(token, 54, 63, McGonagall, {'sentence': '0'}, []),
   Annotation(token, 64, 64, ., {'sentence': '0'}, []),
   Annotation(token, 66, 69, Good, {'sentence': '0'}, []),
   Annotation(token, 71, 77, evening, {'sentence': '0'}, []),
   Annotation(token, 78, 78, ,, {'sentence': '0'}, []),
   Annotation(token, 80, 88, Professor, {'sentenc

In [19]:
model = pipeline.fit(data)
result = model.transform(data)

result.select(
    F.explode(F.arrays_zip(result.ner_span.result, result.ner_span.metadata)).alias(
        "cols"
    )
).select(F.expr("cols['0']"), F.expr("cols['1'].entity")).show()

# Convert to LightPipeline Model
# lmodel = LightPipeline(model)
# lmodel.fullAnnotate(dialogue)



+--------------------+--------------+
|              cols.0|cols.1[entity]|
+--------------------+--------------+
|I should have kno...|          atco|
|                   ,|         pilot|
|Professor McGonag...|         pilot|
|Good evening, Pro...|         pilot|
|Are the rumours t...|         pilot|
|I'm afraid so, Pr...|         pilot|
|The good, and the...|         pilot|
|        And the boy?|         pilot|
|Hagrid is bringin...|         pilot|
|Do you think it w...|          atco|
|Hagrid with somet...|         pilot|
|Ah, Professor, I ...|         pilot|
|Professor Dumbled...|         pilot|
|No problems, I tr...|         pilot|
|No, Sir. Little t...|         pilot|
|    not to wake him.|          atco|
|       There you go.|         pilot|
|Albus, do you rea...|         pilot|
|I've watched them...|         pilot|
|The only family h...|          atco|
+--------------------+--------------+
only showing top 20 rows

