In [1]:
import os
import sys
import findspark
os.environ["JAVA_HOME"] = "/usr/lib64/jvm/java-1.8.0-openjdk-1.8.0"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
os.environ["SPARK_HOME"] = "/dfs/is/home/m292915/spark-2.3.0-bin-hadoop2.7"
findspark.init()
jar_path='/dfs/is/home/m292915/spark-2.3.0-bin-hadoop2.7/jars/'

from pyspark.sql import SparkSession
spark = SparkSession.builder \
        .appName("Spark NLP") \
        .master("local[*]") \
        .config("spark.driver.memory", "16G") \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
        .config("spark.kryoserializer.buffer.max", "1000M") \
        .config("spark.driver.maxResultSize", "0")\
        .config("spark.jars", "{}spark-nlp-spark23-assembly-2.5.5.jar,{}spark-nlp-spark23_2.11-2.6.0.jar".format(jar_path,jar_path)) \
        .getOrCreate()

In [2]:
import sparknlp
print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

Spark NLP version 2.6.0
Apache Spark version: 2.3.0


In [3]:
# creating text from a document
spark_df = spark.read.text('Data/sample-sentences-en.txt').toDF('text')
spark_df.show(truncate=False)

+-----------------------------------------------------------------------------+
|text                                                                         |
+-----------------------------------------------------------------------------+
|Peter is a very good person.                                                 |
|My life in Russia is very interesting.                                       |
|John and Peter are brothers. However they don't support each other that much.|
|Lucas Nogal Dunbercker is no longer happy. He has a good car though.         |
|Europe is very culture rich. There are huge churches! and big houses!        |
+-----------------------------------------------------------------------------+



### Transformers

## 2. Document Assembler

In [4]:
from sparknlp.base import *

documentAssembler = DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")\
.setCleanupMode("shrink")

doc_df = documentAssembler.transform(spark_df)

doc_df.show(truncate=30)

+------------------------------+------------------------------+
|                          text|                      document|
+------------------------------+------------------------------+
|  Peter is a very good person.|[[document, 0, 27, Peter is...|
|My life in Russia is very i...|[[document, 0, 37, My life ...|
|John and Peter are brothers...|[[document, 0, 76, John and...|
|Lucas Nogal Dunbercker is n...|[[document, 0, 67, Lucas No...|
|Europe is very culture rich...|[[document, 0, 68, Europe i...|
+------------------------------+------------------------------+



In [5]:
doc_df.printSchema()

root
 |-- text: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)



## 3. Sentence Detector

In [6]:
from sparknlp.annotator import *

# we feed the document column coming from Document Assembler

sentenceDetector = SentenceDetector().\
setInputCols(['document']).\
setOutputCol('sentences')


In [7]:
sent_df = sentenceDetector.transform(doc_df)
sent_df.show(truncate=False)

Py4JJavaError: An error occurred while calling o82.getParam.
: java.util.NoSuchElementException: Param detectLists does not exist.
	at org.apache.spark.ml.param.Params$$anonfun$getParam$2.apply(params.scala:729)
	at org.apache.spark.ml.param.Params$$anonfun$getParam$2.apply(params.scala:729)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.ml.param.Params$class.getParam(params.scala:728)
	at org.apache.spark.ml.PipelineStage.getParam(Pipeline.scala:42)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
