In [1]:
!pip install -q pyspark spark-nlp

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.3/317.3 MB 4.7 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 55.6/55.6 kB 1.8 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 579.5/579.5 kB 22.1 MB/s eta 0:00:00
  Building wheel for pyspark (setup.py) ... done


In [2]:

# Spark NLP entry point plus the pipeline building blocks used in this notebook.
import sparknlp
from sparknlp.base import DocumentAssembler, LightPipeline, Pipeline
from sparknlp.annotator import Tokenizer, Stemmer

# Obtain the session object via Spark NLP's start() helper; `spark` is used
# further down to build the input DataFrame.
spark = sparknlp.start()

In [3]:
# --- Pipeline stages ---------------------------------------------------------
# Each stage reads the output column of the previous one:
#   "text" -> "document" -> "token" -> "stem"
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

stemmer = (
    Stemmer()
    .setInputCols(["token"])
    .setOutputCol("stem")
)

nlpPipeline = Pipeline(stages=[documentAssembler, tokenizer, stemmer])

# --- Sample data: one sentence per row, in a single "text" column ------------
sample_texts = [
    ["I love working with SparkNLP."],
    ["I am living in Canada."],
]
data = spark.createDataFrame(sample_texts).toDF("text")

# --- Fit the pipeline, then annotate the same DataFrame ----------------------
model = nlpPipeline.fit(data)
result = model.transform(data)

# Show every column, capping each displayed cell at 40 characters.
result.show(truncate=40)

+-----------------------------+----------------------------------------+----------------------------------------+----------------------------------------+
|                         text|                                document|                                   token|                                    stem|
+-----------------------------+----------------------------------------+----------------------------------------+----------------------------------------+
|I love working with SparkNLP.|[{document, 0, 28, I love working wit...|[{token, 0, 0, I, {sentence -> 0}, []...|[{token, 0, 0, i, {sentence -> 0}, []...|
|       I am living in Canada.|[{document, 0, 21, I am living in Can...|[{token, 0, 0, I, {sentence -> 0}, []...|[{token, 0, 0, i, {sentence -> 0}, []...|
+-----------------------------+----------------------------------------+----------------------------------------+----------------------------------------+



In [6]:
result.select("text","token.result","stem.result").show(truncate=40)

+-----------------------------+-------------------------------------+----------------------------------+
|                         text|                               result|                            result|
+-----------------------------+-------------------------------------+----------------------------------+
|I love working with SparkNLP.|[I, love, working, with, SparkNLP, .]|[i, love, work, with, sparknlp, .]|
|       I am living in Canada.|       [I, am, living, in, Canada, .]|      [i, am, live, in, canada, .]|
+-----------------------------+-------------------------------------+----------------------------------+



## Usage with LightPipeline

In [7]:
# Wrap the fitted pipeline so it can be applied directly to plain Python
# strings (see the annotate / fullAnnotate calls below) instead of a DataFrame.
light_pipeline = LightPipeline(model)

In [8]:
# annotate() returns a mapping keyed by output column name; the "stem" entry
# holds the stemmed tokens as plain strings.
light_pipeline.annotate("I love working with SparkNLP.")["stem"]

['i', 'love', 'work', 'with', 'sparknlp', '.']

In [10]:
# fullAnnotate() returns a list (indexed with [0] here for the single input
# string). Its "stem" entry keeps the full Annotation objects, with begin/end
# character offsets and metadata, rather than bare strings.
light_pipeline.fullAnnotate("I love working with SparkNLP.")[0]["stem"]

[Annotation(token, 0, 0, i, {'sentence': '0'}, []),
 Annotation(token, 2, 5, love, {'sentence': '0'}, []),
 Annotation(token, 7, 13, work, {'sentence': '0'}, []),
 Annotation(token, 15, 18, with, {'sentence': '0'}, []),
 Annotation(token, 20, 27, sparknlp, {'sentence': '0'}, []),
 Annotation(token, 28, 28, ., {'sentence': '0'}, [])]