In [1]:
!pip install -q pyspark spark-nlp

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.3/317.3 MB 3.9 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 55.6/55.6 kB 2.4 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 579.5/579.5 kB 24.3 MB/s eta 0:00:00
  Building wheel for pyspark (setup.py) ... done


In [2]:
# Spark NLP entry point plus its annotator classes.
# NOTE(review): the two wildcard imports below supply DocumentAssembler, Tokenizer and
# NGramGenerator used later in this notebook; consider importing those names explicitly.
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

# Create the Spark session through Spark NLP's helper; later cells use it to build DataFrames.
spark = sparknlp.start()
# Bare expression so the notebook displays the session object.
spark

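## setN

`setN(n)` sets how many consecutive tokens are joined into each n-gram: `2` produces bigrams and `3` produces trigrams.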

In [3]:
# Stage 1: read the raw "text" column and expose it as the "document" annotation column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: turn each document into the "token" column consumed by the n-gram stages.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Stage 3a: n-grams of size 2 over the tokens, written to "bigrams".
bigrams = (
    NGramGenerator()
    .setInputCols(["token"])
    .setOutputCol("bigrams")
    .setN(2)
)

# Stage 3b: n-grams of size 3 over the same tokens, written to "trigrams".
trigrams = (
    NGramGenerator()
    .setInputCols(["token"])
    .setOutputCol("trigrams")
    .setN(3)
)

# Both n-gram stages read the tokenizer output, so they sit side by side at the end.
pipeline = Pipeline(stages=[documentAssembler, tokenizer, bigrams, trigrams])

# Two sample sentences, one per row, in a single string column named "text".
data = spark.createDataFrame(
    [
        "Cloud computing is benefiting major manufacturing companies",
        "Big data cloud computing cyber security machine learning",
    ],
    StringType(),
).toDF("text")

# Fit the pipeline and annotate the same sample data.
result = pipeline.fit(data).transform(data)


In [5]:
# Print the bigram strings for the first two rows without truncating long cells.
(
    result
    .select("bigrams.result")
    .show(2, truncate=False)
)

+--------------------------------------------------------------------------------------------------------------+
|result                                                                                                        |
+--------------------------------------------------------------------------------------------------------------+
|[Cloud computing, computing is, is benefiting, benefiting major, major manufacturing, manufacturing companies]|
|[Big data, data cloud, cloud computing, computing cyber, cyber security, security machine, machine learning]  |
+--------------------------------------------------------------------------------------------------------------+



In [11]:
# Print the trigram strings without truncating long cells.
(
    result
    .select("trigrams.result")
    .show(truncate=False)
)

+------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                    |
+------------------------------------------------------------------------------------------------------------------------------------------+
|[Cloud computing is, computing is benefiting, is benefiting major, benefiting major manufacturing, major manufacturing companies]         |
|[Big data cloud, data cloud computing, cloud computing cyber, computing cyber security, cyber security machine, security machine learning]|
+------------------------------------------------------------------------------------------------------------------------------------------+



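## setEnableCumulative

With `setEnableCumulative(True)` the generator emits every n-gram from length 1 up to `n`, instead of only the n-grams of length exactly `n`.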

In [12]:
# Mutates the existing `trigrams` stage in place; the pipeline has to be fit again
# (done in the next cell) before the change shows up in `result`.
# The call is the cell's last expression, so the notebook displays the stage it returns.
trigrams.setEnableCumulative(True)

NGramGenerator_952ce417c8e1

In [13]:
# Rebuild the same two-sentence sample frame (single string column "text").
data = spark.createDataFrame(
    [
        "Cloud computing is benefiting major manufacturing companies",
        "Big data cloud computing cyber security machine learning",
    ],
    StringType(),
).toDF("text")

# Fit again so the updated `trigrams` stage settings take effect, then annotate.
result = pipeline.fit(data).transform(data)

In [14]:
# Print the regenerated "trigrams" column for the first two rows, untruncated.
(
    result
    .select("trigrams.result")
    .show(2, truncate=False)
)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                                                                                                                                                                                                 |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[Cloud, computing, is, benefiting, major, manufacturing, 

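## setDelimiter

`setDelimiter` changes the string placed between the tokens of each n-gram (a single space by default).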

In [15]:
# Mutates the existing `bigrams` stage in place so its tokens are joined with "/";
# the pipeline must be fit again (next cell) for the new delimiter to appear in `result`.
# The call is the cell's last expression, so the notebook displays the stage it returns.
bigrams.setDelimiter("/")

NGramGenerator_b5837da0873e

In [16]:
# A single-sentence sample frame this time (one row, string column "text").
data = spark.createDataFrame(
    ["Cloud computing is benefiting major manufacturing companies"],
    StringType(),
).toDF("text")

# Fit again so the updated `bigrams` stage settings take effect, then annotate.
result = pipeline.fit(data).transform(data)

In [17]:
# Print the regenerated "bigrams" column without truncating long cells.
(
    result
    .select("bigrams.result")
    .show(truncate=False)
)

+--------------------------------------------------------------------------------------------------------------+
|result                                                                                                        |
+--------------------------------------------------------------------------------------------------------------+
|[Cloud/computing, computing/is, is/benefiting, benefiting/major, major/manufacturing, manufacturing/companies]|
+--------------------------------------------------------------------------------------------------------------+

