In [1]:
# import session
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, NGram

In [2]:
# Obtain the Spark session for this notebook, registered under the app name 'ngrams'
spark = (
    SparkSession.builder
    .appName('ngrams')
    .getOrCreate()
)

In [3]:
# Build a small in-memory example frame: one row per sentence, holding an
# integer id and the sentence already split into a list of words.
# NOTE(review): the later cells visible here work on the CSV-backed frame and
# do not reference wordDataFrame -- confirm whether it is still needed.
wordDataFrame = spark.createDataFrame(
    data=[
        (0, ["Hi", "I", "heard", "about", "Spark"]),
        (1, ["What", "exactly", "are", "ngrams", "used", "for", "hmmm"]),
        (2, ["Not", "sure", "yea", "but", "learning"]),
    ],
    schema=["id", "words"],
)

In [4]:
# Load the sample text from CSV, treating the first line as the column header,
# then display the resulting frame.
dataframe = spark.read.load(
    "sampleText.csv",
    format="csv",
    header="true",
)
dataframe.show()

+--------------------+
|         Sample Text|
+--------------------+
|The dog barks at ...|
+--------------------+



In [5]:
# Configure a tokenizer that reads the "Sample Text" column and writes the
# resulting list of tokens to a new "tokenized" column.
token_data = Tokenizer(
    inputCol="Sample Text",
    outputCol="tokenized",
)

# Apply the tokenizer to the loaded frame and display the added column.
reviewed = token_data.transform(dataset=dataframe)
reviewed.show()

+--------------------+--------------------+
|         Sample Text|           tokenized|
+--------------------+--------------------+
|The dog barks at ...|[the, dog, barks,...|
+--------------------+--------------------+



In [6]:
# Create a bigram transformer (n=2) that reads "tokenized" and writes "ngrams"
ngram = NGram(
    n=2,
    inputCol="tokenized",
    outputCol="ngrams",
)

In [7]:
# Run the bigram transformer over the tokenized frame, adding the "ngrams" column
ngramDataFrame = ngram.transform(dataset=reviewed)

In [8]:
# Display only the "ngrams" column, with truncation disabled so the full
# list of bigrams is printed.
(
    ngramDataFrame
    .select("ngrams")
    .show(truncate=False)
)

+----------------------------------------------------------------------------------------------------------------------------------------------+
|ngrams                                                                                                                                        |
+----------------------------------------------------------------------------------------------------------------------------------------------+
|[the dog, dog barks, barks at, at the, the nearby, nearby neighbor, neighbor and, and the, the dog, dog barks, barks at, at the, the mailman.]|
+----------------------------------------------------------------------------------------------------------------------------------------------+



In [10]:
# Stop the Spark session now that the notebook's work is finished
spark.stop()