## NLP - NGram

---

In [1]:
#Import Session
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, NGram

In [2]:
#Start a SparkSession named "NGram", reusing an existing session if one is already running
spark = (
    SparkSession.builder
    .appName("NGram")
    .getOrCreate()
)

In [3]:
# Build a small in-memory dataframe: one row per sentence, with an integer id
# and the sentence already split into a list of words
wordDataFrame = spark.createDataFrame(
    [
        (0, ["Hi", "I", "heard", "about", "Spark"]),
        (1, ["What", "exactly", "are", "ngrams", "used", "for", "hmmm"]),
        (2, ["Not", "sure", "yea", "but", "learning"]),
    ],
    schema=["id", "words"],
)

wordDataFrame.show()

+---+--------------------+
| id|               words|
+---+--------------------+
|  0|[Hi, I, heard, ab...|
|  1|[What, exactly, a...|
|  2|[Not, sure, yea, ...|
+---+--------------------+



In [4]:
#Load the sample text from a CSV file, treating its first row as the header
dataframe = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .load('data/sampleText.csv')
)
dataframe.show()

+--------------------+
|         Sample Text|
+--------------------+
|The dog barks at ...|
+--------------------+



In [5]:
#Split the 'Sample Text' column into a list of word tokens.
#As the outputs show, tokens come back lower-cased and punctuation stays
#attached to the neighbouring word (e.g. "mailman.").
token_data = Tokenizer(
    inputCol='Sample Text',
    outputCol='tokenized',
)
reviewed = token_data.transform(dataframe)
reviewed.show()

+--------------------+--------------------+
|         Sample Text|           tokenized|
+--------------------+--------------------+
|The dog barks at ...|[the, dog, barks,...|
+--------------------+--------------------+



In [6]:
#Configure a bigram transformer (n=2) that reads the 'tokenized' column
#and writes its results to a new 'ngrams' column
ngram = NGram(
    n=2,
    inputCol='tokenized',
    outputCol='ngrams',
)

In [7]:
#Apply the bigram transformer to the tokenized dataframe; the result carries the 'ngrams' column
ngram_df = ngram.transform(reviewed)

In [9]:
#Display only the generated n-grams, at full width rather than truncated
(
    ngram_df
    .select('ngrams')
    .show(truncate=False)
)

+----------------------------------------------------------------------------------------------------------------------------------------------+
|ngrams                                                                                                                                        |
+----------------------------------------------------------------------------------------------------------------------------------------------+
|[the dog, dog barks, barks at, at the, the nearby, nearby neighbor, neighbor and, and the, the dog, dog barks, barks at, at the, the mailman.]|
+----------------------------------------------------------------------------------------------------------------------------------------------+



In [10]:
#Stop the Spark session created at the top of the notebook
spark.stop()