In [68]:
import findspark 

In [69]:
findspark.init()

In [70]:
from pyspark.sql import SparkSession

In [71]:
spark = SparkSession.builder.appName("NLP").getOrCreate()

In [72]:
from pyspark.ml.feature import Tokenizer,RegexTokenizer

In [73]:
from pyspark.sql.functions import col,udf##col for calling a column and udf is user defined function
from pyspark.sql.types import IntegerType

In [74]:
sen_df = spark.createDataFrame([
    (0,"Hi I heard about Spark"),
    (1,"I wish java could use case classes"),
    (2,"Logistic,regression,models,are,neat")
],["id","sentence"])

In [75]:
sen_df.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I heard about ...|
|  1|I wish java could...|
|  2|Logistic,regressi...|
+---+--------------------+



In [76]:
tokenizer = Tokenizer(inputCol="sentence",outputCol= "words")

In [77]:
regexTokenizer = RegexTokenizer(inputCol = "sentence", outputCol = "words",pattern = "\\W")

In [78]:
countTokens = udf(lambda words: len(words),IntegerType())

In [79]:
tokenized = tokenizer.transform(sen_df)

In [80]:
tokenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, ı, heard, ab...|
|  1|I wish java could...|[ı, wish, java, c...|
|  2|Logistic,regressi...|[logistic,regress...|
+---+--------------------+--------------------+



In [81]:
tokenized.withColumn("tokens",countTokens(col("words"))).show()## since split was done by white space last sentence counted as 1 string

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, ı, heard, ab...|     5|
|  1|I wish java could...|[ı, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic,regress...|     1|
+---+--------------------+--------------------+------+



In [82]:
rg_tokenized = regexTokenizer.transform(sen_df)

In [83]:
rg_tokenized.withColumn("tokens",countTokens(col("words"))).show()

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, heard, about...|     4|
|  1|I wish java could...|[wish, java, coul...|     6|
|  2|Logistic,regressi...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



In [84]:
from pyspark.ml.feature import StopWordsRemover

In [85]:
sentenceDataFrame = spark.createDataFrame([
    (0,["I","saw","the","gree","horse"]),
    (1,["Mary","had","a","little","lamb"])
],["id","tokens"])

In [86]:
sentenceDataFrame.show()

+---+--------------------+
| id|              tokens|
+---+--------------------+
|  0|[I, saw, the, gre...|
|  1|[Mary, had, a, li...|
+---+--------------------+



In [87]:
remover = StopWordsRemover(inputCol = "tokens", outputCol= "filtered")

In [88]:
remover.transform(sentenceDataFrame).show()

+---+--------------------+--------------------+
| id|              tokens|            filtered|
+---+--------------------+--------------------+
|  0|[I, saw, the, gre...|  [saw, gree, horse]|
|  1|[Mary, had, a, li...|[Mary, little, lamb]|
+---+--------------------+--------------------+



In [89]:
#n-gram

In [90]:
from pyspark.ml.feature import NGram

In [96]:
word_df = spark.createDataFrame([
    (0,["Hi", "I", "heard", "about", "Spark"] ),
    (1,["I", "wish", "java", "could", "use", "case", "classes"]),
    (2,["Logistic", "regression", "models", "are", "neat"])
],["id","words"])

In [98]:
ngram = NGram(n= 2, inputCol = "words", outputCol = "grams")

In [100]:
ngram.transform(word_df).show(truncate = False)

+---+------------------------------------------+------------------------------------------------------------------+
|id |words                                     |grams                                                             |
+---+------------------------------------------+------------------------------------------------------------------+
|0  |[Hi, I, heard, about, Spark]              |[Hi I, I heard, heard about, about Spark]                         |
|1  |[I, wish, java, could, use, case, classes]|[I wish, wish java, java could, could use, use case, case classes]|
|2  |[Logistic, regression, models, are, neat] |[Logistic regression, regression models, models are, are neat]    |
+---+------------------------------------------+------------------------------------------------------------------+



In [None]:
# n grams are representing strings of consecutive words, ngrams are showing pairs of consecutive words
# to show the relationship between words