In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from modules.my_pyspark import *

In [4]:
# Build the project-local Spark wrapper. Later cells reach the underlying
# session through `spark.session` (e.g. `spark.session.createDataFrame`).
# NOTE(review): `MyPySpark` arrives via the star import of `modules.my_pyspark`;
# presumably `session=True` asks it to create/attach a SparkSession — confirm.
spark = MyPySpark(session=True)

In [6]:
# Toy corpus: two space-separated sentences and one comma-separated sentence,
# so whitespace-based and regex-based tokenization give different results.
sentence_rows = [
    (0, "Hi I heard about Spark"),
    (1, "I know Spark can work well with NLP"),
    (2, "Logistic,regression,models,are,supervised"),
]
sentenceDataFrame = spark.session.createDataFrame(sentence_rows, ['id', 'sentence'])

In [11]:
# Print the raw input rows; truncate=False keeps the full sentence text visible.
sentenceDataFrame.show(truncate=False)

+---+-----------------------------------------+
|id |sentence                                 |
+---+-----------------------------------------+
|0  |Hi I heard about Spark                   |
|1  |I know Spark can work well with NLP      |
|2  |Logistic,regression,models,are,supervised|
+---+-----------------------------------------+



In [9]:
# Whitespace tokenizer: writes the tokens of `sentence` into `words`.
tokenizer = Tokenizer(inputCol='sentence', outputCol='words')

# Regex tokenizer: splits on every non-word character (pattern "\\W").
# Alternative: pattern="\\w+" with gaps=False, so the pattern matches the
# tokens themselves instead of the gaps between them.
regexTokenizer = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern="\\W",
)


def _count_tokens(words):
    """Return the number of tokens in one row's `words` array."""
    return len(words)


# Column-level UDF reporting how many tokens each row holds.
countTokens = udf(_count_tokens, IntegerType())

tokenized = tokenizer.transform(sentenceDataFrame)

In [10]:
# Show each sentence next to its whitespace tokens and the token count.
(
    tokenized
    .select('sentence', 'words')
    .withColumn('tokens', countTokens(col('words')))
    .show(truncate=False)
)

+-----------------------------------------+--------------------------------------------+------+
|sentence                                 |words                                       |tokens|
+-----------------------------------------+--------------------------------------------+------+
|Hi I heard about Spark                   |[hi, i, heard, about, spark]                |5     |
|I know Spark can work well with NLP      |[i, know, spark, can, work, well, with, nlp]|8     |
|Logistic,regression,models,are,supervised|[logistic,regression,models,are,supervised] |1     |
+-----------------------------------------+--------------------------------------------+------+



In [12]:
# Tokenize the same sentences with the regex tokenizer, which splits on
# non-word characters (pattern "\\W") rather than on whitespace only.
regexTokenized = regexTokenizer.transform(sentenceDataFrame)

In [13]:
# Show each sentence next to its regex tokens and the token count.
(
    regexTokenized
    .select('sentence', 'words')
    .withColumn('tokens', countTokens(col('words')))
    .show(truncate=False)
)

+-----------------------------------------+-----------------------------------------------+------+
|sentence                                 |words                                          |tokens|
+-----------------------------------------+-----------------------------------------------+------+
|Hi I heard about Spark                   |[hi, i, heard, about, spark]                   |5     |
|I know Spark can work well with NLP      |[i, know, spark, can, work, well, with, nlp]   |8     |
|Logistic,regression,models,are,supervised|[logistic, regression, models, are, supervised]|5     |
+-----------------------------------------+-----------------------------------------------+------+



# Stopword remover

In [14]:
# Attach the per-row token count to the regex-tokenized frame; the existing
# columns are kept and a `tokens` column is added.
x = regexTokenized.withColumn(
    'tokens',
    countTokens(col('words')),
)

In [15]:
# Preview the frame with the default display settings (long cell values are
# shortened with "..." in the printed table).
x.show()

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I know Spark can ...|[i, know, spark, ...|     8|
|  2|Logistic,regressi...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



In [16]:
from pyspark.ml.feature import StopWordsRemover

In [21]:
# Copy `words` into a new `filtered` column with stop words removed. No custom
# stop-word list is passed, so the transformer's defaults apply.
remover = StopWordsRemover(
    inputCol='words',
    outputCol='filtered',
)
remover.transform(x).show(truncate=False)

+---+-----------------------------------------+-----------------------------------------------+------+------------------------------------------+
|id |sentence                                 |words                                          |tokens|filtered                                  |
+---+-----------------------------------------+-----------------------------------------------+------+------------------------------------------+
|0  |Hi I heard about Spark                   |[hi, i, heard, about, spark]                   |5     |[hi, heard, spark]                        |
|1  |I know Spark can work well with NLP      |[i, know, spark, can, work, well, with, nlp]   |8     |[know, spark, work, well, nlp]            |
|2  |Logistic,regression,models,are,supervised|[logistic, regression, models, are, supervised]|5     |[logistic, regression, models, supervised]|
+---+-----------------------------------------+-----------------------------------------------+------+------------------------------------------+

In [24]:
# Keep the stop-word-filtered frame under a name so the n-gram stage can use it.
remover_df = remover.transform(x)

# NGram

In [22]:
from pyspark.ml.feature import NGram

In [25]:
# Build bigrams (n=2) from the stop-word-filtered tokens and display them.
ngram = NGram(
    n=2,
    inputCol='filtered',
    outputCol='ngrams',
)
ngramDataFrame = ngram.transform(remover_df)
ngramDataFrame.select('ngrams').show(truncate=False)

+-----------------------------------------------------------+
|ngrams                                                     |
+-----------------------------------------------------------+
|[hi heard, heard spark]                                    |
|[know spark, spark work, work well, well nlp]              |
|[logistic regression, regression models, models supervised]|
+-----------------------------------------------------------+



# TF-IDF

In [32]:
from pyspark.ml.feature import HashingTF, IDF

In [30]:
# Input for the vectorizers: only the bigram column of the n-gram output.
# Only `wordsData` is bound here; `ngramDataFrame` is deliberately left
# untouched so it keeps its full set of columns instead of being silently
# replaced by this one-column projection.
wordsData = ngram.transform(remover_df).select('ngrams')

In [31]:
# Hash each row's bigrams into a fixed-size term-frequency vector.
# NOTE(review): with only 10 buckets, distinct bigrams can collide — in the
# recorded output the first row's two bigrams share bucket 1 (count 2.0).
hashingTF = HashingTF(
    inputCol='ngrams',
    outputCol='rawFeatures',
    numFeatures=10,
)
featurizedData = hashingTF.transform(wordsData)
featurizedData.show(truncate=False)

+-----------------------------------------------------------+--------------------------+
|ngrams                                                     |rawFeatures               |
+-----------------------------------------------------------+--------------------------+
|[hi heard, heard spark]                                    |(10,[1],[2.0])            |
|[know spark, spark work, work well, well nlp]              |(10,[4,6,7],[1.0,2.0,1.0])|
|[logistic regression, regression models, models supervised]|(10,[0,1,3],[1.0,1.0,1.0])|
+-----------------------------------------------------------+--------------------------+



In [33]:
# Fit inverse-document-frequency weights on the hashed term frequencies, then
# apply the fitted model to write the rescaled vectors into `features`.
idf = IDF(
    inputCol='rawFeatures',
    outputCol='features',
)
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

In [34]:
# Display the IDF-rescaled vectors without truncating the sparse-vector text.
rescaledData.select('features').show(truncate=False)

+------------------------------------------------------------------------+
|features                                                                |
+------------------------------------------------------------------------+
|(10,[1],[0.5753641449035617])                                           |
|(10,[4,6,7],[0.6931471805599453,1.3862943611198906,0.6931471805599453]) |
|(10,[0,1,3],[0.6931471805599453,0.28768207245178085,0.6931471805599453])|
+------------------------------------------------------------------------+



# CountVectorizer

In [35]:
from pyspark.ml.feature import CountVectorizer

In [36]:
# Small hand-made corpus of pre-tokenized documents (one token list per row).
# NOTE(review): nothing in the visible cells reads `df` — the CountVectorizer
# cell fits on `wordsData` instead; confirm whether this frame is still needed.
documents = ["a b c", "a b b c a", "a b d d a c c"]
df = spark.session.createDataFrame(
    [(doc_id, text.split(' ')) for doc_id, text in enumerate(documents)],
    ['id', 'words'],
)

In [39]:
# Count-based bag of bigrams, fitted and applied on the `ngrams` column.
# NOTE(review): vocabSize=4 caps the vocabulary at four terms, while the three
# rows hold nine distinct bigrams, so most bigrams get no index — the recorded
# output shows at most two entries per row. Confirm the cap is intended.
cv = CountVectorizer(
    inputCol='ngrams',
    outputCol='features_cv',
    vocabSize=4,
    minDF=1,
)
model = cv.fit(wordsData)
res = model.transform(wordsData)
res.show(truncate=False)

+-----------------------------------------------------------+-------------------+
|ngrams                                                     |features_cv        |
+-----------------------------------------------------------+-------------------+
|[hi heard, heard spark]                                    |(4,[3],[1.0])      |
|[know spark, spark work, work well, well nlp]              |(4,[0,2],[1.0,1.0])|
|[logistic regression, regression models, models supervised]|(4,[1],[1.0])      |
+-----------------------------------------------------------+-------------------+

