In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Build (or reuse) one local Spark session for the whole notebook.
# Going through the builder's getOrCreate() keeps this cell safe to re-run;
# constructing SparkContext(...) directly fails on a second execution because
# only one context may be active per process. Setting master and appName on the
# builder also ensures the application name is applied when the context is
# first created, rather than being ignored because a context already exists.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Sentence example')
    .getOrCreate()
)

# Expose the underlying context under the name the notebook originally used.
sc = spark.sparkContext

In [5]:
# Toy corpus: two sentences, each paired with an integer label.
df = spark.createDataFrame(
    [
        ('Apache spark is faster than Hadoop', 1),
        ('Spark supports both streaming and batch processing', 0),
    ],
    schema=['text', 'label'],
)
df.show(truncate=False)

+--------------------------------------------------+-----+
|text                                              |label|
+--------------------------------------------------+-----+
|Apache spark is faster than Hadoop                |1    |
|Spark supports both streaming and batch processing|0    |
+--------------------------------------------------+-----+



## Tokenizer

In [7]:
from pyspark.ml.feature import Tokenizer

# Add a 'tokens' column derived from 'text'. As the output below shows, the
# tokens come back lower-cased and split into individual words.
tokenizer = Tokenizer(
    inputCol='text',
    outputCol='tokens',
)
token_df = tokenizer.transform(df)
token_df.show(truncate=False)

+--------------------------------------------------+-----+----------------------------------------------------------+
|text                                              |label|tokens                                                    |
+--------------------------------------------------+-----+----------------------------------------------------------+
|Apache spark is faster than Hadoop                |1    |[apache, spark, is, faster, than, hadoop]                 |
|Spark supports both streaming and batch processing|0    |[spark, supports, both, streaming, and, batch, processing]|
+--------------------------------------------------+-----+----------------------------------------------------------+



## HashingTF

In [13]:
from pyspark.ml.feature import HashingTF

# Map each token list to a fixed-width term-frequency vector via hashing.
# Only 2**4 = 16 buckets are used, so distinct tokens can land in the same
# bucket: note the 2.0 counts in the output below even though no token is
# repeated within a row.
hashing = HashingTF(
    inputCol='tokens',
    outputCol='hashTF',
    numFeatures=2 ** 4,
)
hash_df = hashing.transform(token_df)
hash_df.show(truncate=False)

+--------------------------------------------------+-----+----------------------------------------------------------+----------------------------------------+
|text                                              |label|tokens                                                    |hashTF                                  |
+--------------------------------------------------+-----+----------------------------------------------------------+----------------------------------------+
|Apache spark is faster than Hadoop                |1    |[apache, spark, is, faster, than, hadoop]                 |(16,[1,7,10,13],[2.0,1.0,1.0,2.0])      |
|Spark supports both streaming and batch processing|0    |[spark, supports, both, streaming, and, batch, processing]|(16,[1,2,7,13,15],[2.0,1.0,1.0,1.0,2.0])|
+--------------------------------------------------+-----+----------------------------------------------------------+----------------------------------------+



## CountVectorizer

In [18]:
from pyspark.ml.feature import CountVectorizer

# CountVectorizer is an estimator: fit() learns a vocabulary from the tokens,
# and the fitted model then turns each row into a vector of term counts.
# vocabSize caps the vocabulary at 2**4 = 16 terms; the output below shows
# vectors of length 12, i.e. the cap is not reached on this corpus.
countvector = CountVectorizer(
    inputCol='tokens',
    outputCol='countVec',
    vocabSize=2 ** 4,
)
countvector_model = countvector.fit(token_df)
countvector_df = countvector_model.transform(token_df)
countvector_df.show(truncate=False)

+--------------------------------------------------+-----+----------------------------------------------------------+----------------------------------------------------+
|text                                              |label|tokens                                                    |countVec                                            |
+--------------------------------------------------+-----+----------------------------------------------------------+----------------------------------------------------+
|Apache spark is faster than Hadoop                |1    |[apache, spark, is, faster, than, hadoop]                 |(12,[0,1,2,3,5,7],[1.0,1.0,1.0,1.0,1.0,1.0])        |
|Spark supports both streaming and batch processing|0    |[spark, supports, both, streaming, and, batch, processing]|(12,[0,4,6,8,9,10,11],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
+--------------------------------------------------+-----+----------------------------------------------------------+----------------------------------------------------+

# Example 2

In [20]:
# Second toy corpus: single-column rows, so each row is a one-element tuple.
a_df = spark.createDataFrame(
    [
        ('Rich Dad Poor Dad',),
        ('Dad is rich',),
    ],
    schema=['text'],
)
a_df.show()

+-----------------+
|             text|
+-----------------+
|Rich Dad Poor Dad|
|      Dad is rich|
+-----------------+



In [29]:
# Tokenize the second corpus into a 'tokens' column (lower-cased words, as the
# output below shows).
a_token = Tokenizer(
    inputCol='text',
    outputCol='tokens',
)
token_a_df = a_token.transform(a_df)
token_a_df.show(truncate=False)

+-----------------+----------------------+
|text             |tokens                |
+-----------------+----------------------+
|Rich Dad Poor Dad|[rich, dad, poor, dad]|
|Dad is rich      |[dad, is, rich]       |
+-----------------+----------------------+



In [33]:
# Count term occurrences with the vocabulary capped at 2**2 = 4 terms.
# The output below shows a repeated token ('dad' in the first row) producing a
# count of 2.0 in the resulting vector.
a_countvec = CountVectorizer(
    inputCol='tokens',
    outputCol='counts',
    vocabSize=2 ** 2,
)
a_countvec_model = a_countvec.fit(token_a_df)
countvec_a_df = a_countvec_model.transform(token_a_df)
countvec_a_df.show(truncate=False)

+-----------------+----------------------+-------------------------+
|text             |tokens                |counts                   |
+-----------------+----------------------+-------------------------+
|Rich Dad Poor Dad|[rich, dad, poor, dad]|(4,[0,1,3],[2.0,1.0,1.0])|
|Dad is rich      |[dad, is, rich]       |(4,[0,1,2],[1.0,1.0,1.0])|
+-----------------+----------------------+-------------------------+

