* Main documentation: [pyspark.ml.feature.HashingTF API reference](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.HashingTF.html)

In [2]:
import findspark
# Point findspark at the Spark installation rooted at /spark. This is done
# ahead of the pyspark import below -- presumably so that import can resolve.
findspark.init("/spark")

from pyspark.sql import SparkSession

# Build the notebook's session in one chained expression:
#   - master('local[4]') selects Spark's local mode (4 worker threads in
#     Spark's master-URL syntax),
#   - appName('example') names the application,
#   - getOrCreate() hands back an already-running session when one exists
#     instead of starting a second one.
spark = SparkSession.builder.master('local[4]').appName('example').getOrCreate()

# Bare trailing expression: the notebook renders the session as the cell output.
spark

In [21]:
# Small labelled corpus: one (id, text, label) tuple per document.
# NOTE(review): ids are not unique -- the first six rows all use id 0;
# confirm that this is intentional.
training = spark.createDataFrame(
    [
        (0, "b", 1.0),
        (0, "hadoop", 1.0),
        (0, "spark spark", 1.0),
        (0, "b", 1.0),
        (0, "b b b", 1.0),
        (0, "a spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark a e", 1.0),
        (3, "hadoop mapreduce", 0.0),
    ],
    schema=["id", "text", "label"],
)

# Print the frame as an ASCII table.
training.show()

+---+----------------+-----+
| id|            text|label|
+---+----------------+-----+
|  0|               b|  1.0|
|  0|          hadoop|  1.0|
|  0|     spark spark|  1.0|
|  0|               b|  1.0|
|  0|           b b b|  1.0|
|  0|         a spark|  1.0|
|  1|             b d|  0.0|
|  2|       spark a e|  1.0|
|  3|hadoop mapreduce|  0.0|
+---+----------------+-----+



In [22]:
from pyspark.ml.feature import Tokenizer

# Tokenizer stage: reads the "text" column and writes the resulting list of
# words to a new "words" column (e.g. "b b b" -> [b, b, b] in the output).
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)

# Apply the stage to the training frame; the input columns are kept and
# "words" is appended. Then print the result.
tokenized = tokenizer.transform(training)
tokenized.show()

+---+----------------+-----+-------------------+
| id|            text|label|              words|
+---+----------------+-----+-------------------+
|  0|               b|  1.0|                [b]|
|  0|          hadoop|  1.0|           [hadoop]|
|  0|     spark spark|  1.0|     [spark, spark]|
|  0|               b|  1.0|                [b]|
|  0|           b b b|  1.0|          [b, b, b]|
|  0|         a spark|  1.0|         [a, spark]|
|  1|             b d|  0.0|             [b, d]|
|  2|       spark a e|  1.0|      [spark, a, e]|
|  3|hadoop mapreduce|  0.0|[hadoop, mapreduce]|
+---+----------------+-----+-------------------+



In [23]:
# HashingTF maps a sequence of terms to their term frequencies using the
# hashing trick. Per the Spark documentation, the hash code of each term is
# computed with Austin Appleby's MurmurHash 3 (MurmurHash3_x86_32) and is
# turned into a column index with a simple modulo, so a power of two is the
# advisable value for numFeatures; otherwise the features will not be mapped
# evenly to the columns.
# NOTE(review): 1000 is not a power of two (1024 is the nearest one). The
# value is left unchanged so results keep matching the recorded output.

from pyspark.ml.feature import HashingTF

# Hashing stage: "words" in, term-frequency vector "features" out, with the
# vector size fixed at construction time.
hashingTF = HashingTF(numFeatures=1000, inputCol="words", outputCol="features")

# Hash the tokenized frame, then print up to 10 rows without truncating cell
# values, one vertical record per row.
hashed = hashingTF.transform(tokenized)
hashed.show(n=10, truncate=False, vertical=True)

-RECORD 0--------------------------------------
 id       | 0                                  
 text     | b                                  
 label    | 1.0                                
 words    | [b]                                
 features | (1000,[165],[1.0])                 
-RECORD 1--------------------------------------
 id       | 0                                  
 text     | hadoop                             
 label    | 1.0                                
 words    | [hadoop]                           
 features | (1000,[585],[1.0])                 
-RECORD 2--------------------------------------
 id       | 0                                  
 text     | spark spark                        
 label    | 1.0                                
 words    | [spark, spark]                     
 features | (1000,[286],[2.0])                 
-RECORD 3--------------------------------------
 id       | 0                                  
 text     | b                           