In [1]:
import os

import findspark

# Point findspark at the Spark installation so that `pyspark` becomes importable.
# Honour an existing SPARK_HOME environment variable instead of hard-coding an
# absolute path; fall back to "/spark" (the original location) when it is unset,
# so the behaviour is unchanged on machines that relied on the hard-coded path.
findspark.init(os.environ.get("SPARK_HOME", "/spark"))

from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise create one that runs
# locally (master 'local[4]') under the application name 'example'.
spark = SparkSession.builder.master('local[4]').appName('example').getOrCreate()

# Bare expression so the notebook displays the session object.
spark

In [10]:
import pyspark.sql.functions as f
# Toy corpus: each row pairs a list of string tokens with an integer id.
# Rows 2 and 3 deliberately share the same tokens; row 4 has a single token.
data = [
    (['john', 'sam', 'jane'], 1),
    (['whiskers', 'rover', 'fido'], 2),
    (['whiskers', 'rover', 'fido'], 3),
    (['whiskers'], 4),
]

# Column names are given explicitly; column types are inferred from the data.
df = spark.createDataFrame(data, schema=["names", "id"])
df.show(truncate=False)

+-----------------------+---+
|names                  |id |
+-----------------------+---+
|[john, sam, jane]      |1  |
|[whiskers, rover, fido]|2  |
|[whiskers, rover, fido]|3  |
|[whiskers]             |4  |
+-----------------------+---+



In [8]:
# Print the column names and types of `df`.
# NOTE(review): the recorded output below lists only `names`, while `df` is
# created with both `names` and `id`. This cell's execution count (8) predates
# the cell that builds `df` (10), so the output looks stale; re-run to refresh.
df.printSchema()

root
 |-- names: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [11]:
from pyspark.ml.feature import CountVectorizer

In [18]:
# API reference:
# https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.CountVectorizer.html
#
# Tutorial:
# https://towardsdatascience.com/countvectorizer-hashingtf-e66f169e2d4e

# Configure the vectorizer (nothing is fitted yet):
#   inputCol / outputCol: read token lists from "names", write vectors to "features".
#   vocabSize: maximum size of the vocabulary.
#   minDF: minimum number of documents a token must appear in to be considered.
cv = CountVectorizer(inputCol="names", outputCol="features", vocabSize=300, minDF=1.0)

In [19]:
# Fit the configured CountVectorizer on `df`; `model` is the fitted
# CountVectorizerModel that the following cells inspect.
model = cv.fit(df)

# Apply the fitted model: `result` is `df` plus a `features` column, displayed
# as sparse vectors of the form (size, [indices], [counts]).
result = model.transform(df)
result.show(truncate=False)

+-----------------------+---+-------------------------+
|names                  |id |features                 |
+-----------------------+---+-------------------------+
|[john, sam, jane]      |1  |(6,[3,4,5],[1.0,1.0,1.0])|
|[whiskers, rover, fido]|2  |(6,[0,1,2],[1.0,1.0,1.0])|
|[whiskers, rover, fido]|3  |(6,[0,1,2],[1.0,1.0,1.0])|
|[whiskers]             |4  |(6,[0],[1.0])            |
+-----------------------+---+-------------------------+



In [20]:
# Show the learned vocabulary. The position of a token in this list is the
# index used for it in the `features` vectors (e.g. 'whiskers' is index 0).
model.vocabulary

['whiskers', 'fido', 'rover', 'sam', 'jane', 'john']

In [23]:
# Bare expression: display the fitted model's repr (its uid and vocabulary size).
model

CountVectorizerModel: uid=CountVectorizer_dbccca9d33af, vocabularySize=6