In [2]:
# Build the SparkSession that the remaining cells use via the name `spark`.
from pyspark.sql import SparkSession

# NOTE(review): the master is plain 'local'; `spark.cores.max` is presumably
# only honoured by cluster managers, so it may have no effect here — confirm.
spark = (
    SparkSession.builder
    .master('local')
    .appName('stopwrds')
    .config('spark.executor.memory', '5gb')
    .config("spark.cores.max", "6")
    .getOrCreate()
)

# Stop Words

In [4]:
# Example input: two rows, each an integer id plus a list of word tokens.
sentenceData = spark.createDataFrame(
    [
        (0, ["I", "saw", "the", "red", "balloon"]),
        (1, ["Mary", "had", "a", "little", "lamb"]),
    ],
    schema=["id", "raw"],
)

# truncate=False prints the full token lists instead of shortened cells.
sentenceData.show(truncate=False)

+---+----------------------------+
|id |raw                         |
+---+----------------------------+
|0  |[I, saw, the, red, balloon] |
|1  |[Mary, had, a, little, lamb]|
+---+----------------------------+



In [8]:
from pyspark.ml.feature import StopWordsRemover

# Read the token arrays from the "raw" column and write the tokens that are
# not stop words to a new "filtered" column. No stopWords argument is passed,
# so the remover falls back to its own default word list.
remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
filteredData = remover.transform(sentenceData)

# transform() returns a new DataFrame; show its type for reference.
display(type(filteredData))

# printSchema() prints the schema itself and returns None, so call it
# directly: wrapping it in display() only rendered a stray "None".
filteredData.printSchema()

filteredData.show(truncate=False)

pyspark.sql.dataframe.DataFrame

root
 |-- id: long (nullable = true)
 |-- raw: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)



None

+---+----------------------------+--------------------+
|id |raw                         |filtered            |
+---+----------------------------+--------------------+
|0  |[I, saw, the, red, balloon] |[saw, red, balloon] |
|1  |[Mary, had, a, little, lamb]|[Mary, little, lamb]|
+---+----------------------------+--------------------+



In [9]:
# Print the built-in class documentation for StopWordsRemover.
help(StopWordsRemover)

Help on class StopWordsRemover in module pyspark.ml.feature:

class StopWordsRemover(pyspark.ml.wrapper.JavaTransformer, pyspark.ml.param.shared.HasInputCol, pyspark.ml.param.shared.HasOutputCol, pyspark.ml.util.JavaMLReadable, pyspark.ml.util.JavaMLWritable)
 |  A feature transformer that filters out stop words from input.
 |  
 |  .. note:: null values from input array are preserved unless adding null to stopWords explicitly.
 |  
 |  >>> df = spark.createDataFrame([(["a", "b", "c"],)], ["text"])
 |  >>> remover = StopWordsRemover(inputCol="text", outputCol="words", stopWords=["b"])
 |  >>> remover.transform(df).head().words == ['a', 'c']
 |  True
 |  >>> stopWordsRemoverPath = temp_path + "/stopwords-remover"
 |  >>> remover.save(stopWordsRemoverPath)
 |  >>> loadedRemover = StopWordsRemover.load(stopWordsRemoverPath)
 |  >>> loadedRemover.getStopWords() == remover.getStopWords()
 |  True
 |  >>> loadedRemover.getCaseSensitive() == remover.getCaseSensitive()
 |  True
 |  
 |  .. ver