In [1]:
!pip install pyspark spark-nlp

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting spark-nlp
  Downloading spark_nlp-5.4.2-py2.py3-none-any.whl.metadata (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.6/55.6 kB[0m [31m964.6 kB/s[0m eta [36m0:00:00[0m
Downloading spark_nlp-5.4.2-py2.py3-none-any.whl (579 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m579.5/579.5 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=a443eacf04f3d2da170ca4a1d98f596bff72bda34c67d5ef982f99eb6a6397bb
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9f

In [2]:

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
# Pipeline is used by every pipeline cell below; import it explicitly instead of
# relying on it being re-exported by the wildcard imports above.
from pyspark.ml import Pipeline
from pyspark.sql import functions as F

# Create the Spark session through Spark NLP's helper and display it as the cell output.
spark = sparknlp.start()
spark


## CaseSensitive
Whether to do a case-sensitive comparison over the stop words (Default: false)

In [3]:
# DocumentAssembler: reads the raw "text" column, writes "document".
documenter = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# SentenceDetector: reads "document", writes "sentence".
sentencer = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenizer: reads "sentence", writes "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# StopWordsCleaner: reads "token", writes "cleanTokens"; matching ignores letter case.
stop_words = (
    StopWordsCleaner()
    .setInputCols(["token"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

pipeline = Pipeline(stages=[documenter, sentencer, tokenizer, stop_words])

# One-row DataFrame whose single column is named "text".
data = spark.createDataFrame(
    [["Tom is a nice man. He lives in Kashmir."]]
).toDF("text")

# Fit and apply the pipeline on the same data, then print the surviving tokens.
result = pipeline.fit(data).transform(data)
result.select("cleanTokens.result").show(truncate=False)

+--------------------------------------+
|result                                |
+--------------------------------------+
|[Tom, nice, man, ., lives, Kashmir, .]|
+--------------------------------------+



Here CaseSensitive is set to False, which is also the default. Any token that matches one of the default stop words (the list from MLlib's StopWordsRemover), regardless of letter case, is removed. In this case, the words "is", "a", "He", and "in" were removed.

##  CaseSensitive = True

In [4]:
# DocumentAssembler: reads the raw "text" column, writes "document".
documenter = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# SentenceDetector: reads "document", writes "sentence".
sentencer = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenizer: reads "sentence", writes "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# StopWordsCleaner: same wiring as before, but stop words must now match in exact case.
stop_words = (
    StopWordsCleaner()
    .setInputCols(["token"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(True)
)

pipeline = Pipeline(stages=[documenter, sentencer, tokenizer, stop_words])

# One-row DataFrame whose single column is named "text".
data = spark.createDataFrame(
    [["Tom is a nice man. He lives in Kashmir."]]
).toDF("text")

# Fit and apply the pipeline on the same data, then print the surviving tokens.
result = pipeline.fit(data).transform(data)
result.select("cleanTokens.result").show(truncate=False)

+------------------------------------------+
|result                                    |
+------------------------------------------+
|[Tom, nice, man, ., He, lives, Kashmir, .]|
+------------------------------------------+



With CaseSensitive = True, the word "He" is not treated as a stop word, because every stop word in MLlib's StopWordsRemover list is lowercase.

## Stop words from MLlib's StopWordsRemover

In [5]:
# Show the stop-word list held by the cleaner built in the previous cell.
# No custom list was passed via setStopWords there, so this is the default list.
stop_words.getStopWords()

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 '

## StopWords: supplied as an array of strings (loaded from a text file or written manually)

In [7]:
# StopWordsCleaner with a hand-written stop-word list; matching ignores letter case.
stop_words = (
    StopWordsCleaner()
    .setInputCols(["token"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
    .setStopWords(["Tom", "a"])
)

# Reuses the documenter, sentencer and tokenizer stages defined in earlier cells.
pipeline = Pipeline(stages=[documenter, sentencer, tokenizer, stop_words])

# One-row DataFrame whose single column is named "text".
data = spark.createDataFrame(
    [["Tom is a nice man. He lives in Kashmir."]]
).toDF("text")

# Fit and apply the pipeline on the same data, then print the surviving tokens.
result = pipeline.fit(data).transform(data)
result.select("cleanTokens.result").show(truncate=False)

+---------------------------------------------+
|result                                       |
+---------------------------------------------+
|[is, nice, man, ., He, lives, in, Kashmir, .]|
+---------------------------------------------+



## Token positions are preserved.

In [9]:
# StopWordsCleaner with no custom list set; matching ignores letter case.
stop_words = (
    StopWordsCleaner()
    .setInputCols(["token"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

# Reuses the documenter, sentencer and tokenizer stages defined in earlier cells.
pipeline = Pipeline(stages=[documenter, sentencer, tokenizer, stop_words])

# One-row DataFrame whose single column is named "text".
data = spark.createDataFrame(
    [["Tom is a nice man. He lives in Kashmir."]]
).toDF("text")

# Keep the transformed frame; the next cell inspects both "token" and "cleanTokens".
result = pipeline.fit(data).transform(data)


In [11]:
# Original tokens with their begin/end offsets.
result.select(
    F.col("token.result").alias("Tokens"),
    F.col("token.begin"),
    F.col("token.end"),
).show(truncate=False)

# Tokens that survived stop-word removal, with the begin/end offsets they carry.
result.select(
    F.col("cleanTokens.result").alias("Clean Tokens"),
    F.col("cleanTokens.begin"),
    F.col("cleanTokens.end"),
).show(truncate=False)


+-----------------------------------------------------+----------------------------------------+-----------------------------------------+
|Tokens                                               |begin                                   |end                                      |
+-----------------------------------------------------+----------------------------------------+-----------------------------------------+
|[Tom, is, a, nice, man, ., He, lives, in, Kashmir, .]|[0, 4, 7, 9, 14, 17, 19, 22, 28, 31, 38]|[2, 5, 7, 12, 16, 17, 20, 26, 29, 37, 38]|
+-----------------------------------------------------+----------------------------------------+-----------------------------------------+

+--------------------------------------+--------------------------+---------------------------+
|Clean Tokens                          |begin                     |end                        |
+--------------------------------------+--------------------------+---------------------------+
|[Tom, nice, man

## StopWordsCleaner Pre-trained Models

In [12]:
# DocumentAssembler: reads the raw "text" column, writes "document".
documenter = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# SentenceDetector: reads "document", writes "sentence".
sentencer = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenizer: reads "sentence", writes "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Pretrained English stop-word model (downloaded on first use).
# Parentheses replace the backslash chain: the original ended with a dangling "\"
# that only worked because a blank line followed it.
stop_words = (
    StopWordsCleaner.pretrained("stopwords_iso", "en")
    .setInputCols(["token"])
    .setOutputCol("cleanTokens")
)

pipeline = Pipeline(stages=[documenter, sentencer, tokenizer, stop_words])

# One-row DataFrame whose single column is named "text".
data = spark.createDataFrame([["You are not better than me"]]).toDF("text")

# Fit and apply the pipeline on the same data, then print the surviving tokens.
result = pipeline.fit(data).transform(data)
result.select("cleanTokens.result").show(truncate=False)

stopwords_iso download started this may take some time.
Approximate size to download 2.1 KB
[OK!]
+--------+
|result  |
+--------+
|[better]|
+--------+



In [13]:

# List the stop words bundled with the pretrained ("stopwords_iso", "en") model loaded above.
stop_words.getStopWords()

['a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'call',
 'can',
 'cannot',
 'ca',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'front',
 'full',
 'further',
 'get',
 'give',
 'g

## Stop words for other languages

In [17]:
# DocumentAssembler: reads the raw "text" column, writes "document".
documenter = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# SentenceDetector: reads "document", writes "sentence".
sentencer = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenizer: reads "sentence", writes "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Pretrained stop-word model for language code "tr" (downloaded on first use).
# Parentheses replace the backslash chain: the original ended with a dangling "\"
# that only worked because a blank line followed it.
stop_words = (
    StopWordsCleaner.pretrained("stopwords_iso", "tr")
    .setInputCols(["token"])
    .setOutputCol("cleanTokens")
)

pipeline = Pipeline(stages=[documenter, sentencer, tokenizer, stop_words])

# One-row DataFrame whose single column is named "text".
data = spark.createDataFrame(
    [["Dinlenmemek üzere yola çıkanlar asla yorulmazlar. İyi daha iyinin düşmanıdır."]]
).toDF("text")

# Fit and apply the pipeline on the same data, then print the surviving tokens.
result = pipeline.fit(data).transform(data)
result.select("cleanTokens.result").show(truncate=False)

stopwords_iso download started this may take some time.
Approximate size to download 3.1 KB
[OK!]
+-------------------------------------------------------------------------------+
|result                                                                         |
+-------------------------------------------------------------------------------+
|[Dinlenmemek, yola, çıkanlar, asla, yorulmazlar, ., İyi, iyinin, düşmanıdır, .]|
+-------------------------------------------------------------------------------+



In [15]:
# List the stop words held by the current `stop_words` cleaner.
# NOTE(review): this cell's execution count (15) is lower than that of the cell above (17),
# so the cells were run out of order; re-run top to bottom to confirm the output matches the "tr" model.
stop_words.getStopWords()

['acaba',
 'acep',
 'adamakıllı',
 'adeta',
 'ait',
 'ama',
 'amma',
 'anca',
 'ancak',
 'arada',
 'artık',
 'aslında',
 'aynen',
 'ayrıca',
 'az',
 'açıkça',
 'açıkçası',
 'bana',
 'bari',
 'bazen',
 'bazı',
 'bazısı',
 'bazısına',
 'bazısında',
 'bazısından',
 'bazısını',
 'bazısının',
 'başkası',
 'başkasına',
 'başkasında',
 'başkasından',
 'başkasını',
 'başkasının',
 'başka',
 'belki',
 'ben',
 'bende',
 'benden',
 'beni',
 'benim',
 'beri',
 'beriki',
 'berikinin',
 'berikiyi',
 'berisi',
 'bilcümle',
 'bile',
 'binaen',
 'binaenaleyh',
 'biraz',
 'birazdan',
 'birbiri',
 'birbirine',
 'birbirini',
 'birbirinin',
 'birbirinde',
 'birbirinden',
 'birden',
 'birdenbire',
 'biri',
 'birine',
 'birini',
 'birinin',
 'birinde',
 'birinden',
 'birice',
 'birileri',
 'birilerinde',
 'birilerinden',
 'birilerine',
 'birilerini',
 'birilerinin',
 'birisi',
 'birisine',
 'birisini',
 'birisinin',
 'birisinde',
 'birisinden',
 'birkaç',
 'birkaçı',
 'birkaçına',
 'birkaçını',
 'birkaçının'