In [None]:
!pip install -q pyspark spark-nlp

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.6/55.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m579.5/579.5 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [None]:
import sparknlp
from sparknlp import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# Start the session through Spark NLP; `spark` is used by the cells below
# to build the input DataFrames.
spark = sparknlp.start()


In [None]:
# Reads the "text" column and writes the "document" column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Reads "document" and writes "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Pretrained Norvig spell checker: reads "token" and writes "spell".
spellchecker = (
    NorvigSweetingModel.pretrained("spellcheck_norvig")
    .setInputCols(["token"])
    .setOutputCol("spell")
)

pipeline = Pipeline(stages=[documentAssembler, tokenizer, spellchecker])

data = spark.createDataFrame([["somtimes i wrrite wordz erong."]]).toDF("text")
result = pipeline.fit(data).transform(data)

# Show the original tokens next to the spell checker's output.
result.select(
    col("token.result").alias("before_spellchecker"),
    col("spell.result").alias("after_spellchecker"),
).show(truncate=False)

spellcheck_norvig download started this may take some time.
Approximate size to download 4.2 MB
[OK!]
+--------------------------------------+--------------------------------------+
|before_spellchecker                   |after_spellchecker                    |
+--------------------------------------+--------------------------------------+
|[somtimes, i, wrrite, wordz, erong, .]|[sometimes, i, write, words, wrong, .]|
+--------------------------------------+--------------------------------------+



## Training a spellchecker with the NorvigSweetingApproach

In [None]:
# One word per line; the text starts and ends with a newline.
external_dict = "\n".join(["", "dog", "fish", "horse", ""])

# Write the word list to disk so it can be referenced by file name later on.
with open("external_dict.txt", mode="w") as dict_file:
    dict_file.write(external_dict)

# Shell escape: print the beginning of the dictionary file to check its contents.
! head external_dict.txt


dog
fish
horse


In [None]:
# Reads the "text" column and writes the "document" column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Reads "document" and writes "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Trainable Norvig spell checker whose dictionary is external_dict.txt.
spellchecker = (
    NorvigSweetingApproach()
    .setInputCols(["token"])
    .setOutputCol("spell")
    .setDictionary("external_dict.txt")
)

pipeline = Pipeline(stages=[documentAssembler, tokenizer, spellchecker])

# Fit on a DataFrame holding a single empty string; the word list itself is
# supplied through setDictionary above.
empty_df = spark.createDataFrame([[""]]).toDF("text")
spellcheck_model = pipeline.fit(empty_df)

text_df = spark.createDataFrame([["The dogh is eating."]]).toDF("text")
corrected_text = spellcheck_model.transform(text_df)

# Show the original tokens next to the spell checker's output.
corrected_text.select(
    col("token.result").alias("before_spellchecker"),
    col("spell.result").alias("after_spellchecker"),
).show(truncate=False)


+--------------------------+-------------------------+
|before_spellchecker       |after_spellchecker       |
+--------------------------+-------------------------+
|[The, dogh, is, eating, .]|[The, dog, is, eating, .]|
+--------------------------+-------------------------+



## setCaseSensitive

In [None]:
# Same word list as before, but every entry is capitalized.
# The text starts and ends with a newline.
capital_external_dict = "\n".join(["", "Dog", "Fish", "Horse", ""])

# Write the capitalized word list to its own file.
with open("capital_external_dict.txt", mode="w") as dict_file:
    dict_file.write(capital_external_dict)

# Shell escape: print the beginning of the capitalized dictionary file to check its contents.
! head capital_external_dict.txt


Dog
Fish
Horse


In [None]:
# Reads the "text" column and writes the "document" column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Reads "document" and writes "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Two checkers using the same capitalized dictionary; they differ only in
# the caseSensitive setting (False vs True) and in their output column.
spellchecker_1 = (
    NorvigSweetingApproach()
    .setInputCols(["token"])
    .setOutputCol("spell_1")
    .setDictionary("capital_external_dict.txt")
    .setCaseSensitive(False)
)

spellchecker_2 = (
    NorvigSweetingApproach()
    .setInputCols(["token"])
    .setOutputCol("spell_2")
    .setDictionary("capital_external_dict.txt")
    .setCaseSensitive(True)
)

pipeline = Pipeline(
    stages=[documentAssembler, tokenizer, spellchecker_1, spellchecker_2]
)

# Fit on a DataFrame holding a single empty string; the word list itself is
# supplied through setDictionary above.
empty_df = spark.createDataFrame([[""]]).toDF("text")
spellcheck_model = pipeline.fit(empty_df)

text_df = spark.createDataFrame([["The name of the dogh is Dogh."]]).toDF("text")
corrected_text = spellcheck_model.transform(text_df)

# Compare the original tokens with the output of both checkers.
corrected_text.select(
    col("token.result").alias("before_spellchecker"),
    col("spell_1.result").alias("case_sensitive_false"),
    col("spell_2.result").alias("case_sensitive_true"),
).show(truncate=False)


+---------------------------------------+--------------------------------------+--------------------------------------+
|before_spellchecker                    |case_sensitive_false                  |case_sensitive_true                   |
+---------------------------------------+--------------------------------------+--------------------------------------+
|[The, name, of, the, dogh, is, Dogh, .]|[The, name, of, the, dog, is, Dogh, .]|[The, name, of, the, dogh, is, Dog, .]|
+---------------------------------------+--------------------------------------+--------------------------------------+



When caseSensitive is False (default value), the spellchecker ignores the uppercase letters in the external dictionary, so only the lowercase version of the word is corrected. When it is set to True, only the uppercased version of the word is corrected.

## setDupsLimit

In [None]:
# Reads the "text" column and writes the "document" column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Reads "document" and writes "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Two checkers using the same dictionary; they differ only in the
# dupsLimit setting (1 vs 0) and in their output column.
spellchecker_1 = (
    NorvigSweetingApproach()
    .setInputCols(["token"])
    .setOutputCol("spell_1")
    .setDictionary("external_dict.txt")
    .setDupsLimit(1)
)

spellchecker_2 = (
    NorvigSweetingApproach()
    .setInputCols(["token"])
    .setOutputCol("spell_2")
    .setDictionary("external_dict.txt")
    .setDupsLimit(0)
)

pipeline = Pipeline(
    stages=[documentAssembler, tokenizer, spellchecker_1, spellchecker_2]
)

# Fit on a DataFrame holding a single empty string; the word list itself is
# supplied through setDictionary above.
empty_df = spark.createDataFrame([[""]]).toDF("text")
spellcheck_model = pipeline.fit(empty_df)

text_df = spark.createDataFrame([["It was a gooood dogh."]]).toDF("text")
corrected_text = spellcheck_model.transform(text_df)

# Compare the original tokens with the output of both checkers.
corrected_text.select(
    col("token.result").alias("before_spellchecker"),
    col("spell_1.result").alias("dups_limit_1"),
    col("spell_2.result").alias("dups_limit_0"),
).show(truncate=False)


+-----------------------------+--------------------------+-------------------------+
|before_spellchecker          |dups_limit_1              |dups_limit_0             |
+-----------------------------+--------------------------+-------------------------+
|[It, was, a, gooood, dogh, .]|[It, was, a, good, dog, .]|[It, was, a, god, dog, .]|
+-----------------------------+--------------------------+-------------------------+



## setWordSizeIgnore

In [None]:
# Reads the "text" column and writes the "document" column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Reads "document" and writes "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Two checkers using the same dictionary; they differ only in the
# wordSizeIgnore setting (3 vs 4) and in their output column.
spellchecker_1 = (
    NorvigSweetingApproach()
    .setInputCols(["token"])
    .setOutputCol("spell_1")
    .setDictionary("external_dict.txt")
    .setWordSizeIgnore(3)
)

spellchecker_2 = (
    NorvigSweetingApproach()
    .setInputCols(["token"])
    .setOutputCol("spell_2")
    .setDictionary("external_dict.txt")
    .setWordSizeIgnore(4)
)

pipeline = Pipeline(
    stages=[documentAssembler, tokenizer, spellchecker_1, spellchecker_2]
)

# Fit on a DataFrame holding a single empty string; the word list itself is
# supplied through setDictionary above.
empty_df = spark.createDataFrame([[""]]).toDF("text")
spellcheck_model = pipeline.fit(empty_df)

text_df = spark.createDataFrame([["It was a good dogh."]]).toDF("text")
corrected_text = spellcheck_model.transform(text_df)

# Compare the original tokens with the output of both checkers.
corrected_text.select(
    col("token.result").alias("before_spellchecker"),
    col("spell_1.result").alias("word_size_ignore_3"),
    col("spell_2.result").alias("word_size_ignore_4"),
).show(truncate=False)


+---------------------------+--------------------------+---------------------------+
|before_spellchecker        |word_size_ignore_3        |word_size_ignore_4         |
+---------------------------+--------------------------+---------------------------+
|[It, was, a, good, dogh, .]|[It, was, a, good, dog, .]|[It, was, a, good, dogh, .]|
+---------------------------+--------------------------+---------------------------+



In this example, the misspelled word has 4 characters. The spellchecker with a value for wordSizeIgnore of 4 ignored this token and did not correct its spelling.