In [None]:
!pip install -q pyspark==3.1.2  spark-nlp==4.2.4

In [None]:
import sparknlp
from sparknlp.base import *
from sparknlp import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# Start the Spark session through Spark NLP's helper; `spark` is used below to build the example DataFrames.
spark = sparknlp.start()


In [None]:
# Pretrained pipeline: raw text -> document -> tokens -> spell-checked tokens.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# The model name "spellcheck_sd" is the default value, so it may be left out.
spellChecker = (
    SymmetricDeleteModel.pretrained("spellcheck_sd")
    .setInputCols(["token"])
    .setOutputCol("spell")
)

pipeline = Pipeline(stages=[documentAssembler, tokenizer, spellChecker])

data = spark.createDataFrame([["somtimes i wrrite wordz erong."]]).toDF("text")
result = pipeline.fit(data).transform(data)

# Put the original tokens next to the spellchecker output for comparison.
result.select(
    col("token.result").alias("before_spellchecker"),
    col("spell.result").alias("after_spellchecker"),
).show(truncate=False)

## Training a spellchecker using SymmetricDeleteApproach

In [None]:
# Train a SymmetricDeleteApproach spellchecker on a tiny corpus and apply it to new text.
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

# Trainable annotator: it is fitted on the tokens of the training DataFrame.
spellChecker = SymmetricDeleteApproach() \
    .setInputCols(["token"]) \
    .setOutputCol("spell")

pipeline = Pipeline().setStages([
    documentAssembler,
    tokenizer,
    spellChecker
])

# The DataFrame method is `toDF` (not `to_DF`); it names the single column "text",
# which is the input column the DocumentAssembler reads.
training_df = spark.createDataFrame([["The dog and the cat play together."]]).toDF("text")

spellcheck_model = pipeline.fit(training_df)

text_df = spark.createDataFrame([["The dogh and th caat is eating"]]).toDF("text")

corrected_text = spellcheck_model.transform(text_df)

# Put the original tokens next to the spellchecker output for comparison.
corrected_text.select(
    col("token.result").alias("before_spellchecker"),
    col("spell.result").alias("after_spellchecker"),
).show(truncate=False)


## setDictionary

In [None]:
# Write a small external word list to disk so it can be passed to setDictionary.
external_dict = '''
dogs
are
'''
with open('external_dict.txt', 'w') as f:
    f.write(external_dict)

documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Baseline spellchecker: no external dictionary.
spellChecker_1 = (
    SymmetricDeleteApproach()
    .setInputCols(["token"])
    .setOutputCol("spell_1")
)

# Same configuration, plus the external dictionary file written above.
spellChecker_2 = (
    SymmetricDeleteApproach()
    .setInputCols(["token"])
    .setOutputCol("spell_2")
    .setDictionary("external_dict.txt")
)

pipeline = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    spellChecker_1,
    spellChecker_2,
])

training_df = spark.createDataFrame([["The dog and the cat play together."]]).toDF("text")
spellcheck_model = pipeline.fit(training_df)

text_df = spark.createDataFrame([["teh dogs aree eating."]]).toDF("text")
corrected_text = spellcheck_model.transform(text_df)

# Compare the original tokens with the output of both spellcheckers.
corrected_text.select(
    col("token.result").alias("before_spellchecker"),
    col("spell_1.result").alias("spellchecker_without_dict"),
    col("spell_2.result").alias("spellchecker_with_dict"),
).show(truncate=False)

## setDupsLimit

In [None]:
# Compare two spellcheckers that differ only in their dupsLimit setting (1 vs 0).
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

spellChecker_1 = (
    SymmetricDeleteApproach()
    .setInputCols(["token"])
    .setOutputCol("spell_1")
    .setDupsLimit(1)
)

spellChecker_2 = (
    SymmetricDeleteApproach()
    .setInputCols(["token"])
    .setOutputCol("spell_2")
    .setDupsLimit(0)
)

pipeline = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    spellChecker_1,
    spellChecker_2,
])

training_df = spark.createDataFrame([["it was a good day, and the dog played alone."]]).toDF("text")
spellcheck_model = pipeline.fit(training_df)

text_df = spark.createDataFrame([["it was a goood dogg."]]).toDF("text")
corrected_text = spellcheck_model.transform(text_df)

# Compare the original tokens with the output of both spellcheckers.
corrected_text.select(
    col("token.result").alias("before_spellchecker"),
    col("spell_1.result").alias("dups_limit_1"),
    col("spell_2.result").alias("dups_limit_0"),
).show(truncate=False)

## setFrequencyThreshold

In [None]:
# Compare two spellcheckers that differ only in their frequencyThreshold setting (0 vs 2).
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

spellChecker_1 = (
    SymmetricDeleteApproach()
    .setInputCols(["token"])
    .setOutputCol("spell_1")
    .setFrequencyThreshold(0)
)

spellChecker_2 = (
    SymmetricDeleteApproach()
    .setInputCols(["token"])
    .setOutputCol("spell_2")
    .setFrequencyThreshold(2)
)

pipeline = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    spellChecker_1,
    spellChecker_2,
])

# In this training sentence "the" occurs twice; every other word occurs once.
training_df = spark.createDataFrame([["the dog and the cat play together."]]).toDF("text")
spellcheck_model = pipeline.fit(training_df)

text_df = spark.createDataFrame([["teh dogh is eating."]]).toDF("text")
corrected_text = spellcheck_model.transform(text_df)

# Compare the original tokens with the output of both spellcheckers.
corrected_text.select(
    col("token.result").alias("before_spellchecker"),
    col("spell_1.result").alias("frequency_threshold_0"),
    col("spell_2.result").alias("frequency_threshold_2"),
).show(truncate=False)

In this example, the spellchecker with frequencyThreshold = 2 did not correct the misspelled word "dogh" because its correct spelling, "dog", appears only once in the training data. In contrast, "teh" was corrected because "the" appears at least twice in the training data.

## setMaxEditDistance

In [None]:
# Compare two spellcheckers that differ only in their maxEditDistance setting (1 vs 2).
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

spellChecker_1 = (
    SymmetricDeleteApproach()
    .setInputCols(["token"])
    .setOutputCol("spell_1")
    .setMaxEditDistance(1)
)

spellChecker_2 = (
    SymmetricDeleteApproach()
    .setInputCols(["token"])
    .setOutputCol("spell_2")
    .setMaxEditDistance(2)
)

pipeline = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    spellChecker_1,
    spellChecker_2,
])

training_df = spark.createDataFrame([["the dog and the cat play together."]]).toDF("text")
spellcheck_model = pipeline.fit(training_df)

text_df = spark.createDataFrame([["teh dogh is eating."]]).toDF("text")
corrected_text = spellcheck_model.transform(text_df)

# Compare the original tokens with the output of both spellcheckers.
corrected_text.select(
    col("token.result").alias("before_spellchecker"),
    col("spell_1.result").alias("max_edit_distance_1"),
    col("spell_2.result").alias("max_edit_distance_2"),
).show(truncate=False)

When maxEditDistance is 1, "teh" is not corrected to "the" because the number of edits needed (2 letters) exceeds the maximum number of edits allowed.