In [None]:
!pip install -q pyspark spark-nlp

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.6/55.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m579.5/579.5 kB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [None]:
import pandas as pd

from pyspark.sql import functions as F
from pyspark.ml import Pipeline
from pyspark.sql.functions import array_contains

import sparknlp
from sparknlp.annotator import (
    Tokenizer,
    ContextSpellCheckerModel,
    ContextSpellCheckerApproach,
    SentenceDetector,
    NorvigSweetingModel,
    SymmetricDeleteModel,
)
from sparknlp.common import RegexRule
from sparknlp.base import DocumentAssembler, LightPipeline

In [None]:
spark = sparknlp.start()

# Report the library versions behind this session so the run can be reproduced.
for label, version in (
    ("Spark NLP version: ", sparknlp.version()),
    ("Apache Spark version: ", spark.version),
):
    print(label, version)

Spark NLP version:  5.4.2
Apache Spark version:  3.5.2


In [None]:
# Deliberately misspelled sentence used as input for the spell checkers below.
example_sentence = "Plaese alliow me tao introdduce myhelf, I am a man of waelht und tiaste"

In [None]:
def get_light_pipeline(spellModel):
    """Wrap a spell-checker stage in a ready-to-use ``LightPipeline``.

    The stage is appended after a ``DocumentAssembler`` (``text`` ->
    ``document``) and a ``Tokenizer`` (``document`` -> ``token``). Uses the
    notebook-level ``spark`` session.

    Args:
        spellModel: Spell-checker stage placed last in the pipeline.

    Returns:
        A ``LightPipeline`` wrapping the pipeline fitted on a one-row
        DataFrame whose ``text`` value is an empty string.
    """
    stages = [
        DocumentAssembler().setInputCol("text").setOutputCol("document"),
        Tokenizer().setInputCols("document").setOutputCol("token"),
        spellModel,
    ]
    assembled = Pipeline(stages=stages)

    # Fit on a placeholder one-row frame with an empty "text" value.
    placeholder_df = spark.createDataFrame([[""]]).toDF("text")
    return LightPipeline(assembled.fit(placeholder_df))



In [None]:
# Load the pretrained context-aware spell checker; it reads the `token`
# column and writes its corrections to `checked`.
spellModel = (
    ContextSpellCheckerModel.pretrained("spellcheck_dl")
    .setInputCols("token")
    .setOutputCol("checked")
)

# Reuse get_light_pipeline instead of repeating the
# DocumentAssembler -> Tokenizer -> spell checker assembly by hand.
lp = get_light_pipeline(spellModel)

spellcheck_dl download started this may take some time.
Approximate size to download 95.1 MB
[OK!]


In [None]:
# List the special word classes known to the pretrained model, each paired with its parser type.
spellModel.getWordClasses()

['(_NAME_,VocabParser)',
 '(_DATE_,RegexParser)',
 '(_LOC_,VocabParser)',
 '(_NUM_,RegexParser)']

In [None]:
# The output column was already set to "checked" when the model was loaded, so
# this call repeats that setting; the cell output is the model identifier.
spellModel.setOutputCol("checked")

SPELL_eaf90fb024f0

In [None]:
# Maximum window length configured on the pretrained model.
spellModel.getMaxWindowLen()

5

In [None]:
# Case strategy configured on the pretrained model, reported as an integer code.
# NOTE(review): the meaning of each code is not shown in this notebook; check the Spark NLP docs.
spellModel.getCaseStrategy()

2

In [None]:
# Maximum edit distance the pretrained model considers for a correction.
spellModel.getWordMaxDistance()

3

In [None]:
# Run the light pipeline directly on the raw string; `checked` holds the corrected tokens.
result = lp.annotate(example_sentence)
print(result["checked"])

['Please', 'allow', 'me', 'to', 'introduce', 'myself', ',', 'I', 'am', 'a', 'man', 'of', 'wealth', 'and', 'taste']


In [None]:
# Full annotation dict: one list of strings per output column (document, token, checked).
print(result)

{'document': ['Plaese alliow me tao introdduce myhelf, I am a man of waelht und tiaste'], 'token': ['Plaese', 'alliow', 'me', 'tao', 'introdduce', 'myhelf', ',', 'I', 'am', 'a', 'man', 'of', 'waelht', 'und', 'tiaste'], 'checked': ['Please', 'allow', 'me', 'to', 'introduce', 'myself', ',', 'I', 'am', 'a', 'man', 'of', 'wealth', 'and', 'taste']}


In [None]:
# Show each input token next to what the spell checker produced for it.
for original, corrected in zip(result["token"], result["checked"]):
    print(f"{original} -> {corrected}")

Plaese -> Please
alliow -> allow
me -> me
tao -> to
introdduce -> introduce
myhelf -> myself
, -> ,
I -> I
am -> am
a -> a
man -> man
of -> of
waelht -> wealth
und -> and
tiaste -> taste


In [None]:
# Same pretrained model, but restricted to a maximum edit distance of 1
# to compare against the default configuration used above.
spellModel_modified = (
    ContextSpellCheckerModel.pretrained("spellcheck_dl")
    .setInputCols("token")
    .setOutputCol("checked")
    .setWordMaxDistance(1)
)

lp = get_light_pipeline(spellModel_modified)
result = lp.annotate(example_sentence)

# Print each input token next to its (possibly unchanged) correction.
for original, corrected in zip(result["token"], result["checked"]):
    print(f"{original} -> {corrected}")

spellcheck_dl download started this may take some time.
Approximate size to download 95.1 MB
[OK!]
Plaese -> Please
alliow -> allow
me -> me
tao -> tao
introdduce -> introduce
myhelf -> myself
, -> ,
I -> I
am -> am
a -> a
man -> man
of -> of
waelht -> waelht
und -> und
tiaste -> taste


## Training a context-aware spell checker

In [None]:
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/holmes.txt


In [None]:
path = "holmes.txt"

# Read the downloaded book as text and name the resulting single column "text".
book_lines = spark.read.text(path)
corpus = book_lines.toDF("text")
corpus.show(truncate=800)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|                                                                                                                                                                                                    

In [None]:
documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer = Tokenizer().setInputCols("document").setOutputCol("token")

# First training attempt, configured to run directly on the raw corpus lines.
# No language-model class count is set here; fitting this pipeline in the next
# cell reports that no suitable graph could be found.
spellChecker = (
    ContextSpellCheckerApproach()
    .setInputCols("token")
    .setOutputCol("checked")
    .setBatchSize(1)
    .setEpochs(1)
    .setWordMaxDistance(3)
    .setMaxWindowLen(3)
    .setMinCount(3.0)
    .setCompoundCount(3)
    .setClassCount(5)
)

training_stages = [documentAssembler, tokenizer, spellChecker]
pipeline = Pipeline(stages=training_stages)


In [None]:
# Fitting on the raw corpus fails with this configuration; catch the error and
# show only its message, since the failure itself is what this cell demonstrates.
try:
    model = pipeline.fit(corpus)
except Exception as fit_error:
    print(fit_error)

requirement failed: We couldn't find any suitable graph for 2000 classes, vocabSize: 3094


## Preparing the corpus for training
We will use the SentenceDetector annotator to split the book into sentences, then sample a number of sentences small enough for Colab to process. Because the spell checker is a deep learning model, training is computationally heavy; for large datasets, training on a Spark cluster is recommended.

In [None]:
documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
sentenceDetector = SentenceDetector().setInputCols("document").setOutputCol("sentence")

# Turn the corpus rows into documents, then split the documents into sentences.
documents = documentAssembler.transform(corpus)
sentences = sentenceDetector.transform(documents)

# Keep about 10% of the sentences, one per row; the fixed seed makes the sample repeatable.
sentence_rows = sentences.select(F.explode("sentence.result").alias("sentence"))
sample = sentence_rows.sample(fraction=0.1, seed=42)
sample.count()



561

## Training the model
Create a new pipeline that processes this sample from the beginning (DocumentAssembler -> Tokenizer -> ContextSpellChecker).

In [None]:

# Note that we use `sentence` as the input column name: the sampled DataFrame
# has a single column with that name.
documentAssembler = DocumentAssembler().setInputCol("sentence").setOutputCol("document")
tokenizer = Tokenizer().setInputCols("document").setOutputCol("token")

spellChecker = (
    ContextSpellCheckerApproach()
    .setInputCols("token")
    .setOutputCol("checked")
    .setBatchSize(1)  # Batch size 1 to run in Colab
    .setEpochs(1)
    .setWordMaxDistance(3)  # Maximum edit distance to consider
    .setMaxWindowLen(3)  # Important for finding context
    .setMinCount(3.0)  # Removes words that appear fewer times than this from the vocabulary
    .setCompoundCount(5)  # Removes compound words that appear fewer times than this from the vocabulary
    .setClassCount(10.0)  # Minimum occurrences of a class
    .setLanguageModelClasses(1650)  # A value for which a TF graph is available
)

training_stages = [documentAssembler, tokenizer, spellChecker]
pipeline = Pipeline(stages=training_stages)

In [None]:

%%time
# Let a training failure surface here: catching and printing it would leave
# `model` undefined (or stale from an earlier run) for the cell that uses it next.
model = pipeline.fit(sample)

In [None]:
lp = LightPipeline(model)

# Try the freshly trained model on a sentence containing misspelled words.
test = lp.annotate("Sherlok Hlmes founds the solution to the mistrey")

for original, corrected in zip(test["token"], test["checked"]):
    print(f"{original} => {corrected}")

