In [1]:
import re
import os
import sys
from pkg_resources import resource_filename
from symspellpy import SymSpell, Verbosity
from transformers import pipeline


# --------------------------------------------------------
# 1. Load SymSpell dictionary
# --------------------------------------------------------
def load_symspell(max_distance=3):
    sym_spell = SymSpell(max_dictionary_edit_distance=max_distance, prefix_length=7)

    # Try to load dictionary packaged inside symspellpy
    try:
        dictionary_path = resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt"
        )
    except Exception:
        dictionary_path = "frequency_dictionary_en_82_765.txt"

    if not os.path.exists(dictionary_path):
        print(f"[ERROR] Dictionary not found at: {dictionary_path}")
        print(
            "Download: https://raw.githubusercontent.com/mammothb/symspellpy/master/symspellpy/frequency_dictionary_en_82_765.txt"
        )
        sys.exit(1)

    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
    return sym_spell


# --------------------------------------------------------
# 2. Load BERT Masked-LM model
# --------------------------------------------------------
def load_bert():
    return pipeline("fill-mask", model="bert-base-uncased")


# --------------------------------------------------------
# 3. Smart contextual ranking with BERT
# --------------------------------------------------------
def choose_best_with_bert(bert, sentence, target_word, candidates):
    """
    Replace target_word with [MASK], ask BERT which candidate fits the context.
    """
    masked = sentence.replace(target_word, "[MASK]", 1)

    # Query BERT
    outputs = bert(masked)

    # Format:
    # [{'sequence': '...', 'score': ..., 'token_str': 'this'}, ...]
    scores = {entry["token_str"]: entry["score"] for entry in outputs}

    # Normalize candidates (lowercase BERT tokens)
    def normalize(w):
        return w.lower()

    ranked = []
    for cand in candidates:
        c = normalize(cand)
        ranked.append((cand, scores.get(c, 0)))  # 0 if BERT didn't propose it

    # Sort by BERT contextual probability
    ranked.sort(key=lambda x: x[1], reverse=True)

    return ranked[0][0]  # best candidate


# --------------------------------------------------------
# 4. Capitalization helper
# --------------------------------------------------------
def apply_original_casing(original, corrected):
    if original.isupper():
        return corrected.upper()
    if original[0].isupper():
        return corrected.capitalize()
    return corrected


# --------------------------------------------------------
# 5. Full correction logic
# --------------------------------------------------------
def correct_text(sym_spell, bert, text):
    # Tokenize while keeping punctuation
    tokens = re.findall(r"[A-Za-z']+|[^A-Za-z']+", text)
    corrected_tokens = tokens.copy()
    corrections = {}

    # Rebuild text by sentences (for contextual BERT)
    sentences = re.split(r"(?<=[.!?]) +", text)

    for s_idx, sentence in enumerate(sentences):
        words = re.findall(r"[A-Za-z']+", sentence)

        for word in words:
            lookup = sym_spell.lookup(
                word.lower(), Verbosity.CLOSEST, max_edit_distance=3
            )

            if not lookup:
                continue

            # top N SymSpell suggestions
            candidates = [cand.term for cand in lookup[:5]]

            # If the original is included → skip
            if word.lower() in candidates:
                continue

            # Use BERT to pick best suggestion given context
            best = choose_best_with_bert(bert, sentence, word, candidates)

            # Apply original casing
            best = apply_original_casing(word, best)

            # Replace exactly in the reconstructed text
            for i, tok in enumerate(corrected_tokens):
                if tok == word:
                    corrected_tokens[i] = best

            corrections[word] = best

    final_text = "".join(corrected_tokens)
    return final_text, corrections


# --------------------------------------------------------
# 6. User input logic
# --------------------------------------------------------
default_text = (
    "Thiss cod chekks speling in text using SymSpell and BERRT ranking.\n"
    "It corects words by two stepps: dictionnary lookup and conttext moddel.\n"
    "SymmSpell findd candidattes from a big dictonary faast.\n"
    "Then a BERRT maskked language modele chooces the best suggetion in context.\n"
    "The input can be a defaalt texxt or custtom lines typped by the usser.\n"
    "In the defalt texxt we add mispelled wordss to testt the code.\n"
    "This codee prints the correced texte and a raport of all wrong wordt.\n"
    "Only CPU ussage is needded, usingg a smaler BERT model for speeed."
)


def get_text():
    choice = input("Use default text? (yes/no): ").strip().lower()
    if choice == "yes":
        print("\nUsing default text:\n", default_text)
        return default_text

    print("\nEnter your text (finish with empty line):")
    lines = []
    while True:
        line = input()
        if line == "":
            break
        lines.append(line)
    return "\n".join(lines)


# --------------------------------------------------------
# 7. Main
# --------------------------------------------------------
def main():
    print("Loading SymSpell…")
    sym_spell = load_symspell()

    print("Loading BERT model… (first run takes ~10 sec)")
    bert = load_bert()

    text = get_text()

    print("\nCorrecting text using SymSpell + BERT…")
    corrected, report = correct_text(sym_spell, bert, text)

    print("\n---------------------------------------")
    print("Corrected Text:")
    print("---------------------------------------")
    print(corrected)

    print("\n---------------------------------------")
    print("Correction Report:")
    print("---------------------------------------")
    for wrong, correct in report.items():
        print(f"{wrong} → {correct}")

    print(f"\nTotal corrected words: {len(report)}")


if __name__ == "__main__":
    main()

Loading SymSpell…
Loading BERT model… (first run takes ~10 sec)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu



Using default text:
 Thiss cod chekks speling in text using SymSpell and BERRT ranking.
It corects words by two stepps: dictionnary lookup and conttext moddel.
SymmSpell findd candidattes from a big dictonary faast.
Then a BERRT maskked language modele chooces the best suggetion in context.
The input can be a defaalt texxt or custtom lines typped by the usser.
In the defalt texxt we add mispelled wordss to testt the code.
This codee prints the correced texte and a raport of all wrong wordt.
Only CPU ussage is needded, usingg a smaler BERT model for speeed.

Correcting text using SymSpell + BERT…

---------------------------------------
Corrected Text:
---------------------------------------
This cod checks spelling in text using Spell and BERRY ranking.
It corrects words by two steps: dictionary lookup and context model.
SymmSpell find candidates from a big dictionary fast.
Then a BERRY masked language model choices the best suggestion in context.
The input can be a default text or cu