# Symspell Implementation

## Load Libraries and SymSpell

In [1]:
import re

import spacy
from nltk import word_tokenize
from symspellpy.symspellpy import SymSpell
# Shared spaCy pipeline; get_proper_nouns() reads each token's fine-grained tag from it.
# The model must be installed before this line runs (command in the trailing comment).
nlp = spacy.load("en_core_web_sm") # $ python -m spacy download en_core_web_sm

In [3]:
# Module-level SymSpell instance shared by the spell-checking helpers in this file.
# NOTE(review): both arguments are positional; the author's note says they mean
# "edit_distance = 3, prefix_length = 7" -- confirm against the constructor
# signature of the installed symspellpy version.
symspell = SymSpell(3, 7) # edit_distance = 3, prefix_length = 7

# load custom dictionary (increased frequency for words that often appear in documents relating to school shootings)
# term_index=0 / count_index=1: the term is read from the first column and its count from the second.
# NOTE(review): the path is absolute and machine-specific. When loading fails only a
# message is printed and execution continues with whatever the instance contains.
if not symspell.load_dictionary('/Users/elisa/OneDrive/Wesley_mock/spellcheck/frequency_dictionary_en_82_765.txt', term_index=0, count_index=1):
    print("Cannot find dictionary")

## Spell Checking Helper Functions

In [4]:
def find_misspelled(words, proper_nouns):
    """Return the tokens from *words* that are judged to be misspelled.

    A token is reported when all of the following hold:

    * its lowercased, ``spellclean``-ed form is neither empty nor blank;
    * its lowercased form is not listed in *proper_nouns*;
    * ``symspell.lookup`` returns no result for the cleaned form, or the
      first result's term differs from the cleaned form.

    Each distinct token is reported at most once, in order of first
    appearance, and in its original (uncleaned, original-case) spelling.

    Args:
        words: iterable of string tokens.
        proper_nouns: iterable of lowercased strings that are exempt from
            checking.

    Returns:
        list of the original tokens considered misspelled.
    """
    exempt = frozenset(proper_nouns)  # O(1) membership instead of a list scan
    already_reported = set()
    misspelled = []
    for word in words:
        lowered = word.lower()
        cleaned = spellclean(lowered)  # removes special characters
        if not cleaned or cleaned.isspace():
            continue  # nothing left to spell-check
        if lowered in exempt or word in already_reported:
            continue
        # Look the word up once and reuse the result for both checks.
        matches = symspell.lookup(cleaned, verbosity=0)
        if not matches or matches[0].term != cleaned:
            already_reported.add(word)
            misspelled.append(word)
    return misspelled

def spellclean(text):
    """Strip the parts of *text* that get in the way of spell checking.

    The substitutions run in the order listed below. Order matters: URLs and
    domain names are removed while their punctuation is still present, and
    the final step collapses every remaining run of non-word characters
    into a single space.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string. It may be empty, or contain leading, trailing
        or inner spaces where non-word characters used to be.
    """
    substitutions = (
        (r'\bhttps?:\/\/[^\s]+', ''),  # URLs with an http/https scheme
        (r'\bwww\.[^\s]+', ''),        # URLs starting with "www."
        (r'\w*\.com\b', ''),           # bare domain names
        (r'\w*\.org\b', ''),
        (r'\w*\.net\b', ''),
        (r"'s", ''),                   # possessive / contracted 's
        (r'_', ''),                    # underscores count as \w, drop them explicitly
        (r'\d+', ''),                  # digits
        (r'\W+', ' '),                 # any other non-word run becomes one space
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def get_proper_nouns(text):
    """Return the distinct proper nouns found in *text*, lowercased.

    The text is run through the module-level spaCy pipeline ``nlp`` and every
    token whose ``tag_`` is exactly ``'NNP'`` is kept. Tokens are lowercased
    for consistency with the lowercase comparisons done by the callers.

    Args:
        text: the string to analyse.

    Returns:
        list of lowercased token strings, without duplicates, in order of
        first appearance.
    """
    lowered_nnps = (
        str(token).lower() for token in nlp(text) if str(token.tag_) == 'NNP'
    )
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(lowered_nnps))

## The Spell Checker

In [5]:
def symspell_check(text, threshold = 0.02):
    """Find misspelled words in *text* and suggest a correction for each.

    Hyphens, slashes and plus signs are replaced by spaces before the text
    is tokenized. Proper nouns detected by ``get_proper_nouns`` are exempt
    from checking.

    Args:
        text: the string to check.
        threshold: minimum fraction of tokens that must be misspelled for
            suggestions to be produced.

    Returns:
        ``False`` (after printing an explanatory message) when no misspelled
        word is found or when the misspelled fraction is below *threshold*;
        otherwise a dict mapping each misspelled token, as it appears in the
        tokenized text, to the term of the first ``lookup_compound`` result.
    """
    # Treat these characters as word separators.
    text = text.replace('-', ' ').replace('/', ' ').replace('+', ' ')
    proper_nouns = get_proper_nouns(text)
    words = word_tokenize(text)
    misspelled = find_misspelled(words, proper_nouns)
    if not misspelled:
        print('No misspelled words have been detected.')
        return False

    # `words` cannot be empty here: every misspelled entry comes from it.
    misspelled_ratio = len(misspelled) / len(words)
    if misspelled_ratio < threshold:
        print('Spell checking not recommended. Your text only contains ' +
              str(misspelled_ratio * 100) +
              '% misspelled words, which is under the ' + str(threshold * 100) + '% theshold')
        return False

    to_return = {}
    for word in misspelled:
        clean_word = spellclean(word.lower())
        # find suggestion for misspelled word (edit distance 3)
        suggestions = symspell.lookup_compound(clean_word, 3)
        to_return[word] = suggestions[0].term
    return to_return

def sub_symspell(text, verbose = False, threshold = 0.02):
    """Return *text* with each misspelled word replaced by its suggestion.

    Suggestions come from ``symspell_check(text, threshold)``. When that call
    does not return a dict (nothing misspelled, or below the threshold) the
    text is returned unchanged.

    A misspelled word is replaced wherever it occurs as a whole word, i.e.
    not directly preceded or followed by an ASCII letter. This includes
    occurrences at the very start or end of the text and occurrences that
    are separated by a single character.

    Args:
        text: the string to correct.
        verbose: when true, print the misspelled fraction and every
            word/suggestion pair.
        threshold: passed through to ``symspell_check``.

    Returns:
        The corrected string.
    """
    suggestions = symspell_check(text, threshold)
    if not isinstance(suggestions, dict):
        return text

    if verbose:
        # NOTE(review): despite the label, the printed value is a fraction
        # (0-1), not a percentage; left unchanged to keep the output stable.
        print('Percent of words misspelled: ' + str(len(suggestions)/len(word_tokenize(text))))

    for wrong, right in suggestions.items():
        # Escape the token so regex metacharacters in it are matched literally,
        # and use lookarounds so the neighbouring characters are not consumed
        # (otherwise back-to-back occurrences would be skipped).
        pattern = r'(?<![A-Za-z])' + re.escape(wrong) + r'(?![A-Za-z])'
        # A callable replacement inserts the suggestion verbatim; a string
        # would be parsed as a template (backslashes, group references).
        text = re.sub(pattern, lambda _match, replacement=right: replacement, text)
        if verbose: print('Misspelled word: ' + wrong + ' | Suggestion: ' + right)
    return text