## Spelling correction

Notes for Rachel:

Here I give two options: 

1. A function that takes an essay and returns a corrected essay with no full stops, in lowercase and with contractions expanded.
2. A function that takes an essay and returns a corrected essay with full stops, in lowercase and with contractions expanded.

I give explanations below. Scroll all the way down for just the code for both options.

In [85]:


import contractions #library pertaining to contractions (things like "don't" and "you're")
import nltk
from nltk.tokenize import RegexpTokenizer #we'll use this to remove non-number and non-letter symbols


nltk.download('words') #nltk's collection of words
from nltk.corpus import words 

from nltk.metrics.distance import jaccard_distance #distance we'll use to find the nearest correct word
from nltk.util import ngrams


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\s1557452\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [86]:
essay = "This is an essay. This is a mistakke. Here's a contraction"
#this is our example essay

In [87]:


# The following code preprocesses an essay. For each essay we get a list of 
# words that are lowercase, with all contractions expanded, and that contain only 
# letters and digits (every other symbol is removed).

def preprocess_spelling(essay):
    """Turn an essay into a list of lowercase alphanumeric tokens.

    Contractions are expanded first (e.g. "here's" becomes "here is"), the
    text is lowercased, and every run of ASCII letters/digits becomes one
    token. All other characters act as separators and are discarded.

    Args:
        essay: The raw essay text.

    Returns:
        A list of word strings.
    """
    expanded = contractions.fix(essay)
    alnum_runs = RegexpTokenizer(r'[a-zA-Z0-9]+')  # keep letters and digits only
    return alnum_runs.tokenize(expanded.lower())



# The following code loads NLTK's list of English words and adds numbers to it, as I assume 
# we don't want to count the use of numbers as a spelling mistake.

# Vocabulary used for spell checking: the NLTK word list plus the numbers
# 0..999999 written as strings, all lowercased.
words_into_list = [*words.words(), *map(str, range(1000000))]  # corpus words followed by the numbers
words_lower = list(map(str.lower, words_into_list))  # lowercase, to match the lowercased essay tokens
word_set = set(words_lower)  # set, so membership tests are fast






In [88]:
# The following function takes an essay and returns the list of misspelled words.

def misspellings(essay):
    """Return the misspelled words of an essay.

    The essay is tokenised with ``preprocess_spelling``; a token counts as
    misspelled when it is not a member of ``word_set``.

    Args:
        essay: The raw essay text.

    Returns:
        A list of the unknown tokens, in order of appearance (repeats kept).
    """
    return [token for token in preprocess_spelling(essay) if token not in word_set]

In [89]:
misspellings(essay)

['mistakke']

This function returns a corrected essay without punctuation and in lowercase.

In [90]:
# this is a function that takes in an essay and returns the corrected essay, but without punctuation and in lowercase

def corrected(essay):
    """Return the essay spell-corrected, in lowercase and without punctuation.

    The essay is tokenised with ``preprocess_spelling``. Tokens found in
    ``word_set`` are kept as they are. Every other token is replaced by the
    known word that starts with the same character and whose set of
    character bigrams has the smallest Jaccard distance to the token's.
    Ties are broken alphabetically, so the result is the same on every run.

    Args:
        essay: The raw essay text.

    Returns:
        The corrected words joined by single spaces.
    """
    replacements = {}  # misspelling -> chosen correction, so repeats are looked up once
    corrected_words = []
    for word in preprocess_spelling(essay):
        if word not in word_set:
            if word not in replacements:
                # The misspelling's bigrams are the same for every candidate,
                # so build them once rather than once per dictionary word.
                word_bigrams = set(ngrams(word, 2))
                candidates = [
                    (jaccard_distance(word_bigrams, set(ngrams(w, 2))), w)
                    for w in word_set
                    if w.startswith(word[0])  # unlike w[0], safe for an empty entry
                ]
                # min() avoids sorting every candidate; comparing the whole
                # (distance, word) tuple makes tie-breaking independent of
                # set iteration order. Keep the token if nothing matches.
                replacements[word] = min(candidates)[1] if candidates else word
            word = replacements[word]
        corrected_words.append(word)
    return " ".join(corrected_words)

In [91]:

corrected(essay)

'this is an essay this is a mistake here is a contraction'

This function returns a corrected essay with punctuation and in lowercase.

In [92]:
#Here we preprocess the text so that we keep sentence structure.

def preprocess_sent(text):
    """Split text into lowercase sentences with contractions expanded.

    Sentences are delimited by full stops only. Fragments without a single
    ASCII letter or digit (empty strings, stray whitespace or punctuation)
    are dropped.

    Args:
        text: The raw essay text.

    Returns:
        A list of lowercase sentence strings without their full stops.
        Whitespace around each sentence is kept as it appears in the text.
    """
    import re  # imported here so the function does not depend on a top-of-file import

    text_no_contr = contractions.fix(text)  # e.g. "here's" -> "here is"
    # Keep a fragment only if it contains something a word could be made of;
    # the character class is the same one preprocess_spelling tokenises with.
    return [
        fragment.lower()
        for fragment in text_no_contr.split(".")
        if re.search(r"[a-zA-Z0-9]", fragment)
    ]

In [93]:
preprocess_sent(essay)

['this is an essay', ' this is a mistakke', ' here is a contraction']

In [94]:
#this function takes in an essay and returns a corrected version in lowercase with punctuation and expanded contractions.

def corrected2(essay):
    """Return the essay spell-corrected in lowercase, keeping full stops.

    The essay is split into sentences with ``preprocess_sent`` and each
    sentence is corrected with ``corrected``.

    Args:
        essay: The raw essay text.

    Returns:
        The corrected sentences joined by ". " and terminated by ".", or an
        empty string when the essay contains no words.
    """
    corrected_sent = [corrected(sentence) for sentence in preprocess_sent(essay)]
    # A fragment without words is corrected to "", which would otherwise show
    # up as a stray ". ." in the result (or a lone "." for an empty essay).
    corrected_sent = [sentence for sentence in corrected_sent if sentence]
    if not corrected_sent:
        return ""
    return ". ".join(corrected_sent) + "."

In [95]:
corrected2(essay)

'this is an essay. this is a mistake. here is a contraction.'

## Just code for no full-stops version:

In [96]:

import contractions #library pertaining to contractions (things like "don't" and "you're")
import nltk
from nltk.tokenize import RegexpTokenizer #we'll use this to remove non-number and non-letter symbols


nltk.download('words') #nltk's collection of words
from nltk.corpus import words 

from nltk.metrics.distance import jaccard_distance #distance we'll use to find the nearest correct word
from nltk.util import ngrams

# The following code preprocesses an essay. For each essay we get a list of 
# words that are lowercase, with all contractions expanded, and that contain only 
# letters and digits (every other symbol is removed).

def preprocess_spelling(essay):
    """Turn an essay into a list of lowercase alphanumeric tokens.

    Contractions are expanded first (e.g. "here's" becomes "here is"), the
    text is lowercased, and every run of ASCII letters/digits becomes one
    token. All other characters act as separators and are discarded.

    Args:
        essay: The raw essay text.

    Returns:
        A list of word strings.
    """
    expanded = contractions.fix(essay)
    alnum_runs = RegexpTokenizer(r'[a-zA-Z0-9]+')  # keep letters and digits only
    return alnum_runs.tokenize(expanded.lower())


# The following code loads NLTK's list of English words and adds numbers to it, as I assume 
# we don't want to count the use of numbers as a spelling mistake.

# Vocabulary used for spell checking: the NLTK word list plus the numbers
# 0..999999 written as strings, all lowercased.
words_into_list = [*words.words(), *map(str, range(1000000))]  # corpus words followed by the numbers
words_lower = list(map(str.lower, words_into_list))  # lowercase, to match the lowercased essay tokens
word_set = set(words_lower)  # set, so membership tests are fast


# this is a function that takes in an essay and returns the corrected essay, but without punctuation and in lowercase

def corrected(essay):
    """Return the essay spell-corrected, in lowercase and without punctuation.

    The essay is tokenised with ``preprocess_spelling``. Tokens found in
    ``word_set`` are kept as they are. Every other token is replaced by the
    known word that starts with the same character and whose set of
    character bigrams has the smallest Jaccard distance to the token's.
    Ties are broken alphabetically, so the result is the same on every run.

    Args:
        essay: The raw essay text.

    Returns:
        The corrected words joined by single spaces.
    """
    replacements = {}  # misspelling -> chosen correction, so repeats are looked up once
    corrected_words = []
    for word in preprocess_spelling(essay):
        if word not in word_set:
            if word not in replacements:
                # The misspelling's bigrams are the same for every candidate,
                # so build them once rather than once per dictionary word.
                word_bigrams = set(ngrams(word, 2))
                candidates = [
                    (jaccard_distance(word_bigrams, set(ngrams(w, 2))), w)
                    for w in word_set
                    if w.startswith(word[0])  # unlike w[0], safe for an empty entry
                ]
                # min() avoids sorting every candidate; comparing the whole
                # (distance, word) tuple makes tie-breaking independent of
                # set iteration order. Keep the token if nothing matches.
                replacements[word] = min(candidates)[1] if candidates else word
            word = replacements[word]
        corrected_words.append(word)
    return " ".join(corrected_words)


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\s1557452\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [97]:
corrected(essay)

'this is an essay this is a mistake here is a contraction'

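## Just code for version with full-stops:

Note: `corrected2` calls `corrected`, which is defined in the no-full-stops section above, so run that section first.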

In [98]:

import contractions #library pertaining to contractions (things like "don't" and "you're")
import nltk
from nltk.tokenize import RegexpTokenizer #we'll use this to remove non-number and non-letter symbols


nltk.download('words') #nltk's collection of words
from nltk.corpus import words 

from nltk.metrics.distance import jaccard_distance #distance we'll use to find the nearest correct word
from nltk.util import ngrams

# The following code preprocesses an essay. For each essay we get a list of 
# words that are lowercase, with all contractions expanded, and that contain only 
# letters and digits (every other symbol is removed).

def preprocess_spelling(essay):
    """Turn an essay into a list of lowercase alphanumeric tokens.

    Contractions are expanded first (e.g. "here's" becomes "here is"), the
    text is lowercased, and every run of ASCII letters/digits becomes one
    token. All other characters act as separators and are discarded.

    Args:
        essay: The raw essay text.

    Returns:
        A list of word strings.
    """
    expanded = contractions.fix(essay)
    alnum_runs = RegexpTokenizer(r'[a-zA-Z0-9]+')  # keep letters and digits only
    return alnum_runs.tokenize(expanded.lower())


# The following code loads NLTK's list of English words and adds numbers to it, as I assume 
# we don't want to count the use of numbers as a spelling mistake.

# Vocabulary used for spell checking: the NLTK word list plus the numbers
# 0..999999 written as strings, all lowercased.
words_into_list = [*words.words(), *map(str, range(1000000))]  # corpus words followed by the numbers
words_lower = list(map(str.lower, words_into_list))  # lowercase, to match the lowercased essay tokens
word_set = set(words_lower)  # set, so membership tests are fast

#Here we preprocess the text so that we keep sentence structure.

def preprocess_sent(text):
    """Split text into lowercase sentences with contractions expanded.

    Sentences are delimited by full stops only. Fragments without a single
    ASCII letter or digit (empty strings, stray whitespace or punctuation)
    are dropped.

    Args:
        text: The raw essay text.

    Returns:
        A list of lowercase sentence strings without their full stops.
        Whitespace around each sentence is kept as it appears in the text.
    """
    import re  # imported here so the function does not depend on a top-of-file import

    text_no_contr = contractions.fix(text)  # e.g. "here's" -> "here is"
    # Keep a fragment only if it contains something a word could be made of;
    # the character class is the same one preprocess_spelling tokenises with.
    return [
        fragment.lower()
        for fragment in text_no_contr.split(".")
        if re.search(r"[a-zA-Z0-9]", fragment)
    ]

#this function takes in an essay and returns a corrected version in lowercase with punctuation and expanded contractions.

def corrected2(essay):
    """Return the essay spell-corrected in lowercase, keeping full stops.

    The essay is split into sentences with ``preprocess_sent`` and each
    sentence is corrected with ``corrected``.

    Args:
        essay: The raw essay text.

    Returns:
        The corrected sentences joined by ". " and terminated by ".", or an
        empty string when the essay contains no words.
    """
    corrected_sent = [corrected(sentence) for sentence in preprocess_sent(essay)]
    # A fragment without words is corrected to "", which would otherwise show
    # up as a stray ". ." in the result (or a lone "." for an empty essay).
    corrected_sent = [sentence for sentence in corrected_sent if sentence]
    if not corrected_sent:
        return ""
    return ". ".join(corrected_sent) + "."



[nltk_data] Downloading package words to
[nltk_data]     C:\Users\s1557452\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [99]:
corrected2(essay)

'this is an essay. this is a mistake. here is a contraction.'