# Auto Correction

Import Libraries

In [3]:
%%capture
# Imports/pips/initializations go here
import string
import types
import numpy as np
import nltk
import re
import random
import pandas as pd
from gensim.parsing.porter import PorterStemmer
from nltk.corpus import gutenberg, stopwords
from nltk.probability import FreqDist
from nltk.corpus import words
import matplotlib.pyplot as plt
import requests

from gensim.models import Word2Vec
from sklearn import tree
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import KNeighborsClassifier

!python -m spacy download en_core_web_sm
import en_core_web_sm

nltk.download('stopwords')
nltk.download('gutenberg')
nltk.download('punkt')

# Load the Data Set

In [4]:
books=[]
urls=[]
urls.append("https://www.gutenberg.org/files/51660/51660-0.txt")
# urls.append("https://www.gutenberg.org/files/20023/20023-8.txt")
# urls.append("https://www.gutenberg.org/files/36641/36641-8.txt")
# urls.append("https://www.gutenberg.org/files/63444/63444-0.txt")
# urls.append("https://www.gutenberg.org/files/1265/1265.txt")

for url in urls:
  books.append(requests.get(url).text)

**Dataset example**

In [23]:
len(books)

1

> - **Seems like it's need some preprossing and cleaning**

**Simple Cleaning Function**

In [24]:
import string

def prepare_data(text):
  # to lower case
  text = text.lower()
  # Remove non-ascii characters
  text = text.encode("ascii", "ignore").decode()
  # Remove \n and \r
  text = re.sub(r'[\r*\n*]+', ' ', text)
  # Remove websites and hashtags
  text = re.sub(r'_|#|http\S+|www\S+', ' ', text)
  # Substitute numbers by keyword num
  text = re.sub(r'\d+', 'num', text)
  
  # Divide into sentences
  # sents = text_to_sent_list(text, nlp)
  # Apply tokenization
  regex = r"\w+'?\w+\s"
  words = re.findall(regex,text)
  words = [w[:-1] for w in words]
  words = [w for w in words if len(w)>1 or w in ["a", "i"]]
  unique = np.unique(words, return_counts=True)
  [[w, c] for w, c in zip(*unique) if  len(w) == 2]
  return words

words = [prepare_data(book) for book in books]
words = [y for x in words for y in x]


words = set(words)

**Generate Incorrect Words**

In [7]:
def apply_error(word, remove=0.3, add=0.3, swap_letter=True):
  def swap(s, i, j):
    return ''.join((s[:i], s[j], s[i+1:j], s[i], s[j+1:]))

  if swap_letter:
    i = random.randint(0,len(word)-1)
    j = random.randint(0,len(word)-1)
    word = swap(word, i, j)

  letters_to_remove = int(remove*len(word))
  letters_to_add = int(add*len(word))

  for _ in range(letters_to_remove):
    i = random.randint(1,len(word))
    word = word[:i-1] + word[i:]

  lower_upper_alphabet = string.ascii_letters
  for _ in range(letters_to_add):
    random_letter = random.choice(lower_upper_alphabet)
    i = random.randint(1,len(word))
    word = word[:i] + random_letter.lower() + word[i:]

  return word

In [8]:
apply_error("hospital", remove=0.3, add=0.2, swap_letter=False)

'hpitgal'

In [9]:
apply_error("hospital", remove=0.1, add=0.1)

'hospitooasptal'

Due to the Ram consumtion i would chose the first 100 word only

In [30]:
correct_words = list(set(prepare_data(books[0])))[:100]

100

Build the Evaluation Dataset

In [59]:
wrong_1 =[]

for correct in correct_words:
  # wrong_1.append(apply_error(correct, remove=0.1, add=0.1,swap_letter=True)) # Baseline
  wrong_1.append(apply_error(correct, remove=0.1, add=0.1)) # Don't swap

  # wrong_1.append(apply_error(correct, remove=0.2, add=0.1, swap_letter=True)) # Remove more 20%
  # wrong_1.append(apply_error(correct, remove=0.1, add=0.2, swap_letter=True)) # Add more 20%
  
  # wrong_1.append(apply_error(correct, remove=0.3, add=0.1,swap_letter=True)) # Remove more 30%
  # wrong_1.append(apply_error(correct, remove=0.1, add=0.3,swap_letter=True)) # Add more 30%

  # wrong_1.append(apply_error(correct, remove=0.5, add=0.1,swap_letter=True)) # Remove more 20%
  # wrong_1.append(apply_error(correct, remove=0.1, add=0.5,swap_letter=True)) # Add more 20%

  # wrong_1.append(apply_error(correct, remove=0.2, add=0.2,swap_letter=True)) # Romove and Add by 20%
  # wrong_1.append(apply_error(correct, remove=0.3, add=0.3,swap_letter=True)) # Romove and Add by 30%

In [60]:
df = pd.DataFrame({"correct":correct_words,"wrong_1":wrong_1})
df

Unnamed: 0,correct,wrong_1
0,adulations,aulattiionsf
1,arms,arrms
2,outline,outliunttlne
3,mankind,mnakind
4,laigle,laegli
...,...,...
95,falcon,falaclcon
96,burned,bunuewrned
97,reach,erach
98,stripes,striippes


# Define the AutoCorrection function

In [61]:
import spacy
from numpy import empty

def Autocorrection(correct_words,wrong_words, corrector='textblob'):
  close_matches =[]
  correct_counter = 0

  if corrector=='autocorrect':
    from autocorrect import Speller
    spell = Speller(lang='en')

  if corrector=='pyspellchecker':
    from spellchecker import SpellChecker
    spell = SpellChecker()

  if corrector=='contextualSpellCheck':
    import contextualSpellCheck
    nlp = spacy.load('en_core_web_sm')
    contextualSpellCheck.add_to_pipe(nlp)  

  if corrector=='textblob':
    from textblob import TextBlob

  for i in range(len(correct_words)):

    correct_w = correct_words[i]
    wrong = wrong_words[i]
    if corrector=='autocorrect':
      close_match = spell(wrong)

    if corrector=='pyspellchecker':
      close_match = spell.correction(wrong)
      
    if corrector=='contextualSpellCheck':
      close_match = nlp(wrong)

    if corrector=='textblob':
      close_match = TextBlob(wrong).correct()

    if close_match is not empty and close_match == correct_w:
      correct_counter+=1
    close_matches.append(close_match)

  return close_matches, correct_counter/len(correct_words)

# contextualSpellCheck

In [62]:
# !pip install contextualSpellCheck

In [63]:
corrected_contextualSpellCheck, contextualSpellCheck_acc = Autocorrection(df.iloc[:,0],df.iloc[:,1],corrector='contextualSpellCheck')

In [64]:
contextualSpellCheck_acc

0.0

# textblob

In [65]:
# !pip install textblob

In [66]:
corrected_textblob, textblob_acc = Autocorrection(df.iloc[:,0],df.iloc[:,1],corrector='textblob')

In [67]:
textblob_acc

0.36

# pyspellchecker

In [68]:
# !pip install pyspellchecker

In [69]:
corrected_pyspellchecker, pyspellchecker_acc = Autocorrection(df.iloc[:,0],df.iloc[:,1],corrector='pyspellchecker')

In [70]:
pyspellchecker_acc

0.39

# Speller

In [71]:
# ! pip install autocorrect

In [72]:
Correced_autocorrect, autocorrect_accuracy= Autocorrection(df.iloc[:,0],df.iloc[:,1],corrector='autocorrect')

In [73]:
autocorrect_accuracy

0.36

data frame

In [44]:
df['wrong1_performance'] = Correced_autocorrect

In [46]:
df

Unnamed: 0,correct,wrong_1,wrong_2,wrong_3,wrong_4,wrong_5,wrong_6,wrong_7,wrong_8,wrong_9,wrong_10,wrong1_performance
0,adulations,adulattionsf,adumlationuslatios,anuiatlon,aouliratids,adulyatios,dulamtpmionlsatieonso,laoinsns,aouliftlidnasa,adutionlkisons,dfulartidluatigofns,adulattionsf
1,arms,asmr,armrsms,mras,aacrms,rsm,armmssx,aram,mradsa,aamrmms,armmrfs,amr
2,outline,outliine,utlirtnline,ouilne,outliinnze,oxuoulne,nuvtliome,ouinn,outlttqtinne,oauttline,oulieuptlipe,outline
3,mankind,mankiind,mankaiankid,manind,mnakinmd,mndin,mxaknikeirnd,mank,maibankvinpd,maainnd,abnkinadtkidc,mankind
4,laigle,ialgle,laaigle,ligle,leiglaj,lagle,laiglep,lie,leikzglva,lligga,llaileee,eagle
...,...,...,...,...,...,...,...,...,...,...,...,...
95,falcon,faflalcon,aflcon,lacon,falcloncon,oalcn,aflcvon,flcn,fahlclocemojn,falino,nacorf,faflalcon
96,burned,burnrened,durneb,burunrnd,bbuurnzed,burnedd,burneccdd,ubr,bbufwureuned,durebn,bubrrhwne,burdened
97,reach,rehca,reacch,aech,aekrch,erch,roeacnchh,reah,rgbedaach,ceprh,reaeachxk,reach
98,stripes,srtipes,stttiripes,strisp,strstpei,strep,stritzrirpeps,rrie,sturijpseentrpeosx,ssripems,striaeisvee,stripes
