In [1]:
import re
import string
from collections import Counter
import numpy as np

In [2]:
def read_corpus(filename):
  with open(filename, "r") as file:
    lines = file.readlines()
    words = []
    for line in lines:
      words += re.findall(r'\w+', line.lower())

  return words

In [24]:
words = read_corpus("Indonesian_corpus.txt")
print(f"There are {len(words)} total words in the corpus")

There are 537892 total words in the corpus


In [25]:
vocabs = set(words)
print(f"There are {len(vocabs)} unique words in the vocabulary")

There are 22611 unique words in the vocabulary


In [4]:
def split(word):
  return [(word[:i], word[i:]) for i in range(len(word) + 1)]

In [28]:
word_counts = Counter(words)
print(word_counts["saya"])

221


In [30]:
total_word_count = float(sum(word_counts.values()))
word_probas = {word: word_counts[word] / total_word_count for word in word_counts.keys()}

In [32]:
print(word_probas["saya"])

0.00041086314724889014


In [33]:
def split(word):
  return [(word[:i], word[i:]) for i in range(len(word) + 1)]

In [34]:
print(split("sampah"))

[('', 'sampah'), ('s', 'ampah'), ('sa', 'mpah'), ('sam', 'pah'), ('samp', 'ah'), ('sampa', 'h'), ('sampah', '')]


In [6]:
def delete(word):
  return [l + r[1:] for l,r in split(word) if r]

In [35]:
print(delete("sampah"))

['ampah', 'smpah', 'sapah', 'samah', 'samph', 'sampa']


In [8]:
def swap(word):
  return [l + r[1] + r[0] + r[2:] for l, r in split(word) if len(r)>1]

In [36]:
print(swap("sampah"))

['asmpah', 'smapah', 'sapmah', 'samaph', 'sampha']


In [10]:
string.ascii_lowercase

'abcdefghijklmnopqrstuvwxyz'

In [11]:
def replace(word):
  letters = string.ascii_lowercase
  return [l + c + r[1:] for l, r in split(word) if r for c in letters]

In [37]:
print(replace("sampah"))

['aampah', 'bampah', 'campah', 'dampah', 'eampah', 'fampah', 'gampah', 'hampah', 'iampah', 'jampah', 'kampah', 'lampah', 'mampah', 'nampah', 'oampah', 'pampah', 'qampah', 'rampah', 'sampah', 'tampah', 'uampah', 'vampah', 'wampah', 'xampah', 'yampah', 'zampah', 'sampah', 'sbmpah', 'scmpah', 'sdmpah', 'sempah', 'sfmpah', 'sgmpah', 'shmpah', 'simpah', 'sjmpah', 'skmpah', 'slmpah', 'smmpah', 'snmpah', 'sompah', 'spmpah', 'sqmpah', 'srmpah', 'ssmpah', 'stmpah', 'sumpah', 'svmpah', 'swmpah', 'sxmpah', 'sympah', 'szmpah', 'saapah', 'sabpah', 'sacpah', 'sadpah', 'saepah', 'safpah', 'sagpah', 'sahpah', 'saipah', 'sajpah', 'sakpah', 'salpah', 'sampah', 'sanpah', 'saopah', 'sappah', 'saqpah', 'sarpah', 'saspah', 'satpah', 'saupah', 'savpah', 'sawpah', 'saxpah', 'saypah', 'sazpah', 'samaah', 'sambah', 'samcah', 'samdah', 'sameah', 'samfah', 'samgah', 'samhah', 'samiah', 'samjah', 'samkah', 'samlah', 'sammah', 'samnah', 'samoah', 'sampah', 'samqah', 'samrah', 'samsah', 'samtah', 'samuah', 'samvah',

In [13]:
def insert(word):
  letters = string.ascii_lowercase
  return [l + c + r for l, r in split(word) for c in letters]

In [38]:
print(insert("sampah"))

['asampah', 'bsampah', 'csampah', 'dsampah', 'esampah', 'fsampah', 'gsampah', 'hsampah', 'isampah', 'jsampah', 'ksampah', 'lsampah', 'msampah', 'nsampah', 'osampah', 'psampah', 'qsampah', 'rsampah', 'ssampah', 'tsampah', 'usampah', 'vsampah', 'wsampah', 'xsampah', 'ysampah', 'zsampah', 'saampah', 'sbampah', 'scampah', 'sdampah', 'seampah', 'sfampah', 'sgampah', 'shampah', 'siampah', 'sjampah', 'skampah', 'slampah', 'smampah', 'snampah', 'soampah', 'spampah', 'sqampah', 'srampah', 'ssampah', 'stampah', 'suampah', 'svampah', 'swampah', 'sxampah', 'syampah', 'szampah', 'saampah', 'sabmpah', 'sacmpah', 'sadmpah', 'saempah', 'safmpah', 'sagmpah', 'sahmpah', 'saimpah', 'sajmpah', 'sakmpah', 'salmpah', 'sammpah', 'sanmpah', 'saompah', 'sapmpah', 'saqmpah', 'sarmpah', 'sasmpah', 'satmpah', 'saumpah', 'savmpah', 'sawmpah', 'saxmpah', 'saympah', 'sazmpah', 'samapah', 'sambpah', 'samcpah', 'samdpah', 'samepah', 'samfpah', 'samgpah', 'samhpah', 'samipah', 'samjpah', 'samkpah', 'samlpah', 'sammpah'

In [15]:
def edit1(word):
  return set(delete(word) + swap(word) + replace(word) + insert(word))

In [39]:
print(edit1("sampah"))

{'lsampah', 'saepah', 'samsah', 'sabmpah', 'scmpah', 'smmpah', 'sdampah', 'samzpah', 'wampah', 'sahmpah', 'sampaw', 'sympah', 'satpah', 'sampkh', 'spampah', 'sempah', 'sampahs', 'sawpah', 'hsampah', 'samcah', 'sampaih', 'samjah', 'samparh', 'sampaa', 'samxah', 'sgampah', 'mampah', 'sapah', 'sampkah', 'tampah', 'ssampah', 'sampcah', 'sxmpah', 'sampazh', 'sammah', 'sampxh', 'sampch', 'sampath', 'sgmpah', 'csampah', 'sampbh', 'sampauh', 'samfpah', 'samppah', 'sampiah', 'sampahl', 'sampzh', 'satmpah', 'sampar', 'samipah', 'jampah', 'msampah', 'vampah', 'sampahd', 'samapah', 'sampad', 'samaph', 'salmpah', 'sampdah', 'spmpah', 'samjpah', 'samwah', 'sacpah', 'samlpah', 'sampyah', 'sbmpah', 'sampadh', 'zsampah', 'saapah', 'salpah', 'xsampah', 'sawmpah', 'saympah', 'samph', 'sampoh', 'sampai', 'rsampah', 'sampal', 'sampaq', 'skampah', 'sampeah', 'sompah', 'srmpah', 'sampoah', 'sampanh', 'sampahp', 'sampan', 'samplh', 'sampaeh', 'sampay', 'sampax', 'sampxah', 'sacmpah', 'sameah', 'sampbah', 'sar

In [18]:
def edit2(word):
  return set(e2 for e1 in edit1(word) for e2 in edit1(e1))

In [40]:
print(edit2("sampah"))

{'samptadh', 'samhpth', 'sabmpaph', 'ssampahw', 'sxmpahe', 'saimpaf', 'saitah', 'spamrah', 'sakmyah', 'snampiah', 'sacmphh', 'sqdmpah', 'sadpfah', 'saomdpah', 'saqpath', 'sagnah', 'suampvah', 'sakmpgh', 'samkuh', 'sagpaho', 'tampak', 'sambpqah', 'smamfpah', 'sacmpaq', 'zssampah', 'qsamvah', 'bgmpah', 'shampbh', 'sxamgah', 'samfpazh', 'osamzah', 'mamaah', 'bmsampah', 'sacpaho', 'xamph', 'samqpawh', 'fampph', 'satpah', 'samtvh', 'kampfh', 'sampshah', 'sambahv', 'fsamnah', 'sampty', 'sxmepah', 'saxqah', 'safzmpah', 'campan', 'oampkh', 'asmpahb', 'samkoah', 'waympah', 'gampaeh', 'sapmauh', 'sxlampah', 'saompagh', 'samparp', 'sawpfh', 'asaumpah', 'sopampah', 'syampha', 'sxmvpah', 'skmkah', 'samaparh', 'sammmpah', 'saypph', 'qsampahe', 'sampzazh', 'sampamha', 'saumph', 'sbampuah', 'fsempah', 'sampcaz', 'swmfpah', 'saqpav', 'sampkhy', 'samcpae', 'sapmahg', 'samsrh', 'rsamapah', 'evsampah', 'samspahd', 'sktpah', 'saumpazh', 'wampkah', 'slampiah', 'sarpach', 'fampap', 'sameakh', 'samjvah', 'waj

In [41]:
def correct_spelling(word, vocabulary, word_probabilities):
  if word in vocabulary:
    print(f"{word} is already correctly spelt")
    return 

  suggestions = edit1(word) or edit2(word) or [word]
  best_guesses = [w for w in suggestions if w in vocabulary]
  return [(w, word_probabilities[w]) for w in best_guesses]

In [54]:
word = "samaph"
corrections = correct_spelling(word, vocabs, word_probas)

if corrections:
  print(corrections)
  probs = np.array([c[1] for c in corrections])
  best_ix = np.argmax(probs)
  correct = corrections[best_ix][0]
  print(f"{correct} is suggested for {word}")

[('sampah', 3.718218527139277e-06)]
sampah is suggested for samaph


In [48]:
class SpellChecker(object):

  def __init__(self, corpus_file_path):
    with open(corpus_file_path, "r") as file:
      lines = file.readlines()
      words = []
      for line in lines:
        words += re.findall(r'\w+', line.lower())

    self.vocabs = set(words)
    self.word_counts = Counter(words)
    total_words = float(sum(self.word_counts.values()))
    self.word_probas = {word: self.word_counts[word] / total_words for word in self.vocabs}

  def _level_one_edits(self, word):
    letters = string.ascii_lowercase
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [l + r[1:] for l,r in splits if r]
    swaps = [l + r[1] + r[0] + r[2:] for l, r in splits if len(r)>1]
    replaces = [l + c + r[1:] for l, r in splits if r for c in letters]
    inserts = [l + c + r for l, r in splits for c in letters] 

    return set(deletes + swaps + replaces + inserts)

  def _level_two_edits(self, word):
    return set(e2 for e1 in self._level_one_edits(word) for e2 in self._level_one_edits(e1))

  def check(self, word):
    candidates = self._level_one_edits(word) or self._level_two_edits(word) or [word]
    valid_candidates = [w for w in candidates if w in self.vocabs]
    return sorted([(c, self.word_probas[c]) for c in valid_candidates], key=lambda tup: tup[1], reverse=True)

In [56]:
checker = SpellChecker("Indonesian_corpus.txt")

In [58]:
checker.check("sampah")

[('sampai', 0.0004350315676752954),
 ('sampah', 3.718218527139277e-06),
 ('sumpah', 3.718218527139277e-06)]