Init functions


In [5]:
import re
from collections import Counter

def words(text): return re.findall(r'\w+', text.lower())

def P(word, N=sum(WORDS.values())): 
    "Probability of `word`."
    return WORDS[word] / N

def correction(word): 
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

Loading book 


In [7]:
WORDS = Counter(words(open('../corpus/Selma/gosta.txt').read()))

In [8]:
words('Hej jag heter frans')

['hej', 'jag', 'heter', 'frans']

In [14]:
P('jag')

0.00904367755572934

In [17]:
correction('heej')

'hej'

In [19]:
candidates('heee')

{'bege',
 'ene',
 'hade',
 'hane',
 'hare',
 'heder',
 'hej',
 'hel',
 'hela',
 'helt',
 'hem',
 'hems',
 'henne',
 'herr',
 'herre',
 'hesa',
 'het',
 'heta',
 'heter',
 'hett',
 'hette',
 'huse',
 'lede',
 'leve',
 'nere',
 'vete'}

In [24]:
known(set(['hej', 'du', 'Frans']))

{'du', 'hej'}

In [26]:
edits1('hej')

{'aej',
 'ahej',
 'bej',
 'bhej',
 'cej',
 'chej',
 'dej',
 'dhej',
 'eej',
 'ehej',
 'ehj',
 'ej',
 'fej',
 'fhej',
 'gej',
 'ghej',
 'haej',
 'haj',
 'hbej',
 'hbj',
 'hcej',
 'hcj',
 'hdej',
 'hdj',
 'he',
 'hea',
 'heaj',
 'heb',
 'hebj',
 'hec',
 'hecj',
 'hed',
 'hedj',
 'hee',
 'heej',
 'hef',
 'hefj',
 'heg',
 'hegj',
 'heh',
 'hehj',
 'hei',
 'heij',
 'hej',
 'heja',
 'hejb',
 'hejc',
 'hejd',
 'heje',
 'hejf',
 'hejg',
 'hejh',
 'heji',
 'hejj',
 'hejk',
 'hejl',
 'hejm',
 'hejn',
 'hejo',
 'hejp',
 'hejq',
 'hejr',
 'hejs',
 'hejt',
 'heju',
 'hejv',
 'hejw',
 'hejx',
 'hejy',
 'hejz',
 'hek',
 'hekj',
 'hel',
 'helj',
 'hem',
 'hemj',
 'hen',
 'henj',
 'heo',
 'heoj',
 'hep',
 'hepj',
 'heq',
 'heqj',
 'her',
 'herj',
 'hes',
 'hesj',
 'het',
 'hetj',
 'heu',
 'heuj',
 'hev',
 'hevj',
 'hew',
 'hewj',
 'hex',
 'hexj',
 'hey',
 'heyj',
 'hez',
 'hezj',
 'hfej',
 'hfj',
 'hgej',
 'hgj',
 'hhej',
 'hhj',
 'hiej',
 'hij',
 'hj',
 'hje',
 'hjej',
 'hjj',
 'hkej',
 'hkj',
 'hlej'

In [27]:
edits2('hej')

<generator object edits2.<locals>.<genexpr> at 0x7f516c9695f0>