### Set-up and initialization

In [1]:
import fictionary
import hunspell

import numpy as np
import random

In [2]:
# Load the list of real words (one per line, surrounding whitespace removed).
with open('realwords.txt', 'r') as f:
    realwords = {line.strip() for line in f}

# Lower-cased copy of the real words; generated fake words found in this set
# are discarded later on.
banned = {word.lower() for word in realwords}

In [3]:
# Spell-checker handle built from the system en_US dictionary (.dic) and
# affix (.aff) files; used below to obtain correction suggestions.
hobj = hunspell.HunSpell(
    '/usr/share/hunspell/en_US.dic',
    '/usr/share/hunspell/en_US.aff',
)

### Sampling and output

In [None]:
# Seed Python's global RNG (used by random.shuffle below).
# NOTE(review): whether fictionary also draws from this RNG is not visible
# here -- confirm if full reproducibility of the word list is required.
random.seed(3473)
samples = 2400

# Collect exactly 2*samples distinct fake words that are not real words.
# Each pass requests only the current shortfall and then drops banned words,
# so the loop repeats until the quota is met.
fakes = set()
while len(fakes) < 2 * samples:
    fakes.update(
        fictionary.words(2 * samples - len(fakes), min_length=4, max_length=12, dictionary='all')
    )
    fakes.difference_update(banned)

# Sort first so the shuffle starts from a defined order rather than from
# set iteration order.
fakes = sorted(fakes)
random.shuffle(fakes)

In [None]:
# Pair each fake word with hunspell's first suggestion, stopping once
# `samples` pairs have been collected.
#
# The word and its suggestion are stored together: a fake word for which
# hunspell returns no suggestion is simply skipped. Zipping the full `fakes`
# list against a separately built `corrected` list would shift every later
# row by one after such a skip and pair words with the wrong correction.
pairs = []
for fw in fakes:
    if len(pairs) >= samples:
        break  # quota reached; no need to query hunspell any further
    suggestions = hobj.suggest(fw)
    if suggestions:
        pairs.append((fw, suggestions[0]))

# Kept for any later cell that reads the list of corrections on its own.
corrected = [suggestion for _, suggestion in pairs]

# Write the aligned (error, corrected) rows as a two-column CSV.
with open('../fake_words.csv', 'w') as f:
    f.write('error,corrected\n')
    for w1, w2 in pairs:
        f.write(f"{w1},{w2}\n")

In [None]:
# Re-seed Python's global RNG for the second, independent sample
# (used by random.shuffle below).
# NOTE(review): whether fictionary also draws from this RNG is not visible
# here -- confirm if full reproducibility of the word list is required.
random.seed(98986)
samples = 2400

# Collect exactly 2*samples distinct fake words that are not real words.
# Each pass requests only the current shortfall and then drops banned words,
# so the loop repeats until the quota is met.
fakes = set()
while len(fakes) < 2 * samples:
    fakes.update(
        fictionary.words(2 * samples - len(fakes), min_length=4, max_length=12, dictionary='all')
    )
    fakes.difference_update(banned)

# Sort first so the shuffle starts from a defined order rather than from
# set iteration order.
fakes = sorted(fakes)
random.shuffle(fakes)

In [None]:
# Pair each fake word with the first hunspell suggestion that is at most
# 12 characters long, stopping once `samples` pairs have been collected.
#
# The word and its suggestion are stored together: a fake word with no
# suggestion (or only over-long ones) is simply skipped. Zipping the full
# `fakes` list against a separately built `corrected` list would shift every
# later row by one after such a skip and pair words with the wrong correction.
pairs = []
for fw in fakes:
    if len(pairs) >= samples:
        break  # quota reached; no need to query hunspell any further
    # First suggestion within the 12-character cap, or None if there is none.
    suggestion = next((s for s in hobj.suggest(fw) if len(s) <= 12), None)
    if suggestion is not None:
        pairs.append((fw, suggestion))

# Kept for any later cell that reads the list of corrections on its own.
corrected = [suggestion for _, suggestion in pairs]

# Write the aligned (error, corrected) rows as a two-column CSV.
with open('../fake_words_maxlen12.csv', 'w') as f:
    f.write('error,corrected\n')
    for w1, w2 in pairs:
        f.write(f"{w1},{w2}\n")