## Misspellings

#### Set-up and initialization

In [1]:
import os
import random
import urllib.request

In [2]:
# Scratch directory for the downloaded corpora. exist_ok=True tolerates a
# directory that is already there, but still raises FileExistsError if
# 'temp' exists as a regular file, so that problem surfaces here rather
# than later when files are written into it.
os.makedirs('temp', exist_ok=True)

# Start from the word list written by a previous run, if one exists;
# otherwise start with an empty set.
if os.path.isfile('./realwords.txt'):
    with open('./realwords.txt') as rw_file:
        realwords = {line.strip() for line in rw_file}
    # A blank line in the file would otherwise be kept as the "word" ''.
    realwords.discard('')
else:
    realwords = set()

In [3]:
# Hand-picked pairs to exclude. Order within a pair does not matter here,
# because the reversed form of every pair is added at the end of this cell.
banned_pairs = {
    'calendar,calender',
    'payed,paid',
    'autor,author',
    'substancial,substantial',
    'wille,will',
    'gardner,gardener',
    'ocasional,occasional',
    'trafic,traffic',
}
banned_pairs = {tuple(pair.split(',')) for pair in banned_pairs}

# Extend the set with the rows of the sibling CSV files.
for fn in ['cognates', 'cognates_maxlen12', 'homophones', 'homophones_maxlen12']:
    with open(f'../{fn}.csv') as fh:
        next(fh, None)  # skip the first line (header); tolerate an empty file
        for line in fh:
            fields = tuple(line.strip().split(','))
            # A blank or comma-less line yields a 1-tuple, which would make
            # the reversal below fail with IndexError; skip such rows.
            if len(fields) >= 2:
                banned_pairs.add(fields)

# Ban each pair in both directions (first two fields swapped).
banned_pairs |= {(pair[1], pair[0]) for pair in banned_pairs}

#### Download the base datasets

In [4]:
# Source corpora. Each one is saved under temp/ using the last path
# component of its URL as the file name.
urls = [
    'https://raw.githubusercontent.com/chrislit/abydos/master/tests/corpora/misspellings.csv',
    'https://raw.githubusercontent.com/chrislit/abydos/master/tests/corpora/wikipediaCommonMisspellings.csv',
]
for url in urls:
    basename = url.split('/')[-1]
    destination = os.path.join('temp', basename)
    urllib.request.urlretrieve(url, destination)

In [5]:
# Gather the unique, non-blank data lines of both downloaded corpora.
misspellings = set()
for fn in ['wikipediaCommonMisspellings', 'misspellings']:
    with open(os.path.join('temp', f'{fn}.csv')) as file:
        next(file, None)  # skip the header row; tolerate an empty file
        for line in file:
            line = line.strip()
            # A blank line would turn into the 1-tuple ('',) below and break
            # the two-field unpacking, so it is dropped here.
            if line:
                misspellings.add(line)
misspellings = {tuple(line.split(',')) for line in misspellings}

# Record the second field of every pair as a real word. This happens before
# the banned pairs are removed, so their second fields are kept as well.
# set(...) keeps this cell re-runnable after the save cell has rebound
# `realwords` to a sorted list.
realwords = set(realwords)
realwords.update(word for _, word in misspellings)

misspellings -= banned_pairs

#### Randomly select from the misspellings and write output

In [6]:
# Seeding the RNG and sorting before shuffling makes the selection
# repeatable regardless of the set's iteration order.
random.seed(832)
misspellings1 = list(misspellings)
misspellings1.sort()
random.shuffle(misspellings1)

# Write a header followed by the first 2400 shuffled pairs.
with open('../misspellings.csv', 'w') as out_file:
    out_file.write('error,corrected\n')
    out_file.writelines(
        ','.join(pair) + '\n' for pair in misspellings1[:2400]
    )

In [7]:
random.seed(23487)

# Keep only the pairs in which both words are at most 12 characters long,
# sorted so that the seeded shuffle starts from a fixed order.
misspellings2 = sorted(
    (a, b) for a, b in misspellings if max(len(a), len(b)) < 13
)
random.shuffle(misspellings2)

# Write a header followed by the first 2400 shuffled pairs.
with open('../misspellings_maxlen12.csv', 'w') as out_file:
    out_file.write('error,corrected\n')
    out_file.writelines(
        ','.join(pair) + '\n' for pair in misspellings2[:2400]
    )

#### Save our realwords list

In [8]:
# Rebind `realwords` to a sorted list, then persist it one word per line.
realwords = list(realwords)
realwords.sort()
with open('./realwords.txt', 'w') as rw_file:
    rw_file.writelines(word + '\n' for word in realwords)