## Typos

#### Set-up and initialization

In [1]:
import urllib
import os
import random

import numpy as np

In [2]:
# `import urllib` alone does not load the `request` submodule; import it
# explicitly so `urllib.request.urlretrieve` resolves in a fresh interpreter.
import urllib.request

# Remote location of the typo corpus. The local copy is stored under `temp/`
# using the final path segment of the URL as its file name.
url = 'http://luululu.com/tweet/typo-corpus-r1.txt'
corpus_path = os.path.join('temp', url.split('/')[-1])

# Download only when no local copy exists yet.
if not os.path.isfile(corpus_path):
    # urlretrieve does not create missing directories, so make sure the
    # target directory is there before downloading.
    os.makedirs('temp', exist_ok=True)
    urllib.request.urlretrieve(url, corpus_path)

In [3]:
# Read word pairs from the first two tab-separated columns of the downloaded
# corpus, keeping only pairs in which both words are 4 to 12 characters long.
# NOTE(review): the output cell writes these under the header 'error,correct',
# so column 1 is presumably the typo and column 2 the intended word - confirm.
pairs = []
with open(os.path.join('temp', url.split('/')[-1])) as file:
    for line in file:
        fields = line.strip().split('\t')
        if len(fields) < 2:
            # Blank or tab-less line: there is no pair to unpack, so skip it
            # rather than letting the unpacking below raise ValueError.
            continue
        a, b = fields[:2]
        if (13 > len(a) > 3) and (13 > len(b) > 3):
            pairs.append((a, b))

In [4]:
# Build the set of word pairs that must not appear in the typo sample: every
# row of the CSV files listed below, plus swapped-order, all-lowercase and
# first-letter-uppercase variants of those rows.
banned_pairs = set()
dataset_names = [
    'cognates', 'cognates_maxlen12',
    'homophones', 'homophones_maxlen12',
    'misspellings', 'misspellings_maxlen12',
    'surnames', 'surnames_maxlen12',
    'forenames', 'forenames_maxlen12',
]
for dataset in dataset_names:
    with open(f'../{dataset}.csv') as fh:
        rows = fh.readlines()[1:]  # the first line of each file is skipped
    for row in rows:
        banned_pairs.add(tuple(row.strip().split(',')))

# Add each pair in swapped order so that membership does not depend on which
# word comes first.
banned_pairs |= {(pair[1], pair[0]) for pair in banned_pairs}

# For each re-casing (all lowercase, then first character upper-cased with the
# rest left untouched), add the re-cased pair in both orders. Each set is fully
# built before it is merged, so the set is never mutated while being iterated.
for recase in (str.lower, lambda word: word[0].upper() + word[1:]):
    banned_pairs |= {(recase(pair[0]), recase(pair[1])) for pair in banned_pairs}
    banned_pairs |= {(recase(pair[1]), recase(pair[0])) for pair in banned_pairs}

#### Sampling and output

In [5]:
# Number of rows to write to the output file.
samples = 2400

# De-duplicate and sort so that the seeded draw below always indexes into the
# same ordering of pairs.
pairs = sorted(set(pairs))

# Draw 600 more distinct indices than needed; presumably the surplus is
# headroom for candidates rejected by the banned-pair filter further down.
np.random.seed(1337)
drawn = np.random.choice(len(pairs), size=samples + 600, replace=False)

# Sort the drawn pairs before shuffling so the seeded shuffle starts from a
# fixed order.
typos = sorted({pairs[idx] for idx in drawn})
random.seed(3713)
random.shuffle(typos)

with open('../typos.csv', 'w') as f:
    f.write('error,correct\n')
    count = 0
    for w1, w2 in typos:
        if count >= samples:
            # Quota reached: nothing further would be written.
            break
        if (w1, w2) in banned_pairs or (w2, w1) in banned_pairs:
            continue
        # Record the accepted pair so that its reverse cannot be written later.
        banned_pairs.add((w1, w2))
        f.write(f"{w1},{w2}\n")
        count += 1