#### Regenerate datasets

In [1]:
import os
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor

In [2]:
# Numbered notebooks in the current directory: names that start with two
# digits and end in '.ipynb', sorted by file name.
notebooks = sorted(
    entry
    for entry in os.listdir('.')
    if entry[:2].isdigit() and entry.endswith('.ipynb')
)

In [3]:
# One preprocessor instance (python3 kernel, timeout=600) is reused for every notebook.
ep = ExecutePreprocessor(timeout=600, kernel_name='python3')

# Execute each selected notebook and overwrite it with the executed version.
for notebook_filename in notebooks:
    print(f'executing notebook {notebook_filename}')
    # The notebook is written back as UTF-8 below, so read it the same way
    # rather than relying on the platform's default encoding.
    with open(notebook_filename, encoding='utf-8') as f:
        nb = nbformat.read(f, as_version=4)
    # Run outside the `with` so the input file is not held open while the
    # notebook executes; '.' is passed as the path in the resources metadata.
    ep.preprocess(nb, {'metadata': {'path': '.'}})
    with open(notebook_filename, 'w', encoding='utf-8') as f:
        nbformat.write(nb, f)

executing notebook 01 Regenerate cognates.ipynb
executing notebook 02 Regenerate homophones.ipynb
executing notebook 03 Regenerate misspellings.ipynb
executing notebook 04 Regenerate surnames variants.ipynb
executing notebook 05 Regenerate forename variants.ipynb
executing notebook 06 Regenerate typos.ipynb
executing notebook 07 Regenerate fake words.ipynb
executing notebook 08 Regenerate random English pairs.ipynb
executing notebook 09 Regenerate DNA sequences.ipynb
executing notebook 10 Regenerate protein sequences.ipynb
executing notebook 11 Regenerate inflected verbs.ipynb
executing notebook 12 Regenerate Porter2 stemmed.ipynb


#### uniqueness checking

In [4]:
# Load every dataset listed in `ds`, normalise each row (strip + lowercase),
# and collect the rows so duplicates can be detected within and across files.
comparables = []  # every normalised row from every file, duplicates retained
comp_dict = {}    # dataset name -> set of that file's distinct normalised rows

ds = ['cognates_maxlen12', 'homophones_maxlen12', 'misspellings_maxlen12',
      'surnames_maxlen12', 'forenames_maxlen12', 'typos',
      'fake_words_maxlen12', 'random_words_maxlen12',
      'hg38_maxlen12', 'proteins_len12',
      'conjugated_maxlen12', 'porter_maxlen12']

for fn in ds:
    # Explicit encoding so rows decode identically on every platform.
    # NOTE(review): assumes the CSVs are UTF-8 — confirm against the notebooks that write them.
    with open(f'../{fn}.csv', encoding='utf-8') as file:
        # Skip the first line (presumably the CSV header); the default keeps an
        # empty file from raising StopIteration.
        next(file, None)
        # Normalise each remaining line exactly once.
        rows = [line.strip().lower() for line in file]
    comparables.extend(rows)
    comp_dict[fn] = set(rows)
    # Number of data lines read from this file (before de-duplication).
    print(f'{len(rows)} {fn}')

# Cell output: count of distinct rows over all files. It equals the sum of the
# per-file counts printed above only when no row is repeated anywhere.
len(set(comparables))

2400 cognates_maxlen12
2400 homophones_maxlen12
2400 misspellings_maxlen12
2400 surnames_maxlen12
2400 forenames_maxlen12
2400 typos
2400 fake_words_maxlen12
2400 random_words_maxlen12
2400 hg38_maxlen12
2400 proteins_len12
2400 conjugated_maxlen12
2400 porter_maxlen12


28800

In [5]:
from itertools import combinations

# Report every pair of datasets that has at least one normalised row in common.
for first_name, second_name in combinations(ds, 2):
    shared_rows = comp_dict[first_name] & comp_dict[second_name]
    if not shared_rows:
        continue
    print(first_name, second_name, shared_rows)

# Distinct-row count per dataset.
for dataset_name in ds:
    distinct_rows = comp_dict[dataset_name]
    print(dataset_name, len(distinct_rows))

cognates_maxlen12 2400
homophones_maxlen12 2400
misspellings_maxlen12 2400
surnames_maxlen12 2400
forenames_maxlen12 2400
typos 2400
fake_words_maxlen12 2400
random_words_maxlen12 2400
hg38_maxlen12 2400
proteins_len12 2400
conjugated_maxlen12 2400
porter_maxlen12 2400


In [6]:
# Load every dataset listed in `ds`, normalise each row (strip + lowercase),
# and collect the rows so duplicates can be detected within and across files.
comparables = []  # every normalised row from every file, duplicates retained
comp_dict = {}    # dataset name -> set of that file's distinct normalised rows

ds = ['cognates', 'homophones', 'misspellings',
      'surnames', 'forenames', 'typos',
      'fake_words', 'random_words',
      'hg38', 'proteins',
      'conjugated', 'porter']

for fn in ds:
    # Explicit encoding so rows decode identically on every platform.
    # NOTE(review): assumes the CSVs are UTF-8 — confirm against the notebooks that write them.
    with open(f'../{fn}.csv', encoding='utf-8') as file:
        # Skip the first line (presumably the CSV header); the default keeps an
        # empty file from raising StopIteration.
        next(file, None)
        # Normalise each remaining line exactly once.
        rows = [line.strip().lower() for line in file]
    comparables.extend(rows)
    comp_dict[fn] = set(rows)
    # Number of data lines read from this file (before de-duplication).
    print(f'{len(rows)} {fn}')

# Cell output: count of distinct rows over all files. It equals the sum of the
# per-file counts printed above only when no row is repeated anywhere.
len(set(comparables))

2400 cognates
2400 homophones
2400 misspellings
2400 surnames
2400 forenames
2400 typos
2400 fake_words
2400 random_words
2400 hg38
2400 proteins
2400 conjugated
2400 porter


28800

In [7]:
from itertools import combinations

# Report every pair of datasets that has at least one normalised row in common.
for first_name, second_name in combinations(ds, 2):
    shared_rows = comp_dict[first_name] & comp_dict[second_name]
    if not shared_rows:
        continue
    print(first_name, second_name, shared_rows)

# Distinct-row count per dataset.
for dataset_name in ds:
    distinct_rows = comp_dict[dataset_name]
    print(dataset_name, len(distinct_rows))

cognates 2400
homophones 2400
misspellings 2400
surnames 2400
forenames 2400
typos 2400
fake_words 2400
random_words 2400
hg38 2400
proteins 2400
conjugated 2400
porter 2400
