## Homophones

#### Set-up and initialization

In [1]:
import os
import random
import urllib.request
from itertools import combinations

import pandas as pd

In [2]:
def load_words(path):
    """Return the set of words stored one per line in *path*.

    Surrounding whitespace is stripped and blank lines are skipped, so an
    empty string never ends up in the vocabulary.  A missing file yields an
    empty set.
    """
    if not os.path.isfile(path):
        return set()
    with open(path) as word_file:
        stripped = (line.strip() for line in word_file)
        return {word for word in stripped if word}


# Scratch directory holding the downloaded source files.
os.makedirs('temp', exist_ok=True)

# Vocabulary carried over from earlier runs (empty on the first run).
realwords = load_words('./realwords.txt')

#### Grab basis dataset

In [3]:
# Disabled download step: when enabled it fetches each URL into ./temp, naming
# the saved file after the last path component of the URL.  The cells below
# read 'homophone_list.csv' and 'homophones.php' from ./temp, so presumably
# this was run once and then commented out -- TODO confirm.
#urls = [
#    'https://raw.githubusercontent.com/TSMMark/homophone/master/lib/assets/homophone_list.csv',
#    'http://members.peak.org/~jeremy/dictionaryclassic/chapters/homophones.php'
#]
#for url in urls:
#    urllib.request.urlretrieve(url, os.path.join('temp', url.split('/')[-1]))

In [4]:
# Load the CSV source.  Rows that share a ``relation_id`` are treated as
# spellings of the same homophone group.
df = pd.read_csv(os.path.join('temp', 'homophone_list.csv'))
# Select the two columns by name (the code below reads them by name anyway)
# instead of relying on them being the last two columns of the file.
df = df[['spelling', 'relation_id']]
df = df.to_dict('records')

# Group the spellings by relation id; a set drops repeated spellings.
extr = {}
for rec in df:
    extr.setdefault(rec['relation_id'], set()).add(rec['spelling'])

# Emit every unordered pair inside each group.  The group is sorted first
# because the iteration order of a set of strings changes between interpreter
# runs (hash randomisation); sorting makes every pair come out as
# (smaller, larger) on every run.
homophones = []
for group in extr.values():
    homophones.extend(combinations(sorted(group), 2))

In [5]:
# Read the saved web page; the homophone groups are taken from its <pre>
# block, one comma-separated group per line.
with open(os.path.join('temp', 'homophones.php')) as file:
    wordlist = file.read()

pre_start = wordlist.find('<pre>')
pre_end = wordlist.find('</pre>', pre_start)
if pre_start == -1 or pre_end == -1:
    # str.find returns -1 on a miss; slicing with that value would silently
    # produce a meaningless chunk of the page instead of failing.
    raise ValueError('no <pre>...</pre> block found in temp/homophones.php')

wordlist = wordlist[pre_start + len('<pre>'):pre_end].strip().split('\n')
wordlist = [line.split(',') for line in wordlist]

for words in wordlist:
    # Strip each token, then drop empty tokens (blank lines, trailing commas)
    # and repeated words so no ('', w) or (w, w) pair is generated.  Sorting
    # makes every pair come out as (smaller, larger).
    words = sorted({token.strip() for token in words} - {''})
    homophones += combinations(words, 2)

In [6]:
# Put every pair into alphabetical order *before* de-duplicating, so that
# (a, b) and (b, a) -- e.g. the same pair coming from both sources -- collapse
# into a single entry.  Sorting on the whole tuple (not just the first word)
# keeps the result independent of set iteration order.
homophones = sorted({tuple(sorted(pair)) for pair in homophones})

# Every word that takes part in a homophone pair is a real word.
for pair in homophones:
    realwords.update(pair)

#### Randomly select from the homophone pairs and write output

In [7]:
def reorder(pair):
    """Return *pair* reversed or unchanged, decided by a fair coin flip.

    The flip is drawn from the module-level ``random`` generator, so the
    outcome is reproducible after ``random.seed``.  Reversal uses slicing,
    so the result has the same sequence type as the input.
    """
    flip = random.choice((True, False))
    if flip:
        return pair[::-1]
    return pair

In [8]:
# Fixed seed so the shuffle and the per-pair flips are reproducible.
random.seed(2345)

# Alphabetise the two words inside each pair first, then sort the pairs.
# Doing it in this order leaves the list genuinely sorted, which gives the
# shuffle below a fixed starting point.
homophones = sorted(sorted(pair) for pair in homophones)

# Shuffle a copy; `homophones` itself is reused by the next cell.
homophones1 = list(homophones)
random.shuffle(homophones1)

# Write at most the first 2400 shuffled pairs, each in a random word order.
with open('../homophones.csv', 'w') as file:
    file.write('word1,word2\n')
    for pair in homophones1[:2400]:
        file.write(','.join(reorder(pair)) + '\n')

In [9]:
# Keep only the pairs in which both words are at most 12 characters long,
# as sorted tuples so the shuffle starts from a fixed order.
homophones2 = sorted(
    (a, b) for a, b in homophones if len(a) < 13 and len(b) < 13
)

# Separate fixed seed for this sample.
random.seed(437)
random.shuffle(homophones2)

# Write at most the first 2400 shuffled pairs, each in a random word order.
with open('../homophones_maxlen12.csv', 'w') as file:
    file.write('word1,word2\n')
    for pair in homophones2[:2400]:
        file.write(','.join(reorder(pair)) + '\n')

#### Save our realwords list

In [10]:
# Persist the vocabulary in sorted order, one word per line.
realwords = sorted(realwords)
with open('./realwords.txt', 'w') as rw_file:
    rw_file.writelines(entry + '\n' for entry in realwords)