## Homophones

#### Set-up and initialization

In [1]:
import os
import random
import urllib.request
from itertools import combinations

import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Make sure the scratch directory used for downloaded files exists.
try:
    os.mkdir('temp')
except FileExistsError:
    pass

# Start from the previously saved word list when there is one, otherwise
# from an empty set.
realwords = set()
if os.path.isfile('./realwords.txt'):
    with open('./realwords.txt') as rw_file:
        realwords = {line.strip() for line in rw_file}

In [3]:
# Pairs that must never be emitted as homophones: one hand-picked pair plus
# every pair listed in the two cognates CSV files.
banned_pairs = {
    'nickel,nickle',
}
banned_pairs = {tuple(pair.split(',')) for pair in banned_pairs}
for fn in ['cognates', 'cognates_maxlen12']:
    with open(f'../{fn}.csv') as fh:
        next(fh, None)  # skip the first line (presumably the header row)
        # Ignore blank lines (e.g. a stray trailing newline): they would turn
        # into 1-tuples and make the mirroring step below raise IndexError.
        banned_pairs |= {tuple(row.strip().split(',')) for row in fh if row.strip()}
# Store both orientations so a pair is banned regardless of word order.
banned_pairs |= {(_[1], _[0]) for _ in banned_pairs}

In [4]:
# Hand-maintained homophone pairs, each a 2-tuple of spellings. They are
# appended to `homophones` in addition to the pairs taken from the
# downloaded lists.
additional_sets = {
    ("a while","awhile"),
    ("all ready","already"),
    ("all together","altogether"),
    ("androgenous","androgynous"),
    ("barer","bearer"),
    ("baring","bearing"),
    ("baroness","barrenness"),
    ("basks","Basques"),
    ("batsman","batsmen"),
    ("broom","brume"),
    ("cannonry","canonry"),
    ("cavalryman","cavalrymen"),
    ("cays","quays"),
    ("coaming","combing"),
    ("coarser","courser"),
    ("coif","quaff"),
    ("courier","currier"),
    ("crappie","crappy"),
    ("crew","krewe"),
    ("cued","queued"),
    ("cueing","queueing"),
    ("cuer","queuer"),
    ("deal","dele"),
    ("depravation","deprivation"),
    ("dies","dyes"),
    ("eaves","eves"),
    ("entrance","entrants"),
    ("envoi","envoy"),
    ("fainted","feinted"),
    ("fainting","feinting"),
    ("faints","feints"),
    ("fairs","fares"),
    ("fakers","fakirs"),
    ("fates","fetes"),
    ("faux","foe"),
    ("filets","fillets"),
    ("flay","fley"),
    ("fleas","flees"),
    ("fliers","flyers"),
    ("florescence","fluorescents"),
    ("floured","flowered"),
    ("flours","flowers"),
    ("fluorescence","fluorescents"),
    ("foreward","foreword"),
    ("gaol","jail"),
    ("gnatty","natty"),
    ("hippie","hippy"),
    ("ileum","ilium"),
    ("lade","layed"),
    ("laid","layed"),
    ("lamas","llamas"),
    ("lead","lede"),
    ("lies","lyes"),
    ("lies","lyse"),
    ("light","lite"),
    ("lyes","lyse"),
    ("may be","maybe"),
    ("murre","myrrh"),
    ("overbilled","overbuild"),
    ("paeon","peon"),
    ("peaking","peeking"),
    ("poling","polling"),
    ("psi","scye"),
    ("raining","reigning"),
    ("raining","reining"),
    ("raise","res"),
    ("raising","rasing"),
    ("raising","razing"),
    ("rase","res"),
    ("rasing","razing"),
    ("rays","res"),
    ("raze","res"),
    ("reave","rive"),
    ("receipted","reseated"),
    ("receipts","reseats"),
    ("reeve","rive"),
    ("reigning","reining"),
    ("righting","writing"),
    ("said","sed"),
    ("sailer","sailor"),
    ("sain","sane"),
    ("sain","seine"),
    ("scold","skald"),
    ("she","sidhe"),
    ("shea","sidhe"),
    ("shivaree","shivery"),
    ("soccer","socker"),
    ("sodder","solder"),
    ("spear","speer"),
    ("speel","spiel"),
    ("spier","spire"),
    ("steals","steels"),
    ("steals","steles"),
    ("steels","steles"),
    ("stolen","stollen"),
    ("talkie","talky"),
    ("taro","tarot"),
    ("teared","tiered"),
    ("tenuis","tenuous"),
    ("tore","torr"),
    ("vices","vises"),
    ("wailed","whaled"),
    ("wails","whales"),
    ("waiting","weighting"),
    ("watt","wot"),
    ("what","wot"),
    ("whiny","winy"),
    ("why'd","wide"),
}

#### Grab the base datasets

In [5]:
# Download both source lists again whenever either local copy is missing.
if not (os.path.isfile(os.path.join('temp', 'homophone_list.csv')) and
        os.path.isfile(os.path.join('temp', 'homophones.php'))):
    urls = [
        'https://raw.githubusercontent.com/TSMMark/homophone/master/lib/assets/homophone_list.csv',
        'http://members.peak.org/~jeremy/dictionaryclassic/chapters/homophones.php',
    ]
    for url in urls:
        # The local file name is the last path component of the URL.
        filename = url.rsplit('/', 1)[-1]
        urllib.request.urlretrieve(url, os.path.join('temp', filename))

In [6]:
# Load the CSV list and keep only its last two columns as a list of row dicts.
df = pd.read_csv(os.path.join('temp', 'homophone_list.csv')).iloc[:, -2:].to_dict('records')

# Group the spellings by the relation id they share.
extr = {}
for row in df:
    spelling, relation = row['spelling'], row['relation_id']
    extr.setdefault(relation, set()).add(spelling)

# Every unordered pair of spellings within a group is a homophone pair.
homophones = []
for group in extr.values():
    homophones.extend(combinations(group, 2))

In [7]:
with open(os.path.join('temp', 'homophones.php')) as file:
    wordlist = file.read()

# The word groups sit between <pre> and </pre>, one comma-separated group
# per line.
start = wordlist.find('<pre>') + 5
stop = wordlist.find('</pre>')
wordlist = [line.split(',') for line in wordlist[start:stop].strip().split('\n')]

# Add every pair of (whitespace-trimmed) words from each group.
for words in wordlist:
    homophones.extend(combinations([word.strip() for word in words], 2))

In [8]:
# Scrape a further homophone list: the text of each <li> element on the page
# is read as one comma-separated group of words.
url = 'http://www.singularis.ltd.uk/bifroest/misc/homophones-list.html'
# A timeout keeps the notebook from hanging forever on an unresponsive host.
res = requests.get(url, headers={'Cache-Control': 'no-cache'}, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as data.
res.raise_for_status()
soup = BeautifulSoup(res.content,'lxml')
lis = soup.find_all('li')

for words in lis:
    words = [_.strip() for _ in words.get_text().split(',')]
    # Every pair of words within one list item counts as a homophone pair.
    homophones += combinations(words, 2)

In [9]:
# Append the hand-maintained pairs in sorted order.
homophones.extend(sorted(additional_sets))

In [10]:
# Drop duplicates and banned pairs, discard pairs whose two words differ only
# by letter case, then order the words inside each pair and sort the result.
allowed_pairs = set(homophones) - banned_pairs
homophones = sorted({
    tuple(sorted(pair))
    for pair in allowed_pairs
    if pair[0].lower() != pair[1].lower()
})

In [11]:
# Record both words of every pair in the running word set.
for pair in homophones:
    realwords.update(pair)

#### Randomly select from the homophones and write output

In [12]:
def reorder(pair):
    """Return `pair` reversed or unchanged, picked at random with equal odds."""
    if random.choice((True, False)):
        return pair[::-1]
    return pair

In [13]:
# Seed the RNG so the random choices below repeat from run to run.
random.seed(2345)

# Give each pair a random orientation, shuffle, then append a separately
# shuffled copy in which every pair is flipped, so that each pair is present
# once in each orientation.
homophones1 = sorted([reorder(_) for _ in homophones])
random.shuffle(homophones1)
rev_homophones = [pair[::-1] for pair in homophones1]
random.shuffle(rev_homophones)
homophones1 += rev_homophones

# Write a header and at most the first 2400 pairs; reorder() randomly flips
# each pair once more as it is written.
with open('../homophones.csv', 'w') as file:
    file.write('word1,word2\n')
    for pair in homophones1[:2400]:
        file.write(','.join(reorder(pair))+'\n')

In [14]:
# Seed the RNG so the random choices below repeat from run to run.
random.seed(437)

# Give every pair a random orientation (reorder() runs on all pairs, before
# filtering), then keep only pairs whose two words are both at most 12
# characters long.
homophones2 = [
    (a, b)
    for a, b in sorted([reorder(_) for _ in homophones])
    if len(a) < 13 and len(b) < 13
]
random.shuffle(homophones2)
# Append a separately shuffled copy in which every pair is flipped.
rev_homophones = [pair[::-1] for pair in homophones2]
random.shuffle(rev_homophones)
homophones2 += rev_homophones

# Write a header and at most the first 2400 pairs; reorder() randomly flips
# each pair once more as it is written.
with open('../homophones_maxlen12.csv', 'w') as file:
    file.write('word1,word2\n')
    for pair in homophones2[:2400]:
        file.write(','.join(reorder(pair))+'\n')

#### Save our realwords list

In [15]:
# Persist the accumulated words in sorted order, one word per line.
realwords = sorted(realwords)
with open('./realwords.txt', 'w') as rw_file:
    rw_file.writelines(word + '\n' for word in realwords)