## Conjugated verbs

#### Set-up and initialization

In [1]:
import io
import os
import random
import re
import urllib.request
from itertools import combinations, permutations

import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Make sure the scratch directory exists; an earlier run may already have
# created it, which is fine.
try:
    os.mkdir('temp')
except FileExistsError:
    pass

# Load the real-word list when the file is present (one whitespace-stripped
# entry per line); otherwise start from an empty set.
realwords = set()
if os.path.isfile('./realwords.txt'):
    with open('./realwords.txt') as rw_file:
        realwords.update(entry.strip() for entry in rw_file)

In [3]:
# Gather every pair listed in the sibling dataset CSVs (one directory up) so
# the pairs written by this notebook can be checked against them.
banned_pairs = set()
for fn in ['cognates', 'cognates_maxlen12', 'homophones', 'homophones_maxlen12',
           'misspellings', 'misspellings_maxlen12', 'surnames', 'surnames_maxlen12',
           'forenames', 'forenames_maxlen12', 'typos',
           'fake_words', 'fake_words_maxlen12', 'random_words', 'random_words_maxlen12']:
    with open(f'../{fn}.csv') as fh:
        rows = fh.readlines()
    # The first row is skipped (presumably a header); every other row becomes
    # a tuple of its comma-separated fields.
    banned_pairs.update(tuple(row.strip().split(',')) for row in rows[1:])

# Grow the set with spelling variants of each pair. The steps run in order and
# each one sees everything added by the steps before it.
variants = [
    lambda a, b: (b, a),                                    # reversed order
    lambda a, b: (a.lower(), b.lower()),                    # lower-cased
    lambda a, b: (b.lower(), a.lower()),                    # lower-cased, reversed
    lambda a, b: (a[0].upper()+a[1:], b[0].upper()+b[1:]),  # first letter upper-cased
    lambda a, b: (b[0].upper()+b[1:], a[0].upper()+a[1:]),  # same, reversed
]
for make in variants:
    # The set comprehension is fully built before the in-place union, so the
    # set is never modified while it is being iterated.
    banned_pairs |= {make(p[0], p[1]) for p in banned_pairs}

#### Grab basis dataset

In [4]:
def clean(s):
    """Strip annotation marks from a scraped verb entry.

    Every '†' and '*' is removed, each '/' becomes a space, non-empty
    parenthesised spans are deleted (shortest match), and surrounding
    whitespace is trimmed from the result.
    """
    for marker in ('†', '*'):
        s = s.replace(marker, '')
    without_slashes = ' '.join(s.split('/'))
    return re.sub(r'\(.+?\)', '', without_slashes).strip()

# First source: Wiktionary's appendix of English irregular verbs.
url = "https://en.wiktionary.org/wiki/Appendix:English_irregular_verbs"

# Bound the wait so a stalled server cannot hang the notebook, and fail loudly
# on an HTTP error instead of trying to parse an error page.
res = requests.get(url, headers={'Cache-Control': 'no-cache'}, timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content,'lxml')
conj = []
# Skip the first row of the first table, then collect the italic elements
# found in the first data cell of every remaining row.
for row in soup.table.tbody.find_all('tr')[1:]:
    first_cell = row.find('td')
    if first_cell is None:
        # A row without any <td> (e.g. one made only of header cells) has
        # nothing to collect.
        continue
    conj += first_cell.find_all('i')

# Each italic element becomes a list of cleaned, whitespace-separated words.
conj = [clean(_.text).split() for _ in conj]

In [5]:
# Second source: the irregular-verb reference table on usingenglish.com.
url = "https://www.usingenglish.com/reference/irregular-verbs/"

# Bound the wait so a stalled server cannot hang the notebook, and fail loudly
# on an HTTP error instead of trying to parse an error page.
res = requests.get(url, headers={'Cache-Control': 'no-cache'}, timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')
# Hand pandas a file-like object: passing literal HTML text straight to
# read_html is deprecated in recent pandas releases.
df = pd.read_html(io.StringIO(str(table)))

# For every fully populated row of the first table: lower-case and join the
# cells, clean the text, drop commas, and split into words. The resulting word
# lists are appended to the entries already held in conj.
conj += list(
    df[0].dropna().apply(
        lambda x: clean(' '.join(str(_).lower() for _ in x)).replace(',', '').split(),
        axis=1,
    )
)

In [6]:
# Group the scraped word lists by their first word. Each key maps to the set
# of every word seen in a list that starts with that key (the key included).
conj_dict = {}
for words in conj:
    if not words:
        # An entry that cleaned down to nothing has no first word to key on;
        # indexing it would raise IndexError.
        continue
    conj_dict.setdefault(words[0], set()).update(words)

In [7]:
# Inflection list from the en-wl/wordlist repository. It is downloaded only
# when no copy is present under temp/.
url = 'https://raw.githubusercontent.com/en-wl/wordlist/master/agid/infl.txt'
if not os.path.isfile(os.path.join('temp', 'infl.txt')):
    # The download target is named after the last path segment of the URL.
    urllib.request.urlretrieve(url, os.path.join('temp', url.rsplit('/', 1)[-1]))

# Read the file into a list of whitespace-stripped lines.
with open(os.path.join('temp', 'infl.txt')) as fh:
    lines = [raw.strip() for raw in fh]

In [8]:
def clean(s):
    """Return token *s* with commas removed, or '' when it is rejected.

    A token is rejected when it is empty, starts with a digit, or contains
    any of the characters ``|:~<>!{}``.
    """
    # The emptiness check comes first so that s[0] is never evaluated on ''.
    if not s or s[0].isdigit():
        return ''
    if any(ch in s for ch in '|:~<>!{}'):
        return ''
    return s.replace(',', '')
    
    
# Reduce the raw inflection lines to lists of plain words.
new_lines = []
for line in lines:
    # Skip blank lines (indexing them would raise IndexError) and lines whose
    # first character is not a lower-case letter.
    if not line or line[0] == line[0].upper():
        continue
    # Skip lines containing '?', and keep only lines containing 'V'
    # (presumably the verb tag of the inflection list -- confirm against the
    # file format).
    if '?' in line or 'V' not in line:
        continue
    # Clean each whitespace-separated token once and drop the rejected ones.
    tokens = [tok for tok in map(clean, line.split()) if tok]
    # A usable entry needs a head word plus at least one other form.
    if len(tokens) > 1:
        new_lines.append(tokens)

In [9]:
# Seed the module-level RNG with a fixed value before shuffling new_lines in
# place, so the resulting order is reproducible for a given input list.
random.seed(377)
random.shuffle(new_lines)

In [10]:
# Merge the inflection entries into conj_dict. Head words that are already
# keys always receive their extra forms; head words that are new are added
# only while the budget n (500 to start with) is not used up.
n = 500
for line in new_lines:
    known_forms = conj_dict.get(line[0])
    if known_forms is not None:
        known_forms.update(line[1:])
    elif n > 0:
        conj_dict[line[0]] = set(line[1:])
        n -= 1

In [11]:
# Flatten conj_dict into (conjugate, infinitive) tuples, ordered by key and
# then by form. Empty strings and forms equal to their own key are left out.
conj_list = []
for infinitive in sorted(conj_dict):
    if not infinitive:
        continue
    conj_list.extend(
        (conjugate, infinitive)
        for conjugate in sorted(conj_dict[infinitive])
        if conjugate and conjugate != infinitive
    )

#### Randomly select from the conjugated pairs and write output

In [12]:
# Shuffle a copy, leaving conj_list itself in its original order.
conj = list(conj_list)

random.seed(227)
random.shuffle(conj)
wc = 0  # number of pairs written so far

# Write a header and then at most 2400 pairs that are not in banned_pairs.
with open('../conjugated.csv', 'w') as file:
    file.write('conjugated,infinitive\n')
    for pair in conj:
        if wc >= 2400:
            # Quota reached; nothing further would be written.
            break
        if pair in banned_pairs:
            continue
        file.write(','.join(pair)+'\n')
        wc += 1

In [13]:
# Keep only the pairs in which both words are shorter than 13 characters.
conj = [pair for pair in conj_list if len(pair[0]) < 13 and len(pair[1]) < 13]

random.seed(77271)
random.shuffle(conj)
wc = 0  # number of pairs written so far

# Write a header and then at most 2400 pairs that are not in banned_pairs.
with open('../conjugated_maxlen12.csv', 'w') as file:
    file.write('conjugated,infinitive\n')
    for pair in conj:
        if wc >= 2400:
            # Quota reached; nothing further would be written.
            break
        if pair in banned_pairs:
            continue
        file.write(','.join(pair)+'\n')
        wc += 1

#### Write out list of real words

In [14]:
# Fold every inflection entry into conj_dict, this time without any cap on
# the number of new head words.
for line in new_lines:
    conj_dict.setdefault(line[0], set()).update(line[1:])

# The real-word list is every key together with every form stored under it.
realwords = set(conj_dict)
realwords.update(*conj_dict.values())

# Write the words in sorted order, one per line.
realwords = sorted(realwords)
with open('./realwords_conj.txt', 'w') as rw_file:
    rw_file.writelines(word+'\n' for word in realwords)