## Cognates

#### Set-up and initialization

In [1]:
import os
import random
import urllib.request
from itertools import combinations

import re
import pandas as pd
import requests

from bs4 import BeautifulSoup
from tabula import read_pdf

In [2]:
# Make sure the scratch directory exists; an already-existing entry is fine.
try:
    os.mkdir('temp')
except FileExistsError:
    pass

# Start from the previously saved word list when there is one, otherwise
# from an empty set. Each line is stripped of surrounding whitespace.
realwords = set()
if os.path.isfile('./realwords.txt'):
    with open('./realwords.txt') as rw_file:
        realwords.update(entry.strip() for entry in rw_file)

### German
#### Gather German cognates

In [3]:
# Wiktionary appendix listing German words next to their English cognates.
url = "https://en.wiktionary.org/wiki/Appendix:List_of_German_cognates_with_English#Borrowings_into_Old_German_and_Old_English"

res = requests.get(url, headers={'Cache-Control': 'no-cache'})
# Fail loudly on an HTTP error instead of parsing an error page for tables.
res.raise_for_status()
soup = BeautifulSoup(res.content, 'lxml')
# Every table element on the page; their combined string form is handed to
# pandas, which returns a list of DataFrames (iterated by the next cell).
table = soup.find_all('table')
df = pd.read_html(str(table))

In [4]:
# Collect the first two cells of every row of every scraped table as a
# (german, english) pair.
de_cognates = []
for frame in df:
    # Row 0 of each table is skipped.
    # NOTE(review): presumably a header row - confirm against the page.
    for n in range(1, frame.shape[0]):
        # Purely positional access: ``row[0]`` on a label-indexed Series
        # depends on the deprecated integer-key fallback in recent pandas.
        de_cognates.append((frame.iloc[n, 0], frame.iloc[n, 1]))

def clean_de_cognates(cognates):
    """Normalise raw (german, english) pairs into a sorted, duplicate-free list.

    The pairs go through four passes:

    1. When both sides are ';'-separated lists of equal length they are
       paired element by element; lists of unequal length are printed and
       kept unsplit.
    2. '/'-separated alternatives on either side are expanded into every
       german/english combination.
    3. When a parenthesised part opens a term or directly follows letters,
       a variant with that part deleted is added next to the original.
    4. Every term is scrubbed: text from the first ';' or ',' onwards is
       removed, '[...]' segments are removed, everything from ' (' or ' ['
       onwards is removed, '*' and leftover parentheses are deleted, and
       surrounding whitespace is trimmed. Pairs in which either side is
       left without an ASCII letter are discarded.

    Args:
        cognates: iterable of (german, english) string 2-tuples.

    Returns:
        Sorted list of cleaned (german, english) tuples.
    """
    attached_parens = re.compile(r'(^|[a-zA-Z]+)\(.+?\)')
    any_parens = re.compile(r'\(.+?\)')
    ascii_letter = re.compile('[A-Za-z]')

    def scrub(term):
        # Same substitutions, in the same order, for either language.
        for pattern in (';.*', ',.*', r'\[.+?\]', r' \(.+', r' \[.+'):
            term = re.sub(pattern, '', term)
        for junk in '*()':
            term = term.replace(junk, '')
        return term.strip()

    # Pass 1: pair up parallel ';'-separated lists. Iterating in sorted
    # order keeps the diagnostic printout deterministic.
    paired = set()
    for german, english in sorted(set(cognates)):
        if ';' not in german or ';' not in english:
            paired.add((german, english))
            continue
        german_parts = [piece.strip() for piece in german.split(';')]
        english_parts = [piece.strip() for piece in english.split(';')]
        if len(german_parts) != len(english_parts):
            print(german_parts, english_parts)
            paired.add((german, english))
        else:
            paired.update(zip(german_parts, english_parts))

    # Pass 2: expand '/' alternatives into all combinations.
    alternatives = set()
    for german, english in paired:
        if '/' in german or '/' in english:
            alternatives.update(
                (g.strip(), e.strip())
                for g in german.split('/')
                for e in english.split('/')
            )
        else:
            alternatives.add((german, english))

    # Pass 3: add variants with attached parenthesised parts deleted.
    variants = set()
    for german, english in alternatives:
        if attached_parens.search(german) or attached_parens.search(english):
            german_forms = (german, any_parens.sub('', german).strip())
            english_forms = (english, any_parens.sub('', english).strip())
            variants.update((g, e) for g in german_forms for e in english_forms)
        else:
            variants.add((german, english))

    # Pass 4: scrub both sides and keep only pairs that still contain letters.
    cleaned = set()
    for german, english in variants:
        german, english = scrub(german), scrub(english)
        if ascii_letter.search(german) and ascii_letter.search(english):
            cleaned.add((german, english))
    return sorted(cleaned)

# Replace the raw scraped pairs with their cleaned, sorted, de-duplicated form.
de_cognates = clean_de_cognates(de_cognates)

#### Save our German cognates & realwords list

In [5]:
# Write the German pairs to a scratch CSV: header line first, then one
# "german,english" line per pair.
with open('temp/de_cognates.csv', 'w') as file:
    file.write('german,english\n')
    file.writelines(f'{de},{en}\n' for de, en in de_cognates)

In [6]:
# Merge the English side of the German pairs into the known-words list.
# ``set(realwords)`` matches the later French/Spanish cells and keeps this
# cell re-runnable: after its first run ``realwords`` is a sorted list, and
# ``list | set`` raises TypeError.
realwords = sorted(set(realwords) | {pair[1] for pair in de_cognates})
with open('./realwords.txt', 'w') as rw_file:
    # ``realwords`` is already sorted; write it one word per line.
    for word in realwords:
        rw_file.write(word+'\n')

### French
#### Gather French cognates

In [7]:
# Fetch the French cognate glossary PDF into temp/, stored under the last
# path segment of its URL.
url = 'https://steinhardt.nyu.edu/scmsAdmin/media/users/xr1/glossaries/ELA/GlossaryCognatesFrenchUpdated5-5-2014.pdf'
urllib.request.urlretrieve(url, os.path.join('temp', url.rsplit('/', 1)[-1]))

('temp/GlossaryCognatesFrenchUpdated5-5-2014.pdf',
 <http.client.HTTPMessage at 0x7fcfd37e89e8>)

In [8]:
# Manual corrections from Google Translate, based on the symmetric difference to the sets of cognates.
# The cells that parse the glossary apply these sets to each extracted pair ``cog``:
#   drop_french  - the pair is discarded when cog[0] is listed here
#   drop_english - the pair is discarded when cog[1] is listed here
#   swap_french  - the pair is reversed when cog[0] is listed here
drop_french = {'vote', 'volley-ball', 'trumpet', 'tourist', 'splendid', 'reméde', 'problem', 'plant',
               'opèrer', 'objèt', 'music', 'magnetic', 'magician', 'lígne', 'lens', 'incrédíble', 'general',
               'gas', 'fantstique', 'egoïste', 'electricity', 'comentaire', 'brilliant-te', 'authorize',
               'astronomer', 'artístique', 'aggrégat', 'Idée', 'Different-e', 'special-e', 'medaille',
               'inmédiatement',
}
drop_english = {'statistics', 'effective', 'Study'}
swap_french = {'prudent', 'study', 'honor', 'electric'}

In [9]:
# Extract pages 4-25 of the downloaded glossary without treating any row as
# a header, and replace missing values with ''.
df = read_pdf(
    os.path.join('temp', url.split('/')[-1]),
    pages='4-25',
    pandas_options={'header': None},
).fillna('')

In [10]:
# Flatten every extracted table row into a single space-joined string.
# ``DataFrame.get_values()`` no longer exists in pandas >= 1.0; ``.values``
# exposes the same rows on old and new versions alike.
lines = [' '.join(str(cell) for cell in row).strip() for row in df.values]
fr_cognates = []
cog = []
for line in lines:
    # Skip rows that contain the column titles.
    if 'English' in line or 'French' in line:
        continue
    line = line.split(' ')
    # A two-word entry was completed by the previous row. Anything else is
    # carried over, so the words of this row are appended to it.
    if len(cog) == 2:
        cog = []
    for word in line:
        if word in {'', '(to)', '(se)'}:
            continue
        # Remove digits and any '(to)' / '(se)' marker glued to the word.
        for part in '0123456789':
            word = word.replace(part, '')
        for part in ['(to)', '(se)']:
            word = word.replace(part, '')
        if word:
            cog.append(word)
    if cog:
        # Three words: the first two are fused (no separator) into one term.
        if len(cog) == 3:
            cog = [cog[0]+cog[1], cog[2]]
        if len(cog) != 2:
            # Not a pair (yet): report it and keep accumulating.
            # NOTE(review): an entry longer than three words is never reset.
            print(cog)
        else:
            # Reverse the pair so that cog[0] is the side compared against
            # drop_french and cog[1] the side compared against drop_english.
            # NOTE(review): presumably these pages list English first - confirm.
            cog = cog[::-1]
            if cog[0] not in drop_french and cog[1] not in drop_english:
                if cog[0] in swap_french:
                    cog = cog[::-1]
                fr_cognates.append(tuple(cog))

['participate']
['participate']


In [11]:
# Extract pages 27-48 of the downloaded glossary without treating any row as
# a header, and replace missing values with ''.
df = read_pdf(
    os.path.join('temp', url.split('/')[-1]),
    pages='27-48',
    pandas_options={'header': None},
).fillna('')

In [12]:
# Flatten every extracted table row into a single space-joined string.
# ``DataFrame.get_values()`` no longer exists in pandas >= 1.0; ``.values``
# exposes the same rows on old and new versions alike.
lines = [' '.join(str(cell) for cell in row).strip() for row in df.values]
fr_cognates2 = []
cog = []
for line in lines:
    # Skip rows that contain the column titles.
    if 'English' in line or 'French' in line:
        continue
    line = line.split(' ')
    # A two-word entry was completed by the previous row. Anything else is
    # carried over, so the words of this row are appended to it.
    if len(cog) == 2:
        cog = []
    for word in line:
        if word in {'', '(to)', '(se)'}:
            continue
        # Remove digits and any '(to)' / '(se)' marker glued to the word.
        for part in '0123456789':
            word = word.replace(part, '')
        for part in ['(to)', '(se)']:
            word = word.replace(part, '')
        if word:
            cog.append(word)
    if cog:
        # Three words: the first two are fused (no separator) into one term.
        if len(cog) == 3:
            cog = [cog[0]+cog[1], cog[2]]
        if len(cog) != 2:
            # Not a pair (yet): report it and keep accumulating.
            # NOTE(review): an entry longer than three words is never reset.
            print(cog)
        else:
            # Unlike the pages 4-25 pass, the pair is used in extracted order:
            # cog[0] is compared against drop_french, cog[1] against drop_english.
            if cog[0] not in drop_french and cog[1] not in drop_english:
                if cog[0] in swap_french:
                    cog = cog[::-1]
                fr_cognates2.append(tuple(cog))

['banjo']
['banjo']
['pionnier']
['pionnier']


In [13]:
# Combine both glossary passes into one sorted list without duplicates.
fr_cognates = sorted({*fr_cognates, *fr_cognates2})

In [14]:
# A French entry containing '-' yields two pairs: the part before the first
# hyphen, and the whole entry with its hyphens removed. Entries without a
# hyphen pass through unchanged.
fr_cognates2 = []
for fr, en in fr_cognates:
    stem, *endings = fr.split('-')
    if not endings:
        fr_cognates2.append((fr, en))
    else:
        fr_cognates2.extend([(stem, en), (stem + ''.join(endings), en)])
fr_cognates = sorted(set(fr_cognates2))

In [15]:
# Additional hand-entered pairs, one per line, written as "English,French".
more_cognates = """Crown,couronne
Custom,coutume
Assizes,assises
Franchise,franchise
Joust,joute
Marriage,mariage
Parliament,parlement
Heir,héritier
Chef,chef
chief,chef
Caterer,approvisionneur
Pay,payer
Ticket,ticket
Purchase,acheter
Rental,loyer
Debt,dette
Affair,affaire
Court,cour
Aunt,tante
Chamber,chambre
Chair,chaise
Cushion,coussin
Cabbage,choux
Bacon,bacon
Cauldron,chaudron
Mustard,moutarde
Mutton,mouton
Beef,bœuf
Pork,porc
Poultry,poulet
Claret,clairet
Mince,émincer
Stew,ragoût
Veal,veau
Banquet,banquet
Carrot,carotte
Aperitif,apéritif
Pony,poney
toilet,Toilette
Causeway,chaussée
Kennel,chenil
Solace,consolation
Square,carré
Change,changer
Chapel,chapelle
Choice,choix
Mischief,méchanceté
Achieve,achever"""

In [16]:
# Each line of ``more_cognates`` reads "English,French", while ``fr_cognates``
# holds (french, english) tuples. Unpack in the order the text is written so
# the appended tuples match the rest of the list; the previous ``fr, en``
# unpacking stored these pairs the wrong way round.
for line in more_cognates.split('\n'):
    en, fr = line.lower().split(',')
    fr_cognates.append((fr, en))
fr_cognates = sorted(set(fr_cognates))
len(fr_cognates)

485

#### Save our French cognates & realwords list

In [17]:
# Write the French pairs to a scratch CSV: header line first, then one
# "french,english" line per pair.
with open('temp/fr_cognates.csv', 'w') as file:
    file.write('french,english\n')
    file.writelines(f'{fr},{en}\n' for fr, en in fr_cognates)

In [18]:
# Add the second element of every French pair to the known-words list, then
# rewrite the file in sorted order, one word per line.
realwords = sorted({*realwords, *(pair[1] for pair in fr_cognates)})
with open('./realwords.txt', 'w') as rw_file:
    rw_file.writelines(word + '\n' for word in realwords)

### Spanish
#### Gather Spanish cognates

In [19]:
# Page with tables of Spanish/English cognates; requested with a
# browser-style User-Agent header.
url = 'https://www.realfastspanish.com/vocabulary/spanish-cognates'
res = requests.get(url, headers={'Cache-Control': 'no-cache',
                                'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'})
# Fail loudly on an HTTP error instead of parsing an error page for tables.
res.raise_for_status()
soup = BeautifulSoup(res.content, 'lxml')
# Every table element on the page; their combined string form is handed to
# pandas, which returns a list of DataFrames (iterated by the next cell).
table = soup.find_all('table')
df = pd.read_html(str(table))

In [20]:
# Collect the lower-cased first two cells of every row of every scraped table.
# NOTE(review): the PDF-parsing cell further down starts with
# ``es_cognates = []``, so these pairs do not reach the final list - confirm
# whether that is intended.
es_cognates = []
for frame in df:
    # Row 0 of each table is skipped.
    # NOTE(review): presumably a header row - confirm against the page.
    for n in range(1, frame.shape[0]):
        # Purely positional access: ``row[0]`` on a label-indexed Series
        # depends on the deprecated integer-key fallback in recent pandas.
        es_cognates.append((frame.iloc[n, 0].lower(), frame.iloc[n, 1].lower()))

In [21]:
# Fetch the cognates.org PDF into temp/, stored under the last path segment
# of its URL.
url = 'http://www.cognates.org/pdf/mfcogn.pdf'
urllib.request.urlretrieve(url, os.path.join('temp', url.rsplit('/', 1)[-1]))

('temp/mfcogn.pdf', <http.client.HTTPMessage at 0x7fcfd34c63c8>)

In [22]:
# Parse pages 3-66 of the downloaded PDF with no header row, requesting
# multiple tables so that ``frames`` can be indexed table by table.
frames = read_pdf(
    os.path.join('temp', url.split('/')[-1]),
    multiple_tables=True,
    pages='3-66',
    pandas_options={'header': None},
)

In [23]:
# NOTE(review): this reset discards any pairs collected into ``es_cognates``
# by earlier cells - confirm that is intended.
es_cognates = []

# The first table is handled separately: its first row is dropped and
# columns 1 and 2 are merged into a single column.
df = frames[0].fillna('').iloc[1:]
df[1] = df[1]+' '+df[2]
df.drop(2, axis=1, inplace=True)

# Part-of-speech markers, as regex fragments. Raw strings keep ``\.`` a
# literal-dot escape without tripping Python's invalid-escape warning.
pos = [r'prefijo\.', r'prefix\.', r'intj\.', r'prep\.', r'conj\.', r'abbr\.', r'abr\.', r'adv\.', r'adj\.',
       r'v\.', r'n\.', r's\.']

# ``DataFrame.get_values()`` no longer exists in pandas >= 1.0; ``.values``
# exposes the same rows on old and new versions alike.
for en, es in df.values:
    en = en.replace('MFW', '')
    es = es.replace('PMF', '')

    # Cut each side at the first space-preceded part-of-speech marker.
    for p in pos:
        en = re.sub(' '+p+'.*', '', en)
        es = re.sub(' '+p+'.*', '', es)

    en = en.strip()
    es = es.strip()

    es_cognates.append((es, en))

# The remaining tables have four columns; the second holds the English text
# and the fourth the Spanish text.
for df in frames[1:]:
    df = df.fillna('')
    for _, en, _, es in df.values:
        # Here the marker is matched without a leading space.
        for p in pos:
            en = re.sub(p+'.*', '', en)
            es = re.sub(p+'.*', '', es)

        en = en.strip()
        es = es.strip()

        # Comma-separated parts are put in reverse order, joined by spaces.
        if ',' in en:
            en = en.split(',')
            en = ' '.join(en[::-1]).strip()
        if ',' in es:
            es = es.split(',')
            es = ' '.join(es[::-1]).strip()

        es_cognates.append((es, en))

es_cognates = sorted(set(es_cognates))
len(es_cognates)

3877

#### Save our Spanish cognates & realwords list

In [24]:
# Write the Spanish pairs to a scratch CSV: header line first, then one
# "spanish,english" line per pair.
with open('temp/es_cognates.csv', 'w') as file:
    file.write('spanish,english\n')
    file.writelines(f'{es},{en}\n' for es, en in es_cognates)

In [25]:
# Add the second element of every Spanish pair to the known-words list, then
# rewrite the file in sorted order, one word per line.
realwords = sorted({*realwords, *(pair[1] for pair in es_cognates)})
with open('./realwords.txt', 'w') as rw_file:
    rw_file.writelines(word + '\n' for word in realwords)

#### Randomly select from the cognates and write output

In [26]:
# Share of each language's pair count in the combined total, in the order
# German, Spanish, French.
sizes = [len(ds) for ds in (de_cognates, es_cognates, fr_cognates)]
total = sum(sizes)
probs = [size / total for size in sizes]
probs

[0.30408423739629864, 0.6185386088066369, 0.07737715379706446]

In [27]:
# Fixed seed so the shuffles and the draws below are reproducible.
random.seed(32132)
datasets = [list(de_cognates), list(es_cognates), list(fr_cognates)]
for i, ds in enumerate(datasets):
    # Sort before shuffling so the result depends only on the seed.
    datasets[i] = sorted(set(datasets[i]))
    random.shuffle(datasets[i])

cognates = []
pos = 0
# Walk the shuffled datasets in lockstep. The candidate at index ``pos`` of
# dataset ``i`` is kept when a uniform draw exceeds probs[i] and the two
# words differ. The ``pos < longest`` guard ends the loop once every dataset
# is exhausted; without it, fewer than 2400 eligible pairs meant the loop
# never terminated.
longest = max(len(ds) for ds in datasets)
while len(cognates) < 2400 and pos < longest:
    for i, ds in enumerate(datasets):
        if pos < len(ds):
            a,b = ds[pos]
            if random.random() > probs[i] and a != b:
                cognates.append(ds[pos])
    pos += 1

# NOTE: the length check runs once per round, so the list may end up slightly
# longer than 2400.
with open('../cognates.csv', 'w') as file:
    file.write('cognate,english\n')
    for pair in cognates:
        file.write(','.join(pair)+'\n')

In [28]:
# Fixed seed so the shuffles and the draws below are reproducible.
random.seed(4882)
datasets = [list(de_cognates), list(es_cognates), list(fr_cognates)]
for i, ds in enumerate(datasets):
    # Keep only pairs whose two words are both shorter than 13 characters
    # and differ from each other.
    ds2 = []
    for a,b in sorted(set(datasets[i])):
        if len(a) < 13 and len(b) < 13 and a != b:
            ds2.append((a,b))
    datasets[i] = ds2
    random.shuffle(datasets[i])

cognates = []
pos = 0
# Walk the shuffled datasets in lockstep. The uniform draw is made before the
# bounds check, exactly as before, so the random sequence is unchanged. The
# ``pos < longest`` guard ends the loop once every dataset is exhausted;
# without it, fewer than 2400 eligible pairs meant the loop never terminated.
longest = max(len(ds) for ds in datasets)
while len(cognates) < 2400 and pos < longest:
    for i, ds in enumerate(datasets):
        if random.random() > probs[i] and pos < len(ds):
            cognates.append(ds[pos])
    pos += 1

# NOTE: the length check runs once per round, so the list may end up slightly
# longer than 2400.
with open('../cognates_maxlen12.csv', 'w') as file:
    file.write('cognate,english\n')
    for pair in cognates:
        file.write(','.join(pair)+'\n')