In [1]:
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import unicodeblock.blocks
import re
from tqdm import tqdm
from random import random

In [2]:
links = {}

with open('../language-pairs.json', 'r') as f:
    pairs = json.loads(f.read())
    
    for pair in pairs:
        links[pair] = pairs[pair]['wiki']

links

{'Hindi-Persian': 'https://en.m.wiktionary.org/wiki/Category:Hindi_terms_borrowed_from_Persian',
 'English-French': 'https://en.m.wiktionary.org/wiki/Category:English_terms_borrowed_from_French'}

In [3]:
def get_all_borrowed_words(dest, source, invalid=["Unsupported titles/Space"]):
    title = f"Category:{dest}_terms_borrowed_from_{source}"
    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'extracts',
        'exintro': True,
        'explaintext': True,
    }

    url = f"https://en.wiktionary.org/w/api.php?action=query&list=categorymembers&cmtitle={title}&cmlimit=max"
    
    borrowed_words = []
    while(True):
        r = requests.get(url,params)
        try:
            cmcontinue = r.json()['continue']['cmcontinue']
            for cmember in r.json()['query']['categorymembers']:
                if len(cmember['title']) > 1 and \
                    not cmember['title'].startswith('-') and \
                    not cmember['title'].endswith('-') and \
                    cmember['title'] not in invalid and \
                    unicodeblock.blocks.of(cmember['title'][0]) not in ['DIGIT', 'BASIC_PUNCTUATION']:
                    borrowed_words.append(cmember['title'])
                    if len(borrowed_words) % 1000 == 0:
                        print(f"Got {len(borrowed_words)}")
            url = url.split("&cmcontinue")[0]
            url+=f"&cmcontinue={cmcontinue}"
        except KeyError:
            break
    print(f"Done, {len(borrowed_words)}")
    return borrowed_words

In [4]:
'''
def get_all_loans_and_false_friends:
    for each word in borrowed words:
        1. get word page from wiktionary
        2. extract source word from source language
        3. extract false friends from language!=source language

    return loan_pairs, false_friends
'''

def get_source_word(soup, src_lang, lst_invalid_words):
    for src_soup in soup.find_all("span", class_="etyl"):
        if src_soup != None and src_soup.find(lambda tag: tag.name == 'a' and src_lang.lower() in tag.text.lower()):
            
            src_soup_final = src_soup.find_next("i")
            src_word = src_soup_final.text if src_soup_final != None else ''
            if src_word not in lst_invalid_words:
                source_word = src_word
            else:
                src_soup_final = src_soup.find_next("strong")
                src_word = src_soup_final.text if src_soup_final != None else ''
                source_word = src_word
            return source_word
    
    return ''

def get_false_friend(borrowed_word, soup, src_lang, dest_lang, lst_invalid_words):
    all_false_friends = []
    for header in soup.find_all("h2", id=True):
        if dest_lang.lower() in [header.get('id').lower(), header.text.lower()]:
            try:
                for next_header in header.parent.find_all('h3'):
                    if re.match("Etymology [1-9]",next_header.text):
                        # get next sibling and then find_all span etyl?
                        next_sibs = next_header.find_next_siblings()
                        if len(next_sibs) > 0:
                            all_etyms = next_sibs[0].find_all("span", class_="etyl")
                            if len(all_etyms) > 0:
                                etym = all_etyms[0]
                                for tag in etym.find(lambda tag: tag.name == 'a'):

                                    if src_lang.lower() not in tag.text.lower():
                                        false_friend_soup = etym.find_next("i")
                                        ff_word = false_friend_soup.text if false_friend_soup != None else ''
                                        ff_word = re.split('[,;:]',ff_word)[0].strip()
                                        if ff_word in lst_invalid_words:
                                            false_friend_soup = etym.find_next("strong")
                                            ff_word = false_friend_soup.text if false_friend_soup != None else ''
                                            ff_word = re.split('[,;:]',ff_word)[0].strip()

                                        meaning = etym.parent.parent.find_all('ol')[0].text.split('\n')[0]
                                        meaning = re.sub(r'\(.+?\)',r'',meaning)
                                        meaning = re.sub(r'\[.+?\]',r'',meaning)
                                        meaning = meaning.replace('  ', ' ')
                                        meaning = re.split('[.;:]',meaning)[0].strip()

                                        all_false_friends.append([borrowed_word,ff_word,tag.text.lower(),meaning])
            except:
                pass
    
    return all_false_friends

            
def get_all_loans_and_false_friends(borrowed_words, dest_lang, src_lang, lst_invalid_words=['plural', 'not comparable', 'Urdu spelling'], min_timeout=10, timeout_after_words=100):

    loan_pairs = [] # [[borrowed_word, source_word]]
    false_friends = [] # 
    p_bar = tqdm(borrowed_words)
    for i, word in enumerate(p_bar):
        while True:
            try:
                params = {
                    'action': 'query',
                    'format': 'json',
                    'prop': 'extracts',
                    'exintro': True,
                    'explaintext': True,
                }

                url = 'https://en.wiktionary.org/w/rest.php/v1/page/' + word + '/html'

                response = requests.get(url,params, timeout=300)

                soup = BeautifulSoup(response.content, 'html.parser')
                source_word = get_source_word(
                    soup, src_lang, lst_invalid_words)
                false_friend = get_false_friend(
                    word, soup, src_lang, dest_lang, lst_invalid_words)
                loan_pairs.append([word, source_word])
                false_friends.extend(false_friend)
                p_bar.set_description("Processed word: {}".format(word))
                if i>0 and i%timeout_after_words==0:
                    sleep_time = (random() * min_timeout) + min_timeout
                    p_bar.set_description("Collected {} word pairs, sleeping for {}seconds".format(i, sleep_time))
                    time.sleep(sleep_time)
                break
            except Exception as e:
                p_bar.set_description("Word: {}, Error: {}, sleeping for 1 minute".format(word, e))
                time.sleep(60)

        

    return loan_pairs, false_friends

In [5]:
for pair in links:
    if os.path.exists("results/{}.csv".format(pair)):
        overwrite = input("{}.csv exists. Overwrite existing file? (y/n) ".format(pair))
    if overwrite == "y" or not os.path.exists("results/{}.csv".format(pair)):
        [dest, src] = pair.split('-')
        print(pair)
        words = get_all_borrowed_words(dest, src, invalid=["Unsupported titles/Space"])
        print("Getting loan pairs and false friends")
        loan_words, false_friends = get_all_loans_and_false_friends(words, dest, src)

        df_loans = pd.DataFrame(loan_words, columns=['loan_word', 'original_word'])
        df_false_friends = pd.DataFrame(false_friends, columns=['loan_word', 'original_word', 'other_etymology', 'other_meaning'])
        df_loans.to_csv("results/{}.csv".format(pair), index=False)
        df_false_friends.to_csv("results/{}_false_friends.csv".format(pair), index=False)
        # print(df_false_friends)
        # print(df_loans)
        print(pair, "done\n")
        print()
        

English-French
Got 1000
Got 2000
Got 3000
Got 4000
Done, 4981
Getting loan pairs and false friends


Processed word: souffle:  95%|█████████▍| 4720/4981 [1:13:12<05:25,  1.25s/it]                                                                     