In [18]:
import numpy as np
import requests
import pandas
import json
import unicodeblock.blocks
import os

In [19]:
# expected Unicode blocks for certain scripts
unicode_block_map = {
    "Latin" : ['BASIC_LATIN', 'LATIN_1_SUPPLEMENT', 'LATIN_EXTENDED_LETTER', 'LATIN_EXTENDED_A', 'LATIN_EXTENDED_B', 'LATIN_EXTENDED_C'],
    "Greek" : ['GREEK'],
    "Cyrillic" : ['CYRILLIC', 'CYRILLIC_SUPPLEMENTARY', 'CYRILLIC_EXTENDED_A', 'CYRILLIC_EXTENDED_B'],
    "Arabic" : ['ARABIC', 'ARABIC_SUPPLEMENT', 'ARABIC_PRESENTATION_FORMS_A', 'ARABIC_PRESENTATION_FORMS_B'],
    "Devanagari" : ['DEVANAGARI', 'VEDIC_EXTENSIONS', 'DEVANAGARI_EXTENDED'],
    "Bengali" : ['BENGALI']
}

In [26]:
def get_all_lemmas(language, expected_unicode, invalid=["Unsupported titles/Space"]):
    title = f"Category:{language}_lemmas"
    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'extracts',
        'exintro': True,
        'explaintext': True,
    }

    url = f"https://en.wiktionary.org/w/api.php?action=query&list=categorymembers&cmtitle={title}&cmlimit=max"
    
    lemmas = []
    while(True):
        r = requests.get(url,params)
        try:
            cmcontinue = r.json()['continue']['cmcontinue']
            for cmember in r.json()['query']['categorymembers']:
                if len(cmember['title']) > 1 and \
                    not cmember['title'].startswith('-') and \
                    not cmember['title'].endswith('-') and \
                    cmember['title'] not in invalid and \
                    not any(c.isdigit() for c in cmember['title']) and \
                    not cmember['title'].isupper() and \
                    unicodeblock.blocks.of(cmember['title'][0]) not in ['BASIC_PUNCTUATION'] and \
                    [unicodeblock.blocks.of(c) in unicode_block_map[expected_unicode] for c in cmember['title']].count(True) > len(cmember['title'])*.8:
                    lemmas.append(cmember['title'])
                    if len(lemmas) % 1000 == 0:
                        print(f"Got {len(lemmas)}")
            url = url.split("&cmcontinue")[0]
            url+=f"&cmcontinue={cmcontinue}"
        except KeyError:
            break
            
    df = pandas.DataFrame(np.array(lemmas).reshape(-1,1), columns=["word"])
    
    print(f"{language} finished ({len(lemmas)} total)\n")
    
    df.to_csv(f'results/{language}-AllLemmas.csv', index=False)

In [27]:
with open("../language-pairs.json", 'r') as f:
    pairs = json.loads(f.read())

for pair in pairs:
    print(pair)
    L2 = pairs[pair]['source']['name']
    L2_unicode = pairs[pair]['source']['unicode']
    
    if os.path.exists("results/{}-AllLemmas.csv".format(L2)):
        overwrite = input("{}-AllLemmas.csv exists. Overwrite existing file? (y/n) ".format(L2))
        if overwrite == "y":
            get_all_lemmas(L2, L2_unicode)
    else:
        get_all_lemmas(L2, L2_unicode)

Hindi-Persian
Persian-AllLemmas.csv exists. Overwrite existing file? (y/n) y
Got 1000
Got 2000
Got 3000
Got 4000
Got 5000
Got 6000
Got 7000
Got 8000
Got 9000
Persian finished (9876 total)

English-French
French-AllLemmas.csv exists. Overwrite existing file? (y/n) y
Got 1000
Got 2000
Got 3000
Got 4000
Got 5000
Got 6000
Got 7000
Got 8000
Got 9000
Got 10000
Got 11000
Got 12000
Got 13000
Got 14000
Got 15000
Got 16000
Got 17000
Got 18000
Got 19000
Got 20000
Got 21000
Got 22000
Got 23000
Got 24000
Got 25000
Got 26000
Got 27000
Got 28000
Got 29000
Got 30000
Got 31000
Got 32000
Got 33000
Got 34000
Got 35000
Got 36000
Got 37000
Got 38000
Got 39000
Got 40000
Got 41000
Got 42000
Got 43000
Got 44000
Got 45000
Got 46000
Got 47000
Got 48000
Got 49000
Got 50000
Got 51000
Got 52000
Got 53000
Got 54000
Got 55000
Got 56000
Got 57000
Got 58000
Got 59000
Got 60000
Got 61000
Got 62000
Got 63000
Got 64000
Got 65000
Got 66000
Got 67000
Got 68000
Got 69000
Got 70000
Got 71000
Got 72000
Got 73000
Got 74000
Got