In [1]:
import numpy as np
import requests
import pandas
import json
import unicodeblock.blocks
import os

In [2]:
# expected Unicode blocks for certain scripts
unicode_block_map = {
    "Latin" : ['BASIC_LATIN', 'LATIN_1_SUPPLEMENT', 'LATIN_EXTENDED_LETTER', 'LATIN_EXTENDED_A', 'LATIN_EXTENDED_B',\
               'LATIN_EXTENDED_C'],
    "Greek" : ['GREEK'],
    "Cyrillic" : ['CYRILLIC', 'CYRILLIC_SUPPLEMENTARY', 'CYRILLIC_EXTENDED_A', 'CYRILLIC_EXTENDED_B'],
    "Arabic" : ['ARABIC', 'ARABIC_SUPPLEMENT', 'ARABIC_PRESENTATION_FORMS_A', 'ARABIC_PRESENTATION_FORMS_B'],
    "Devanagari" : ['DEVANAGARI', 'VEDIC_EXTENSIONS', 'DEVANAGARI_EXTENDED'],
    "Bengali" : ['BENGALI'],
    "Gurmukhi" : ['GURMUKHI'],
    "Tamil" : ['TAMIL'],
    "Telugu" : ['TELUGU'],
    "Malayalam" : ['MALAYALAM'],
    "Myanmar" : ['MYANMAR', 'MYANMAR_EXTENDED_A'],
    "Chinese" : ['CJK_RADICALS_SUPPLEMENT', 'CJK_SYMBOLS_AND_PUNCTUATION', 'CJK_STROKES',\
                 'ENCLOSED_CJK_LETTERS_AND_MONTHS', 'CJK_COMPATIBILITY', 'CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A',\
                 'CJK_UNIFIED_IDEOGRAPHS', 'CJK_COMPATIBILITY_IDEOGRAPHS', 'CJK_COMPATIBILITY_FORMS',\
                 'CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B', 'CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C',\
                 'CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D', 'CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT']
}

In [3]:
def get_all_lemmas(language, expected_unicode, invalid=["Unsupported titles/"]):
    title = f"Category:{language}_lemmas"
    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'extracts',
        'exintro': True,
        'explaintext': True,
    }

    url = f"https://en.wiktionary.org/w/api.php?action=query&list=categorymembers&cmtitle={title}&cmlimit=max"
    
    lemmas = []
    while(True):
        r = requests.get(url,params)
        try:
            cmcontinue = r.json()['continue']['cmcontinue']
            for cmember in r.json()['query']['categorymembers']:
                if len(cmember['title']) > 1 and \
                    not cmember['title'].startswith('-') and \
                    not cmember['title'].endswith('-') and \
                    not any(inv in cmember['title'] for inv in invalid) and \
                    not any(c.isdigit() for c in cmember['title']) and \
                    not cmember['title'].isupper() and \
                    unicodeblock.blocks.of(cmember['title'][0]) not in ['BASIC_PUNCTUATION'] and \
                    [unicodeblock.blocks.of(c) in unicode_block_map[expected_unicode] for c in cmember['title']].count(True) > len(cmember['title'])*.8:
                    lemmas.append(cmember['title'])
                    if len(lemmas) % 1000 == 0:
                        print(f"Got {len(lemmas)}")
            url = url.split("&cmcontinue")[0]
            url+=f"&cmcontinue={cmcontinue}"
        except KeyError:
            break
            
    df = pandas.DataFrame(np.array(lemmas).reshape(-1,1), columns=["word"])
    
    print(f"{language} finished ({len(lemmas)} total)\n")
    
    df.to_csv(f'results/{language}-AllLemmas.csv', index=False)

In [4]:
with open("../language-pairs.json", 'r') as f:
    pairs = json.loads(f.read())

for pair in pairs:
    print(pair)
    L2 = pairs[pair]['source']['name']
    L2_unicode = pairs[pair]['source']['unicode']
    
    if os.path.exists("results/{}-AllLemmas.csv".format(L2)):
        overwrite = input("{}-AllLemmas.csv exists. Overwrite existing file? (y/n) ".format(L2))
        if overwrite == "y":
            get_all_lemmas(L2, L2_unicode)
    else:
        get_all_lemmas(L2, L2_unicode)

Romanian-Hungarian
Got 1000
Got 2000
Got 3000
Got 4000
Got 5000
Got 6000
Got 7000
Got 8000
Got 9000
Got 10000
Got 11000
Got 12000
Got 13000
Got 14000
Got 15000
Got 16000
Got 17000
Got 18000
Got 19000
Got 20000
Got 21000
Got 22000
Got 23000
Got 24000
Got 25000
Got 26000
Hungarian finished (26372 total)

