# Follow filtering

In [1]:
import json
from pathlib import Path
import ujson
from dutchanalyzer.config import *
from dutchanalyzer.utilities.utils import *
from dutchanalyzer.utilities.json_utils import *
from dutchanalyzer.utilities.replacement_utils import *
from pathlib import Path
from dotenv import load_dotenv
import datetime
from pprint import pprint
from tqdm import tqdm

In [2]:
import pandas as pd

In [3]:
eng_save_path = Path(WIKT_PREPROCESSING_DIR, 'en')
nld_save_path = Path(WIKT_PREPROCESSING_DIR, 'nl')

In [7]:
previous_save_path = Path(WIKT_PREPROCESSING_DIR, '12-11-25')

In [4]:
today = datetime.date.today().__format__("%d-%m-%y")
current_save_folder = Path(INTERIM_DATA_DIR, 'cleaning', 'wikt', str(today))
folders = {'en': ['EEF', 'ENF'], 'nl':['NEF', 'NNF']}


for k, v in folders.items():
    for f in v:
        Path.mkdir(Path(current_save_folder, k, f), parents=True, exist_ok=True)

In [5]:
# Paths
NNR_file = Path(NNR_DIR, 'NNR.jsonl')
NER_file = Path(NER_DIR, 'NER.jsonl')
EER_file = Path(EER_DIR, 'EER.jsonl')
ENR_file = Path(ENR_DIR, 'ENR.jsonl')

NNF_folder = Path(WIKT_CLEANING_DIR, 'nl', 'NNF')
NEF_folder = Path(WIKT_CLEANING_DIR, 'nl', 'NEF')
EEF_folder = Path(WIKT_CLEANING_DIR, 'en', 'EEF')
ENF_folder = Path(WIKT_CLEANING_DIR, 'en', 'ENF')

all_words_file = Path(WIKT_CLEANING_DIR, 'all_words.jsonl')
eef_words_file = Path(EEF_folder, 'eef_words.jsonl')
enf_words_file = Path(ENF_folder, 'enf_words.jsonl')


## Extracting Words/Pos/Senses

- Extract words and parts of speech to dict, add all senses to dict

In [24]:
all_words_dict = {}
repeat_words = []
entries_batch = []
error_lines = []

In [144]:
def extract_words_senses(raw_entry: dict):
    word = raw_entry.get("word")
    pos = raw_entry.get("pos", 'unknown')
    senses = raw_entry.get("senses")
    translations = raw_entry.get("translations")
    lang_code = raw_entry.get("lang_code")
    forms = raw_entry.get('forms')
    synonyms = raw_entry.get('synonyms')
    if not forms:
        forms = raw_entry.get('form_of')
    glosses = []
    sense_translations = []
    word_entry = {'word': word,
                  'pos': pos, 
                  'lang_code': lang_code}
    if senses:
        word_entry['senses'] = {}
        for i, sense in enumerate(senses):
            new_sense = {}
            if 'glosses' in sense:
                glosses = sense['glosses']
                new_sense['glosses'] = glosses
            if 'translations' in sense or 'translation' in sense:
                sense_translations = sense['translations']
                if not sense_translations:
                    sense_translations = sense_translations['translation']
                new_sense['translations'] = sense_translations
            if 'form_of' in sense or 'forms' in sense:
                forms = sense.get('form_of')
                if not forms: forms = sense['forms']
                new_sense['forms'] = forms
            if 'alt_of' in sense:
                new_sense['alt_of'] = sense.get('alt_of')
            if 'synonyms' in sense:
                new_sense['synonyms'] = sense.get('synonyms')
            word_entry['senses'][i] = new_sense

    if translations:
        word_entry['translations'] = translations
    if forms:
        word_entry['forms'] = forms
    if synonyms:
        word_entry['synonyms'] = synonyms
    if 'etymology_templates' in raw_entry:
        word_entry['etymology_templates'] = raw_entry['etymology_templates']
    if 'wl_code' in raw_entry:
        word_entry['wl_code'] = raw_entry['wl_code']
    return word_entry

In [30]:
def filter_obj(obj):
    lang_codes_to_keep = ['nl', 'en', 'simple', 'ang', 'dum', 'nds', 'odt', 'nds-nl', 'enm']
    new_senses = []
    senses = obj.get('senses')
    if senses:
        for sense in senses:
            if 'attestations' in sense:
                sense.pop('attestations')
            # if 'examples' in sense:
            #     sense.pop('examples')
            if 'categories' in sense:
                cats = [c for c in sense['categories'] if not ('Pages with' in c and ('entries' in c or 'entry' in c))]
                sense['categories'] = cats
            new_senses.append(sense)

        obj['senses'] = new_senses
    if 'etymology_templates' in obj:
        new_templates = []
        for template in obj['etymology_templates']:
            if 'args' in template:
                if template['args'].get('1', '') in lang_codes_to_keep:
                    new_templates.append(template)
        if new_templates:
            obj['etymology_templates'] = new_templates
    if 'translations' in obj:
        translations = obj['translations']
        new_translations = [t for t in translations if t.get("lang_code") in lang_codes_to_keep]
        obj['translations'] = translations

In [None]:
def sort_filter_sense(obj: dict, pop_examples=True) -> dict:
    new_sense = {}
  
    glosses = obj.pop('glosses', '')
    translations = obj.pop('translations', '')
    translations = sort_translations(translations)
    form_of = obj.pop('form_of', '')
    if not form_of:
        form_of = obj.get('forms', '')
    
    obj.pop('senseid', '')
    obj.pop('wikidata', '')
    obj.pop('wikipedia', '')
    obj.pop('attestations', '')
    obj.pop('head_nr', '')

    if pop_examples:
        obj.pop('examples','')
    new_sense['glosses'] = glosses
    if form_of:
        new_sense['form_of'] = form_of
    synonyms = obj.get('synonyms')
    if synonyms:
        synonyms = sort_dict_list(synonyms, 'word', True)
        new_sense['synonyms'] = synonyms
    
    
    sorted_keys = sorted(list(obj.keys()))
    for key in sorted_keys:
        if key not in new_sense and key != 'translations':
            new_sense[key] = obj[key]
    if translations:
        new_sense['translations'] = translations

    return new_sense

In [None]:
def filter_categories(obj: list):
    new_categories = []

    if not obj or not isinstance(obj, list):
        return None
    
    for i, category in enumerate(obj):
        if isinstance(category, str):
            cut_cat = ''
            if category.startswith('Terms with') and category.endswith('translations') and 'incorrect' not in category:
                cut_cat = category.removeprefix('Terms with')
                cut_cat = cut_cat.removesuffix('translations')
                cut_cat = cut_cat.strip()
                lang_code2 = lookup_lang_code(cut_cat)
                if lang_code2:
                    new_categories.append(category)
            elif category.startswith('Requests for'):
                cut_cat = category.removeprefix('Requests for review of ')
                cut_cat = category.removeprefix('Requests for attention concerning ')
                cut_cat = category.removeprefix('Requests for translations into ')

                cut_cat = category.removesuffix(' translations')
                cut_cat = category.removesuffix(' entries')

                cut_cat = cut_cat.strip()
                lang_code2 = lookup_lang_code(cut_cat)
                if lang_code2:
                    new_categories.append(category)
                

            elif category.startswith('Woorden in het'):
                cut_cat = category.removeprefix('Woorden in het ')
                lang_code2 = lookup_lang_code(cut_cat.strip())
                if lang_code2:
                    new_categories.append(category)  
            
            elif 'transliterations' in category:
                cut_cat = category.removeprefix('Automatic ')
                cut_cat = cut_cat.removesuffix(' terms with redundant transliterations')
                cut_cat = cut_cat.removesuffix(' terms with non-redundant manual transliterations')
                cut_cat = cut_cat.removesuffix(' transliterations containing ambiguous characters')
                lang_code2 = lookup_lang_code(cut_cat.strip())
                if lang_code2:
                    new_categories.append(category)
            elif category.endswith('terms in nonstandard scripts'):
                cut_cat = category.removesuffix(' terms in nonstandard scripts')
                lang_code2 = lookup_lang_code(cut_cat.strip())
                if lang_code2:
                    new_categories.append(category)
            elif category.startswith("Woorden met") and 'referenties' in category:
                continue
            elif "Woorden in het Nederlands met audioweergave" == category:
                continue
            elif 'examples' in category:
                continue
            else:
                new_categories.append(category)
    
    return new_categories

In [None]:
def standardize_translation(obj: dict, lang_codes_to_keep=[], keep_no_lang=False, source='EEF', sense_index=-1) -> dict | None:
    if lang_codes_to_keep == []:
        lang_codes_to_keep = ['nl', 'en', 'simple', 'ang', 'dum', 'nds', 'odt', 'nds-nl', 'enm', 'eng', 'nld']

    word = obj.get('word', '')
    sense = obj.get('sense', '')

    if word == '' and sense == '':
        return None
    
    lang_code = obj.get('lang_code', '').lower()
    lang = obj.get('lang', '').lower()
    
    if lang_code == '':
        lang_code = obj.get('code', '').lower()
        
    new_translation = {}
    if keep_no_lang == False and lang_code == '' and lang == '':
        return None

    if lang_code == '' and lang == '':
        lang_code = 'unk'
        lang = 'unknown'

    elif lang == '':
        lang = lookup_lang_from_code(lang_code)

    elif lang_code == '':
        lang_code = lookup_lang_code(lang)

        if lang_code not in lang_codes_to_keep:
            return None
    
    standard_lang = lookup_lang_from_code(lang_code)
    if not standard_lang:
        return None    
    new_translation = {'word': word,
                        'lang_code': lang_code,
                        'lang': lang,
                        'standard_lang': standard_lang}
    
    obj_items = sorted(obj.items())
    for key, val in obj_items:
        if key not in new_translation:
            new_translation[key] = val
    return new_translation

In [None]:
def sort_standardize_entry(obj: dict, pop_examples=True) -> dict:
    new_obj = {}
    word = obj.get('word')
    if not word:
        return None
    if has_cjk_or_arabic_fast(word):
        return None
    pos = obj.get('pos')
    if not pos:
        return None
    if pos == 'name' or pos == 'abbrevation' or pos == 'proverb':
        return None
    
    new_obj['word'] = word
    new_obj['pos'] = pos
    code = obj.get('lang_code')
    lang = obj.get('lang', '').lower()
    if not code:
        code = obj.get('code')
        if code:
            standard_lang = lookup_lang_from_code(code)
            if standard_lang:
                obj['lang_code'] = code
                
                obj.pop('code')
            else:
                return None
        else:
            code = lookup_lang_code(lang)
            if not code:
                return None
    
    standard_lang = obj.get('standard_lang')
    if not standard_lang:
        standard_lang = lookup_lang_from_code(code)
    
    if not standard_lang:
        return None
    
    if lang == '':
        lang = standard_lang
    new_obj['lang_code'] = code
    new_obj['lang'] = lang
    new_obj['standard_lang'] = standard_lang

    try:
        ## Filter Categories
        
        categories = obj.get('categories')
        if categories:
            categories = filter_categories(categories)
            if categories:
                
                new_obj['categories'] = sorted(categories)
            
            else: new_obj['categories'] = []
        else:
            new_obj['categories'] = []
        
    except:
        print('categories failed ', categories)
        raise
    
    
    ## Filter Senses
    new_senses = []
    senses = obj.get('senses')
    if not senses:
        return None
    if senses:
        for sense in senses:
            new_sense = sort_filter_sense(sense)
            if new_sense and sense not in new_senses:
                new_senses.append(new_sense)
        if not new_senses:
            new_senses = []
    new_obj['senses'] = new_senses

    if 'forms' in obj:
        new_obj['forms'] = obj['forms']
    ## Filter Etymologies
    new_etymology_templates = []
    if 'etymology_templates' in obj:
        for ety_template in obj['etymology_templates']:
            if 'args' in ety_template:
                if '1' in ety_template['args']:
                    lc = ety_template['args']['1']
                    if lookup_lang_from_code(lc):
                        new_etymology_templates.append(ety_template)
        if new_etymology_templates:
            new_obj['etymology_templates'] = new_etymology_templates

    ## Filter Sounds 
    new_sounds = []
    if 'sounds' in obj:
        for sound in obj['sounds']:
            sound.pop('ogg_url', '')
            sound.pop('mp3_url', '')
            sound.pop('audio', '')
            if sound:
                new_sounds.append(sound)
    ## Add Remaining Keys
    obj_keys = sorted(list(obj.keys()))
    for key in obj_keys:
        if key not in new_obj and key != 'translations':
            if key == 'sounds' and new_sounds:
                new_obj['sounds'] = new_sounds
            else:
                new_obj[key] = obj[key]

    ## Filter Translations
    new_translations = []
    translations = obj.get('translations')
    try:
        if translations:
            for t in translations:
                
                translation = standardize_translation(t)
                if translation:
                    new_translations.append(translation)
            if new_translations:

                new_translations = sort_translations(new_translations)
            
        new_obj['translations'] = new_translations
    except:
        print('translations failed', new_translations)
        raise

    if new_obj:
        return new_obj
    return None

In [None]:
def process_obj(in_file, entries_out_file, wl_code, definitions_out_file=None, batch_size=1000, break_point=-1):
    batch = []
    entries_batch = []
    error_lines = []
    with open(in_file, 'r', encoding='utf-8') as f:
        with open(entries_out_file, 'w+', encoding='utf-8') as out:
            
            for i, line in tqdm(enumerate(f)):
                if break_point > 0:
                    if i > break_point:
                        display(entries_batch)
                        break
                if line:
                    try:
                        obj = json.loads(line)
                        obj = sort_standardize_entry(obj)
                        if obj:
                            obj['wl_code'] = wl_code
                            entries_batch.append(obj)

                            if len(entries_batch) > batch_size:
                                for entry in entries_batch:
                                    json.dump(entry, out, ensure_ascii=False)
                                    out.write('\n')
                                entries_batch = []
                            if definitions_out_file:
                                word_entry = extract_words_senses(obj)
                                batch.append(word_entry)
                                if len(batch) > batch_size:
                                    with open(definitions_out_file, 'a+', encoding='utf-8') as def_out: 
                                        for entry in batch:
                                            json.dump(entry, def_out, ensure_ascii=False)
                                            def_out.write('\n')
                                        batch = []
                    except Exception as e:
                        error_lines.append((i, obj))
                        display(line)
                        print("Error on line: ", i, " Error: ", e)
                        break
            if entries_batch:
                for entry in entries_batch:
                    json.dump(entry, out, ensure_ascii=False)
                    out.write('\n')  
            if batch and definitions_out_file:
                with open(definitions_out_file, 'a+', encoding='utf-8') as def_out: 
                    for entry in batch:
                        json.dump(entry, def_out, ensure_ascii=False)
                        def_out.write('\n')
    return entries_batch, batch, error_lines

In [155]:
ERAW_FILE = Path(RAW_KAIKKI_DIR, 'en', 'kaikki_en-raw-wiktextract-data.jsonl') 
file = ERAW_FILE
wl_code = 'ERAW'
out_file = Path(current_save_folder, 'en', 'ERAW.jsonl')
ERAW_definitions_file = Path(current_save_folder, 'en', 'ERAW_definitions.jsonl')
entries_batch, batch, error_lines = process_obj(file, out_file, wl_code, ERAW_definitions_file, 1000)             

10329308it [08:43, 19717.44it/s]


In [None]:
file = Path(current_save_folder, 'en', 'ERAW.jsonl')
wl_code = 'EER'
out_file = Path(current_save_folder, 'en', 'EER.jsonl')
total_lines = count_lines_with_progress(file)
batch_en = []
batch_nl = []
other_batch = []
ENF_file_curr = Path(current_save_folder, 'en', 'ENF.jsonl')
other_file = Path(current_save_folder, 'other_langs.jsonl')
with open(file, 'r', encoding='utf-8') as f:
    with open(out_file, 'w+', encoding='utf-8') as out:
        for i, line in tqdm(enumerate(f), total=total_lines):
            loaded = json.loads(line)
            if loaded:
                if loaded['lang_code'] == 'en':
                    batch_en.append(loaded)
                    if len(batch_en) > 1000:
                        for obj in batch_en:
                            json.dump(obj, out, ensure_ascii=False)
                            out.write('\n')
                        batch_en = []
                elif loaded['lang_code'] == 'nl':
                    batch_nl.append(loaded)
                    if len(batch_nl) > 1000:
                        with open(ENF_file_curr, 'a+', encoding='utf-8') as nout:
                            for obj in batch_nl:
                                json.dump(obj, nout, ensure_ascii=False)
                                nout.write('\n')
                        batch_nl = []
                else:
                    other_batch.append(loaded)
        if batch_en:
            for obj in batch_en:
                json.dump(obj, out, ensure_ascii=False)
                out.write('\n')
        if batch_nl:
            with open(ENF_file_curr, 'a+', encoding='utf-8') as nout:
                for obj in batch_nl:
                    json.dump(obj, nout, ensure_ascii=False)
                    nout.write('\n')
if other_batch:
    with open(other_file, 'a+', encoding='utf-8') as out:
        for obj in other_batch:
            json.dump(obj, out, ensure_ascii=False)
            out.write('\n')


Counting Lines: 100%|██████████| 1.66G/1.66G [00:01<00:00, 1.23GB/s]
100%|██████████| 1481820/1481820 [03:11<00:00, 7756.90it/s] 


PermissionError: [Errno 13] Permission denied: 'C:\\Users\\elise\\SynologyDrive\\Dev\\DutchAnalyzerPublic\\DutchAnalyzer\\data\\interim\\cleaning\\wikt\\20-11-25'

In [158]:
other_file = Path(current_save_folder, 'other_langs.jsonl')
if other_batch:
    with open(other_file, 'a+', encoding='utf-8') as out:
        for obj in other_batch:
            json.dump(obj, out, ensure_ascii=False)
            out.write('\n')

In [160]:
NRAW_FILE = Path(RAW_KAIKKI_DIR, 'nl', 'kaikki_nl-raw-extract.jsonl') 
file = NRAW_FILE
wl_code = 'NRAW'
out_file = Path(current_save_folder, 'nl', 'NRAW.jsonl')
NRAW_definitions_file = Path(current_save_folder, 'nl', 'NRAW_definitions.jsonl')
nlentries_batch, nlbatch, error_lines = process_obj(file, out_file, wl_code, NRAW_definitions_file, 1000)     

1050145it [02:21, 7423.80it/s]


In [161]:
file = Path(current_save_folder, 'nl', 'NRAW.jsonl')
wl_code = 'NNR'
out_file = Path(current_save_folder, 'nl', 'NNR.jsonl')
total_lines = count_lines_with_progress(file)
batch_en = []
batch_nl = []
other_batch = []
NEF_file_curr = Path(current_save_folder, 'nl', 'NEF.jsonl')
other_file = Path(current_save_folder, 'other_langs.jsonl')
with open(file, 'r', encoding='utf-8') as f:
    with open(out_file, 'w+', encoding='utf-8') as nout:
        for i, line in tqdm(enumerate(f), total=total_lines):
            loaded = json.loads(line)
            if loaded:
                if loaded['lang_code'] == 'en':
                    batch_en.append(loaded)
                    if len(batch_en) > 1000:
                        with open(NEF_file_curr, 'a+', encoding='utf-8') as eout:
                            for obj in batch_en:
                                obj['wl_code'] = 'NEF'
                                json.dump(obj, eout, ensure_ascii=False)
                                eout.write('\n')
                        batch_en = []
                elif loaded['lang_code'] == 'nl':
                    batch_nl.append(loaded)
                    if len(batch_nl) > 1000:
                        for obj in batch_nl:
                            obj['wl_code'] = 'NNF'
                            json.dump(obj, nout, ensure_ascii=False)
                            nout.write('\n')
                        batch_nl = []
                else:
                    other_batch.append(loaded)
        if batch_en:
            with open(NEF_file_curr, 'a+', encoding='utf-8') as eout:
                for obj in batch_en:
                    obj['wl_code'] = 'NEF'
                    json.dump(obj, eout, ensure_ascii=False)
                    eout.write('\n')
        if batch_nl:
            for obj in batch_nl:
                obj['wl_code'] = 'NNF'
                json.dump(obj, nout, ensure_ascii=False)
                nout.write('\n')
if other_batch:
    with open(other_file, 'a+', encoding='utf-8') as out:
        for obj in other_batch:
            obj['wl_code'] = 'NOF'
            json.dump(obj, out, ensure_ascii=False)
            out.write('\n')

Counting Lines: 100%|██████████| 797M/797M [00:00<00:00, 1.18GB/s]
100%|██████████| 624735/624735 [01:27<00:00, 7101.02it/s] 


In [None]:
batch = []
entries_batch = []
file = ENR_file
wl_code = 'ENF'
out_file = Path(current_save_folder, 'en', 'ENF.jsonl')
ENF_definitions_file = Path(WIKT_CLEANING_DIR, 'en', 'ENF_definitions.jsonl')
with open(file, 'r', encoding='utf-8') as f:
    with open(out_file, 'w+', encoding='utf-8') as out:
        for i, line in tqdm(enumerate(f)):
            loaded = json.loads(line)
           
            if loaded:
                try:
                    filter_obj(loaded)
                    loaded['wl_code'] = wl_code
                    entries_batch.append(loaded)
                    word = loaded.get('word')
                    word_entry = extract_words_senses(loaded)
                    batch.append(word_entry)
                    if len(entries_batch) > 1000:
                            for obj in entries_batch:
                                json.dump(obj, out, ensure_ascii=False)
                                out.write('\n')
                            entries_batch = []
     
                except Exception as e:
                    error_lines.append((i, loaded))
                    display(line)
                    print("Error on line: ", i, " Error: ", e)
                
        if entries_batch:
            for obj in entries_batch:
                json.dump(obj, out, ensure_ascii=False)
                out.write('\n')  
with open(ENF_definitions_file, 'w+', encoding='utf-8') as out:
    for obj in batch:
        json.dump(obj, out, ensure_ascii=False)
        out.write('\n')  

140758it [00:27, 5117.59it/s] 


### Parse Repeats

In [163]:
enf_file = Path(current_save_folder, 'en', 'ENF.jsonl')
nnf_file = Path(current_save_folder, 'nl', 'NNF.jsonl')

In [192]:
enf_sounds = []
with open(enf_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        loaded = json.loads(line)
        if loaded:
            loaded['wl_code'] = 'ENF'
            if loaded.get('sounds'):
                enf_sounds.append(loaded)

In [193]:
display(enf_sounds[0:10])

[{'word': 'woordenboek',
  'pos': 'noun',
  'lang_code': 'nl',
  'lang': 'dutch',
  'standard_lang': 'dutch',
  'categories': [],
  'senses': [{'glosses': ['dictionary'],
    'synonyms': [{'word': 'dictionaire'}],
    'categories': ['Dutch compound terms',
     'Dutch entries with incorrect language header',
     'Dutch lemmas',
     'Dutch neuter nouns',
     'Dutch nouns',
     'Dutch nouns with plural in -en',
     'Dutch terms calqued from Late Latin',
     'Dutch terms derived from Late Latin',
     'Dutch terms interfixed with -en-',
     'Pages with 1 entry',
     'Pages with entries',
     'nl:Dictionaries'],
    'links': [['dictionary', 'dictionary']],
    'tags': ['neuter']}],
  'forms': [{'form': 'woordenboeken', 'tags': ['plural']},
   {'form': 'woordenboekje', 'tags': ['diminutive', 'neuter']},
   {'form': 'woordboek', 'tags': ['alternative', 'obsolete']},
   {'form': 'woordboeck', 'tags': ['alternative', 'obsolete']}],
  'etymology_templates': [{'name': 'af',
    'args': 

In [200]:
def reformat_sounds(obj: dict):
    ipas = []
    rhymes = []
    other = []
    tags = []

    sounds = obj.get('sounds')
    if sounds:
        for sound in sounds:
            ipa = sound.get('ipa')
            rhyme = sound.get('rhymes')
            tag = sound.get('tags')
            if ipa:
                if ipa not in ipas:
                    ipas.append(ipa)
            if rhyme:
                if rhyme not in rhymes:
                    rhymes.append(rhyme)
            if tag:
                if tag not in tags:
                    tags.append(tag)
        if ipas:
            obj['ipa'] = ipas
        if rhymes:
            obj['rhymes'] = rhymes
        if tags:
            obj['sound_tags'] = tags
        obj.pop('sounds')


In [201]:
enf_lines = []
with open(enf_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        loaded = json.loads(line)
        loaded['wl_code'] = 'ENF'
        if loaded:
            reformat_sounds(loaded)
            enf_lines.append(loaded)

In [221]:
enf_head_templates = []
with open(enf_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        loaded = json.loads(line)
        if loaded:
            loaded['wl_code'] = 'ENF'
            if 'head_templates' in loaded:
                obj = {}
                obj['word'] = loaded['word']
                obj['pos'] = loaded['pos']
                obj['head_templates'] = loaded['head_templates']
                obj['forms'] = loaded.get('forms')
                obj['senses'] = loaded['senses']
                
                enf_head_templates.append(obj)

In [222]:
enf_head_templates[0:10]

[{'word': 'woordenboek',
  'pos': 'noun',
  'head_templates': [{'name': 'nl-noun',
    'args': {'1': 'n', '2': '-en', '3': '+'},
    'expansion': 'woordenboek n (plural woordenboeken, diminutive woordenboekje n)'}],
  'forms': [{'form': 'woordenboeken', 'tags': ['plural']},
   {'form': 'woordenboekje', 'tags': ['diminutive', 'neuter']},
   {'form': 'woordboek', 'tags': ['alternative', 'obsolete']},
   {'form': 'woordboeck', 'tags': ['alternative', 'obsolete']}],
  'senses': [{'glosses': ['dictionary'],
    'synonyms': [{'word': 'dictionaire'}],
    'categories': ['Dutch compound terms',
     'Dutch entries with incorrect language header',
     'Dutch lemmas',
     'Dutch neuter nouns',
     'Dutch nouns',
     'Dutch nouns with plural in -en',
     'Dutch terms calqued from Late Latin',
     'Dutch terms derived from Late Latin',
     'Dutch terms interfixed with -en-',
     'Pages with 1 entry',
     'Pages with entries',
     'nl:Dictionaries'],
    'links': [['dictionary', 'dictiona

In [202]:
enf_df = pd.DataFrame(enf_lines)

In [204]:
enf_df.drop(columns=['original_title', 'wikipedia'], inplace=True)

In [205]:
enf_df.drop(columns=['abbreviations'], inplace=True)

In [203]:
enf_df['word_code'] = enf_df['word'] + '_' + enf_df['pos']

In [188]:
enf_duplicates = enf_df[enf_df.duplicated(subset=['word_code'], keep=False)]

In [207]:
enf_df.drop(columns=['descendants'], inplace=True)

In [209]:
enf_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128545 entries, 0 to 128544
Data columns (total 29 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   word                  128545 non-null  object 
 1   pos                   128545 non-null  object 
 2   lang_code             128545 non-null  object 
 3   lang                  128545 non-null  object 
 4   standard_lang         128545 non-null  object 
 5   categories            128545 non-null  object 
 6   senses                128545 non-null  object 
 7   forms                 60665 non-null   object 
 8   etymology_templates   47911 non-null   object 
 9   derived               13139 non-null   object 
 10  etymology_text        49833 non-null   object 
 11  head_templates        128539 non-null  object 
 12  hyphenations          39891 non-null   object 
 13  translations          128545 non-null  object 
 14  wl_code               128545 non-null  object 
 15  

In [210]:
enf_df

Unnamed: 0,word,pos,lang_code,lang,standard_lang,categories,senses,forms,etymology_templates,derived,etymology_text,head_templates,hyphenations,translations,wl_code,ipa,inflection_templates,rhymes,related,etymology_number,synonyms,antonyms,sound_tags,hypernyms,holonyms,hyponyms,coordinate_terms,meronyms,word_code
0,woordenboek,noun,nl,dutch,dutch,[],"[{'glosses': ['dictionary'], 'synonyms': [{'wo...","[{'form': 'woordenboeken', 'tags': ['plural']}...","[{'name': 'af', 'args': {'1': 'nl', '2': 'woor...","[{'word': 'uitspraakwoordenboek'}, {'word': 'v...","From woord (“word”) + -en- + boek (“book”), a ...","[{'name': 'nl-noun', 'args': {'1': 'n', '2': '...","[{'parts': ['woor', 'den', 'boek']}]",[],ENF,"[/ˈʋoːrdə(n)ˌbuk/, [ˈʋʊːrdə(n)ˌbuk]]",,,,,,,,,,,,,woordenboek_noun
1,gratis,adj,nl,dutch,dutch,"[Pages with 21 entries, Pages with entries]","[{'glosses': ['free, without charge'], 'synony...","[{'form': 'no-table-tags', 'source': 'declensi...","[{'name': 'bor', 'args': {'1': 'nl', '2': 'la'...",,"Borrowed from Latin grātīs, contraction of grā...","[{'name': 'nl-adj', 'args': {'1': '-'}, 'expan...","[{'parts': ['gra', 'tis']}]",[],ENF,[/ˈɣraːtɪs/],"[{'name': 'nl-decl-adj', 'args': {'1': 'gratis...",,,,,,,,,,,,gratis_adj
2,gratuit,adj,nl,dutch,dutch,"[Pages with 4 entries, Pages with entries]","[{'glosses': ['gratuitous, not obliged to'], '...","[{'form': 'no-table-tags', 'source': 'declensi...","[{'name': 'bor', 'args': {'1': 'nl', '2': 'fr'...",,From French gratuit.,"[{'name': 'nl-adj', 'args': {'1': '-'}, 'expan...",,[],ENF,[/ɡraːˈtʋi/],"[{'name': 'nl-decl-adj', 'args': {'1': '', '2'...",,,,,,,,,,,,gratuit_adj
3,word,verb,nl,dutch,dutch,"[Dutch entries with incorrect language header,...","[{'glosses': ['inflection of worden:', 'first-...",,,,,"[{'name': 'head', 'args': {'1': 'nl', '2': 've...",,[],ENF,[/ʋɔrt/],,[-ɔrt],,,,,,,,,,,word_verb
4,pond,noun,nl,dutch,dutch,"[Dutch entries with incorrect language header,...","[{'glosses': ['unit of mass, often broadly sim...","[{'form': 'ponden', 'tags': ['plural']}, {'for...","[{'name': 'inh', 'args': {'1': 'nl', '2': 'dum...","[{'word': 'apothekerspond'}, {'word': 'de voll...","From Middle Dutch pont, pond, from Old Dutch p...","[{'name': 'nl-noun', 'args': {'1': 'n', '2': '...",[{'parts': ['pond']}],[],ENF,[/pɔnt/],,[-ɔnt],,,,,,,,,,,pond_noun
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128540,flikkerij,noun,nl,dutch,dutch,[],"[{'glosses': ['faggotry'], 'categories': ['Dut...",,"[{'name': 'af', 'args': {'1': 'nl', '2': 'flik...",,From flikker + -ij.,"[{'name': 'nl-noun', 'args': {'1': 'f', '2': '...",,[],ENF,[/flɪ.kəˈrɛi̯/],,,,,,,,,,,,,flikkerij_noun
128541,leefruimte,noun,nl,dutch,dutch,[],"[{'glosses': ['living space'], 'categories': [...","[{'form': 'leefruimten', 'tags': ['plural']}, ...","[{'name': 'compound', 'args': {'1': 'nl', '2':...",,From leven + ruimte.,"[{'name': 'nl-noun', 'args': {'1': 'f', '2': '...","[{'parts': ['leef', 'ruim', 'te']}]",[],ENF,[/ˈleːfˌrœy̯m.tə/],,,,,,,,,,,,,leefruimte_noun
128542,vluchtnummer,noun,nl,dutch,dutch,[],"[{'glosses': ['flight number'], 'categories': ...","[{'form': 'vluchtnummers', 'tags': ['plural']}]","[{'name': 'compound', 'args': {'1': 'nl', '2':...",,From vlucht + nummer.,"[{'name': 'nl-noun', 'args': {'1': 'n', '2': '...","[{'parts': ['vlucht', 'num', 'mer']}]",[],ENF,[/ˈvlʏxtˌnʏ.mər/],,,,,,,,,,,,,vluchtnummer_noun
128543,overnachting,noun,nl,dutch,dutch,[],"[{'glosses': ['overnight stay'], 'categories':...","[{'form': 'overnachtingen', 'tags': ['plural']...","[{'name': 'af', 'args': {'1': 'nl', '2': 'over...",,From overnachten + -ing.,"[{'name': 'nl-noun', 'args': {'1': 'f', '2': '...","[{'parts': ['over', 'nach', 'ting']}]",[],ENF,[/ˌoː.vərˈnɑx.tɪŋ/],,,,,,,,,,,,,overnachting_noun


In [197]:
sounds_df = return_non_na(enf_df, 'sounds')

In [218]:
enf_df.drop(columns=['translations'], inplace=True)

In [219]:
enf_df

Unnamed: 0,word,pos,lang_code,lang,standard_lang,categories,senses,forms,etymology_templates,derived,etymology_text,head_templates,hyphenations,wl_code,ipa,inflection_templates,rhymes,related,etymology_number,synonyms,antonyms,sound_tags,hypernyms,holonyms,hyponyms,coordinate_terms,meronyms,word_code
0,woordenboek,noun,nl,dutch,dutch,[],"[{'glosses': ['dictionary'], 'synonyms': [{'wo...","[{'form': 'woordenboeken', 'tags': ['plural']}...","[{'name': 'af', 'args': {'1': 'nl', '2': 'woor...","[{'word': 'uitspraakwoordenboek'}, {'word': 'v...","From woord (“word”) + -en- + boek (“book”), a ...","[{'name': 'nl-noun', 'args': {'1': 'n', '2': '...","[{'parts': ['woor', 'den', 'boek']}]",ENF,"[/ˈʋoːrdə(n)ˌbuk/, [ˈʋʊːrdə(n)ˌbuk]]",,,,,,,,,,,,,woordenboek_noun
1,gratis,adj,nl,dutch,dutch,"[Pages with 21 entries, Pages with entries]","[{'glosses': ['free, without charge'], 'synony...","[{'form': 'no-table-tags', 'source': 'declensi...","[{'name': 'bor', 'args': {'1': 'nl', '2': 'la'...",,"Borrowed from Latin grātīs, contraction of grā...","[{'name': 'nl-adj', 'args': {'1': '-'}, 'expan...","[{'parts': ['gra', 'tis']}]",ENF,[/ˈɣraːtɪs/],"[{'name': 'nl-decl-adj', 'args': {'1': 'gratis...",,,,,,,,,,,,gratis_adj
2,gratuit,adj,nl,dutch,dutch,"[Pages with 4 entries, Pages with entries]","[{'glosses': ['gratuitous, not obliged to'], '...","[{'form': 'no-table-tags', 'source': 'declensi...","[{'name': 'bor', 'args': {'1': 'nl', '2': 'fr'...",,From French gratuit.,"[{'name': 'nl-adj', 'args': {'1': '-'}, 'expan...",,ENF,[/ɡraːˈtʋi/],"[{'name': 'nl-decl-adj', 'args': {'1': '', '2'...",,,,,,,,,,,,gratuit_adj
3,word,verb,nl,dutch,dutch,"[Dutch entries with incorrect language header,...","[{'glosses': ['inflection of worden:', 'first-...",,,,,"[{'name': 'head', 'args': {'1': 'nl', '2': 've...",,ENF,[/ʋɔrt/],,[-ɔrt],,,,,,,,,,,word_verb
4,pond,noun,nl,dutch,dutch,"[Dutch entries with incorrect language header,...","[{'glosses': ['unit of mass, often broadly sim...","[{'form': 'ponden', 'tags': ['plural']}, {'for...","[{'name': 'inh', 'args': {'1': 'nl', '2': 'dum...","[{'word': 'apothekerspond'}, {'word': 'de voll...","From Middle Dutch pont, pond, from Old Dutch p...","[{'name': 'nl-noun', 'args': {'1': 'n', '2': '...",[{'parts': ['pond']}],ENF,[/pɔnt/],,[-ɔnt],,,,,,,,,,,pond_noun
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128540,flikkerij,noun,nl,dutch,dutch,[],"[{'glosses': ['faggotry'], 'categories': ['Dut...",,"[{'name': 'af', 'args': {'1': 'nl', '2': 'flik...",,From flikker + -ij.,"[{'name': 'nl-noun', 'args': {'1': 'f', '2': '...",,ENF,[/flɪ.kəˈrɛi̯/],,,,,,,,,,,,,flikkerij_noun
128541,leefruimte,noun,nl,dutch,dutch,[],"[{'glosses': ['living space'], 'categories': [...","[{'form': 'leefruimten', 'tags': ['plural']}, ...","[{'name': 'compound', 'args': {'1': 'nl', '2':...",,From leven + ruimte.,"[{'name': 'nl-noun', 'args': {'1': 'f', '2': '...","[{'parts': ['leef', 'ruim', 'te']}]",ENF,[/ˈleːfˌrœy̯m.tə/],,,,,,,,,,,,,leefruimte_noun
128542,vluchtnummer,noun,nl,dutch,dutch,[],"[{'glosses': ['flight number'], 'categories': ...","[{'form': 'vluchtnummers', 'tags': ['plural']}]","[{'name': 'compound', 'args': {'1': 'nl', '2':...",,From vlucht + nummer.,"[{'name': 'nl-noun', 'args': {'1': 'n', '2': '...","[{'parts': ['vlucht', 'num', 'mer']}]",ENF,[/ˈvlʏxtˌnʏ.mər/],,,,,,,,,,,,,vluchtnummer_noun
128543,overnachting,noun,nl,dutch,dutch,[],"[{'glosses': ['overnight stay'], 'categories':...","[{'form': 'overnachtingen', 'tags': ['plural']...","[{'name': 'af', 'args': {'1': 'nl', '2': 'over...",,From overnachten + -ing.,"[{'name': 'nl-noun', 'args': {'1': 'f', '2': '...","[{'parts': ['over', 'nach', 'ting']}]",ENF,[/ˌoː.vərˈnɑx.tɪŋ/],,,,,,,,,,,,,overnachting_noun


In [194]:
enf_duplicates.sort_values(by='word_code')

Unnamed: 0,word,pos,lang_code,lang,standard_lang,categories,senses,forms,etymology_templates,derived,descendants,etymology_text,head_templates,hyphenations,sounds,translations,wl_code,inflection_templates,related,etymology_number,synonyms,antonyms,hypernyms,holonyms,hyponyms,coordinate_terms,meronyms,word_code
23320,-de,suffix,nl,dutch,dutch,"[Belgian Dutch, Brabantian Dutch, Dutch colloq...",[{'glosses': ['Indicates second person in inve...,"[{'form': '-te', 'tags': ['alternative']}]","[{'name': 'inh', 'args': {'1': 'nl', '2': 'dum...",,,"From Middle Dutch -dī, from a contraction of t...","[{'name': 'head', 'args': {'1': 'nl', '2': 'su...",,"[{'ipa': '/də/'}, {'audio': 'Nl--de.ogg', 'ogg...",[],ENF,,,3.0,,,,,,,,-de_suffix
23319,-de,suffix,nl,dutch,dutch,"[Belgian Dutch, Brabantian Dutch, Dutch colloq...",[{'glosses': ['a suffix that forms the singula...,"[{'form': '-te', 'tags': ['alternative']}]","[{'name': 'cog', 'args': {'1': 'en', '2': '-ed...",,,Cognate with English -ed; derives from a Germa...,"[{'name': 'head', 'args': {'1': 'nl', '2': 'su...",,"[{'ipa': '/də/'}, {'audio': 'Nl--de.ogg', 'ogg...",[],ENF,,,2.0,,,,,,,,-de_suffix
23318,-de,suffix,nl,dutch,dutch,"[Belgian Dutch, Brabantian Dutch, Dutch colloq...",[{'glosses': ['Forms ordinal numbers from card...,"[{'form': '-ste', 'tags': ['alternative']}, {'...","[{'name': 'cog', 'args': {'1': 'en', '2': '-th...",,,Compare English -th in fourth and German -te i...,"[{'name': 'head', 'args': {'1': 'nl', '2': 'su...",,"[{'ipa': '/də/'}, {'audio': 'Nl--de.ogg', 'ogg...",[],ENF,,,1.0,,,,,,,,-de_suffix
3996,-e,suffix,nl,dutch,dutch,"[Dutch entries with incorrect language header,...",[{'glosses': ['Used to form the singular subju...,,"[{'name': 'inh', 'args': {'1': 'nl', '2': 'dum...",,,"From Middle Dutch -e, the ending of the first ...","[{'name': 'head', 'args': {'1': 'nl', '2': 'su...",,"[{'ipa': '/ə/'}, {'ipa': '/də/'}, {'ipa': '/st...",[],ENF,,,6.0,,,,,,,,-e_suffix
3991,-e,suffix,nl,dutch,dutch,"[Dutch entries with incorrect language header,...",[{'glosses': ['Used to form the female equival...,,"[{'name': 'bor+', 'args': {'1': 'nl', '2': 'fr...",,,Borrowed from French -e.,"[{'name': 'head', 'args': {'1': 'nl', '2': 'su...",,[{'ipa': '/ə/'}],[],ENF,,,1.0,,,,,,,,-e_suffix
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15114,zweefvliegen,noun,nl,dutch,dutch,"[Dutch basic verbs, Dutch compound terms, Dutc...","[{'glosses': ['gliding, flying with unpowered ...",,"[{'name': 'compound', 'args': {'1': 'nl', '2':...",,,"Compound of zweven (“to hover, glide”) + vlieg...","[{'name': 'nl-noun', 'args': {'1': 'n', '2': '...","[{'parts': ['zweef', 'vlie', 'gen']}]","[{'ipa': '/ˈzʋeː(f)ˌfli.ɣə(n)/'}, {'audio': 'N...",[],ENF,,,1.0,,,,,,,,zweefvliegen_noun
87850,zwemster,noun,nl,dutch,dutch,"[Dutch compound terms, Dutch entries with inco...","[{'glosses': ['swimmer'], 'links': [['swimmer'...","[{'form': 'zwemsters', 'tags': ['plural']}, {'...","[{'name': 'suf', 'args': {'1': 'nl', '2': 'zwe...",,,From zwemmen + -ster.,"[{'name': 'nl-noun', 'args': {'1': 'f', '2': '...",,[{'ipa': '/ˈzʋɛm.stər/'}],[],ENF,,,1.0,,,,,,,,zwemster_noun
87851,zwemster,noun,nl,dutch,dutch,"[Dutch compound terms, Dutch entries with inco...",[{'glosses': ['swimming star (successful swimm...,"[{'form': 'zwemsterren', 'tags': ['plural']}, ...","[{'name': 'compound', 'args': {'1': 'nl', '2':...",,,From zwemmen + ster.,"[{'name': 'nl-noun', 'args': {'1': 'm', '2': '...",,"[{'ipa': '/ˈzʋɛm.stər/'}, {'ipa': '/ˈzʋɛm.stɛr...",[],ENF,,,2.0,,,,,,,,zwemster_noun
12411,zweren,verb,nl,dutch,dutch,"[Dutch basic verbs, Dutch class 6 j-present st...","[{'glosses': ['to swear, pledge, declare under...","[{'form': 'present strong', 'source': 'conjuga...","[{'name': 'inh', 'args': {'1': 'nl', '2': 'dum...","[{'word': 'afzweren'}, {'word': 'bezweren'}, {...","[{'lang': 'Afrikaans', 'lang_code': 'af', 'wor...","From Middle Dutch swēren, from Old Dutch *swer...","[{'name': 'nl-verb', 'args': {}, 'expansion': ...","[{'parts': ['zwe', 'ren']}]","[{'ipa': '/ˈzʋeːrə(n)/'}, {'audio': 'Nl-zweren...",[],ENF,"[{'name': 'nl-conj-st', 'args': {'class': '6 j...",,1.0,,,,,,,,zweren_verb


In [208]:
enf_duplicates.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 4333 entries, 8 to 128407
Data columns (total 28 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   word                  4333 non-null   object 
 1   pos                   4333 non-null   object 
 2   lang_code             4333 non-null   object 
 3   lang                  4333 non-null   object 
 4   standard_lang         4333 non-null   object 
 5   categories            4333 non-null   object 
 6   senses                4333 non-null   object 
 7   forms                 3159 non-null   object 
 8   etymology_templates   2698 non-null   object 
 9   derived               1134 non-null   object 
 10  descendants           688 non-null    object 
 11  etymology_text        2959 non-null   object 
 12  head_templates        4332 non-null   object 
 13  hyphenations          2498 non-null   object 
 14  sounds                4213 non-null   object 
 15  translations          43