# Extracting words and definitions

In [4]:
import json
from pathlib import Path
import ujson
from dutchanalyzer.config import *
from dutchanalyzer.utils import *
from dutchanalyzer.json_utils import *
from pathlib import Path
from dotenv import load_dotenv
import datetime
import re
from pprint import pprint
import ast
from tqdm import tqdm

In [5]:
import pandas as pd

In [6]:
eng_save_path = Path(WIKT_PREPROCESSING_DIR, 'en')
nld_save_path = Path(WIKT_PREPROCESSING_DIR, 'nl')

In [7]:
previous_save_path = Path(WIKT_PREPROCESSING_DIR, '12-11-25')

In [8]:
today = datetime.date.today().__format__("%d-%m-%y")
current_save_folder = Path(INTERIM_DATA_DIR, 'cleaning', 'wikt', str(today))
folders = {'en': ['EEF', 'ENF'], 'nl':['NEF', 'NNF']}


for k, v in folders.items():
    for f in v:
        Path.mkdir(Path(current_save_folder, k, f), parents=True, exist_ok=True)

In [9]:
all_words = Path(current_save_folder, 'all_words.jsonl')

In [10]:
# Paths
NNR_file = Path(NNR_DIR, 'NNR.jsonl')
NER_file = Path(NER_DIR, 'NER.jsonl')
EER_file = Path(EER_DIR, 'EER.jsonl')
ENR_file = Path(ENR_DIR, 'ENR.jsonl')

NNF_folder = Path(WIKT_CLEANING_DIR, 'nl', 'NNF')
NEF_folder = Path(WIKT_CLEANING_DIR, 'nl', 'NEF')
EEF_folder = Path(WIKT_CLEANING_DIR, 'en', 'EEF')
ENF_folder = Path(WIKT_CLEANING_DIR, 'en', 'ENF')

all_words_file = Path(WIKT_CLEANING_DIR, 'all_words.jsonl')
eef_words_file = Path(EEF_folder, 'eef_words.jsonl')
enf_words_file = Path(ENF_folder, 'enf_words.jsonl')


## Extracting Words/Pos/Senses

- Extract words and parts of speech to dict, add all senses to dict

In [14]:
def extract_words_senses(raw_entry: dict):
    word = raw_entry.get("word")
    pos = raw_entry.get("pos", 'unknown')
    senses = raw_entry.get("senses")
    translations = raw_entry.get("translations")
    lang_code = raw_entry.get("lang_code")
    forms = raw_entry.get('forms')
    if not forms:
        forms = raw_entry.get('form_of')
    glosses = []
    sense_translations = []
    word_entry = {'word': word,
                  'pos': pos, 
                  'lang_code': lang_code}
    #word_entry = {word: {pos: {'lang_code': lang_code}}}

    if senses:
        word_entry['senses'] = {}
        for i, sense in enumerate(senses):
            new_sense = {}
            if 'glosses' in sense:
                glosses = sense['glosses']
                new_sense['glosses'] = glosses
            if 'translations' in sense or 'translation' in sense:
                sense_translations = sense['translations']
                if not sense_translations:
                    sense_translations = sense_translations['translation']
                new_sense['translations'] = sense_translations
            if 'form_of' in sense or 'forms' in sense:
                forms = sense.get('form_of')
                if not forms: forms = sense['forms']
                new_sense['forms'] = forms
                
            word_entry['senses'][i] = new_sense

    if translations:
        word_entry['translations'] = translations
    if forms:
        word_entry['forms'] = forms
    if 'wl_code' in raw_entry:
        word_entry['wl_code'] = raw_entry['wl_code']
    if 'etymology_templates' in raw_entry:
        word_entry['etymology_templates'] = raw_entry['etymology_templates']
    return word_entry

In [30]:
def filter_obj(obj):
    lang_codes_to_keep = ['nl', 'en', 'simple', 'ang', 'dum', 'nds', 'odt', 'nds-nl', 'enm']
    new_senses = []
    senses = obj.get('senses')
    if senses:
        for sense in senses:
            if 'attestations' in sense:
                sense.pop('attestations')
            # if 'examples' in sense:
            #     sense.pop('examples')
            if 'categories' in sense:
                cats = [c for c in sense['categories'] if not ('Pages with' in c and ('entries' in c or 'entry' in c))]
                sense['categories'] = cats
            new_senses.append(sense)

        obj['senses'] = new_senses
    if 'etymology_templates' in obj:
        new_templates = []
        for template in obj['etymology_templates']:
            if 'args' in template:
                if template['args'].get('1', '') in lang_codes_to_keep:
                    new_templates.append(template)
        if new_templates:
            obj['etymology_templates'] = new_templates
    if 'translations' in obj:
        translations = obj['translations']
        new_translations = [t for t in translations if t.get("lang_code") in lang_codes_to_keep]
        obj['translations'] = translations

In [24]:
all_words_dict = {}
repeat_words = []
entries_batch = []
error_lines = []

In [31]:
batch = []
entries_batch = []
file = ENR_file
wl_code = 'ENF'
out_file = Path(WIKT_CLEANING_DIR, 'en', 'ENF.jsonl')
ENF_definitions_file = Path(WIKT_CLEANING_DIR, 'en', 'ENF_definitions.jsonl')
with open(file, 'r', encoding='utf-8') as f:
    with open(out_file, 'w+', encoding='utf-8') as out:
        for i, line in tqdm(enumerate(f)):
            loaded = json.loads(line)
           
            if loaded:
                try:
                    filter_obj(loaded)
                    loaded['wl_code'] = wl_code
                    entries_batch.append(loaded)
                    word = loaded.get('word')
                    word_entry = extract_words_senses(loaded)
                    batch.append(word_entry)
                    if len(entries_batch) > 1000:
                            for obj in entries_batch:
                                json.dump(obj, out, ensure_ascii=False)
                                out.write('\n')
                            entries_batch = []
     
                except Exception as e:
                    error_lines.append((i, loaded))
                    display(line)
                    print("Error on line: ", i, " Error: ", e)
                
        if entries_batch:
            for obj in entries_batch:
                json.dump(obj, out, ensure_ascii=False)
                out.write('\n')  
with open(ENF_definitions_file, 'w+', encoding='utf-8') as out:
    for obj in batch:
        json.dump(obj, out, ensure_ascii=False)
        out.write('\n')  

140758it [00:27, 5117.59it/s] 


In [34]:
multi_senses = []
enf_one_gloss_senses = []
enf_multigloss_senses = []
no_gloss_entries = []

for item in batch[0:10]:
    display(item)
    if 'senses' in item:
        if len(item['senses']) > 1:
            multi_senses.append(item)
        else:
            sense = item['senses']
            if sense:
                glosses = sense.get('glosses')
                if not glosses:
                    no_gloss_entries.append(item)
                elif len(glosses) > 1:
                    enf_multigloss_senses.append(item)
                else:
                    enf_one_gloss_senses.append(item)
print("Multi-senses: ", len(multi_senses))  
print("No gloss entries: ", len(no_gloss_entries))  
print("Multi-glosses: ", enf_multigloss_senses)    
print("One Gloss: ", len(enf_one_gloss_senses))   

{'word': 'woordenboek',
 'pos': 'noun',
 'lang_code': 'nl',
 'senses': {'glosses': ['dictionary']},
 'wl_code': 'ENF'}

{'word': 'gratis',
 'pos': 'adj',
 'lang_code': 'nl',
 'senses': {'glosses': ['free, without charge']},
 'wl_code': 'ENF'}

{'word': 'gratuit',
 'pos': 'adj',
 'lang_code': 'nl',
 'senses': {'glosses': ['gratuitous, not obliged to']},
 'wl_code': 'ENF'}

{'word': 'word',
 'pos': 'verb',
 'lang_code': 'nl',
 'senses': {'glosses': ['inflection of worden:', 'imperative']},
 'wl_code': 'ENF'}

{'word': 'pond',
 'pos': 'noun',
 'lang_code': 'nl',
 'senses': {'glosses': ['one of several monetary units', 'Flemish pound']},
 'wl_code': 'ENF'}

{'word': 'pies',
 'pos': 'noun',
 'lang_code': 'nl',
 'senses': {'glosses': ['alternative form of pis; pee, piss']},
 'wl_code': 'ENF'}

{'word': 'A',
 'pos': 'character',
 'lang_code': 'nl',
 'senses': {'glosses': ['The first letter of the Dutch alphabet, written in the Latin script.']},
 'wl_code': 'ENF'}

{'word': 'raven',
 'pos': 'verb',
 'lang_code': 'nl',
 'senses': {'glosses': ['to (hold a) rave, to party wildly']},
 'wl_code': 'ENF'}

{'word': 'raven',
 'pos': 'noun',
 'lang_code': 'nl',
 'senses': {'glosses': ['obsolete form of raaf']},
 'wl_code': 'ENF'}

{'word': 'raven',
 'pos': 'noun',
 'lang_code': 'nl',
 'senses': {'glosses': ['plural of raaf']},
 'wl_code': 'ENF'}

Multi-senses:  0
No gloss entries:  0
Multi-glosses:  [{'word': 'word', 'pos': 'verb', 'lang_code': 'nl', 'senses': {'glosses': ['inflection of worden:', 'imperative']}, 'wl_code': 'ENF'}, {'word': 'pond', 'pos': 'noun', 'lang_code': 'nl', 'senses': {'glosses': ['one of several monetary units', 'Flemish pound']}, 'wl_code': 'ENF'}]
One Gloss:  8


In [22]:
print(len(batch))

5


In [7]:
nnf_entries_batch = []
nef_entries_batch = []
enf_entries_batch = []

In [34]:
batch = []
error_lines = []
entries_batch = []
file = EER_file
wl_code = 'EEF'
entries_file = Path(current_save_folder, 'EEF.jsonl')
eef_definitions_file = Path(WIKT_CLEANING_DIR, 'en', 'EEF_definitions.jsonl')
with open(file, 'r', encoding='utf-8') as f:
    with open(entries_file, 'w+', encoding='utf-8') as out:
        for i, line in tqdm(enumerate(f)):
            loaded = json.loads(line)
            if loaded:
                try:
                    filter_obj(loaded)
                    loaded['wl_code'] = wl_code
                    entries_batch.append(loaded)
                    word = loaded.get('word')
                    word_entry = extract_words_senses(loaded)
                    
                    batch.append(word_entry)
                    if len(entries_batch) > 1000:
                        for obj in entries_batch:
                            json.dump(obj, out, ensure_ascii=False)
                            out.write('\n')
                        entries_batch = []
                except Exception as e:
                    error_lines.append((i, loaded))
                    display(line)
                    print("Error on line: ", i, " Error: ", e)
                
        if entries_batch:
            for obj in entries_batch:
                json.dump(obj, out, ensure_ascii=False)
                out.write('\n')   
if batch:
    with open(eef_definitions_file, 'w+', encoding='utf-8') as eout:
        for i, obj in tqdm(enumerate(batch)):
            json.dump(obj, eout, ensure_ascii=False)
            eout.write('\n')           

1417383it [03:01, 7829.03it/s] 
1417383it [00:43, 32380.19it/s]


In [63]:
count_lines_with_progress(EER_file)

Counting Lines: 100%|██████████| 1.63G/1.63G [00:01<00:00, 1.30GB/s]


1423864

In [32]:
batch = []
entries_batch = []
file = NER_file
wl_code = 'NEF'
out_file = Path(WIKT_CLEANING_DIR, 'nl', 'NEF.jsonl')
definitions_file = Path(WIKT_CLEANING_DIR, 'nl', 'NEF_definitions.jsonl')
with open(file, 'r', encoding='utf-8') as f:
    with open(out_file, 'w+', encoding='utf-8') as out:
        for i, line in tqdm(enumerate(f)):
            loaded = json.loads(line)
           
            if loaded:
                try:
                    filter_obj(loaded)
                    loaded['wl_code'] = wl_code
                    entries_batch.append(loaded)
                    word = loaded.get('word')
                    word_entry = extract_words_senses(loaded)
                    batch.append(word_entry)
                    if len(entries_batch) > 1000:
                            for obj in entries_batch:
                                json.dump(obj, out, ensure_ascii=False)
                                out.write('\n')
                            entries_batch = []
     
                except Exception as e:
                    error_lines.append((i, loaded))
                    display(line)
                    print("Error on line: ", i, " Error: ", e)
                
        if entries_batch:
            for obj in entries_batch:
                json.dump(obj, out, ensure_ascii=False)
                out.write('\n')  
with open(definitions_file, 'w+', encoding='utf-8') as out:
    for obj in batch:
        json.dump(obj, out, ensure_ascii=False)
        out.write('\n')  

17441it [00:00, 17789.55it/s]


In [33]:
batch = []
entries_batch = []
file = NNR_file
wl_code = 'NNF'
out_file = Path(WIKT_CLEANING_DIR, 'nl', 'NNF.jsonl')
definitions_file = Path(WIKT_CLEANING_DIR, 'nl', 'NNF_definitions.jsonl')
with open(file, 'r', encoding='utf-8') as f:
    with open(out_file, 'w+', encoding='utf-8') as out:
        for i, line in tqdm(enumerate(f)):
            loaded = json.loads(line)
           
            if loaded:
                try:
                    filter_obj(loaded)
                    loaded['wl_code'] = wl_code
                    entries_batch.append(loaded)
                    word = loaded.get('word')
                    word_entry = extract_words_senses(loaded)
                    batch.append(word_entry)
                    if len(entries_batch) > 1000:
                            for obj in entries_batch:
                                json.dump(obj, out, ensure_ascii=False)
                                out.write('\n')
                            entries_batch = []
     
                except Exception as e:
                    error_lines.append((i, loaded))
                    display(line)
                    print("Error on line: ", i, " Error: ", e)
                
        if entries_batch:
            for obj in entries_batch:
                json.dump(obj, out, ensure_ascii=False)
                out.write('\n')  
with open(definitions_file, 'w+', encoding='utf-8') as out:
    for obj in batch:
        json.dump(obj, out, ensure_ascii=False)
        out.write('\n')  

611444it [01:20, 7606.56it/s] 


### Parse Repeats