# Extracting one word definitions from the ENF file

In [47]:
import json
from pathlib import Path
from dutchanalyzer.config import *
from dutchanalyzer.utilities.utils import *
from dutchanalyzer.utilities.json_utils import *
from dutchanalyzer.utilities.replacement_utils import *
from dutchanalyzer.utilities.pandas_utils import *
from pathlib import Path
import datetime
from tqdm import tqdm
import pickle

In [2]:
today = datetime.date.today().__format__("%d-%m-%y")
current_save_folder = Path(WIKT_CLEANING_DIR, str(today))
folders = {'en': ['EEF', 'ENF'], 'nl':['NEF', 'NNF']}


for k, v in folders.items():
    for f in v:
        Path.mkdir(Path(current_save_folder, k, f), parents=True, exist_ok=True)

In [5]:
def sort_file_words(file, start_at_a=True):
    lines = []
    total_lines = count_lines_with_progress(file)
    if total_lines > 5000000:
        lines_a_to_e = []

In [25]:
def get_entry_counts(file, wl_code, with_pos=True, quiet=False):
    one_word_one_sense_enf = []
    entry_count = {}
    entry_id = 0
    total_lines = count_lines_with_progress(file)
    with open(file, 'r', encoding='utf-8') as f:
       
        for i, line in tqdm(enumerate(f), total=total_lines, desc='getting entry counts'):
            obj = json.loads(line)
            if obj:

                word = obj.get('word', '')
                pos = obj.get('pos', '')
                id = obj.get('entry_id')
                if not id:
                    id = f'{wl_code}_{entry_id}'
                    entry_id += 1
                item = word
                if with_pos:
                    item = (word, pos)
                if word not in entry_count:
                    entry_count[item] = {'count': 1}
                    entry_count[item]['entry_ids'] = [id]
                else:
                    entry_count[item]['count'] += 1
                    entry_count[item]['entry_ids'].append(id)
    return entry_count

In [24]:
entries = get_entry_counts(ENF_FILE, 'ENF')
one_entry_words = [x for x in entries.keys() if entries[x]['count'] == 1]
len(one_entry_words)

Counting Lines: 100%|██████████| 158M/158M [00:00<00:00, 1.49GB/s]
getting entry counts: 100%|██████████| 127859/127859 [00:02<00:00, 62530.94it/s]


125576

In [None]:
def get_one_sense_entries(file, one_entry_words=[], wl_code=''):
    one_sense_entries = {}
    one_word_one_sense_entries = {}
    if not one_entry_words:
        if not wl_code:
            wl_code = get_file_wl_code(file)
        entry_count = get_entry_counts(file, wl_code)
        one_entry_words = [x for x in entries.keys() if entries[x]['count'] == 1]
    with open(file, 'r', encoding='utf-8') as f:
        for i, line in tqdm(enumerate(f), 'narrowing senses and glosses'):
            obj = json.loads(line)
            if obj:
                word = obj.get('word', '')
                pos = obj.get('pos', '')
                item = (word, pos)
                if item in one_entry_words:
                    senses = obj.get('senses', '')
                    if senses and len(senses) == 1:
                        one_sense_entries[item] = obj
                        glosses = senses[0].get('glosses')
                        if glosses and len(glosses) == 1:
                            one_word_one_sense_entries[item] = glosses[0]

    return one_sense_entries, one_word_one_sense_entries

In [30]:
file = ENF_FILE
wl_code = 'ENF'
one_sense_entries, one_word_one_sense_entries = get_one_sense_entries(ENF_FILE, one_entry_words, wl_code)

narrowing senses and glosses: 127859it [05:10, 411.24it/s]


In [31]:
len(one_word_one_sense_entries)

97461

In [None]:
one_sense_entries_path = Path(ENF_FOLDER, 'one_sense_entries.pkl')
with open(one_sense_entries_path, 'wb+') as f:
    pickle.dump(one_sense_entries, f)

In [48]:
one_word_entries_path = Path(ENF_FOLDER, 'one_word_entries.pkl')
with open(one_word_entries_path, 'wb+') as f:
    pickle.dump(one_word_one_sense_entries, f)
