# Extracting one word definitions from the ENF file

In [1]:
import json
from pathlib import Path
from dutchanalyzer.config import *
from dutchanalyzer.utilities.utils import *
from dutchanalyzer.utilities.json_utils import *
from dutchanalyzer.utilities.replacement_utils import *
from dutchanalyzer.utilities.pandas_utils import *
import datetime
from tqdm import tqdm
import pickle

In [2]:
today = datetime.date.today().__format__("%d-%m-%y")
current_save_folder = Path(WIKT_CLEANING_DIR, str(today))
folders = ['EEF', 'ENF', 'NNF', 'NEF']


for f in folders:
            Path.mkdir(Path(current_save_folder, f), parents=True, exist_ok=True)

## Add entry ids

In [10]:
def make_temp_file_path(file):
    
    if not isinstance(file, Path):
        file = Path(file)
    temp_name = f'{file.stem}_temp.jsonl'
    folder = file.parents[0]
    temp_file = Path(folder, temp_name)
    return temp_file
      

In [None]:
def add_entry_ids(file, overwrite=False):
    batch = []
    batch_size = 100000
    wl_code = get_file_wl_code(file)
    temp_file = make_temp_file_path(file)
    with open(file, 'r', encoding='utf-8') as f:
        with open(temp_file, 'w+',encoding='utf-8') as out:
            for i, line in tqdm(enumerate(f)):
                if line:
                    loaded = json.loads(line)
                    if loaded:
                        if 'entry_id' not in loaded:
                            loaded['entry_id'] = f'{wl_code}_{i}'
                            sorted_keys = sort_entry_keys(loaded, start_keys=['entry_id', 'word', 'pos', 'lang_code', 'lang', 'senses'])
                            obj = {}
                            for k in sorted_keys:
                                obj[k] = loaded[k]
                        batch.append(obj)
                    
                if len(batch) > batch_size:
                    for b in batch:
                        json.dump(b, out, ensure_ascii=False)
                        out.write('\n')
                    batch = []
            if batch:
                for b in batch:
                    json.dump(b, out, ensure_ascii=False)
                    out.write('\n')
    if overwrite:
        overwrite_file(file, temp_file)
    return batch

In [None]:
batch = add_entry_ids(EEF_FILE)

1230364it [01:50, 11181.49it/s]


In [None]:
batch = add_entry_ids(ENF_FILE)


127859it [00:15, 8508.89it/s] 


In [35]:
batch = add_entry_ids(NNF_FILE)

598925it [00:54, 10920.22it/s]


In [37]:
batch = add_entry_ids(NEF_FILE)

16343it [00:00, 23340.38it/s]


In [38]:
overwrite_file(EEF_FILE, Path(EEF_FOLDER, 'EEF_temp.jsonl'), quiet=True)

In [39]:
overwrite_file(ENF_FILE, Path(ENF_FOLDER, 'ENF_temp.jsonl'), quiet=True)

In [40]:
overwrite_file(NNF_FILE, Path(NNF_FOLDER, 'NNF_temp.jsonl'), quiet=True)

In [41]:
overwrite_file(NEF_FILE, Path(NEF_FOLDER, 'NEF_temp.jsonl'), quiet=True)

## Get entry counts

In [None]:
def get_entry_counts(file, wl_code, with_pos=True, quiet=False):
    entry_count = {}
    entry_id = 0
    total_lines = count_lines_with_progress(file)
    with open(file, 'r', encoding='utf-8') as f:
       
        for i, line in tqdm(enumerate(f), total=total_lines, desc='getting entry counts'):
            obj = json.loads(line)
            if obj:
                word = obj.get('word', '')
                pos = obj.get('pos', '')
                id = obj.get('entry_id')
                if not id:
                    id = f'{wl_code}_{entry_id}'
                    entry_id += 1
                item = word
                if with_pos:
                    item = (word, pos)
                if word not in entry_count:
                    entry_count[item] = {'count': 1}
                    entry_count[item]['entry_ids'] = [id]
                else:
                    entry_count[item]['count'] += 1
                    entry_count[item]['entry_ids'].append(id)
    return entry_count

In [43]:
entries = get_entry_counts(ENF_FILE, 'ENF')
one_entry_words = [x for x in entries.keys() if entries[x]['count'] == 1]
len(one_entry_words)

Counting Lines:   0%|          | 0.00/162M [00:00<?, ?B/s]

Counting Lines: 100%|██████████| 162M/162M [00:00<00:00, 1.29GB/s]


Lines in file: 127859


getting entry counts: 100%|██████████| 127859/127859 [00:01<00:00, 65380.42it/s]


125576

In [55]:
def get_one_sense_entries(file, one_entry_words=[], wl_code='', one_sense_entries_file='', one_gloss_file=''):
    one_sense_entries = []
    one_gloss_entries = []
    if not one_entry_words:
        if not wl_code:
            wl_code = get_file_wl_code(file)
        entry_count = get_entry_counts(file, wl_code)
        one_entry_words = [x for x in entries.keys() if entries[x]['count'] == 1]
    with open(file, 'r', encoding='utf-8') as f:
        for i, line in tqdm(enumerate(f), 'narrowing senses and glosses'):

            obj = json.loads(line)
            if obj:
                word = obj.get('word', '')
                pos = obj.get('pos', '')
                item = (word, pos)
                if item in one_entry_words:
                    entry_id = obj.get('entry_id')
                    senses = obj.get('senses', '')
                    if senses and len(senses) == 1:
                        one_sense_entries.append(obj)
                        glosses = senses[0].get('glosses')
                        if glosses and len(glosses) == 1:
                            one_gloss_entries.append({'entry_id': entry_id, 'word': word, 'pos': pos,'gloss': glosses[0]})
    
    if one_sense_entries_file:
        with open(one_sense_entries_file, 'w+', encoding='utf-8') as f:
            for obj in one_sense_entries:
                json.dump(obj, f, ensure_ascii=False)
                f.write('\n')
                
    if one_gloss_file:
        with open(one_gloss_file, 'w+', encoding='utf-8') as f:
            for obj in one_gloss_entries:
                json.dump(obj, f, ensure_ascii=False)
                f.write('\n')

    return one_sense_entries, one_gloss_entries

In [56]:
file = ENF_FILE
wl_code = 'ENF'
one_sense_entries, one_gloss_one_sense_entries = get_one_sense_entries(ENF_FILE, one_entry_words, wl_code, one_sense_entries_file=Path(current_save_folder, 'en', 'ENF','ENF_one_sense_entries.jsonl'), one_gloss_file=Path(current_save_folder, 'en', 'ENF', 'ENF_one_gloss_entries.jsonl'))

narrowing senses and glosses: 127859it [05:21, 398.03it/s]


In [58]:
len(one_gloss_one_sense_entries)

98431

In [None]:
# if an entry is 2 words this will also accept 2 word definition
def get_3_or_less_word_defs(entries):
    pass

In [None]:
get_3_or_less_word_defs(one_gloss_one_sense_entries)