In [8]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
# NOTE(review): re-running this cell prints "already loaded" (see the recorded
# output); `%reload_ext autoreload` would avoid that message.
%load_ext autoreload
%autoreload 3

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import json
from dutchanalyzer.config import *
from dutchanalyzer.utilities.utils import *
from dutchanalyzer.utilities.json_utils import *
from dutchanalyzer.utilities.replacement_utils import *
from pathlib import Path
from io import StringIO
import datetime
import re
from dotenv import load_dotenv
from pprint import pprint
import ast

### Paths

In [5]:
# Raw input dumps under RAW_KAIKKI_DIR, in the 'en' and 'nl' sub-folders.
# NOTE(review): the NRAW run further down reads 'nl-extract.jsonl' directly
# rather than NRAW_FILE; confirm which file name is the current one.
ERAW_FILE = Path(RAW_KAIKKI_DIR) / 'en' / 'kaikki_en-raw-wiktextract-data.jsonl'
NRAW_FILE = Path(RAW_KAIKKI_DIR) / 'nl' / 'kaikki_nl-raw-extract.jsonl'

In [6]:
# Per-language output folders under the Wiktionary preprocessing directory.
eng_save_path = Path(WIKT_PREPROCESSING_DIR) / 'en'
nld_save_path = Path(WIKT_PREPROCESSING_DIR) / 'nl'

In [5]:
# Every run writes into a folder named after today's date (dd-mm-yy).
today = datetime.date.today().strftime("%d-%m-%y")
current_save_folder = Path(INTERIM_DATA_DIR) / 'cleaning' / 'wikt' / today

In [6]:
# One output sub-folder per word-list code inside the dated run folder.
# exist_ok=True keeps the cell safe to re-run.
folders = ['EEF', 'ENF', 'NEF', 'NNF']
for folder_name in folders:
    Path(current_save_folder, folder_name).mkdir(parents=True, exist_ok=True)

## Process Raw Files

In [9]:
# Paths
# Line dumps kept next to the per-language preprocessing folders.
NL_lines_file = Path(nld_save_path) / 'NLR.jsonl'
EN_lines_file = Path(eng_save_path) / 'ELR.jsonl'

# One raw word-list file per (edition, entry language) combination.
ENR_file = Path(ENR_DIR) / 'ENR.jsonl'
EER_file = Path(EER_DIR) / 'EER.jsonl'
NNR_file = Path(NNR_DIR) / 'NNR.jsonl'
NER_file = Path(NER_DIR) / 'NER.jsonl'

# Entries whose language is neither English nor Dutch.
other_file = Path(WIKT_PREPROCESSING_DIR) / 'other_langs.jsonl'

In [None]:
# Compiled once: filter_translations_regex runs once per input line, so the
# patterns should not be rebuilt (or looked up in re's cache) on every call.
_TRANSLATIONS_BLOCK_RE = re.compile(r'"translations"\s*:\s*\[({.*?})\]', re.DOTALL)
_KEPT_LANG_BLOCK_RE = re.compile(
    r'(\{[^{}]*?"lang"\s*:\s*"(English|Engels|Dutch|Nederlands|Old English|Oudengels|Old Saxon|Oudnederlands|Dutch Low Saxon|Middle Dutch|Middelnederlands|Old Dutch|Middle English|Limburgish|Oudnederlands|Middenengels|Middelengels|Simple English|Eenvoudig Engels)"[^{}]*?\})',
    re.DOTALL,
)


def _char_at(text, index):
    """Return ``text[index]``, or ``''`` when ``index`` falls outside ``text``."""
    return text[index] if 0 <= index < len(text) else ''


def filter_translations_regex(obj_str: str):
    """Trim every ``"translations": [...]`` list in a raw JSON line.

    Works on the serialized text, before the line is parsed. For each
    translations block found:

    * if it contains objects whose ``"lang"`` is one of the English/Dutch
      family names in the pattern, the block is rewritten to hold only those
      objects;
    * otherwise the whole ``"translations"`` key is cut out, together with
      the comma that would be left dangling.

    Args:
        obj_str: One JSON object serialized as text (a JSONL line).

    Returns:
        The text with translations reduced or removed. Any leftover
        ``', ]'`` / ``', }'`` sequence is collapsed to ``']'`` / ``'}'``.

    Note:
        Kept objects are converted with the project helper ``safe_dict``
        (presumably it parses the matched object text into a dict - confirm)
        and re-serialized with ``json.dumps``.
    """
    match = _TRANSLATIONS_BLOCK_RE.search(obj_str)

    while match is not None:
        start, end = match.span()
        kept_blocks = _KEPT_LANG_BLOCK_RE.findall(match.group(0))

        if kept_blocks:
            # findall yields (object_text, language_name) tuples because the
            # pattern has two groups; only the object text is needed.
            kept = [safe_dict(groups[0]) for groups in kept_blocks]
            replacement = '"translations": [' + ', '.join(json.dumps(x) for x in kept) + ']'
            obj_str = obj_str[:start] + replacement + obj_str[end:]
            # Resume after the text just written so it is never rescanned.
            match = _TRANSLATIONS_BLOCK_RE.search(obj_str, start + len(replacement))
        else:
            remove_to = end
            # _char_at guards the look-ahead/look-behind: the block may end the
            # string (last line of a file without a trailing newline) or sit
            # at its very beginning.
            if _char_at(obj_str, end) == ',':
                remove_to += 1
            if _char_at(obj_str, end + 1) in (']', '}'):
                # The block was the last member: drop the comma before it.
                if _char_at(obj_str, start - 1) == ',':
                    start -= 1
                elif _char_at(obj_str, start - 2) == ',':
                    start -= 2
            obj_str = obj_str[:start] + obj_str[remove_to:]
            match = _TRANSLATIONS_BLOCK_RE.search(obj_str, start)

    # Clean up separators left behind by the removals above.
    obj_str = obj_str.replace(', ]', ']')
    obj_str = obj_str.replace(', }', '}')
    return obj_str

In [10]:
def filter_obj(obj):
    """Remove unwanted top-level fields from a parsed entry, in place.

    Keys that are absent are ignored. Nothing is returned; ``obj`` itself is
    modified.
    """
    unwanted_keys = (
        'anagrams',
        'proverbs',
        'pos_title',
        'wikipedia',
        'wikidata',
        'abbreviations',
        'original_title',
        'info_templates',
    )
    for key in unwanted_keys:
        obj.pop(key, '')

In [19]:
def process_raw_file(in_file, save_path, wl_code, batch_size=100000, break_point=-1):
    """Split a raw JSONL dump into per-language JSONL files.

    Each line is pre-filtered as text (``en_keep_before_load`` for ``'ERAW'``,
    ``nl_keep_before_load`` for ``'NRAW'``), has its translations trimmed by
    ``filter_translations_regex``, is parsed, stripped by ``filter_obj`` and
    passed through ``sort_standardize_entry``. Each surviving entry gets a
    ``'wl_code'`` key and is routed by its ``'lang_code'``:

    ==========  ==============  ===============  ==============
    wl_code     edition lang    other of en/nl   anything else
    ==========  ==============  ===============  ==============
    ``'ERAW'``  ``EER.jsonl``   ``ENR.jsonl``    ``EOR.jsonl``
    ``'NRAW'``  ``NNR.jsonl``   ``NER.jsonl``    ``NOR.jsonl``
    ==========  ==============  ===============  ==============

    Entries are buffered and written with ``save_batch_to_file`` whenever a
    buffer grows past ``batch_size``. The first write to each file uses mode
    ``'w+'`` and later writes use ``'a'``. A file that receives no entries is
    never opened, so a stale copy from an earlier run is left as is.

    Args:
        in_file: Path of the raw JSONL dump to read (UTF-8).
        save_path: Folder that receives the output files.
        wl_code: ``'ERAW'`` or ``'NRAW'``.
        batch_size: Buffer size threshold that triggers a write.
        break_point: When positive, stop once the line index exceeds it
            (debugging aid). The pending edition-language buffer is printed
            before stopping.

    Returns:
        ``(entries_batch, lang_2_lines)``: the edition-language and
        second-language entries still buffered at the end, i.e. the final
        partial batches. They have already been written to disk.

    Raises:
        ValueError: If ``wl_code`` is not ``'ERAW'`` or ``'NRAW'``.
    """
    # wl_code -> (edition lang, its list code, second lang, its list code, other list code)
    layouts = {
        'ERAW': ('en', 'EER', 'nl', 'ENR', 'EOR'),
        'NRAW': ('nl', 'NNR', 'en', 'NER', 'NOR'),
    }
    if wl_code not in layouts:
        raise ValueError(f"Unknown wl_code {wl_code!r}; expected 'ERAW' or 'NRAW'")
    lang1, lang1_wl_code, lang2, lang2_wl_code, other_code = layouts[wl_code]
    keep_before_load = en_keep_before_load if wl_code == 'ERAW' else nl_keep_before_load

    def new_bucket(code):
        # The output file is named after the list code (e.g. 'EER.jsonl').
        return {'code': code, 'path': Path(save_path, f'{code}.jsonl'), 'mode': 'w+', 'entries': []}

    primary = new_bucket(lang1_wl_code)
    secondary = new_bucket(lang2_wl_code)
    other = new_bucket(other_code)
    error_lines = []

    with open(in_file, 'r', encoding='utf-8') as f:
        total_lines = count_lines_with_progress(in_file, quiet=True)
        for i, line in tqdm(enumerate(f), total=total_lines):
            if break_point > 0 and i > break_point:
                print(primary['entries'])
                break
            if not line or not keep_before_load(line):
                continue

            # Reset per line so the error handler never reports a stale entry
            # (or hits an unbound name when the very first line fails).
            obj = None
            try:
                # Inside the try so one malformed line is logged, not fatal.
                line = filter_translations_regex(line)
                obj = json.loads(line)
                if not obj:
                    continue
                filter_obj(obj)
                if not obj:
                    continue
                obj = sort_standardize_entry(obj)
                if not obj:
                    continue

                lang_code = obj.get('lang_code')
                if lang_code == lang1:
                    bucket = primary
                elif lang_code == lang2:
                    bucket = secondary
                else:
                    bucket = other

                obj['wl_code'] = bucket['code']
                bucket['entries'].append(obj)
                if len(bucket['entries']) > batch_size:
                    save_batch_to_file(bucket['entries'], bucket['path'], bucket['mode'])
                    bucket['mode'] = 'a'
                    bucket['entries'] = []
            except Exception as e:
                error_lines.append((i, obj))
                print(line)
                print("Error on line: ", i, " Error: ", e)

        # Write whatever is still buffered. The lists are kept (not reset)
        # because the first two are returned to the caller.
        for bucket in (primary, secondary, other):
            if bucket['entries']:
                save_batch_to_file(bucket['entries'], bucket['path'], bucket['mode'])

    if error_lines:
        print(f"{len(error_lines)} line(s) could not be processed and were skipped")

    return primary['entries'], secondary['entries']

In [21]:
# Dutch-edition run. The two returned lists are the final partial batches
# (already written to disk by process_raw_file).
# NOTE(review): reads 'nl-extract.jsonl' directly instead of NRAW_FILE, which
# names a different file; confirm which one is intended.
entries_batch, lang_2_lines = process_raw_file(
    in_file=Path(RAW_KAIKKI_DIR, 'nl', 'nl-extract.jsonl'),
    save_path=current_save_folder,
    wl_code='NRAW',
)

100%|██████████| 1052635/1052635 [01:25<00:00, 12254.75it/s]


In [22]:
# Preview only: the leftover batch can hold up to batch_size entries, which
# would flood the notebook output if displayed in full.
entries_batch[:5]

[{'word': 'glucosegehalten',
  'pos': 'noun',
  'lang_code': 'nl',
  'lang': 'dutch',
  'senses': [{'glosses': ['meervoud van het zelfstandig naamwoord glucosegehalte'],
    'categories': ['Zelfstandignaamwoordsvorm in het Nederlands'],
    'form_of': [{'word': 'glucosegehalte'}],
    'tags': ['form-of', 'plural']}],
  'categories': ['Ontbrekend geluid',
   'Retrograad van het Nederlands',
   'Woorden in het Nederlands',
   'Woorden in het Nederlands met audioweergave'],
  'hyphenations': [{'parts': ['glu', 'co', 'se', 'ge', 'hal', 'ten']}],
  'wl_code': 'NNR'},
 {'word': 'goalgetter',
  'pos': 'noun',
  'lang_code': 'nl',
  'lang': 'dutch',
  'senses': [{'glosses': ['voetballer die vaak doelpunten maakt'],
    'categories': ['Voetbal_in_het_Nederlands'],
    'topics': ['football']}],
  'forms': [{'form': 'goalgetters', 'tags': ['plural']}],
  'categories': ['Ontbrekend geluid',
   'Retrograad van het Nederlands',
   'Telbaar',
   'Woorden in het Nederlands',
   'Woorden in het Nederla

In [23]:
# English-edition run. This overwrites entries_batch / lang_2_lines from the
# Dutch run above with the English run's final partial batches.
entries_batch, lang_2_lines = process_raw_file(
    in_file=ERAW_FILE,
    save_path=current_save_folder,
    wl_code='ERAW',
)

100%|██████████| 10329308/10329308 [04:41<00:00, 36736.37it/s]


this should not have happened, pos


In [None]:
# Sort EER.jsonl into the EEF folder. Per the recorded progress output the
# helper works through 28 buckets: a-z, non_ascii and non_a_z.
sorted_eef = alpha_sort_large_file(
    Path(current_save_folder) / 'EER.jsonl',
    Path(current_save_folder) / 'EEF',
)

Splitting and reading lines: 100%|██████████| 1410397/1410397 [02:58<00:00, 7918.27it/s] 
sorting letters and writing to out file:   0%|          | 0/28 [00:00<?, ?it/s]

Now processing: a


sorting letters and writing to out file:   4%|▎         | 1/28 [00:08<03:58,  8.82s/it]

Now processing: b


sorting letters and writing to out file:   7%|▋         | 2/28 [00:20<04:31, 10.44s/it]

Now processing: c


sorting letters and writing to out file:  11%|█         | 3/28 [00:32<04:36, 11.05s/it]

Now processing: d


sorting letters and writing to out file:  14%|█▍        | 4/28 [00:40<03:59,  9.97s/it]

Now processing: e


sorting letters and writing to out file:  18%|█▊        | 5/28 [00:44<02:58,  7.75s/it]

Now processing: f


sorting letters and writing to out file:  21%|██▏       | 6/28 [00:50<02:36,  7.10s/it]

Now processing: g


sorting letters and writing to out file:  25%|██▌       | 7/28 [00:55<02:20,  6.68s/it]

Now processing: h


sorting letters and writing to out file:  29%|██▊       | 8/28 [01:02<02:12,  6.61s/it]

Now processing: i


sorting letters and writing to out file:  32%|███▏      | 9/28 [01:07<01:55,  6.09s/it]

Now processing: j


sorting letters and writing to out file:  36%|███▌      | 10/28 [01:08<01:23,  4.63s/it]

Now processing: k


sorting letters and writing to out file:  39%|███▉      | 11/28 [01:11<01:11,  4.21s/it]

Now processing: l


sorting letters and writing to out file:  43%|████▎     | 12/28 [01:17<01:12,  4.53s/it]

Now processing: m


sorting letters and writing to out file:  46%|████▋     | 13/28 [01:25<01:26,  5.77s/it]

Now processing: n


sorting letters and writing to out file:  50%|█████     | 14/28 [01:31<01:20,  5.76s/it]

Now processing: o


sorting letters and writing to out file:  54%|█████▎    | 15/28 [01:34<01:04,  4.98s/it]

Now processing: p


sorting letters and writing to out file:  57%|█████▋    | 16/28 [01:46<01:24,  7.01s/it]

Now processing: q


sorting letters and writing to out file:  61%|██████    | 17/28 [01:48<01:01,  5.55s/it]

Now processing: r


sorting letters and writing to out file:  64%|██████▍   | 18/28 [01:55<00:57,  5.79s/it]

Now processing: s


sorting letters and writing to out file:  68%|██████▊   | 19/28 [02:10<01:18,  8.68s/it]

Now processing: t


sorting letters and writing to out file:  71%|███████▏  | 20/28 [02:19<01:10,  8.83s/it]

Now processing: u


sorting letters and writing to out file:  75%|███████▌  | 21/28 [02:22<00:50,  7.15s/it]

Now processing: v


sorting letters and writing to out file:  79%|███████▊  | 22/28 [02:25<00:35,  5.94s/it]

Now processing: w


sorting letters and writing to out file:  82%|████████▏ | 23/28 [02:30<00:27,  5.52s/it]

Now processing: x


sorting letters and writing to out file:  86%|████████▌ | 24/28 [02:30<00:15,  3.99s/it]

Now processing: y


sorting letters and writing to out file:  89%|████████▉ | 25/28 [02:31<00:08,  2.99s/it]

Now processing: z


sorting letters and writing to out file:  93%|█████████▎| 26/28 [02:32<00:04,  2.27s/it]

Now processing: non_ascii
Now processing: non_a_z


sorting letters and writing to out file: 100%|██████████| 28/28 [02:32<00:00,  5.45s/it]


In [None]:
# Sort ENR.jsonl into the ENF folder. Per the recorded progress output the
# helper works through 28 buckets: a-z, non_ascii and non_a_z.
sorted_enf = alpha_sort_large_file(
    Path(current_save_folder) / 'ENR.jsonl',
    Path(current_save_folder) / 'ENF',
)

Splitting and reading lines: 100%|██████████| 140103/140103 [00:22<00:00, 6112.44it/s] 
sorting letters and writing to out file:   0%|          | 0/28 [00:00<?, ?it/s]

Now processing: a


sorting letters and writing to out file:   4%|▎         | 1/28 [00:02<01:00,  2.23s/it]

Now processing: b


sorting letters and writing to out file:   7%|▋         | 2/28 [00:03<00:49,  1.90s/it]

Now processing: c


sorting letters and writing to out file:  11%|█         | 3/28 [00:04<00:31,  1.26s/it]

Now processing: d


sorting letters and writing to out file:  14%|█▍        | 4/28 [00:05<00:27,  1.13s/it]

Now processing: e


sorting letters and writing to out file:  18%|█▊        | 5/28 [00:07<00:30,  1.34s/it]

Now processing: f


sorting letters and writing to out file:  21%|██▏       | 6/28 [00:07<00:21,  1.01it/s]

Now processing: g


sorting letters and writing to out file:  25%|██▌       | 7/28 [00:08<00:20,  1.01it/s]

Now processing: h


sorting letters and writing to out file:  29%|██▊       | 8/28 [00:09<00:18,  1.09it/s]

Now processing: i


sorting letters and writing to out file:  36%|███▌      | 10/28 [00:09<00:10,  1.73it/s]

Now processing: j
Now processing: k


sorting letters and writing to out file:  39%|███▉      | 11/28 [00:10<00:11,  1.47it/s]

Now processing: l


sorting letters and writing to out file:  43%|████▎     | 12/28 [00:12<00:16,  1.02s/it]

Now processing: m


sorting letters and writing to out file:  46%|████▋     | 13/28 [00:13<00:13,  1.08it/s]

Now processing: n


sorting letters and writing to out file:  50%|█████     | 14/28 [00:13<00:10,  1.30it/s]

Now processing: o


sorting letters and writing to out file:  54%|█████▎    | 15/28 [00:14<00:11,  1.12it/s]

Now processing: p


sorting letters and writing to out file:  57%|█████▋    | 16/28 [00:15<00:10,  1.17it/s]

Now processing: q
Now processing: r


sorting letters and writing to out file:  64%|██████▍   | 18/28 [00:17<00:08,  1.18it/s]

Now processing: s


sorting letters and writing to out file:  68%|██████▊   | 19/28 [00:18<00:08,  1.01it/s]

Now processing: t


sorting letters and writing to out file:  71%|███████▏  | 20/28 [00:19<00:07,  1.08it/s]

Now processing: u


sorting letters and writing to out file:  75%|███████▌  | 21/28 [00:19<00:05,  1.31it/s]

Now processing: v


sorting letters and writing to out file:  79%|███████▊  | 22/28 [00:22<00:07,  1.25s/it]

Now processing: w


sorting letters and writing to out file:  82%|████████▏ | 23/28 [00:22<00:05,  1.09s/it]

Now processing: x
Now processing: y
Now processing: z


sorting letters and writing to out file: 100%|██████████| 28/28 [00:23<00:00,  1.19it/s]


Now processing: non_ascii
Now processing: non_a_z


In [None]:
# Sort NNR.jsonl into the NNF folder. Per the recorded progress output the
# helper works through 28 buckets: a-z, non_ascii and non_a_z.
sorted_nnf = alpha_sort_large_file(
    Path(current_save_folder) / 'NNR.jsonl',
    Path(current_save_folder) / 'NNF',
)

Splitting and reading lines: 100%|██████████| 598925/598925 [01:19<00:00, 7571.43it/s] 
sorting letters and writing to out file:   0%|          | 0/28 [00:00<?, ?it/s]

Now processing: a


sorting letters and writing to out file:   4%|▎         | 1/28 [00:05<02:27,  5.45s/it]

Now processing: b


sorting letters and writing to out file:   7%|▋         | 2/28 [00:11<02:37,  6.04s/it]

Now processing: c


sorting letters and writing to out file:  11%|█         | 3/28 [00:13<01:43,  4.16s/it]

Now processing: d


sorting letters and writing to out file:  14%|█▍        | 4/28 [00:17<01:37,  4.06s/it]

Now processing: e


sorting letters and writing to out file:  18%|█▊        | 5/28 [00:19<01:18,  3.40s/it]

Now processing: f


sorting letters and writing to out file:  21%|██▏       | 6/28 [00:21<00:57,  2.62s/it]

Now processing: g


sorting letters and writing to out file:  25%|██▌       | 7/28 [00:25<01:07,  3.21s/it]

Now processing: h


sorting letters and writing to out file:  29%|██▊       | 8/28 [00:27<00:57,  2.86s/it]

Now processing: i


sorting letters and writing to out file:  32%|███▏      | 9/28 [00:30<00:54,  2.86s/it]

Now processing: j


sorting letters and writing to out file:  36%|███▌      | 10/28 [00:30<00:37,  2.11s/it]

Now processing: k


sorting letters and writing to out file:  39%|███▉      | 11/28 [00:33<00:39,  2.35s/it]

Now processing: l


sorting letters and writing to out file:  43%|████▎     | 12/28 [00:36<00:40,  2.55s/it]

Now processing: m


sorting letters and writing to out file:  46%|████▋     | 13/28 [00:38<00:35,  2.40s/it]

Now processing: n


sorting letters and writing to out file:  50%|█████     | 14/28 [00:39<00:27,  2.00s/it]

Now processing: o


sorting letters and writing to out file:  54%|█████▎    | 15/28 [00:44<00:35,  2.72s/it]

Now processing: p


sorting letters and writing to out file:  57%|█████▋    | 16/28 [00:48<00:37,  3.09s/it]

Now processing: q


sorting letters and writing to out file:  61%|██████    | 17/28 [00:48<00:24,  2.23s/it]

Now processing: r


sorting letters and writing to out file:  64%|██████▍   | 18/28 [00:50<00:21,  2.16s/it]

Now processing: s


sorting letters and writing to out file:  68%|██████▊   | 19/28 [00:57<00:32,  3.57s/it]

Now processing: t


sorting letters and writing to out file:  71%|███████▏  | 20/28 [00:59<00:25,  3.20s/it]

Now processing: u


sorting letters and writing to out file:  75%|███████▌  | 21/28 [01:00<00:17,  2.53s/it]

Now processing: v


sorting letters and writing to out file:  79%|███████▊  | 22/28 [01:05<00:19,  3.29s/it]

Now processing: w


sorting letters and writing to out file:  86%|████████▌ | 24/28 [01:09<00:09,  2.43s/it]

Now processing: x
Now processing: y
Now processing: z


sorting letters and writing to out file: 100%|██████████| 28/28 [01:11<00:00,  2.55s/it]


Now processing: non_ascii
Now processing: non_a_z


In [28]:
# Sort NER.jsonl into the NEF folder. Per the recorded progress output the
# helper works through 28 buckets: a-z, non_ascii and non_a_z.
sorted_nef = alpha_sort_large_file(
    Path(current_save_folder) / 'NER.jsonl',
    Path(current_save_folder) / 'NEF',
)

Splitting and reading lines: 100%|██████████| 16331/16331 [00:01<00:00, 12256.23it/s]
sorting letters and writing to out file:   7%|▋         | 2/28 [00:00<00:01, 13.71it/s]

Now processing: a
Now processing: b
Now processing: c


sorting letters and writing to out file:  25%|██▌       | 7/28 [00:00<00:01, 18.35it/s]

Now processing: d
Now processing: e
Now processing: f
Now processing: g
Now processing: h


sorting letters and writing to out file:  46%|████▋     | 13/28 [00:00<00:00, 24.08it/s]

Now processing: i
Now processing: j
Now processing: k
Now processing: l
Now processing: m
Now processing: n
Now processing: o


sorting letters and writing to out file:  57%|█████▋    | 16/28 [00:00<00:00, 23.46it/s]

Now processing: p
Now processing: q
Now processing: r
Now processing: s


sorting letters and writing to out file: 100%|██████████| 28/28 [00:01<00:00, 24.83it/s]


Now processing: t
Now processing: u
Now processing: v
Now processing: w
Now processing: x
Now processing: y
Now processing: z
Now processing: non_ascii
Now processing: non_a_z


In [None]:
# Add entry ids to the sorted EEF entries (the recorded output shows ids such
# as 'EEF_1400014'). temp_eef is used later as the source path for
# shutil.copy; `batch` is overwritten by each following add_entry_ids cell.
batch, temp_eef = add_entry_ids(sorted_eef)

1410397it [02:08, 10953.19it/s]


[{'entry_id': 'EEF_1400014',
  'word': 'yoozh',
  'pos': 'noun',
  'lang_code': 'en',
  'lang': 'english',
  'senses': [{'glosses': ['Alternative form of uzhe; Clipping of usual.'],
    'categories': ['English clippings'],
    'alt_of': [{'word': 'uzhe', 'extra': 'Clipping of usual'}],
    'links': [['uzhe', 'uzhe#English'], ['usual', 'usual#English']],
    'tags': ['alt-of', 'alternative']}],
  'forms': [{'form': 'yoozhes', 'tags': ['plural']}],
  'categories': ['English adjectives',
   'English countable nouns',
   'English entries with incorrect language header',
   'English lemmas',
   'English nouns',
   'Pages with 1 entry',
   'Pages with entries'],
  'head_templates': [{'name': 'en-noun',
    'args': {},
    'expansion': 'yoozh (plural yoozhes)'}],
  'wl_code': 'EER'},
 {'entry_id': 'EEF_1400015',
  'word': 'yoozhe',
  'pos': 'adj',
  'lang_code': 'en',
  'lang': 'english',
  'senses': [{'glosses': ['Alternative form of uzhe; Clipping of usual.'],
    'categories': ['English cl

In [None]:
# Add entry ids to the sorted ENF entries (the recorded output shows ids such
# as 'ENF_100001'). temp_enf is used later as the source path for
# shutil.copy; `batch` is overwritten by each following add_entry_ids cell.
batch, temp_enf = add_entry_ids(sorted_enf)

140103it [00:17, 7854.42it/s] 


[{'entry_id': 'ENF_100001',
  'word': 'sabra',
  'pos': 'noun',
  'lang_code': 'nl',
  'lang': 'dutch',
  'senses': [{'glosses': ['Sabra (native-born Israeli)'],
    'categories': ['Dutch entries with incorrect language header',
     'Dutch lemmas',
     'Dutch masculine nouns',
     'Dutch nouns',
     'Dutch nouns with plural in -s',
     'Dutch terms derived from Hebrew',
     'Dutch terms with quotations',
     'Pages with 6 entries',
     'Pages with entries'],
    'links': [['Sabra', 'Sabra']],
    'tags': ['masculine']}],
  'forms': [{'form': "sabra's", 'tags': ['plural']},
   {'form': 'sabraatje', 'tags': ['diminutive', 'neuter']}],
  'etymology_templates': [{'name': 'der',
    'args': {'1': 'nl',
     '2': 'he',
     '3': 'צַבָּר',
     '4': '',
     '5': 'prickly pear cactus; Sabra',
     'tr': 'tsabár'},
    'expansion': 'Hebrew צַבָּר (tsabár, “prickly pear cactus; Sabra”)'}],
  'etymology_text': 'Ultimately from Hebrew צַבָּר (tsabár, “prickly pear cactus; Sabra”).',
  'ca

In [None]:
# Add entry ids to the sorted NNF entries (the recorded output shows ids such
# as 'NNF_500005'). temp_nnf is used later as the source path for
# shutil.copy; `batch` is overwritten by the following add_entry_ids cell.
batch, temp_nnf = add_entry_ids(sorted_nnf)

598925it [00:53, 11097.90it/s]


[{'entry_id': 'NNF_500005',
  'word': 'toetseninstrument',
  'pos': 'noun',
  'lang_code': 'nl',
  'lang': 'dutch',
  'senses': [{'glosses': ['muziekinstrument dat is voorzien van een of meer toetsenborden om de gewenste tonen tot klinken te brengen'],
    'categories': ['Muziekinstrument_in_het_Nederlands'],
    'topics': ['music']}],
  'forms': [{'form': 'toetseninstrumenten', 'tags': ['plural']}],
  'synonyms': [{'word': 'toetsinstrument'}],
  'categories': ['Invoegsel -en- in het Nederlands',
   'Ontbrekend geluid',
   'Retrograad van het Nederlands',
   'Samenstelling in het Nederlands',
   'Telbaar',
   'WikiWoordenboek:Sjabloon Link onbruikbare woordsoort in het Nederlands',
   'Woorden in het Nederlands',
   'Woorden in het Nederlands met IPA-weergave',
   'Woorden in het Nederlands met audioweergave',
   'Woorden met 5 lettergrepen in het Nederlands',
   'Zelfstandig naamwoord in het Nederlands'],
  'etymology_texts': ['samenstelling van toets en instrument zn met het invoegse

In [None]:
# Add entry ids to the sorted NEF entries (the recorded output shows ids such
# as 'NEF_0'). temp_nef is used later as the source path for shutil.copy.
# NOTE(review): the recorded run reports 16343 items here but 16331 lines
# when NER.jsonl was sorted; worth confirming where the difference comes from.
batch, temp_nef = add_entry_ids(sorted_nef)

16343it [00:00, 76972.19it/s]


[{'entry_id': 'NEF_0',
  'word': 'A',
  'pos': 'noun',
  'lang_code': 'en',
  'lang': 'english',
  'senses': [{'glosses': ['de toon “a”'],
    'categories': ['Muziek_in_het_Engels'],
    'topics': ['music']}],
  'derived': [{'word': 'A flat'},
   {'word': 'A major'},
   {'word': 'A minor'},
   {'word': 'A sharp'}],
  'categories': ['Woorden in het Engels',
   'Woorden in het Engels met audioweergave',
   'Zelfstandig naamwoord in het Engels'],
  'related': [{'word': 'B'}, {'word': 'G'}],
  'wl_code': 'NER'},
 {'entry_id': 'NEF_1',
  'word': 'A double flat',
  'pos': 'noun',
  'lang_code': 'en',
  'lang': 'english',
  'senses': [{'glosses': ['de toon “ases”, een verlaagde “as”'],
    'categories': ['Muziek_in_het_Engels'],
    'topics': ['music']}],
  'derived': [{'word': 'A double flat major'}],
  'categories': ['Woorden in het Engels',
   'Woorden in het Engels met audioweergave',
   'Zelfstandig naamwoord in het Engels'],
  'related': [{'word': 'B double flat'}, {'word': 'G double fl

In [None]:
import shutil

# Publish the id-tagged files from the dated run folder to their stable
# locations under WIKT_CLEANING_DIR. shutil.copy does not create missing
# parent folders, so make sure each destination folder exists first.
for wl_folder in ('EEF', 'ENF', 'NNF', 'NEF'):
    Path(WIKT_CLEANING_DIR, wl_folder).mkdir(parents=True, exist_ok=True)

EEF_FILE = shutil.copy(temp_eef, Path(WIKT_CLEANING_DIR, 'EEF', 'EEF.jsonl'))
ENF_FILE = shutil.copy(temp_enf, Path(WIKT_CLEANING_DIR, 'ENF', 'ENF.jsonl'))
NNF_FILE = shutil.copy(temp_nnf, Path(WIKT_CLEANING_DIR, 'NNF', 'NNF.jsonl'))
NEF_FILE = shutil.copy(temp_nef, Path(WIKT_CLEANING_DIR, 'NEF', 'NEF.jsonl'))

WindowsPath('C:/Users/elise/SynologyDrive/Dev/DutchAnalyzerPublic/DutchAnalyzer/data/interim/cleaning/wikt/NEF/NEF.jsonl')