In [1]:
import ujson
from dutchanalyzer.config import *
from dutchanalyzer.utils import *
from dutchanalyzer.json_utils import *
from pathlib import Path
from dotenv import load_dotenv
from io import StringIO
import datetime
import re
from pprint import pprint
import ast


### Paths

In [2]:
ERAW_FILE = Path(RAW_KAIKKI_DIR, 'en', 'kaikki_en-raw-wiktextract-data.jsonl') 
NRAW_FILE = Path(RAW_KAIKKI_DIR, 'nl', 'kaikki_nl-raw-extract.jsonl')

In [3]:
eng_save_path = Path(WIKT_PREPROCESSING_DIR, 'en')
nld_save_path = Path(WIKT_PREPROCESSING_DIR, 'nl')

### Shared Utilities

In [4]:
def has_cjk_or_arabic_fast(text: str, limit: int = 50) -> bool:
    """Return True if the first `limit` characters contain
    any Chinese, Japanese, Korean, or Arabic/Farsi character."""
    for ch in text[:limit]:
        cp = ord(ch)
        # CJK (Chinese/Japanese/Korean)
        if (
            0x4E00 <= cp <= 0x9FFF or  # CJK Unified Ideographs
            0x3400 <= cp <= 0x4DBF or  # CJK Ext A
            0xF900 <= cp <= 0xFAFF or  # CJK Compatibility
            0x3040 <= cp <= 0x30FF or  # Hiragana + Katakana
            0x31F0 <= cp <= 0x31FF or  # Katakana Extensions
            0xAC00 <= cp <= 0xD7AF or  # Hangul Syllables
            # Arabic / Farsi
            0x0600 <= cp <= 0x06FF or
            0x0750 <= cp <= 0x077F or
            0x08A0 <= cp <= 0x08FF or
            0xFB50 <= cp <= 0xFEFF
        ):
            return True  # stop immediately
    return False

In [6]:
def safe_dict(obj_str: str):
    if isinstance(obj_str, str):
        try:
            return ast.literal_eval(obj_str)
        except Exception:
            return ""       # fallback

## Process Raw NL File

### Intake JSON

In [5]:
# Paths
NL_lines_file = Path(nld_save_path, 'NLR.jsonl')
NNR_lines_file = Path(NNR_DIR, 'NNR.jsonl')
NER_lines_file = Path(NER_DIR, 'NER.jsonl')

In [None]:
def filter_en_translations_regex(obj_str: str):
    translations_pattern = r'"translations"\s*:\s*\[({.*?})\]'
    en_translation_pattern = r'\{[^{}]*?"lang"\s*:\s*"Engels"[^{}]*?\}'
    translations_block = re.compile(translations_pattern, re.DOTALL)
    en_object = re.compile(en_translation_pattern, re.DOTALL)
    match = translations_block.search(obj_str)
    
    while match is not None:
        start, end = match.span()
        m = match.group(0)
        dn = en_object.findall(m)
        if dn:
            dn = [safe_dict(x) for x in dn]
            str_dn = '[' + ', '.join(json.dumps(x) for x in dn) + ']'
            obj_str = obj_str[:start] + '"translations": ' + str_dn + obj_str[end:]
            match = translations_block.search(obj_str, start + len('"translations": ' + str(dn)))
        else:
            
            to_remove_end = end
            if end < len(obj_str) and obj_str[end] == ',':
                to_remove_end += 1
            obj_str = obj_str[:start] + obj_str[to_remove_end:]
            match = translations_block.search(obj_str, start)

    return obj_str

In [10]:
def nl_keep_before_load(line: str) -> bool:
    if '"lang_code": "en"' in line or '"lang": "Engels"' in line:
        return True
    if '"lang_code": "nl"' in line or '"lang": "Nederlands"' in line:
        return True
    if has_cjk_or_arabic_fast(line, 30):
        return False
    return False
    

In [None]:
def filter_nl_obj(obj: dict):
    code = obj.get('code', '') if not '' else obj.get('lang_code', '')
    if 'lang' in obj or code:
        lang = obj.get("lang", '')
        if lang in ['Engels', 'Nederlands'] or code in ['en', 'nl']:
            return obj
        
    if "translations" in obj:
        new_translations = []
        translations = obj["translations"]
        for t in translations:
            tcode = t.get('code', '') if not '' else t.get('lang_code', '')
            tlang = t.get('lang', '')

            if tlang in ['Engels'] or tcode in ['en']:
                new_translations.append(t)
            
        if new_translations:
            obj['translations'] = new_translations
            return obj
        
    if "senses" in obj:
        senses = obj['senses']
        new_translations = []
        
        if isinstance(senses, list):
            new_senses = []
            new_sense_translations = []
          
            for i, item in enumerate(senses):
                if 'translations' in item:
                    new_sense_translations = item['translations']
                    new_sense_translations = [x for x in new_sense_translations if x.get("code", '') == 'en' or x.get("lang", '') == "Engels"]
                    item['translations'] = new_sense_translations
                if new_sense_translations:
                    new_senses.append(item)
                
                new_sense_translations = []

            if new_senses:
                obj['senses'] = new_senses
                return obj
    return None

            

In [12]:
# The nl file is small enough to intake by readlinesbatch = []
error_lines = []
batch_size = 10000
file_path = NRAW_FILE
out_file = NL_lines_file
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
    with open(out_file, 'w+', encoding='utf-8', errors='ignore') as out:
        lines = f.readlines()

        for i, line in tqdm(enumerate(lines), total=len(lines)):
            
            if nl_keep_before_load(line):
                try:
                    loaded = json.loads(line)
                    if loaded:
                        obj = filter_nl_obj(loaded)
                        if obj:
                            batch.append(obj)
                            if len(batch) >= batch_size:
                                for obj in batch:
                                    json.dump(obj, out, ensure_ascii=False)
                                    out.write('\n')
                                    batch = [] 
                except Exception as e:
                    print(f"Error on line: {i}")
                    print(e)
                    break
        if batch:
            for obj in batch:
                json.dump(obj, out, ensure_ascii=False)
                out.write('\n')
                batch = [] 

100%|██████████| 1050145/1050145 [01:14<00:00, 14089.57it/s]


In [None]:
other = []
ner_lines = []
nnr_lines = []
with open(NL_lines_file, 'r', encoding='utf-8') as f:
    with open(NNR_lines_file, 'w+', encoding='utf-8') as out:
        lines = f.readlines()
        print(len(lines))
        for line in tqdm(lines, desc='Sorting lines'):
            loaded = json.loads(line)
            code = loaded.get('lang_code')
            if not code:
                loaded.get('code')
            
            if code == 'nl' or loaded.get('lang', '') == 'Nederlands':
                nnr_lines.append(loaded)
            elif code == 'en' or loaded.get('lang', '') == 'Engels':
                ner_lines.append(loaded)
            else:
                other.append(loaded)
        if nnr_lines:
            print('Dutch defs: ', len(nnr_lines))
            for obj in tqdm(nnr_lines, desc='Saving NNR lines'):
                json.dump(obj, out, ensure_ascii=False)
                out.write('\n')


628905


Sorting lines: 100%|██████████| 628905/628905 [01:10<00:00, 8928.51it/s]  


611444


Saving NNR lines: 100%|██████████| 611444/611444 [00:51<00:00, 11895.92it/s]


17441


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\elise\\SynologyDrive\\Dev\\DutchAnalyzerPublic\\DutchAnalyzer\\data\\interim\\preprocessing\\wikt\\nl\\NER\\NER.jsonl'

In [24]:
if ner_lines:
    print('English defs: ', len(ner_lines))
    with open(NER_lines_file, 'w+',encoding='utf-8') as out:
        for obj in tqdm(ner_lines, desc='Saving NER lines'):
            json.dump(obj, out, ensure_ascii=False)
            out.write('\n')
if other:
    print('Other lines:', len(other))
    display(other)


English defs:  17441


Saving NER lines: 100%|██████████| 17441/17441 [00:00<00:00, 25153.37it/s]

Other lines: 20





[{'word': 'Aldenarda',
  'lang_code': 'la',
  'lang': 'Latijn',
  'pos': 'name',
  'pos_title': 'Eigennaam',
  'senses': [{'glosses': ['een stad in de Belgische provincie Oost-Vlaanderen'],
    'tags': ['toponymic'],
    'categories': ['Aardrijkskunde van België_in_het_Latijn']}],
  'categories': ['Eigennaam in het Latijn',
   'Woorden in het Latijn',
   'Woorden in het Latijn met IPA-weergave',
   'Woorden in het Latijn met audioweergave'],
  'sounds': [{'audio': 'la-Aldenarda.ogg',
    'ogg_url': 'https://commons.wikimedia.org/wiki/Special:FilePath/la-Aldenarda.ogg',
    'mp3_url': 'https://upload.wikimedia.org/wikipedia/commons/transcoded/2/2e/La-Aldenarda.ogg/La-Aldenarda.ogg.mp3'},
   {'ipa': '[ɑldɛnɑɾdɑ]'}],
  'translations': [{'lang_code': 'nl',
    'lang': 'Nederlands',
    'word': 'Oudenaarde',
    'sense': 'een stad in de Belgische provincie Oost-Vlaanderen',
    'sense_index': 1}]},
 {'word': 'Alostum',
  'lang_code': 'la',
  'lang': 'Latijn',
  'pos': 'name',
  'pos_title':

## Process Raw EN File

In [7]:
total_en_lines = count_lines_with_progress(ERAW_FILE)
print(total_en_lines)

Counting Lines: 100%|██████████| 21.3G/21.3G [00:14<00:00, 1.45GB/s]

10329308





In [309]:
previous_save_path = Path(WIKT_PREPROCESSING_DIR, '07-11-25')

In [310]:
current_save_path = Path(WIKT_PREPROCESSING_DIR, '09-11-25')
EER_save_path = Path(current_save_path, 'EER')
ENR_save_path = Path(current_save_path, 'ENR')

Path.mkdir(current_save_path, exist_ok=True)
Path.mkdir(EER_save_path, exist_ok=True)
Path.mkdir(ENR_save_path, exist_ok=True)

### Utilities

In [None]:
def keep_obj(obj: dict) -> bool:
    if "lang_code" not in obj:
        return False
    if obj["lang_code"] not in ["nl", "en"]:
        return False
    return True

def keep_obj_before_load(obj_str: str) -> bool:
    if has_cjk_or_arabic_fast(obj_str):
        return False
    
    if obj_str.find('"lang_code": "en"') == -1 and obj_str.find('"lang_code": "nl"') == -1:
        return False
    return True
        

### Filter json translations and translation related categories 

In [None]:

        
def filter_translations_regex(obj_str: str):
    translations_pattern = r'"translations"\s*:\s*\[({.*?})\]'
    dutch_translation_pattern = r'\{[^{}]*?"lang"\s*:\s*"Dutch"[^{}]*?\}'
    translations_block = re.compile(translations_pattern, re.DOTALL)
    dutch_object = re.compile(dutch_translation_pattern, re.DOTALL)
    match = translations_block.search(obj_str)
    
    while match is not None:
        start, end = match.span()
        m = match.group(0)
        dn = dutch_object.findall(m)
        if dn:
            dn = [safe_dict(x) for x in dn]
            str_dn = '[' + ', '.join(json.dumps(x) for x in dn) + ']'
            obj_str = obj_str[:start] + '"translations": ' + str_dn + obj_str[end:]
            match = translations_block.search(obj_str, start + len('"translations": ' + str(dn)))
        else:
            
            to_remove_end = end
            if end < len(obj_str) and obj_str[end] == ',':
                to_remove_end += 1
            obj_str = obj_str[:start] + obj_str[to_remove_end:]
            match = translations_block.search(obj_str, start)

    return obj_str

In [None]:
def filter_line(obj_str: str) -> str:
    obj_str = filter_translations_regex(obj_str)
    return obj_str

In [297]:
def filter_obj(obj: dict):
    if "categories" in obj:
        categories = obj["categories"]
        new_categories = []
        if isinstance(categories, list):
            for i, cat in enumerate(categories):
                if isinstance(cat, str):
                    if cat not in ["Terms with Dutch translations", "Terms with English translations"]:
                        if cat.startswith('Terms with') and cat.endswith('translations'):
                            categories.remove(cat)
                        else:
                            new_categories.append(cat)
        obj["categories"] = new_categories


In [None]:
temp_output_file = Path(previous_save_path, "ERAW","ERAW_filtered.jsonl")
batch_size = 100000
error_lines = []
from pprint import pp, pprint

with open(temp_output_file, "w+", encoding='utf-8') as output_file:
    with open(ERAW_FILE, "r", encoding='utf-8') as f:
        batch_list = []
        for i, line in tqdm(enumerate(f), total=total_en_lines):
            
            try:
                if not keep_obj_before_load(line):
                    continue
               
                line = filter_line(line)
                try:
                    obj = json.loads(line)
                
                    if not keep_obj(obj):
                        continue
                    
                    filter_obj(obj)
                    batch_list.append(obj)

                    if i % batch_size == 0 and i > 0:
                        print(f"Writing batch at line {i}")
                        for obj in batch_list:
                            json.dump(obj, output_file, ensure_ascii=False)
                            output_file.write("\n")
                        batch_list = []
                      
                except Exception as e1:
                    error_lines.append((i, line))
                    print(f"Error parsing JSON on line {i} ", e1)
            except Exception as e:

                print(f.tell())
                raise e
        for obj in batch_list:
            json.dump(obj, output_file, ensure_ascii=False)
            output_file.write("\n")  

  2%|▏         | 198744/10329308 [00:12<09:37, 17549.80it/s]

Writing batch at line 200000


  5%|▍         | 499327/10329308 [00:37<04:43, 34667.23it/s] 

Writing batch at line 500000


  8%|▊         | 798897/10329308 [00:58<07:11, 22087.03it/s] 

Writing batch at line 800000


 36%|███▌      | 3696373/10329308 [01:59<01:48, 61089.80it/s]

Writing batch at line 3700000


 39%|███▊      | 3994863/10329308 [02:30<02:06, 50165.73it/s] 

Writing batch at line 4000000


 45%|████▌     | 4696406/10329308 [02:46<01:41, 55753.00it/s]

Writing batch at line 4700000


 47%|████▋     | 4894068/10329308 [02:54<01:44, 51807.65it/s]

Writing batch at line 4900000


 48%|████▊     | 4997325/10329308 [02:57<01:31, 58052.20it/s]

Writing batch at line 5000000


 52%|█████▏    | 5394869/10329308 [03:06<01:25, 57562.53it/s]

Writing batch at line 5400000


 63%|██████▎   | 6499098/10329308 [03:29<02:06, 30346.36it/s]

Writing batch at line 6500000


 67%|██████▋   | 6899223/10329308 [03:44<01:16, 44740.47it/s]

Writing batch at line 6900000


 68%|██████▊   | 6996343/10329308 [03:49<00:55, 60495.68it/s]

Writing batch at line 7000000


 69%|██████▊   | 7098445/10329308 [03:52<00:52, 61786.90it/s]

Writing batch at line 7100000


 70%|██████▉   | 7197619/10329308 [03:55<01:41, 30861.19it/s]

Writing batch at line 7200000


 74%|███████▎  | 7598012/10329308 [04:04<00:47, 57820.47it/s]

Writing batch at line 7600000


 76%|███████▌  | 7799599/10329308 [04:13<00:51, 49183.14it/s]

Writing batch at line 7800000


 79%|███████▉  | 8199321/10329308 [04:24<00:46, 46178.08it/s]

Writing batch at line 8200000


 83%|████████▎ | 8597171/10329308 [04:36<00:41, 41955.48it/s]

Writing batch at line 8600000


 84%|████████▍ | 8696683/10329308 [04:43<00:30, 54184.08it/s]

Writing batch at line 8700000


 85%|████████▌ | 8797659/10329308 [04:46<00:50, 30565.10it/s]

Writing batch at line 8800000


 89%|████████▉ | 9199152/10329308 [04:56<00:40, 28055.79it/s]

Writing batch at line 9200000


 95%|█████████▍| 9794810/10329308 [05:09<00:18, 29686.88it/s]

Writing batch at line 9800000


 98%|█████████▊| 10095882/10329308 [05:19<00:04, 55530.33it/s]

Writing batch at line 10100000


100%|██████████| 10329308/10329308 [05:27<00:00, 31527.04it/s]


### Split Json

In [312]:
ERAW_filtered_file = Path(previous_save_path, "ERAW","ERAW_filtered.jsonl")
count_lines_with_progress(ERAW_filtered_file)

Counting Lines: 100%|██████████| 1.84G/1.84G [00:01<00:00, 1.51GB/s]


1564786

In [313]:
EER_file = Path(EER_DIR, 'EER.jsonl')
ENR_file = Path(ENR_DIR, 'ENR.jsonl')

In [316]:
EER_batch = []
ENR_batch = []
batch_size = 10000
with open(ERAW_filtered_file, 'r', encoding='utf-8') as f:
    for line in tqdm(f, total=count_lines_with_progress(ERAW_filtered_file)):
        obj = json.loads(line)
        if obj["lang_code"] == "en":
            EER_batch.append(obj)
            if len(EER_batch) >= batch_size:
                with open(EER_file, 'a+', encoding='utf-8') as eer_f:
                    for oer in EER_batch:
                        json.dump(oer, eer_f, ensure_ascii=False)
                        eer_f.write("\n")
                EER_batch = []
        elif obj["lang_code"] == "nl":
            ENR_batch.append(obj)
            if len(ENR_batch) >= batch_size:
                with open(ENR_file, 'a+', encoding='utf-8') as enr_f:
                    for obj in ENR_batch:
                        json.dump(obj, enr_f, ensure_ascii=False)
                        enr_f.write("\n")
                ENR_batch = []
    if EER_batch:
        with open(EER_file, 'a+', encoding='utf-8') as eer_f:
            for oer in EER_batch:
                json.dump(oer, eer_f, ensure_ascii=False)
                eer_f.write("\n")
    if ENR_batch:
        with open(ENR_file, 'a+', encoding='utf-8') as enr_f:
            for obj in ENR_batch:
                json.dump(obj, enr_f, ensure_ascii=False)
                enr_f.write("\n")

Counting Lines: 100%|██████████| 1.84G/1.84G [00:01<00:00, 1.47GB/s]
100%|██████████| 1564786/1564786 [02:49<00:00, 9243.09it/s] 


In [321]:
print(count_lines_with_progress(EER_file))
print(count_lines_with_progress(ENR_file))

Counting Lines: 100%|██████████| 1.63G/1.63G [00:01<00:00, 1.52GB/s]


1423864


Counting Lines: 100%|██████████| 204M/204M [00:00<00:00, 1.53GB/s]

140922





In [325]:
2050/1425914

0.0014376743618479094

In [None]:
# language	gloss definitions	change	entries	change	gloss entries	change	form definitions	change	total definitions	change
# Total	        4964041		            9977204		    4392194		                6286693		                11250734	

# Dutch	        79363	        +350	141111	+503	66094	        +362	    78942	        +207	    158305	           +557
# English	    949129	        +3110	1425914	+6586	901398	        +4405	    713614	        +4162	    1662743	            +7272

Counting Lines: 100%|██████████| 204M/204M [00:00<00:00, 1.12GB/s]


140922

### Get Keys

In [67]:
def get_subkeys(line):
    structure_dict = {}
    line_type = type(line)
    if not line:
        return line_type
    elif isinstance(line, str):
        return str
    elif isinstance(line, int):
        return int
    elif isinstance(line, dict):
        for k, v in line.items():
            structure_dict[k] = get_subkeys(v)
    elif isinstance(line, list):
        subkeys_list = []
        substruct_count = []
        for i, v in enumerate(line):
            substruct = (dict, get_subkeys(v))
            if substruct not in subkeys_list:
                subkeys_list.append(substruct)
                substruct_count.append(1)
            else:
                index = subkeys_list.index(substruct)
                substruct_count[index] += 1
        return (f'unique:{len(subkeys_list)}', subkeys_list)
    else:
        print(line)
    return structure_dict

In [333]:
def get_subkeysV2(line):
    structure_dict = {}
    line_type = type(line)
    if not line:
        return line_type
    elif isinstance(line, str):
        return str
    elif isinstance(line, int):
        return int
    elif isinstance(line, dict):
        for k, v in line.items():
            structure_dict[k] = get_subkeysV2(v)
    elif isinstance(line, list):
        subkeys_list = []
        substruct_count = []
        line_tuple_list = []
        keys_set = set()
        for i, v in enumerate(line):
            substruct = get_subkeysV2(v)
            if isinstance(substruct, dict):
                for k in substruct.keys():
                    keys_set.update(k)
            if substruct not in subkeys_list:
                subkeys_list.append(substruct)
                
                substruct_count.append(1)
            else:
                index = subkeys_list.index(substruct)
                substruct_count[index] += 1
        return (keys_set, subkeys_list)
    else:
        print(line)
    return structure_dict

In [None]:
def print_subkeys(structure, depth=0):
    indent = '  ' * depth
    if isinstance(structure, dict):
        for key, value in structure.items():
            print(f"{indent}{key}:")
            recurse_subkeys(value, depth + 1)
    elif isinstance(structure, list):
        for i, item in enumerate(structure):
            print(f"{indent}- Item {i}:")
            recurse_subkeys(item, depth + 1)
    else:
        print(f"{indent}{structure}")

In [326]:
def batch_structure_analysis(file_path: Path, out_path: Path, batch_size: int = 100):
    batch_list = []
    with open(file_path, 'r', encoding='utf-8') as f:
        batch_counter = 0
        for line in f:
            subkeys = get_subkeysV2(obj)
            
            display(subkeys)
            batch_counter += 1
            if batch_counter >= batch_size:
                break

In [338]:
from tqdm import trange

def get_subkeys_from_list(list_dict_items):
    template_dict = {}
    keys = set()
    key_type_set = set()
    for i, item in enumerate(tqdm(list_dict_items)):
        if isinstance(item, dict):
            for k in item.keys():
                keys.add(k)
                key_type_set.add((k, type(item[k])))
        else:
            keys.add(type(item))
            key_type_set.add((k, item))
    return key_type_set

In [None]:
def compare_structures(template_dict, new_dict):
    template_keys = template_dict.keys()
    template_values = template_dict.values()
    if template_dict == new_dict:
        return template_dict
    
    for k, v in new_dict:
        if k not in template_dict:
            template_dict[k] = new_dict[k]
        else:
            if isinstance(v, list):
                pass
    
    # return structure_dict

In [None]:
import typing
from collections import Counter
structures = []
bad_lines = []
counts_index = 3

with open(ENR_file, 'r', encoding='utf-8',errors='ignore') as f:
    lines = f.readlines()
    template_structure = json.loads(lines[0])
    for line in tqdm(lines, total=len(lines)):
        try:
            loaded = json.loads(line)
            keys = loaded.keys()

            lines.append(loaded)

            d = get_subkeys(loaded)
            
            structures.append(d)
        except Exception as e:
            bad_lines.append(line)
        

281844it [00:22, 12698.46it/s]                            


In [332]:
display(structures[0:10])

[{'senses': ('unique:1',
   [(dict,
     {'links': ('unique:1', [(dict, ('unique:1', [(dict, str)]))]),
      'synonyms': ('unique:1', [(dict, {'word': str})]),
      'glosses': ('unique:1', [(dict, str)]),
      'tags': ('unique:1', [(dict, str)]),
      'categories': ('unique:1', [(dict, str)]),
      'wikipedia': ('unique:1', [(dict, str)])})]),
  'pos': str,
  'head_templates': ('unique:1',
   [(dict,
     {'name': str,
      'args': {'1': str, '2': str, '3': str},
      'expansion': str})]),
  'forms': ('unique:1',
   [(dict, {'form': str, 'tags': ('unique:1', [(dict, str)])})]),
  'derived': ('unique:1', [(dict, {'word': str})]),
  'descendants': ('unique:2',
   [(dict, {'lang': str, 'lang_code': str, 'word': str}),
    (dict,
     {'lang': str,
      'lang_code': str,
      'word': str,
      'tags': ('unique:1', [(dict, str)])})]),
  'sounds': ('unique:2',
   [(dict, {'ipa': str}),
    (dict, {'audio': str, 'ogg_url': str, 'mp3_url': str})]),
  'hyphenations': ('unique:1',
   [

### Get definitions and translations