In [None]:
import pandas as pd
import numpy as np
import ujson
from dutchanalyzer.config import *
from dutchanalyzer.utils import *
from dutchanalyzer.json_utils import *
from pathlib import Path
from dotenv import load_dotenv
from io import StringIO
import datetime
import re
from pprint import pprint
import ast


### Paths

In [58]:
ERAW_FILE = Path(RAW_KAIKKI_DIR, 'en', 'kaikki_en-raw-wiktextract-data.jsonl') 
NRAW_FILE = Path(RAW_KAIKKI_DIR, 'nl', 'kaikki_nl-raw-extract.jsonl')

In [59]:
eng_save_path = Path(WIKT_PREPROCESSING_DIR, 'en')
nld_save_path = Path(WIKT_PREPROCESSING_DIR, 'nl')

## Process Raw NL File

### Intake JSON

In [6]:
# Paths
NNR_lines_file = Path(NNR_DIR, 'NNR_lines.json')
NER_lines_file = Path(NER_DIR, 'NER_lines.json')

In [5]:
# The nl file is small enough to intake by readlines
nl_lines = []
file_path = NRAW_FILE
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
    nl_lines = f.readlines()

In [6]:
nl_word_lines = []
nl_non_word_lines = []
for line in nl_lines:
    if line[0:7] == '{"word"':
        nl_word_lines.append(line)
    else:
        nl_non_word_lines.append(line)

In [7]:
NNR_lines = []
NER_lines = []
other_lines = []

for line in nl_word_lines:
    if '"lang": "Engels"' in line[0:100]:
        NER_lines.append(line)
    elif '"lang_code": "en"' in line[0:100]:
        NER_lines.append(line)
    elif '"lang_code": "nl"' in line[0:100]:
        NNR_lines.append(line)
    elif '"lang": "Nederlands"' in line[0:100]:
        NNR_lines.append(line)
    elif '"lang_code":' not in line[0:100]:
        other_lines.append(line)

In [None]:
with open(NNR_lines_file, 'w', encoding='utf-8') as f:
    for line in NNR_lines:
        f.write(line)


In [None]:
with open(NER_lines_file, 'w', encoding='utf-8') as f:
    for line in NER_lines:
        f.write(line)

## Process Raw EN File

In [7]:
total_en_lines = count_lines_with_progress(ERAW_FILE)
print(total_en_lines)

Counting Lines: 100%|██████████| 21.3G/21.3G [00:14<00:00, 1.45GB/s]

10329308





In [60]:
current_save_path = Path(WIKT_PREPROCESSING_DIR, '07-11-25')
chunks_dir = Path(current_save_path, 'en', 'chunks2')

In [65]:
from collections import Counter
def make_structure_line_tuple(key, line):
    size = 0
    if line:
        size = len(line)
        obj_type = type(line)
        if size == 0:
            return (key, obj_type, 0, 0)
        counts = Counter()
        typecounts = Counter(type(x).__name__ for x in line)
        if isinstance(line, dict):
            items = line.items()
            return (key, dict, size, typecounts) 
        elif isinstance(line, list):
            return (key, list, size, typecounts)
        elif isinstance(line, str):
            try:
                line = json.loads(line)
                if isinstance(line, str):
                    return (key, str, size, typecounts)
            except:
                
                return (key, str, size, typecounts)
    return None

In [187]:
def has_cjk_or_arabic_fast(text: str, limit: int = 50) -> bool:
    """Return True if the first `limit` characters contain
    any Chinese, Japanese, Korean, or Arabic/Farsi character."""
    for ch in text[:limit]:
        cp = ord(ch)
        # CJK (Chinese/Japanese/Korean)
        if (
            0x4E00 <= cp <= 0x9FFF or  # CJK Unified Ideographs
            0x3400 <= cp <= 0x4DBF or  # CJK Ext A
            0xF900 <= cp <= 0xFAFF or  # CJK Compatibility
            0x3040 <= cp <= 0x30FF or  # Hiragana + Katakana
            0x31F0 <= cp <= 0x31FF or  # Katakana Extensions
            0xAC00 <= cp <= 0xD7AF or  # Hangul Syllables
            # Arabic / Farsi
            0x0600 <= cp <= 0x06FF or
            0x0750 <= cp <= 0x077F or
            0x08A0 <= cp <= 0x08FF or
            0xFB50 <= cp <= 0xFEFF
        ):
            return True  # stop immediately
    return False

In [287]:
from numpy import isin


def keep_obj(obj: dict) -> bool:
    if "lang_code" not in obj:
        return False
    if obj["lang_code"] not in ["nl", "en"]:
        return False
    return True

def keep_obj_before_load(obj_str: str) -> bool:
    if has_cjk_or_arabic_fast(obj_str):
        return False
    
    if obj_str.find('"lang_code": "en"') == -1 and obj_str.find('"lang_code": "nl"') == -1:
        return False
    
    
    return True
        

In [67]:
def get_subkeys(line):
    structure_dict = {}
    line_type = type(line)
    if not line:
        return line_type
    elif isinstance(line, str):
        return str
    elif isinstance(line, int):
        return int
    elif isinstance(line, dict):
        for k, v in line.items():
            structure_dict[k] = get_subkeys(v)
    elif isinstance(line, list):
        subkeys_list = []
        substruct_count = []
        for i, v in enumerate(line):
            substruct = (dict, get_subkeys(v))
            if substruct not in subkeys_list:
                subkeys_list.append(substruct)
                substruct_count.append(1)
            else:
                index = subkeys_list.index(substruct)
                substruct_count[index] += 1
        return (f'unique:{len(subkeys_list)}', subkeys_list)
    else:
        print(line)
    return structure_dict

In [81]:
def get_subkeysV2(line):
    structure_dict = {}
    line_type = type(line)
    if not line:
        return line_type
    elif isinstance(line, str):
        return str
    elif isinstance(line, int):
        return int
    elif isinstance(line, dict):
        for k, v in line.items():
            structure_dict[k] = get_subkeysV2(v)
    elif isinstance(line, list):
        subkeys_list = []
        substruct_count = []
        line_tuple_list = []
        keys_set = set()
        for i, v in enumerate(line):
            substruct = get_subkeysV2(v)
            if isinstance(substruct, dict):
                keys_set.update(substruct.keys())
            if substruct not in subkeys_list:
                subkeys_list.append(substruct)
                
                substruct_count.append(1)
            else:
                index = subkeys_list.index(substruct)
                substruct_count[index] += 1
        return (keys_set, subkeys_list)
    else:
        print(line)
    return structure_dict

In [84]:
def recurse_subkeys(structure, depth=0):
    indent = '  ' * depth
    if isinstance(structure, dict):
        for key, value in structure.items():
            print(f"{indent}{key}:")
            recurse_subkeys(value, depth + 1)
    elif isinstance(structure, list):
        for i, item in enumerate(structure):
            print(f"{indent}- Item {i}:")
            recurse_subkeys(item, depth + 1)
    else:
        print(f"{indent}{structure}")

In [None]:
def batch_structure_analysis(file_path: Path, out_path: Path, batch_size: int = 100):
    batch_list = []
    with open(file_path, 'r', encoding='utf-8') as f:
        batch_counter = 0
        for line in f:
            if not keep_obj_before_load(line):
                continue
            obj = ujson.loads(line)
            if not keep_obj(obj):
                continue
            
            subkeys = get_subkeysV2(obj)
            
            display(subkeys)
            #batch_list.append(subkeys)
            batch_counter += 1
            if batch_counter >= batch_size:
                break

In [280]:

def safe_dict(obj_str: str):
    if isinstance(obj_str, str):
        try:
            return ast.literal_eval(obj_str)
        except Exception:
            return ""       # fallback
        
def filter_translations_regex(obj_str: str):
    translations_pattern = r'"translations"\s*:\s*\[({.*?})\]'
    dutch_translation_pattern = r'\{[^{}]*?"lang"\s*:\s*"Dutch"[^{}]*?\}'
    translations_block = re.compile(translations_pattern, re.DOTALL)
    dutch_object = re.compile(dutch_translation_pattern, re.DOTALL)
    match = translations_block.search(obj_str)
    
    while match is not None:
        start, end = match.span()
        m = match.group(0)
        dn = dutch_object.findall(m)
        if dn:
            dn = [safe_dict(x) for x in dn]
            str_dn = '[' + ', '.join(json.dumps(x) for x in dn) + ']'
            obj_str = obj_str[:start] + '"translations": ' + str_dn + obj_str[end:]
            match = translations_block.search(obj_str, start + len('"translations": ' + str(dn)))
        else:
            
            to_remove_end = end
            if end < len(obj_str) and obj_str[end] == ',':
                to_remove_end += 1
            obj_str = obj_str[:start] + obj_str[to_remove_end:]
            match = translations_block.search(obj_str, start)

    return obj_str

In [None]:
def filter_categories_regex(obj_str: str):
    categories_pattern = r'"categories"\s*:\s*\[.*?\]'
    categories_block = re.compile(categories_pattern, re.DOTALL)
    terms_with_Dtranslations = re.compile(r'"Terms with Dutch translations"', re.DOTALL)
    terms_with_translations = re.compile(r'"Terms with \w* translations"', re.DOTALL)
    match = categories_block.search(obj_str)
    
    while match is not None:
        start, end = match.span()
        # Remove the entire categories block including any trailing comma
        to_remove_end = end
        if end < len(obj_str) and obj_str[end] == ', ':
            to_remove_end += 1
        obj_str = obj_str[:start] + obj_str[to_remove_end:]
        match = categories_block.search(obj_str, start)

    return obj_str

In [None]:
def filter_line(obj_str: str) -> str:
    obj_str = filter_translations_regex(obj_str)
    return obj_str

In [295]:
def filter_obj(obj: dict):
    if "categories" in obj:
        categories = obj["categories"]
        new_categories = []
        if isinstance(categories, list):
            for i, cat in enumerate(categories):
                if isinstance(cat, str):
                    if cat not in ["Terms with Dutch translations", "Terms with English translations"]:
                        if cat.startswith('Terms with') and cat.endswith('translations'):
                            print(cat)
                            categories.remove(cat)
                        else:
                            new_categories.append(cat)
        obj["categories"] = new_categories


In [296]:
temp_output_file = Path(current_save_path, "test_output.jsonl")
batch_size = 10
error_lines = []
from pprint import pp, pprint

with open(temp_output_file, "w+", encoding='utf-8') as output_file:
    with open(ERAW_FILE, "r", encoding='utf-8') as f:
        batch_list = []
        for i, line in tqdm(enumerate(f), total=total_en_lines):
            try:
                if not keep_obj_before_load(line):
                    continue
               
                line = filter_line(line)
                try:
                    obj = json.loads(line)
                
                    if not keep_obj(obj):
                        continue
                    
                    filter_obj(obj)
                    batch_list.append(obj)

                    if i % batch_size == 0 and i > 0:
                        for obj in batch_list:
                            json.dump(obj, output_file, ensure_ascii=False)
                            output_file.write("\n")
                        batch_list = []
                        break
                except Exception as e1:
                    error_lines.append((i, line))
                    print(f"Error parsing JSON on line {i} ", e1)
            except Exception as e:

                print(f.tell())
                raise e        

  0%|          | 30/10329308 [00:00<3:28:20, 826.33it/s]

Terms with Abaza translations
Terms with Adyghe translations
Terms with Albanian translations
Terms with Ambonese Malay translations
Terms with Amharic translations
Terms with Antillean Creole translations
Terms with Aragonese translations
Terms with Armenian translations
Terms with Assamese translations
Terms with Asturian translations
Terms with Aymara translations
Terms with Balkan Romani translations
Terms with Bambara translations
Terms with Basque translations
Terms with Bengali translations
Terms with Bikol Central translations
Terms with Breton translations
Terms with Burmese translations
Terms with Cantonese translations
Terms with Carpathian Rusyn translations
Terms with Cebuano translations
Terms with Chakma translations
Terms with Chechen translations
Terms with Choctaw translations
Terms with Chuvash translations
Terms with Classical Syriac translations
Terms with Corsican translations
Terms with Czech translations
Terms with Dargwa translations
Terms with Dungan translati




In [None]:
print(f"Total error lines: {len(error_lines)}")
display(error_lines)

Total error lines: 0


IndexError: list index out of range

### Split Json

In [None]:
def split_jsonl_by_lines(f, output_dir, lines_per_chunk=1_000_000):
    os.makedirs(output_dir, exist_ok=True)
    part = 1
    line_count = 0
    outfile = open(os.path.join(output_dir, f"chunk_{part:03}.jsonl"), "w", encoding="utf-8")

    with open(f, "r", encoding="utf-8", errors="ignore") as infile:
        for line in tqdm(infile, total=count_lines_with_progress):
            if line_count >= lines_per_chunk:
                outfile.close()
                part += 1
                line_count = 0
                outfile = open(os.path.join(output_dir, f"chunk_{part:03}.jsonl"), "w", encoding="utf-8")
            outfile.write(line)
            line_count += 1

    outfile.close()
    print(f"✅ Done. Created {part} chunks in '{output_dir}/'")

In [None]:
import re, json, os

def split_and_fix_jsonl(input_path, total_lines, lines_per_chunk=1_000_000, output_dir="chunks"):
    os.makedirs(output_dir, exist_ok=True)
    part = 1
    count = 0
    out = open(os.path.join(output_dir, f"chunk_{part:03}.jsonl"), "w", encoding="utf-8")

    with open(input_path, "r", encoding="utf-8", errors="ignore") as f:
        for line in tqdm(f, total=total_lines):
            # split concatenated JSON objects
            parts = re.split(r'(?=\{"senses":)', line)
            for p in parts:
                p = p.strip()
                if not p:
                    continue
                try:
                    json.loads(p)  # validate
                    out.write(p + "\n")
                    count += 1
                except json.JSONDecodeError:
                    continue

                if count >= lines_per_chunk:
                    out.close()
                    part += 1
                    count = 0
                    out = open(os.path.join(output_dir, f"chunk_{part:03}.jsonl"), "w", encoding="utf-8")
    out.close()
    print(f"✅ Finished writing {part} chunk files.")


In [None]:
split_and_fix_jsonl(ERAW_FILE, output_dir=chunks_dir, total_lines=total_en_lines)

In [None]:
import ijson

def split_large_json_objects(input_path, output_dir="chunks", chunk_size=500_000):
    os.makedirs(output_dir, exist_ok=True)

    part = 1
    count = 0
    out = open(os.path.join(output_dir, f"chunk_{part:03}.jsonl"), "w", encoding="utf-8")

    with open(input_path, "rb") as f:
        parser = ijson.items(f, "", multiple_values=True)  # accept concatenated JSON objects
        for obj in parser:
            json.dump(obj, out, ensure_ascii=False)
            out.write("\n")
            count += 1

            if count >= chunk_size:
                out.close()
                lines_start = part*chunk_size - chunk_size
                lines_end = lines_start + chunk_size
                print(f"Part: {part} Lines: {lines_start}-{lines_end} complete")
                part += 1
                count = 0
                out = open(os.path.join(output_dir, f"chunk_{part:03}.jsonl"), "w", encoding="utf-8")
    
    out.close()
    print(f"✅ Finished splitting into {part} chunks in '{output_dir}/'")

In [None]:
chunks2 = Path(current_save_path, 'en', 'chunks2')
#split_large_json_objects(ERAW_FILE, output_dir=chunks2)

In [None]:
from pickletools import read_bytes1


def get_longest_line(file_path, chunk_size=1024 * 1024):
    total_size = os.path.getsize(file_path)
    lines = 0
    longest_line = 0
    longest_lines = []
    num_chunks_longest = 0
    current_chunks_count = 1
    current_line = 0
    lowest_chunks_per_line = 1000000
    with open(file_path, 'rb') as f, tqdm(total=total_size, unit='B', unit_scale=True, desc="Counting Lines") as pbar:
        current_chunk = b''
        last_chunk = b''
        while chunk := f.read(chunk_size):
            chunk_count = chunk.count(b'\n')
            if chunk_count < lowest_chunks_per_line:
                lowest_chunks_per_line = chunk_count
                
                longest_lines.append(((1, chunk_count), chunk))
            else:
                if chunk_count == 0:
                    if current_chunks_count == 1:
                        current_chunk = last_chunk
                    current_chunk += chunk
                    current_chunks_count += 1
                else:
                    last_chunk = chunk
                    if current_chunks_count > num_chunks_longest:
                        longest_line = current_chunk

                        num_chunks_longest =  current_chunks_count
                        longest_lines.append((current_chunks_count, current_chunk))
                        current_chunks_count = 1

            lines += chunk_count
            pbar.update(len(chunk))
            
    return lines, longest_lines

In [None]:
total_en_lines

In [None]:
from collections import deque

last_lines = deque(maxlen=5_000_000)
with open(ERAW_FILE, "r", encoding="utf-8", errors="ignore") as f:
    for line in f:
        last_lines.append(line)

In [None]:
last_5_million_lines_file = Path(WIKT_PREPROCESSING_DIR, '07-11-25', 'en', 'last_5_million.jsonl')


### Get Keys

In [None]:
c, long_lines = get_longest_line(ERAW_FILE)

In [None]:
print(len(long_lines))
print(long_lines[-1][0])
print(long_lines[-2])

### json_parse

In [None]:
def save_json_to_file(json_lines, output_dir, file_name='', overwrite=True, suppress_print=False):
    try:
        os.makedirs(output_dir, exist_ok=overwrite)
    except:
        print('error or overwrite not allowed')
        return False

    if not file_name:
        file_name = "out.jsonl"
    with open(Path(output_dir, file_name), 'w', encoding='utf-8') as f:
        json.dump(json_lines, f, ensure_ascii=False, indent=1)
    if not suppress_print:
        print(file_name, ' saved')
    return True

In [None]:
def save_json_files_to_dir(json_files_contents_list, output_dir, json_file_names_list=[], out_format='jsonl', overwrite=True, file_prefix='', suppress_print=False):
    if not json_file_names_list:
        if not file_prefix:
            file_prefix = 'out'
        json_file_names_list = [f"{file_prefix}_{i}.{out_format}" for i in range(len(json_files_contents_list))] 
    else:
        json_file_list = [f"{x}.{out_format}" for x in range(len(json_files_contents_list))]

    if len(json_files_contents_list) != len(json_file_list):
        print('Length of names does not match')
        return   

    try:
            os.makedirs(output_dir, exist_ok=overwrite)
            files_in_dir = os.listdir(output_dir)
            for i in len(range(json_files_contents_list)):
                contents = json_files_contents_list[i]
                name = json_file_list[i]
                save_json_to_file(contents, output_dir, name, overwrite=overwrite, suppress_print=suppress_print)
                
                
    except Exception as e:
        print('error with write')
        
    
        
    # else:
    #     try:
    #         os.makedirs(output_dir, exist_ok=overwrite)
    #     except Exception as e:
    #         print(e)
    #         if e == OSError and overwrite==False:
    #             files_in_dir = os.listdir(output_dir)

    #             matching_files = [x for x in json_file_names_list if x in files_in_dir]
    #             num_matches = len(matching_files)
    #             user_input = input(f'Directory exists and {num_matches} files match: 1: quit: \n 2: overwrite whole directory: \n, 3: pick overwrite for each file: \n')
    #             for file in matching_files:
    #                 if user_input == 1:
    #                         return
    #                 elif user_input == 3:
    #                     print('Overwrite (y/n)?')
    #                     for i, file in enumerate(matching_files):
    #                         file_choice = input(f'{file}: ')
    #                         if file_choice == 'y':
    #                             print('not implemented')
    #                             break
    #                 elif user_input == 2:
    #                     overwrite = True
                    
    
    
    


In [None]:
all_ENR_lines = []

counts_index = 3
first_chunk = Path(chunks2, 'chunk_001.jsonl')
all_chunks = os.listdir(chunks2)
en_rough_save_path = Path(current_save_path, 'en', 'rough_line_files2')
ENR_files = []
EER_files = []
other_lines_dutch_files = []
remaining_lines_files = []
lines_count = 0
appended_lines = 0
english_dutch_lines = []
ENR_lines = []
EER_lines = []
no_dutch_lines = []
other_lines_with_dutch_and_en_translations = []

save_limit = 100000
#file = all_chunks[0]
for file in all_chunks:

    print("Processing file: ", file)
    
    file_path = Path(chunks2, file)
    
    with open(file_path, 'r', encoding='utf-8',errors='ignore') as f:
        
        lines = f.readlines()
        for line in tqdm(lines):
            has_dutch = False
            lines_count += 1
            if not has_cjk_or_arabic_fast(line):
                if line.find('"lang": "Dutch"' ) != -1:
                    has_dutch = True
                elif line.find('"lang_code": "nl"' ) != -1:
                    has_dutch = True

                

                if has_dutch:
                    loaded = json.loads(line)
                    if not loaded:
                        continue
                    lang_code = loaded.get("lang_code", None)
                    if lang_code:
                        if lang_code == 'en':
                            EER_lines.append(loaded)
                        elif lang_code == 'nl':
                            ENR_lines.append(loaded)
                        else:
                            english_dutch_lines.append(loaded)
                        appended_lines += 1
                    has_dutch = False
                    
                else:
                    if not has_cjk_or_arabic_fast(line[50:100]):
                        loaded = json.loads(line)
                        lang_code = loaded.get("lang_code", None)
                        if lang_code:
                            if lang_code == 'en':
                                EER_lines.append(loaded)
                            else:
                                no_dutch_lines.append(line)
                            appended_lines += 1
            else:
                continue
            if len(EER_lines)%save_limit == 0 and len(EER_lines) !=0:
                save_json_to_file(EER_lines, en_rough_save_path, f"EER_rough_{len(EER_files)}.jsonl", suppress_print=True)
                EER_files.append(f"EER_rough_{len(EER_files)}.jsonl")
                EER_lines = []

            if len(ENR_lines)%save_limit == 0 and len(ENR_lines) != 0:
                save_json_to_file(ENR_lines, en_rough_save_path, f"ENR_rough_{len(ENR_files)}.jsonl", suppress_print=True)
                ENR_files.append(f"ENR_rough_{len(ENR_files)}.jsonl")
                ENR_lines = []
            if len(english_dutch_lines)%save_limit == 0 and len(english_dutch_lines) > 0:
            
                save_json_to_file(english_dutch_lines, en_rough_save_path, f"other_dutch_lines_{len(other_lines_dutch_files)}.jsonl", suppress_print=True)
                other_lines_dutch_files.append(f"other_dutch_lines_{len(other_lines_dutch_files)}.jsonl")
                english_dutch_lines = []
            if len(no_dutch_lines)%200000 == 0 and len(no_dutch_lines) > 0:
                save_json_to_file(no_dutch_lines, en_rough_save_path, f"no_dutch_lines_{len(remaining_lines_files)}.jsonl", suppress_print=True)
                remaining_lines_files.append(f"no_dutch_lines_{len(remaining_lines_files)}.jsonl")
                no_dutch_lines = []
    print('Now appended lines: ', appended_lines, " of ", lines_count)

In [None]:
save_json_to_file(EER_lines, en_rough_save_path, f"EER_rough_{len(EER_files)}.jsonl", suppress_print=True)
EER_files.append(f"EER_rough_{len(EER_files)}.jsonl")
EER_lines = []

In [None]:
save_json_to_file(ENR_lines, en_rough_save_path, f"ENR_rough_{len(ENR_files)}.jsonl", suppress_print=True)
ENR_files.append(f"ENR_rough_{len(ENR_files)}.jsonl")
ENR_lines = []
save_json_to_file(english_dutch_lines, en_rough_save_path, f"other_dutch_lines_{len(other_lines_dutch_files)}.jsonl", suppress_print=True)
other_lines_dutch_files.append(f"other_dutch_lines_{len(other_lines_dutch_files)}.jsonl")
english_dutch_lines = []
save_json_to_file(no_dutch_lines, en_rough_save_path, f"no_dutch_lines_{len(remaining_lines_files)}.jsonl", suppress_print=True)
remaining_lines_files.append(f"no_dutch_lines_{len(remaining_lines_files)}.jsonl")
no_dutch_lines = []

In [None]:
print(len(EER_files))
print(len(ENR_files))

In [None]:

def concat_lang_strip_files(save_folder, source_folder, prefix='EER', lang='en', translation_lang='nl'):
    """Load all JSONL files with `prefix` in filename from `source_folder`,
    remove all translations except those matching `translation_lang`,
    and return the cleaned list of objects."""
    
    new_lines = []
    files = [f for f in os.listdir(source_folder) if prefix in f]
    
    for file in files:
        path = Path(source_folder, file)
        print(f"Processing {path} ...")
        
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            for line_no, line in tqdm(enumerate(f, 1)):
                line = line.strip()
                if not line:
                    continue
                
                try:
                    loaded = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"⚠️ Skipping bad JSON (line {line_no} in {file}): {e}")
                    continue

                if "translations" in loaded and isinstance(loaded["translations"], list):
                    translations = loaded["translations"]
                    translation_list = [
                        t for t in translations if isinstance(t, dict) and t.get("lang_code") == translation_lang]

                    # Only keep translations if we found Dutch ones
                    if translation_list:
                        loaded["translations"] = translation_list
                    else:
                        loaded.pop("translations", None)

                new_lines.append(loaded)
    
    print(f"✅ Processed {len(new_lines):,} total objects.")
    return new_lines


In [None]:
# def concat_lang_strip_files(prefix, save_folder, source_folder, lang='en', translation_lang='nl'):
#     files = [f for f in os.listdir(source_folder) if prefix in f]
#     new_lines = []
#     for file in files:
#         with open(Path(source_folder, file), 'r', encoding='utf-8') as f:
#             for line in f:
#                 loaded = json.loads(line)
#                 if line.find('translations') != -1:
                    
#                     translations = loaded.get(translations, None)
#                     translation_list = []
#                     if translations:
#                         if translations.find('"lang_code": "{translation_lang}"') != -1:
#                             for t in translations:
#                                 l = t.get('lang_code', None)
#                                 if l == translation_lang:
#                                     translation_list.append(t)
#                         loaded.pop(translations)
#                         if translation_list:
#                             loaded['translations'] = translation_list

#                 new_lines.append(loaded)

#     return new_lines

In [None]:
en_rough_save_path = Path(current_save_path, 'en', 'rough_line_files2')
new_EER = concat_lang_strip_files(source_folder=en_rough_save_path, prefix='EER', save_folder=en_rough_save_path)

In [None]:
all_ENR_lines = []

counts_index = 3
first_chunk = Path(chunks2, 'chunk_001.jsonl')
all_chunks = os.listdir(chunks2)
en_rough_save_path = Path(current_save_path, 'en', 'rough_line_files')
ENR_files = []
EER_files = []
other_lines_dutch_files = []
remaining_lines_files = []
lines_count = 0
appended_lines = 0
english_dutch_lines = []
ENR_lines = []
EER_lines = []
no_dutch_lines = []
other_lines_with_dutch_and_en_translations = []
save_limit = 100000
file = all_chunks[0]
# for file in all_chunks:
if file == all_chunks[0]:
    print("Processing file: ", file)
    
    file_path = Path(chunks2, file)
    has_dutch = False
    with open(file_path, 'r', encoding='utf-8',errors='ignore') as f:
        
        lines = f.readlines()
        for line in tqdm(lines):
            lines_count += 1
            if not has_cjk_or_arabic_fast(line):
                if line.find('"lang": "Dutch"' ) != -1:
                    has_dutch = True
                elif line.find('"lang_code": "nl"' ) != -1:
                    has_dutch = True

                

                if has_dutch:
                    loaded = json.loads(line)
                    if not loaded:
                        continue
                    lang_code = loaded.get("lang_code", None)
                    if lang_code:
                        if lang_code == 'en':
                            EER_lines.append(loaded)
                        elif lang_code == 'nl':
                            ENR_lines.append(loaded)
                        else:
                            english_dutch_lines.append(loaded)
                        appended_lines += 1
                    has_dutch = False
                    
                else:
                    if not has_cjk_or_arabic_fast(line[50:100]):
                        loaded = json.loads(line)
                        lang_code = loaded.get("lang_code", None)
                        if lang_code:
                            if lang_code == 'en':
                                EER_lines.append(loaded)
                            else:
                                no_dutch_lines.append(line)
                            appended_lines += 1
            else:
                continue
    if len(EER_lines)%save_limit == 0:
        save_json_to_file(EER_lines, en_rough_save_path, f"EER_rough_{file[-8:]}", suppress_print=True)
    if len(ENR_lines)%save_limit == 0:
        save_json_to_file(ENR_lines, en_rough_save_path, f"ENR_rough_{file[-8:]}", suppress_print=True)
    if len(english_dutch_lines)%save_limit == 0:
        save_json_to_file(english_dutch_lines, en_rough_save_path, f"other_dutch_lines_{file[-8:]}", suppress_print=True)
    if len(no_dutch_lines)%save_limit == 0:
        save_json_to_file(no_dutch_lines, en_rough_save_path, f"no_dutch_lines_{file[-8:]}", suppress_print=True)
    print('Now appended lines: ', appended_lines, " of ", lines_count)

In [None]:
print(len(all_ENR_lines))
total = 0
for line_list in all_ENR_lines:
    total += len(line_list)
print(total)

In [None]:
total_size = 0

for file in os.listdir(en_rough_save_path, 'rough_line_files'):
 
    eer_no_dutch = []
    if 'no_dutch' in file:
        with open(Path(en_rough_save_path, file)):
        
            lines = json.loads()

In [None]:
EER_files = []
ENR_files = []
translations_files = []
lang_codes = ['en', 'nl']
ENR_lines = []
EER_lines = []
remaining_lines = []
stop_size = 100000
ENR_count = 0
EER_count = 0
other_translations_count = 0
remaining_lines_count = 0
enr_file_num = 0
eer_file_num = 0
other_translation_file_num = 0
remaining_lines_file_num = 0 
lines_split_path = Path(current_save_path, 'lines_split')
for file in os.listdir(chunks2):
    
    with open(Path(chunks2, file), 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in tqdm(lines):
            loaded = json.loads(line)
            lang_code = loaded.get('lang_code', None)
            
            if lang_code:
                if lang_code == 'nl':
                    ENR_count += 1
                    ENR_lines.append(loaded)
                    if len(ENR_lines)%stop_size == 0 and ENR_count != 0:
                        save_json_to_file(ENR_lines, Path(lines_split_path, 'ENR'), f"ENR_{enr_file_num}.jsonl")
                        enr_file_num += 1
                        ENR_lines = []
                elif lang_code == 'en':
                    EER_count += 1
                    EER_lines.append(loaded)
                    if len(EER_lines)%stop_size == 0 and EER_count != 0:
                        save_json_to_file(EER_lines, Path(lines_split_path, 'EER'), f"EER_{eer_file_num}.jsonl")
                        eer_file_num += 1
                        EER_lines = []
                else:
                    remaining_lines_count += 1
                    remaining_lines.append(loaded)
                    if len(remaining_lines)%stop_size == 0 and remaining_lines_count != 0:
                        save_json_to_file(remaining_lines, Path(lines_split_path, 'remaining'), f"remaining_{remaining_lines_file_num}.jsonl")
                        remaining_lines_file_num += 1
                        remaining_lines = []

In [None]:
import typing
from collections import Counter
lines = []
counts_index = 3

with open(ERAW_FILE, 'r', encoding='utf-8',errors='ignore') as f:
    line = f.readline()
    print(len(line))
    loaded = json.loads(line)
    keys = loaded.keys()

    lines.append(loaded)

    d = get_subkeys(loaded)
    d2 = get_subkeysV2(loaded)
    display(len(d))
    display(len(d2))
    display(d)
    print('-------------------------------------------------------------')
    display(d2)

    
    #display(d2)
    #display(d2)
    # for k in keys:
    #     if type(loaded[k]) == list:
    #         subkeys = get_keys_from_list(loaded[k])
    #         subkeys_list.append(subkeys)
    #     print(subkeys)
    # print(subkeys_list)
    #structure_dict = get_nested_structure(loaded)
    #display(structure_dict)
    # for k in keys:
    #     val = loaded[k]
    #     type_tuples = make_structure_line_tuple(k, val)
    #     types = type_tuples[counts_index]
    #     if 'dict' in types.keys():
            
    #         print(types[3].keys())
    #         print(types)
        # counts = Counter()
        # typecounts = Counter(type(x).__name__ for x in val)
        # print(typecounts)
    #print(loaded)

    #structure = get_nested_structure(loaded)
    #display(structure)
    
    # if loaded:
    #     if isinstance(loaded, (dict, list)):
    #         structure_dict = get_nested_structure(loaded)
        

    # for k in keys:
    #     kl2 = []
    #     val = loaded[k]
    #     if isinstance(val, dict):
    #         kl2 = loaded[k].keys()
            
    #         sl2 = []
    #         for k2 in kl2:
    #             v3 = get_nested_structure(k2)
    #             if v3 != {} and v3 != []:
    #                 sl2.append(v3)
    #         structure_dict[k] = set(sl2)
    #     elif isinstance(val, list):
            
    #         sl3 = []
    #         try:
    #             for i in val:
    #                 v3 = get_nested_structure(i)
    #                 if v3 != {}:
    #                     sl3.append(v3)
                        
    #         except Exception as e:
    #             print(k, i)
    #             print(e)
    #             continue
    #         structure_dict[k] = sl3
    # print(line)

In [None]:
for k, v in structure_dict.items():
    print(k)
    display(v)
    print('--------------------------')

In [None]:
# EER_words = []
# ENR_words = []
en_words = []
error_lines =[]



count = 0
with open(ERAW_FILE, 'r', encoding='utf-8',errors='ignore') as f:
    for line in f:
        line = f.readline()
        line = line.strip()
        
        if count%100000 == 0:
            print(count, ' - ', line)
            
        # count += 1    
        # if not line:
        #     error_lines.append(line)
        #     continue
        
        posnl = line.find('"lang_code": "nl"')
        posen = line.find('"lang_code": "en"')
        if posnl == -1 and posen == -1:
            error_lines.append(line)
            continue
        en_words.append(line)
        # loaded_line = json.loads(line)
        # lang_code = loaded_line.get('lang_code', None)
        # if lang_code == 'en':
        #     EER_words.append(loaded_line)
        # elif lang_code == 'nl':
        #     ENR_words.append(loaded_line)
        
       

In [None]:
import json
import re
file_path = ERAW_FILE
def stream_json_lines(file_path):
    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            # Split if multiple JSON objects are jammed on one line
            # (since each should start with {"senses":)
            parts = re.split(r'(?=\{"senses":)', line)
            for part in parts:
                part = part.strip()
                if part:
                    try:
                        yield json.loads(part)
                    except json.JSONDecodeError as e:
                        print("Skipping malformed JSON:", e, part[:100])

In [None]:
print(len(EER_words))
print(len(error_lines))

In [None]:
print(len(ENR_words))

In [None]:
display(error_lines[50:70])