In [1]:
import json
from pathlib import Path
from dutchanalyzer.config import *
from dutchanalyzer.utils import *
from dutchanalyzer.json_utils import *
import re
from pprint import pprint
import ast
from collections import Counter
from tqdm import tqdm
import pickle

In [3]:
# Per-language sub-paths ('en' and 'nl') under WIKT_PREPROCESSING_DIR.
eng_save_path = Path(WIKT_PREPROCESSING_DIR, 'en')
nld_save_path = Path(WIKT_PREPROCESSING_DIR, 'nl')

In [2]:
# Build today's working tree:
#   <INTERIM_DATA_DIR>/cleaning/wikt/<dd-mm-yy>/<lang>/<set>/
# NOTE(review): `datetime` is not imported explicitly in this notebook; it is
# presumably re-exported by one of the dutchanalyzer star imports -- confirm.
today = datetime.date.today().strftime("%d-%m-%y")
current_save_folder = Path(INTERIM_DATA_DIR, 'cleaning', 'wikt', today)
folders = {'en': ['EER', 'ENR', 'EEF', 'ENF'], 'nl': ['NER', 'NNR', 'NEF', 'NNF']}

# parents=True: on a fresh checkout the intermediate 'cleaning/wikt' folders do
# not exist yet, and a plain mkdir() would raise FileNotFoundError.
current_save_folder.mkdir(parents=True, exist_ok=True)
for lang_code, set_names in folders.items():
    Path(current_save_folder, lang_code).mkdir(parents=True, exist_ok=True)
    for set_name in set_names:
        Path(current_save_folder, lang_code, set_name).mkdir(parents=True, exist_ok=True)

In [3]:
# Paths to the four JSONL files (NNR, NER, EER, ENR), each inside its own
# project-configured directory.
NNR_file = Path(NNR_DIR, 'NNR.jsonl')
NER_file = Path(NER_DIR, 'NER.jsonl')
EER_file = Path(EER_DIR, 'EER.jsonl')
ENR_file = Path(ENR_DIR, 'ENR.jsonl')

In [4]:
# Folders under WIKT_CLEANING_DIR: NNF/NEF live below 'nl', EEF/ENF below 'en'.
NNF_folder = Path(WIKT_CLEANING_DIR, 'nl', 'NNF')
NEF_folder = Path(WIKT_CLEANING_DIR, 'nl', 'NEF')
EEF_folder = Path(WIKT_CLEANING_DIR, 'en', 'EEF')
ENF_folder = Path(WIKT_CLEANING_DIR, 'en', 'ENF')

# JSONL word-list paths: one at the cleaning root, one each inside EEF and ENF.
all_words_file = Path(WIKT_CLEANING_DIR, 'all_words.jsonl')
eef_words_file = Path(EEF_folder, 'eef_words.jsonl')
enf_words_file = Path(ENF_folder, 'enf_words.jsonl')

### Explanation of Structure via tatuylonen/wiktextract:


## Get Subkeys

### Utilities

In [5]:
def make_structure_line_tuple(key, line):
    """Summarise the container stored under ``key``.

    Args:
        key: Label passed through unchanged as the first tuple element.
        line: Value to describe; dict, list and str are recognised.

    Returns:
        ``(key, container_type, size, typecounts)`` where ``size`` is
        ``len(line)`` and ``typecounts`` is a ``Counter`` of the type names
        obtained by iterating ``line`` (dict -> its keys, list -> its items,
        str -> its characters).

        ``None`` is returned for falsy values, for values without a length
        (e.g. a non-zero int), for other container types, and for strings
        whose content is valid JSON that decodes to something other than a
        string.
    """
    if not line:
        return None
    try:
        size = len(line)
    except TypeError:
        # Truthy scalars (ints, floats, ...) have no length; treat them like
        # any other unsupported value instead of crashing.
        return None

    typecounts = Counter(type(x).__name__ for x in line)
    if isinstance(line, dict):
        return (key, dict, size, typecounts)
    if isinstance(line, list):
        return (key, list, size, typecounts)
    if isinstance(line, str):
        try:
            decoded = json.loads(line)
        except (ValueError, RecursionError):
            # Not JSON at all -> it really is a plain string.
            return (key, str, size, typecounts)
        if isinstance(decoded, str):
            return (key, str, size, typecounts)
        # A string that hides a JSON container/number is not reported as str.
    return None

In [8]:
def print_subkeys(structure, depth=0):
    """Print a nested dict/list structure as an indented outline.

    Dict keys are printed as ``key:``, list elements as ``- Item <n>:`` and
    every other value as-is; each nesting level adds two spaces of indent.
    """
    pad = '  ' * depth
    if isinstance(structure, dict):
        for name in structure:
            print(f"{pad}{name}:")
            print_subkeys(structure[name], depth + 1)
        return
    if isinstance(structure, list):
        position = 0
        for element in structure:
            print(f"{pad}- Item {position}:")
            print_subkeys(element, depth + 1)
            position += 1
        return
    print(f"{pad}{structure}")

In [5]:
def get_subkeys(obj):
    """Return the type skeleton of a decoded JSON value.

    Args:
        obj: Any value produced by ``json.loads``.

    Returns:
        * ``None`` for falsy values (``None``, ``0``, ``""``, ``[]``, ``{}``).
        * ``type(obj)`` for str, int (bool included) and float leaves.
        * a dict mapping each key to the skeleton of its value, for dicts.
        * a list of the *distinct* skeletons of the elements, in first-seen
          order, for lists.
        * ``{}`` (after printing the value) for any other type.
    """
    if not obj:
        return None
    # float is listed explicitly: it used to fall through to the debug branch
    # and was reported as an empty dict.
    if isinstance(obj, (str, int, float)):
        return type(obj)
    if isinstance(obj, dict):
        return {k: get_subkeys(v) for k, v in obj.items()}
    if isinstance(obj, list):
        distinct = []
        for item in obj:
            substruct = get_subkeys(item)
            if substruct not in distinct:
                distinct.append(substruct)
        return distinct

    # Unexpected (non-JSON) type: surface it for inspection.
    print(obj)
    return {}

In [128]:
def get_subkeysV2(line):
    """Return the type skeleton of a decoded JSON value (list-aware variant).

    Args:
        line: Any value produced by ``json.loads``.

    Returns:
        * ``type(line)`` for falsy values.
        * ``str`` / ``int`` for string and integer leaves (bools report ``int``).
        * a dict mapping each key to the skeleton of its value, for dicts.
        * ``(keys_set, subkeys_list)`` for lists, where ``keys_set`` is the
          union of the keys of all dict elements and ``subkeys_list`` holds
          the distinct element skeletons in first-seen order.
        * ``{}`` (after printing the value) for any other type.
    """
    if not line:
        return type(line)
    if isinstance(line, str):
        return str
    if isinstance(line, int):
        return int
    if isinstance(line, dict):
        return {k: get_subkeysV2(v) for k, v in line.items()}
    if isinstance(line, list):
        keys_set = set()
        subkeys_list = []
        for item in line:
            substruct = get_subkeysV2(item)
            if isinstance(substruct, dict):
                # Add whole keys; set.update() on a single string would add
                # its individual characters instead.
                keys_set.update(substruct.keys())
            if substruct not in subkeys_list:
                subkeys_list.append(substruct)
        return (keys_set, subkeys_list)

    # Unexpected type (e.g. float): surface it for inspection.
    print(line)
    return {}

## Get Substructures NER

In [14]:
def get_level_structure(obj, level=0):
    """Describe one nesting level of a list of items.

    Args:
        obj: Value to inspect.
        level: Unused; kept so existing calls keep working.

    Returns:
        * ``type(obj)`` when ``obj`` is neither a list nor a dict.
        * ``None`` when ``obj`` is a dict (dicts are not summarised here).
        * ``(key_value_types, level_keys)`` when ``obj`` is a list:
          ``key_value_types`` holds one ``(key, type(value))`` pair per
          distinct combination found in the dict items, plus one
          ``(index, type(item))`` pair for every truthy non-dict item;
          ``level_keys`` is the set of all keys seen in the dict items.
          Falsy items are skipped.
    """
    if not isinstance(obj, (list, dict)):
        return type(obj)
    if isinstance(obj, dict):
        return None

    level_keys = set()
    key_value_types = []
    for i, item in enumerate(obj):
        if not item:
            continue
        if isinstance(item, dict):
            for k, v in item.items():
                level_keys.add(k)
                if (k, type(v)) not in key_value_types:
                    key_value_types.append((k, type(v)))
        else:
            # list.append takes a single argument, so the pair must be a tuple.
            key_value_types.append((i, type(item)))
    return key_value_types, level_keys

In [None]:
# Type of the value stored under each top-level key of an entry.
# List-valued keys use the parameterised form ``list[<item type>]``; the earlier
# spelling ``[list][dict]`` indexed a list with a type and raised TypeError.
level_0_keys = {
    'senses': list[dict],
    'pos': str,
    'head_templates': list[dict],
    'forms': list[dict],
    'derived': list[dict],
    'descendants': list[dict],
    'sounds': list[dict],
    'hyphenations': list[dict],
    'etymology_text': str,
    'etymology_templates': list[dict],
    'word': str,
    'lang': str,
    'lang_code': str,
    'inflection_templates': list[dict],
    'categories': list[str],
    'related': list[dict],
    'etymology_number': int,
    'synonyms': list[dict],
    'antonyms': list[dict],
    'wikipedia': list[str],
    'hypernyms': list[dict],
    'hyponyms': list[dict],
    'holonyms': list[dict],
    'coordinate_terms': list[dict],
    'meronyms': list[dict],
    'abbreviations': list[dict],
    'original_title': str,
}

# (key, Counter of item type names) recorded for each list-valued top-level key.
level_0_list_types = [
    ('senses', Counter({'dict': 1})),
    ('head_templates', Counter({'dict': 1})),
    ('forms', Counter({'dict': 4})),
    ('derived', Counter({'dict': 2})),
    ('descendants', Counter({'dict': 3})),
    ('sounds', Counter({'dict': 3})),
    ('hyphenations', Counter({'dict': 1})),
    ('etymology_templates', Counter({'dict': 8})),
    ('inflection_templates', Counter({'dict': 1})),
    ('categories', Counter({'str': 2})),
    ('related', Counter({'dict': 53})),
    ('synonyms', Counter({'dict': 2})),
    ('antonyms', Counter({'dict': 1})),
    ('wikipedia', Counter({'str': 1})),
    ('hypernyms', Counter({'dict': 1})),
    ('hyponyms', Counter({'dict': 1})),
    ('holonyms', Counter({'dict': 1})),
    ('coordinate_terms', Counter({'dict': 1})),
    ('meronyms', Counter({'dict': 5})),
    ('abbreviations', Counter({'dict': 2})),
]

In [9]:
# Names of the top-level keys whose value is a list.
# ``__origin__`` makes the test accept a bare ``list`` as well as a
# parameterised alias such as ``list[dict]``, which never compares equal to
# ``list`` itself.
level_0_list_keys = [
    k for k, v in level_0_keys.items()
    if getattr(v, '__origin__', v) is list
]

In [None]:
# Item schema of every list-valued key inside a sense:
#   [str]        -> list of strings
#   [{...}]      -> list of dicts with the given key -> value type
senses = {
    'links': [str],
    'synonyms': [{
        'word': str, 'tags': list, 'extra': str, 'source': str,
        'alt': str, 'english': str, 'translation': str, 'topics': list,
    }],
    'glosses': [str],
    'tags': [str],
    'categories': [str],
    'wikipedia': [str],
    'form_of': [{'word': str, 'extra': str}],
    'raw_glosses': [str],
    'raw_tags': [str],
    'examples': [{
        'text': str, 'bold_text_offsets': list,
        'translation': str, 'english': str, 'bold_translation_offsets': list,
        'type': str, 'tags': list, 'ref': str,
        'literal_meaning': str, 'bold_literal_offsets': list,
        'roman': str, 'raw_tags': list,
    }],
    'alt_of': [{'word': str, 'extra': str}],
    'topics': [str],
    'attestations': [{'date': str, 'references': list}],
    'antonyms': [{'word': str}],
    'wikidata': [str],
    'senseid': [str],
    'hypernyms': [{'word': str}],
    'coordinate_terms': [{'word': str, 'english': str, 'translation': str}],
    'meronyms': [{'word': str}],
    'info_templates': [{'args': dict, 'name': str, 'extra_data': dict, 'expansion': str}],
    'holonyms': [{'word': str}],
    'related': [{'word': str, 'tags': list}],
    'hyponyms': [{'word': str}],
}

In [None]:
# Flat "key -> value type" schemas, one dict per section of an entry.

# Keys of a single sense.
senses = {
    'links': list,            # str, or list of lists with str
    'synonyms': list,         # [{'word': str}]
    'glosses': list,          # str
    'tags': list,
    'categories': list,       # str
    'wikipedia': list,        # str
    'form_of': list,
    'raw_glosses': list,
    'raw_tags': list,
    'examples': list,
    'qualifier': str,
    'alt_of': list,
    'topics': list,
    'attestations': list,
    'antonyms': list,
    'wikidata': list,
    'senseid': list,
    'hypernyms': list,
    'coordinate_terms': list,
    'meronyms': list,
    'info_templates': list,
    'head_nr': int,
    'holonyms': list,
    'related': list,
    'hyponyms': list,
}

forms = {
    'form': str,
    'tags': list[str],
    'source': str,
    'raw_tags': list[str],
    'head_nr': int,
}

derived = {
    'word': str, 'lang': str, 'lang_code': str, 'tags': list,
    'ipa': str, 'audio': str, 'ogg_url': str, 'mp3_url': str,
    'parts': list, 'name': str, 'args': dict, 'expansion': str,
    'raw_tags': list, 'roman': str, 'rhymes': str, 'homophone': str,
    'sense': str, 'descendants': list, 'english': str, 'translation': str,
    'note': str, 'text': str, 'other': str, 'topics': list,
    'alt': str, 'taxonomic': str, 'ruby': list,
}
# Same keys, types and order as `derived`, held in an independent dict.
descendants = dict(derived)

sounds = {
    'ipa': str, 'audio': str, 'ogg_url': str, 'mp3_url': str,
    'rhymes': str, 'homophone': str, 'tags': list,
    'note': str, 'text': str, 'other': str,
}
hyphenations = {'parts': list}
etymology_templates = {'name': str, 'args': dict, 'expansion': str}
inflection_templates = {'name': str, 'args': dict}

related = {
    'tags': list, 'word': str, 'sense': str, 'english': str, 'translation': str,
    'roman': str, 'topics': list, 'alt': str, 'raw_tags': list,
}
coordinate_terms = {
    'word': str, 'tags': list, 'sense': str, 'topics': list,
    'english': str, 'translation': str, 'alt': str,
}
abbreviations = {'word': str}
synonyms = {
    'word': str, 'sense': str, 'tags': list, 'raw_tags': list, 'topics': list,
    'source': str, 'alt': str, 'english': str, 'translation': str,
    'taxonomic': str, 'roman': str,
}
antonyms = {
    'word': str, 'sense': str, 'english': str, 'translation': str,
    'tags': list, 'topics': list, 'alt': str,
}

hypernyms = {
    'word': str, 'sense': str, 'topics': list, 'alt': str,
    'tags': list, 'english': str, 'translation': str,
}
hyponyms = {
    'word': str, 'sense': str, 'english': str, 'translation': str,
    'topics': list, 'taxonomic': str, 'alt': str,
}
holonyms = {'word': str, 'tags': list}

meronyms = {'word': str, 'alt': str}



In [None]:
# Scan the ENR dump and record, for every relation list nested inside a sense
# (synonyms, antonyms, ...), which keys its items carry and the type of each
# value. The top-level 'related' list of each entry is merged into the existing
# `related` schema dict, which must already be defined by an earlier cell.
#
# Changes from the earlier draft of this cell:
#   * it tested `'<key>' in loaded['senses']`, i.e. membership in a *list* of
#     sense dicts, so none of the per-sense branches ever ran;
#   * it referred to undefined names (senses_hyponyms, senses_holonyms,
#     senses_meronyms) and hid the resulting errors behind a bare `except`;
#   * it opened all_words_file for appending without ever writing to it.
file = ENR_file
sense_synonyms = {}
sense_antonyms = {}
sense_hypernyms = {}
sense_hyponyms = {}
sense_holonyms = {}
sense_meronyms = {}
# Left empty on purpose: the displayed `sub_senses` schema lists glosses and
# tags as lists of plain strings, so there are no item keys to record.
sense_glosses = {}
senses_tags = {}
keys = []
key_types = []


def _record_item_types(items, schema):
    """Merge ``key -> type(value)`` of every dict in ``items`` into ``schema``.

    Within one call the first type seen for a key wins; across calls the most
    recent call overwrites earlier entries (plain ``dict.update``). Anything
    that is not a list, and list items that are not dicts, are ignored.
    """
    if not isinstance(items, list):
        return
    seen = {}
    for item in items:
        if isinstance(item, dict):
            for k, v in item.items():
                seen.setdefault(k, type(v))
    schema.update(seen)


sense_relation_schemas = {
    'synonyms': sense_synonyms,
    'antonyms': sense_antonyms,
    'hypernyms': sense_hypernyms,
    'hyponyms': sense_hyponyms,
    'holonyms': sense_holonyms,
    'meronyms': sense_meronyms,
}

with open(file, 'r', encoding='utf-8') as f:
    for i, obj in tqdm(enumerate(f)):
        loaded = json.loads(obj)
        if not loaded or 'senses' not in loaded:
            continue
        for sense in loaded['senses']:
            if not isinstance(sense, dict):
                continue
            for relation, schema in sense_relation_schemas.items():
                _record_item_types(sense.get(relation), schema)
        _record_item_types(loaded.get('related'), related)

                    
                    

140922it [00:02, 59561.30it/s]


In [105]:
# Derive, for every list-valued key inside a sense, the schema of its items:
#   list of dicts  -> [{item_key: type, ...}]
#   list of lists  -> [types found inside the inner lists]
#   anything else  -> [type of the first element]
# Lists mixing several element types are only reported (their Counter is printed).
#
# Removed from the earlier draft: the unused `from pandas import value_counts`
# (a deprecated pandas name), the unused append-mode handle on all_words_file,
# the unused accumulators and the commented-out experiments.
file = ENR_file
subkey = 'senses'
sub_senses = {}


def _merge_list_schema(schema, key, value):
    """Fold the item types of list ``value`` into ``schema[key]``.

    For dict items a field that shows up with a second type is turned into
    ``["forced list", first_type, second_type, ...]``.
    """
    typecounts = Counter(type(x).__name__ for x in value)
    if not typecounts:
        # Empty list: nothing to learn.
        return
    if len(typecounts) > 1:
        print(typecounts)
        return

    if typecounts.get('dict'):
        if not schema.get(key):
            schema[key] = [{}]
        fields = schema[key][0]
        for item in value:
            for k, v in item.items():
                if k not in fields:
                    fields[k] = type(v)
                elif isinstance(fields[k], list):
                    if type(v) not in fields[k]:
                        fields[k].append(type(v))
                elif type(v) != fields[k]:
                    fields[k] = ["forced list", fields[k], type(v)]
    elif typecounts.get('list'):
        if not schema.get(key):
            schema[key] = []
        for inner in value:
            for v in inner:
                if type(v) not in schema[key]:
                    schema[key].append(type(v))
    else:
        schema[key] = [type(value[0])]


with open(file, 'r', encoding='utf-8') as f:
    for i, obj in tqdm(enumerate(f)):
        loaded = json.loads(obj)
        if not loaded or subkey not in loaded:
            continue
        for sense in loaded[subkey]:
            if not isinstance(sense, dict):
                continue
            for key, value in sense.items():
                if not isinstance(value, list):
                    continue
                try:
                    _merge_list_schema(sub_senses, key, value)
                except Exception as e:
                    # Keep scanning; report the offending line number.
                    print("error on ", i, ', ', e)
display(sub_senses)

140922it [00:04, 30118.21it/s]


{'links': [str],
 'synonyms': [{'word': str,
   'tags': list,
   'extra': str,
   'source': str,
   'alt': str,
   'english': str,
   'translation': str,
   'topics': list}],
 'glosses': [str],
 'tags': [str],
 'categories': [str],
 'wikipedia': [str],
 'form_of': [{'word': str, 'extra': str}],
 'raw_glosses': [str],
 'raw_tags': [str],
 'examples': [{'text': str,
   'bold_text_offsets': list,
   'translation': str,
   'english': str,
   'bold_translation_offsets': list,
   'type': str,
   'tags': list,
   'ref': str,
   'literal_meaning': str,
   'bold_literal_offsets': list,
   'roman': str,
   'raw_tags': list}],
 'alt_of': [{'word': str, 'extra': str}],
 'topics': [str],
 'attestations': [{'date': str, 'references': list}],
 'antonyms': [{'word': str}],
 'wikidata': [str],
 'senseid': [str],
 'hypernyms': [{'word': str}],
 'coordinate_terms': [{'word': str, 'english': str, 'translation': str}],
 'meronyms': [{'word': str}],
 'info_templates': [{'args': dict,
   'name': str,
   'ext

In [None]:
# Check the assumption that 'tags' and 'raw_tags' of every form are list[str];
# print (line number, key, value) for each violation.
# The earlier version only inspected values that were NOT lists and compared
# type(...) with the string 'str', so a list holding non-strings went unnoticed.
# `file` must already be set by an earlier cell.
forms = {'form': str, 'tags': list[str], 'source': str, 'raw_tags': list[str], 'head_nr': int}
sub_forms = {'tags': list[str], 'raw_tags': list[str]}
subkey = 'forms'
with open(file, 'r', encoding='utf-8') as f:
    for i, obj in tqdm(enumerate(f)):
        loaded = json.loads(obj)
        if not loaded or subkey not in loaded:
            continue
        for sub in loaded[subkey]:
            if not isinstance(sub, dict):
                continue
            for key in sub_forms:
                if key not in sub:
                    continue
                value = sub[key]
                if not (isinstance(value, list) and all(isinstance(x, str) for x in value)):
                    print(i, key, value)
            

140922it [00:02, 57842.00it/s]


In [None]:
# Schema of a 'derived' item, with list-valued keys annotated as list[str].
derived = {
    'word': str, 'lang': str, 'lang_code': str,
    'tags': list[str],
    'ipa': str, 'audio': str, 'ogg_url': str, 'mp3_url': str,
    'parts': list[str],
    'name': str, 'args': dict, 'expansion': str,
    'raw_tags': list[str],
    'roman': str, 'rhymes': str, 'homophone': str, 'sense': str,
    'descendants': list[str],
    'english': str, 'translation': str, 'note': str, 'text': str, 'other': str,
    'topics': list[str],
    'alt': str, 'taxonomic': str,
    'ruby': list[str],
}
# Same keys, types and order as `derived`, held in an independent dict.
descendants = dict(derived)

In [125]:
# Keep only the container-valued keys of `derived`.
# ``__origin__`` unwraps parameterised aliases, so ``list[str]`` is recognised
# as a list as well as a bare ``list``.
derived_list_dict = {
    k: v for k, v in derived.items()
    if getattr(v, '__origin__', v) in (list, dict)
}
derived_list_dict

{'tags': list,
 'parts': list,
 'args': dict,
 'raw_tags': list,
 'descendants': list,
 'topics': list,
 'ruby': list}

In [126]:
# Check the assumption that the list-valued keys of every 'derived' item hold
# list[str]; print (line number, key, value) for each violation.
# The earlier version only inspected values that were NOT lists and compared
# type(...) with the string 'str', so a list holding non-strings went unnoticed.
# `file` must already be set by an earlier cell.
desc_list_dict = {
    'tags': list[str],
    'parts': list[str],
    'args': dict,
    'raw_tags': list[str],
    'descendants': list[str],
    'topics': list[str],
    'ruby': list[str],
}
subkey = 'derived'
list_of_str_keys = [k for k, v in desc_list_dict.items() if v == list[str]]

with open(file, 'r', encoding='utf-8') as f:
    for i, obj in tqdm(enumerate(f)):
        loaded = json.loads(obj)
        if not loaded or subkey not in loaded:
            continue
        for sub in loaded[subkey]:
            if not isinstance(sub, dict):
                continue
            for key, value in sub.items():
                if key in list_of_str_keys and not (
                    isinstance(value, list) and all(isinstance(x, str) for x in value)
                ):
                    print(i, key, value)
                elif key == 'ruby':
                    # Probe kept from the earlier version: show every 'ruby'
                    # value, including well-formed ones.
                    print(i, key, value)

140922it [00:02, 67940.30it/s]


In [67]:
# Show both schema dicts: display() renders the first, the bare last expression the second.
display(holonyms)
meronyms

{'word': str, 'tags': list}

{'word': str, 'alt': str}

In [None]:
# if 'senses' in loaded:
#     for j, item in enumerate(loaded['senses']):
#         item_type = type(item)
#         if isinstance(item, dict):
#             for k, v in item.items():
#                 temp_sense_dict[k] = type(v)
#     senses_structure.update(temp_sense_dict)                
# if 'forms' in loaded:
#     for j, item in enumerate(loaded['forms']):
#         for k, v in item.items():
#             temp_forms_dict[k] = type(v)
#     forms.update(temp_forms_dict)

AttributeError: 'dict' object has no attribute 'sort'

### NL Structure

In [7]:
# Empty accumulator; nothing in the visible cells appends to it.
substructures = []

In [20]:
# NNF / NEF paths directly under today's save folder.
# NOTE(review): the folder-creation cell builds <current_save_folder>/<lang>/<set>
# (e.g. .../nl/NNF), whereas these paths omit the language level -- confirm
# which layout is intended.
NNF_out = Path(current_save_folder, 'NNF')
NEF_out = Path(current_save_folder, 'NEF')

In [22]:
# Second empty accumulator; nothing in the visible cells appends to it.
substructures2 = []

In [5]:
def safe_dict(obj_str: str):
    """Evaluate a string holding a Python literal without raising.

    Returns the value produced by ``ast.literal_eval`` (not necessarily a
    dict), ``""`` when the string cannot be evaluated, and ``None`` when
    ``obj_str`` is not a string at all.
    """
    if not isinstance(obj_str, str):
        return None
    try:
        parsed = ast.literal_eval(obj_str)
    except Exception:
        return ""       # fallback
    return parsed

In [6]:
def filter_obj(obj):
    """Strip fields that are not needed downstream from an entry, in place.

    * Removes the top-level keys 'anagrams', 'proverbs' and 'pos_title'.
    * Removes 'audio', 'ogg_url' and 'mp3_url' from every dict in 'sounds';
      sound dicts left empty by that are dropped from the list.

    Args:
        obj: Entry dict; it is modified directly.

    Returns:
        None.
    """
    for unwanted in ('anagrams', 'proverbs', 'pos_title'):
        obj.pop(unwanted, None)

    if 'sounds' in obj:
        remove_sounds = ('audio', 'ogg_url', 'mp3_url')
        new_sounds = []
        for sound in obj['sounds']:
            # Rebuild each sound dict without the audio keys. The earlier code
            # appended the whole, unfiltered dict once per surviving key and
            # wrapped the copies in an extra list.
            kept = {k: v for k, v in sound.items() if k not in remove_sounds}
            if kept:
                new_sounds.append(kept)
        obj['sounds'] = new_sounds

In [8]:
def make_def_obj(obj) -> dict:
    """Build a compact definition record from one entry.

    The record always carries ``word``, ``lang_code``, ``pos`` and ``glosses``.
    ``glosses`` holds one element per sense that has a ``glosses`` key (the
    sense's value is appended as-is, so this is normally a list of lists).
    When the entry has a top-level ``translations`` list, the record also gets
    ``translations``, restricted to items whose ``lang_code`` is ``'en'`` or ``'nl'``.

    Args:
        obj: mapping for one entry.

    Returns:
        The new record; ``obj`` itself is not modified.

    Raises:
        KeyError: if ``word``, ``lang_code`` or ``pos`` is missing from ``obj``.
    """
    new_obj = {
        'word': obj['word'],
        'lang_code': obj['lang_code'],
        'pos': obj['pos'],
    }

    senses = obj['senses'] if 'senses' in obj else []
    # NOTE(review): the previous version also gathered sense['translation'] into
    # a list that was never used; that dead code is removed here. If sense-level
    # translations are wanted, decide on the key name and output shape first.
    new_obj['glosses'] = [sense['glosses'] for sense in senses if 'glosses' in sense]

    if 'translations' in obj:
        new_obj['translations'] = [
            item for item in obj['translations']
            if item.get('lang_code', '') in ('en', 'nl')
        ]
    return new_obj

In [77]:
# Dated output path for the combined word list.
# NOTE(review): distinct from `all_words_file` (under WIKT_CLEANING_DIR), which is
# the path the processing cell actually writes to — confirm which one is intended.
all_words = Path(current_save_folder, 'all_words.jsonl')

In [13]:
# In-memory list of every record built by make_def_obj in the processing cell.
all_words_defs = []

In [14]:

# Read every entry of the source file, strip unneeded fields, reduce it to a
# compact definition record and append the records to `all_words_file` in batches.
nl_file = NNR_file
# NOTE(review): `file_len` counts a hard-coded, dated NEF file rather than `nl_file`,
# and neither `file_len` nor `outfile` is used in this cell — confirm and clean up.
curr_NEF_file = Path(INTERIM_DATA_DIR, 'cleaning', 'wikt', '11-11-25', 'NEF', 'NEF.jsonl')
file_len = count_lines_with_progress(curr_NEF_file)
outfile = Path(current_save_folder, 'NEF', 'NEF_filtered.jsonl')
batch = []
with open(nl_file, 'r', encoding='utf-8') as f:
    # Append mode: re-running this cell appends the same records a second time.
    with open(all_words_file, 'a+', encoding='utf-8') as out:
        lines = f.readlines()
        print(len(lines))
        for line in tqdm(lines, total=len(lines)):
            loaded = json.loads(line)

            # Some lines decode to a string holding a Python-literal dict.
            if isinstance(loaded, str):
                loaded = safe_dict(loaded)

            if loaded:
                filter_obj(loaded)
                new_obj = make_def_obj(loaded)
                if new_obj:
                    # NOTE(review): tagged 'NEF' although the input is NNR_file — confirm.
                    new_obj['wl_code'] = 'NEF'
                    all_words_defs.append(new_obj)
                    batch.append(new_obj)

            if len(batch) > 1000:
                for obj in batch:
                    json.dump(obj, out)
                    out.write('\n')
                batch = []

        # Flush the final partial batch. Previously these trailing records were
        # only displayed and never written, so the output file was missing them.
        for obj in batch:
            json.dump(obj, out)
            out.write('\n')

# `batch` is deliberately not cleared after the final flush so the tail of the
# run can still be inspected here.
display(batch)

Counting Lines: 100%|██████████| 8.42M/8.42M [00:00<00:00, 1.26GB/s]


611444


611444it [00:18, 33005.61it/s]


[{'word': 'kiezelhard',
  'lang_code': 'nl',
  'pos': 'adj',
  'glosses': [['heel erg stevig, heel erg sterk']],
  'wl_code': 'NEF'},
 {'word': 'toneelschrijfkunst',
  'lang_code': 'nl',
  'pos': 'noun',
  'glosses': [['schrijven van toneelstukken als een literaire kunstuiting']],
  'wl_code': 'NEF'},
 {'word': 'drinkvermogen',
  'lang_code': 'nl',
  'pos': 'noun',
  'glosses': [['de hoeveelheid alcoholische dranken die iemand kan drinken, wat problematisch kan worden bij overmatig of zwaar drinken']],
  'wl_code': 'NEF'},
 {'word': 'uitroepingsteken',
  'lang_code': 'nl',
  'pos': 'noun',
  'glosses': [['!, een leesteken dat uitdrukt dat de zin een uitroep, bevel of uitdrukking van verbazing is.']],
  'wl_code': 'NEF'},
 {'word': 'punthaak',
  'lang_code': 'nl',
  'pos': 'noun',
  'glosses': [['[<][>] elk van beide spiegelbeeldige puntvormige leestekens waarmee een bepaald deel van een tekst kan worden gemarkeerd']],
  'wl_code': 'NEF'},
 {'word': 'dadfluencer',
  'lang_code': 'nl',
 

In [83]:
# Inspect the accumulated records. This echoes the whole list; prefer
# len(all_words_defs) or a slice once the list is large.
all_words_defs

[]

In [24]:
# Pickle the `kv2` mapping into the NEF output folder.
with Path(NEF_out, f"NLS_to_line_{17441}.pkl").open('wb') as out:
    pickle.dump(kv2, out)

In [16]:
# Pickle the `kv` mapping (built by the cell that aggregates `substructures`)
# into today's save folder.
with Path(current_save_folder, f"NLS_to_line_{611448}.pkl").open('wb') as out:
    pickle.dump(kv, out)

In [18]:
# List the top-level keys collected in `kv`.
display(kv.keys())

dict_keys(['word', 'lang_code', 'lang', 'pos', 'pos_title', 'senses', 'categories', 'etymology_texts', 'sounds', 'antonyms', 'derived', 'proverbs', 'hyponyms', 'related', 'translations', 'hyphenations', 'tags', 'synonyms', 'notes', 'forms', 'anagrams', 'hypernyms', 'homophones', 'holonyms', 'metonyms', 'descendants', 'abbreviations', 'paronyms'])

In [None]:
# Top-level keys as listed by `kv.keys()`, kept in the same order.
NNL_keys = [
    'word', 'lang_code', 'lang', 'pos',
    'pos_title', 'senses', 'categories', 'etymology_texts',
    'sounds', 'antonyms', 'derived', 'proverbs',
    'hyponyms', 'related', 'translations', 'hyphenations',
    'tags', 'synonyms', 'notes', 'forms',
    'anagrams', 'hypernyms', 'homophones', 'holonyms',
    'metonyms', 'descendants', 'abbreviations', 'paronyms',
]

In [153]:
# For every key, gather the distinct values seen across all collected structures.
# Values are kept in a list in first-seen order and compared with `in`.
kv = {}
sub2 = []
for structure in tqdm(substructures, total=len(substructures)):
    for key, value in structure.items():
        seen_values = kv.setdefault(key, [])
        if value not in seen_values:
            seen_values.append(value)
    


100%|██████████| 91386/91386 [00:02<00:00, 39460.00it/s] 


In [29]:
# NOTE: despite its name, this holds the value shapes collected for the 'notes'
# key, not for 'senses'.
senses_kv = kv['notes']
senses_kv

[[str]]

In [136]:
# Number of structure mappings collected so far.
print(len(substructures))

64428


In [146]:
# Pickle the collected `substructures` list to NL_structures.pkl in today's save folder.
# `pickle` is already imported in the notebook's first cell, so it is not re-imported here.
outfile = Path(current_save_folder, 'NL_structures.pkl')
# parents=True so a missing intermediate folder does not abort the dump.
current_save_folder.mkdir(parents=True, exist_ok=True)
with open(outfile, 'wb') as f:
    pickle.dump(substructures, f)
    
        
    
        
        
        

In [None]:
# Make sure today's save folder exists. The keyword is `exist_ok`;
# the previous `exist_okay` raised TypeError.
Path.mkdir(current_save_folder, exist_ok=True)


## Get Definitions and Translations


In [None]:
# Parse every line of the EER file as JSON, reporting lines that fail to decode.
# NOTE(review): this cell is unfinished — the output file is opened but nothing is
# written to it yet, and `words_definitions` is never filled.
# `current_save_folder` replaces `current_save_path`, which no visible cell defines.
EER_word_definitions_translations = Path(current_save_folder, 'EER', 'EER_definitions_translations.jsonl')
words_definitions = {}
with open(EER_file, 'r', encoding='utf-8', errors='ignore') as f:
    # Create the output folder unconditionally and idempotently. The previous
    # "mkdir if the file is missing" raised FileExistsError whenever the folder
    # already existed but the file did not.
    EER_word_definitions_translations.parent.mkdir(parents=True, exist_ok=True)
    with open(EER_word_definitions_translations, 'a+', encoding='utf-8') as out:
        lines = f.readlines()
        for raw_line in tqdm(lines, total=len(lines)):
            try:
                obj = json.loads(raw_line)

            except Exception as e:
                print()
                print(e)

## Key categories discovered during previous exploration

# Top-level keys recorded for the en_en entries during the earlier exploration.
en_en_top_keys = [
    'word', 'pos', 'lang_code', 'abbreviations', 'antonyms',
    'categories', 'coordinate_terms', 'derived', 'descendants', 'etymology_number',
    'etymology_templates', 'etymology_text', 'form_of', 'forms', 'head_templates',
    'holonyms', 'hypernyms', 'hyphenation', 'hyphenations', 'hyponyms',
    'inflection_templates', 'info_templates', 'instances', 'lang', 'meronyms',
    'original_title', 'related', 'senses', 'sounds', 'source',
    'synonyms', 'translations', 'troponyms', 'wikidata', 'wikipedia',
]

In [None]:
# Top-level keys recorded for the en_nl entries during the earlier exploration.
en_nl_top_keys = [
    'pos', 'head_templates', 'inflection_templates', 'forms',
    'descendants', 'sounds', 'hyphenations', 'etymology_text',
    'etymology_templates', 'word', 'lang', 'lang_code',
    'senses', 'etymology_number', 'derived', 'related',
    'antonyms', 'wikipedia', 'hypernyms', 'synonyms',
    'categories', 'coordinate_terms', 'hyponyms', 'abbreviations',
    'holonyms', 'meronyms',
]

In [None]:
senses
pos
head_templates
forms
derived
descendants
sounds
hyphenations
etymology_text
etymology_templates
word
lang
lang_code
inflection_templates
categories
related
etymology_number
synonyms
antonyms
wikipedia
hypernyms
hyponyms
holonyms
coordinate_terms
meronyms

In [None]:
# Column names recorded for the EEP senses table, in their original order.
eep_senses = [
    'word', 'pos', 'lang_code', 'senses', 'alt_of',
    'antonyms', 'attestations', 'categories', 'coordinate_terms', 'derived',
    'examples', 'form_of', 'glosses', 'head_nr', 'holonyms',
    'hypernyms', 'hyponyms', 'id', 'info_templates', 'instances',
    'links', 'meronyms', 'qualifier', 'raw_glosses', 'raw_tags',
    'related', 'senseid', 'synonyms', 'tags', 'taxonomic',
    'topics', 'translations', 'troponyms', 'wikidata', 'wikipedia',
]

In [None]:
"""
EEP Senses
1   word              613365 non-null  object 
 2   pos               613365 non-null  object 
 3   lang_code         613365 non-null  object 
 4   senses            613365 non-null  object 
 5   alt_of            60977 non-null   object 
 6   antonyms          3924 non-null    object 
 7   attestations      3712 non-null    object 
 8   categories        600737 non-null  object 
 9   coordinate_terms  8923 non-null    object 
 10  derived           25243 non-null   object 
 11  examples          128798 non-null  object 
 12  form_of           239885 non-null  object 
 13  glosses           612962 non-null  object 
 14  head_nr           162 non-null     float64
 15  holonyms          189 non-null     object 
 16  hypernyms         4482 non-null    object 
 17  hyponyms          2284 non-null    object 
 18  id                613365 non-null  object 
 19  info_templates    81 non-null      object 
 20  instances         12 non-null      object 
 21  links             599259 non-null  object 
 22  meronyms          229 non-null     object 
 23  qualifier         9498 non-null    object 
 24  raw_glosses       165385 non-null  object 
 25  raw_tags          5429 non-null    object 
 26  related           38556 non-null   object 
 27  senseid           5743 non-null    object 
 28  synonyms          55151 non-null   object 
 29  tags              485883 non-null  object 
 30  taxonomic         34 non-null      object 
 31  topics            91046 non-null   object 
 32  translations      64502 non-null   object 
 33  troponyms         16 non-null      object 
 34  wikidata          2232 non-null    object 
 35  wikipedia         27768 non-null   object 
 """

In [None]:
ENP Senses
'word', 'pos', 'lang_code', 'antonyms', 'categories',
       'coordinate_terms', 'derived', 'descendants', 'etymology_templates',
       'etymology_text', 'forms', 'head_templates', 'holonyms', 'hypernyms',
       'hyphenations', 'hyponyms', 'inflection_templates', 'meronyms',
       'related', 'senses', 'sounds', 'synonyms'

In [None]:
NNP
'word', 'pos', 'lang_code', 'antonyms', 'categories',
       'derived', 'descendants', 'etymology_texts', 'forms', 'holonyms',
       'homophones', 'hypernyms', 'hyphenations', 'hyponyms', 'metonyms',
       'notes', 'related', 'senses', 'sounds', 'synonyms', 'tags',
       'translations'

In [None]:
# Combined list of top-level keys recorded across the nl and en exploration runs.
combined_nl_en_top_keys = [
    'word', 'pos', 'lang_code', 'senses', 'abbreviations',
    'anagrams', 'antonyms', 'categories', 'coordinate_terms', 'derived',
    'descendants', 'etymology_number', 'etymology_templates', 'etymology_text', 'etymology_texts',
    'form_of', 'forms', 'head_templates', 'holonyms', 'homophones',
    'hypernyms', 'hyphenation', 'hyphenations', 'hyponyms', 'inflection_templates',
    'info_templates', 'instances', 'invalid', 'lang', 'meronyms',
    'metonyms', 'notes', 'origin', 'original_title', 'paronyms',
    'pos_title', 'proverbs', 'related', 'sounds', 'source',
    'synonyms', 'tags', 'translations', 'troponyms', 'wikidata',
    'wikipedia',
]