In [None]:
from dutchanalyzer.config import *
from dutchanalyzer.utilities.utils import *
from dutchanalyzer.utilities.json_utils import *
from dutchanalyzer.utilities.replacement_utils import *

import json
from pathlib import Path

import datetime
import re

import ast
from tqdm import tqdm
import pandas as pd
import numpy as np

In [2]:
NNF_file = Path(WIKT_CLEANING_DIR, 'nl','NNF.jsonl')
NEF_file = Path(WIKT_CLEANING_DIR, 'nl','NEF.jsonl')
EEF_file = Path(WIKT_CLEANING_DIR, 'en','EEF.jsonl')
ENF_file = Path(WIKT_CLEANING_DIR, 'en','ENF.jsonl')
NEF_definitions = Path(WIKT_CLEANING_DIR, 'nl','NEF_definitions.jsonl')
NNF_definitions = Path(WIKT_CLEANING_DIR, 'nl','NNF_definitions.jsonl')
ENF_definitions = Path(WIKT_CLEANING_DIR, 'en','ENF_definitions.jsonl')
EEF_definitions = Path(WIKT_CLEANING_DIR, 'en','EEF_definitions.jsonl')

In [3]:
EER2_file = Path(EER_DIR, 'EER2.jsonl')
ENR2_file = Path(ENR_DIR, 'ENR2.jsonl')
NER2_file = Path(NER_DIR, 'NER2.jsonl')
NNR2_file = Path(NNR_DIR, 'NNR2.jsonl')

In [4]:
today = datetime.date.today().__format__("%d-%m-%y")

current_save_folder = Path(INTERIM_DATA_DIR, 'cleaning', 'wikt', str(today))
previous_save_folder = get_previous_save_folder(WIKT_CLEANING_DIR, most_recent=True)

In [5]:
if not Path.exists(current_save_folder):
    Path.mkdir(current_save_folder, parents=True)

## Utils

In [10]:
def get_translations_list(tlist: list):
    new_t_list = []
    if tlist:
        for tl in tlist:
            if type(tl) == dict:
                if t1.get('word'):
                    new_t_list.append(tl)
                else:
                    tl.get('sense')
    return new_t_list
    

In [11]:
def apply_translations_list(df, col):
    new_col_df = []
    
    for i, row in df.iterrows():
        newt = []
        if row[col]:
            newt = [x.get('word') for x in safe_eval(row[col]) if x.get('word')]    
        new_col_df.append({'translation_list': newt})
    return new_col_df

## Import DFs

In [None]:
last_df_folder = Path(WIKT_CLEANING_DIR, '13-11-25')

In [None]:
NNF_df = pd.read_csv(Path(current_save_folder, 'NNF_definitions_df.csv'), index_col=0)
EEF_df = pd.read_csv(Path(current_save_folder, 'EEF_definitions_df.csv'), index_col=0)
NEF_df = pd.read_csv(Path(current_save_folder, 'NEF_definitions_df.csv'), index_col=0)
ENF_df = pd.read_csv(Path(current_save_folder, 'ENF_definitions_df.csv'), index_col=0)

  NNF_df = pd.read_csv(Path(last_df_folder, 'NNF_definitions_df.csv'), index_col=0)


In [14]:
EEF_df = pd.read_csv(Path(current_save_folder, 'EEF_definitions_df.csv'), index_col=0)

In [7]:
NNF_df = pd.read_csv(Path(current_save_folder, 'NNF_definitions_df.csv'), index_col=0)

  NNF_df = pd.read_csv(Path(current_save_folder, 'NNF_definitions_df.csv'), index_col=0)


In [None]:
NEF_df.to_csv(Path(current_save_folder, 'NEF_definitions_df.csv'))

In [15]:
NEF_df.to_csv(Path(current_save_folder, 'NEF_definitions_df.csv'))
EEF_df.to_csv(Path(current_save_folder, 'EEF_definitions_df.csv'))
NNF_df.to_csv(Path(current_save_folder, 'NNF_definitions_df.csv'))
ENF_df.to_csv(Path(current_save_folder, 'ENF_definitions_df.csv'))

In [18]:
display(EEF_df.info(verbose=True))
EEF_df

<class 'pandas.core.frame.DataFrame'>
Index: 1464321 entries, 536236 to 198944
Data columns (total 9 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   word             1464292 non-null  object
 1   pos              1464321 non-null  object
 2   lang_code        1464321 non-null  object
 3   sense_code       1464321 non-null  int64 
 4   gloss_code       1464321 non-null  int64 
 5   gloss            1463461 non-null  object
 6   wl_code          1464321 non-null  object
 7   nl_translations  125730 non-null   object
 8   forms            1176821 non-null  object
dtypes: int64(2), object(7)
memory usage: 111.7+ MB


None

Unnamed: 0,word,pos,lang_code,sense_code,gloss_code,gloss,wl_code,nl_translations,forms
536236,,punct,en,0,0,Used to space out letters in words relating to...,EEF,,"[{'form': '] [', 'tags': ['canonical']}]"
130442,',symbol,en,0,0,See ’,EEF,,
130443,',particle,en,0,0,See -'.,EEF,,
993527,'Arries,noun,en,0,0,plural of 'Arry,EEF,,"[{'word': ""'Arry""}]"
993528,'Arriet,noun,en,0,0,A Cockney woman.,EEF,,"[{'form': ""'Arriets"", 'tags': ['plural']}]"
...,...,...,...,...,...,...,...,...,...
96467,,noun,en,1,0,Synonym of nancy: an effeminate male homosexual.,EEF,,"[{'form': 'nans', 'tags': ['plural']}]"
96468,,noun,en,2,0,Synonym of nursemaid.,EEF,,"[{'form': 'nans', 'tags': ['plural']}]"
96469,,noun,en,3,0,Synonym of grandmother.,EEF,,"[{'form': 'nans', 'tags': ['plural']}]"
96470,,noun,en,0,0,Alternative spelling of naan.,EEF,,"[{'form': 'nans', 'tags': ['plural']}]"


## Sort and Refine Values

In [14]:
EEF_df.sort_values(by='word', inplace=True)
NNF_df.sort_values(by='word', inplace=True)
NEF_df.sort_values(by='word', inplace=True)
ENF_df.sort_values(by='word', inplace=True)

In [6]:
EEF_df.sort_values(by='word', inplace=True)

In [11]:
NNF_df['en_translations'].value_counts()

en_translations
[{'word': 'moderator', 'sense': None}]                                                                                                                                                                                                                                                                                   25
[{'word': 'affiliate', 'sense': None}]                                                                                                                                                                                                                                                                                   24
[{'word': 'delineate', 'sense': '1 ergens een lijn omheen tekenen'}, {'word': 'draw', 'sense': '1 ergens een lijn omheen tekenen'}]                                                                                                                                                                                      22
[{'word': 'in', 'sense': '-12. in (r

## Pull Word Level/Short/Long Translations

In [12]:
EEF_df

Unnamed: 0,word,pos,lang_code,sense_code,gloss_code,gloss,wl_code,nl_translations,forms
536236,,punct,en,0,0,Used to space out letters in words relating to...,EEF,,"[{'form': '] [', 'tags': ['canonical']}]"
130442,',symbol,en,0,0,See ’,EEF,,
130443,',particle,en,0,0,See -'.,EEF,,
993527,'Arries,noun,en,0,0,plural of 'Arry,EEF,,"[{'word': ""'Arry""}]"
993528,'Arriet,noun,en,0,0,A Cockney woman.,EEF,,"[{'form': ""'Arriets"", 'tags': ['plural']}]"
...,...,...,...,...,...,...,...,...,...
96467,,noun,en,1,0,Synonym of nancy: an effeminate male homosexual.,EEF,,"[{'form': 'nans', 'tags': ['plural']}]"
96468,,noun,en,2,0,Synonym of nursemaid.,EEF,,"[{'form': 'nans', 'tags': ['plural']}]"
96469,,noun,en,3,0,Synonym of grandmother.,EEF,,"[{'form': 'nans', 'tags': ['plural']}]"
96470,,noun,en,0,0,Alternative spelling of naan.,EEF,,"[{'form': 'nans', 'tags': ['plural']}]"


In [13]:
EEF_translations_df = EEF_df.loc[:, ['word', 'pos', 'lang_code', 'nl_translations']]
NNF_translations_df = NNF_df.loc[:, ['word', 'pos', 'lang_code', 'en_translations']]

In [23]:
new_translations_list = []
for i, row in tqdm(EEF_translations_df.iterrows()):
    translations = row['nl_translations']
    
    if translations:
        if type(translations) != float:
            new_list = get_translations_list(translations)
        else:
            new_list = []
    new_translations_list.append({'new_translations': new_list})
EEF_df['new_translations'] = new_translations_list

1464321it [00:37, 39100.23it/s]


In [25]:
EEF_df['new_translations'].value_counts()

new_translations
{'new_translations': []}    1464321
Name: count, dtype: int64

In [None]:
def get_translations_list(tlist: list):
    new_t_list = []
    if tlist:
        for tl in tlist:
            if type(tl) == dict:
                if tl.get('word'):
                    new_t_list.append(tl)
                else:
                    tl.get('sense')
    return new_t_list

In [None]:
print(EEF_translations_df['nl_translations'].value_counts())
EEF_translations_df['nl_translations'].info(verbose=True)

## 17-11-25

In [7]:
NNF2 = []

In [32]:
NNF2_full_lines = []

In [8]:
NNF_senses = []
NNF_translations = []
NNF_synonyms = []
NNF_forms = []

In [14]:
def strip_obj(obj):
    keep_keys = ['word', 'pos', 'lang_code', 'lang', 'standard_lang', 'forms', 'senses', 'translations', 'synonyms', 'etemology_templates', 'sounds']
    obj_keys = list(obj.keys())
    for k in obj_keys:
        if k not in keep_keys:
            obj.pop(k)

In [33]:
with open(NNR2_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        obj = json.loads(line)
        if obj:
            #strip_obj(obj)
            NNF2_full_lines.append(obj)

In [16]:
NNF2_df = pd.DataFrame(NNF2)

In [34]:
NNF2_full_df = pd.DataFrame(NNF2_full_lines)

In [36]:
NNF2_full_df['word_code'] = NNF2_full_df['word'] + '_' + NNF2_full_df['pos']

In [40]:
NNF2_full_df['dups'] = NNF2_full_df.duplicated(subset=['word_code'])

In [41]:
NNF2_full_df

Unnamed: 0,word,pos,lang_code,lang,standard_lang,senses,antonyms,categories,derived,etymology_texts,...,forms,hypernyms,homophones,holonyms,metonyms,descendants,abbreviations,paronyms,word_code,dups
0,ja,adv,nl,Nederlands,dutch,"[{'glosses': ['duidt bevestiging, instemming, ...","[{'word': 'neen'}, {'word': 'nee'}]","[Bijwoord in het Nederlands, Ontbrekend geluid...","[{'word': 'ja knikken'}, {'word': 'ja-neevraag...",[In de betekenis van ‘tussenwerpsel: uitroep t...,...,,,,,,,,,ja_adv,False
1,ja,intj,nl,Nederlands,dutch,"[{'glosses': ['kreet van opwinding'], 'example...",,"[Ontbrekend geluid, Retrograad van het Nederla...",,[In de betekenis van ‘tussenwerpsel: uitroep t...,...,,,,,,,,,ja_intj,False
2,ja,noun,nl,Nederlands,dutch,[{'glosses': ['bevestigend of instemmend antwo...,,"[Ontbrekend geluid, Retrograad van het Nederla...",,[In de betekenis van ‘tussenwerpsel: uitroep t...,...,,,,,,,,,ja_noun,False
3,neen,intj,nl,Nederlands,dutch,[{'glosses': ['ontkenning van de gestelde vraa...,,"[Ontbrekend geluid, Retrograad van het Nederla...",,[In de betekenis van ‘tussenwerpsel: uitroep t...,...,,,,,,,,,neen_intj,False
4,een,article,nl,Nederlands,dutch,[{'glosses': ['onbepaald lidwoord dat in het N...,,"[Erfwoord_in_het_Nederlands, Lidwoord in het N...","[{'word': 'eenzelfde'}, {'word': 'eene'}, {'wo...","[erfwoord, via Middelnederlands een van Oudned...",...,"[{'form': '[A] een', 'tags': ['canonical']}, {...",,,,,,,,een_article,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609641,mottigaardje,noun,nl,Nederlands,dutch,[{'glosses': ['verkleinwoord enkelvoud van het...,,"[Ontbrekend geluid, Retrograad van het Nederla...",,,...,,,,,,,,,mottigaardje_noun,False
609642,mottigaards,noun,nl,Nederlands,dutch,[{'glosses': ['meervoud van het zelfstandig na...,,"[Ontbrekend geluid, Retrograad van het Nederla...",,,...,,,,,,,,,mottigaards_noun,False
609643,bankpootjes,noun,nl,Nederlands,dutch,[{'glosses': ['verkleinwoord meervoud van het ...,,"[Ontbrekend geluid, Retrograad van het Nederla...",,,...,,,,,,,,,bankpootjes_noun,False
609644,bankpootje,noun,nl,Nederlands,dutch,[{'glosses': ['verkleinwoord enkelvoud van het...,,"[Ontbrekend geluid, Retrograad van het Nederla...",,,...,,,,,,,,,bankpootje_noun,False


In [42]:
NNF2_full_df_dups = NNF2_full_df[NNF2_full_df['dups'] == True]

In [45]:
NNF2_full_df_dups.drop(columns=['metonyms', 'abbreviations', 'paronyms'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NNF2_full_df_dups.drop(columns=['metonyms', 'abbreviations', 'paronyms'], inplace=True)


In [46]:
NNF2_full_df_dups.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 3004 entries, 13 to 608760
Data columns (total 25 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   word             3004 non-null   object
 1   pos              3004 non-null   object
 2   lang_code        3004 non-null   object
 3   lang             3004 non-null   object
 4   standard_lang    3004 non-null   object
 5   senses           3004 non-null   object
 6   antonyms         24 non-null     object
 7   categories       3004 non-null   object
 8   derived          290 non-null    object
 9   etymology_texts  1668 non-null   object
 10  hyphenations     2999 non-null   object
 11  hyponyms         179 non-null    object
 12  related          253 non-null    object
 13  sounds           3001 non-null   object
 14  translations     3004 non-null   object
 15  tags             1924 non-null   object
 16  notes            47 non-null     object
 17  synonyms         375 non-null    ob

In [18]:
NNF2_df['word_code'] = NNF2_df['word'] + '_' + NNF2_df['pos']

In [19]:
dups = NNF2_df.duplicated(subset=['word_code'])

In [21]:
NNF2_df['dup'] = dups

In [23]:
dup_rows = NNF2_df[NNF2_df['dup'] == True]

In [26]:
groups = dup_rows.groupby(dup_rows['word'], group_keys='word_code')

In [None]:
def highlight(d):
    df = pd.DataFrame(columns=d.columns, index=d.index)
    
    col1 = d.columns[0]
    col2 = d.columns[1]
    df[[col1, col2]] = 'background: None'
    
    df.loc[d[col1].ne(d[col2]), [col1, col2]] = 'background: yellow'

    return df
    


NameError: name 'df' is not defined

In [70]:
def standardize_forms(form_dict):
    if isinstance(form_dict, dict):
        new_forms = []
        if 'form' in form_dict:
            form = form_dict['form']
            if ']' in form:
                form = form.split(']')

                print('split form',form)
                if len(form) > 1:
                    form = form[1]
                    form_dict['form'] = form[1]
            return form_dict
                #form_dict['form'] = form.split(']')[1]
    
        
    

In [73]:
NNF2_full_df_dups[NNF2_full_df_dups.apply(lambda x: [standardize_forms(y) for y in x['forms']])]

KeyError: 'forms'

In [68]:
NNF2_full_df_dups['forms'].value_counts()

forms
[{'form': '[B]', 'tags': ['canonical']}]                                                                                                                    7
[{'form': 'beren', 'tags': ['plural']}, {'form': 'beertje', 'tags': ['diminutive', 'singular']}, {'form': 'beertjes', 'tags': ['diminutive', 'plural']}]    4
[{'form': 'kolot', 'tags': ['plural']}]                                                                                                                     3
[{'form': 'voerde over', 'tags': ['past']}, {'form': 'overgevoerd', 'tags': ['past', 'participle']}]                                                        3
[{'form': '[B] mul', 'tags': ['canonical']}]                                                                                                                3
                                                                                                                                                           ..
[{'form': 'overíjle', 'tags': ['canonical']}] 

In [67]:
display(forms.value_counts())

Series([], Name: count, dtype: int64)

In [None]:
froms = NNF2_full_df_dups.apply()

In [None]:
drop_indexes = []
same_word_code = []
counter = 0
last_row_index = 0
last_row = NNF2_full_df_dups.iloc[0, :]
new_rows = []
for i, row in NNF2_full_df_dups.iterrows():
    if counter != 0:
        if row.equals(last_row):
            print('equal_rows')
            print(i)
            drop_indexes.append(i)
        else:
            compared = NNF2_full_df_dups.loc[i].compare(NNF2_full_df_dups.loc[last_index])
            
            display(row)
            display(last_row)
        
            #highlight(NNF2_full_df_dups)
            last_row = row

            last_index = i
            last_row = row     
    
    
    counter += 1
    if counter > 3:
        break

Unnamed: 0,self,other
derived,"[{'word': 'de dato'}, {'word': 'de facto'}, {'...",
forms,"[{'form': '[D] de', 'tags': ['canonical']}]","[{'form': '[C] de', 'tags': ['canonical']}]"


word                                                              de
pos                                                             prep
lang_code                                                         nl
lang                                                      Nederlands
standard_lang                                                  dutch
senses             [{'glosses': ['van'], 'raw_tags': ['alleen in ...
antonyms                                                         NaN
categories         [Ontbrekend geluid, Retrograad van het Nederla...
derived            [{'word': 'de dato'}, {'word': 'de facto'}, {'...
etymology_texts                                                  NaN
hyphenations                                                     NaN
hyponyms                                                         NaN
related                                                          NaN
sounds                                             [[{'ipa': 'də'}]]
translations                      

word                                                              de
pos                                                             prep
lang_code                                                         nl
lang                                                      Nederlands
standard_lang                                                  dutch
senses             [{'glosses': ['van'], 'raw_tags': ['alleen in ...
antonyms                                                         NaN
categories         [Ontbrekend geluid, Retrograad van het Nederla...
derived                                                          NaN
etymology_texts                                                  NaN
hyphenations                                                     NaN
hyponyms                                                         NaN
related                                                          NaN
sounds                                             [[{'ipa': 'də'}]]
translations                      

Unnamed: 0,self,other
word,het,de
pos,pron,prep
senses,[{'glosses': ['3e persoon enkelvoud onzijdig']...,"[{'glosses': ['van'], 'raw_tags': ['alleen in ..."
categories,"[Erfwoord_in_het_Nederlands, Onbepaald voornaa...","[Ontbrekend geluid, Retrograad van het Nederla..."
derived,"[{'word': 'hetgeen'}, {'word': 'hetwelk'}, {'w...","[{'word': 'de dato'}, {'word': 'de facto'}, {'..."
etymology_texts,[erfwoord als persoonlijk voornaamwoord aanget...,
hyphenations,[{'parts': ['het']}],
sounds,"[[{'ipa': 'hɛt'}], [{'ipa': '/ɦɛt/', 'raw_tags...",[[{'ipa': 'də'}]]
translations,"[{'word': 'it', 'lang_code': 'en', 'lang': 'en...",[]
tags,[indefinite],


word                                                             het
pos                                                             pron
lang_code                                                         nl
lang                                                      Nederlands
standard_lang                                                  dutch
senses             [{'glosses': ['3e persoon enkelvoud onzijdig']...
antonyms                                                         NaN
categories         [Erfwoord_in_het_Nederlands, Onbepaald voornaa...
derived            [{'word': 'hetgeen'}, {'word': 'hetwelk'}, {'w...
etymology_texts    [erfwoord als persoonlijk voornaamwoord aanget...
hyphenations                                    [{'parts': ['het']}]
hyponyms                                                         NaN
related                                                          NaN
sounds             [[{'ipa': 'hɛt'}], [{'ipa': '/ɦɛt/', 'raw_tags...
translations       [{'word': 'it',

word                                                              de
pos                                                             prep
lang_code                                                         nl
lang                                                      Nederlands
standard_lang                                                  dutch
senses             [{'glosses': ['van'], 'raw_tags': ['alleen in ...
antonyms                                                         NaN
categories         [Ontbrekend geluid, Retrograad van het Nederla...
derived            [{'word': 'de dato'}, {'word': 'de facto'}, {'...
etymology_texts                                                  NaN
hyphenations                                                     NaN
hyponyms                                                         NaN
related                                                          NaN
sounds                                             [[{'ipa': 'də'}]]
translations                      

Unnamed: 0,self,other
word,IPA,het
pos,noun,pron
senses,[{'glosses': [''extra hoppig' lichtbier of pil...,[{'glosses': ['3e persoon enkelvoud onzijdig']...
categories,"[Initiaalwoord_in_het_Nederlands, Ontbrekend g...","[Erfwoord_in_het_Nederlands, Onbepaald voornaa..."
derived,,"[{'word': 'hetgeen'}, {'word': 'hetwelk'}, {'w..."
etymology_texts,"[zn o: (initiaalwoord) van Engels IPA en [2], ...",[erfwoord als persoonlijk voornaamwoord aanget...
hyphenations,[{'parts': ['IPA']}],[{'parts': ['het']}]
sounds,"[[{'ipa': 'ipeˈʔa'}], [{'ipa': 'ɑjpiˈʔe'}]]","[[{'ipa': 'hɛt'}], [{'ipa': '/ɦɛt/', 'raw_tags..."
translations,[],"[{'word': 'it', 'lang_code': 'en', 'lang': 'en..."
tags,[masculine],[indefinite]


word                                                             IPA
pos                                                             noun
lang_code                                                         nl
lang                                                      Nederlands
standard_lang                                                  dutch
senses             [{'glosses': [''extra hoppig' lichtbier of pil...
antonyms                                                         NaN
categories         [Initiaalwoord_in_het_Nederlands, Ontbrekend g...
derived                                                          NaN
etymology_texts    [zn o: (initiaalwoord) van Engels IPA en [2], ...
hyphenations                                    [{'parts': ['IPA']}]
hyponyms                                                         NaN
related                                                          NaN
sounds                   [[{'ipa': 'ipeˈʔa'}], [{'ipa': 'ɑjpiˈʔe'}]]
translations                      

word                                                             het
pos                                                             pron
lang_code                                                         nl
lang                                                      Nederlands
standard_lang                                                  dutch
senses             [{'glosses': ['3e persoon enkelvoud onzijdig']...
antonyms                                                         NaN
categories         [Erfwoord_in_het_Nederlands, Onbepaald voornaa...
derived            [{'word': 'hetgeen'}, {'word': 'hetwelk'}, {'w...
etymology_texts    [erfwoord als persoonlijk voornaamwoord aanget...
hyphenations                                    [{'parts': ['het']}]
hyponyms                                                         NaN
related                                                          NaN
sounds             [[{'ipa': 'hɛt'}], [{'ipa': '/ɦɛt/', 'raw_tags...
translations       [{'word': 'it',

In [28]:
for group in groups.describe():
    display(group)

KeyboardInterrupt: 

### Internal Translations

### From ENF

### From NEF