In [None]:
import json
from pathlib import Path
import ujson
from dutchanalyzer.config import *
from dutchanalyzer.utilities.utils import *
from dutchanalyzer.utilities.json_utils import *
from dutchanalyzer.utilities.replacement_utils import *
from dotenv import load_dotenv
import datetime
import re
from pprint import pprint
import ast
from tqdm import tqdm
import pandas as pd
import numpy as np

ModuleNotFoundError: No module named 'dutchanalyzer.utils'

In [None]:
NNF_file = Path(WIKT_CLEANING_DIR, 'nl','NNF.jsonl')
NEF_file = Path(WIKT_CLEANING_DIR, 'nl','NEF.jsonl')
EEF_file = Path(WIKT_CLEANING_DIR, 'en','EEF.jsonl')
ENF_file = Path(WIKT_CLEANING_DIR, 'en','ENF.jsonl')
NEF_definitions = Path(WIKT_CLEANING_DIR, 'nl','NEF_definitions.jsonl')
NNF_definitions = Path(WIKT_CLEANING_DIR, 'nl','NNF_definitions.jsonl')
ENF_definitions = Path(WIKT_CLEANING_DIR, 'en','ENF_definitions.jsonl')
EEF_definitions = Path(WIKT_CLEANING_DIR, 'en','EEF_definitions.jsonl')

In [None]:
today = datetime.date.today().__format__("%d-%m-%y")
current_save_folder = Path(INTERIM_DATA_DIR, 'cleaning', 'wikt', str(today))

## Flatten definitions

In [25]:
def filter_obj(obj):
    if 'translations' in obj:
        translations = []
        for t in obj['translations']:
            lang_code = t.get('lang_code')
            if lang_code == 'en' or lang_code == 'nl':
                translations.append(t)
        obj['translations'] = translations
    if obj.get('pos', '') == 'name' or obj.get('pos', '') == 'character':
        return None
    return obj

In [26]:
def reformat_translations(translations_list: list):
    # formatted as returning en translations, nl translations and other translations if they exist
    en_translations = []
    nl_translations = []
    other_translations = []
    accepted_tags = ['word', 'sense', 'sense_index', 'lang', 'lang_code', 'roman', 'tags', 'code', 'note', 'english', 'translation', 'raw_tags', 'taxonomic', 'topics', 'alt']
    for translation in translations_list:
        tags = translation.get('tags')
        note = translation.get('note')
        english = translation.get('english')
        alt = translation.get('alt')
        extra_translation_keys = False
        if translation.get('lang_code') == 'en' or translation.get('code') == 'en':
            en_translations.append({ "word" : translation.get('word'), "sense": translation.get('sense')})
            if tags:
                 en_translations[-1]['tags'] = tags
            if alt:
                en_translations[-1]['alt'] = alt
            for key in translation.keys():
                if key not in accepted_tags:
                    print("word : ", translation.get('word'), " other key: ", key)
                    extra_translation_keys = True
                    break
        elif translation.get('lang_code') == 'nl' or translation.get('code') == 'nl':
            nl_translations.append({ "word" : translation.get('word'), "sense": translation.get('sense')})
            if tags:
                 nl_translations[-1]['tags'] = tags
            if alt:
                 nl_translations[- 1]['alt'] = alt   
            for key in translation.keys():
                if key not in accepted_tags:
                    print("word : ", translation.get('word'), " other key: ", key)
                    extra_translation_keys = True
                    break
        else:
            other_translations.append({ "word" : translation.get('word'), "sense": translation.get('sense')})
            if tags:
                other_translations[-1]['tags'] = tags
            if alt:
                other_translations[-1]['alt'] = alt
            for key in translation.keys():
                if key not in accepted_tags:
                    print("word : ", translation.get('word'), " other key: ", key)
                    extra_translation_keys = True
                    break
    all_translations = {'en' : en_translations, 'nl': nl_translations, 'other': other_translations}
    return all_translations, extra_translation_keys

In [27]:
def split_to_senses_glosses_rows(obj: dict, prev_snum=0):
    new_rows = []
    new_translations = {}
    senses = obj.get('senses')
    translations = obj.get('translations')
    forms = obj.get('forms')
    if senses:
        for i, sense in senses.items():
            if 'glosses' in sense:
                glosses = sense['glosses'] 
                for j, gloss in enumerate(glosses):
                   new_rows.append({"word": obj.get("word"), "pos": obj.get("pos"), "lang_code": obj.get("lang_code"), 'sense_code': i, 'gloss_code':j, 'gloss': gloss, "wl_code": obj.get("wl_code")})
            else:
                new_rows.append({"word": obj.get("word"), "pos": obj.get("pos"), "lang_code": obj.get("lang_code"), 'sense_code': i, 'gloss_code': -1, 'gloss': None, "wl_code": obj.get("wl_code")})
    else:
        new_rows.append({"word": obj.get("word"), "pos": obj.get("pos"), "lang_code": obj.get("lang_code"), 'sense_code': -1, 'gloss_code': -1, 'gloss': None, "wl_code": obj.get("wl_code")})    
    if translations:
        all_translations, extra_keys = reformat_translations(translations)
        for k, v in all_translations.items():
            if v:
                new_translations[f'{k}_translations'] = v
    for i in range(len(new_rows)):
        if new_translations:
            new_rows[i].update(new_translations)
        if forms:
            new_rows[i]['forms'] = forms
    return new_rows


## Loading Definitions

In [58]:
NNF_new_defs_list = []
with open(NNF_definitions, 'r', encoding='utf-8') as f:
    for i, line in tqdm(enumerate(f)):
        
        loaded = json.loads(line)
        loaded = filter_obj(loaded)
        
        if loaded:
            NNF_new_defs_list.extend(split_to_senses_glosses_rows(loaded))
    

611444it [00:10, 60603.60it/s] 


In [59]:
NNF_df = pd.DataFrame(NNF_new_defs_list)
NNF_df

Unnamed: 0,word,pos,lang_code,sense_code,gloss_code,gloss,wl_code,en_translations,forms,nl_translations
0,ja,adv,nl,0,0,"duidt bevestiging, instemming, toestemming, in...",NNF,"[{'word': 'yes', 'sense': 'duidt bevestiging o...",,
1,ja,intj,nl,0,0,kreet van opwinding,NNF,"[{'word': 'yes', 'sense': 'kreet van opwinding'}]",,
2,ja,intj,nl,1,0,nou ja: kijk zo simpel is het! dat je dat niet...,NNF,"[{'word': 'yes', 'sense': 'kreet van opwinding'}]",,
3,ja,noun,nl,0,0,bevestigend of instemmend antwoord,NNF,"[{'word': 'yes', 'sense': 'bevestigend of inst...",,
4,neen,intj,nl,0,0,ontkenning van de gestelde vraag of gedane bew...,NNF,"[{'word': 'no', 'sense': 'nee'}]",,
...,...,...,...,...,...,...,...,...,...,...
709451,mottigaardje,noun,nl,0,0,verkleinwoord enkelvoud van het zelfstandig na...,NNF,,[{'word': 'mottigaard'}],
709452,mottigaards,noun,nl,0,0,meervoud van het zelfstandig naamwoord mottigaard,NNF,,[{'word': 'mottigaard'}],
709453,bankpootjes,noun,nl,0,0,verkleinwoord meervoud van het zelfstandig naa...,NNF,,[{'word': 'bankpoot'}],
709454,bankpootje,noun,nl,0,0,verkleinwoord enkelvoud van het zelfstandig na...,NNF,,[{'word': 'bankpoot'}],


In [60]:
NNF_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 709456 entries, 0 to 709455
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   word             709456 non-null  object
 1   pos              709456 non-null  object
 2   lang_code        709456 non-null  object
 3   sense_code       709456 non-null  object
 4   gloss_code       709456 non-null  int64 
 5   gloss            708785 non-null  object
 6   wl_code          709456 non-null  object
 7   en_translations  54260 non-null   object
 8   forms            665915 non-null  object
 9   nl_translations  66 non-null      object
dtypes: int64(1), object(9)
memory usage: 54.1+ MB


In [61]:
NNF_df.to_csv(Path(current_save_folder, 'NNF_definitions_df.csv'))

In [63]:
NEF_defs_list = []
with open(NEF_definitions, 'r', encoding='utf-8') as f:
    for i, line in tqdm(enumerate(f)):
        loaded = json.loads(line)
        loaded = filter_obj(loaded)
        if loaded:
            NEF_defs_list.extend(split_to_senses_glosses_rows(loaded))
        

17441it [00:00, 207342.31it/s]


In [64]:
NEF_df = pd.DataFrame(NEF_defs_list)
NEF_df.to_csv(Path(current_save_folder, 'NEF_definitions_df.csv'))

In [65]:
EEF_defs_list = []
with open(EEF_definitions, 'r', encoding='utf-8') as f:
    for i, line in tqdm(enumerate(f)):
        
        loaded = json.loads(line)
        loaded = filter_obj(loaded)
        if loaded:
            EEF_defs_list.extend(split_to_senses_glosses_rows(loaded))

1417383it [00:14, 96091.58it/s] 


In [66]:
EEF_df = pd.DataFrame(EEF_defs_list)
EEF_df.to_csv(Path(current_save_folder, 'EEF_definitions_df.csv'))

In [67]:
EEF_df

Unnamed: 0,word,pos,lang_code,sense_code,gloss_code,gloss,wl_code,nl_translations,forms
0,dictionary,noun,en,0,0,A reference work listing words or names from o...,EEF,"[{'word': 'woordenboek', 'sense': 'publication...","[{'form': 'dictionaries', 'tags': ['plural']},..."
1,dictionary,noun,en,1,0,A reference work on a particular subject or ac...,EEF,"[{'word': 'woordenboek', 'sense': 'publication...","[{'form': 'dictionaries', 'tags': ['plural']},..."
2,dictionary,noun,en,2,0,A person or thing regarded as a repository or ...,EEF,"[{'word': 'woordenboek', 'sense': 'publication...","[{'form': 'dictionaries', 'tags': ['plural']},..."
3,dictionary,noun,en,3,0,The collection of words used or understood by ...,EEF,"[{'word': 'woordenboek', 'sense': 'publication...","[{'form': 'dictionaries', 'tags': ['plural']},..."
4,dictionary,noun,en,4,0,A synchronic dictionary of a standardised lang...,EEF,"[{'word': 'woordenboek', 'sense': 'publication...","[{'form': 'dictionaries', 'tags': ['plural']},..."
...,...,...,...,...,...,...,...,...,...
1464316,in motion,adj,en,0,-1,,EEF,,
1464317,be sad,verb,en,0,-1,,EEF,,
1464318,fashionable person,noun,en,0,-1,,EEF,,
1464319,what a pity,intj,en,0,-1,,EEF,,


In [68]:
ENF_defs_list = []
with open(ENF_definitions, 'r', encoding='utf-8') as f:
    for i, line in tqdm(enumerate(f)):
        
        loaded = json.loads(line)
        loaded = filter_obj(loaded)
        if loaded:
            ENF_defs_list.extend(split_to_senses_glosses_rows(loaded))

140758it [00:01, 91684.04it/s] 


In [None]:
ENF_df = pd.DataFrame(ENF_defs_list)
ENF_df.to_csv(Path(current_save_folder, 'ENF_definitions_df.csv'))

In [70]:
ENF_df

Unnamed: 0,word,pos,lang_code,sense_code,gloss_code,gloss,wl_code,forms
0,woordenboek,noun,nl,0,0,dictionary,ENF,"[{'form': 'woordenboeken', 'tags': ['plural']}..."
1,gratis,adj,nl,0,0,"free, without charge",ENF,"[{'form': 'no-table-tags', 'source': 'declensi..."
2,gratuit,adj,nl,0,0,"gratuitous, not obliged to",ENF,"[{'form': 'no-table-tags', 'source': 'declensi..."
3,word,verb,nl,0,0,inflection of worden:,ENF,[{'word': 'worden'}]
4,word,verb,nl,0,1,first-person singular present indicative,ENF,[{'word': 'worden'}]
...,...,...,...,...,...,...,...,...
213288,flikkerij,noun,nl,0,0,faggotry,ENF,
213289,leefruimte,noun,nl,0,0,living space,ENF,"[{'form': 'leefruimten', 'tags': ['plural']}, ..."
213290,vluchtnummer,noun,nl,0,0,flight number,ENF,"[{'form': 'vluchtnummers', 'tags': ['plural']}]"
213291,overnachting,noun,nl,0,0,overnight stay,ENF,"[{'form': 'overnachtingen', 'tags': ['plural']..."


### Finding Words That Are Equivalent

In [72]:
equal_definitions_NEF = get_equal_definitions(NEF_df)

In [73]:
equal_definitions_NEF

Unnamed: 0,word,pos,lang_code,sense_code,gloss_code,gloss,wl_code,forms,en_translations
39,Esperanto,noun,en,0,0,Esperanto,NEF,,
76,Cherokee,noun,en,0,0,Cherokee,NEF,,
77,Cherokee,noun,en,1,0,Cherokee,NEF,,
108,Farsi,noun,en,0,0,Farsi,NEF,,
123,man,noun,en,0,0,man,NEF,,
...,...,...,...,...,...,...,...,...,...
24470,chateau migraine,noun,en,0,0,chateau migraine,NEF,"[{'form': 'Chateau Migraine, Château Migraine,...",
24662,envoi,noun,en,0,0,envoi,NEF,,
24675,minigun,noun,en,0,0,minigun,NEF,,
24861,LETS,noun,en,0,0,LETS,NEF,,


In [74]:
equal_definitions_ENF = get_equal_definitions(ENF_df)

In [75]:
equal_definitions_ENF

Unnamed: 0,word,pos,lang_code,sense_code,gloss_code,gloss,wl_code,forms
33,product,noun,nl,0,0,product,ENF,"[{'form': 'producten', 'tags': ['plural']}, {'..."
40,aam,noun,nl,0,0,aam,ENF,"[{'form': 'amen', 'tags': ['plural']}]"
41,aardwolf,noun,nl,0,0,aardwolf,ENF,"[{'form': 'aardwolven', 'tags': ['plural']}, {..."
49,quiz,noun,nl,0,0,quiz,ENF,"[{'form': 'quizzen', 'tags': ['plural']}, {'fo..."
50,millennium,noun,nl,0,0,millennium,ENF,"[{'form': 'millennia', 'tags': ['plural']}, {'..."
...,...,...,...,...,...,...,...,...
205687,hexadec-,prefix,nl,0,0,hexadec-,ENF,
205688,heptadec-,prefix,nl,0,0,heptadec-,ENF,
205689,octadec-,prefix,nl,0,0,octadec-,ENF,
205690,nonadec-,prefix,nl,0,0,nonadec-,ENF,


In [None]:
def get_duplicate_words(df, filter_by=['word', 'pos', 'gloss']):
    # repeated = (
    # df.groupby(['word', 'pos', 'gloss'])
    # .filter(lambda g: len(g) > 1)
    # .groupby(['word', 'pos', 'gloss'], as_index=False)
    # .agg({'gloss': list})
    #         )
    # duplicates = duplicates[duplicates['pos'] > 1]
    duplicates = df[df.duplicated(subset=filter_by, keep=False)]
    return duplicates

In [None]:
pd.set_option('display.width', 5000)
pd.set_option

In [None]:
dups_ENF = get_duplicate_words(ENF_df)
display()

Unnamed: 0,word,pos,lang_code,sense_code,gloss_code,gloss,wl_code,forms
3,word,verb,nl,0,0,inflection of worden:,ENF,[{'word': 'worden'}]
5,word,verb,nl,1,0,inflection of worden:,ENF,[{'word': 'worden'}]
7,word,verb,nl,2,0,inflection of worden:,ENF,[{'word': 'worden'}]
9,pond,noun,nl,0,0,"unit of mass, often broadly similar to 500 grams",ENF,"[{'form': 'ponden', 'tags': ['plural']}, {'for..."
11,pond,noun,nl,1,0,"unit of mass, often broadly similar to 500 grams",ENF,"[{'form': 'ponden', 'tags': ['plural']}, {'for..."
...,...,...,...,...,...,...,...,...
213271,gemotiveerde,verb,nl,1,0,inflection of gemotiveerd:,ENF,[{'word': 'gemotiveerd'}]
213273,gemotiveerde,verb,nl,2,0,inflection of gemotiveerd:,ENF,[{'word': 'gemotiveerd'}]
213275,zwaarste,adj,nl,0,0,"inflection of zwaarst, the superlative degree ...",ENF,"[{'word': 'zwaarst', 'extra': 'the superlative..."
213277,zwaarste,adj,nl,1,0,"inflection of zwaarst, the superlative degree ...",ENF,"[{'word': 'zwaarst', 'extra': 'the superlative..."


In [15]:
ENF_df = pd.read_csv(Path(current_save_folder, 'ENF_definitions_df.csv'), index_col=0)

In [16]:
ENF_df

Unnamed: 0,word,pos,lang_code,sense_code,gloss_code,gloss,wl_code,forms
0,woordenboek,noun,nl,0,0,dictionary,ENF,"[{'form': 'woordenboeken', 'tags': ['plural']}..."
1,gratis,adj,nl,0,0,"free, without charge",ENF,"[{'form': 'no-table-tags', 'source': 'declensi..."
2,gratuit,adj,nl,0,0,"gratuitous, not obliged to",ENF,"[{'form': 'no-table-tags', 'source': 'declensi..."
3,word,verb,nl,0,0,inflection of worden:,ENF,[{'word': 'worden'}]
4,word,verb,nl,0,1,first-person singular present indicative,ENF,[{'word': 'worden'}]
...,...,...,...,...,...,...,...,...
213288,flikkerij,noun,nl,0,0,faggotry,ENF,
213289,leefruimte,noun,nl,0,0,living space,ENF,"[{'form': 'leefruimten', 'tags': ['plural']}, ..."
213290,vluchtnummer,noun,nl,0,0,flight number,ENF,"[{'form': 'vluchtnummers', 'tags': ['plural']}]"
213291,overnachting,noun,nl,0,0,overnight stay,ENF,"[{'form': 'overnachtingen', 'tags': ['plural']..."


In [9]:
ENF_defs_list_no_change = []
with open(ENF_definitions, 'r', encoding='utf-8') as f:
    for i, line in tqdm(enumerate(f)):
        
        loaded = json.loads(line)
        loaded = filter_obj(loaded)
        if loaded:
            ENF_defs_list_no_change.append(loaded)

140758it [00:02, 55413.00it/s]


In [93]:
ENF_defs_list_no_change

[{'word': 'woordenboek',
  'pos': 'noun',
  'lang_code': 'nl',
  'senses': {'0': {'glosses': ['dictionary']}},
  'forms': [{'form': 'woordenboeken', 'tags': ['plural']},
   {'form': 'woordenboekje', 'tags': ['diminutive', 'neuter']},
   {'form': 'woordboek', 'tags': ['alternative', 'obsolete']},
   {'form': 'woordboeck', 'tags': ['alternative', 'obsolete']}],
  'wl_code': 'ENF',
  'etymology_templates': [{'name': 'af',
    'args': {'1': 'nl',
     '2': 'woord',
     't1': 'word',
     '3': '-en-',
     '4': 'boek',
     't3': 'book'},
    'expansion': 'woord (“word”) + -en- + boek (“book”)'},
   {'name': 'cal',
    'args': {'1': 'nl', '2': 'LL.', 'nocap': '1', '3': 'vocābulārium'},
    'expansion': 'calque of Late Latin vocābulārium'},
   {'name': 'cog',
    'args': {'1': 'en', '2': 'wordbook'},
    'expansion': 'English wordbook'}]},
 {'word': 'gratis',
  'pos': 'adj',
  'lang_code': 'nl',
  'senses': {'0': {'glosses': ['free, without charge']}},
  'forms': [{'form': 'no-table-tags',


In [10]:
ENF_no_change_df = pd.DataFrame(ENF_defs_list_no_change)
ENF_no_change_df

Unnamed: 0,word,pos,lang_code,senses,forms,wl_code,etymology_templates
0,woordenboek,noun,nl,{'0': {'glosses': ['dictionary']}},"[{'form': 'woordenboeken', 'tags': ['plural']}...",ENF,"[{'name': 'af', 'args': {'1': 'nl', '2': 'woor..."
1,gratis,adj,nl,"{'0': {'glosses': ['free, without charge']}}","[{'form': 'no-table-tags', 'source': 'declensi...",ENF,"[{'name': 'bor', 'args': {'1': 'nl', '2': 'la'..."
2,gratuit,adj,nl,"{'0': {'glosses': ['gratuitous, not obliged to...","[{'form': 'no-table-tags', 'source': 'declensi...",ENF,"[{'name': 'bor', 'args': {'1': 'nl', '2': 'fr'..."
3,word,verb,nl,"{'0': {'glosses': ['inflection of worden:', 'f...",[{'word': 'worden'}],ENF,
4,pond,noun,nl,"{'0': {'glosses': ['unit of mass, often broadl...","[{'form': 'ponden', 'tags': ['plural']}, {'for...",ENF,"[{'name': 'inh', 'args': {'1': 'nl', '2': 'dum..."
...,...,...,...,...,...,...,...
128457,flikkerij,noun,nl,{'0': {'glosses': ['faggotry']}},,ENF,"[{'name': 'af', 'args': {'1': 'nl', '2': 'flik..."
128458,leefruimte,noun,nl,{'0': {'glosses': ['living space']}},"[{'form': 'leefruimten', 'tags': ['plural']}, ...",ENF,"[{'name': 'compound', 'args': {'1': 'nl', '2':..."
128459,vluchtnummer,noun,nl,{'0': {'glosses': ['flight number']}},"[{'form': 'vluchtnummers', 'tags': ['plural']}]",ENF,"[{'name': 'compound', 'args': {'1': 'nl', '2':..."
128460,overnachting,noun,nl,{'0': {'glosses': ['overnight stay']}},"[{'form': 'overnachtingen', 'tags': ['plural']...",ENF,"[{'name': 'af', 'args': {'1': 'nl', '2': 'over..."


In [11]:
tqdm.pandas(ncols=50)

In [None]:
enf_dups = get_duplicate_words(ENF_df, filter_by=['word','pos','gloss'])

In [20]:
enf_dups_gloss = get_duplicate_words(ENF_df, filter_by=['gloss'])

In [21]:
with pd.option_context('display.max_colwidth', None, 'display.width', 2000, 'display.max_columns', None):
    display(enf_dups)
    display(enf_dups_gloss)

Unnamed: 0,word,pos,lang_code,sense_code,gloss_code,gloss,wl_code,forms
3,word,verb,nl,0,0,inflection of worden:,ENF,[{'word': 'worden'}]
5,word,verb,nl,1,0,inflection of worden:,ENF,[{'word': 'worden'}]
7,word,verb,nl,2,0,inflection of worden:,ENF,[{'word': 'worden'}]
9,pond,noun,nl,0,0,"unit of mass, often broadly similar to 500 grams",ENF,"[{'form': 'ponden', 'tags': ['plural']}, {'form': 'pondje', 'tags': ['diminutive', 'neuter']}]"
11,pond,noun,nl,1,0,"unit of mass, often broadly similar to 500 grams",ENF,"[{'form': 'ponden', 'tags': ['plural']}, {'form': 'pondje', 'tags': ['diminutive', 'neuter']}]"
...,...,...,...,...,...,...,...,...
213271,gemotiveerde,verb,nl,1,0,inflection of gemotiveerd:,ENF,[{'word': 'gemotiveerd'}]
213273,gemotiveerde,verb,nl,2,0,inflection of gemotiveerd:,ENF,[{'word': 'gemotiveerd'}]
213275,zwaarste,adj,nl,0,0,"inflection of zwaarst, the superlative degree of zwaar:",ENF,"[{'word': 'zwaarst', 'extra': 'the superlative degree of zwaar:'}]"
213277,zwaarste,adj,nl,1,0,"inflection of zwaarst, the superlative degree of zwaar:",ENF,"[{'word': 'zwaarst', 'extra': 'the superlative degree of zwaar:'}]"


Unnamed: 0,word,pos,lang_code,sense_code,gloss_code,gloss,wl_code,forms
0,woordenboek,noun,nl,0,0,dictionary,ENF,"[{'form': 'woordenboeken', 'tags': ['plural']}, {'form': 'woordenboekje', 'tags': ['diminutive', 'neuter']}, {'form': 'woordboek', 'tags': ['alternative', 'obsolete']}, {'form': 'woordboeck', 'tags': ['alternative', 'obsolete']}]"
1,gratis,adj,nl,0,0,"free, without charge",ENF,"[{'form': 'no-table-tags', 'source': 'declension', 'tags': ['table-tags']}, {'form': 'nl-decl-adj', 'source': 'declension', 'tags': ['inflection-template']}, {'form': 'gratis', 'tags': ['adverbial', 'positive', 'predicative'], 'source': 'declension'}, {'form': 'gratis', 'tags': ['feminine', 'indefinite', 'masculine', 'positive', 'singular'], 'source': 'declension'}, {'form': 'gratis', 'tags': ['indefinite', 'neuter', 'positive', 'singular'], 'source': 'declension'}, {'form': 'gratis', 'tags': ['indefinite', 'plural', 'positive'], 'source': 'declension'}, {'form': 'gratis', 'tags': ['definite', 'positive'], 'source': 'declension'}, {'form': 'gratis', 'tags': ['partitive', 'positive'], 'source': 'declension'}]"
3,word,verb,nl,0,0,inflection of worden:,ENF,[{'word': 'worden'}]
4,word,verb,nl,0,1,first-person singular present indicative,ENF,[{'word': 'worden'}]
5,word,verb,nl,1,0,inflection of worden:,ENF,[{'word': 'worden'}]
...,...,...,...,...,...,...,...,...
213278,zwaarste,adj,nl,1,1,definite neuter singular attributive,ENF,"[{'word': 'zwaarst', 'extra': 'the superlative degree of zwaar:'}]"
213279,zwaarste,adj,nl,2,0,"inflection of zwaarst, the superlative degree of zwaar:",ENF,"[{'word': 'zwaarst', 'extra': 'the superlative degree of zwaar:'}]"
213280,zwaarste,adj,nl,2,1,plural attributive,ENF,"[{'word': 'zwaarst', 'extra': 'the superlative degree of zwaar:'}]"
213284,knollen voor citroenen verkopen,verb,nl,0,0,to scam,ENF,


In [51]:
ENF_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227284 entries, 0 to 227283
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   word        227284 non-null  object
 1   pos         227284 non-null  object
 2   lang_code   227284 non-null  object
 3   sense_code  227284 non-null  object
 4   gloss_code  227284 non-null  int64 
 5   gloss       227275 non-null  object
 6   wl_code     227284 non-null  object
 7   forms       200665 non-null  object
dtypes: int64(1), object(7)
memory usage: 13.9+ MB


In [79]:
both_lang_words = EEF_df.merge(NNF_df, on=['word', 'pos'], how='inner')

In [82]:
both_lang_words = both_lang_words[both_lang_words['pos'] != 'name']

In [83]:
both_lang_words

Unnamed: 0,word,pos,lang_code_x,senses_x,wl_code_x,nl_translations_x,lang_code_y,senses_y,wl_code_y,en_translations,nl_translations_y
0,thesaurus,noun,en,{'0': {'glosses': ['A publication that provide...,EEF,"{0: {'word': 'thesaurus', 'sense': 'book of sy...",nl,{'0': {'glosses': ['een systematisch georganis...,NNF,,
1,gratis,adj,en,{'0': {'glosses': ['Free: without charge.']}},EEF,,nl,{'0': {'glosses': ['wat zonder betaling verkre...,NNF,"{0: {'word': 'free', 'sense': 'wat zonder beta...",
2,word,verb,en,{'0': {'glosses': ['To say or write (something...,EEF,"{0: {'word': 'verwoorden', 'sense': 'to say or...",nl,{'0': {'glosses': ['eerste persoon enkelvoud t...,NNF,,
3,word,verb,en,{'0': {'glosses': ['Alternative form of worth ...,EEF,,nl,{'0': {'glosses': ['eerste persoon enkelvoud t...,NNF,,
4,pond,noun,en,{'0': {'glosses': ['An inland body of standing...,EEF,"{0: {'word': 'vijver', 'sense': 'small lake', ...",nl,{'0': {'glosses': ['naam voor verschillende mu...,NNF,"{0: {'word': 'pound', 'sense': 'naam voor vers...",
...,...,...,...,...,...,...,...,...,...,...,...
26185,galoppades,noun,en,"{'0': {'glosses': ['plural of galoppade.'], 'f...",EEF,,nl,{'0': {'glosses': ['meervoud van het zelfstand...,NNF,,
26186,geelbek,noun,en,{'0': {'glosses': ['Alternative form of geelbe...,EEF,,nl,"{'0': {'glosses': ['dier met een gele bek']}, ...",NNF,"{0: {'word': 'African weakfish', 'sense': 'vis...",
26191,longread,noun,en,{'0': {'glosses': ['A piece of (online) long-f...,EEF,,nl,{'0': {'glosses': ['lang achtergrond artikel i...,NNF,,
26192,longreads,noun,en,"{'0': {'glosses': ['plural of longread'], 'for...",EEF,,nl,{'0': {'glosses': ['meervoud van het zelfstand...,NNF,,


In [84]:
both_lang_words.sort_values(by='word')

Unnamed: 0,word,pos,lang_code_x,senses_x,wl_code_x,nl_translations_x,lang_code_y,senses_y,wl_code_y,en_translations,nl_translations_y
5891,'m,pron,en,{'0': {'glosses': ['Alternative form of 'em.']}},EEF,,nl,{'0': {'glosses': ['clitische voorwerpsvorm vo...,NNF,,
11918,'t,pron,en,"{'0': {'glosses': ['Contraction of it.']}, '1'...",EEF,,nl,{'0': {'glosses': ['clitische vorm van het als...,NNF,,
11919,'t,pron,en,"{'0': {'glosses': ['Contraction of it.']}, '1'...",EEF,,nl,{'0': {'glosses': ['clitische vorm van het']}},NNF,,
5114,-a,suffix,en,"{'0': {'glosses': ['plural of -um'], 'forms': ...",EEF,,nl,{'0': {'glosses': ['vormt een zelfstandig naam...,NNF,,
5115,-a,suffix,en,"{'0': {'glosses': ['plural of -on'], 'forms': ...",EEF,,nl,{'0': {'glosses': ['vormt een zelfstandig naam...,NNF,,
...,...,...,...,...,...,...,...,...,...,...,...
4601,über,adv,en,{'0': {'glosses': ['Alternative form of uber.']}},EEF,,nl,"{'0': {'glosses': ['erg veel, zeer, enorm']}}",NNF,,
20622,über-,prefix,en,{'0': {'glosses': ['Alternative form of uber-....,EEF,,nl,"{'0': {'glosses': ['in hoge mate, heel erg']},...",NNF,,
26117,übercool,adj,en,{'0': {'glosses': ['Alternative form of uberco...,EEF,,nl,"{'0': {'glosses': ['meer dan cool, supergaaf']}}",NNF,,
5415,übermensch,noun,en,{'0': {'glosses': ['An overman (“higher” man) ...,EEF,,nl,{'0': {'glosses': ['persoon met aangeboren eig...,NNF,,


In [87]:
both_english_defs_df = EEF_df.merge(ENF_df, on=['word', 'pos'], how='inner')

In [90]:
both_english_defs_df[both_english_defs_df['pos'] != 'name']

Unnamed: 0,word,pos,lang_code_x,senses_x,wl_code_x,nl_translations,lang_code_y,senses_y,wl_code_y
0,gratis,adj,en,{'0': {'glosses': ['Free: without charge.']}},EEF,,en,"{'0': {'glosses': ['free, without charge']}}",ENF
1,word,verb,en,{'0': {'glosses': ['To say or write (something...,EEF,"{0: {'word': 'verwoorden', 'sense': 'to say or...",en,"{'0': {'glosses': ['inflection of worden:', 'f...",ENF
2,word,verb,en,{'0': {'glosses': ['Alternative form of worth ...,EEF,,en,"{'0': {'glosses': ['inflection of worden:', 'f...",ENF
3,pond,noun,en,{'0': {'glosses': ['An inland body of standing...,EEF,"{0: {'word': 'vijver', 'sense': 'small lake', ...",en,"{'0': {'glosses': ['unit of mass, often broadl...",ENF
4,pies,noun,en,"{'0': {'glosses': ['plural of pie'], 'forms': ...",EEF,,en,{'0': {'glosses': ['alternative form of pis; p...,ENF
...,...,...,...,...,...,...,...,...,...
11352,Madame Jeanette,noun,en,{'0': {'glosses': ['A chili pepper (Capsicum c...,EEF,"{0: {'word': 'madame-jeanette', 'sense': 'chil...",en,{'0': {'glosses': ['alternative form of madame...,ENF
11353,labaria,noun,en,{'0': {'glosses': ['The fer-de-lance.']}},EEF,,en,{'0': {'glosses': ['synonym of gewone lanspunt...,ENF
11354,dirty chai,noun,en,{'0': {'glosses': ['A chai latte with a shot o...,EEF,"{0: {'word': 'dirty chai', 'sense': 'chai latt...",en,{'0': {'glosses': ['dirty chai (chai latte wit...,ENF
11355,horribile auditu,adv,en,{'0': {'glosses': ['Horrible to hear.']}},EEF,"{0: {'word': 'horribile auditu', 'sense': 'hor...",en,{'0': {'glosses': ['horribile auditu (horrible...,ENF


In [91]:
def extract_single_definition(sense):
    sense = dict(sense)
    if len(sense) == 1:
        if sense['0'].get('glosses'):
            if len(sense['0']['glosses']) == 1:
                definition = sense['0']['glosses'][0]
                return definition
            else:
                return f"multi_gloss: {len(sense['0']['glosses'])}"
        else:
            return "no_gloss"
    else:
        return f"multi_sense: {len(sense)}"

In [92]:
EEF_df['definition'] = EEF_df['senses'].apply(lambda x: extract_single_definition(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  EEF_df['definition'] = EEF_df['senses'].apply(lambda x: extract_single_definition(x))


In [95]:
EEF_df_path = Path(current_save_folder, 'EEF_definitions_df.csv')
EEF_df.to_csv(EEF_df_path)

In [94]:
ENF_df['definition'] = ENF_df['senses'].apply(lambda x: extract_single_definition(x))

In [97]:
ENF_df_path = Path(current_save_folder, 'ENF_definitions_df.csv')
ENF_df.to_csv(ENF_df_path)

In [102]:
NEF_df['definition'] = NEF_df['senses'].apply(lambda x: extract_single_definition(x))

In [103]:
NNF_df['definition'] = NNF_df['senses'].apply(lambda x: extract_single_definition(x))

In [109]:
NEF_df.head(10)

Unnamed: 0,word,pos,lang_code,senses,wl_code,en_translations,definition
0,busybody,noun,en,{'0': {'glosses': ['bemoeial; iemand die zich ...,NEF,,bemoeial; iemand die zich bemoeit met zaken va...
1,interfere,verb,en,{'0': {'glosses': ['bemoeien']}},NEF,,bemoeien
2,IPA,noun,en,{'0': {'glosses': ['biersoort gebrouwen uit li...,NEF,,biersoort gebrouwen uit lichte mout waaraan ex...
7,French,noun,en,"{'0': {'glosses': ['Frans zn']}, '1': {'glosse...",NEF,,multi_sense: 3
8,French,adj,en,{'0': {'glosses': ['Frans bn']}},NEF,,Frans bn
9,English,noun,en,"{'0': {'glosses': ['Engels']}, '1': {'glosses'...",NEF,,multi_sense: 2
10,English,adj,en,{'0': {'glosses': ['Engels']}},NEF,,Engels
11,German,noun,en,{'0': {'glosses': ['Duitser; inwoner van Duits...,NEF,,multi_sense: 3
12,German,adj,en,{'0': {'glosses': ['Duits; oorspronkelijk uit ...,NEF,,Duits; oorspronkelijk uit Duitsland of uit het...
13,Hebrew,noun,en,{'0': {'glosses': ['Hebreeuws']}},NEF,,Hebreeuws


In [None]:
def transform_df_to_col_table(origin_df, col_name, group_key='def_id'):
    dropped_df = origin_df[~origin_df[col_name].isna()]
    words_df = dropped_df.loc[:, ['entry_id', 'word']]
    df = dropped_df.loc[:, ['entry_id', 'word_id', 'pos', col_name]]
    df[col_name] = df[col_name].apply(safe_eval)
    df = df.explode(col_name, ignore_index=True)
    
    df = pd.concat([df.drop(columns=col_name), pd.json_normalize(df[col_name])],axis=1)
    
    df = df.groupby(group_key, as_index=False).aggregate('first').reindex(columns=df.columns)
    if 'word' in df.columns:
        df = df.rename(columns={'word': col_name})
    
    df = df.merge(words_df, on='def_id')
    df_cols = df.columns.tolist()
    df_cols = df_cols[0:2] + ['word'] + df_cols[2:-1]
    df = df[df_cols]
    #re_df = re_df.insert(1, 'word', col)
    return df

In [114]:
NEF_df_exploded = NEF_df.explode('senses', ignore_index=True)
NEF_df_exploded = pd.concat([NEF_df_exploded.drop(columns='senses'), pd.json_normalize(NEF_df_exploded['senses'])],axis=1)
NEF_df_exploded

Unnamed: 0,word,pos,lang_code,wl_code,en_translations,definition
0,busybody,noun,en,NEF,,bemoeial; iemand die zich bemoeit met zaken va...
1,interfere,verb,en,NEF,,bemoeien
2,IPA,noun,en,NEF,,biersoort gebrouwen uit lichte mout waaraan ex...
3,French,noun,en,NEF,,multi_sense: 3
4,French,noun,en,NEF,,multi_sense: 3
...,...,...,...,...,...,...
24907,decay,noun,en,NEF,,multi_sense: 2
24908,decay,noun,en,NEF,,multi_sense: 2
24909,decay,verb,en,NEF,,multi_sense: 3
24910,decay,verb,en,NEF,,multi_sense: 3


In [113]:
NEF_df_exploded

Unnamed: 0,word,pos,lang_code,senses,wl_code,en_translations,definition
0,busybody,noun,en,0,NEF,,bemoeial; iemand die zich bemoeit met zaken va...
1,interfere,verb,en,0,NEF,,bemoeien
2,IPA,noun,en,0,NEF,,biersoort gebrouwen uit lichte mout waaraan ex...
3,French,noun,en,0,NEF,,multi_sense: 3
4,French,noun,en,1,NEF,,multi_sense: 3
...,...,...,...,...,...,...,...
24907,decay,noun,en,0,NEF,,multi_sense: 2
24908,decay,noun,en,1,NEF,,multi_sense: 2
24909,decay,verb,en,0,NEF,,multi_sense: 3
24910,decay,verb,en,1,NEF,,multi_sense: 3


In [106]:
equal_defs_nef = NEF_df[NEF_df['word'] == NEF_df['definition']]

In [72]:
lang_codes_to_keep = ['nl', 'en', 'simple', 'ang', 'dum', 'nds', 'odt', 'nds-nl', 'enm']

In [78]:
print(len(one_gloss_defs))
print(len(multi_gloss_defs))

2165735
26278


In [80]:
display(one_gloss_defs[0:100])

[{'word': 'woordenboek',
  'pos': 'noun',
  'definition': 'dictionary',
  'wl_code': 'ENF',
  'lang_code': 'nl'},
 {'word': 'gratis',
  'pos': 'adj',
  'definition': 'free, without charge',
  'wl_code': 'ENF',
  'lang_code': 'nl'},
 {'word': 'gratuit',
  'pos': 'adj',
  'definition': 'gratuitous, not obliged to',
  'wl_code': 'ENF',
  'lang_code': 'nl'},
 {'word': 'pies',
  'pos': 'noun',
  'definition': 'alternative form of pis; pee, piss',
  'wl_code': 'ENF',
  'lang_code': 'nl'},
 {'word': 'A',
  'pos': 'character',
  'definition': 'The first letter of the Dutch alphabet, written in the Latin script.',
  'wl_code': 'ENF',
  'lang_code': 'nl'},
 {'word': 'raven',
  'pos': 'verb',
  'definition': 'to (hold a) rave, to party wildly',
  'wl_code': 'ENF',
  'lang_code': 'nl'},
 {'word': 'raven',
  'pos': 'noun',
  'definition': 'obsolete form of raaf',
  'wl_code': 'ENF',
  'lang_code': 'nl'},
 {'word': 'raven',
  'pos': 'noun',
  'definition': 'plural of raaf',
  'wl_code': 'ENF',
  'la

In [34]:
one_gloss_defs_df = pd.DataFrame(one_gloss_defs)

In [None]:
display(one_gloss_defs_df[~one_gloss_defs_df['translations'].isna()])

Unnamed: 0,word,pos,definition,wl_code,lang_code,translations
124707,event,noun,gebeurtenis,NEF,en,"[{'lang_code': 'is', 'lang': 'IJslands', 'word..."
133321,tonic,noun,een koolzuurhoudende frisdrank met als kenmerk...,NEF,en,"[{'lang_code': 'en', 'lang': 'Engels', 'word':..."
136657,LOL,intj,hier moet ik hard om lachen,NEF,en,"[{'lang_code': 'fr', 'lang': 'Frans', 'word': ..."
139710,ja,adv,"duidt bevestiging, instemming, toestemming, in...",NNF,nl,"[{'lang_code': 'af', 'lang': 'Afrikaans', 'wor..."
139711,ja,intj,nou ja: kijk zo simpel is het! dat je dat niet...,NNF,nl,"[{'lang_code': 'de', 'lang': 'Duits', 'word': ..."
...,...,...,...,...,...,...
2164044,Tongeren-Borgloon,name,"A municipality of Limburg, Flanders, Belgium.",EEF,en,"[{'lang': 'Dutch', 'code': 'nl', 'lang_code': ..."
2164129,Evere,name,"A municipality of Brussels, Belgium.",EEF,en,"[{'lang': 'Dutch', 'code': 'nl', 'lang_code': ..."
2164130,Aarschot,name,"A city in Aerschot, Flanders, Belgium.",EEF,en,"[{'lang': 'Dutch', 'code': 'nl', 'lang_code': ..."
2164362,Lake Flevo,name,"An ancient lake in Germania, Europe.",EEF,en,"[{'lang': 'Dutch', 'code': 'nl', 'lang_code': ..."


In [38]:
dups = one_gloss_defs_df.duplicated(subset=['word'])

In [42]:
one_gloss_defs_df['duplicated'] = one_gloss_defs_df.duplicated(subset='word')

In [None]:
one_gloss_defs_df[one_gloss_defs_df['duplicated'] == True]
one

Unnamed: 0,word,pos,definition,wl_code,lang_code,translations,duplicated
6,raven,noun,obsolete form of raaf,ENF,nl,,True
7,raven,noun,plural of raaf,ENF,nl,,True
14,name,noun,obsolete form of naam,ENF,nl,,True
29,week,adj,"weak, gentle, weakhearted",ENF,nl,,True
30,week,verb,singular past indicative of wijken,ENF,nl,,True
...,...,...,...,...,...,...,...
2165608,pyridylamine,noun,Aminopyridine: an amine derived from pyridine,EEF,en,,True
2165649,0/10,adj,"Bad, horrible, not good.",EEF,en,,True
2165654,fcking,noun,Censored spelling of fucking.,EEF,en,,True
2165687,jorks,verb,third-person singular simple present indicativ...,EEF,en,,True


In [None]:
one_gloss_defs_df['def_lang'] = one_gloss_defs_df['wl_code'].apply(lambda x: 'en' if x[0] == "E" else "nl")

In [54]:
one_gloss_defs_df.to_csv(Path(current_save_folder, 'one_gloss_defs.csv'))

In [51]:
one_gloss_defs_df.rename(columns={'def_code':'def_lang'}, inplace=True)

In [52]:
one_gloss_defs_df = one_gloss_defs_df.loc[:, ['word', 'pos', 'lang_code', 'def_lang', 'definition', 'translations', 'wl_code', 'duplicated']]

In [53]:
one_gloss_defs_df

Unnamed: 0,word,pos,lang_code,def_lang,definition,translations,wl_code,duplicated
0,woordenboek,noun,nl,en,dictionary,,ENF,False
1,gratis,adj,nl,en,"free, without charge",,ENF,False
2,gratuit,adj,nl,en,"gratuitous, not obliged to",,ENF,False
3,pies,noun,nl,en,"alternative form of pis; pee, piss",,ENF,False
4,A,character,nl,en,"The first letter of the Dutch alphabet, writte...",,ENF,False
...,...,...,...,...,...,...,...,...
2165730,machine taper,noun,en,en,A system for securing cutting tools or toolhol...,,EEF,False
2165731,machine tapers,noun,en,en,plural of machine taper,,EEF,False
2165732,authentic assessments,noun,en,en,plural of authentic assessment,,EEF,False
2165733,barn-raisings,noun,en,en,plural of barn-raising,,EEF,False


In [57]:
duplicated_df = one_gloss_defs_df[one_gloss_defs_df['duplicated'] == True]

In [64]:
same_rows = one_gloss_defs_df[one_gloss_defs_df['word'] == one_gloss_defs_df['definition']]

In [65]:
same_rows

Unnamed: 0,word,pos,lang_code,def_lang,definition,translations,wl_code,duplicated
12,product,noun,nl,en,product,,ENF,False
18,aam,noun,nl,en,aam,,ENF,False
19,aardwolf,noun,nl,en,aardwolf,,ENF,False
26,quiz,noun,nl,en,quiz,,ENF,False
27,millennium,noun,nl,en,millennium,,ENF,False
...,...,...,...,...,...,...,...,...
1349491,K. pneumoniae carbapenemase,noun,en,en,K. pneumoniae carbapenemase,,EEF,False
1385442,RGSS,name,en,en,RGSS,,EEF,False
1513937,I don't want to buy anything,phrase,en,en,I don't want to buy anything,"[{'lang': 'Dutch', 'code': 'nl', 'lang_code': ...",EEF,False
1736129,Litvin,noun,en,en,Litvin,,EEF,False


In [67]:
duplicates = one_gloss_defs_df.groupby('word')['lang_code'].nunique().reset_index()
duplicates = duplicates[duplicates['lang_code'] > 1]

Unnamed: 0,word,lang_code
4,!Xóõ,2
73,'Auhelawa,2
250,'er,2
339,'m,2
360,'n,2
...,...,...
1920924,ǃKung,2
1921053,β-lactam,2
1921238,◌̀,2
1921239,◌́,2


In [60]:
groups = duplicated_df.groupby(group_keys=['word', 'pos'], by='word')

In [61]:
groups.describe()

KeyboardInterrupt: 

## NNF - EEF Replacement Rules Test

In [29]:
previous_save_path = Path(WIKT_CLEANING_DIR, '13-11-25')
NNF_df_file = Path(previous_save_path, 'NNF_definitions_df.csv')
EEF_df_file = Path(previous_save_path, 'EEF_definitions_df.csv')
NNF_df = pd.read_csv(NNF_df_file, index_col=0)
EEF_df = pd.read_csv(EEF_df_file, index_col=0)

  NNF_df = pd.read_csv(NNF_df_file, index_col=0)


In [32]:
NNF_df

Unnamed: 0,word,pos,lang_code,sense_code,gloss_code,gloss,wl_code,en_translations,forms,nl_translations
0,ja,adv,nl,0,0,"duidt bevestiging, instemming, toestemming, in...",NNF,"[{'word': 'yes', 'sense': 'duidt bevestiging o...",,
1,ja,intj,nl,0,0,kreet van opwinding,NNF,"[{'word': 'yes', 'sense': 'kreet van opwinding'}]",,
2,ja,intj,nl,1,0,nou ja: kijk zo simpel is het! dat je dat niet...,NNF,"[{'word': 'yes', 'sense': 'kreet van opwinding'}]",,
3,ja,noun,nl,0,0,bevestigend of instemmend antwoord,NNF,"[{'word': 'yes', 'sense': 'bevestigend of inst...",,
4,neen,intj,nl,0,0,ontkenning van de gestelde vraag of gedane bew...,NNF,"[{'word': 'no', 'sense': 'nee'}]",,
...,...,...,...,...,...,...,...,...,...,...
709451,mottigaardje,noun,nl,0,0,verkleinwoord enkelvoud van het zelfstandig na...,NNF,,[{'word': 'mottigaard'}],
709452,mottigaards,noun,nl,0,0,meervoud van het zelfstandig naamwoord mottigaard,NNF,,[{'word': 'mottigaard'}],
709453,bankpootjes,noun,nl,0,0,verkleinwoord meervoud van het zelfstandig naa...,NNF,,[{'word': 'bankpoot'}],
709454,bankpootje,noun,nl,0,0,verkleinwoord enkelvoud van het zelfstandig na...,NNF,,[{'word': 'bankpoot'}],


In [87]:
recorder_df = pd.DataFrame(columns=['word', 'pos', 'code', 'test1', 'result1', 'in_en_df'])

In [88]:
recorder_df = NNF_df.loc[:, ['word', 'pos', 'lang_code']]
recorder_df['word_code'] = recorder_df['word']+ '_' + recorder_df['pos']
recorder_df = recorder_df.drop_duplicates(subset=['word_code'])

In [89]:
recorder_df = recorder_df.drop_duplicates(subset=['word_code'])
recorder_df = recorder_df.sort_values(by='word_code')

In [90]:
recorder_df.head(10)

Unnamed: 0,word,pos,lang_code,word_code
45841,'k,pron,nl,'k_pron
80854,'m,pron,nl,'m_pron
18767,'n,article,nl,'n_article
604771,'nieuwe' tweezaadlobbigen,noun,nl,'nieuwe' tweezaadlobbigen_noun
229473,'r,adv,nl,'r_adv
229469,'r,pron,nl,'r_pron
27585,'s anderendaags,adv,nl,'s anderendaags_adv
34532,'s avonds,adv,nl,'s avonds_adv
34501,'s daags,adv,nl,'s daags_adv
501430,'s dinsdags,adv,nl,'s dinsdags_adv


In [50]:
EEF_df['word_code'] = EEF_df['word'] + '_' + EEF_df['pos']

In [91]:
def replace_suffix_tie(word, na_return_val=''):
    if len(word) < 3 or not word.endswith('tie'):
        return na_return_val
    return word[0:-3] + 'tion'
    
def replace_z_begin(word, na_return_val=''):
    if word.startswith('z'):
        return 's' + word[1:]
    return na_return_val
    
def remove_trailing_n(word, na_return_val=''):
    if word.endswith('n'):
        return word[0:-1]
    return na_return_val
    
def remove_end_en(word, na_return_val=''):
    if len(word) > 2 and word.endswith('en'):
        return word[0:-2]
    return na_return_val

def replace_oe_to_oo(word, na_return_val=''):
    index = word.find('oe')
    if index != -1:
        word.replace('oe', 'oo')
        return word
    else:
        return na_return_val

In [98]:
def replace_letters_in_word(word, original_letters, replace_with, na_return=''):
    if len(word) < len(original_letters):
        return na_return
    index = word.find(original_letters)
    if index != -1:
        word = word.replace(original_letters, replace_with)
        return word
    return na_return

In [None]:
def replace_beginning_letters(word, original_prefix, replace_with, na_return=''):
    if len(word) < len(original_prefix):
        return na_return
    
    if word.beginswith(original_prefix):
        word = replace_with + word[len(original_prefix):]
        return word
    return na_return

In [99]:
def replace_end_letters(word, original_suffix, replace_with, na_return=''):
    if len(word) < len(original_suffix):
        return na_return
    if word.endswith(original_suffix):
        word = word[0:-len(original_suffix)] + replace_with
        return word
    return na_return

In [92]:
recorder_df['test1'] = 'no change'
recorder_df['result1'] = recorder_df['word'].isin(EEF_df['word'])

In [93]:
display(recorder_df['result1'].value_counts())
recorder_df.head(10)

result1
False    578143
True      22189
Name: count, dtype: int64

Unnamed: 0,word,pos,lang_code,word_code,test1,result1
45841,'k,pron,nl,'k_pron,no change,False
80854,'m,pron,nl,'m_pron,no change,True
18767,'n,article,nl,'n_article,no change,True
604771,'nieuwe' tweezaadlobbigen,noun,nl,'nieuwe' tweezaadlobbigen_noun,no change,False
229473,'r,adv,nl,'r_adv,no change,True
229469,'r,pron,nl,'r_pron,no change,True
27585,'s anderendaags,adv,nl,'s anderendaags_adv,no change,False
34532,'s avonds,adv,nl,'s avonds_adv,no change,False
34501,'s daags,adv,nl,'s daags_adv,no change,False
501430,'s dinsdags,adv,nl,'s dinsdags_adv,no change,False


In [94]:
recorder_df['test2'] = 'drop trailing n'
recorder_df['word2'] = recorder_df['word'].apply(lambda x: remove_trailing_n(x, x))

In [95]:
recorder_df['result2'] = recorder_df['word2'].isin(EEF_df['word'])

In [97]:
test = 'result2'
display(recorder_df[test].value_counts())
recorder_df.head(10)

result2
False    576789
True      23543
Name: count, dtype: int64

Unnamed: 0,word,pos,lang_code,word_code,test1,result1,test2,word2,result2
45841,'k,pron,nl,'k_pron,no change,False,drop trailing n,'k,False
80854,'m,pron,nl,'m_pron,no change,True,drop trailing n,'m,True
18767,'n,article,nl,'n_article,no change,True,drop trailing n,',True
604771,'nieuwe' tweezaadlobbigen,noun,nl,'nieuwe' tweezaadlobbigen_noun,no change,False,drop trailing n,'nieuwe' tweezaadlobbige,False
229473,'r,adv,nl,'r_adv,no change,True,drop trailing n,'r,True
229469,'r,pron,nl,'r_pron,no change,True,drop trailing n,'r,True
27585,'s anderendaags,adv,nl,'s anderendaags_adv,no change,False,drop trailing n,'s anderendaags,False
34532,'s avonds,adv,nl,'s avonds_adv,no change,False,drop trailing n,'s avonds,False
34501,'s daags,adv,nl,'s daags_adv,no change,False,drop trailing n,'s daags,False
501430,'s dinsdags,adv,nl,'s dinsdags_adv,no change,False,drop trailing n,'s dinsdags,False


In [81]:
display(recorder_df['result1'].value_counts())
display(recorder_df['transformed2'].value_counts())
display(recorder_df['result2'].value_counts())
recorder_df.head(10)

result1
False    580935
True      19397
Name: count, dtype: int64

transformed2
eene                    7
ene                     7
meerdere                6
al                      6
boeie                   6
                       ..
überseksuelers          1
α-proteobacterië        1
β-lactam                1
β-lactamantibioticum    1
's najaars              1
Name: count, Length: 558357, dtype: int64

result2
False    576789
True      23543
Name: count, dtype: int64

Unnamed: 0,word,pos,lang_code,word_code,test1,result1,test2,transformed2,result2
45841,'k,pron,nl,'k_pron,no change,False,drop trailing n,'k,False
80854,'m,pron,nl,'m_pron,no change,True,drop trailing n,'m,True
18767,'n,article,nl,'n_article,no change,False,drop trailing n,',True
604771,'nieuwe' tweezaadlobbigen,noun,nl,'nieuwe' tweezaadlobbigen_noun,no change,False,drop trailing n,'nieuwe' tweezaadlobbige,False
229473,'r,adv,nl,'r_adv,no change,False,drop trailing n,'r,True
229469,'r,pron,nl,'r_pron,no change,False,drop trailing n,'r,True
27585,'s anderendaags,adv,nl,'s anderendaags_adv,no change,False,drop trailing n,'s anderendaags,False
34532,'s avonds,adv,nl,'s avonds_adv,no change,False,drop trailing n,'s avonds,False
34501,'s daags,adv,nl,'s daags_adv,no change,False,drop trailing n,'s daags,False
501430,'s dinsdags,adv,nl,'s dinsdags_adv,no change,False,drop trailing n,'s dinsdags,False


In [104]:
def apply_test(df, check_df, test_num, test_description, original_letters, replace_with, replace_function='middle', na_val='', with_pos_check=False):
    df[f'test{test_num}'] = test_description
    if replace_function == 'begin':
        if not na_val == 'word':
            df[f'word{test_num}'] = df['word'].apply(lambda x: replace_beginning_letters(x, original_letters, replace_with, na_val))
        else:
            df[f'word{test_num}'] = df['word'].apply(lambda x: replace_beginning_letters(x, original_letters, replace_with, x))
    if replace_function == 'end':
        if not na_val == 'word':
            df[f'word{test_num}'] = df['word'].apply(lambda x: replace_end_letters(x, original_letters, replace_with, na_val))
        else:
            df[f'word{test_num}'] = df['word'].apply(lambda x: replace_end_letters(x, original_letters, replace_with, x))
    if replace_function == 'middle':
        if not na_val == 'word':
            df[f'word{test_num}'] = df['word'].apply(lambda x: replace_letters_in_word(x, original_letters, replace_with, na_val))
        else:
            df[f'word{test_num}'] = df['word'].apply(lambda x: replace_letters_in_word(x, original_letters, replace_with, x))
    if with_pos_check:
        temp_col = df[f'word{test_num}'] + '_' + df['pos']
        df[f'result{test_num}'] = temp_col.isin(EEF_df['word_code'])
    else:
        df[f'result{test_num}'] = df[f'word{test_num}'].isin(check_df['word'])
    return df


In [None]:
apply_test(recorder_df, EEF_df, 3, 'replace oe with oo', 'oe', 'oo', replace_function='middle', na_val='word')

Unnamed: 0,word,pos,lang_code,word_code,test1,result1,test2,word2,result2,test3,word3,result3
45841,'k,pron,nl,'k_pron,no change,False,drop trailing n,'k,False,replace oe with oo,'k,False
80854,'m,pron,nl,'m_pron,no change,True,drop trailing n,'m,True,replace oe with oo,'m,True
18767,'n,article,nl,'n_article,no change,True,drop trailing n,',True,replace oe with oo,'n,True
604771,'nieuwe' tweezaadlobbigen,noun,nl,'nieuwe' tweezaadlobbigen_noun,no change,False,drop trailing n,'nieuwe' tweezaadlobbige,False,replace oe with oo,'nieuwe' tweezaadlobbigen,False
229473,'r,adv,nl,'r_adv,no change,True,drop trailing n,'r,True,replace oe with oo,'r,True
...,...,...,...,...,...,...,...,...,...,...,...,...
164447,€50-biljetten,noun,nl,€50-biljetten_noun,no change,False,drop trailing n,€50-biljette,False,replace oe with oo,€50-biljetten,False
164411,€500-biljet,noun,nl,€500-biljet_noun,no change,False,drop trailing n,€500-biljet,False,replace oe with oo,€500-biljet,False
164466,€500-biljetje,noun,nl,€500-biljetje_noun,no change,False,drop trailing n,€500-biljetje,False,replace oe with oo,€500-biljetje,False
164467,€500-biljetjes,noun,nl,€500-biljetjes_noun,no change,False,drop trailing n,€500-biljetjes,False,replace oe with oo,€500-biljetjes,False


In [None]:
apply_test(recorder_df, EEF_df, 4, 'replace aa with a', 'aa', 'a', replace_function='middle', na_val='')

Unnamed: 0,word,pos,lang_code,word_code,test1,result1,test2,word2,result2,test3,word3,result3,test4,word4,result4
45841,'k,pron,nl,'k_pron,no change,False,drop trailing n,'k,False,replace oe with oo,'k,False,replace aa with a,'k,False
80854,'m,pron,nl,'m_pron,no change,True,drop trailing n,'m,True,replace oe with oo,'m,True,replace aa with a,'m,True
18767,'n,article,nl,'n_article,no change,True,drop trailing n,',True,replace oe with oo,'n,True,replace aa with a,'n,True
604771,'nieuwe' tweezaadlobbigen,noun,nl,'nieuwe' tweezaadlobbigen_noun,no change,False,drop trailing n,'nieuwe' tweezaadlobbige,False,replace oe with oo,'nieuwe' tweezaadlobbigen,False,replace aa with a,'nieuwe' tweezadlobbigen,False
229473,'r,adv,nl,'r_adv,no change,True,drop trailing n,'r,True,replace oe with oo,'r,True,replace aa with a,'r,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164447,€50-biljetten,noun,nl,€50-biljetten_noun,no change,False,drop trailing n,€50-biljette,False,replace oe with oo,€50-biljetten,False,replace aa with a,€50-biljetten,False
164411,€500-biljet,noun,nl,€500-biljet_noun,no change,False,drop trailing n,€500-biljet,False,replace oe with oo,€500-biljet,False,replace aa with a,€500-biljet,False
164466,€500-biljetje,noun,nl,€500-biljetje_noun,no change,False,drop trailing n,€500-biljetje,False,replace oe with oo,€500-biljetje,False,replace aa with a,€500-biljetje,False
164467,€500-biljetjes,noun,nl,€500-biljetjes_noun,no change,False,drop trailing n,€500-biljetjes,False,replace oe with oo,€500-biljetjes,False,replace aa with a,€500-biljetjes,False


In [109]:
if not Path.exists(current_save_folder):
    Path.mkdir(current_save_folder)
recorder_df.to_csv(Path(current_save_folder, 'recorder_df.csv'))

In [179]:
def display_results_overview(df, check_changed=True):
    test_indexes = []
    word_indexes = []
    result_indexes = []
    number_words_changed = None
    for i, col in enumerate(df.columns):
        if 'test' in col:
            print('------------')
            print('Test: ', df.iloc[0, i])
        elif 'word' in col and col != 'word_code':
            if check_changed:
                number_words_changed = df[df['word'] != df[col]]
            
                print('Words affected: ', number_words_changed.shape[0])
                
        elif 'result' in col:
            print("Number of words in other dictionary from test: ", len(df[df[col] == True]))
            
            print("Number of words changed in other dictionary: ", number_words_changed[col].value_counts())

In [180]:
display_results_overview(recorder_df)

Words affected:  0
------------
Test:  no change
Number of words in other dictionary from test:  22189
Number of words changed in other dictionary:  Series([], Name: count, dtype: int64)
------------
Test:  drop trailing n
Words affected:  122718
Number of words in other dictionary from test:  23543
Number of words changed in other dictionary:  result2
False    120195
True       2523
Name: count, dtype: int64
------------
Test:  replace oe with oo
Words affected:  43172
Number of words in other dictionary from test:  22471
Number of words changed in other dictionary:  result3
False    42779
True       393
Name: count, dtype: int64
------------
Test:  replace aa with a
Words affected:  74535
Number of words in other dictionary from test:  23647
Number of words changed in other dictionary:  result4
False    72969
True      1566
Name: count, dtype: int64


In [181]:
apply_test(recorder_df, EEF_df, 5, 'replace tie with tion', 'tie', 'tion', replace_function='end', na_val='word')

Unnamed: 0,word,pos,lang_code,word_code,test1,result1,test2,word2,result2,test3,word3,result3,test4,word4,result4,test5,word5,result5
45841,'k,pron,nl,'k_pron,no change,False,drop trailing n,'k,False,replace oe with oo,'k,False,replace aa with a,'k,False,replace tie with tion,'k,False
80854,'m,pron,nl,'m_pron,no change,True,drop trailing n,'m,True,replace oe with oo,'m,True,replace aa with a,'m,True,replace tie with tion,'m,True
18767,'n,article,nl,'n_article,no change,True,drop trailing n,',True,replace oe with oo,'n,True,replace aa with a,'n,True,replace tie with tion,'n,True
604771,'nieuwe' tweezaadlobbigen,noun,nl,'nieuwe' tweezaadlobbigen_noun,no change,False,drop trailing n,'nieuwe' tweezaadlobbige,False,replace oe with oo,'nieuwe' tweezaadlobbigen,False,replace aa with a,'nieuwe' tweezadlobbigen,False,replace tie with tion,'nieuwe' tweezaadlobbigen,False
229473,'r,adv,nl,'r_adv,no change,True,drop trailing n,'r,True,replace oe with oo,'r,True,replace aa with a,'r,True,replace tie with tion,'r,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164447,€50-biljetten,noun,nl,€50-biljetten_noun,no change,False,drop trailing n,€50-biljette,False,replace oe with oo,€50-biljetten,False,replace aa with a,€50-biljetten,False,replace tie with tion,€50-biljetten,False
164411,€500-biljet,noun,nl,€500-biljet_noun,no change,False,drop trailing n,€500-biljet,False,replace oe with oo,€500-biljet,False,replace aa with a,€500-biljet,False,replace tie with tion,€500-biljet,False
164466,€500-biljetje,noun,nl,€500-biljetje_noun,no change,False,drop trailing n,€500-biljetje,False,replace oe with oo,€500-biljetje,False,replace aa with a,€500-biljetje,False,replace tie with tion,€500-biljetje,False
164467,€500-biljetjes,noun,nl,€500-biljetjes_noun,no change,False,drop trailing n,€500-biljetjes,False,replace oe with oo,€500-biljetjes,False,replace aa with a,€500-biljetjes,False,replace tie with tion,€500-biljetjes,False


In [182]:
display_results_overview(recorder_df)

Words affected:  0
------------
Test:  no change
Number of words in other dictionary from test:  22189
Number of words changed in other dictionary:  Series([], Name: count, dtype: int64)
------------
Test:  drop trailing n
Words affected:  122718
Number of words in other dictionary from test:  23543
Number of words changed in other dictionary:  result2
False    120195
True       2523
Name: count, dtype: int64
------------
Test:  replace oe with oo
Words affected:  43172
Number of words in other dictionary from test:  22471
Number of words changed in other dictionary:  result3
False    42779
True       393
Name: count, dtype: int64
------------
Test:  replace aa with a
Words affected:  74535
Number of words in other dictionary from test:  23647
Number of words changed in other dictionary:  result4
False    72969
True      1566
Name: count, dtype: int64
------------
Test:  replace tie with tion
Words affected:  3258
Number of words in other dictionary from test:  23229
Number of words ch