In [1]:
import Levenshtein
import pandas as pd
import numpy as np
from dutchanalyzer.config import *
from dutchanalyzer.utilities.pandas_utils import *
from dutchanalyzer.utilities.replacement_utils import *
from dutchanalyzer.utilities.json_utils import *

In [4]:
import pickle

In [5]:
# Date-stamped (dd-mm-yy) output folder for this run, located under WIKT_CLEANING_DIR.
today = datetime.date.today().strftime("%d-%m-%y")
current_save_folder = Path(WIKT_CLEANING_DIR, today)
folders = {'en': ['EEF', 'ENF'], 'nl': ['NEF', 'NNF']}

# Create every <key>/<name> sub-directory listed in `folders`; directories that
# already exist are left alone, so re-running this cell is harmless.
for group_name, subfolder_names in folders.items():
    for subfolder_name in subfolder_names:
        Path(current_save_folder, group_name, subfolder_name).mkdir(parents=True, exist_ok=True)

In [6]:
# Locations of the two pickled entry collections inside ENF_FOLDER.
enf_one_sense_path = Path(ENF_FOLDER) / 'one_sense_entries.pkl'
enf_one_word_glosses_path = Path(ENF_FOLDER) / 'one_word_entries.pkl'

In [14]:
# Load the entries from disk. The unpickled object is used as a dict further down
# (it is iterated with `.items()`), so it is not pre-initialised as a list here.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that were produced by this project.
with open(enf_one_word_glosses_path, 'rb') as pickle_file:
    enf_one_word_entries = pickle.load(pickle_file)

- The "one-word" definitions are not actually one-word definitions; they will need to be pared down.

In [None]:
# Preview only the first 20 entries; displaying the whole dictionary floods the notebook output.
dict(list(enf_one_word_entries.items())[:20])

{('woordenboek', 'noun'): 'dictionary',
 ('gratis', 'adj'): 'free, without charge',
 ('gratuit', 'adj'): 'gratuitous, not obliged to',
 ('pies', 'noun'): 'alternative form of pis; pee, piss',
 ('raven', 'verb'): 'to (hold a) rave, to party wildly',
 ('raven', 'noun'): 'plural of raaf',
 ('trade', 'verb'): 'singular past subjunctive of treden',
 ('patronage', 'noun'): 'patronage (act of providing approval and support)',
 ('product', 'noun'): 'product',
 ('name', 'verb'): 'singular past subjunctive of nemen',
 ('a-', 'prefix'): 'a-: not, without, opposite of',
 ('aam', 'noun'): 'aam',
 ('aardwolf', 'noun'): 'aardwolf',
 ('abattoir', 'noun'): 'abattoir, slaughterhouse',
 ('abject', 'adj'): 'reprehensible, despicable, abject',
 ('abrupt', 'adj'): 'abrupt (sudden)',
 ('on',
  'adv'): 'rarely used as shorthand for oneven (odd), the prefix on- means not (corresponds to English un-)',
 ('kun', 'verb'): 'second-person singular present indicative of kunnen',
 ('quiz', 'noun'): 'quiz',
 ('millenn

## Levenshtein Distance

In [24]:
# Bucket every entry by the Levenshtein distance between the first element of its
# key (the word) and its gloss, and tally how many entries fall on each distance.
words_with_dist_0_2 = {}
words_with_higher = {}
words_with_100 = {}
lev_counts = {}
for entry_key, gloss in enf_one_word_entries.items():
    distance = Levenshtein.distance(entry_key[0], gloss)
    lev_counts[distance] = lev_counts.get(distance, 0) + 1

    # Pick the destination bucket first, then store the record once.
    if distance <= 2:
        bucket = words_with_dist_0_2
    elif distance > 100:
        bucket = words_with_100
    else:
        bucket = words_with_higher
    bucket[entry_key] = {'gloss': gloss, 'lev_distance': distance}

   

In [25]:
# Entries whose Levenshtein distance between word and gloss is greater than 100.
words_with_100

{('een',
  'article'): {'gloss': 'a, an; indefinite article, placed before a singular noun, indicating a general case of a person or thing', 'lev_distance': 101},
 ('most',
  'noun'): {'gloss': 'must (unfermented or partially fermented mashed grapes or rarely other fruits, an early stage in the production of wine)', 'lev_distance': 116},
 ('guanaco',
  'noun'): {'gloss': 'guanaco (a South American ruminant (Lama guanicoe), closely related to the other lamoids, the alpaca, llama, and vicuña in the family Camelidae)', 'lev_distance': 137},
 ('rechtsomkeert',
  'adv'): {'gloss': "(only used with maken) rechtsomkeert maken: do an about-turn, do an about-face; turn on one's heels, make a U-turn", 'lev_distance': 101},
 ('gans',
  'noun'): {'gloss': 'a goose, a waterfowl of the family Anatidae, especially of the subfamily Anserinae excepting the genera Cygnus and Coscoroba; in non-scientific usage the term may be applied to larger waterfowl of the Anatidae who do not belong to the Anserinae'

In [22]:
# (distance, count) pairs in ascending order of distance.
list_lev = sorted(lev_counts.items())


In [None]:
# Distribution of Levenshtein distances as (distance, number of entries) pairs.
list_lev

[(0, 1091),
 (1, 787),
 (2, 1642),
 (3, 1137),
 (4, 1087),
 (5, 1123),
 (6, 1465),
 (7, 1545),
 (8, 1568),
 (9, 1549),
 (10, 1581),
 (11, 15921),
 (12, 10201),
 (13, 5941),
 (14, 1641),
 (15, 1497),
 (16, 5078),
 (17, 4192),
 (18, 1243),
 (19, 1242),
 (20, 1712),
 (21, 2593),
 (22, 1590),
 (23, 4665),
 (24, 1894),
 (25, 853),
 (26, 613),
 (27, 583),
 (28, 604),
 (29, 685),
 (30, 732),
 (31, 1074),
 (32, 920),
 (33, 3132),
 (34, 706),
 (35, 640),
 (36, 402),
 (37, 345),
 (38, 324),
 (39, 280),
 (40, 272),
 (41, 297),
 (42, 275),
 (43, 277),
 (44, 245),
 (45, 276),
 (46, 274),
 (47, 352),
 (48, 636),
 (49, 674),
 (50, 1201),
 (51, 340),
 (52, 268),
 (53, 171),
 (54, 148),
 (55, 133),
 (56, 157),
 (57, 132),
 (58, 123),
 (59, 132),
 (60, 126),
 (61, 105),
 (62, 152),
 (63, 550),
 (64, 354),
 (65, 104),
 (66, 140),
 (67, 163),
 (68, 220),
 (69, 252),
 (70, 495),
 (71, 320),
 (72, 55),
 (73, 92),
 (74, 65),
 (75, 72),
 (76, 126),
 (77, 100),
 (78, 71),
 (79, 62),
 (80, 58),
 (81, 55),
 (82,

## Form Of Pairing

In [None]:
def is_likely_form_of_def(gloss):
    """Heuristically detect a grammatical "form of" gloss and extract its lemma.

    A gloss counts as a form-of definition when a grammatical keyword (for
    example ``plural`` or ``subjunctive``) appears as a whole word before the
    word ``of``, as in ``'singular past subjunctive of treden'``.

    Args:
        gloss: The definition text to inspect.

    Returns:
        A ``(is_form, lemma)`` pair. ``(True, lemma)`` when the gloss looks like
        a form-of definition, where ``lemma`` is the word that directly follows
        ``of``; ``(False, None)`` otherwise. A pair is returned on every path so
        callers can always unpack the result.
    """
    form_keywords = {
        'plural', 'subjunctive', 'second-person', 'indicative', 'first-person',
        'third-person', 'singular', 'present', 'participle',
    }

    # Work on whole tokens rather than substrings: a plain `'of' in gloss` also
    # matches "often", and `'present' in gloss` also matches "represent".
    tokens = [token.strip('()[],;:.') for token in gloss.split()]
    lowered = [token.lower() for token in tokens]
    if 'of' not in lowered:
        return False, None

    of_index = lowered.index('of')
    has_form_keyword = any(token in form_keywords for token in lowered[:of_index])
    # The lemma is the word right after "of"; it is missing when "of" ends the gloss.
    lemma = tokens[of_index + 1] if of_index + 1 < len(tokens) else ''
    if has_form_keyword and lemma:
        return True, lemma
    return False, None

In [None]:
def get_first_word(gloss):
    """Return a single representative word for ``gloss`` (unfinished).

    When ``is_likely_form_of_def`` reports the gloss as a form-of definition,
    the word it extracted is returned. The remaining branches are incomplete;
    see the review notes below.
    """
    # NOTE(review): this unpacking needs is_likely_form_of_def to return a
    # two-element result on every path — confirm.
    isform, likely = is_likely_form_of_def(gloss)
    if isform: 
        return likely
    if ',' in gloss:
        # NOTE(review): gloss_parts is assigned but not used in the visible code.
        gloss_parts = gloss.split(',')

    # NOTE(review): str.startswith() requires a prefix argument, so this call
    # raises TypeError as written; the intended prefix is not visible here.
    if gloss.startswith():
        return 'form'