# Extract Definitions

In [7]:
%reload_ext dutchanalyzer.utilities.utils
%reload_ext dutchanalyzer.utilities.json_utils

In [1]:
import json
from pathlib import Path
from dutchanalyzer.config import *
from dutchanalyzer.utilities.utils import *
from dutchanalyzer.utilities.json_utils import *
from dutchanalyzer.utilities.replacement_utils import *
from dutchanalyzer.utilities.pandas_utils import *
from pathlib import Path
import datetime
from tqdm import tqdm

In [6]:
today = datetime.date.today().__format__("%d-%m-%y")
current_save_folder = Path(INTERIM_DATA_DIR, 'cleaning', 'wikt', str(today))
folders = ['EEF', 'ENF', 'NEF', 'NNF']

for f in folders:
    Path.mkdir(Path(current_save_folder, f), parents=True, exist_ok=True)

In [4]:
# Paths
NNF_folder = Path(WIKT_CLEANING_DIR, 'nl', 'NNF')
NEF_folder = Path(WIKT_CLEANING_DIR, 'nl', 'NEF')
EEF_folder = Path(WIKT_CLEANING_DIR, 'en', 'EEF')
ENF_folder = Path(WIKT_CLEANING_DIR, 'en', 'ENF')

## Extracting Words/Pos/Senses

- Extract words and parts of speech to dict, add all senses to dict

In [4]:
def extract_entry_senses(entry) -> list:
    entry_id, word, pos, senses = get_eid_word_pos_senses(entry)
    sense_entries = []
    for i, sense in enumerate(senses):
        sense_entries.append({'entry_id':entry_id, 'sid':i, 'word':word, 'pos':pos})
        sense_entries[-1].update(sense)
    return sense_entries

In [21]:
def save_batch_to_file(batch, file, mode):
    with open(file, mode, encoding='utf-8') as out:
        for obj in batch:
            json.dump(obj, out, ensure_ascii=False)
            out.write('\n')

In [6]:
def extract_file_senses(file, save_folder):
    batch = []
    mode = 'w+'
    out_file = Path(save_folder, 'senses_extracted.jsonl')
    batch_size = 50000
    with open(file, 'r', encoding='utf-8') as f:
        for line in tqdm(f, total=count_lines_with_progress(file, quiet=True)):
            loaded = json.loads(line)
            if loaded:
                entry_senses = extract_entry_senses(loaded)
                batch.extend(entry_senses)
                if len(batch) > batch_size:
                    save_batch_to_file(batch, out_file, mode)
                    if mode == 'w+':
                        mode = 'a'
                    batch = []
        if batch:
            save_batch_to_file(batch, out_file, mode)
    return batch

In [12]:
batch = extract_file_senses(EEF_FILE, Path(current_save_folder, 'en', 'EEF'))

100%|██████████| 1230364/1230364 [00:50<00:00, 24418.51it/s]


In [13]:
display(batch[0:10])

[{'entry_id': 'EEF_1197835',
  'sid': 0,
  'word': 'weight',
  'pos': 'verb',
  'glosses': ['To add weight to something; to make something heavier.'],
  'categories': ['English transitive verbs'],
  'links': [['add', 'add'], ['heavier', 'heavier']],
  'raw_glosses': ['(transitive) To add weight to something; to make something heavier.'],
  'tags': ['transitive']},
 {'entry_id': 'EEF_1197835',
  'sid': 1,
  'word': 'weight',
  'pos': 'verb',
  'glosses': ['To add weight to something; to make something heavier.',
   'To load (fabrics) with barite, etc. to increase the weight.'],
  'categories': ['English transitive verbs'],
  'links': [['add', 'add'], ['heavier', 'heavier'], ['barite', 'barite']],
  'raw_glosses': ['(transitive) To add weight to something; to make something heavier.',
   '(transitive, dyeing) To load (fabrics) with barite, etc. to increase the weight.'],
  'tags': ['transitive'],
  'topics': ['business', 'dyeing', 'manufacturing', 'textiles']},
 {'entry_id': 'EEF_1197835

In [14]:
batch = extract_file_senses(ENF_FILE, Path(current_save_folder, 'en', 'ENF'))

100%|██████████| 127859/127859 [00:06<00:00, 21220.71it/s]


In [15]:
display(batch[0:10])

[{'entry_id': 'ENF_113790',
  'sid': 0,
  'word': 'versombere',
  'pos': 'verb',
  'glosses': ['singular present subjunctive of versomberen'],
  'form_of': [{'word': 'versomberen'}],
  'categories': ['Dutch entries with incorrect language header',
   'Dutch non-lemma forms',
   'Dutch verb forms',
   'Pages with 1 entry',
   'Pages with entries'],
  'links': [['versomberen', 'versomberen#Dutch']],
  'raw_glosses': ['(dated or formal) singular present subjunctive of versomberen'],
  'tags': ['dated',
   'form-of',
   'formal',
   'present',
   'singular',
   'subjunctive']},
 {'entry_id': 'ENF_113791',
  'sid': 0,
  'word': 'versomberen',
  'pos': 'verb',
  'glosses': ['to sadden, to become sombre'],
  'categories': ['Dutch entries with incorrect language header',
   'Dutch lemmas',
   'Dutch prefixed verbs',
   'Dutch prefixed verbs with ver-',
   'Dutch terms circumfixed with ver- -en',
   'Dutch terms with usage examples',
   'Dutch verbs',
   'Dutch weak verbs',
   'Pages with 1 ent

In [16]:
batch = extract_file_senses(NNF_FILE, Path(current_save_folder, 'nl', 'NNF'))

100%|██████████| 598925/598925 [00:17<00:00, 33966.63it/s]


In [17]:
display(batch[0:10])

[{'entry_id': 'NNF_559475',
  'sid': 0,
  'word': 'wauwelaar',
  'pos': 'noun',
  'glosses': ['iemand die wauwelt']},
 {'entry_id': 'NNF_559476',
  'sid': 0,
  'word': 'wauwelaars',
  'pos': 'noun',
  'glosses': ['meervoud van het zelfstandig naamwoord wauwelaar'],
  'form_of': [{'word': 'wauwelaar'}],
  'categories': ['Zelfstandignaamwoordsvorm in het Nederlands'],
  'tags': ['form-of', 'plural']},
 {'entry_id': 'NNF_559477',
  'sid': 0,
  'word': 'wauwelde',
  'pos': 'verb',
  'glosses': ['enkelvoud verleden tijd van wauwelen'],
  'form_of': [{'word': 'wauwelen'}],
  'tags': ['form-of']},
 {'entry_id': 'NNF_559478',
  'sid': 0,
  'word': 'wauwelden',
  'pos': 'verb',
  'glosses': ['meervoud verleden tijd van wauwelen'],
  'form_of': [{'word': 'wauwelen'}],
  'tags': ['form-of']},
 {'entry_id': 'NNF_559479',
  'sid': 0,
  'word': 'wauwele',
  'pos': 'verb',
  'glosses': ['aanvoegende wijs van wauwelen'],
  'form_of': [{'word': 'wauwelen'}],
  'tags': ['form-of']},
 {'entry_id': 'NNF_5

In [18]:
batch = extract_file_senses(NEF_FILE, Path(current_save_folder, 'nl', 'NEF'))

100%|██████████| 16343/16343 [00:00<00:00, 141469.25it/s]


In [19]:
batch

[{'entry_id': 'NEF_0',
  'sid': 0,
  'word': 'A',
  'pos': 'noun',
  'glosses': ['de toon “a”'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_1',
  'sid': 0,
  'word': 'A double flat',
  'pos': 'noun',
  'glosses': ['de toon “ases”, een verlaagde “as”'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_2',
  'sid': 0,
  'word': 'A double flat major',
  'pos': 'noun',
  'glosses': ['Ases-majeur, een theoretische toonladder (11 mollen) die overeenkomt met G-majeur (1 kruis)'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_3',
  'sid': 0,
  'word': 'A double sharp',
  'pos': 'noun',
  'glosses': ['de toon “aïsis”, een verhoogde “aïs”'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_4',
  'sid': 0,
  'word': 'A flat',
  'pos': 'noun',
  'glosses': ['de toon “as”, een verlaagde “a”'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': [

## 

## Finding Definitions with , ; :

In [3]:
previous_save_folder = get_previous_save_folder(WIKT_CLEANING_DIR,days_ago=1)

In [47]:
eef_senses_file = Path(previous_save_folder, 'en','EEF', 'senses_extracted.jsonl')
enf_senses_file = Path(previous_save_folder, 'en','ENF', 'senses_extracted.jsonl')
nef_senses_file = Path(previous_save_folder, 'nl','NEF', 'senses_extracted.jsonl')
nnf_senses_file = Path(previous_save_folder, 'nl','NNF', 'senses_extracted.jsonl')
senses_files = [eef_senses_file, enf_senses_file, nnf_senses_file, nef_senses_file]

In [7]:
def get_current_save_folder():
    today = datetime.date.today().__format__("%d-%m-%y")
    return Path(INTERIM_DATA_DIR, 'cleaning', 'wikt', str(today))

In [45]:
def has_seperator_symbol(str_obj):
    seperator_symbols = [';', ',', ':', '[a]', '[b]', '[c]', '[d]', '(']
    if any(el in str_obj for el in seperator_symbols):
        return True
    else:
        return False

In [13]:
test_str = 'best def ever, next best'
test_str2 = 'less cool def [d]'
test_str3 = 'no seperator'
print(has_seperator_symbol(test_str))
print(has_seperator_symbol(test_str2))
print(has_seperator_symbol(test_str3))

True
True
False


In [16]:
def get_file_wl_code(file):
    if isinstance(file, Path):
        file = file.__str__()

    # elif '\\' in file and '.' in file:
    #     file = file.split('\\')[-1]
    #     file = file.split('.')[0]
    
    if 'EEF' in file or 'eef' in file:
        return 'EEF'
    if 'ENF' in file or 'enf' in file:
        return 'ENF'
    if 'NNF' in file or 'nnf' in file:
        return 'NNF'
    if 'NEF' in file or 'nef' in file:
        return 'NEF'
    if 'ENR' in file or 'enr' in file:
        return 'ENR'
    if 'EER' in file or 'eer' in file:
        return 'EER'
    if 'NNR' in file or 'nnr' in file:
        return 'NNR'
    if 'NER' in file or 'ner':
        return 'NER'
    return ''

In [None]:
def extract_glosses_with_seperator_symbols(file, save_current_f_folder=True, save_path=''):
    if save_current_f_folder:
        save_path = get_current_save_folder()
        save_path_ext = get_file_wl_code(file)
        save_path = Path(save_path, save_path_ext)
        if not save_path.exists():
            save_path.mkdir(parents=True)
    batch = []
    batch_size = 10000
    out_file = Path(save_path, 'glosses_with_seperator_symbols.jsonl')
    mode = 'w+'
    with open(file, 'r', encoding='utf-8') as f:
        
        for i, line in tqdm(enumerate(f), desc='extracting glosses that have seperator symbols'):
            if line:
                search_str = '"glosses": ['
                glosses_loc = line.find(search_str)
                if glosses_loc != -1:
                    end_loc = line.find(']', glosses_loc)
                    glosses_str = line[glosses_loc + len(search_str): end_loc]
                    if has_seperator_symbol(glosses_str):
                        loaded = json.loads(line)
                        batch.append(loaded)
                        if len(batch) > batch_size:
                            save_batch_to_file(batch, out_file, mode)
                            if mode == 'w+':
                                mode = 'a'
                            batch = []
        if batch:
            save_batch_to_file(batch, out_file, mode)
    return batch                    
                            

In [48]:
final_batches = []
for f in senses_files:
    batch = extract_glosses_with_seperator_symbols(f)
    final_batches.append(batch[:100])

1438367it [00:16, 86310.29it/s] 
168718it [00:03, 56145.69it/s]
695045it [00:03, 208258.09it/s]
24310it [00:00, 400829.98it/s]


In [50]:
for b in final_batches:
    display(b)

[{'entry_id': 'EEF_1201398',
  'sid': 10,
  'word': 'whiff',
  'pos': 'noun',
  'glosses': ['A sound like that of air passing through a small opening; a short or soft whistle.'],
  'categories': ['English terms with quotations'],
  'links': [['sound', 'sound#Noun'],
   ['passing', 'pass#Verb'],
   ['opening', 'opening#Noun'],
   ['soft', 'soft#Adjective'],
   ['whistle', 'whistle#Noun']],
  'raw_glosses': ['(figuratively)',
   'A sound like that of air passing through a small opening; a short or soft whistle.'],
  'tags': ['figuratively']},
 {'entry_id': 'EEF_1201398',
  'sid': 11,
  'word': 'whiff',
  'pos': 'noun',
  'glosses': ['A failure to hit a ball in various sports (for example, golf); a miss.'],
  'categories': ['American English', 'English slang', 'en:Sports'],
  'links': [['sports', 'sports'],
   ['failure', 'failure'],
   ['hit', 'hit#Verb'],
   ['ball', 'ball#Noun'],
   ['various', 'various'],
   ['sports', 'sport#Noun'],
   ['golf', 'golf#Noun'],
   ['miss', 'miss#Noun']]

[{'entry_id': 'ENF_127050',
  'sid': 0,
  'word': 'zwanenzang',
  'pos': 'noun',
  'glosses': ["swan song (last major work, accomplishment or effort before one's demise or retirement)"],
  'categories': ['Dutch compound terms',
   'Dutch entries with incorrect language header',
   'Dutch lemmas',
   'Dutch masculine nouns',
   'Dutch nouns',
   'Dutch nouns with plural in -en',
   'Dutch terms interfixed with -en-',
   'Pages with 1 entry',
   'Pages with entries'],
  'links': [['swan song', 'swan song']],
  'tags': ['masculine']},
 {'entry_id': 'ENF_127053',
  'sid': 0,
  'word': 'zwangere',
  'pos': 'adj',
  'glosses': ['inflection of zwanger:',
   'indefinite masculine and feminine singular'],
  'form_of': [{'word': 'zwanger'}],
  'links': [['zwanger', 'zwanger#Dutch']],
  'tags': ['feminine', 'form-of', 'indefinite', 'masculine', 'singular']},
 {'entry_id': 'ENF_127053',
  'sid': 1,
  'word': 'zwangere',
  'pos': 'adj',
  'glosses': ['inflection of zwanger:', 'indefinite plural'],


[{'entry_id': 'NNF_544703',
  'sid': 6,
  'word': 'voet',
  'pos': 'noun',
  'glosses': ['basis, onderstuk, voetstuk'],
  'categories': ['Gereedschap_in_het_Nederlands',
   'Techniek_in_het_Nederlands'],
  'topics': ['technology', 'tools']},
 {'entry_id': 'NNF_544704',
  'sid': 1,
  'word': 'voetafdruk',
  'pos': 'noun',
  'glosses': ['voetafdruk: de hoeveelheid land- en wateroppervlak die een bepaalde activiteit gebruikt'],
  'raw_tags': ['ecologische']},
 {'entry_id': 'NNF_544716',
  'sid': 0,
  'word': 'voetbal',
  'pos': 'noun',
  'glosses': ['een balsport waarbij twee teams van 11 spelers met hun voeten (of hoofd) een bal in het doel van de tegenstander proberen te krijgen'],
  'categories': ['Sport_in_het_Nederlands',
   'Woorden met artikelreferenties',
   'Woorden met boekreferenties'],
  'tags': ['neuter'],
  'topics': ['sports']},
 {'entry_id': 'NNF_544767',
  'sid': 0,
  'word': 'voetbalkaart',
  'pos': 'noun',
  'glosses': ['een kaart uit een serie, met afbeeldingen van voe

[{'entry_id': 'NEF_1',
  'sid': 0,
  'word': 'A double flat',
  'pos': 'noun',
  'glosses': ['de toon “ases”, een verlaagde “as”'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_2',
  'sid': 0,
  'word': 'A double flat major',
  'pos': 'noun',
  'glosses': ['Ases-majeur, een theoretische toonladder (11 mollen) die overeenkomt met G-majeur (1 kruis)'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_3',
  'sid': 0,
  'word': 'A double sharp',
  'pos': 'noun',
  'glosses': ['de toon “aïsis”, een verhoogde “aïs”'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_4',
  'sid': 0,
  'word': 'A flat',
  'pos': 'noun',
  'glosses': ['de toon “as”, een verlaagde “a”'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_5',
  'sid': 0,
  'word': 'A flat major scale',
  'pos': 'noun',
  'glosses': [': As-majeurtoonschaal'],
  'categories': ['Muziek_in_

In [36]:
batch = extract_glosses_with_seperator_symbols(nef_senses_file)

24310it [00:00, 435388.06it/s]


In [None]:
batch = extract_glosses_with_seperator_symbols(eef_senses_file)

1438367it [00:15, 92268.57it/s] 


In [39]:
batch

[{'entry_id': 'EEF_1206077',
  'sid': 1,
  'word': 'wind',
  'pos': 'verb',
  'glosses': ['To cause (someone) to become breathless, as by a blow to the abdomen, or by physical exertion, running, etc.'],
  'categories': ['English terms with usage examples',
   'English transitive verbs'],
  'links': [['breathless', 'breathless'],
   ['abdomen', 'abdomen'],
   ['physical', 'physical'],
   ['exertion', 'exertion']],
  'raw_glosses': ['(transitive) To cause (someone) to become breathless, as by a blow to the abdomen, or by physical exertion, running, etc.'],
  'tags': ['transitive']},
 {'entry_id': 'EEF_1206077',
  'sid': 3,
  'word': 'wind',
  'pos': 'verb',
  'glosses': ['To turn a boat or ship around, so that the wind strikes it on the opposite side.'],
  'categories': ['British English', 'English transitive verbs'],
  'raw_glosses': ['(transitive, British) To turn a boat or ship around, so that the wind strikes it on the opposite side.'],
  'tags': ['British', 'transitive']},
 {'entry_

In [43]:
enf_batch = extract_glosses_with_seperator_symbols(enf_senses_file)

168718it [00:02, 66665.42it/s]


In [44]:
enf_batch

[{'entry_id': 'ENF_125340',
  'sid': 0,
  'word': 'zetten weg',
  'pos': 'verb',
  'glosses': ['inflection of wegzetten:', 'plural present/past indicative'],
  'form_of': [{'word': 'wegzetten'}],
  'links': [['wegzetten', 'wegzetten#Dutch']],
  'tags': ['form-of']},
 {'entry_id': 'ENF_125340',
  'sid': 1,
  'word': 'zetten weg',
  'pos': 'verb',
  'glosses': ['inflection of wegzetten:', 'plural present/past subjunctive'],
  'form_of': [{'word': 'wegzetten'}],
  'links': [['wegzetten', 'wegzetten#Dutch']],
  'raw_glosses': ['inflection of wegzetten:',
   '(dated or formal) plural present/past subjunctive'],
  'tags': ['dated', 'form-of', 'formal']},
 {'entry_id': 'ENF_125342',
  'sid': 0,
  'word': 'zettende',
  'pos': 'verb',
  'glosses': ['inflection of zettend:',
   'masculine/feminine singular attributive'],
  'form_of': [{'word': 'zettend'}],
  'links': [['zettend', 'zettend#Dutch']],
  'tags': ['attributive',
   'feminine',
   'form-of',
   'masculine',
   'participle',
   'singul

## Finding Definitions that are the same as the English/Dutch Word

In [20]:
nef_same_defs = []
short_glosses = []
for obj in batch:
    word = obj.get('word')
    glosses = obj.get('glosses')
    for gloss in glosses:
        if gloss == word:
            nef_same_defs.append(obj)
        gloss = gloss.strip()
        words_in_gloss = gloss.count(' ') + 1
        if words_in_gloss <= 3:
            short_glosses.append(obj)

In [23]:
print(len(nef_same_defs))
nef_same_defs

846


[{'entry_id': 'NEF_21',
  'sid': 0,
  'word': 'ADHD',
  'pos': 'noun',
  'glosses': ['ADHD'],
  'categories': ['Afkorting_in_het_Engels',
   'Initiaalwoord_in_het_Engels',
   'Psychologie_in_het_Engels'],
  'tags': ['abbreviation', 'acronym'],
  'topics': ['psychology']},
 {'entry_id': 'NEF_38',
  'sid': 0,
  'word': 'Aboriginal',
  'pos': 'noun',
  'glosses': ['Aboriginal']},
 {'entry_id': 'NEF_41',
  'sid': 0,
  'word': 'Achilles',
  'pos': 'noun',
  'glosses': ['Achilles'],
  'categories': ['Mythologie_in_het_Engels'],
  'topics': ['mythology']},
 {'entry_id': 'NEF_46',
  'sid': 0,
  'word': 'Afrikaans',
  'pos': 'noun',
  'glosses': ['Afrikaans'],
  'categories': ['Taal_in_het_Engels'],
  'tags': ['linguistics']},
 {'entry_id': 'NEF_55',
  'sid': 0,
  'word': 'American football',
  'pos': 'noun',
  'glosses': ['American football'],
  'categories': ['Sport_in_het_Engels'],
  'topics': ['sports']},
 {'entry_id': 'NEF_59',
  'sid': 0,
  'word': 'Ara',
  'pos': 'noun',
  'glosses': ['A

In [25]:
print(len(short_glosses))
short_glosses

15906


[{'entry_id': 'NEF_0',
  'sid': 0,
  'word': 'A',
  'pos': 'noun',
  'glosses': ['de toon “a”'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_5',
  'sid': 0,
  'word': 'A flat major scale',
  'pos': 'noun',
  'glosses': [': As-majeurtoonschaal'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_6',
  'sid': 0,
  'word': 'A flat minor chord',
  'pos': 'noun',
  'glosses': ['as-mineurakkoord'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_7',
  'sid': 0,
  'word': 'A major scale',
  'pos': 'noun',
  'glosses': [': A-grotetertstoonschaal'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_8',
  'sid': 0,
  'word': 'A minor chord',
  'pos': 'noun',
  'glosses': ['a-mineurakkoord'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_9',
  'sid': 0,
  'word': 'A minor scale',
  'pos': 'noun',
  'glosses': ['a