# Extract Definitions

In [20]:
%load_ext autoreload
%autoreload 3

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
import json
from pathlib import Path
from dutchanalyzer.config import *
from dutchanalyzer.utilities.utils import *
from dutchanalyzer.utilities.json_utils import *
from dutchanalyzer.utilities.replacement_utils import *
from dutchanalyzer.utilities.pandas_utils import *
from pathlib import Path
import datetime
from tqdm import tqdm

In [None]:
today = datetime.date.today().__format__("%d-%m-%y")
current_save_folder = Path(INTERIM_DATA_DIR, 'cleaning', 'wikt', str(today))
folders = ['EEF', 'ENF', 'EOF', 'NEF', 'NNF', 'NOF']

for f in folders:
    Path.mkdir(Path(current_save_folder, f), parents=True, exist_ok=True)

## Extracting Words/Pos/Senses

- Extract words and parts of speech to dict, add all senses to dict

In [None]:
def extract_entry_senses(entry) -> list:
    # SID starts at 1 so that sid 0 can be the top level in future analysis
    entry_id, word, pos, senses = get_eid_word_pos_senses(entry)
    sense_entries = []
    for i, sense in enumerate(senses):
        sense_entries.append({'entry_id':entry_id, 'sid':i + 1, 'word':word, 'pos':pos})
        sense_entries[-1].update(sense)
    return sense_entries

In [None]:
def save_batch_to_file(batch, out_file, mode):
    with open(out_file, mode, encoding='utf-8') as out:
        for obj in batch:
            json.dump(obj, out, ensure_ascii=False)
            out.write('\n')

In [12]:
def extract_file_senses(file, save_folder):
    batch = []
    mode = 'w+'
    if not save_folder.exists():
        save_folder.mkdir()
    out_file = Path(save_folder, 'senses_extracted.jsonl')
    batch_size = 50000
    with open(file, 'r', encoding='utf-8') as f:
        for line in tqdm(f, total=count_lines_with_progress(file, quiet=True)):
            loaded = json.loads(line)
            if loaded:
                entry_senses = extract_entry_senses(loaded)
                batch.extend(entry_senses)
                if len(batch) > batch_size:
                    save_batch_to_file(batch, out_file, mode)
                    if mode == 'w+':
                        mode = 'a'
                    batch = []
        if batch:
            save_batch_to_file(batch, out_file, mode)
    return batch, out_file

In [13]:
batch, eef_senses_file = extract_file_senses(EEF_FILE, Path(current_save_folder, 'EEF'))

100%|██████████| 1230354/1230354 [00:49<00:00, 24932.13it/s]


In [9]:
display(batch[0:10])

[{'entry_id': 'EEF_1197825',
  'sid': 1,
  'word': 'weight',
  'pos': 'verb',
  'glosses': ['To add weight to something; to make something heavier.'],
  'categories': ['English transitive verbs'],
  'links': [['add', 'add'], ['heavier', 'heavier']],
  'raw_glosses': ['(transitive) To add weight to something; to make something heavier.'],
  'tags': ['transitive']},
 {'entry_id': 'EEF_1197825',
  'sid': 2,
  'word': 'weight',
  'pos': 'verb',
  'glosses': ['To add weight to something; to make something heavier.',
   'To load (fabrics) with barite, etc. to increase the weight.'],
  'categories': ['English transitive verbs'],
  'links': [['add', 'add'], ['heavier', 'heavier'], ['barite', 'barite']],
  'raw_glosses': ['(transitive) To add weight to something; to make something heavier.',
   '(transitive, dyeing) To load (fabrics) with barite, etc. to increase the weight.'],
  'tags': ['transitive'],
  'topics': ['business', 'dyeing', 'manufacturing', 'textiles']},
 {'entry_id': 'EEF_1197825

In [14]:
batch, enf_senses_file = extract_file_senses(ENF_FILE, Path(current_save_folder, 'ENF'))

100%|██████████| 127859/127859 [00:06<00:00, 20859.92it/s]


In [15]:
display(batch[0:10])

[{'entry_id': 'ENF_113790',
  'sid': 1,
  'word': 'versombere',
  'pos': 'verb',
  'glosses': ['singular present subjunctive of versomberen'],
  'categories': ['Dutch entries with incorrect language header',
   'Dutch non-lemma forms',
   'Dutch verb forms',
   'Pages with 1 entry',
   'Pages with entries'],
  'form_of': [{'word': 'versomberen'}],
  'links': [['versomberen', 'versomberen#Dutch']],
  'raw_glosses': ['(dated or formal) singular present subjunctive of versomberen'],
  'tags': ['dated',
   'form-of',
   'formal',
   'present',
   'singular',
   'subjunctive']},
 {'entry_id': 'ENF_113791',
  'sid': 1,
  'word': 'versomberen',
  'pos': 'verb',
  'glosses': ['to sadden, to become sombre'],
  'categories': ['Dutch entries with incorrect language header',
   'Dutch lemmas',
   'Dutch prefixed verbs',
   'Dutch prefixed verbs with ver-',
   'Dutch terms circumfixed with ver- -en',
   'Dutch verbs',
   'Dutch weak verbs',
   'Pages with 1 entry',
   'Pages with entries'],
  'link

In [16]:
batch, nnf_senses_file = extract_file_senses(NNF_FILE, Path(current_save_folder, 'NNF'))

100%|██████████| 598925/598925 [00:18<00:00, 33004.05it/s]


In [21]:
display(batch[0:10])

[{'entry_id': 'NNF_559475',
  'sid': 0,
  'word': 'wauwelaar',
  'pos': 'noun',
  'glosses': ['iemand die wauwelt']},
 {'entry_id': 'NNF_559476',
  'sid': 0,
  'word': 'wauwelaars',
  'pos': 'noun',
  'glosses': ['meervoud van het zelfstandig naamwoord wauwelaar'],
  'categories': ['Zelfstandignaamwoordsvorm in het Nederlands'],
  'form_of': [{'word': 'wauwelaar'}],
  'tags': ['form-of', 'plural']},
 {'entry_id': 'NNF_559477',
  'sid': 0,
  'word': 'wauwelde',
  'pos': 'verb',
  'glosses': ['enkelvoud verleden tijd van wauwelen'],
  'form_of': [{'word': 'wauwelen'}],
  'tags': ['form-of']},
 {'entry_id': 'NNF_559478',
  'sid': 0,
  'word': 'wauwelden',
  'pos': 'verb',
  'glosses': ['meervoud verleden tijd van wauwelen'],
  'form_of': [{'word': 'wauwelen'}],
  'tags': ['form-of']},
 {'entry_id': 'NNF_559479',
  'sid': 0,
  'word': 'wauwele',
  'pos': 'verb',
  'glosses': ['aanvoegende wijs van wauwelen'],
  'form_of': [{'word': 'wauwelen'}],
  'tags': ['form-of']},
 {'entry_id': 'NNF_5

In [17]:
batch, nef_senses_file = extract_file_senses(NEF_FILE, Path(current_save_folder, 'NEF'))

100%|██████████| 16331/16331 [00:00<00:00, 83829.72it/s]


In [23]:
batch

[{'entry_id': 'NEF_0',
  'sid': 0,
  'word': 'A',
  'pos': 'noun',
  'glosses': ['de toon “a”'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_1',
  'sid': 0,
  'word': 'A double flat',
  'pos': 'noun',
  'glosses': ['de toon “ases”, een verlaagde “as”'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_2',
  'sid': 0,
  'word': 'A double flat major',
  'pos': 'noun',
  'glosses': ['Ases-majeur, een theoretische toonladder (11 mollen) die overeenkomt met G-majeur (1 kruis)'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_3',
  'sid': 0,
  'word': 'A double sharp',
  'pos': 'noun',
  'glosses': ['de toon “aïsis”, een verhoogde “aïs”'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_4',
  'sid': 0,
  'word': 'A flat',
  'pos': 'noun',
  'glosses': ['de toon “as”, een verlaagde “a”'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': [

In [23]:
batch, nof_senses_file = extract_file_senses(Path(WIKT_CLEANING_DIR, 'NOF', 'NOF.jsonl'), Path(current_save_folder, 'NOF'))

100%|██████████| 4368/4368 [00:00<00:00, 134065.49it/s]


In [25]:
batch, eof_senses_file = extract_file_senses(Path(WIKT_CLEANING_DIR, 'EOF', 'EOF.jsonl'), Path(current_save_folder, 'EOF'))

100%|██████████| 46278/46278 [00:01<00:00, 23341.37it/s]


## 

## Finding Definitions with , ; :

In [71]:
CUR_EEF_FOLDER = Path(current_save_folder, 'EEF')
CUR_ENF_FOLDER = Path(current_save_folder, 'ENF')
CUR_EOF_FOLDER = Path(current_save_folder, 'EOF')
CUR_NNF_FOLDER = Path(current_save_folder, 'NNF')
CUR_NEF_FOLDER = Path(current_save_folder, 'NEF')
CUR_NOF_FOLDER = Path(current_save_folder, 'NOF')

In [69]:
eef_senses_file = Path(CUR_EEF_FOLDER, 'senses_extracted.jsonl')
enf_senses_file = Path(CUR_ENF_FOLDER, 'senses_extracted.jsonl')
eof_senses_file = Path(CUR_EOF_FOLDER, 'senses_extracted.jsonl')
nef_senses_file = Path(CUR_NEF_FOLDER, 'senses_extracted.jsonl')
nnf_senses_file = Path(CUR_NNF_FOLDER, 'senses_extracted.jsonl')
nof_senses_file = Path(CUR_NOF_FOLDER, 'senses_extracted.jsonl')


In [72]:
senses_files = [eef_senses_file, enf_senses_file, eof_senses_file, nnf_senses_file, nef_senses_file, nof_senses_file]

In [76]:
import shutil

for file in senses_files:
    wl_code = get_file_wl_code(file)
    path = f'{wl_code}/{wl_code}_senses_extracted.jsonl'
    shutil.copy(file, Path(WIKT_CLEANING_DIR, path))

In [28]:
def get_current_save_folder():
    today = datetime.date.today().__format__("%d-%m-%y")
    return Path(INTERIM_DATA_DIR, 'cleaning', 'wikt', str(today))

In [None]:
def has_seperator_symbol(str_obj):
    seperator_symbols = [';', ',', ':', '[a]', '[b]', '[c]', '[d]', '(', '[']
    if any(el in str_obj for el in seperator_symbols):
        return True
    else:
        return False

In [30]:
test_str = 'best def ever, next best'
test_str2 = 'less cool def [d]'
test_str3 = 'no seperator'
print(has_seperator_symbol(test_str))
print(has_seperator_symbol(test_str2))
print(has_seperator_symbol(test_str3))

True
True
False


In [32]:
def extract_glosses_with_seperator_symbols(file, save_current_f_folder=True, save_path=''):
    if save_current_f_folder:
        save_path = get_current_save_folder()
        save_path_ext = get_file_wl_code(file)
        save_path = Path(save_path, save_path_ext)
        if not save_path.exists():
            save_path.mkdir(parents=True)
    batch = []
    batch_size = 10000
    out_file = Path(save_path, 'glosses_with_seperator_symbols.jsonl')
    mode = 'w+'
    with open(file, 'r', encoding='utf-8') as f:
        for i, line in tqdm(enumerate(f), desc='extracting glosses that have seperator symbols'):
            if line:
                search_str = '"glosses": ['
                glosses_loc = line.find(search_str)
                if glosses_loc != -1:
                    end_loc = line.find(']', glosses_loc)
                    glosses_str = line[glosses_loc + len(search_str): end_loc]
                    if has_seperator_symbol(glosses_str):
                        loaded = json.loads(line)
                        batch.append(loaded)
                        if len(batch) > batch_size:
                            save_batch_to_file(batch, out_file, mode)
                            if mode == 'w+':
                                mode = 'a'
                            batch = []
        if batch:
            save_batch_to_file(batch, out_file, mode)
    return batch, out_file                    
                            

In [36]:
final_batches = []
seperator_files =[]
for f in senses_files:
    batch, seperator_file = extract_glosses_with_seperator_symbols(f)
    final_batches.append(batch[:100])
    seperator_files.append(seperator_file)

extracting glosses that have seperator symbols: 1438357it [00:16, 86561.90it/s] 
extracting glosses that have seperator symbols: 168718it [00:02, 59123.69it/s]
extracting glosses that have seperator symbols: 59058it [00:00, 75576.04it/s] 
extracting glosses that have seperator symbols: 695041it [00:02, 237341.99it/s]
extracting glosses that have seperator symbols: 24284it [00:00, 234145.53it/s]
extracting glosses that have seperator symbols: 4837it [00:00, 326155.47it/s]


In [34]:
for b in final_batches:
    display(b)

[{'entry_id': 'EEF_1201388',
  'sid': 11,
  'word': 'whiff',
  'pos': 'noun',
  'glosses': ['A sound like that of air passing through a small opening; a short or soft whistle.'],
  'categories': ['English terms with quotations'],
  'links': [['sound', 'sound#Noun'],
   ['passing', 'pass#Verb'],
   ['opening', 'opening#Noun'],
   ['soft', 'soft#Adjective'],
   ['whistle', 'whistle#Noun']],
  'raw_glosses': ['(figuratively)',
   'A sound like that of air passing through a small opening; a short or soft whistle.'],
  'tags': ['figuratively']},
 {'entry_id': 'EEF_1201388',
  'sid': 12,
  'word': 'whiff',
  'pos': 'noun',
  'glosses': ['A failure to hit a ball in various sports (for example, golf); a miss.'],
  'categories': ['American English', 'English slang', 'en:Sports'],
  'links': [['sports', 'sports'],
   ['failure', 'failure'],
   ['hit', 'hit#Verb'],
   ['ball', 'ball#Noun'],
   ['various', 'various'],
   ['sports', 'sport#Noun'],
   ['golf', 'golf#Noun'],
   ['miss', 'miss#Noun']]

[{'entry_id': 'ENF_127050',
  'sid': 1,
  'word': 'zwanenzang',
  'pos': 'noun',
  'glosses': ["swan song (last major work, accomplishment or effort before one's demise or retirement)"],
  'categories': ['Dutch compound terms',
   'Dutch entries with incorrect language header',
   'Dutch lemmas',
   'Dutch masculine nouns',
   'Dutch nouns',
   'Dutch nouns with plural in -en',
   'Dutch terms interfixed with -en-',
   'Pages with 1 entry',
   'Pages with entries'],
  'links': [['swan song', 'swan song']],
  'tags': ['masculine']},
 {'entry_id': 'ENF_127053',
  'sid': 1,
  'word': 'zwangere',
  'pos': 'adj',
  'glosses': ['inflection of zwanger:',
   'indefinite masculine and feminine singular'],
  'form_of': [{'word': 'zwanger'}],
  'links': [['zwanger', 'zwanger#Dutch']],
  'tags': ['feminine', 'form-of', 'indefinite', 'masculine', 'singular']},
 {'entry_id': 'ENF_127053',
  'sid': 2,
  'word': 'zwangere',
  'pos': 'adj',
  'glosses': ['inflection of zwanger:', 'indefinite plural'],


[{'entry_id': 'NEF_44989',
  'sid': 2,
  'word': 'ydele',
  'pos': 'adj',
  'glosses': ['inflection of ydel:', 'strong/weak plural'],
  'form_of': [{'word': 'ydel'}],
  'links': [['ydel', 'ydel#Middle_English']],
  'tags': ['form-of']},
 {'entry_id': 'NEF_44991',
  'sid': 1,
  'word': 'ydell',
  'pos': 'adj',
  'glosses': ['alternative form of ydel (“empty”)'],
  'categories': ['Middle English alternative forms',
   'Middle English entries with incorrect language header',
   'Pages with 1 entry',
   'Pages with entries'],
  'alt_of': [{'word': 'ydel', 'extra': 'empty'}],
  'links': [['ydel', 'ydel#Middle_English:_empty']],
  'tags': ['alt-of', 'alternative']},
 {'entry_id': 'NEF_44994',
  'sid': 1,
  'word': 'ydelly',
  'pos': 'adv',
  'glosses': ['uselessly, pointlessly'],
  'links': [['uselessly', 'uselessly'], ['pointlessly', 'pointlessly']]},
 {'entry_id': 'NEF_44994',
  'sid': 2,
  'word': 'ydelly',
  'pos': 'adv',
  'glosses': ['idly, inactively'],
  'links': [['idly', 'idly'], [

[{'entry_id': 'NNF_544703',
  'sid': 7,
  'word': 'voet',
  'pos': 'noun',
  'glosses': ['basis, onderstuk, voetstuk'],
  'categories': ['Gereedschap_in_het_Nederlands',
   'Techniek_in_het_Nederlands'],
  'topics': ['technology', 'tools']},
 {'entry_id': 'NNF_544704',
  'sid': 2,
  'word': 'voetafdruk',
  'pos': 'noun',
  'glosses': ['voetafdruk: de hoeveelheid land- en wateroppervlak die een bepaalde activiteit gebruikt'],
  'raw_tags': ['ecologische']},
 {'entry_id': 'NNF_544716',
  'sid': 1,
  'word': 'voetbal',
  'pos': 'noun',
  'glosses': ['een balsport waarbij twee teams van 11 spelers met hun voeten (of hoofd) een bal in het doel van de tegenstander proberen te krijgen'],
  'categories': ['Sport_in_het_Nederlands'],
  'tags': ['neuter'],
  'topics': ['sports']},
 {'entry_id': 'NNF_544767',
  'sid': 1,
  'word': 'voetbalkaart',
  'pos': 'noun',
  'glosses': ['een kaart uit een serie, met afbeeldingen van voetballers, die wordt verzameld en geruild met andere spelers']},
 {'entr

[{'entry_id': 'NEF_1',
  'sid': 1,
  'word': 'A double flat',
  'pos': 'noun',
  'glosses': ['de toon “ases”, een verlaagde “as”'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_2',
  'sid': 1,
  'word': 'A double flat major',
  'pos': 'noun',
  'glosses': ['Ases-majeur, een theoretische toonladder (11 mollen) die overeenkomt met G-majeur (1 kruis)'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_3',
  'sid': 1,
  'word': 'A double sharp',
  'pos': 'noun',
  'glosses': ['de toon “aïsis”, een verhoogde “aïs”'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_4',
  'sid': 1,
  'word': 'A flat',
  'pos': 'noun',
  'glosses': ['de toon “as”, een verlaagde “a”'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_5',
  'sid': 1,
  'word': 'A flat major scale',
  'pos': 'noun',
  'glosses': [': As-majeurtoonschaal'],
  'categories': ['Muziek_in_

[{'entry_id': 'NEF_0',
  'sid': 1,
  'word': 'Aadler',
  'pos': 'noun',
  'glosses': ['arend, adelaar'],
  'categories': ['Havikachtigen_in_het_Nedersaksisch',
   'Vogels in het Nedersaksisch'],
  'raw_tags': ['havikachtigen']},
 {'entry_id': 'NEF_4',
  'sid': 1,
  'word': 'Achterhooks',
  'pos': 'noun',
  'glosses': ['Achterhoeks; een Nedersaksische (streek)taal die gesproken wordt in de Achterhoek'],
  'categories': ['Taal_in_het_Achterhoeks'],
  'tags': ['linguistics']},
 {'entry_id': 'NEF_5',
  'sid': 1,
  'word': 'Achterhooks',
  'pos': 'noun',
  'glosses': ['Achterhoeks; een Nedersaksische (streek)taal die gesproken wordt in de Achterhoek'],
  'categories': ['Taal_in_het_Nedersaksisch'],
  'tags': ['linguistics']},
 {'entry_id': 'NEF_8',
  'sid': 1,
  'word': 'Ackersnacker',
  'pos': 'noun',
  'glosses': ['mobieltje; een gsm, een draagbare telefoon']},
 {'entry_id': 'NEF_9',
  'sid': 1,
  'word': 'Actinium',
  'pos': 'noun',
  'glosses': ['actinium; een scheikundig element met at

## Get Duplicate Glosses

In [None]:
def get_duplicates(file, save_path):
    duplicates_folder = Path(save_path, 'duplicates')
    duplicates_folder.mkdir(parents=True, exist_ok=True)
    out_file = Path(duplicates_folder, f'{file.stem}_duplicates.jsonl')
    duplicate_eids = {}
    words = {}
    glosses = {}
    batch = []
    mode = 'w+'
    batch_size = 1000
    with open(file, 'r', encoding='utf-8') as f:
        for i, line in tqdm(enumerate(f), desc= 'finding duplicates'):
            if line:
                obj = json.loads(line)
                if obj:
                    eid, word, pos = get_eid_word_pos(obj)
                    gloss = obj.get('glosses')
                    sid = obj.get('sid')
                    if not (word, pos) in words:
                        words[(word, pos)] = [eid]
                    else:
                        if eid not in words[(word, pos)]:
                            words[(word, pos)].append(eid)
                    if gloss:
                        new_gloss = ';; '.join(gloss)
                        if not new_gloss in glosses:
                            glosses[new_gloss] = [(eid, sid)]
                        else:
                            glosses[new_gloss].append((eid, sid))

        for key, val in words.items():
            if len(val) > 1:
                for v in val:
                    if not v in duplicate_eids:
                        duplicate_eids[v] = ['word']
                    else:
                        duplicate_eids[v].append('word')

        for key, val in glosses.items():
            if len(val) > 1:
                for v in val:
                    if not v[0] in duplicate_eids:
                        duplicate_eids[v[0]] = [(v[1], 'gloss', val)]
                    else:
                        duplicate_eids[v[0]].append((v[1], 'gloss', val))

    with open(file, 'r', encoding='utf-8') as f:
        for i, line in tqdm(enumerate(f), desc= 'saving duplicates'):
            if line:
                eid_index = line.find('"entry_id": "')
                if eid_index != -1:
                    end_eid = line.find(',', eid_index)
                    eid = line[eid_index + len('"entry_id": "'):end_eid - 1].strip()
                    if eid in duplicate_eids:

                        obj = json.loads(line)
                        if obj:
                            obj['dup_type'] = duplicate_eids[eid]
                            batch.append(obj)
                            if len(batch) > batch_size:
                                save_batch_to_file(batch, out_file, mode)  
                                if mode == 'w+':
                                    mode = 'a'
                                batch = []
        if batch:
            save_batch_to_file(batch, out_file, mode)

    return duplicate_eids, out_file

In [61]:
duplicate_eids, out_file = get_duplicates(enf_senses_file, Path(CUR_ENF_FOLDER))

finding duplicates: 168718it [00:01, 149207.34it/s]
saving duplicates: 168718it [00:01, 151497.90it/s]


In [62]:
duplicate_eids, eef_out_file = get_duplicates(eef_senses_file, Path(CUR_EEF_FOLDER))

finding duplicates: 1438357it [00:12, 111749.91it/s]
saving duplicates: 1438357it [00:13, 109778.10it/s]


In [65]:
duplicate_eids, nef_out_file = get_duplicates(nef_senses_file, Path(CUR_NEF_FOLDER))

finding duplicates: 24284it [00:00, 101651.78it/s]
saving duplicates: 24284it [00:00, 143860.02it/s]


In [63]:
duplicate_eids, nnf_out_file = get_duplicates(nnf_senses_file, Path(CUR_NNF_FOLDER))

finding duplicates: 695041it [00:04, 152190.09it/s]
saving duplicates: 695041it [00:05, 131607.10it/s]


In [67]:
import shutil


shutil.copy(eef_out_file, Path(EEF_FOLDER, 'EEF_duplicates.jsonl'))
shutil.copy(out_file, Path(ENF_FOLDER, 'ENF_duplicates.jsonl'))
shutil.copy(nef_out_file, Path(NEF_FOLDER, 'NEF_duplicates.jsonl'))
shutil.copy(nnf_out_file, Path(NNF_FOLDER, 'NNF_duplicates.jsonl'))
print('saved top level')

saved top level


## Splitting Short Defs

In [None]:
def split_semi_col_gloss(gloss: str):
    short_glosses = []
    long_glosses = []
    long_glosses_dict = {}
    split_gloss = gloss.split(';')
    further_processing = False
    if isinstance(split_gloss, list):
        if len(split_gloss) == 2:
            split_gloss_0 = split_gloss[0].strip()
            split_gloss_1 = split_gloss[1].strip()
            if has_seperator_symbol(split_gloss_0):
                further_processing = True
            if has_seperator_symbol(split_gloss_1):
                further_processing = True

            if len(split_gloss_0) > len(split_gloss_1):
                short_glosses.append(split_gloss_1)
                long_glosses_dict[split_gloss_1] = split_gloss_0
            else:
                short_glosses.append(split_gloss_0)
                long_glosses_dict[split_gloss_0] = split_gloss_1
        else:
            further_processing = True
    return short_glosses, long_glosses_dict, further_processing
    

In [None]:
def split_sense_glosses(glosses):
    further_processing = False
    is_inflection = False
    short_glosses = []
    long_glosses_dict = {}
   
    if glosses:
        if 'inflection of' in glosses[0]:
            is_inflection = True
            further_processing = True
        elif 'form of' in glosses[0]:
            is_inflection = True
            further_processing = True
    
        elif len(glosses) > 1:
            further_processing = True
            return short_glosses, long_glosses_dict, further_processing, is_inflection
        else:
            if 'inflection of' in glosses[0]:
                is_inflection = True
            if 'form of' in glosses[0]:
                is_inflection = True
            if ';' in glosses[0]:
                short_glosses, long_glosses_dict, further_processing = split_semi_col_gloss(glosses[0])

            else:
                further_processing = True
    return short_glosses, long_glosses_dict, further_processing, is_inflection

In [43]:
def make_sort_def_file(file):
    out_file = make_file_path_with_suffix(file, 'simple_semicol_defs')
    further_processing_file = make_file_path_with_suffix(file, 'futher_processing')
    inflection_file = make_file_path_with_suffix(file, 'form_of')
    inflection_mode = 'w+'
    further_mode = 'w+'
    mode = 'w+'
    batch_size = 10000
    batch = []
    inflection_batch = []
    further_processing_batch = []
    with open(file, 'r', encoding='utf-8') as f:
        for i, line in tqdm(enumerate(f), desc= 'splitting glosses'):
            if line:
                obj = json.loads(line)
                if obj:
                    glosses = obj.get('glosses')
                    short_glosses, long_glosses_dict, further_processing, is_inflection = split_sense_glosses(glosses)
                    if short_glosses:
                        obj['short_glosses'] = short_glosses
                    if long_glosses_dict:
                        obj['long_glosses_dict'] = long_glosses_dict

                    if is_inflection:
                        inflection_batch.append(obj)
                        if len(inflection_batch) > batch_size:
                            save_batch_to_file(inflection_batch, inflection_file, inflection_mode)
                            if inflection_mode == 'w+':
                                inflection_mode='a'
                            inflection_batch = []
                        
                    elif further_processing:
                        further_processing_batch.append(obj)
                        if len(further_processing_batch) > batch_size:
                            save_batch_to_file(further_processing_batch, further_processing_file, further_mode)
                            if further_mode == 'w+':
                                further_mode='a'
                            further_processing_batch = []
                        
                    else:
                        batch.append(obj)
                        if len(batch) > batch_size:
                            save_batch_to_file(batch, out_file, mode)
                            if mode == 'w+':
                                mode='a'
                            batch = []
        if inflection_batch:
            save_batch_to_file(inflection_batch, inflection_file, inflection_mode)    
        if further_processing_batch:
            save_batch_to_file(further_processing_batch, further_processing_file, further_mode)
        if batch:
            save_batch_to_file(batch, out_file, mode)
                
    return out_file, further_processing_file, inflection_file

In [46]:
out_files, further_processing_files, inflection_files = [], [], []
for file in seperator_files:
    o, f, i = make_sort_def_file(file)
    out_files.append(o)
    further_processing_files.append(f)
    inflection_files.append(i)

splitting glosses: 359702it [00:17, 20997.41it/s]
splitting glosses: 80540it [00:02, 29724.27it/s]
splitting glosses: 2723it [00:00, 163345.11it/s]
splitting glosses: 99047it [00:02, 45621.78it/s]
splitting glosses: 7599it [00:00, 90195.11it/s]
splitting glosses: 2723it [00:00, 169096.11it/s]


## Finding Definitions that are the same as the English/Dutch Word

In [20]:
nef_same_defs = []
short_glosses = []
for obj in batch:
    word = obj.get('word')
    glosses = obj.get('glosses')
    for gloss in glosses:
        if gloss == word:
            nef_same_defs.append(obj)
        gloss = gloss.strip()
        words_in_gloss = gloss.count(' ') + 1
        if words_in_gloss <= 3:
            short_glosses.append(obj)

In [23]:
print(len(nef_same_defs))
nef_same_defs

846


[{'entry_id': 'NEF_21',
  'sid': 0,
  'word': 'ADHD',
  'pos': 'noun',
  'glosses': ['ADHD'],
  'categories': ['Afkorting_in_het_Engels',
   'Initiaalwoord_in_het_Engels',
   'Psychologie_in_het_Engels'],
  'tags': ['abbreviation', 'acronym'],
  'topics': ['psychology']},
 {'entry_id': 'NEF_38',
  'sid': 0,
  'word': 'Aboriginal',
  'pos': 'noun',
  'glosses': ['Aboriginal']},
 {'entry_id': 'NEF_41',
  'sid': 0,
  'word': 'Achilles',
  'pos': 'noun',
  'glosses': ['Achilles'],
  'categories': ['Mythologie_in_het_Engels'],
  'topics': ['mythology']},
 {'entry_id': 'NEF_46',
  'sid': 0,
  'word': 'Afrikaans',
  'pos': 'noun',
  'glosses': ['Afrikaans'],
  'categories': ['Taal_in_het_Engels'],
  'tags': ['linguistics']},
 {'entry_id': 'NEF_55',
  'sid': 0,
  'word': 'American football',
  'pos': 'noun',
  'glosses': ['American football'],
  'categories': ['Sport_in_het_Engels'],
  'topics': ['sports']},
 {'entry_id': 'NEF_59',
  'sid': 0,
  'word': 'Ara',
  'pos': 'noun',
  'glosses': ['A

In [25]:
print(len(short_glosses))
short_glosses

15906


[{'entry_id': 'NEF_0',
  'sid': 0,
  'word': 'A',
  'pos': 'noun',
  'glosses': ['de toon “a”'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_5',
  'sid': 0,
  'word': 'A flat major scale',
  'pos': 'noun',
  'glosses': [': As-majeurtoonschaal'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_6',
  'sid': 0,
  'word': 'A flat minor chord',
  'pos': 'noun',
  'glosses': ['as-mineurakkoord'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_7',
  'sid': 0,
  'word': 'A major scale',
  'pos': 'noun',
  'glosses': [': A-grotetertstoonschaal'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_8',
  'sid': 0,
  'word': 'A minor chord',
  'pos': 'noun',
  'glosses': ['a-mineurakkoord'],
  'categories': ['Muziek_in_het_Engels'],
  'topics': ['music']},
 {'entry_id': 'NEF_9',
  'sid': 0,
  'word': 'A minor scale',
  'pos': 'noun',
  'glosses': ['a