# Affix Extraction and Translation

In [1]:
import json
from dutchanalyzer.config import *
from dutchanalyzer.utilities.json_utils import *
from dutchanalyzer.utilities.utils import *

In [7]:
import pandas as pd

In [None]:
# Date stamp (dd-mm-yy) used as the name of this run's output folder.
today = datetime.date.today().strftime("%d-%m-%y")
current_save_folder = Path(WIKT_CLEANING_DIR, today)

# Top-level folder name -> sub-folders created beneath it.
folders = {'en': ['EEF', 'ENF'], 'nl': ['NEF', 'NNF']}

# Create <save folder>/<key>/<sub-folder> for every pair; existing folders are left alone.
for lang_folder, dataset_folders in folders.items():
    for dataset_folder in dataset_folders:
        Path(current_save_folder, lang_folder, dataset_folder).mkdir(parents=True, exist_ok=True)

## Utilities

In [3]:
def count_pos(file):
    """Count how often each ``"pos"`` value occurs in a line-oriented text file.

    Lines are not parsed as JSON. The value is taken by plain string search:
    everything between the first occurrence of ``"pos": "`` on a line and the
    next double quote (or the end of the line when no closing quote follows),
    with surrounding whitespace stripped. Lines without the marker are ignored.

    Args:
        file: Path of the UTF-8 encoded file to scan.

    Returns:
        A list of ``(pos, count)`` tuples sorted by count and then by the POS
        string, both descending.
    """
    from collections import Counter

    marker = '"pos": "'
    pos_count = Counter()

    with open(file, 'r', encoding='utf-8') as f:
        for line in tqdm(f, total=count_lines_with_progress(file)):
            pos_loc = line.find(marker)
            # str.find signals "not found" with -1; index 0 is a valid match.
            if pos_loc == -1:
                continue
            # partition keeps the whole remainder when the closing quote is
            # missing, instead of silently dropping the last character.
            value, _, _ = line[pos_loc + len(marker):].partition('"')
            pos_count[value.strip()] += 1

    return sorted(pos_count.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)

In [4]:
def extract_affixes(file, out_file='', break_point=-1, out_mode='w+'):
    """Return the parsed records of a JSON Lines file whose POS is an affix type.

    A line is selected by string search, not by parsing: the text between the
    first ``"pos": "`` marker (which must start after the first character of
    the line) and the next double quote is stripped and compared against
    suffix / interfix / prefix / circumfix / infix. Only selected lines are
    passed to ``json.loads``.

    Args:
        file: Path of the UTF-8 encoded input file, one JSON object per line.
        out_file: When truthy, the selected records are also written to this
            path, one JSON object per line, without ASCII escaping.
        break_point: Zero-based index of the last line to read; ``-1`` reads
            the whole file.
        out_mode: Mode string passed to ``open`` for ``out_file``.

    Returns:
        The list of selected records, in file order.
    """
    affix_tags = ('suffix', 'interfix', 'prefix', 'circumfix', 'infix')
    marker = '"pos": "'
    selected = []

    with open(file, 'r', encoding='utf-8') as source:
        numbered_lines = tqdm(enumerate(source), total=count_lines_with_progress(file))
        for line_no, raw in numbered_lines:
            if break_point != -1 and line_no > break_point:
                break
            marker_at = raw.find(marker)
            if not marker_at > 0:
                continue
            tail = raw[marker_at + len(marker):]
            tag = tail[:tail.find('"')].strip()
            if tag in affix_tags:
                selected.append(json.loads(raw))

    if out_file:
        with open(out_file, out_mode, encoding='utf-8') as sink:
            for record in selected:
                sink.write(json.dumps(record, ensure_ascii=False) + '\n')

    return selected

## Make Affix Files

In [5]:
# Per-language output folders inside the current save folder.
nl_save_folder = Path(current_save_folder, 'nl')
en_save_folder = Path(current_save_folder, 'en')

# Extract the affix records of each source file and write them next to each other.
nef_affixes = extract_affixes(NEF_FILE, nl_save_folder / 'NEF' / 'NEF_affixes.jsonl')
nnf_affixes = extract_affixes(NNF_FILE, nl_save_folder / 'NNF' / 'NNF_affixes.jsonl')
eef_affixes = extract_affixes(EEF_FILE, en_save_folder / 'EEF' / 'EEF_affixes.jsonl')
enf_affixes = extract_affixes(ENF_FILE, en_save_folder / 'ENF' / 'ENF_affixes.jsonl')

Counting Lines: 100%|██████████| 7.96M/7.96M [00:00<00:00, 1.44GB/s]
100%|██████████| 16325/16325 [00:00<00:00, 544266.67it/s]
Counting Lines: 100%|██████████| 639M/639M [00:00<00:00, 1.47GB/s]
100%|██████████| 599938/599938 [00:01<00:00, 362716.11it/s]
Counting Lines: 100%|██████████| 1.25G/1.25G [00:00<00:00, 1.48GB/s]
100%|██████████| 1234864/1234864 [00:03<00:00, 354888.82it/s]
Counting Lines: 100%|██████████| 169M/169M [00:00<00:00, 1.48GB/s]
100%|██████████| 128009/128009 [00:00<00:00, 258955.96it/s]


In [6]:
# One flat list of every extracted affix record, in NEF, NNF, EEF, ENF order.
all_affixes = [*nef_affixes, *nnf_affixes, *eef_affixes, *enf_affixes]

In [8]:
# One row per affix record; keys missing from a record become NaN in that row.
df = pd.DataFrame(all_affixes)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only adds a stray "None" to the cell output.
df.info(verbose=True)
display(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4811 entries, 0 to 4810
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   word                  4811 non-null   object 
 1   pos                   4811 non-null   object 
 2   lang_code             4811 non-null   object 
 3   lang                  4811 non-null   object 
 4   standard_lang         4811 non-null   object 
 5   categories            4811 non-null   object 
 6   senses                4811 non-null   object 
 7   pos_title             520 non-null    object 
 8   sounds                1968 non-null   object 
 9   tags                  520 non-null    object 
 10  wl_code               4811 non-null   object 
 11  entry_id              4811 non-null   object 
 12  derived               938 non-null    object 
 13  etymology_texts       358 non-null    object 
 14  hypernyms             12 non-null     object 
 15  related              

None

Unnamed: 0,word,pos,lang_code,lang,standard_lang,categories,senses,pos_title,sounds,tags,...,forms,notes,anagrams,etymology_templates,etymology_number,etymology_text,head_templates,coordinate_terms,descendants,inflection_templates
0,-s-,interfix,en,engels,english,"[Invoegsel in het Engels, Woorden in het Engels]",[{'glosses': ['-s-']}],Invoegsel,[{'ipa': '/ ɛs /'}],[morpheme],...,,,,,,,,,,
1,ac-,prefix,en,engels,english,"[Voorvoegsel in het Engels, Woorden in het Eng...","[{'glosses': ['ac-'], 'categories': ['Elektrot...",Voorvoegsel,,[morpheme],...,,,,,,,,,,
2,ex-,prefix,en,engels,english,"[Voorvoegsel in het Engels, Woorden in het Eng...",[{'glosses': ['uit-']}],Voorvoegsel,,[morpheme],...,,,,,,,,,,
3,ef-,prefix,en,engels,english,"[Voorvoegsel in het Engels, Woorden in het Eng...",[{'glosses': ['uit-']}],Voorvoegsel,,[morpheme],...,,,,,,,,,,
4,arch-,prefix,en,engels,english,"[Voorvoegsel in het Engels, Woorden in het Eng...",[{'glosses': ['aarts-']}],Voorvoegsel,,[morpheme],...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4806,stik-,prefix,nl,dutch,dutch,[],[{'glosses': ['Informal intensifying prefix.']...,,,,...,,,,,,From stikken (“to choke”).,"[{'name': 'head', 'args': {'1': 'nl', '2': 'pr...",,,
4807,-atief,suffix,nl,dutch,dutch,[],"[{'glosses': ['-ative (adjectival suffix)'], '...",,,,...,"[{'form': 'no-table-tags', 'source': 'declensi...",,,"[{'name': 'bor', 'args': {'1': 'nl', '2': 'fr'...",,As a suffix formed by analogy with other Dutch...,"[{'name': 'head', 'args': {'1': 'nl', '2': 'su...",,,"[{'name': 'nl-decl-adj', 'args': {'1': '-atiev..."
4808,-iaan,suffix,nl,dutch,dutch,[],"[{'glosses': ['-ian (noun-forming suffix)'], '...",,[{}],,...,,,,"[{'name': 'bor+', 'args': {'1': 'nl', '2': 'la...",,Borrowed from Latin -iānus.,"[{'name': 'head', 'args': {'1': 'nl', '2': 'su...",,,
4809,-orisch,suffix,nl,dutch,dutch,[],[{'glosses': ['-ory; suffix attached to the su...,,"[{'ipa': '/oː.ris/'}, {'rhymes': '-oːris'}]",,...,"[{'form': 'no-table-tags', 'source': 'declensi...",,,"[{'name': 'clq', 'args': {'1': 'nl', '2': 'la'...",,Calque of Latin -ōrius (“-ory”) (itself from -...,"[{'name': 'head', 'args': {'1': 'nl', '2': 'su...",,,"[{'name': 'nl-decl-adj', 'args': {'1': '', '2'..."


In [14]:
from dutchanalyzer.utilities.pandas_utils import return_non_na
def return_non_na_drop_cols(df, col):
    """Keep the rows that have a value in ``col``, then drop all-empty columns.

    Args:
        df: DataFrame to filter; it is not modified.
        col: Name of the column that must be non-NA for a row to be kept.

    Returns:
        A new DataFrame holding only the rows where ``col`` is not NA, without
        the columns that are NA in every remaining row.
    """
    has_value = df[col].notna()
    return df.loc[has_value].dropna(axis='columns', how='all')

# Rows that have hyponyms, reduced to the identifying columns plus the hyponyms themselves.
hyp_df = (
    return_non_na_drop_cols(df, 'hyponyms')
    .loc[:, ['entry_id', 'word', 'pos', 'senses', 'hyponyms']]
)

In [16]:
# set_option changes the pandas display option globally, so it also affects
# every DataFrame rendered by later cells in this kernel, not only this one.
# NOTE(review): the saved output of this cell shows cell text far longer than
# 3 characters — confirm that 3 is the intended column width.
pd.set_option('display.max_colwidth', 3) 
# Bare last expression: the frame is rendered as the cell's output.
hyp_df

Unnamed: 0,entry_id,word,pos,senses,hyponyms
15,NNF_12399,ge-,prefix,"[{'glosses': ['ge- + de stam van een werkwoord met de uitgang ""-d"", ""-t"" of ""-en"" vormt het voltooid deelwoord:']}, {'glosses': ['ge- + werkwoord vormt een onscheidbaar werkwoord met oorspronkelijk een betekenis als ""samen"", ""mee-"" of ""helemaal""']}, {'glosses': ['ge + stam van werkwoord vormt onzijdig zelfstandig naamwoord van handeling, dat vaak ook het voortduren daarvan uitdrukt']}, {'glosses': ['ge- + naamwoord + -te geeft een verzameling (collectief) aan. Zie omvoegsel ge- -te:']}]","[{'word': 'geaai'}, {'word': 'geaarzel'}, {'word': 'geadem'}, {'word': 'gebal'}, {'word': 'gebarst'}, {'word': 'gebedank'}, {'word': 'gebedelf'}, {'word': 'gebedplas'}, {'word': 'gebedrink'}, {'word': 'gebedroef'}, {'word': 'gebedruk'}, {'word': 'gebedwater'}, {'word': 'gebeier'}, {'word': 'gebeuk'}, {'word': 'gebeuzel'}, {'word': 'gebibber'}, {'word': 'gebiets'}, {'word': 'geblaaskaak'}, {'word': 'geblader'}, {'word': 'geblèr'}, {'word': 'gebliksem'}, {'word': 'geborrel'}, {'word': 'geborstel'}, {'word': 'gebral'}, {'word': 'gebries'}, {'word': 'gebruis'}, {'word': 'gebuitel'}, {'word': 'gebulder'}, {'word': 'gebulk'}, {'word': 'gebuur'}, {'word': 'gechicaneer'}, {'word': 'gecijfer'}, {'word': 'gedans'}, {'word': 'gedartel'}, {'word': 'gedaver'}, {'word': 'gedein'}, {'word': 'gedelibereer'}, {'word': 'gedender'}, {'word': 'gediscussieer'}, {'word': 'gedol'}, {'word': 'gedommel'}, {'word': 'gedonderjaag'}, {'word': 'gedondersteen'}, {'word': 'gedonderstraal'}, {'word': 'gedraaf'}, {'word': 'gedraal'}, {'word': 'gedram'}, {'word': 'gedreig'}, {'word': 'gedrein'}, {'word': 'gedrens'}, {'word': 'gedrentel'}, {'word': 'gedreutel'}, {'word': 'gedribbel'}, {'word': 'gedril'}, {'word': 'gedrink'}, {'word': 'gedroom'}, {'word': 'gedruis'}, {'word': 'gedrum'}, {'word': 'gedrup'}, {'word': 'gedruppel'}, {'word': 'geduikel'}, {'word': 'geduivel'}, {'word': 'geduvel'}, {'word': 'gedwarrel'}, {'word': 'gedweep'}, {'word': 'gefantaseer'}, {'word': 
'gefemel'}, {'word': 'gefiedel'}, {'word': 'gefilosofeer'}, {'word': 'geflakker'}, {'word': 'gefleem'}, {'word': 'geflikflooi'}, {'word': 'geflikker'}, {'word': 'geflits'}, {'word': 'geflonker'}, {'word': 'gefoezel'}, {'word': 'gefonkel'}, {'word': 'gefrazel'}, {'word': 'gefrons'}, {'word': 'gegaap'}, {'word': 'gegak'}, {'word': 'gegalm'}, {'word': 'gegap'}, {'word': 'gegiebel'}, {'word': 'gegier'}, {'word': 'geginnegap'}, {'word': 'gegis'}, {'word': 'geglimlach'}, {'word': 'geglinster'}, {'word': 'gegloei'}, {'word': 'gegluur'}, {'word': 'gegniffel'}, {'word': 'gegons'}, {'word': 'gegoochel'}, {'word': 'gegooi'}, {'word': 'gegorgel'}, {'word': 'gegraaf'}, {'word': 'gegrien'}, {'word': 'gegrijns'}, {'word': 'gegrol'}, ...]"
21,NNF_14313,-st,suffix,"[{'glosses': ['vormt een zelfstandig naamwoord van handeling van een werkwoord']}, {'glosses': ['vormt de overtreffende trap van bijvoeglijke naamwoorden']}]","[{'word': 'doorkomst'}, {'word': 'overkomst'}, {'word': 'tafeldienst'}, {'word': 'tegenkomst'}, {'word': 'terugontvangst'}, {'word': 'wederkomst'}]"
33,NNF_17113,-schap,suffix,"[{'glosses': [': maakt van een bijvoeglijk naamwoord een zelfstandig naamwoord dat een toestand aanduidt'], 'tags': ['feminine']}, {'glosses': [': omschrijft een geheel of een instelling dat iets omvat, vaak op basis van een zelfstandig naamwoord'], 'tags': ['neuter']}]","[{'word': 'aalmoezenierschap'}, {'word': 'aankomelingschap'}, {'word': 'afgezantschap'}, {'word': 'afkomelingschap'}, {'word': 'ambachtschap'}, {'word': 'animateurschap'}, {'word': 'beheerschap'}, {'word': 'bestuurschap'}, {'word': 'bevelvoerderschap'}, {'word': 'bezoekmoederschap'}, {'word': 'bijzitterschap'}, {'word': 'boerschap'}, {'word': 'Bondspresidentschap'}, {'word': 'bosschap'}, {'word': 'bottelierschap'}, {'word': 'BV-schap'}, {'word': 'christenschap'}, {'word': 'cliëntschap'}, {'word': 'coachschap'}, {'word': 'compagnieschap'}, {'word': 'compagnonschap'}, {'word': 'dealerschap'}, {'word': 'diakenschap'}, {'word': 'dogeschap'}, {'word': 'drossaardschap'}, {'word': 'drostschap'}, {'word': 'entrepreneurschap'}, {'word': 'eredivisieschap'}, {'word': 'filmerschap'}, {'word': 'filmsterschap'}, {'word': 'gardiaanschap'}, {'word': 'gespanschap'}, {'word': 'gevaderschap'}, {'word': 'honderdschap'}, {'word': 'houtvesterschap'}, {'word': 'huismanschap'}, {'word': 'hulpverlenerschap'}, {'word': 'importeurschap'}, {'word': 'ingenieurschap'}, {'word': 'inspecteurschap'}, {'word': 'jachtschap'}, {'word': 'jongelingschap'}, {'word': 'jufferschap'}, {'word': 'kardinaalschap'}, {'word': 'kennerschap'}, {'word': 'klerkschap'}, {'word': 'knapenschap'}, {'word': 'komenschap'}, {'word': 'kopmanschap'}, {'word': 'korporaalschap'}, {'word': 'kosterschap'}, {'word': 'kunstenaarsschap'}, {'word': 'leenmanschap'}, {'word': 'lordschap'}, {'word': 'luchtschap'}, {'word': 'luitenantschap'}, {'word': 'maalschap'}, {'word': 'maarschalkschap'}, {'word': 'magistraatschap'}, {'word': 'majoorschap'}, {'word': 'makerschap'}, {'word': 'mandarijnschap'}, {'word': 'matischap'}, {'word': 
'medehuurderschap'}, {'word': 'messiasschap'}, {'word': 'mevrouwschap'}, {'word': 'momberschap'}, {'word': 'momboorschap'}, {'word': 'monnikschap'}, {'word': 'nazireeërschap'}, {'word': 'neefschap'}, {'word': 'oelèëbalangschap'}, {'word': 'officierschap'}, {'word': 'onderaannemerschap'}, {'word': 'onderdaanschap'}, {'word': 'onderduikschap'}, {'word': 'onderkoningschap'}, {'word': 'ondernemingsschap'}, {'word': 'opaschap'}, {'word': 'opdrachtgeverschap'}, {'word': 'opperbevelhebberschap'}, {'word': 'opzienerschap'}, {'word': 'ornamentschap'}, {'word': 'ouderlingschap'}, {'word': 'pairschap'}, {'word': 'pandelingschap'}, {'word': 'pandschap'}, {'word': 'pariaschap'}, {'word': 'pastoorschap'}, {'word': 'patronaatschap'}, {'word': 'pelgrimschap'}, {'word': 'pionierschap'}, {'word': 'plaatsvervangerschap'}, {'word': 'plassenschap'}, {'word': 'pleegkindschap'}, {'word': 'pleegouderschap'}, {'word': 'portierschap'}, {'word': 'prelaatschap'}, {'word': 'prepromoschap'}, {'word': 'pretorschap'}, ...]"
34,NNF_17325,-aat,suffix,"[{'glosses': ['vormt een naamwoord van handeling van zekere werkwoorden van Latijnse of Romaanse afkomst (meestal eindigend op -eren)']}, {'glosses': ['waarmee van een persoonsnaam een woord wordt gevormd dat een beroep, positie, functie, of een daarbij behorende waardigheid, titel, ambtstermijn of een ambtsgebied aanduidt']}, {'glosses': ['maakt van een gebiedsnaam een woord dat een inwoner van het door het grondwoord genoemde gebied aangeeft']}, {'glosses': ['geeft een oxidisch complex ion aan van een hoofdgroepelement in zijn hoogste oxidatietoestand'], 'categories': ['Scheikunde_in_het_Nederlands'], 'topics': ['chemistry']}]","[{'word': 'ablutievaat'}, {'word': 'actuariaat'}, {'word': 'adiabaat'}, {'word': 'adressaat'}, {'word': 'adsorbaat'}, {'word': 'agglomeraat'}, {'word': 'agnaat'}, {'word': 'albuminaat'}, {'word': 'alcoholaat'}, {'word': 'aleuronaat'}, {'word': 'alkanoaat'}, {'word': 'alkanolaat'}, {'word': 'allabrevemaat'}, {'word': 'aluminaat'}, {'word': 'alumnaat'}, {'word': 'ampullaat'}, {'word': 'anastigmaat'}, {'word': 'animaat'}, {'word': 'anonimaat'}, {'word': 'antranilaat'}, {'word': 'aplanaat'}, {'word': 'arachidaat'}, {'word': 'archivariaat'}, {'word': 'arelaat'}, {'word': 'artisanaat'}, {'word': 'assignaat'}, {'word': 'attentaat'}, {'word': 'auditoraat'}, {'word': 'bakraat'}, {'word': 'boraat'}, {'word': 'botonaat'}, {'word': 'bracteaat'}, {'word': 'bromaat'}, {'word': 'cedraat'}, {'word': 'cellysaat'}, {'word': 'centrifugaat'}, {'word': 'centumviraat'}, {'word': 'cerficaat'}, {'word': 'chloraat'}, {'word': 'citraat'}, {'word': 'coacervaat'}, {'word': 'comitaat'}, {'word': 'concordaat'}, {'word': 'condensaat'}, {'word': 'conglomeraat'}, {'word': 'conglutinaat'}, {'word': 'correlaat'}, {'word': 'crusaat'}, {'word': 'cyclamaat'}, {'word': 'decemviraat'}, {'word': 'delegaat'}, {'word': 'dominaat'}, {'word': 'duümviraat'}, {'word': 'emeritaat'}, {'word': 'exarchaat'}, {'word': 'exsiccaat'}, {'word': 'externaat'}, {'word': 
'exudaat'}, {'word': 'falsificaat'}, {'word': 'filtraat'}, {'word': 'flagellaat'}, {'word': 'garnaat'}, {'word': 'generalaat'}, {'word': 'glucosinolaat'}, {'word': 'glutamaat'}, {'word': 'graduaat'}, {'word': 'granulaat'}, {'word': 'hanzeaat'}, {'word': 'ijzerniobaat'}, {'word': 'immediaat'}, {'word': 'inseraat'}, {'word': 'intricaat'}, {'word': 'iteraat'}, {'word': 'juvenaat'}, {'word': 'kalefaat'}, {'word': 'kaliumbromaat'}, {'word': 'kanonikaat'}, {'word': 'karitaat'}, {'word': 'kosteraat'}, {'word': 'kwadernaat'}, {'word': 'labiaat'}, {'word': 'laminaat'}, {'word': 'latifundiaat'}, {'word': 'lauraat'}, {'word': 'laureaat'}, {'word': 'lauwdaat'}, {'word': 'lemniscaat'}, {'word': 'licenciaat'}, {'word': 'manganaat'}, {'word': 'markizaat'}, {'word': 'matriarchaat'}, {'word': 'mecenaat'}, {'word': 'minoraat'}, {'word': 'muskeljaat'}, {'word': 'nazireaat'}, {'word': 'notariaat'}, {'word': 'noviciaat'}, {'word': 'novitiaat'}, {'word': 'numismaat'}, {'word': 'obligaat'}, ...]"
35,NNF_17326,thio-,prefix,"[{'glosses': ['geeft de vervanging van een zuurstof- door een zwavelatoom aan'], 'categories': ['Scheikunde_in_het_Nederlands'], 'topics': ['chemistry']}]","[{'word': 'thioalcohol'}, {'word': 'thiocyaan'}, {'word': 'thiocyaanzuur'}, {'word': 'thioverbinding'}]"
...,...,...,...,...,...
1304,EEF_95338,-centric,suffix,"[{'glosses': ['Having a specified number of centres.'], 'links': [['specified', 'specified'], ['number', 'number#Noun'], ['centres', 'centre#Noun']], 'tags': ['morpheme']}, {'glosses': ['Having a specified object at the centre, or as the focus of attention.'], 'categories': ['English terms with quotations'], 'links': [['object', 'object#Noun'], ['focus', 'focus#Noun'], ['attention', 'attention']], 'tags': ['morpheme']}]","[{'word': 'CLI-centric'}, {'word': 'Polonocentric'}]"
1481,EEF_154472,pluri-,prefix,"[{'glosses': ['several'], 'categories': ['English entries with incorrect language header', 'English lemmas', 'English prefixes', 'English terms borrowed from Latin', 'English terms derived from Latin', 'English terms derived from Old Latin', 'English terms derived from Proto-Indo-European', 'Pages with 5 entries', 'Pages with entries'], 'links': [['several', 'several']], 'tags': ['morpheme']}]","[{'english': 'from Ancient Greek via New Latin', 'translation': 'from Ancient Greek via New Latin', 'word': 'oligo-'}, {'english': 'from Latin', 'translation': 'from Latin', 'word': 'pauci-'}]"
1523,EEF_171430,procto-,prefix,"[{'glosses': ['Dealing with the rectum and anus; anorectal; sometimes also the distal colon.'], 'categories': ['English entries with incorrect language header', 'English lemmas', 'English prefixes', 'English terms derived from Ancient Greek', 'English undefined derivations', 'Pages using catfix', 'Pages with 1 entry', 'Pages with entries'], 'links': [['rectum', 'rectum'], ['anus', 'anus'], ['anorectal', 'anorectal#English'], ['colon', 'colon']], 'tags': ['morpheme']}]","[{'word': 'ano-'}, {'word': 'recto-'}]"
2695,EEF_603304,lyo-,prefix,"[{'glosses': ['solvent, dissolving, dispersion; freeze-drying'], 'categories': ['English entries with incorrect language header', 'English lemmas', 'English prefixes', 'English terms derived from Ancient Greek', 'English terms derived from Proto-Indo-European', 'Entries with translation boxes', 'Pages using catfix', 'Pages with 1 entry', 'Pages with entries', 'Terms with French translations', 'Terms with Italian translations', 'Translation table header lacks gloss', 'en:Chemistry'], 'links': [['chemistry', 'chemistry'], ['solvent', 'solvent'], ['dissolving', 'dissolving'], ['dispersion', 'dispersion'], ['freeze-drying', 'freeze-drying']], 'raw_glosses': ['(chemistry) solvent, dissolving, dispersion; freeze-drying'], 'tags': ['morpheme'], 'topics': ['chemistry', 'natural-sciences', 'physical-sciences']}]",[{'word': 'hydro-'}]
