# Find and Parse Translations from EN Wiktionary

In [1]:
import pandas as pd
import numpy as np
import json
from dutchanalyzer.config import *
from dutchanalyzer.utils import *
from pathlib import Path
from dotenv import load_dotenv
from io import StringIO
import datetime
import re

In [2]:
today = datetime.date.today().__format__("%d-%m-%y")
current_save_folder = Path(INTERIM_DATA_DIR, 'preprocessing', 'wikt', str(today))
folders = {'en': ['pp_en', 'r_en', 'pp_nl', 'r_nl'], 'nl':['pp_en', 'r_en', 'pp_nl', 'r_nl']}
#path_list = make_folder(current_save_folder, folders)

In [3]:
last_save_folder = Path(INTERIM_DATA_DIR, 'preprocessing', 'wikt', '03-11-25')

In [4]:
EEP_file = Path(last_save_folder, 'en', 'pp_en', 'stripped_sort_EEP.csv')
ENP_file = Path(last_save_folder, 'en', 'pp_nl', 'stripped_sort_ENP.csv')
EER_file = Path(last_save_folder, 'en', 'r_en', 'stripped_sort_EER.csv')
ENR_file = Path(last_save_folder, 'en', 'r_nl', 'stripped_sort_ENR.csv')

In [5]:
NEP_file = Path(last_save_folder, 'nl', 'pp_en', 'stripped_sorted_NEP.csv')
NNP_file = Path(last_save_folder, 'nl', 'pp_nl', 'stripped_sorted_NNP.csv')
NER_file = Path(last_save_folder, 'nl', 'r_en', 'stripped_sort_NER.csv')
NNR_file = Path(last_save_folder, 'nl', 'r_nl', 'stripped_sort_NNR.csv')

In [6]:
files = [EEP_file, ENP_file, EER_file, ENR_file, NEP_file,NER_file, NNR_file, NNP_file]

In [7]:
general_enwikt_save_folder = Path(INTERIM_DATA_DIR, 'preprocessing', 'wikt', 'en-wikt')
general_nlwikt_save_folder = Path(INTERIM_DATA_DIR, 'preprocessing', 'wikt', 'nl-wikt')

In [None]:

# for f in files:
#     df = pd.read_csv(f)
#     df = df[df['pos'] != 'name']
#     p = f.parts
#     print(p)
#     wlang = p[-3]
#     tlang = p[-2][-2:]
#     dsrc = p[-2][0]
#     abbr = f'{wlang[0].upper()}{tlang[0].upper()}{dsrc.upper()}'
#     print(abbr)
#     file_path = f'{wlang}-wikt/{abbr}/{abbr}.csv'
#     df.to_csv(Path(INTERIM_DATA_DIR, 'preprocessing','wikt',file_path))

## Utils

In [8]:
def transform_df_to_col_table(origin_df, col_name, group_key='word_id'):
    dropped_df = origin_df[~origin_df[col_name].isna()]

    word_id_col = ''
    if 'word_id' not in origin_df.columns:

        sub_ids = ['EEP_id', 'ENP_id', 'EER_id', 'ENR_id', 'NEP_id', 'NNP_id', 'NER_id', 'NNR_id']
        if group_key in sub_ids:
            word_id_col = group_key
        else:
            for i in sub_ids:
                if i in origin_df.columns:
                    word_id_col = i
                    if group_key == 'word_id':
                        group_key = i
    elif 'word_id' in origin_df.columns:
        word_id_col = 'word_id' 
    
    words_df = dropped_df.loc[:, [word_id_col, 'word']]
    df = dropped_df.loc[:, [word_id_col, 'pos', col_name]]
    df[col_name] = df[col_name].apply(safe_eval)
    df = df.explode(col_name, ignore_index=True)
    
    df = pd.concat([df.drop(columns=col_name), pd.json_normalize(df[col_name])],axis=1)
    
    df = df.groupby(group_key, as_index=False).aggregate('first').reindex(columns=df.columns)
    if 'word' in df.columns:
        df = df.rename(columns={'word': col_name})
    
    df = df.merge(words_df, on=word_id_col)
    df_cols = df.columns.tolist()
    df_cols = df_cols[0:2] + ['word'] + df_cols[2:-1]
    df = df[df_cols]
    
    return df

In [9]:
def extract_translations(df, lang_code, col_prefix=''):
    translations_col = []
    translations_words_col = []
    for i, row in df.iterrows():
        translations = row['translations']
        has_translation = False
        def_translations = []
        word_translations = []
        for j in translations:
            if j['lang_code'] == lang_code:
                def_translations.append(j)
                word_translations.append(j['word'])
                has_translation = True
        
        translations_col.append(def_translations)
        translations_words_col.append(word_translations)
    new_col_name = "{}{}_translation_d".format(col_prefix, lang_code)
    col2_name = "{}{}_translation_words".format(col_prefix, lang_code)
    df[new_col_name] = translations_col
    df[col2_name] = translations_words_col
    return df

In [10]:
def transform_cut(origin_df, col_name, group_key='word_id', word_id_col='word_id', start_cols=[], end_cols=[]):
    df = origin_df[~origin_df[col_name].isna()]

    df = df.loc[:, start_cols + [col_name] + end_cols]
    words_df = df.loc[:, [word_id_col, 'word']]
    df[col_name] = df[col_name].apply(safe_eval)
    df = df.explode(col_name, ignore_index=True)

    df = pd.concat([df.drop(columns=col_name), pd.json_normalize(df[col_name])],axis=1)
    df = df.groupby(group_key, as_index=False).aggregate('first').reindex(columns=df.columns)
    if 'word' in df.columns:
        df = df.rename(columns={'word': col_name})
    
    df = df.merge(words_df, on=word_id_col)
    start_cols.append(col_name)
    df = sort_columns(df, start_cols=start_cols, end_cols=end_cols, col_end_sort=word_id_col)
    
    return df

## Find the Translations EN

In [18]:
last_enp_folder = Path(last_save_folder, 'en', 'pp_nl')
enp_folder = Path(current_save_folder, 'en', 'pp_nl')

In [11]:
ENP_df = pd.read_csv(Path(general_enwikt_save_folder, 'ENP', 'ENP.csv'), index_col=0)

  ENP_df = pd.read_csv(Path(general_enwikt_save_folder, 'ENP', 'ENP.csv'), index_col=0)


In [12]:
ENP_df.columns

Index(['ENP_id', 'word', 'pos', 'lang_code', 'antonyms', 'categories',
       'coordinate_terms', 'derived', 'descendants', 'etymology_templates',
       'etymology_text', 'forms', 'head_templates', 'holonyms', 'hypernyms',
       'hyphenations', 'hyponyms', 'inflection_templates', 'meronyms',
       'related', 'senses', 'sounds', 'synonyms', 'dsrc'],
      dtype='object')

In [29]:
ENP_df.to_csv(Path(general_enwikt_save_folder, 'ENP', 'ENP_df_04-11-25.csv'))

In [30]:
ENP_df.to_csv(Path(general_enwikt_save_folder, 'ENP', 'ENP.csv'))

### Senses

In [20]:
enp2 = transform_cut(ENP_df, 'senses', 'ENP_id', "ENP_id", ['ENP_id', 'word', 'pos', 'lang_code'], ['dsrc'])

['ENP_id', 'word', 'pos', 'lang_code', 'senses', 'alt_of', 'antonyms', 'attestations', 'categories', 'coordinate_terms', 'derived', 'examples', 'form_of', 'glosses', 'head_nr', 'holonyms', 'hypernyms', 'hyponyms', 'id', 'info_templates', 'links', 'meronyms', 'qualifier', 'raw_glosses', 'raw_tags', 'related', 'senseid', 'synonyms', 'tags', 'topics', 'wikidata', 'wikipedia', 'dsrc']


In [23]:
enp2.to_csv(Path(enp_folder, 'cols', 'enp_senses.csv'))

In [13]:
enp_senses_df = pd.read_csv(Path(general_enwikt_save_folder, 'ENP', 'column_dfs','enp_senses_04-11-25.csv'), index_col=0)

In [14]:
enp_senses_df.columns

Index(['ENP_id', 'word', 'pos', 'lang_code', 'senses', 'alt_of', 'antonyms',
       'attestations', 'categories', 'coordinate_terms', 'derived', 'examples',
       'form_of', 'glosses', 'head_nr', 'holonyms', 'hypernyms', 'hyponyms',
       'id', 'info_templates', 'links', 'meronyms', 'qualifier', 'raw_glosses',
       'raw_tags', 'related', 'senseid', 'synonyms', 'tags', 'topics',
       'wikipedia', 'dsrc'],
      dtype='object')

In [19]:
enp_senses_df = enp_senses_df.drop(columns=['wikidata'])

In [15]:
enp_senses_df = enp_senses_df.drop(columns=['wikipedia'])

In [19]:
enp_senses_df.to_csv(Path(enp_folder, 'enp_senses.csv'))

In [16]:
enp_senses_df.to_csv(Path(general_enwikt_save_folder, 'ENP', 'column_dfs', 'enp_senses_04-11-25.csv'))

### Column Exploration

#### Wikidata

In [31]:
enp2['wikidata'].value_counts()

wikidata
[Q27366]       2
[Q26423]       2
[Q170050]      1
[Q181014]      1
[Q1097328]     1
[Q178805]      1
[Q26533]       1
[Q277954]      1
[Q188269]      1
[Q3246226]     1
[Q182186]      1
[Q188463]      1
[Q11875349]    1
[Q770289]      1
[Q182015]      1
[Q691824]      1
[Q56119332]    1
Name: count, dtype: int64

In [None]:
enp2['wikidata'].value_counts()

In [34]:
enp2 = enp2.drop(columns=['wikidata'])

In [25]:
enp_senses_df['wikipedia'].value_counts()

wikipedia
['Nederlandsche vogelen']                 45
['Nicoline van der Sijs']                 34
['Woordenboek der Nederlandsche Taal']     4
['Pipo de Clown']                          3
['nl:deca (prefix)']                       2
                                          ..
['nl:nunchaku']                            1
['nl:xenon']                               1
['nl:yaoi']                                1
['nl:yoghurtijs']                          1
['Friese doorloper']                       1
Name: count, Length: 1434, dtype: int64

In [21]:
enp_senses_df.drop(columns=['wikipedia'], inplace=True)

In [21]:
enp_senses_df.drop(columns=['raw_tags'], inplace=True)

#### Nyms

In [20]:
enp_senses_df.columns

Index(['ENP_id', 'word', 'pos', 'lang_code', 'senses', 'alt_of', 'antonyms',
       'attestations', 'categories', 'coordinate_terms', 'derived', 'examples',
       'form_of', 'glosses', 'head_nr', 'holonyms', 'hypernyms', 'hyponyms',
       'id', 'info_templates', 'links', 'meronyms', 'qualifier', 'raw_glosses',
       'raw_tags', 'related', 'senseid', 'synonyms', 'tags', 'topics', 'dsrc'],
      dtype='object')

In [22]:
nyms_columns = ['antonyms', 'holonyms', 'hypernyms', 'hyponyms', 'meronyms', 'synonyms']

gen_keep_cols = ['word', 'pos', 'lang_code']

gen_end_cols = ['dsrc']
enp_keep_cols = ['ENP_id'] + gen_keep_cols + nyms_columns + gen_end_cols

In [23]:
nyms_df = enp_senses_df.loc[:, enp_keep_cols]
nyms_df.dropna(subset=['synonyms', 'antonyms', 'hypernyms', 'hyponyms', 'holonyms', 'meronyms'], how='all')

Unnamed: 0,ENP_id,word,pos,lang_code,antonyms,holonyms,hypernyms,hyponyms,meronyms,synonyms,dsrc
5,7,'s middags,adv,nl,,,,,,"[{'word': ""'s noens""}]",ENP_src
6,8,'s morgens,adv,nl,,,,,,"[{'word': ""'s ochtends""}]",ENP_src
8,10,'s namiddags,adv,nl,,,,,,"[{'word': ""'s middags""}]",ENP_src
14,69,-a,suffix,nl,,,,,,[{'word': '-ums'}],ENP_src
19,74,-amine,suffix,nl,,,,,,[{'word': 'amino-'}],ENP_src
...,...,...,...,...,...,...,...,...,...,...,...
63998,69464,zwijmen,verb,nl,,,,,,[{'word': 'bezwijmen'}],ENP_src
64001,69467,zwijntjesjager,noun,nl,,,,,,[{'word': 'fietsendief'}],ENP_src
64003,69469,zwik,noun,nl,,,,,,"[{'word': 'bom'}, {'word': 'pen'}, {'word': 'p...",ENP_src
64012,69478,zwoegen,verb,nl,,,,,,"[{'word': 'hosselen'}, {'word': 'knoeften'}, {...",ENP_src


In [24]:
exploded_lines = []
weird_lines = []
nym_id_num = 0
for i, row in tqdm(nyms_df.iterrows()):
    
    #line_gen = {'ENP_id': row['ENP_id'], 'word': row['word'], 'pos': row['pos']}
    line_gen = [row['ENP_id'], row['word'], row['pos']]
    dsrc = row['dsrc']
    line2 = []
    for c in nyms_columns:
        row_c = safe_eval(row[c])
        if not row_c == []:
            try:
                for j in range(len(row_c)):
                    w = row_c[j].get('word', None)
                    if not w:
                        weird_lines.append(line_gen + row_c)
                    else:
                        line2.append(w)
                        line2.append(c)
                        line2.append(dsrc)
                        if i%10000 == 0:
                            display(row)
                        exploded_lines.append(line_gen + line2)
                        line2 = []
            except:
                weird_lines.append(line_gen + row_c)
                line2 = []



9963it [00:02, 6517.22it/s]

ENP_id                      15888
word                   botsballon
pos                          noun
lang_code                      nl
antonyms                      NaN
holonyms                      NaN
hypernyms                     NaN
hyponyms                      NaN
meronyms                      NaN
synonyms     [{'word': 'airbag'}]
dsrc                      ENP_src
Name: 10000, dtype: object

29286it [00:05, 7473.56it/s]

ENP_id                        35743
word                      kruispunt
pos                            noun
lang_code                        nl
antonyms                        NaN
holonyms                        NaN
hypernyms                       NaN
hyponyms                        NaN
meronyms                        NaN
synonyms     [{'word': 'kruising'}]
dsrc                        ENP_src
Name: 30000, dtype: object

64051it [00:10, 6335.27it/s]


In [47]:
def process_nyms(nyms_df, start_cols=['word', 'pos', 'lang_code'], end_cols=['dsrc'],save_path='', word_id_col=''):
    nyms_columns = ['antonyms', 'holonyms', 'hypernyms', 'hyponyms', 'meronyms', 'synonyms']

    
    keep_cols = [word_id_col] + start_cols + nyms_columns + end_cols
    exploded_lines = []
    weird_lines = []
    
    for i, row in tqdm(nyms_df.iterrows()):
        
        #line_gen = {'ENP_id': row['ENP_id'], 'word': row['word'], 'pos': row['pos']}
        start_line = []
        end_line = []
        if word_id_col != '':
            start_line.append(row[word_id_col])
        else:
            start_line.append('')
        for c in start_cols:
            start_line.append(row[c])
        for c in end_cols:
            end_line.append(row[c])
        
        center_line = []
        for c in nyms_columns:
            row_c = safe_eval(row[c])
            if not row_c == []:
                try:
                    for j in range(len(row_c)):
                        w = row_c[j].get('word', None)
                        if not w:
                            weird_lines.append(start_line + row_c + end_line)
                        else:
                            center_line.append(w)
                            center_line.append(c[0:-1])
                            exploded_lines.append(start_line + center_line + end_line)
                        center_line = []
                except:
                    weird_lines.append(line_gen + row_c)
                    center_line = []
    print("Lines with errors: ", len(weird_lines))
    return exploded_lines

In [45]:
exploded_lines = process_nyms(nyms_df, word_id_col='ENP_id')

64051it [00:03, 20914.96it/s]

Lines with errors:  0





In [48]:
display(exploded_lines[0:10])

[[7, "'s middags", 'adv', 'nl', "'s noens", 'synonym', 'ENP_src'],
 [8, "'s morgens", 'adv', 'nl', "'s ochtends", 'synonym', 'ENP_src'],
 [10, "'s namiddags", 'adv', 'nl', "'s middags", 'synonym', 'ENP_src'],
 [69, '-a', 'suffix', 'nl', '-ums', 'synonym', 'ENP_src'],
 [74, '-amine', 'suffix', 'nl', 'amino-', 'synonym', 'ENP_src'],
 [77, '-atie', 'suffix', 'nl', '-ing', 'synonym', 'ENP_src'],
 [80, '-baar', 'suffix', 'nl', '-abel', 'synonym', 'ENP_src'],
 [86, '-e', 'suffix', 'nl', '-es', 'synonym', 'ENP_src'],
 [86, '-e', 'suffix', 'nl', '-in', 'synonym', 'ENP_src'],
 [86, '-e', 'suffix', 'nl', '-es', 'synonym', 'ENP_src']]

In [51]:
print(len(exploded_lines[0]))

7


In [None]:
print(len(['ENP_id', 'word', 'pos', 'lang_code', 'nym', 'nym_type', 'dsrc']))

6


In [56]:
nyms_df_exploded = pd.DataFrame(exploded_lines, columns=['ENP_id', 'word', 'pos', 'lang_code', 'nym', 'nym_type', 'dsrc'])

In [57]:
nyms_df_exploded.head()

Unnamed: 0,ENP_id,word,pos,lang_code,nym,nym_type,dsrc
0,7,'s middags,adv,nl,'s noens,synonym,ENP_src
1,8,'s morgens,adv,nl,'s ochtends,synonym,ENP_src
2,10,'s namiddags,adv,nl,'s middags,synonym,ENP_src
3,69,-a,suffix,nl,-ums,synonym,ENP_src
4,74,-amine,suffix,nl,amino-,synonym,ENP_src


In [58]:
nyms_df_exploded.to_csv(Path(general_enwikt_save_folder, 'ENP', 'column_dfs', 'senses_sub', 'enp_senses-nyms_04-11-25.csv'))

#### Antonyms

In [36]:
enp2['antonyms'].value_counts()

antonyms
[{'word': 'plus'}]                                 5
[{'word': 'tegengesteld'}]                         3
[{'word': 'clericus'}, {'word': 'geestelijke'}]    3
[{'word': 'beperkt'}, {'word': 'beknopt'}]         3
[{'word': 'onkuis'}, {'word': 'onzedig'}]          3
                                                  ..
[{'word': 'buitenboord'}]                          1
[{'word': 'buitenlijn'}]                           1
[{'word': 'buitenshuis'}]                          1
[{'word': 'vleeslul'}]                             1
[{'word': 'voorop'}]                               1
Name: count, Length: 444, dtype: int64

### Alt Col and Translations Export

In [78]:
def make_alt_translations(df,start_cols=['word', 'pos', 'lang_code'], end_cols=['dsrc'], word_id_col='word_id'):
    df = df[~df['alt_of'].isna()]
    alt_cols = [word_id_col] + start_cols + ['alt_of', 'alt_of_translation'] + end_cols
    
    
    exploded_lines = []
    weird_lines = []
    rows = []
    for i, row in tqdm(df.iterrows()):
        r = {}
        try:
            r[word_id_col] = row[word_id_col]
            for c in start_cols:
                r[c] = row[c]
            for c in end_cols:
                r[c] = row[c]

        
            al = row['alt_of']
            al = safe_eval(al)

            for j in range(len(al)):
                r2 = r.copy() 
                a = al[j]
                w = a.get('word', None)
                al_tr = a.get('extra', None)
                
                r2['alt_of'] = w
                r2['alt_of_translation'] = al_tr
                rows.append(r2.copy())

            # if len(al) > 1:
            #     print(al)
            # elif len(al) == 1:
            #     al = al[0]
            #     new_al = al.get('word', None)
            #     al_tr = al.get('extra', None)
            #     r['alt_of'] = new_al, 
            #     'alt_of_translation':al_tr, 'dsrc':dsrc}
            #     rows.append(r)
        except:
            weird_lines.append(row)
    print('Length weird rows: ', len(weird_lines))
    alt_transaltions_df = pd.DataFrame(rows, columns=alt_cols)
    return alt_transaltions_df
      

In [24]:
enp2_alt_of = enp2[~enp2['alt_of'].isna()]
enp2_alt_of

Unnamed: 0,ENP_id,word,pos,lang_code,senses,alt_of,antonyms,attestations,categories,coordinate_terms,...,raw_glosses,raw_tags,related,senseid,synonyms,tags,topics,wikidata,wikipedia,dsrc
0,0,'er,adv,nl,'er,[{'word': 'der'}],,,[{'name': 'Dutch entries with incorrect langua...,,...,,,,,,"[abbreviation, alt-of]",,,,ENP_src
1,1,'n,article,nl,'n,[{'word': 'een'}],,,"[{'name': 'Dutch articles', 'kind': 'other', '...",,...,,,,,,"[abbreviation, alt-of, contraction]",,,,ENP_src
2,2,'ns,adv,nl,'ns,[{'word': 'eens'}],,,[{'name': 'Dutch entries with incorrect langua...,,...,,,,,,"[abbreviation, alt-of, clipping]",,,,ENP_src
3,3,'s,article,nl,'s,[{'word': 'des'}],,,[],,...,,,,,,"[abbreviation, alt-of, contraction]",,,,ENP_src
12,25,'t,article,nl,'t,"[{'word': 'het', 'extra': 'the'}]",,,"[{'name': 'Pages with 10 entries', 'kind': 'ot...",,...,,,,,,"[abbreviation, alt-of, contraction]",,,,ENP_src
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63584,69054,zowiezo,adv,nl,zowiezo,"[{'word': 'sowieso', 'extra': 'anyhow'}]",,,[{'name': 'Dutch entries with incorrect langua...,,...,,,,,,"[alt-of, misspelling]",,,,ENP_src
63630,69099,zuiker,noun,nl,zuiker,"[{'word': 'suiker', 'extra': 'sugar'}]",,,[{'name': 'Dutch entries with incorrect langua...,,...,,,,,,"[alt-of, masculine, obsolete]",,,,ENP_src
63665,69134,zum kotzen,adj,nl,zum kotzen,[{'word': 'zum Kotzen'}],,,[{'name': 'Dutch entries with incorrect langua...,,...,,,,,,"[alt-of, not-comparable]",,,,ENP_src
64043,69509,zyn,verb,nl,zyn,[{'word': 'zijn'}],,,[],,...,,,,,,"[alt-of, obsolete]",,,,ENP_src


In [83]:
alt_translations_df = make_alt_translations(enp_senses_df, word_id_col='ENP_id')

1204it [00:00, 16069.87it/s]

Length weird rows:  0





In [80]:
display(alt_translations_df.head())

Unnamed: 0,ENP_id,word,pos,lang_code,alt_of,alt_of_translation,dsrc
0,0,'er,adv,nl,der,,ENP_src
1,1,'n,article,nl,een,,ENP_src
2,2,'ns,adv,nl,eens,,ENP_src
3,3,'s,article,nl,des,,ENP_src
4,25,'t,article,nl,het,the,ENP_src


In [25]:
alt_transaltions_df = pd.DataFrame(columns=['ENP_id','word', 'pos', 'alt_of', 'alt_of_translation', 'dsrc'])
rows = []
for i, row in enp2_alt_of.iterrows():
    w = row['word']
    p = row['pos']
    id = row['ENP_id']
    al = row['alt_of']
    dsrc = row['dsrc']
    
    try:
        al = safe_eval(al)

        if len(al) > 1:
            print(al)
        elif len(al) == 1:
            al = al[0]
            new_al = al.get('word', None)
            al_tr = al.get('extra', None)
            r = {'ENP_id': id, 'word':w, 'pos':p, 'alt_of':new_al, 'alt_of_translation':al_tr, 'dsrc':dsrc}
            rows.append(r)
    except:
        pass


[{'word': '-tje'}, {'word': 'used for words ending on m'}]
[{'word': 'curriculum vitae'}, {'word': 'curriculum vitae'}]
[{'word': 'curriculum vitae'}, {'word': 'curriculum vitae'}]
[{'word': 'dixi'}, {'word': 'portable toilet'}]
[{'word': 'help'}, {'word': 'when pronounced with epenthetic schwa'}]
[{'word': 'lesbiennes', 'extra': 'the LGB umbrella'}, {'word': "homo's en biseksuelen", 'extra': 'the LGB umbrella'}]
[{'word': 'orthodontist'}, {'word': 'orthodontiste'}]
[{'word': 'rechts', 'extra': 'right, not left'}, {'word': 'rechter', 'extra': 'right, not left'}]
[{'word': 'rechts', 'extra': 'right, not left'}, {'word': 'rechter', 'extra': 'right, not left'}]
[{'word': 'suc6', 'extra': 'good luck'}, {'word': 'abbreviation of succes', 'extra': 'good luck'}]


In [27]:
alt_transaltions_df = pd.DataFrame(rows)

In [85]:
alt_translations_df.to_csv(Path(general_enwikt_save_folder, 'ENP', 'column_dfs', 'senses_sub', 'alt_translations_04-11-25.csv'))

### Head Template

In [None]:
enp_head_df = transform_cut(ENP_df, 'head_templates', 'ENP_id', "ENP_id", ['ENP_id', 'word', 'pos'], ['dsrc'])

['ENP_id', 'word', 'pos', 'head_templates', 'args.1', 'args.10', 'args.11', 'args.2', 'args.3', 'args.4', 'args.5', 'args.6', 'args.7', 'args.8', 'args.9', 'args.cat2', 'args.cat3', 'args.f', 'args.g', 'args.g1', 'args.g2', 'args.g3', 'args.head', 'args.head2', 'args.inv', 'args.m', 'args.nolinkhead', 'args.pl2', 'args.pred', 'expansion', 'name', 'dsrc']


## Find Translations 

## EEP DF

In [None]:
EEP_df = pd.read_csv(Path(general_enwikt_save_folder, 'EEP', 'EEP.csv'), index_col=0)

In [88]:
EEP_df.drop(columns=['Unnamed: 0'], inplace=True)

In [98]:
EEP_df.to_csv(Path(general_enwikt_save_folder, 'EEP', 'EEP.csv'))

In [91]:
print(EEP_df.shape)
print(EEP_df.columns)

(614821, 29)
Index(['EEP_id', 'word', 'pos', 'lang_code', 'antonyms', 'categories',
       'coordinate_terms', 'derived', 'descendants', 'etymology_templates',
       'etymology_text', 'forms', 'head_templates', 'holonyms', 'hypernyms',
       'hyphenation', 'hyphenations', 'hyponyms', 'inflection_templates',
       'instances', 'meronyms', 'original_title', 'related', 'senses',
       'sounds', 'synonyms', 'translations', 'troponyms', 'dsrc'],
      dtype='object')


In [92]:
EEP_df['invalid'] = EEP_df['word'].str.contains(r"[0-9]")

In [94]:
EEP_df = EEP_df[EEP_df['invalid'] == False]

In [96]:
EEP_df.drop(columns=['invalid'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  EEP_df.drop(columns=['invalid'], inplace=True)


### EEP Translations

In [None]:
en_translations_df = pd.DataFrame(columns=['word', 'pos', 'nl_translation', 'dsrc'])

In [158]:
eep_translations = return_non_na(EEP_df, 'translations')

In [159]:
eep_translations_0_10 = eep_translations.iloc[0:10, :]

In [166]:
eep_translations_0_10

Unnamed: 0,EEP_id,word,pos,lang_code,antonyms,categories,coordinate_terms,derived,descendants,etymology_templates,...,instances,meronyms,original_title,related,senses,sounds,synonyms,translations,troponyms,dsrc
310,300,-able,suffix,en,,,,,,"[{'name': 'etymon', 'args': {'1': 'en', 'id': ...",...,,,,,[{'examples': [{'text': 'movable: able to be m...,"[{'tags': ['Western'], 'ipa': '/əbl̩/'}, {'aud...",,"[{'lang': 'Finnish', 'code': 'fi', 'lang_code'...",,EEP_src
347,337,-andry,suffix,en,,,,"[{'word': 'diandry', '_dis1': '0 0 0'}, {'word...",,"[{'name': 'der', 'args': {'1': 'en', '2': 'grc...",...,,,,"[{'word': '-androus', '_dis1': '0 0 0'}, {'wor...",[{'examples': [{'text': 'mono- + -andry → mona...,"[{'tags': ['Received-Pronunciation'], 'enpr': ...",,"[{'lang': 'French', 'code': 'fr', 'lang_code':...",,EEP_src
363,352,-ary,suffix,en,,,,,,"[{'name': 'glossary', 'args': {'1': 'Inherited...",...,,,,,[{'examples': [{'text': 'devolution + -ary → d...,"[{'tags': ['Received-Pronunciation'], 'ipa': '...",,"[{'lang': 'Arabic', 'code': 'ar', 'lang_code':...",,EEP_src
371,359,-ate,suffix,en,,,,,,"[{'name': 'lg', 'args': {'1': 'substantivizati...",...,,,,,[{'examples': [{'text': 'affiliate — “a person...,"[{'ipa': '/ət/'}, {'ipa': '/eɪt/'}, {'audio': ...",,"[{'lang': 'French', 'code': 'fr', 'lang_code':...",,EEP_src
389,375,-bility,suffix,en,,,,,,,...,,,,,"[{'links': [['-ability', '-ability#English']],...",,,"[{'lang': 'Catalan', 'code': 'ca', 'lang_code'...",,EEP_src
480,465,-ee,suffix,en,,,,,,"[{'name': 'inh', 'args': {'1': 'en', '2': 'enm...",...,,,,,[{'examples': [{'text': 'examine + -ee → exami...,,,"[{'lang': 'Albanian', 'code': 'sq', 'lang_code...",,EEP_src
501,480,-er,suffix,en,,,,,,"[{'name': 'etymon', 'args': {'1': 'en', '2': '...",...,,,,"[{'word': '-eer', '_dis1': '0 0 0 0 0 0 0 0 0 ...","[{'examples': [{'text': 'read + -er → reader',...","[{'tags': ['Received-Pronunciation'], 'ipa': '...",,"[{'lang': 'Afrikaans', 'code': 'af', 'lang_cod...",,EEP_src
553,528,-fold,suffix,en,,,,"[{'word': 'onefold', '_dis1': '0 0'}, {'word':...",,"[{'name': 'root', 'args': {'1': 'en', '2': 'in...",...,,,,,[{'examples': [{'text': 'There has been a thre...,"[{'audio': 'en-us--fold.ogg', 'ogg_url': 'http...",,"[{'lang': 'Armenian', 'code': 'hy', 'lang_code...",,EEP_src
595,570,-graph,suffix,en,,,,"[{'word': 'digraph', '_dis1': '0 0 0 0 0'}, {'...",,"[{'name': 'root', 'args': {'1': 'en', '2': 'in...",...,,,,"[{'word': '-graphy', '_dis1': '0 0 0 0 0'}]","[{'examples': [{'text': 'stenograph'}], 'links...",,,"[{'lang': 'Catalan', 'code': 'ca', 'lang_code'...",,EEP_src
658,628,-ing,suffix,en,,,,"[{'word': 'batting', '_dis1': '39 39 22'}, {'w...",,"[{'name': 'etymon', 'args': {'1': 'en', 'id': ...",...,,,,"[{'sense': 'collection', 'word': 'work', '_dis...",[{'examples': [{'text': 'My hearing is not goo...,"[{'ipa': '/ɪŋ/'}, {'ipa': '/ɪn/'}, {'ipa': '/ə...","[{'sense': 'act of doing something, action', '...","[{'lang': 'Chinese Mandarin', 'code': 'zh', 'l...",,EEP_src


In [176]:
for i, row in eep_translations_0_10.iterrows():
    display(row['translations'])
    translations = row['translations']
    print(type(translations))
    if type(translations) == str:
        translations = safe_eval(translations)
        if type(translations) == list:
            for j in translations:
                print(j)
    else:
        print('not')
        

"[{'lang': 'Finnish', 'code': 'fi', 'lang_code': 'fi', 'sense': 'expressing capacity or worthiness in a passive sense', 'word': '-tava', '_dis1': '20 22 20 20 20'}, {'lang': 'Finnish', 'code': 'fi', 'lang_code': 'fi', 'sense': 'expressing capacity or worthiness in a passive sense', 'word': '-tävä', '_dis1': '20 22 20 20 20'}, {'lang': 'German', 'code': 'de', 'lang_code': 'de', 'sense': 'expressing capacity or worthiness in a passive sense', 'word': '-wert', '_dis1': '20 22 20 20 20'}, {'lang': 'Swedish', 'code': 'sv', 'lang_code': 'sv', 'sense': 'expressing capacity or worthiness in a passive sense', 'word': '-värd', '_dis1': '20 22 20 20 20'}]"

<class 'str'>
{'lang': 'Finnish', 'code': 'fi', 'lang_code': 'fi', 'sense': 'expressing capacity or worthiness in a passive sense', 'word': '-tava', '_dis1': '20 22 20 20 20'}
{'lang': 'Finnish', 'code': 'fi', 'lang_code': 'fi', 'sense': 'expressing capacity or worthiness in a passive sense', 'word': '-tävä', '_dis1': '20 22 20 20 20'}
{'lang': 'German', 'code': 'de', 'lang_code': 'de', 'sense': 'expressing capacity or worthiness in a passive sense', 'word': '-wert', '_dis1': '20 22 20 20 20'}
{'lang': 'Swedish', 'code': 'sv', 'lang_code': 'sv', 'sense': 'expressing capacity or worthiness in a passive sense', 'word': '-värd', '_dis1': '20 22 20 20 20'}


"[{'lang': 'French', 'code': 'fr', 'lang_code': 'fr', 'sense': 'man, male', 'tags': ['feminine'], 'word': '-andrie', '_dis1': '33 37 30'}, {'lang': 'Portuguese', 'code': 'pt', 'lang_code': 'pt', 'sense': 'man, male', 'tags': ['feminine'], 'word': '-andria', '_dis1': '33 37 30'}]"

<class 'str'>
{'lang': 'French', 'code': 'fr', 'lang_code': 'fr', 'sense': 'man, male', 'tags': ['feminine'], 'word': '-andrie', '_dis1': '33 37 30'}
{'lang': 'Portuguese', 'code': 'pt', 'lang_code': 'pt', 'sense': 'man, male', 'tags': ['feminine'], 'word': '-andria', '_dis1': '33 37 30'}


"[{'lang': 'Arabic', 'code': 'ar', 'lang_code': 'ar', 'sense': 'of or pertaining to', 'roman': '-iyy', 'word': 'ـِيّ', '_dis1': '0 0 0'}, {'lang': 'Catalan', 'code': 'ca', 'lang_code': 'ca', 'sense': 'of or pertaining to', 'word': '-ari', '_dis1': '0 0 0'}, {'lang': 'Galician', 'code': 'gl', 'lang_code': 'gl', 'sense': 'of or pertaining to', 'tags': ['masculine'], 'word': '-ario', '_dis1': '0 0 0'}, {'lang': 'Galician', 'code': 'gl', 'lang_code': 'gl', 'sense': 'of or pertaining to', 'tags': ['feminine'], 'word': '-aria', '_dis1': '0 0 0'}, {'lang': 'Hungarian', 'code': 'hu', 'lang_code': 'hu', 'sense': 'of or pertaining to', 'word': '-árius, -(bel)i, -s, -t/tt', '_dis1': '0 0 0'}, {'lang': 'Interlingua', 'code': 'ia', 'lang_code': 'ia', 'sense': 'of or pertaining to', 'word': '-ari', '_dis1': '0 0 0'}, {'lang': 'Italian', 'code': 'it', 'lang_code': 'it', 'sense': 'of or pertaining to', 'tags': ['masculine'], 'word': '-ario', '_dis1': '0 0 0'}, {'lang': 'Italian', 'code': 'it', 'lang_c

<class 'str'>
{'lang': 'Arabic', 'code': 'ar', 'lang_code': 'ar', 'sense': 'of or pertaining to', 'roman': '-iyy', 'word': 'ـِيّ', '_dis1': '0 0 0'}
{'lang': 'Catalan', 'code': 'ca', 'lang_code': 'ca', 'sense': 'of or pertaining to', 'word': '-ari', '_dis1': '0 0 0'}
{'lang': 'Galician', 'code': 'gl', 'lang_code': 'gl', 'sense': 'of or pertaining to', 'tags': ['masculine'], 'word': '-ario', '_dis1': '0 0 0'}
{'lang': 'Galician', 'code': 'gl', 'lang_code': 'gl', 'sense': 'of or pertaining to', 'tags': ['feminine'], 'word': '-aria', '_dis1': '0 0 0'}
{'lang': 'Hungarian', 'code': 'hu', 'lang_code': 'hu', 'sense': 'of or pertaining to', 'word': '-árius, -(bel)i, -s, -t/tt', '_dis1': '0 0 0'}
{'lang': 'Interlingua', 'code': 'ia', 'lang_code': 'ia', 'sense': 'of or pertaining to', 'word': '-ari', '_dis1': '0 0 0'}
{'lang': 'Italian', 'code': 'it', 'lang_code': 'it', 'sense': 'of or pertaining to', 'tags': ['masculine'], 'word': '-ario', '_dis1': '0 0 0'}
{'lang': 'Italian', 'code': 'it', 'l

"[{'lang': 'French', 'code': 'fr', 'lang_code': 'fr', 'sense': 'substantive', 'word': '-é', '_dis1': '34 26 40'}]"

<class 'str'>
{'lang': 'French', 'code': 'fr', 'lang_code': 'fr', 'sense': 'substantive', 'word': '-é', '_dis1': '34 26 40'}


"[{'lang': 'Catalan', 'code': 'ca', 'lang_code': 'ca', 'sense': 'suffix', 'tags': ['feminine'], 'word': '-bilitat', '_dis1': '52 48'}, {'lang': 'Galician', 'code': 'gl', 'lang_code': 'gl', 'sense': 'suffix', 'tags': ['feminine'], 'word': '-bilidade', '_dis1': '52 48'}, {'lang': 'Portuguese', 'code': 'pt', 'lang_code': 'pt', 'sense': 'suffix', 'tags': ['feminine'], 'word': '-bilidade', '_dis1': '52 48'}, {'lang': 'Spanish', 'code': 'es', 'lang_code': 'es', 'sense': 'suffix', 'tags': ['feminine'], 'word': '-bilidad', '_dis1': '52 48'}]"

<class 'str'>
{'lang': 'Catalan', 'code': 'ca', 'lang_code': 'ca', 'sense': 'suffix', 'tags': ['feminine'], 'word': '-bilitat', '_dis1': '52 48'}
{'lang': 'Galician', 'code': 'gl', 'lang_code': 'gl', 'sense': 'suffix', 'tags': ['feminine'], 'word': '-bilidade', '_dis1': '52 48'}
{'lang': 'Portuguese', 'code': 'pt', 'lang_code': 'pt', 'sense': 'suffix', 'tags': ['feminine'], 'word': '-bilidade', '_dis1': '52 48'}
{'lang': 'Spanish', 'code': 'es', 'lang_code': 'es', 'sense': 'suffix', 'tags': ['feminine'], 'word': '-bilidad', '_dis1': '52 48'}


"[{'lang': 'Albanian', 'code': 'sq', 'lang_code': 'sq', 'sense': 'forming words meaning a person to whom or a thing to which an action is done', 'word': '-të', '_dis1': '25 21 22 22 10'}, {'lang': 'Albanian', 'code': 'sq', 'lang_code': 'sq', 'sense': 'forming words meaning a person to whom or a thing to which an action is done', 'word': '-atë', '_dis1': '25 21 22 22 10'}, {'lang': 'Esperanto', 'code': 'eo', 'lang_code': 'eo', 'sense': 'forming words meaning a person to whom or a thing to which an action is done', 'word': '-ato', '_dis1': '25 21 22 22 10'}, {'lang': 'Finnish', 'code': 'fi', 'lang_code': 'fi', 'sense': 'forming words meaning a person to whom or a thing to which an action is done', 'word': '-tu', '_dis1': '25 21 22 22 10'}, {'lang': 'French', 'code': 'fr', 'lang_code': 'fr', 'sense': 'forming words meaning a person to whom or a thing to which an action is done', 'tags': ['masculine'], 'word': '-é', '_dis1': '25 21 22 22 10'}, {'lang': 'French', 'code': 'fr', 'lang_code': 

<class 'str'>
{'lang': 'Albanian', 'code': 'sq', 'lang_code': 'sq', 'sense': 'forming words meaning a person to whom or a thing to which an action is done', 'word': '-të', '_dis1': '25 21 22 22 10'}
{'lang': 'Albanian', 'code': 'sq', 'lang_code': 'sq', 'sense': 'forming words meaning a person to whom or a thing to which an action is done', 'word': '-atë', '_dis1': '25 21 22 22 10'}
{'lang': 'Esperanto', 'code': 'eo', 'lang_code': 'eo', 'sense': 'forming words meaning a person to whom or a thing to which an action is done', 'word': '-ato', '_dis1': '25 21 22 22 10'}
{'lang': 'Finnish', 'code': 'fi', 'lang_code': 'fi', 'sense': 'forming words meaning a person to whom or a thing to which an action is done', 'word': '-tu', '_dis1': '25 21 22 22 10'}
{'lang': 'French', 'code': 'fr', 'lang_code': 'fr', 'sense': 'forming words meaning a person to whom or a thing to which an action is done', 'tags': ['masculine'], 'word': '-é', '_dis1': '25 21 22 22 10'}
{'lang': 'French', 'code': 'fr', 'lang_

"[{'lang': 'Afrikaans', 'code': 'af', 'lang_code': 'af', 'sense': '(used to form agent nouns) person or thing that does...', 'word': '-er', '_dis1': '25 8 9 8 24 5 4 1 14 2'}, {'lang': 'Albanian', 'code': 'sq', 'lang_code': 'sq', 'sense': '(used to form agent nouns) person or thing that does...', 'tags': ['masculine'], 'word': '-es', '_dis1': '25 8 9 8 24 5 4 1 14 2'}, {'lang': 'Albanian', 'code': 'sq', 'lang_code': 'sq', 'sense': '(used to form agent nouns) person or thing that does...', 'tags': ['feminine'], 'word': '-ese', '_dis1': '25 8 9 8 24 5 4 1 14 2'}, {'lang': 'Albanian', 'code': 'sq', 'lang_code': 'sq', 'sense': '(used to form agent nouns) person or thing that does...', 'tags': ['masculine'], 'word': '-ës', '_dis1': '25 8 9 8 24 5 4 1 14 2'}, {'lang': 'Albanian', 'code': 'sq', 'lang_code': 'sq', 'sense': '(used to form agent nouns) person or thing that does...', 'tags': ['feminine'], 'word': '-ëse', '_dis1': '25 8 9 8 24 5 4 1 14 2'}, {'lang': 'Albanian', 'code': 'sq', 'lang

<class 'str'>
{'lang': 'Afrikaans', 'code': 'af', 'lang_code': 'af', 'sense': '(used to form agent nouns) person or thing that does...', 'word': '-er', '_dis1': '25 8 9 8 24 5 4 1 14 2'}
{'lang': 'Albanian', 'code': 'sq', 'lang_code': 'sq', 'sense': '(used to form agent nouns) person or thing that does...', 'tags': ['masculine'], 'word': '-es', '_dis1': '25 8 9 8 24 5 4 1 14 2'}
{'lang': 'Albanian', 'code': 'sq', 'lang_code': 'sq', 'sense': '(used to form agent nouns) person or thing that does...', 'tags': ['feminine'], 'word': '-ese', '_dis1': '25 8 9 8 24 5 4 1 14 2'}
{'lang': 'Albanian', 'code': 'sq', 'lang_code': 'sq', 'sense': '(used to form agent nouns) person or thing that does...', 'tags': ['masculine'], 'word': '-ës', '_dis1': '25 8 9 8 24 5 4 1 14 2'}
{'lang': 'Albanian', 'code': 'sq', 'lang_code': 'sq', 'sense': '(used to form agent nouns) person or thing that does...', 'tags': ['feminine'], 'word': '-ëse', '_dis1': '25 8 9 8 24 5 4 1 14 2'}
{'lang': 'Albanian', 'code': 'sq'

'[{\'lang\': \'Armenian\', \'code\': \'hy\', \'lang_code\': \'hy\', \'sense\': \'used to make adjectives\', \'roman\': \'-patik\', \'word\': \'-պատիկ\', \'_dis1\': \'53 47\'}, {\'lang\': \'Chinese Mandarin\', \'code\': \'cmn\', \'lang_code\': \'cmn\', \'sense\': \'used to make adjectives\', \'roman\': \'bèi\', \'word\': \'倍\', \'_dis1\': \'53 47\'}, {\'lang\': \'Cornish\', \'code\': \'kw\', \'lang_code\': \'kw\', \'sense\': \'used to make adjectives\', \'word\': \'-plek\', \'_dis1\': \'53 47\'}, {\'lang\': \'Cornish\', \'code\': \'kw\', \'lang_code\': \'kw\', \'sense\': \'used to make adjectives\', \'word\': \'-blek\', \'_dis1\': \'53 47\'}, {\'lang\': \'Cornish\', \'code\': \'kw\', \'lang_code\': \'kw\', \'sense\': \'used to make adjectives\', \'word\': \'-flek\', \'_dis1\': \'53 47\'}, {\'lang\': \'Dutch\', \'code\': \'nl\', \'lang_code\': \'nl\', \'sense\': \'used to make adjectives\', \'word\': \'-voudig\', \'_dis1\': \'53 47\'}, {\'lang\': \'Esperanto\', \'code\': \'eo\', \'lang_c

<class 'str'>
{'lang': 'Armenian', 'code': 'hy', 'lang_code': 'hy', 'sense': 'used to make adjectives', 'roman': '-patik', 'word': '-պատիկ', '_dis1': '53 47'}
{'lang': 'Chinese Mandarin', 'code': 'cmn', 'lang_code': 'cmn', 'sense': 'used to make adjectives', 'roman': 'bèi', 'word': '倍', '_dis1': '53 47'}
{'lang': 'Cornish', 'code': 'kw', 'lang_code': 'kw', 'sense': 'used to make adjectives', 'word': '-plek', '_dis1': '53 47'}
{'lang': 'Cornish', 'code': 'kw', 'lang_code': 'kw', 'sense': 'used to make adjectives', 'word': '-blek', '_dis1': '53 47'}
{'lang': 'Cornish', 'code': 'kw', 'lang_code': 'kw', 'sense': 'used to make adjectives', 'word': '-flek', '_dis1': '53 47'}
{'lang': 'Dutch', 'code': 'nl', 'lang_code': 'nl', 'sense': 'used to make adjectives', 'word': '-voudig', '_dis1': '53 47'}
{'lang': 'Esperanto', 'code': 'eo', 'lang_code': 'eo', 'sense': 'used to make adjectives', 'word': '-obla', '_dis1': '53 47'}
{'lang': 'Finnish', 'code': 'fi', 'lang_code': 'fi', 'sense': 'used to m

"[{'lang': 'Catalan', 'code': 'ca', 'lang_code': 'ca', 'sense': 'something related to writing etc.', 'tags': ['masculine'], 'word': '-graf', '_dis1': '33 27 13 17 9'}, {'lang': 'Czech', 'code': 'cs', 'lang_code': 'cs', 'sense': 'something related to writing etc.', 'tags': ['masculine'], 'word': '-graf', '_dis1': '33 27 13 17 9'}, {'lang': 'Danish', 'code': 'da', 'lang_code': 'da', 'sense': 'something related to writing etc.', 'tags': ['common-gender'], 'word': '-graf', '_dis1': '33 27 13 17 9'}, {'lang': 'Finnish', 'code': 'fi', 'lang_code': 'fi', 'sense': 'something related to writing etc.', 'word': '-grafi', '_dis1': '33 27 13 17 9'}, {'lang': 'French', 'code': 'fr', 'lang_code': 'fr', 'sense': 'something related to writing etc.', 'word': '-graphe', '_dis1': '33 27 13 17 9'}, {'lang': 'German', 'code': 'de', 'lang_code': 'de', 'sense': 'something related to writing etc.', 'word': '-graph', '_dis1': '33 27 13 17 9'}, {'lang': 'Hungarian', 'code': 'hu', 'lang_code': 'hu', 'sense': 'som

<class 'str'>
{'lang': 'Catalan', 'code': 'ca', 'lang_code': 'ca', 'sense': 'something related to writing etc.', 'tags': ['masculine'], 'word': '-graf', '_dis1': '33 27 13 17 9'}
{'lang': 'Czech', 'code': 'cs', 'lang_code': 'cs', 'sense': 'something related to writing etc.', 'tags': ['masculine'], 'word': '-graf', '_dis1': '33 27 13 17 9'}
{'lang': 'Danish', 'code': 'da', 'lang_code': 'da', 'sense': 'something related to writing etc.', 'tags': ['common-gender'], 'word': '-graf', '_dis1': '33 27 13 17 9'}
{'lang': 'Finnish', 'code': 'fi', 'lang_code': 'fi', 'sense': 'something related to writing etc.', 'word': '-grafi', '_dis1': '33 27 13 17 9'}
{'lang': 'French', 'code': 'fr', 'lang_code': 'fr', 'sense': 'something related to writing etc.', 'word': '-graphe', '_dis1': '33 27 13 17 9'}
{'lang': 'German', 'code': 'de', 'lang_code': 'de', 'sense': 'something related to writing etc.', 'word': '-graph', '_dis1': '33 27 13 17 9'}
{'lang': 'Hungarian', 'code': 'hu', 'lang_code': 'hu', 'sense'

"[{'lang': 'Chinese Mandarin', 'code': 'zh', 'lang_code': 'zh', 'sense': 'act of doing something', 'note': '(with no suffix, verbs can be used as nouns)', '_dis1': '50 50 0'}, {'lang': 'Danish', 'code': 'da', 'lang_code': 'da', 'sense': 'act of doing something', 'tags': ['common-gender'], 'word': '-ing', '_dis1': '50 50 0'}, {'lang': 'Dutch', 'code': 'nl', 'lang_code': 'nl', 'sense': 'act of doing something', 'tags': ['feminine'], 'note': 'nominalization of the infinitive', 'word': '-ing', '_dis1': '50 50 0'}, {'lang': 'Faroese', 'code': 'fo', 'lang_code': 'fo', 'sense': 'act of doing something', 'tags': ['feminine'], 'word': '-ing', '_dis1': '50 50 0'}, {'lang': 'French', 'code': 'fr', 'lang_code': 'fr', 'sense': 'act of doing something', 'tags': ['masculine'], 'word': '-age', '_dis1': '50 50 0'}, {'lang': 'French', 'code': 'fr', 'lang_code': 'fr', 'sense': 'act of doing something', 'tags': ['feminine'], 'word': '-tion', '_dis1': '50 50 0'}, {'lang': 'French', 'code': 'fr', 'lang_code

<class 'str'>
{'lang': 'Chinese Mandarin', 'code': 'zh', 'lang_code': 'zh', 'sense': 'act of doing something', 'note': '(with no suffix, verbs can be used as nouns)', '_dis1': '50 50 0'}
{'lang': 'Danish', 'code': 'da', 'lang_code': 'da', 'sense': 'act of doing something', 'tags': ['common-gender'], 'word': '-ing', '_dis1': '50 50 0'}
{'lang': 'Dutch', 'code': 'nl', 'lang_code': 'nl', 'sense': 'act of doing something', 'tags': ['feminine'], 'note': 'nominalization of the infinitive', 'word': '-ing', '_dis1': '50 50 0'}
{'lang': 'Faroese', 'code': 'fo', 'lang_code': 'fo', 'sense': 'act of doing something', 'tags': ['feminine'], 'word': '-ing', '_dis1': '50 50 0'}
{'lang': 'French', 'code': 'fr', 'lang_code': 'fr', 'sense': 'act of doing something', 'tags': ['masculine'], 'word': '-age', '_dis1': '50 50 0'}
{'lang': 'French', 'code': 'fr', 'lang_code': 'fr', 'sense': 'act of doing something', 'tags': ['feminine'], 'word': '-tion', '_dis1': '50 50 0'}
{'lang': 'French', 'code': 'fr', 'lan

In [177]:
eep_translations = eep_translations.loc[:, ['EEP_id', 'word', 'pos', 'lang_code', 'translations', 'senses', 'dsrc']]

In [None]:
def get_top_translations(df, wktlang='en', wrdlang='en', translation_lang='nl', word_id_col='EEP_id'):
    df = df[~df['translations'].isna()]
    translations = []
    key_list = []
    for i, row in df.iterrows():
        row_translations = row['translations']
        row_info = [row['word'], row['pos'], row['lang_code'], row['dsrc']]
        if type(row_translations) == str:
            row_translations = safe_eval(row_translations)
            if type(row_translations) == list:
                for j in row_translations:
                    if j.get('lang') == 'Dutch':
                        
                        kys = j.keys()
                        for k in kys:
                            if k not in key_list:
                                key_list.append(k) 
                        row_info.append(j.get('word', None))
                        row_info.append(j.get('sense', None))
                        row_info.append(j.get('tags', None))
                        row_info.append(j)
                        translations.append(row_info)
        
    return translations

In [193]:
t = get_top_translations(eep_translations)
print(len(t))

[{'lang': 'Finnish', 'code': 'fi', 'lang_code': 'fi', 'sense': 'expressing capacity or worthiness in a passive sense', 'word': '-tava', '_dis1': '20 22 20 20 20'}, {'lang': 'Finnish', 'code': 'fi', 'lang_code': 'fi', 'sense': 'expressing capacity or worthiness in a passive sense', 'word': '-tävä', '_dis1': '20 22 20 20 20'}, {'lang': 'German', 'code': 'de', 'lang_code': 'de', 'sense': 'expressing capacity or worthiness in a passive sense', 'word': '-wert', '_dis1': '20 22 20 20 20'}, {'lang': 'Swedish', 'code': 'sv', 'lang_code': 'sv', 'sense': 'expressing capacity or worthiness in a passive sense', 'word': '-värd', '_dis1': '20 22 20 20 20'}]
[{'lang': 'French', 'code': 'fr', 'lang_code': 'fr', 'sense': 'man, male', 'tags': ['feminine'], 'word': '-andrie', '_dis1': '33 37 30'}, {'lang': 'Portuguese', 'code': 'pt', 'lang_code': 'pt', 'sense': 'man, male', 'tags': ['feminine'], 'word': '-andria', '_dis1': '33 37 30'}]
[{'lang': 'Arabic', 'code': 'ar', 'lang_code': 'ar', 'sense': 'of or 

In [151]:
def get_translations(df, wktlang='en', wrdlang='en', translation_lang='nl', word_id_col='EEP_id'):
    translations = []
    df['translations'].apply(lambda x: safe_eval(x))
    trans_col_name = f"{translation_lang}_translation"
    #display(df.head())
    for i, row in df.iterrows():
        tr = row['translations']
        if type(tr) == list:
            if len(tr) > 0:
                if type(tr[0]) == dict:
                    for j in range(len(tr)):
                        if tr[j].get('code') == translation_lang:
                            translations.append([i, row[word_id_col], row['word'], row['pos'], row['lang_code'], tr[j], row['dsrc']])
    return translations
    #df[trans_col_name] = 

In [152]:
translations = get_translations(eep_translations)

In [None]:
rows = []
rows_with_more_keys = []
translation_id = 0
for i in range(len(translations)):
    row = translations[i]
    rows.append({'id': translation_id, 'word':row[2], 'pos': row[3], 'lang_code': row[4], 'nl_translation': row[5].get('word', None), 'nl_sense': row[5].get('sense', None), 'nl_tags': row[5].get('tags', None), 'dsrc': row[-1]})
    translation_id += 1

In [156]:
nl_translations_df = pd.DataFrame(rows)

In [157]:
nl_translations_df

Unnamed: 0,id,word,pos,lang_code,nl_translation,nl_sense,nl_tags,dsrc
0,0,-er,suffix,en,-er,(used to form agent nouns) person or thing tha...,,EEP_src
1,1,-fold,suffix,en,-voudig,used to make adjectives,,EEP_src
2,2,-fold,suffix,en,-voudig,used to make adverbs,,EEP_src
3,3,-ing,suffix,en,-ing,act of doing something,[feminine],EEP_src
4,4,-ist,suffix,en,-ist,"One who follows a particular ideology, doctrin...",,EEP_src
...,...,...,...,...,...,...,...,...
2515,2515,young,adj,en,jong,as if young,,EEP_src
2516,2516,young,adj,en,jeugdig,as if young,,EEP_src
2517,2517,young,adj,en,jong,belonging in the early part of life,,EEP_src
2518,2518,zeal,noun,en,ijver,fervour or devotion,[masculine],EEP_src


In [153]:
print(len(translations))
display(translations[100])

2520


[142089,
 140496,
 'academy',
 'noun',
 'en',
 {'lang': 'Dutch',
  'code': 'nl',
  'lang_code': 'nl',
  'sense': 'seminary',
  'tags': ['feminine'],
  'word': 'academie',
  '_dis1': '12 9 18 7 8 4 14 7 20'},
 'EEP_src']

In [None]:
t0 = eep_translations.loc[eep_translations['EEP_id'] == 300, 'translations'].values
display(t0.values)
display(len(t0.values[0]))
display(type(t0.values[0][0]))
eep_translations.head()

array([list([{'lang': 'Finnish', 'code': 'fi', 'lang_code': 'fi', 'sense': 'expressing capacity or worthiness in a passive sense', 'word': '-tava', '_dis1': '20 22 20 20 20'}, {'lang': 'Finnish', 'code': 'fi', 'lang_code': 'fi', 'sense': 'expressing capacity or worthiness in a passive sense', 'word': '-tävä', '_dis1': '20 22 20 20 20'}, {'lang': 'German', 'code': 'de', 'lang_code': 'de', 'sense': 'expressing capacity or worthiness in a passive sense', 'word': '-wert', '_dis1': '20 22 20 20 20'}, {'lang': 'Swedish', 'code': 'sv', 'lang_code': 'sv', 'sense': 'expressing capacity or worthiness in a passive sense', 'word': '-värd', '_dis1': '20 22 20 20 20'}])],
      dtype=object)

4

dict

Unnamed: 0,EEP_id,word,pos,lang_code,translations,senses,dsrc
310,300,-able,suffix,en,"[{'lang': 'Finnish', 'code': 'fi', 'lang_code'...",[{'examples': [{'text': 'movable: able to be m...,EEP_src
347,337,-andry,suffix,en,"[{'lang': 'French', 'code': 'fr', 'lang_code':...",[{'examples': [{'text': 'mono- + -andry → mona...,EEP_src
363,352,-ary,suffix,en,"[{'lang': 'Arabic', 'code': 'ar', 'lang_code':...",[{'examples': [{'text': 'devolution + -ary → d...,EEP_src
371,359,-ate,suffix,en,"[{'lang': 'French', 'code': 'fr', 'lang_code':...",[{'examples': [{'text': 'affiliate — “a person...,EEP_src
389,375,-bility,suffix,en,"[{'lang': 'Catalan', 'code': 'ca', 'lang_code'...","[{'links': [['-ability', '-ability#English']],...",EEP_src


In [104]:
get_translations(eep_translations)

     EEP_id     word     pos lang_code  \
310     300    -able  suffix        en   
347     337   -andry  suffix        en   
363     352     -ary  suffix        en   
371     359     -ate  suffix        en   
389     375  -bility  suffix        en   

                                          translations  \
310  [{'lang': 'Finnish', 'code': 'fi', 'lang_code'...   
347  [{'lang': 'French', 'code': 'fr', 'lang_code':...   
363  [{'lang': 'Arabic', 'code': 'ar', 'lang_code':...   
371  [{'lang': 'French', 'code': 'fr', 'lang_code':...   
389  [{'lang': 'Catalan', 'code': 'ca', 'lang_code'...   

                                                senses     dsrc  
310  [{'examples': [{'text': 'movable: able to be m...  EEP_src  
347  [{'examples': [{'text': 'mono- + -andry → mona...  EEP_src  
363  [{'examples': [{'text': 'devolution + -ary → d...  EEP_src  
371  [{'examples': [{'text': 'affiliate — “a person...  EEP_src  
389  [{'links': [['-ability', '-ability#English']],...  EEP_src  
