# TR Wiktionary Etymology Parse

We parse the TR Wiktionary corpus and extract the etymology-related fields. Note the following key issues:

- Many words are missing etymologies
- There are obvious errors in the source
- We only extract etymology from Turkish words (i.e. if the exact same word exists in another language, we discard the etymology related to that language).


In [26]:
import xmltodict

import xml, sqlite3 as sql
from xml.dom import minidom
from xml.etree import ElementTree as ET

import re
import pprint
import pandas as pd

import pickle

In [3]:
# Parse the Wiktionary XML dump into an ElementTree.
# The path is relative to the notebook's working directory.
parsed_xml = ET.parse("../../trwiktionary-20160305-pages-articles-multistream.xml")

In [4]:
# Top-level element of the parsed dump; the <page> lookup starts from it.
root = parsed_xml.getroot()

In [5]:
# Collect the <page> elements that are direct children of the root.
# NOTE(review): the bare "page" tag only matches when the elements carry no
# XML namespace -- confirm against the dump file that is being loaded.
p = root.findall("page") # grab the pages

In [6]:
# Map every page title to a dict holding the raw page text under "text".
# Pages whose text is None are kept, but with an empty dict.
texts = {}
for page_el in p:
    page_title = page_el.find("title").text
    raw_text = page_el.find("revision/text").text
    texts[page_title] = {} if raw_text is None else {"text": raw_text}

Cycle through all of the text in the dictionary, extract the language field(s), flag the Turkish entries, and pull out the etymology sources:

In [7]:
# Patterns are compiled once instead of being rebuilt for every page.
LANG_RE = re.compile(r"\{\{Dil\|(\w+)\}\}", flags=re.U)               # language markers
ETYM_LINE_RE = re.compile(r"\{\{K\wken\}\}\n:(.*)", flags=re.U)        # line following the etymology header
ETYM_SOURCE_RE = re.compile(r"\{\{k\w*\|([\w|\s]*)\}\}", flags=re.U)   # source templates on that line
LANG_MARK = u"{{Dil|"                                                  # start of any language marker

# Annotate each page dict in place with:
#   "lang"  - every language name found in a {{Dil|...}} marker, in page order
#   "is_tr" - True when one of those languages is Türkçe
#   "etyms" - source names from the first etymology line (only set when an
#             etymology header is found)
for title, entry in texts.items():
    page_text = entry.get("text")
    if page_text is None:
        continue  # page has no text, nothing to annotate

    lang_marks = list(LANG_RE.finditer(page_text))
    langs = [mark.group(1) for mark in lang_marks]
    entry["lang"] = langs
    entry["is_tr"] = u"Türkçe" in langs

    # When the page has a Turkish section, look for the etymology only inside
    # it (from the Türkçe marker up to the next language marker). Searching
    # the whole page would let a multi-language page hand the Turkish word
    # the etymology of another language's entry. Pages without a Turkish
    # section are still searched in full, as before.
    section = page_text
    for mark in lang_marks:
        if mark.group(1) == u"Türkçe":
            next_mark = page_text.find(LANG_MARK, mark.end())
            if next_mark == -1:
                section = page_text[mark.end():]
            else:
                section = page_text[mark.end():next_mark]
            break

    etym_line = ETYM_LINE_RE.search(section)
    if etym_line is not None:
        entry["etyms"] = ETYM_SOURCE_RE.findall(etym_line.group(1))


In [8]:
# One row per page title; the keys of the per-page dicts become the columns.
df = pd.DataFrame.from_dict(texts, orient="index")

In [9]:
# Keep only the pages flagged as Turkish. Pages without any text never get an
# "is_tr" value, so the column can hold NaN next to True/False; comparing with
# True turns those rows into False instead of feeding NaN into the row mask.
dftr = df[df["is_tr"].eq(True)]

In [10]:
# Preview the first rows only instead of rendering the whole frame.
dftr.head(10)

Unnamed: 0,lang,text,is_tr,etyms
-a,"[Türkçe, Macarca]",=={{Dil|Türkçe}}==\n==={{Söztürü|Ek|Türkçe}}==...,True,
-acak,[Türkçe],{{Bakınız|açak|âcâk}}\n\n=={{Dil|Türkçe}}==\n=...,True,
-ak,[Türkçe],{{Bakınız|ak|Ak|âk|AK}}\n\n=={{Dil|Türkçe}}==\...,True,
-aki,[Türkçe],{{bakınız|aki}}\n=={{Dil|Türkçe}}==\n==={{Sözt...,True,
-alım,[Türkçe],=={{Dil|Türkçe}}==\n==={{Söztürü|Ek|Türkçe}}==...,True,
-amaç,[Türkçe],=={{Dil|Türkçe}}==\n==={{Söztürü|Ek|Türkçe}}==...,True,
-anç,[Türkçe],=={{Dil|Türkçe}}==\n==={{Söztürü|Ek|Türkçe}}==...,True,[Eski Türkçe]
-ar,"[Türkçe, İngilizce]",=={{Dil|Türkçe}}==\n==={{Söztürü|Ek|Türkçe}}==...,True,
-ağan,[Türkçe],=={{Dil|Türkçe}}==\n==={{Söztürü|Ek|Türkçe}}==...,True,
-ca,[Türkçe],=={{Dil|Türkçe}}==\n==={{Söztürü|Ek|Türkçe}}==...,True,[Türkçe]


In [24]:
# Build a word -> first listed etymology source lookup from the Turkish rows
# that have an "etyms" value.
dftr_etym = dftr[dftr["etyms"].notnull()]

etym_lookup = {}
for word, row in dftr_etym.iterrows():
    sources = row["etyms"]
    if len(sources) == 0:
        continue  # an empty list has no source to record
    etym_lookup[word] = sources[0]

In [29]:
# Persist the word -> etymology lookup. Pickle produces bytes, so the file is
# opened in binary mode, and the context manager closes the handle once the
# dump is written.
with open("./trwiktionary_etym_lookup.pkl", "wb") as out_file:
    pickle.dump(etym_lookup, out_file)