In [1]:
import re
import pandas as pd
import numpy as np

In [201]:
# Source-column -> output-column renames for the main items table.
# Insertion order follows the original mapping.
item_cols = dict(
    itemname='item_name',
    wordroot='item_root',
    creator='user',
    time='created_date',
    isroot='is_root',
    meaning='meaning',
    meaningEn='meaning_en',
    mainMeaningWordclass='main_meaning_word_class',
    wordclass='word_class',
    culture='cultural_notes',
    focus='focus',
    idom='phrase',
    idomEn='phrase_en',
    idomCh='phrase_ch',
    indexPrefix='char_strokes_first',
    cindex='char_strokes_all',
    occurrence='frequency',
    remark='grammar_notes',
    sentence='sentence',
    sentenceCh='sentence_ch',
    sentenceEn='sentence_en',
    source='source',
    tagging='tag',
    toda='toda',
    todar='toda_root',
    truku='truku',
    trukur='truku_root',
    variant='variant',
)

# Renames for the extra-meaning table (adds the meaning / sentence counters).
meaning_cols = dict(
    itemname='item_name',
    meaningno='meaning_no',
    sentenceno='sentence_no',
    meaning='meaning',
    meaningEn='meaning_en',
    sentence='sentence',
    sentenceCh='sentence_ch',
    sentenceEn='sentence_en',
    wordclass='word_class',
)

# Renames for the extra-phrase table.
phrase_cols = dict(
    itemname='item_name',
    idom='phrase',
    idomCh='phrase_ch',
    idomEn='phrase_en',
)

In [202]:
def _contains_digit(s):
    return any(c.isdigit() for c in s)

def _split_item_name(row):
    s = row['item_name']
    if _contains_digit(s):
        m = re.match(r'([A-Za-z\-\s]+)_?(\d)?', s)
        headword, sense = m.group(1).strip(), m.group(2).strip()
    else:
        headword = s.strip()
        sense = 1
    row['headword'] = headword
    row['headword_sense_no'] = sense
    return row

In [203]:
# Read only the mapped columns, rename them, drop exact duplicate rows and
# derive headword / headword_sense_no for every remaining row.
items = (
    pd.read_csv('item_utf8.csv', usecols=item_cols.keys())
    .rename(columns=item_cols)
    .drop_duplicates()
    .apply(_split_item_name, axis=1)
)
items.to_csv('seediq_items_updated.csv', index=False)

In [130]:
# Distinct headwords that appear on more than one row of `items`.
is_repeated_headword = items.duplicated('headword', keep=False)
multi_meaning = items.loc[is_repeated_headword, 'headword'].unique()

In [204]:
extra_meaning = (
    pd.read_csv('extra_meaning_utf8.csv', usecols=meaning_cols.keys())
    .rename(columns=meaning_cols)
)

# Rows with both counters at zero are duplicates from the items df; drop them.
is_item_duplicate = (extra_meaning['sentence_no'] == 0) & (extra_meaning['meaning_no'] == 0)
idx = extra_meaning.index[is_item_duplicate]
extra_meaning = extra_meaning.drop(idx).apply(_split_item_name, axis=1)

extra_meaning.to_csv('seediq_extra_meaning_updated.csv', index=False)

In [131]:
# Extra-meaning rows whose headword is one of the repeated headwords in `items`.
in_multi_meaning = extra_meaning['headword'].isin(multi_meaning)
extra_meaning.loc[in_multi_meaning]

Unnamed: 0,item_name,meaning_no,sentence_no,meaning,meaning_en,sentence,sentence_ch,sentence_en,word_class,headword,headword_sense_no
6,plabu,0,1,雲；霧；竹蜂,,Skiya karac kanna ka plabu.,雲在空中飛翔。,Clouds are flying in the sky.,名詞,plabu,1
12,plabu,0,2,雲；霧；竹蜂,,"Uka plabu ka karac de, malu riyung karac denu.",沒有雲的天空會是好天氣。,A sky without clouds predicts good weather.,名詞,plabu,1
19,dara1,1,0,血液,blood,Ini pntena kanna dara knkingal seediq.,人的血型並非全都相同。,People's blood types are not all the same.,名詞,dara,1
59,baso 1,1,0,男性的名字,male name,Baso ngayan na tama mu rudan.,我祖父的名字叫Baso。,My grandpa's name is Baso.,名詞,baso,1
60,baso 1,2,0,西榖米,sago,Hai tmekan baso ga.,去樁打baso米。,Go pound the sago.,名詞,baso,1
62,cmehaq1,0,1,舔,lick,"Cmehaq baga na mekan seediq ga, ye ado malu ri...",那個人連手指頭都在舔，他可能在吃很好吃的東西。,That guy is even licking his fingers; he's pro...,,cmehaq,1
63,cmehaq1,1,0,交配,to mate,Ga cmehaq tama rudux ga.,這公雞正在交配。,The rooster is mating.,,cmehaq,1
190,na1,0,3,屬格(格位標記),genitive case,"Qtehur hari bubu na Pawan ma, rqeling hari ka ...",Pawan的媽媽稍為胖一點，而Walis的媽媽比較瘦。,,其它,na,1
195,qburung,1,0,小鐮刀,sickle,Qburung biciq ge daun daha qmburung sudu ma wa...,小鐮刀是用來割草和地瓜葉的。,Small sickles are used to weed grass and yam l...,名詞,qburung,1
235,na1,0,1,屬格(格位標記),genitive case,"Na dheya ka kiya, nii ka nita.",那是他們的，這才是我們的。,,其它,na,1


In [167]:
# Rows to re-check by hand: repeated headword, a non-zero meaning_no,
# or a missing meaning.
double_check = extra_meaning.query("headword in @multi_meaning | meaning_no != 0 | meaning.isnull()")
double_check.to_csv('seediq_double_check.csv')

In [154]:
# Headword column of the extra-meaning frame, used for membership filtering.
extra_meaning_words = extra_meaning.loc[:, 'headword']

In [159]:
# Items whose headword also occurs in extra_meaning: show headword and meaning.
items_with_extra = items.query("headword in @extra_meaning_words")
items_with_extra.loc[:, ['headword', 'meaning']]

Unnamed: 0,headword,meaning
1,bale,真的；真正的；真實的
2,baki,祖父；岳父；男性長者
6,tyaqung,烏鴉
9,angal,拿取
19,rqenux,水鹿
28,gaga,在某處
34,keeman,晚上；夜晚
35,kadi,網子
36,laqi,小孩；孩童
37,lhengo,岩洞


In [136]:
# Load the phrase columns, rename them and derive headword / headword_sense_no.
extra_phrase = (
    pd.read_csv('extra_idom_utf8.csv', usecols=phrase_cols.keys())
    .rename(columns=phrase_cols)
    .apply(_split_item_name, axis=1)
)
extra_phrase.to_csv('seediq_extra_phrases.csv', index=False)

In [206]:
# Export the phrases whose headword is one of the repeated headwords in `items`.
phrase_in_multi_meaning = extra_phrase['headword'].isin(multi_meaning)
extra_phrase.loc[phrase_in_multi_meaning].to_csv('check_extra_phrases.csv')

In [205]:
# Number of rows in the extra-phrase frame.
extra_phrase.shape[0]

149

In [39]:
item_set = set(items['item_name'])
meaning_set = set(extra_meaning['item_name'])

# item df doesn't contain all of extra meaning: show the item_names that occur
# in extra_meaning but not in items. (The earlier bare issuperset() call was
# dropped: only a cell's last expression is displayed, so its result was
# computed and discarded.)
meaning_set - item_set

{'ira', 'tn-breyan'}

'ira' isn't found in items at all. 'tn-breyan' does appear in items, but spelled 'tnbreyan' and with different content.

In [73]:
# Show the extra-meaning row(s) stored under the hyphenated spelling.
shown_cols = ['item_name', 'meaning_no', 'sentence_no', 'meaning', 'sentence', 'sentence_ch']
extra_meaning.query("item_name == 'tn-breyan'").loc[:, shown_cols]

Unnamed: 0,item_name,meaning_no,sentence_no,meaning,sentence,sentence_ch
240,tn-breyan,0,1,子宮,Saun tloong laqi menaq nbuyas bubu kesun tn-br...,子宮是孩子在媽媽腹中的搖籃。


In [56]:
# Show the items row(s) stored under the unhyphenated spelling, for comparison.
shown_cols = ['item_name', 'meaning', 'sentence', 'sentence_ch']
items.query("item_name == 'tnbreyan'").loc[:, shown_cols]

Unnamed: 0,item_name,meaning,sentence,sentence_ch
4157,tnbreyan,??詞根: 胎盤,"Tnbreyan dapa si bnhangan mu yaku, ini ku qbah...",我所聽過的是牛的胎盤，人的胎盤我沒聽說過。


In [60]:
# Extra-meaning rows whose item_name contains the character '1'.
name_has_one = extra_meaning['item_name'].str.contains('1')
extra_meaning.loc[name_has_one]

Unnamed: 0,item_name,meaning_no,sentence_no,meaning,meaning_en,sentence,sentence_ch,sentence_en,word_class
19,dara1,1,0,血液,blood,Ini pntena kanna dara knkingal seediq.,人的血型並非全都相同。,People's blood types are not all the same.,名詞
59,baso 1,1,0,男性的名字,male name,Baso ngayan na tama mu rudan.,我祖父的名字叫Baso。,My grandpa's name is Baso.,名詞
60,baso 1,2,0,西榖米,sago,Hai tmekan baso ga.,去樁打baso米。,Go pound the sago.,名詞
62,cmehaq1,0,1,舔,lick,"Cmehaq baga na mekan seediq ga, ye ado malu ri...",那個人連手指頭都在舔，他可能在吃很好吃的東西。,That guy is even licking his fingers; he's pro...,
63,cmehaq1,1,0,交配,to mate,Ga cmehaq tama rudux ga.,這公雞正在交配。,The rooster is mating.,
190,na1,0,3,屬格(格位標記),genitive case,"Qtehur hari bubu na Pawan ma, rqeling hari ka ...",Pawan的媽媽稍為胖一點，而Walis的媽媽比較瘦。,,其它
235,na1,0,1,屬格(格位標記),genitive case,"Na dheya ka kiya, nii ka nita.",那是他們的，這才是我們的。,,其它
276,daha1,0,1,二(數詞),two,Maxan daha ka mtswai mu.,我有十二位兄弟姐妹。,I have twelve siblings.,名詞
300,bukung 1,1,0,駝背,,"Niqan naq seediq mbukung druri, ani so kiya we...","人也有(天生)駝背，即使如此我們不可以叫人家為""駝背者""。",,動詞
309,na1,0,2,屬格(格位標記),genitive case,Tama mu ka ga mheyu berah na bubu su.,站在你媽媽前面的是我爸爸。,,其它


In [213]:
# Left merge keeps every extra_meaning row. Its own meaning becomes
# meaning_small and the meaning looked up in items becomes meaning_big.
combined = pd.merge(
    extra_meaning,
    items[['item_name', 'meaning']],
    how='left',
    on='item_name',
    suffixes=('_small', '_big'),
)
# Keep rows needing review: differing meanings, repeated headwords, a missing
# extra meaning, or the two item_names known to be absent from items.
filtered_combined = combined.query("meaning_small != meaning_big | headword in @multi_meaning | meaning_small.isnull() | item_name == 'ira' | item_name == 'tn-breyan'")
filtered_combined.shape[0]

333

In [191]:
# Inspect the 'ira' row(s) kept by the filter. 'ira' has no counterpart in
# items, so the left merge leaves meaning_big empty for it.
filtered_combined.query("item_name == 'ira'")

Unnamed: 0,item_name,meaning_no,sentence_no,meaning_small,meaning_en,sentence,sentence_ch,sentence_en,word_class,headword,headword_sense_no,meaning_big
173,ira,0,1,杜虹花,,Malu slmeun icux duri ka ira nii.,這個杜虹花也很適合做陷獵板。,,名詞,ira,1,


In [195]:
# List the column labels of the filtered merge result.
filtered_combined.columns

Index(['item_name', 'meaning_no', 'sentence_no', 'meaning_small', 'meaning_en',
       'sentence', 'sentence_ch', 'sentence_en', 'word_class', 'headword',
       'headword_sense_no', 'meaning_big'],
      dtype='object')

In [210]:
# Keep the review columns, with the two meaning columns side by side
# (meaning_en and word_class are left out).
review_cols = [
    'item_name',
    'meaning_small',
    'meaning_big',
    'meaning_no',
    'sentence_no',
    'sentence',
    'sentence_ch',
    'sentence_en',
    'headword',
    'headword_sense_no',
]
filtered_combined = filtered_combined.loc[:, review_cols]

In [211]:
# Export the review subset. No index=False is passed here, so the frame's
# index is written as the first CSV column.
filtered_combined.to_csv('check_extra_meanings.csv')

In [2]:
# NOTE: this rebinds `items` to a different CSV. From here on `items` no longer
# refers to the frame built from item_utf8.csv in the cells above.
# Presumably this is a revised copy of seediq_items_updated.csv. TODO confirm provenance.
items = pd.read_csv('seediq_items_updated-20190613-sung.csv')

In [10]:
# Meanings of the rows selected by the 'hengak' membership query.
hengak_rows = items.query("'hengak' in item_name")
hengak_rows.loc[:, 'meaning']

1558    (例句)氣息；個人的聲音、聲頻
4069                 心臟
Name: meaning, dtype: object

In [11]:
# Display the reloaded items frame for a visual check.
items

Unnamed: 0,item_name,toda,truku,is_root,item_root,focus,toda_root,truku_root,word_class,main_meaning_word_class,...,source,grammar_notes,cultural_notes,variant,user,created_date,char_strokes_all,char_strokes_first,tag,frequency
0,cebu,cbu,,yes,cebu,,,,動詞,動詞,...,qada,,,,dakispawan,2009-08-17,10射0；18擲10射0；18,十劃/射,狩獵,2
1,bale,balay,balay,yes,bale,,,,其它,其它,...,mberiq,平日裡 bale 常簡略為ba，如 bale ba，就是bale bale 真實的、是事實。...,,ba,dakispawan,2009-08-17,10真08的0；10真05正0,十劃/真,,178
2,baki,baki,baki,yes,baki,,,,名詞,名詞,...,pai,,,,dakispawan,2009-08-17,10祖04父0；08岳04父0,十劃/祖,親屬稱謂,19
3,cakus,cakus,cakus,yes,cakus,,,,名詞,名詞,...,,,Mukan pkaguh 係指採樟的漢人，Mukan 是漢人， pkaguh 指「刮碎」的動...,,dakispawan,2009-08-17,15樟16樹,十五劃/樟,植物,11
4,aguh,iyah,aguh,yes,aguh,,,,動詞,動詞,...,eyah,aguh 與 eyah 是同義詞，其詞綴雖各自衍生，卻相輔相成。,,,dakispawan,2009-08-17,05叫02人0)13過08來,五劃/叫,,8
5,qdiro,kdiraw,qjiraw,yes,qdiro,,,,名詞,名詞,...,rodux,,,,monapawan,2009-09-13,06老24鷹,六劃/老,動物,1
6,tyaqung,tyaqung,ciyaqun,yes,tyaqung,,,,名詞,名詞,...,pucaq,,,tiyaqung,monapawan,2009-09-13,10烏15鴉,十劃/烏,動物,2
7,wili,wili,wili,yes,wili,,,,名詞,名詞,...,trmediq,,,,monapawan,2009-09-13,04水12蛭,四劃/水,動物,3
8,rapic,rapic,rapic,yes,rapic,,,,名詞,名詞,...,brihuc,,,,monapawan,2009-09-12,09飛13鼠,九劃/飛,動物,23
9,angal,angal,angal,yes,angal,,,,動詞,動詞,...,laxi,,,angan,dakispawan,2009-08-17,10拿08取,十劃/拿,,17
