In [2]:
import pandas as pd

In [3]:
# Raw header in the items CSV -> normalised snake_case column name.
# The keys are also what gets passed as `usecols` when the items CSV is read,
# so every key has to match the raw header exactly as spelled there
# (including 'idom').
item_cols = {
    'itemname': 'item_name',
    'wordroot': 'item_root',
    'creator': 'user',
    'time': 'created_date',
    'isroot': 'is_root',
    'meaning': 'meaning',
    'meaningEn': 'meaning_en',
    'mainMeaningWordclass': 'main_meaning_word_class',
    'wordclass': 'word_class',
    'culture': 'cultural_notes',
    'focus': 'focus',
    'idom': 'phrase',
    'idomEn': 'phrase_en',
    'idomCh': 'phrase_ch',
    'indexPrefix': 'char_strokes_first',
    'cindex': 'char_strokes_all',
    'occurrence': 'frequency',
    'remark': 'grammar_notes',
    'sentence': 'sentence',
    'sentenceCh': 'sentence_ch',
    'sentenceEn': 'sentence_en',
    'source': 'source',
    'tagging': 'tag',
    'toda': 'toda',
    'todar': 'toda_root',
    'truku': 'truku',
    'trukur': 'truku_root',
    'variant': 'variant',
}

# Raw header in the extra-meaning CSV -> normalised snake_case column name.
# The keys are also the `usecols` selection for that file, so they must match
# the raw headers exactly.
meaning_cols = {
    'itemname': 'item_name',
    'meaningno': 'meaning_no',
    'sentenceno': 'sentence_no',
    'meaning': 'meaning',
    'meaningEn': 'meaning_en',
    'sentence': 'sentence',
    'sentenceCh': 'sentence_ch',
    'sentenceEn': 'sentence_en',
    'wordclass': 'word_class'
}

# Raw header in the extra-phrase CSV -> normalised snake_case column name.
# 'idom' is kept as spelled because the keys are used verbatim as `usecols`.
phrase_cols = {
    'itemname': 'item_name',
    'idom': 'phrase',
    'idomCh': 'phrase_ch',
    'idomEn': 'phrase_en'
}

In [63]:
# Build the cleaned items table in one non-mutating chain:
#   1. read only the source columns listed in `item_cols`,
#   2. rename them to the normalised names,
#   3. drop rows that are exact duplicates across all kept columns.
# The original row labels are kept (no reset_index), so index values shown
# in later outputs still refer to positions in the raw file.
items = (
    pd.read_csv('item_utf8.csv', usecols=list(item_cols))
    .rename(columns=item_cols)
    .drop_duplicates()
)

# Persist the cleaned table; the index is not written.
items.to_csv('seediq_items_updated.csv', index=False)

In [64]:
# Read only the source columns listed in `meaning_cols` and rename them to the
# normalised names, without mutating an intermediate frame in place.
extra_meaning = (
    pd.read_csv('extra_meaning_utf8.csv', usecols=list(meaning_cols))
    .rename(columns=meaning_cols)
)

# These are duplicates from the items df: rows where both sentence_no and
# meaning_no are 0. Inside `query`, `&` binds like `and`, so the expression
# reads as (sentence_no == 0) and (meaning_no == 0).
idx = extra_meaning.query('sentence_no == 0 & meaning_no == 0').index
extra_meaning = extra_meaning.drop(index=idx)

# Persist the remaining rows; the index is not written.
extra_meaning.to_csv('seediq_extra_meaning_updated.csv', index=False)

In [65]:
# Read only the source columns listed in `phrase_cols`, rename them to the
# normalised names (non-mutating chain), and write the result out unchanged
# otherwise; the index is not written.
extra_phrase = (
    pd.read_csv('extra_idom_utf8.csv', usecols=list(phrase_cols))
    .rename(columns=phrase_cols)
)
extra_phrase.to_csv('seediq_extra_phrases.csv', index=False)

In [39]:
# Distinct item names present in each frame.
item_set = set(items['item_name'])
meaning_set = set(extra_meaning['item_name'])

# The items frame does not cover every name used in extra_meaning.
# (This value is evaluated but not displayed; only the last expression of a
# cell is rendered.)
item_set >= meaning_set

# Names that occur in extra_meaning but are absent from items.
meaning_set - item_set

{'ira', 'tn-breyan'}

'ira' is not found in items at all; 'tn-breyan' does appear in items, spelled 'tnbreyan' (no hyphen), but with different content.

In [73]:
# Extra-meaning rows for the hyphenated spelling, limited to the columns
# needed to compare against the items entry.
extra_meaning.loc[
    extra_meaning['item_name'] == 'tn-breyan',
    ['item_name', 'meaning_no', 'sentence_no', 'meaning', 'sentence', 'sentence_ch'],
]

Unnamed: 0,item_name,meaning_no,sentence_no,meaning,sentence,sentence_ch
240,tn-breyan,0,1,子宮,Saun tloong laqi menaq nbuyas bubu kesun tn-br...,子宮是孩子在媽媽腹中的搖籃。


In [56]:
# Items row for the unhyphenated spelling, for side-by-side comparison.
items.loc[
    items['item_name'] == 'tnbreyan',
    ['item_name', 'meaning', 'sentence', 'sentence_ch'],
]

Unnamed: 0,item_name,meaning,sentence,sentence_ch
4157,tnbreyan,??詞根: 胎盤,"Tnbreyan dapa si bnhangan mu yaku, ini ku qbah...",我所聽過的是牛的胎盤，人的胎盤我沒聽說過。


In [60]:
# Extra-meaning rows whose item name contains the digit '1'.
# regex=False: '1' is a literal substring, not a pattern.
# na=False: a missing item name counts as "no match"; without it the mask
# would contain NaN and boolean indexing would raise.
extra_meaning[extra_meaning['item_name'].str.contains('1', regex=False, na=False)]

Unnamed: 0,item_name,meaning_no,sentence_no,meaning,meaning_en,sentence,sentence_ch,sentence_en,word_class
19,dara1,1,0,血液,blood,Ini pntena kanna dara knkingal seediq.,人的血型並非全都相同。,People's blood types are not all the same.,名詞
59,baso 1,1,0,男性的名字,male name,Baso ngayan na tama mu rudan.,我祖父的名字叫Baso。,My grandpa's name is Baso.,名詞
60,baso 1,2,0,西榖米,sago,Hai tmekan baso ga.,去樁打baso米。,Go pound the sago.,名詞
62,cmehaq1,0,1,舔,lick,"Cmehaq baga na mekan seediq ga, ye ado malu ri...",那個人連手指頭都在舔，他可能在吃很好吃的東西。,That guy is even licking his fingers; he's pro...,
63,cmehaq1,1,0,交配,to mate,Ga cmehaq tama rudux ga.,這公雞正在交配。,The rooster is mating.,
190,na1,0,3,屬格(格位標記),genitive case,"Qtehur hari bubu na Pawan ma, rqeling hari ka ...",Pawan的媽媽稍為胖一點，而Walis的媽媽比較瘦。,,其它
235,na1,0,1,屬格(格位標記),genitive case,"Na dheya ka kiya, nii ka nita.",那是他們的，這才是我們的。,,其它
276,daha1,0,1,二(數詞),two,Maxan daha ka mtswai mu.,我有十二位兄弟姐妹。,I have twelve siblings.,名詞
300,bukung 1,1,0,駝背,,"Niqan naq seediq mbukung druri, ani so kiya we...","人也有(天生)駝背，即使如此我們不可以叫人家為""駝背者""。",,動詞
309,na1,0,2,屬格(格位標記),genitive case,Tama mu ka ga mheyu berah na bubu su.,站在你媽媽前面的是我爸爸。,,其它


In [62]:
# Meaning recorded in the items frame for 'dara1', to compare with the
# extra_meaning entry of the same name.
items.loc[items['item_name'] == 'dara1', 'meaning']

417    楓香樹
Name: meaning, dtype: object