In [1]:
import pandas as pd
import numpy as np
import csv

In [2]:
# Load the table of four-word expressions (columns word1..word4).
ELABS_CSV = "../scripts/elabs_extracted.csv"
df = pd.read_csv(ELABS_CSV, quoting=csv.QUOTE_ALL)

In [3]:
# Inspect the loaded table of four-word expressions (columns word1..word4).
df

Unnamed: 0,word1,word2,word3,word4
0,vuag,ub,vuag,no
1,xaiv,ntsej,xaiv,muag
2,qhov,phem,qhov,zoo
3,kev,neej,kev,tsav
4,tej,nom,tej,tswv
...,...,...,...,...
3248,khu,mob,khu,nkees
3249,cam,mus,cam,los
3250,me,tes,me,taw
3251,dig,lwj,dig,liam


How many are ABAC vs ABCB?

In [4]:
# Count rows by repetition pattern: ABAC repeats word 1 in slot 3,
# ABCB repeats word 2 in slot 4.
abac_count = int((df.word1 == df.word3).sum())
abcb_count = int((df.word2 == df.word4).sum())
print('ABAC:', abac_count)
print('ABCB:', abcb_count)

ABAC: 3253
ABCB: 0


In [5]:
# All expressions that start with the pair "sib hlub".
df.loc[df.word1.eq('sib') & df.word2.eq('hlub')]

Unnamed: 0,word1,word2,word3,word4
226,sib,hlub,sib,pab
379,sib,hlub,sib,fwm
427,sib,hlub,sib,nyiam
658,sib,hlub,sib,nco
693,sib,hlub,sib,txhawb
935,sib,hlub,sib,paab
1037,sib,hlub,sib,haum
1357,sib,hlub,sib,hwm
1753,sib,hlub,sib,tshua


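Any words where ABAC and ACAB are both attested? -- Yes: the loop below prints every row whose swapped order also occurs (e.g. "sib hlub sib pab" and "sib pab sib hlub").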

In [17]:
# Find expressions attested in both coordinate orders: "A B A C" alongside "A C A B".
# Collect every (word1, word2, word4) triple once so that each reversed-order check is
# a set lookup instead of a boolean scan over the whole frame for every row.
attested = set(zip(df.word1, df.word2, df.word4))

cntr = 0
for i, word1, word2, word3, word4 in df.itertuples(name=None):
    # The reversed order keeps word1 and swaps the second and fourth words.
    if (word1, word4, word2) in attested:
        print(i, word1, word2, word3, word4)
        cntr += 1

# Both members of a pair are printed, hence the division by two
# (this assumes the frame holds no duplicate rows).
print(f"there are {cntr}/2={cntr/2} words with both orders attested")

2 qhov phem qhov zoo
5 txhua tsav txhua yam
10 sib txeeb sib tua
24 ua kwv ua tij
75 yuav zoo yuav phem
137 sib pab sib hlub
156 sib tog sib txeeb
177 sib txeeb sib tog
226 sib hlub sib pab
255 qhov zoo qhov phem
264 siab loj siab ntev
285 tso zis tso quav
288 roob tauj roob nqeeb
324 tsis paub tsis pom
356 zoo nkauj zoo nraug
394 hniav ntaj hniav riam
412 sib pab sib koom
427 sib hlub sib nyiam
451 cov zoo cov phem
469 sib txhawb sib pab
485 sib koom sib pab
487 txoj pa txoj siav
510 sib nyiam sib hlub
586 sib ntaus sib tua
645 tim thaib tim nplog
668 cov laus cov hluas
671 hniav riam hniav ntaj
692 yam zoo yam phem
696 tsis pom tsis paub
699 sib tua sib ntaus
750 ntau tsav ntau yam
752 raug txim raug thuv
801 yeej paub yeej pom
858 tus phem tus zoo
885 sib pom sib paub
907 tsi paub tsi pom
958 sib cav sib hais
980 ntau yam ntau tsav
997 lus mos lus muag
1004 neeg zoo neeg ncaj
1009 hais phem hais zoo
1016 sib tua sib txeeb
1025 siab ntev siab loj
1090 tiv tshav tiv nag
1094 lus muag 

In [14]:
# Take the row at position 226 and look up the entry whose second and
# fourth words are swapped relative to it.
word1, word2, word3, word4 = df.iloc[226]
swapped = df.word1.eq(word1) & df.word2.eq(word4) & df.word4.eq(word2)
df.loc[swapped]

Unnamed: 0,word1,word2,word3,word4
137,sib,pab,sib,hlub


## Test the syllable regex

In [6]:
import sys
# Add ".." (relative to the working directory) to the module search path so that
# the `libraries` package used in the import below can be found.
sys.path.append("..")
# Syllable pattern; later cells call rpa.match(...) and read its named groups
# "ons", "rhy" and "ton".
from libraries.hmong_rpa.rpa_regex import RPA_SYLLABLE as rpa

In [7]:
# Pool the syllables column by column: every word1, then every word2, then every word4.
# word3 is left out of the pool.
all_syllables = [
    syl
    for column in ("word1", "word2", "word4")
    for syl in df[column].tolist()
]

Changed: in all three regex grammars, "ua" is repeated. One of them should be "au"

In [8]:
# Sanity-check the syllable regex: every syllable should split into
# onset + rhyme + tone, with a non-empty rhyme, and re-join to itself.
for syl in all_syllables:
    m = rpa.match(syl)
    if m is None:
        # Without this guard a non-matching syllable would raise AttributeError below.
        print("no match for", syl)
        continue
    ons, rhy, ton = m.group("ons"), m.group("rhy"), m.group("ton")
    # Test the captured rhyme itself; the string literal "rhy" is always truthy,
    # so checking it could never report a missing vowel.
    if not rhy:
        print("no vowel for", syl)
    if ons+rhy+ton != syl:
        print(f"wrong composition for {syl}: {ons}+{rhy}+{ton}")

wrong composition for ntusag: nt+u+s


One word with a weird "ntusag" syllable. Perhaps a mistake in the extraction process or a typo in the corpus?

In [9]:
# Locate the expression whose fourth word is the odd syllable "ntusag".
df.loc[df.word4.eq('ntusag')]

Unnamed: 0,word1,word2,word3,word4
362,puj,nrauj,puj,ntusag


A few words have the 'd' tone, which is not found in rpa. I was going to suggest that these might be Dananshan Miao tones, but 'v' doesn't exist in Dananshan (according to Wikipedia) and yet it appears in the same words as the 'd' tones, so I don't know.

In [10]:
# Rows where word1, word2 or word4 ends in the letter 'd'.
ends_in_d = df[['word1', 'word2', 'word4']].apply(lambda col: col.str.endswith('d'))
df[ends_in_d.any(axis=1)]

Unnamed: 0,word1,word2,word3,word4
558,plhaw,tod,plhaw,ped
982,rau,ped,rau,nrad
1253,yav,ped,yav,nrad
1377,rau,tod,rau,nrad
2357,xam,ped,xam,nrad
2438,yawg,tod,yawg,ped
3122,nkawd,kwv,nkawd,tij
3137,ntawd,noj,ntawd,haus


Get all possible onsets, rhymes, and tones:

In [11]:
# Total number of syllables pooled into all_syllables.
len(all_syllables)

9759

In [19]:
from collections import Counter

# Split every syllable into (onset, rhyme, tone) once, then tally each component.
# Requires `rpa` and `all_syllables` from the cells above to be defined.
parts = [rpa.match(syl).group("ons", "rhy", "ton") for syl in all_syllables]
onsets = Counter(onset for onset, _, _ in parts)
rhymes = Counter(rhyme for _, rhyme, _ in parts)
tones = Counter(tone for _, _, tone in parts)

# Report each inventory together with the number of distinct values in it.
for label, inventory in (("onsets", onsets), ("rhymes", rhymes), ("tones", tones)):
    print(f"all possible {label}:", inventory, len(inventory))

NameError: name 'all_syllables' is not defined

# Test Epitran and Panphon

In [22]:
import epitran, panphon

In [23]:
# Epitran transliterator configured with the 'hmn-Latn' language/script code.
epi = epitran.Epitran('hmn-Latn')

In [30]:
# Build the panphon feature table and show the bag-of-features vector
# for the transliteration of the sample syllable 'vuag'.
ft = panphon.FeatureTable()
vuag_ipa = epi.transliterate('vuag')
ft.bag_of_features(vuag_ipa)

array([2, 2, 1, 2, 2, 1, 1, 2, 2, 3, 2, 0, 0, 2, 3, 0, 2, 3, 0, 2, 3, 1,
       4, 0, 3, 2, 0, 0, 2, 3, 0, 2, 3, 1, 4, 0, 0, 2, 3, 0, 5, 0, 2, 2,
       1, 1, 2, 2, 0, 2, 3, 2, 2, 1, 1, 2, 2, 0, 2, 3, 1, 3, 1, 0, 2, 3,
       0, 4, 1, 0, 4, 1])

In [31]:
# Show what Epitran produces for the sample syllable 'vuag'.
epi.transliterate('vuag')

'vuə˧˩'

In [35]:
# Label each slot of the bag-of-features vector as <sign><feature>, cycling
# through '+', '0', '-' for every feature name in ft.names.
# NOTE(review): this assumes bag_of_features uses that same ordering; confirm against panphon.
feat_names = []
for name in ft.names:
    feat_names.extend(f'{sign}{name}' for sign in ('+', '0', '-'))

vuag_bag = ft.bag_of_features(epi.transliterate('vuag'))
for label, count in zip(feat_names, vuag_bag):
    print(label, count)

+syl 2
0syl 2
-syl 1
+son 2
0son 2
-son 1
+cons 1
0cons 2
-cons 2
+cont 3
0cont 2
-cont 0
+delrel 0
0delrel 2
-delrel 3
+lat 0
0lat 2
-lat 3
+nas 0
0nas 2
-nas 3
+strid 1
0strid 4
-strid 0
+voi 3
0voi 2
-voi 0
+sg 0
0sg 2
-sg 3
+cg 0
0cg 2
-cg 3
+ant 1
0ant 4
-ant 0
+cor 0
0cor 2
-cor 3
+distr 0
0distr 5
-distr 0
+lab 2
0lab 2
-lab 1
+hi 1
0hi 2
-hi 2
+lo 0
0lo 2
-lo 3
+back 2
0back 2
-back 1
+round 1
0round 2
-round 2
+velaric 0
0velaric 2
-velaric 3
+tense 1
0tense 3
-tense 1
+long 0
0long 2
-long 3
+hitone 0
0hitone 4
-hitone 1
+hireg 0
0hireg 4
-hireg 1
