In [1]:
import pandas as pd
import numpy as np
import csv

In [2]:
# Load the extracted four-word expressions from the scripts directory.
# NOTE(review): csv.QUOTE_ALL is passed as the parser's quoting mode; presumably
# the file was written with every field quoted -- TODO confirm.
df = pd.read_csv("../scripts/elabs_extracted.csv", quoting=csv.QUOTE_ALL)

In [3]:
# Preview the loaded frame (the notebook display truncates the middle rows).
df

Unnamed: 0,word1,word2,word3,word4
0,vuag,ub,vuag,no
1,xaiv,ntsej,xaiv,muag
2,qhov,phem,qhov,zoo
3,kev,neej,kev,tsav
4,tej,nom,tej,tswv
...,...,...,...,...
3248,khu,mob,khu,nkees
3249,cam,mus,cam,los
3250,me,tes,me,taw
3251,dig,lwj,dig,liam


How many are ABAC vs ABCB?

In [4]:
# Count the rows that follow each repetition pattern:
#   ABAC -> the first word is repeated in third position,
#   ABCB -> the second word is repeated in fourth position.
abac_mask = df["word1"] == df["word3"]
abcb_mask = df["word2"] == df["word4"]
print('ABAC:', int(abac_mask.sum()))
print('ABCB:', int(abcb_mask.sum()))

ABAC: 3253
ABCB: 0


Any words where ABAC and ACAB are both attested? -- Nope

In [5]:
# For every A-B-A-C row, look for another row with the same A but with B and C
# swapped (A-C-A-B). A single such row is enough for both orders to be attested,
# so any non-empty result is reported (testing for "more than one" match would
# miss expressions whose swapped order occurs exactly once).
for i, (word1, word2, word3, word4) in df.iterrows():
    other_order = df[
        (df.word1 == word1)
        & (df.word2 == word4)
        & (df.word4 == word2)
        & (df.index != i)  # a row with word2 == word4 must not match itself
    ]
    if not other_order.empty:
        print(other_order)

## Test the syllable regex

In [6]:
import sys
# Add the parent directory to the module search path so that the
# `libraries` package imported below can be resolved from this notebook.
sys.path.append("..")
# `rpa` is the syllable pattern used below via `rpa.match(...)` with the
# named groups "ons", "rhy" and "ton".
from libraries.hmong_rpa.rpa_regex import RPA_SYLLABLE as rpa

In [7]:
# Collect the syllables of word1, word2 and word4, column after column.
# word3 is left out of the collection.
all_syllables = [
    syllable
    for column in ("word1", "word2", "word4")
    for syllable in df[column].tolist()
]

Changed: in all three regex grammars, "ua" is listed twice; one of the two occurrences should be "au".

In [8]:
# Sanity-check the syllable regex: every syllable must match, must contain a
# rhyme (vowel), and its onset + rhyme + tone must reassemble the full syllable.
for syl in all_syllables:
    m = rpa.match(syl)
    if m is None:
        # Report instead of crashing on `m.group(...)` when nothing matches.
        print("no match for", syl)
        continue
    ons, rhy, ton = m.group("ons", "rhy", "ton")
    # Test the captured rhyme itself; the string literal "rhy" is always truthy,
    # which would make this check unreachable.
    if not rhy:
        print("no vowel for", syl)
    if ons+rhy+ton != syl:
        print(f"wrong composition for {syl}: {ons}+{rhy}+{ton}")

wrong composition for ntusag: nt+u+s


One word with a weird "ntusag" syllable. Perhaps a mistake in the extraction process or a typo in the corpus?

In [9]:
# Show the row(s) whose fourth word is the odd syllable "ntusag".
df.loc[df["word4"] == 'ntusag']

Unnamed: 0,word1,word2,word3,word4
362,puj,nrauj,puj,ntusag


A few words have the 'd' tone, which is not found in rpa. I was going to suggest that these might be Dananshan Miao tones, but 'v' does not exist in Dananshan (according to Wikipedia) and yet it occurs in the same words as the 'd' tones, so I don't know.

In [10]:
# Rows in which word1, word2 or word4 ends with the letter 'd'.
ends_with_d = (
    df["word1"].str.endswith('d')
    | df["word2"].str.endswith('d')
    | df["word4"].str.endswith('d')
)
df[ends_with_d]

Unnamed: 0,word1,word2,word3,word4
558,plhaw,tod,plhaw,ped
982,rau,ped,rau,nrad
1253,yav,ped,yav,nrad
1377,rau,tod,rau,nrad
2357,xam,ped,xam,nrad
2438,yawg,tod,yawg,ped
3122,nkawd,kwv,nkawd,tij
3137,ntawd,noj,ntawd,haus


Get all possible onsets, rhymes, and tones:

In [11]:
# Total number of collected syllables (word1 + word2 + word4 of every row).
len(all_syllables)

9759

In [13]:
from collections import Counter

# Split every syllable once into its (onset, rhyme, tone) groups, then tally
# each component separately, keeping the order of first occurrence.
parsed = [rpa.match(syl).group("ons", "rhy", "ton") for syl in all_syllables]
onsets = Counter(onset for onset, _, _ in parsed)
rhymes = Counter(rhyme for _, rhyme, _ in parsed)
tones = Counter(tone for _, _, tone in parsed)

print("all possible onsets:", onsets, len(onsets))
print("all possible rhymes:", rhymes, len(rhymes))
print("all possible tones:", tones, len(tones))

all possible onsets: Counter({'t': 932, 'n': 719, 'l': 558, 'ts': 492, 'k': 473, 'p': 447, 'm': 445, '': 366, 's': 363, 'z': 337, 'c': 336, 'tx': 334, 'y': 288, 'd': 276, 'nt': 262, 'h': 248, 'r': 225, 'nts': 210, 'q': 197, 'ny': 176, 'txh': 141, 'ph': 130, 'ch': 122, 'v': 121, 'hl': 119, 'tsh': 115, 'x': 105, 'nc': 100, 'qh': 97, 'kh': 95, 'nr': 94, 'pl': 87, 'th': 85, 'np': 80, 'hm': 74, 'nq': 68, 'nk': 61, 'hn': 59, 'xy': 54, 'ntsh': 46, 'ntxh': 42, 'f': 34, 'dl': 33, 'ntx': 29, 'npl': 25, 'dh': 13, 'nrh': 12, 'ml': 8, 'nqh': 8, 'plh': 3, 'nch': 3, 'rh': 2, 'dlh': 2, 'ndl': 2, 'nkh': 2, 'hny': 2, 'nth': 1, 'nph': 1}) 58
all possible rhymes: Counter({'o': 1314, 'u': 1076, 'ua': 1071, 'e': 956, 'a': 932, 'i': 853, 'au': 674, 'aw': 583, 'ia': 563, 'oo': 505, 'w': 417, 'ee': 413, 'ai': 262, 'aa': 140}) 14
all possible tones: Counter({'b': 1985, 'j': 1697, '': 1560, 'v': 1424, 's': 1181, 'g': 967, 'm': 931, 'd': 14}) 8
