In [13]:
import sys
sys.path.append("..")
import numpy as np
import pandas as pd
import csv, os, re

# Clean up word list

In [9]:
# Load the raw tab-separated list of elaborate expressions. Column names are
# supplied explicitly, so pandas reads the first line of the file as data.
dfraw = pd.read_csv(
    "../data/lahu/elabs_from_ell/lahu-elabs.txt",
    sep="\t",
    names=["expression", "type", "gloss"],
)

In [10]:
# Tally how often each category label occurs in the raw list.
dfraw["type"].value_counts()

Elabn                1243
Elabv                 742
Elabadv               216
Elabq                  68
Extended Elabn         45
Elabtime               11
Extended Elabv          6
Extended Elabadv        5
Elabv; Elabadv          5
Elab                    4
Elabext                 3
Elabv's                 3
ElabN-deverb            2
Elabn; Elabadv          2
Extended Elab           2
Elab couplet            2
Elabv; Elabn            2
Elab Couplet            1
Elabn-intens            1
Elabn-deverb            1
Elabq-time              1
QQ or Elabq             1
Extended Elabq          1
ElabNPq                 1
Extended Elabtime       1
AE; Elabadv             1
Elabadv; Elabn          1
Elabv/adv               1
Name: type, dtype: int64

In [23]:
def elab_order_type(w1, w2, w3, w4):
    """Classify a four-syllable expression by which syllable is repeated.

    Args:
        w1, w2, w3, w4: the four syllables, in order.

    Returns:
        'ABAC' if the 1st and 3rd syllables are equal while the 2nd and 4th
        differ, 'ABCB' if the 2nd and 4th are equal while the 1st and 3rd
        differ, and None in every other case (both pairs repeat, or neither).
    """
    odd_pair_repeats = w1 == w3
    even_pair_repeats = w2 == w4
    if odd_pair_repeats == even_pair_repeats:
        # Either both pairs repeat (ABAB) or neither does (ABCD).
        return None
    return 'ABAC' if odd_pair_repeats else 'ABCB'

def segment(expr):
    """Split one raw expression entry into its four syllables.

    Processing steps, in order:
      1. Parenthesised material (optional phonemes) is removed.
      2. If several pronunciations are separated by '~', only the first is kept.
      3. Entries containing '=', '+', '...' or 'Num1' are rejected.
      4. Hyphens are treated as syllable separators; the first four
         whitespace-separated syllables are kept.
      5. A syllable written with '/' alternatives is reduced to its first
         alternative.
      6. The four syllables must form an ABAC or ABCB pattern
         (see ``elab_order_type``).

    Args:
        expr: the raw expression string. Non-string values (e.g. the NaN that
            pandas uses for an empty cell) are rejected rather than raising.

    Returns:
        A 4-tuple of syllable strings, or None if the entry is rejected.
    """
    if not isinstance(expr, str):
        # Empty cells arrive from pandas as float NaN; there is nothing to segment.
        return None

    expr = re.sub(r'\([^)]*\)', '', expr)  # delete optional phonemes
    if '~' in expr:
        expr = expr.split('~')[0].strip()  # take the first pronunciation if multiple
    if '=' in expr or '+' in expr:
        return None  # exclude all words with '=' or '+' in it
    if '...' in expr or 'Num1' in expr:
        return None  # discontinuous or templated entries, e.g. "mû ... mì"

    words = expr.replace('-', ' ').split()[:4]
    if len(words) < 4:
        return None  # fewer than four syllables cannot be ABAC/ABCB

    # Resolve "x/y" alternatives *before* the pattern check, mirroring the '~'
    # handling above. Checking first would compare "x/y" against "x" (wrongly
    # rejecting the entry) and could return a tuple that no longer satisfies
    # the pattern once the alternatives are stripped.
    words = [w.split('/')[0] for w in words]
    if elab_order_type(*words) is None:
        return None

    return tuple(words)

In [27]:
# Segment every raw expression, drop the rejected ones (None) and de-duplicate.
extracted_EEs = {ee for ee in dfraw.expression.apply(segment).tolist() if ee}
print(len(extracted_EEs))
# Sort so the exported file has a deterministic row order.
extracted_EEs = sorted(extracted_EEs)

1540


In [28]:
# Write the extracted quadruples to CSV, one syllable per column, quoting every field.
(
    pd.DataFrame(extracted_EEs, columns=["word1", "word2", "word3", "word4"])
    .to_csv(
        "../data/lahu/elabs_from_ell/elabs_extracted.csv",
        index=False,
        quoting=csv.QUOTE_ALL,
    )
)

# Test the syllable regex

In [29]:
# Reload the extracted quadruples. Every cell is a syllable string, so NA
# inference is switched off: by default pandas would turn a syllable spelled
# like one of its NA tokens (e.g. "nan", "NA", "null") into a float NaN, which
# would then break the regex matching below.
df = pd.read_csv(
    "../data/lahu/elabs_from_ell/elabs_extracted.csv",
    quoting=csv.QUOTE_ALL,
    dtype=str,
    keep_default_na=False,
)

In [30]:
# Display the reloaded table (pandas elides the middle rows of a long frame).
df

Unnamed: 0,word1,word2,word3,word4
0,a,dɔ̂,a,gâ
1,a,ni,šɨ̂ʔ,ni
2,a,pa,a,nɛ̀ʔ
3,a,pū,a,pi
4,a,šàʔ,a,yûʔ
...,...,...,...,...
1535,ɨ̄,la,mâ,la
1536,ɨ̄,la,mu,la
1537,ɨ̄,mɨ̀,câʔ,mɨ̀
1538,ɨ̄,qay,mu,qay


In [31]:
from libraries.lahu_jam.lahu_jam_regex import LAHU_REGEX as lahu
# Pool the distinct syllables found in any of the four positions.
all_syllables = {
    syl
    for position in ("word1", "word2", "word3", "word4")
    for syl in df[position].tolist()
}

In [33]:
# Check that the syllable regex splits every syllable into onset + rhyme + tone
# and that the three parts together reproduce the whole syllable.
ok = 0
for syl in all_syllables:
    m = lahu.match(syl)
    if m is None:
        print(syl, "??")
        continue
    # A named group that did not take part in the match yields None; treat it
    # as empty so the emptiness test and the concatenation below are well defined.
    ons, rhy, ton = (m.group(name) or "" for name in ("ons", "rhy", "ton"))
    if not rhy:
        # Test the captured rhyme itself (the previous `not "rhy"` tested a
        # non-empty string literal and therefore could never be true).
        print("no vowel for", syl)
    elif ons + rhy + ton != syl:
        # The match did not account for the whole syllable (e.g. a trailing
        # segment was left unconsumed).
        print(f"wrong composition for {syl}: {ons}+{rhy}+{ton}")
    else:
        ok += 1
print(ok, len(all_syllables))

šwɛ̀ ??
cwí ??
wrong composition for lāy: l+a+̄
wrong composition for kɛ̀w: k+ɛ+̀
wrong composition for qhaw: qh+a+
bwɛ̂ ??
wrong composition for līn: l+i+̄
ywɛ̂ ??
wrong composition for hày: h+a+̀
wrong composition for khān: kh+a+̄
wrong composition for hây: h+a+̂
wrong composition for in: +i+
ywɛ ??
cwɛ̂ ??
wrong composition for phašá: ph+a+
wrong composition for hán: h+a+́
wrong composition for qhay: qh+a+
wrong composition for yōn: y+o+̄
cwɛ̀ ??
wrong composition for qháy: qh+a+́
wrong composition for nây: n+a+̂
wrong composition for våy: v+a+
wrong composition for vân: v+a+̂
X ??
kwâ ??
wrong composition for hân: h+a+̂
wrong composition for mə̂n: m+ə+̂
wrong composition for qhə̄n: qh+ə+̄
wrong composition for kán: k+a+́
hwɛ̄ ??
wrong composition for tàn: t+a+̀
twɛ ??
wrong composition for thāy: th+a+̄
wrong composition for vây: v+a+̂
wrong composition for law: l+a+
wrong composition for cāw: c+a+̄
pwɛ̂ ??
wrong composition for hɛ̂n: h+ɛ+̂
wrong composition fo

In [70]:
# Scratch check: a vowel carrying a tone mark, used to inspect how it is encoded.
s = "ɔ̀"

In [80]:
# The first code point is the bare vowel: the tone mark is stored as a
# separate (combining) character after it.
s[0]

'ɔ'