In [1]:
import sys
sys.path.append("..")

import pandas as pd
import numpy as np
import csv
from libraries.lahu_jam.lahu_jam_regex import LAHU_REGEX as lahu
import epitran, panphon
from collections import Counter, defaultdict

In [2]:
# Read the extracted phrase table; the preview below shows one phrase per row,
# split into the columns word1..word4.
df = pd.read_csv("../data/lahu/elabs_from_ell/elabs_extracted.csv", quoting=csv.QUOTE_ALL)

In [3]:
# Preview the raw table (pandas elides the middle rows).
df

Unnamed: 0,word1,word2,word3,word4
0,a,dɔ̂,a,gâ
1,a,ni,šɨ̂ʔ,ni
2,a,pa,a,nɛ̀ʔ
3,a,pū,a,pi
4,a,šàʔ,a,yûʔ
...,...,...,...,...
1535,ɨ̄,la,mâ,la
1536,ɨ̄,la,mu,la
1537,ɨ̄,mɨ̀,câʔ,mɨ̀
1538,ɨ̄,qay,mu,qay


In [37]:
# ABAC: syllables 1 and 3 are identical, so word1 is the repeated syllable
# (`rep`) and word2/word4 are the two varying syllables (`cc1`, `cc2`).
df_ABAC = (
    df.loc[df["word1"] == df["word3"], ["word2", "word4", "word1"]]
    .rename(columns={"word2": "cc1", "word4": "cc2", "word1": "rep"})
    .assign(form="ABAC")
)

# ABCB: syllables 2 and 4 are identical, so word2 is the repeated syllable
# and word1/word3 are the two varying syllables.
df_ABCB = (
    df.loc[df["word2"] == df["word4"], ["word1", "word3", "word2"]]
    .rename(columns={"word1": "cc1", "word3": "cc2", "word2": "rep"})
    .assign(form="ABCB")
)

In [38]:
# Preview the ABAC rows; the row labels are the original labels from `df`.
df_ABAC

Unnamed: 0,cc1,cc2,rep,form
0,dɔ̂,gâ,a,ABAC
2,pa,nɛ̀ʔ,a,ABAC
3,pū,pi,a,ABAC
4,šàʔ,yûʔ,a,ABAC
5,ví,ni,a,ABAC
...,...,...,...,...
1526,cē,phɔ̂,ɔ̂,ABAC
1527,chî,pâʔ,ɔ̄,ABAC
1528,chî,phə̀ʔ,ɔ̄,ABAC
1529,chî,phə́,ɔ̄,ABAC


In [39]:
# Preview the ABCB rows; the row labels are the original labels from `df`.
df_ABCB

Unnamed: 0,cc1,cc2,rep,form
1,a,šɨ̂ʔ,ni,ABCB
10,á,yɛ̀,qhɔ,ABCB
15,bâʔ,vê,ɨ̄,ABCB
16,bè,lɔ̂,tù,ABCB
17,bè,lɔ̂,ve,ABCB
...,...,...,...,...
1535,ɨ̄,mâ,la,ABCB
1536,ɨ̄,mu,la,ABCB
1537,ɨ̄,câʔ,mɨ̀,ABCB
1538,ɨ̄,mu,qay,ABCB


In [40]:
# Stack both patterns into one table; the two frames already share the columns
# cc1, cc2, rep, form. The former `names=` argument was dropped: it only labels
# the levels of a hierarchical index built from `keys=`, so without keys it had
# no effect (and did not set column names). Row labels from `df` are preserved.
df_bothforms = pd.concat([df_ABAC, df_ABCB])

In [7]:
# How concentrated are the (cc1, cc2) pairs? For each pattern, count the
# distinct pairs and report what share of all phrases the n most frequent
# pairs account for.
#
# The frames were renamed to cc1/cc2/rep/form when they were built, so the
# pair must be read from `cc1`/`cc2`. Filtering on the old word1..word4 names
# selects no columns at all and collapses every row into the empty tuple.
for heading, phrases in (
    ('ABCB phrases', df_ABCB),
    ('ABAC phrases (like hmong dataset)', df_ABAC),
):
    print(heading)
    bc_counts = Counter(zip(phrases['cc1'], phrases['cc2']))
    num_unique_bc = len(bc_counts)
    print(num_unique_bc, 'unique BC pairs')
    for n in (10, 30, 50, 100, 300):
        proportion = sum(count for _, count in bc_counts.most_common(n)) / len(phrases)
        print(f'most common {n} make up {proportion*100:.2f}% of EEs')

ABCB phrases
460 unique BC pairs
most common 10 make up 16.75% of EEs
most common 30 make up 33.37% of EEs
most common 50 make up 42.00% of EEs
most common 100 make up 55.05% of EEs
most common 300 make up 80.30% of EEs
ABAC phrases (like hmong dataset)
574 unique BC pairs
most common 10 make up 7.83% of EEs
most common 30 make up 16.62% of EEs
most common 50 make up 22.53% of EEs
most common 100 make up 34.89% of EEs
most common 300 make up 62.36% of EEs


The distribution of unique pairs is a bit flatter than in the Hmong data, but not by much.

In [8]:
# Count phrases whose two varying syllables also occur in the opposite order
# around the same repeated syllable. Every such pair is counted from both
# sides, hence the division by two in the report.
#
# df_ABAC / df_ABCB carry the columns cc1, cc2, rep, form, so each row is read
# through those names; the lookup still runs against the raw table `df`.
# Each spec: (heading, frame, `df` column holding the repeated syllable,
#             `df` column matching cc1's slot, `df` column matching cc2's slot).
both_order_specs = (
    ("ABAC phrases", df_ABAC, "word1", "word2", "word4"),
    ("ABCB phrases", df_ABCB, "word2", "word1", "word3"),
)
for heading, phrases, rep_col, first_col, second_col in both_order_specs:
    print(heading)
    cntr = 0
    for cc1, cc2, rep in phrases[["cc1", "cc2", "rep"]].itertuples(index=False, name=None):
        # Swapped order: cc2 sits in the first varying slot, cc1 in the second.
        swapped = (df[rep_col] == rep) & (df[first_col] == cc2) & (df[second_col] == cc1)
        if swapped.any():
            cntr += 1
    print(f"there are {cntr}/2={cntr/2} words with both orders attested")

ABAC phrases
there are 28/2=14.0 words with both orders attested
ABCB phrases
there are 24/2=12.0 words with both orders attested


## Classification

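After removing phrases that contain a syllable the Lahu syllable regex cannot fully parse, 647 ABAC phrases and 753 ABCB phrases remain (1400 in total).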

In [51]:
from libraries.lahu_jam.lahu_jam_regex import LAHU_REGEX as lahu
def is_valid_syl(syl):
    """Return True if `syl` is fully accounted for by the Lahu syllable regex.

    The syllable is valid when `lahu` matches it and the captured `ons`,
    `rhy` and `ton` groups, concatenated in that order, reproduce `syl`
    exactly. Returns False when the regex does not match at all.
    """
    parsed = lahu.match(syl)
    if not parsed:
        return False
    return "".join(parsed.group("ons", "rhy", "ton")) == syl

def remove_invalid_data(df0):
    """Return the rows of `df0` whose three syllables all pass `is_valid_syl`.

    Parameters
    ----------
    df0 : pandas.DataFrame
        Must contain the columns `cc1`, `cc2` and `rep`; any other columns
        are carried along unchanged. `df0` itself is not modified.

    Returns
    -------
    pandas.DataFrame
        The surviving rows in their original order, with a fresh 0..n-1 index.

    Side effects
    ------------
    Prints the number of rows kept as "<n> good ones".

    Rows are selected by position in a single pass. Dropping by label inside
    an `iterrows` loop fails with `KeyError` when a label occurs twice (as can
    happen after `pd.concat` without `ignore_index`) and rebuilds the frame on
    every removal.
    """
    syllable_rows = df0[['cc1', 'cc2', 'rep']].itertuples(index=False, name=None)
    keep_positions = [
        pos
        for pos, syllables in enumerate(syllable_rows)
        if all(is_valid_syl(w) for w in syllables)
    ]
    print(len(keep_positions), 'good ones')
    return df0.iloc[keep_positions].reset_index(drop=True)

def add_unattested_data(df0):
    """Append the swapped-order counterpart of every phrase that lacks one.

    For each row (cc1, cc2, rep), if no row with the same `rep` has the two
    varying syllables in the opposite order (cc2, cc1), a copy of the row with
    `cc1` and `cc2` exchanged is added and flagged as unattested.

    Parameters
    ----------
    df0 : pandas.DataFrame
        Must contain the columns `cc1`, `cc2` and `rep`; other columns are
        copied onto the generated rows unchanged. `df0` is not modified.

    Returns
    -------
    pandas.DataFrame
        All original rows with `attested=True`, followed by the generated rows
        with `attested=False`, on a fresh 0..n-1 index.

    Side effects
    ------------
    Prints the number of attested and generated rows.
    """
    df = df0.copy()
    # Every (rep, first, second) order present in the data, for O(1) lookup.
    attested_orders = set(zip(df['rep'], df['cc1'], df['cc2']))
    lacks_reverse = [
        (rep, cc2, cc1) not in attested_orders
        for rep, cc1, cc2 in zip(df['rep'], df['cc1'], df['cc2'])
    ]
    df['attested'] = True
    # Select with a boolean mask so the function does not depend on the index
    # being 0..n-1. Exchanging the two column *names* swaps the syllables once
    # the frames are aligned by name in the concat below.
    unattested = (
        df.loc[lacks_reverse]
        .rename(columns={'cc1': 'cc2', 'cc2': 'cc1'})
        .assign(attested=False)
    )
    print(f'len of attested {len(df)}, len of unattested {len(unattested)}')
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([df, unattested], ignore_index=True)

def add_onehot_features(df0, features, tones, rhymes, onsets):
    """Add boolean indicator columns for the tone, rhyme and onset of each syllable.

    Parameters
    ----------
    df0 : pandas.DataFrame
        Must contain the syllable columns `cc1`, `cc2` and `rep`. Not modified.
    features : str or container of str
        Tested with `in` for 'ton', 'rhy' and 'ons'; each one that is present
        enables the corresponding group of columns (so 'ton_rhy_ons' enables
        all three).
    tones, rhymes, onsets : iterable of str
        The values to create one indicator column for, per syllable column.

    Returns
    -------
    pandas.DataFrame
        A copy of `df0` with extra columns named `{column}_{part}_{value}`
        (e.g. `cc1_ton_<tone>`), True where the `part` group captured by the
        `lahu` regex equals `value`. For each syllable column the groups are
        emitted in the order ton, rhy, ons.

    Raises
    ------
    AttributeError
        If a syllable does not match `lahu` (the match is None).

    Note: panphon/epitran-based features are not implemented here.
    """
    df = df0.copy()
    # (regex group name, inventory), in the order the columns are emitted.
    inventories = (('ton', tones), ('rhy', rhymes), ('ons', onsets))

    for wi in ('cc1', 'cc2', 'rep'):
        syllables = df[wi]
        for part, inventory in inventories:
            if part not in features:
                continue
            # Run the regex once per syllable for this part, then compare the
            # captured values against each inventory entry.
            captured = syllables.apply(lambda syl: lahu.match(syl).group(part))
            for value in inventory:
                df[f'{wi}_{part}_{value}'] = captured == value

    return df

In [53]:
# add_onehot_features tests this value with `in` for 'ton', 'rhy' and 'ons',
# so this string switches on all three feature groups.
use_features = 'ton_rhy_ons'  ## change me
# Keep only the phrases (both patterns) whose syllables all pass is_valid_syl.
df_valid = remove_invalid_data(df_bothforms)

1400 good ones


In [58]:
from collections import Counter
# Tally every onset, rhyme and tone that occurs in the validated data. The
# keys of these counters are the inventories the one-hot features range over.
all_syllables = df_valid["cc1"].tolist() + df_valid["cc2"].tolist() + df_valid["rep"].tolist()
onsets, rhymes, tones = Counter(), Counter(), Counter()
for syl in all_syllables:
    m = lahu.match(syl)
    ons, rhy, ton = m.group("ons"), m.group("rhy"), m.group("ton")

    onsets[ons] += 1
    rhymes[rhy] += 1
    tones[ton] += 1

print("all possible onsets:", onsets, len(onsets))
print("all possible rhymes:", rhymes, len(rhymes))
# Prefix each tone mark with U+25CC DOTTED CIRCLE so the combining diacritic
# has a visible base character. '\U+25CC' is not a valid escape sequence
# (\U needs eight hex digits) and is a SyntaxError; '\u25cc' is the same
# code point spelled correctly.
print("all possible tones:", ['\u25cc'+t for t in tones], len(tones))

all possible onsets: Counter({'c': 327, 'š': 325, 'm': 307, 'l': 262, 't': 237, '': 226, 'n': 221, 'v': 212, 'p': 204, 'k': 199, 'h': 199, 'ph': 164, 'qh': 155, 'ch': 152, 'y': 150, 'd': 149, 'q': 125, 'kh': 117, 'g̈': 108, 'b': 98, 'j': 71, 'g': 65, 'th': 64, 'ŋ': 34, 'f': 29}) 25
all possible rhymes: Counter({'a': 1211, 'ɔ': 760, 'i': 410, 'ɛ': 402, 'ɨ': 379, 'u': 374, 'e': 306, 'o': 212, 'ə': 146}) 9
all possible tones: ['_̂', '_', '_̄', '_̀ʔ', '_́', '_̀', '_̂ʔ'] 7


In [None]:
# Build the model table: attested phrases plus their unattested swapped-order
# counterparts, expanded into one-hot phonological features. The raw syllable
# columns are dropped once the features are in place.
# add_onehot_features requires the three inventories; they are the keys of the
# counters tallied from the validated data.
expanded_df = add_onehot_features(
    add_unattested_data(df_valid),
    features=use_features,
    tones=list(tones),
    rhymes=list(rhymes),
    onsets=list(onsets),
).drop(columns=['cc1', 'cc2', 'rep'])
expanded_df