In [1]:
import sys
sys.path.append("..")

import pandas as pd
import numpy as np
import csv
from libraries.lahu_jam.lahu_jam_regex import LAHU_REGEX as lahu
import epitran, panphon
from collections import Counter, defaultdict

In [2]:
# Load the extracted Lahu elaborate expressions: one four-syllable expression per
# row, in columns word1..word4. quoting=csv.QUOTE_ALL is passed to the CSV parser.
df = pd.read_csv("../data/lahu/elabs_from_ell/elabs_extracted.csv", quoting=csv.QUOTE_ALL)

In [3]:
# Preview the raw table (word1..word4, one expression per row).
df

Unnamed: 0,word1,word2,word3,word4
0,a,dɔ̂,a,gâ
1,a,ni,šɨ̂ʔ,ni
2,a,pa,a,nɛ̀ʔ
3,a,pū,a,pi
4,a,šàʔ,a,yûʔ
...,...,...,...,...
1535,ɨ̄,la,mâ,la
1536,ɨ̄,la,mu,la
1537,ɨ̄,mɨ̀,câʔ,mɨ̀
1538,ɨ̄,qay,mu,qay


In [65]:
# A-B-A-C expressions: syllables 1 and 3 are the repeated element ("rep"),
# syllables 2 and 4 form the pair (cc1, cc2).
df_ABAC = (
    df[df.word1 == df.word3]
    .rename(columns={'word1': 'rep', 'word2': 'cc1', 'word4': 'cc2'})
    .drop(columns='word3')
    .assign(is_ABAC=True)
    .loc[:, ['cc1', 'cc2', 'rep', 'is_ABAC']]
)

# A-B-C-B expressions: syllables 2 and 4 are the repeated element,
# syllables 1 and 3 form the pair (cc1, cc2).
df_ABCB = (
    df[df.word2 == df.word4]
    .rename(columns={'word1': 'cc1', 'word2': 'rep', 'word3': 'cc2'})
    .drop(columns='word4')
    .assign(is_ABAC=False)
    .loc[:, ['cc1', 'cc2', 'rep', 'is_ABAC']]
)

In [66]:
# Preview the A-B-A-C subset in the shared (cc1, cc2, rep, is_ABAC) layout.
df_ABAC

Unnamed: 0,cc1,cc2,rep,is_ABAC
0,dɔ̂,gâ,a,True
2,pa,nɛ̀ʔ,a,True
3,pū,pi,a,True
4,šàʔ,yûʔ,a,True
5,ví,ni,a,True
...,...,...,...,...
1526,cē,phɔ̂,ɔ̂,True
1527,chî,pâʔ,ɔ̄,True
1528,chî,phə̀ʔ,ɔ̄,True
1529,chî,phə́,ɔ̄,True


In [67]:
# Preview the A-B-C-B subset in the shared (cc1, cc2, rep, is_ABAC) layout.
df_ABCB

Unnamed: 0,cc1,cc2,rep,is_ABAC
1,a,šɨ̂ʔ,ni,False
10,á,yɛ̀,qhɔ,False
15,bâʔ,vê,ɨ̄,False
16,bè,lɔ̂,tù,False
17,bè,lɔ̂,ve,False
...,...,...,...,...
1535,ɨ̄,mâ,la,False
1536,ɨ̄,mu,la,False
1537,ɨ̄,câʔ,mɨ̀,False
1538,ɨ̄,mu,qay,False


In [68]:
# Stack both phrase shapes into one table; the column names come from the frames
# themselves (pd.concat's `names=` only labels index levels created via `keys=`).
# ignore_index=True gives a fresh 0..n-1 index, so an expression that satisfies
# both shapes (A-B-A-B) does not end up as two rows sharing one index label.
df_bothforms = pd.concat([df_ABAC, df_ABCB], ignore_index=True)

In [7]:
def report_pair_concentration(phrases, top_ns=(10, 30, 50, 100, 300)):
    """Print how concentrated the (cc1, cc2) pairs of `phrases` are.

    Prints the number of distinct (cc1, cc2) pairs, then for each ``n`` in
    `top_ns` the share of all rows covered by the ``n`` most frequent pairs.

    Parameters
    ----------
    phrases : DataFrame with 'cc1' and 'cc2' columns (raises KeyError otherwise,
        instead of silently counting nothing).
    top_ns : iterable of int, cut-offs passed to ``Counter.most_common``.

    Returns
    -------
    collections.Counter mapping (cc1, cc2) -> number of rows.
    """
    pair_counts = Counter(zip(phrases['cc1'], phrases['cc2']))
    print(len(pair_counts), 'unique BC pairs')
    for n in top_ns:
        proportion = sum(count for _, count in pair_counts.most_common(n)) / len(phrases)
        print(f'most common {n} make up {proportion*100:.2f}% of EEs')
    return pair_counts


# The pair columns are named cc1/cc2 in both frames (the original word1..word4
# names no longer exist after the rename that builds df_ABAC / df_ABCB).
print('ABCB phrases')
bc_counts = report_pair_concentration(df_ABCB)
num_unique_bc = len(bc_counts)

print('ABAC phrases (like hmong dataset)')
bc_counts = report_pair_concentration(df_ABAC)
num_unique_bc = len(bc_counts)

ABCB phrases
460 unique BC pairs
most common 10 make up 16.75% of EEs
most common 30 make up 33.37% of EEs
most common 50 make up 42.00% of EEs
most common 100 make up 55.05% of EEs
most common 300 make up 80.30% of EEs
ABAC phrases (like hmong dataset)
574 unique BC pairs
most common 10 make up 7.83% of EEs
most common 30 make up 16.62% of EEs
most common 50 make up 22.53% of EEs
most common 100 make up 34.89% of EEs
most common 300 make up 62.36% of EEs


The distribution of unique pairs is somewhat flatter than in the Hmong data, but not by much.

In [8]:
def count_reversed_order(phrases, raw, rep_col, first_col, second_col):
    """Count rows of `phrases` whose pair also occurs in the opposite order in `raw`.

    A row (cc1, cc2, rep) counts when `raw` has at least one row with
    ``raw[rep_col] == rep``, ``raw[first_col] == cc2`` and
    ``raw[second_col] == cc1``.

    Parameters
    ----------
    phrases : DataFrame with 'cc1', 'cc2' and 'rep' columns.
    raw : DataFrame holding the original word1..word4 columns.
    rep_col, first_col, second_col : names of the `raw` columns that hold the
        repeated syllable, the first pair slot and the second pair slot.

    Returns
    -------
    int, number of matching rows in `phrases`.
    """
    cntr = 0
    # Rows are read by column name, so the result does not depend on column order.
    for row in phrases.itertuples(index=False):
        other_order = raw[(raw[rep_col] == row.rep)
                          & (raw[first_col] == row.cc2)
                          & (raw[second_col] == row.cc1)]
        if len(other_order) > 0:
            cntr += 1
    return cntr


print("ABAC phrases")
# A-B-A-C: repeated syllable in word1, pair in word2 / word4.
cntr = count_reversed_order(df_ABAC, df, 'word1', 'word2', 'word4')
print(f"there are {cntr}/2={cntr/2} words with both orders attested")

print("ABCB phrases")
# A-B-C-B: repeated syllable in word2, pair in word1 / word3.
cntr = count_reversed_order(df_ABCB, df, 'word2', 'word1', 'word3')
print(f"there are {cntr}/2={cntr/2} words with both orders attested")

ABAC phrases
there are 28/2=14.0 words with both orders attested
ABCB phrases
there are 24/2=12.0 words with both orders attested


## Prepare data

After dropping phrases that contain a syllable the Lahu regex cannot fully parse, 647 ABAC and 753 ABCB phrases remain (1400 in total).

In [74]:
from libraries.lahu_jam.lahu_jam_regex import LAHU_REGEX as lahu
def is_valid_syl(syl):
    """Return True when `syl` is exactly one onset + rhyme + tone syllable.

    The syllable is matched against the ``lahu`` regex; it is valid only if the
    ``ons``, ``rhy`` and ``ton`` groups concatenated reproduce the whole input
    (so trailing, unparsed characters make it invalid).

    Parameters
    ----------
    syl : the candidate syllable. Non-string values (e.g. the float NaN pandas
        uses for missing cells, or None) are reported as invalid instead of
        raising TypeError inside the regex engine.

    Returns
    -------
    bool
    """
    if not isinstance(syl, str):
        return False
    m = lahu.match(syl)
    if m is None:
        return False
    ons, rhy, ton = m.group("ons"), m.group("rhy"), m.group("ton")
    return ons + rhy + ton == syl

def remove_invalid_data(df0):
    """Keep only the rows whose cc1, cc2 and rep are all valid syllables.

    Validity is decided by ``is_valid_syl``. The number of kept rows is printed
    as ``<n> good ones``. `df0` is not modified.

    Parameters
    ----------
    df0 : DataFrame with 'cc1', 'cc2' and 'rep' columns.

    Returns
    -------
    DataFrame with the surviving rows in their original order and a fresh
    0..n-1 index.
    """
    # One boolean per row, applied positionally. This avoids dropping by index
    # label while iterating, which misbehaves when labels repeat.
    keep = np.fromiter(
        (all(is_valid_syl(w) for w in (cc1, cc2, rep))
         for cc1, cc2, rep in zip(df0['cc1'], df0['cc2'], df0['rep'])),
        dtype=bool,
        count=len(df0),
    )
    print(int(keep.sum()), 'good ones')
    return df0.loc[keep].reset_index(drop=True)

def add_unattested_data(df0):
    """Append a reversed-order negative example for every one-way phrase.

    For each row (cc1, cc2, rep) whose reversed pair (cc2, cc1) with the same
    `rep` does not occur anywhere in `df0`, a copy with cc1 and cc2 swapped is
    appended. Original rows get ``attested=True``, synthetic rows
    ``attested=False``. The sizes of both groups are printed. `df0` is not
    modified.

    Parameters
    ----------
    df0 : DataFrame with 'cc1', 'cc2' and 'rep' columns (other columns are
        carried over unchanged).

    Returns
    -------
    DataFrame: the attested rows followed by the unattested rows, with a fresh
    0..n-1 index and an added boolean 'attested' column.
    """
    df = df0.copy()
    # for each (cc1, cc2) phrase, add (cc2, cc1) phrase
    triples = list(zip(df['rep'], df['cc1'], df['cc2']))
    attested_triples = set(triples)
    # Set lookup replaces a full-frame scan per row; the mask is positional, so
    # it is correct whatever the index labels are.
    lacks_reverse = np.array(
        [(rep, cc2, cc1) not in attested_triples for rep, cc1, cc2 in triples],
        dtype=bool,
    )
    df['attested'] = True
    unattested = (
        df.loc[lacks_reverse]
        .rename(columns={'cc1': 'cc2', 'cc2': 'cc1'})
        .assign(attested=False)
        .loc[:, list(df.columns)]  # restore the column order after the swap
    )
    print(f'len of attested {len(df)}, len of unattested {len(unattested)}')
    # DataFrame.append no longer exists in pandas >= 2.0; concat is the equivalent.
    return pd.concat([df, unattested], ignore_index=True)

def add_onehot_features(df0, features, tones, rhymes, onsets):
    """Add boolean one-hot columns for the tone, rhyme and onset of each syllable.

    Each of the 'cc1', 'cc2' and 'rep' syllables is parsed with the ``lahu``
    regex. For every requested feature group one column per listed value is
    added, True where the syllable's regex group equals that value:

    * ``'ton'`` -> ``<slot>_ton_a<tone>``
    * ``'rhy'`` -> ``<slot>_rhy_<rhyme>``
    * ``'ons'`` -> ``<slot>_ons_<onset>``

    Columns are ordered slot by slot (cc1, cc2, rep) and, within a slot, tone
    then rhyme then onset, following the iteration order of `tones`, `rhymes`
    and `onsets`. `df0` is not modified.

    Parameters
    ----------
    df0 : DataFrame with 'cc1', 'cc2' and 'rep' columns holding syllables the
        regex can match (an unmatched syllable raises AttributeError).
    features : container tested with ``in`` for 'ton', 'rhy' and 'ons'; a string
        such as 'ton_rhy_ons' or a list of names both work.
    tones, rhymes, onsets : iterables of the values to encode.

    Returns
    -------
    DataFrame: a copy of `df0` followed by the new boolean columns. A column of
    `df0` that already carries one of the generated names is replaced.
    """
    df = df0.copy()

    # (regex group, values to encode, column-name template), in output order.
    requested = [
        (group, list(values), template)
        for group, values, template in (
            ('ton', tones, '{wi}_ton_a{value}'),
            ('rhy', rhymes, '{wi}_rhy_{value}'),
            ('ons', onsets, '{wi}_ons_{value}'),
        )
        if group in features
    ]
    if not requested:
        return df

    new_cols = {}
    for wi in ('cc1', 'cc2', 'rep'):
        # Run the regex once per syllable instead of once per feature value.
        matches = df[wi].map(lahu.match)
        for group, values, template in requested:
            parts = matches.map(lambda m, group=group: m.group(group))
            for value in values:
                new_cols[template.format(wi=wi, value=value)] = parts == value

    # Attach all columns in one concat; inserting them one at a time fragments
    # the frame and triggers pandas' PerformanceWarning.
    onehot = pd.DataFrame(new_cols, index=df.index)
    stale = [col for col in onehot.columns if col in df.columns]
    return pd.concat([df.drop(columns=stale), onehot], axis=1)

In [105]:
# Feature groups to one-hot encode. add_onehot_features checks for 'ton', 'rhy'
# and 'ons' with `in`, so any underscore-joined subset works; the value is also
# embedded in the exported tree's file name.
use_features = 'ton_rhy_ons'  ## change me
# Drop every phrase with a syllable that is_valid_syl rejects (prints the kept count).
df_valid = remove_invalid_data(df_bothforms)

1400 good ones


In [106]:
from collections import Counter

# Pool every syllable slot of the valid phrases: all cc1 values, then cc2, then rep.
all_syllables = [syl for slot in ("cc1", "cc2", "rep") for syl in df_valid[slot].tolist()]

# Tally each regex component. Counters keep first-seen order, and that order is
# what later fixes the order of the one-hot columns.
onsets, rhymes, tones = Counter(), Counter(), Counter()
for syl in all_syllables:
    m = lahu.match(syl)
    ons, rhy, ton = m.group("ons", "rhy", "ton")
    onsets.update((ons,))
    rhymes.update((rhy,))
    tones.update((ton,))

print("all possible onsets:", onsets, len(onsets))
print("all possible rhymes:", rhymes, len(rhymes))
# Tones are shown on a carrier "a" (the same prefix used in the tone column names).
print("all possible tones:", ['a'+t for t in tones], len(tones))

all possible onsets: Counter({'c': 327, 'š': 325, 'm': 307, 'l': 262, 't': 237, '': 226, 'n': 221, 'v': 212, 'p': 204, 'k': 199, 'h': 199, 'ph': 164, 'qh': 155, 'ch': 152, 'y': 150, 'd': 149, 'q': 125, 'kh': 117, 'g̈': 108, 'b': 98, 'j': 71, 'g': 65, 'th': 64, 'ŋ': 34, 'f': 29}) 25
all possible rhymes: Counter({'a': 1211, 'ɔ': 760, 'i': 410, 'ɛ': 402, 'ɨ': 379, 'u': 374, 'e': 306, 'o': 212, 'ə': 146}) 9
all possible tones: ['â', 'a', 'ā', 'àʔ', 'á', 'à', 'âʔ'] 7


In [107]:
# Build the modelling table: add a reversed-order (unattested) row for every
# one-way phrase, one-hot encode the requested features of cc1/cc2/rep, then drop
# the raw syllable columns so only boolean columns remain.
expanded_df = add_onehot_features(add_unattested_data(df_valid), 
                                 use_features, tones, rhymes, onsets).drop(columns=['cc1', 'cc2', 'rep'])
expanded_df

len of attested 1400, len of unattested 1348


Unnamed: 0,is_ABAC,attested,cc1_ton_â,cc1_ton_a,cc1_ton_ā,cc1_ton_àʔ,cc1_ton_á,cc1_ton_à,cc1_ton_âʔ,cc1_rhy_ɔ,...,rep_ons_t,rep_ons_g,rep_ons_,rep_ons_b,rep_ons_g̈,rep_ons_kh,rep_ons_k,rep_ons_h,rep_ons_ŋ,rep_ons_f
0,True,True,True,False,False,False,False,False,False,True,...,False,False,True,False,False,False,False,False,False,False
1,True,True,False,True,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
2,True,True,False,False,True,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
3,True,True,False,False,False,True,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
4,True,True,False,False,False,False,True,False,False,False,...,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2743,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2744,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2745,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2746,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [108]:
# X: every column except the label (is_ABAC plus the one-hot features).
X = expanded_df.drop(columns=['attested']).to_numpy()
# y: True for attested phrases, False for the synthetic reversed ones.
y = expanded_df['attested'].to_numpy()
print(X.shape, y.shape)

(2748, 124) (2748,)


## Use X and y to train model

In [119]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2
from sklearn.metrics import precision_score, recall_score
from scipy.stats import describe

def train_test(clf, X, y):
    """Print summary statistics of `clf`'s test accuracy over 50 random splits.

    For each seed 0..49 the data is split 70/30 with that ``random_state``,
    `clf` is refitted in place on the training part and scored on the held-out
    part. The 50 accuracies are summarised with ``scipy.stats.describe`` and
    printed; nothing is returned. After the call `clf` is left fitted on the
    last split's training data.
    """
    def _held_out_accuracy(seed):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=seed)
        return clf.fit(X_train, y_train).score(X_test, y_test)

    accs = [_held_out_accuracy(seed) for seed in range(50)]
    print(describe(accs))
    
    
def select_features_from_model(clf, X, y, method='from_model'):
    """Sweep feature-subset sizes and report decision-tree accuracy for each.

    For every candidate size ``t`` the feature matrix is reduced (see `method`),
    a ``DecisionTreeClassifier(criterion='entropy')`` is fitted on the reduced
    full data, and ten more trees are trained and tested on 70/30 splits
    (``random_state`` 0..9). One line is printed per size::

        <n features>, <accuracy on the full data>, <mean test accuracy>, prn <mean precision>, rcl <mean recall>

    Parameters
    ----------
    clf : estimator. Its class name selects the size grid ('SVC': 260..40,
        anything else: 120..10). With ``method='from_model'`` it is passed to
        ``SelectFromModel(prefit=True)``, so it must already be fitted.
    X, y : feature matrix and labels.
    method : 'from_model' keeps the ``t`` features ranked highest by `clf`;
        'chi2' keeps the ``t`` best by chi-squared score and skips sizes larger
        than the number of columns in `X`.

    Returns
    -------
    The best mean test accuracy over all evaluated sizes (0 if none ran).

    Raises
    ------
    NotImplementedError
        If `method` is neither 'from_model' nor 'chi2'.

    NOTE(review): features are selected on all of X/y before the train/test
    split, so the held-out rows influence which features are kept; the reported
    test scores are therefore optimistic. The scored model is always a decision
    tree, whatever `clf` is.
    """
    best_accs = 0
    if clf.__class__.__name__ == 'SVC':
        num_features_to_try = (260, 240, 220, 200, 180, 160, 140, 120, 100, 80, 60, 40)
    else:
        num_features_to_try = (120, 110, 100, 90, 80, 70, 60, 50, 40, 35, 30, 25, 20, 15, 10)

    for t in num_features_to_try:
        if method == 'from_model':
            # threshold=-inf disables the importance cut-off, so exactly the
            # top-`t` features (at most) are kept.
            model = SelectFromModel(clf, prefit=True, threshold=-np.inf, max_features=t)
            X_new = model.transform(X)
        elif method == 'chi2':
            if t > X.shape[1]:
                continue
            X_new = SelectKBest(chi2, k=t).fit_transform(X, y)
        else:
            raise NotImplementedError

        print(X_new.shape[1], end=', ')
        # Accuracy of a tree fitted and scored on the same (full) data.
        clf_new = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X_new, y)
        print(f"{clf_new.score(X_new, y):.3f}", end=', ')

        accs = []
        prns = []
        rcls = []
        for rnd in range(10):
            X_new_train, X_new_test, y_train, y_test = train_test_split(
                X_new, y, test_size=0.3, random_state=rnd)
            clf_train = DecisionTreeClassifier(criterion='entropy', random_state=rnd).fit(X_new_train, y_train)
            accs.append(clf_train.score(X_new_test, y_test))
            y_predict = clf_train.predict(X_new_test)
            prns.append(precision_score(y_test, y_predict))
            rcls.append(recall_score(y_test, y_predict))

        mean_accs = np.mean(accs)
        mean_prns = np.mean(prns)
        mean_rcls = np.mean(rcls)
        if mean_accs > best_accs:
            best_accs = mean_accs
        print(f"{mean_accs:.3f}, prn {mean_prns:.3f}, rcl {mean_rcls:.3f}")

    return best_accs

In [114]:

# Classifier under evaluation; swap in the commented-out decision tree to compare.
# clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
clf = SVC()
# Prints summary statistics of the test accuracy over 50 random 70/30 splits.
train_test(clf, X, y)

DescribeResult(nobs=50, minmax=(0.7551515151515151, 0.8145454545454546), mean=0.7816727272727274, variance=0.00017592713779726764, skewness=0.28349393222629893, kurtosis=-0.25832500079211806)


In [120]:
# Sweep chi2-selected feature counts. Each printed row is: number of features,
# accuracy on the full data, mean test accuracy, mean precision, mean recall.
# The cell's value is the best mean test accuracy.
select_features_from_model(clf, X, y, method='chi2')

120, 1.000, 0.753, prn 0.755, rcl 0.758
100, 0.999, 0.761, prn 0.760, rcl 0.770
80, 0.992, 0.773, prn 0.779, rcl 0.770
60, 0.985, 0.780, prn 0.784, rcl 0.779
40, 0.934, 0.755, prn 0.769, rcl 0.737


0.78

## Visualize Tree

In [112]:
!pip install pydot
!pip install graphviz
# note: also need to install graphviz on your system: https://www.graphviz.org/
from sklearn.tree import export_graphviz
import pydot
import re
import graphviz




In [113]:
# Column names of X, in order: everything in expanded_df except the label.
feature_names = expanded_df.columns.to_list()
feature_names.remove('attested')
# Labels for y == False / y == True respectively.
class_names = ['FAKE', 'ATT']

d = 15 # max depth of the *rendered* tree. Use None if unlimited

# visualize tree trained on full training set
# export_graphviz only accepts a fitted decision tree, and `clf` may hold a
# different estimator (e.g. an SVC) at this point, so a dedicated tree is
# trained here rather than reusing `clf`.
tree_clf = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X, y)

fname = f'../out/Lahu_{use_features}_{d or ""}.dot'
export_graphviz(tree_clf, 
                out_file=fname, 
                impurity=False, 
                feature_names=feature_names,
                class_names=class_names,
                max_depth=d)

# Round-trip the .dot file through pydot so its text can be post-processed.
# The feature names contain Lahu diacritics, so the encoding is explicit.
f = pydot.graph_from_dot_file(fname, encoding='utf-8')[0].to_string()
# Optional label clean-up; uncomment as needed:
# print(len(f), '\n', f[:1000])
# f = re.sub(r'(\\nvalue = \[.*?\])', '', f)  # get rid of nvalue = [anychar, lazy]
# f = f.replace(' <= 0.5', '?')  # change decision to a question
# f = f.replace('headlabel="True"', 'headlabel="No"')  # change to yes no rather than <=0.5 true false
# f = f.replace('headlabel="False"', 'headlabel="Yes"')
# f = f.replace(R'samples = 1\nclass = ', R'***\n')  # change text of leaf node
# print("============================")
# print(len(f), '\n', f[:1000])

with open(fname, 'w', encoding='utf-8') as file:
    file.write(f)
    
# Render the .dot file to PNG with the system Graphviz `dot` binary.
savepath = graphviz.render('dot', 'png', fname)

In [122]:
# Sanity check: the raw table still carries its original four column names.
df.columns.tolist()

['word1', 'word2', 'word3', 'word4']