In [194]:
import sys
sys.path.append("..")

import pandas as pd
import numpy as np
import csv
from libraries.hmong_rpa.rpa_regex import RPA_SYLLABLE as rpa
import epitran, panphon

In [195]:
# Load the extracted four-word phrases (columns word1..word4); every field in the CSV is quoted.
df = pd.read_csv("../scripts/elabs_extracted.csv", quoting=csv.QUOTE_ALL)

In [196]:
# Show the loaded phrases (pandas elides the middle rows of long frames).
df

Unnamed: 0,word1,word2,word3,word4
0,vuag,ub,vuag,no
1,xaiv,ntsej,xaiv,muag
2,qhov,phem,qhov,zoo
3,kev,neej,kev,tsav
4,tej,nom,tej,tswv
...,...,...,...,...
3248,khu,mob,khu,nkees
3249,cam,mus,cam,los
3250,me,tes,me,taw
3251,dig,lwj,dig,liam


# Word order classification

Before training anything though, how would a rule-based classifier do?

Based on the A -> D -> B -> C and 2 -> 1 orderings, the tones should be ordered like this:
[j, b, m, s, s/g, v, g, 0]

We have to ignore the B2 split here and just work with [j, b, m, s, v, g, 0]; the rare -d tone is ranked together with -m.


In [197]:
def add_rule_prediction(df0):
    """Return a copy of ``df0`` with a ``rule_pred`` column from the tone-order rule.

    Each tone is given a rank (j < b < m/d < s < v < g < no tone).  For every
    row, ``rule_pred`` is the sign of ``rank(word4 tone) - rank(word2 tone)``:
    1 when word2's tone ranks earlier than word4's, -1 when it ranks later,
    and 0 when both words share a rank.

    Syllables are parsed with the module-level ``rpa`` pattern; its ``ton``
    group supplies the tone letter.
    """
    tone_rank = {'j': 1, 'b': 2, 'm': 3, 'd': 3, 's': 4, 'v': 5, 'g': 6, '': 7}

    def rank_of(syllable):
        # Rank of the tone carried by one syllable.
        return tone_rank[rpa.match(syllable).group("ton")]

    def sign(value):
        # Collapse an integer rank difference to -1, 0 or 1.
        if value == 0:
            return 0
        return 1 if value > 0 else -1

    scored = df0.copy()
    rank_gap = scored['word4'].apply(rank_of) - scored['word2'].apply(rank_of)
    scored['rule_pred'] = rank_gap.apply(sign)
    return scored

In [198]:
# Tally the rule outcomes as a dict keyed by rule_pred value (1, 0, -1) -> number of phrases.
res = add_rule_prediction(df)['rule_pred'].value_counts().to_dict()

In [199]:
# Fractions of phrases for each rule outcome.  ``res`` only has keys for
# outcomes that actually occurred, so a missing outcome counts as 0 instead
# of raising KeyError.
n_phrases = len(df)
frac_correct = res.get(1, 0) / n_phrases
frac_tie = res.get(0, 0) / n_phrases
frac_incorrect = res.get(-1, 0) / n_phrases

print(f'Correct: {frac_correct}')
print(f'Tie: {frac_tie}')
print(f'Incorrect: {frac_incorrect}')
print()
# Ties carry no information, so a coin flip gets half of them right.
print(f'Correct with random guess: {frac_correct + frac_tie/2}')
print(f'Incorrect with random guess: {frac_incorrect + frac_tie/2}')

Correct: 0.8383031048263142
Tie: 0.09007070396557024
Incorrect: 0.07162619120811559

Correct with random guess: 0.8833384568090993
Incorrect with random guess: 0.1166615431909007


# Prepare one hot data

add these ton columns to the data, for each $i=1,2,4$:
- wi_ton_b
- wi_ton_0
- wi_ton_s
- wi_ton_j
- wi_ton_v
- wi_ton_m
- wi_ton_g

Similarly for rhy and ons. There are 7 tones, 14 rhymes, and 58 onsets for Hmong. Since the tones, onset, and rhyme for word1 and word3 are the same, there are a total of $3\times(7+14+58)=237$ features

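For natural class (Panphon) features, each word gets a bag-of-features vector counting, for each of 24 phonological features, how many of its segments are +, 0, or − for that feature (24 × 3 = 72 counts per word). Over the three words this gives a total of 24 * 3 * 3 = 216 features.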

In [200]:


from collections import Counter

# word3 repeats word1, so only positions 1, 2 and 4 contribute syllables.
all_syllables = [syl for col in ("word1", "word2", "word4") for syl in df[col].tolist()]
parsed_syllables = [rpa.match(syl) for syl in all_syllables]

# Counters keep first-seen order, which later fixes the order of the one-hot columns.
onsets = Counter(m.group("ons") for m in parsed_syllables)
rhymes = Counter(m.group("rhy") for m in parsed_syllables)
tones = Counter(m.group("ton") for m in parsed_syllables)

for heading, inventory in (("all possible onsets:", onsets),
                           ("all possible rhymes:", rhymes),
                           ("all possible tones:", tones)):
    print(heading, inventory, len(inventory))

all possible onsets: Counter({'t': 932, 'n': 719, 'l': 558, 'ts': 492, 'k': 473, 'p': 447, 'm': 445, '': 366, 's': 363, 'z': 337, 'c': 336, 'tx': 334, 'y': 288, 'd': 276, 'nt': 262, 'h': 248, 'r': 225, 'nts': 210, 'q': 197, 'ny': 176, 'txh': 141, 'ph': 130, 'ch': 122, 'v': 121, 'hl': 119, 'tsh': 115, 'x': 105, 'nc': 100, 'qh': 97, 'kh': 95, 'nr': 94, 'pl': 87, 'th': 85, 'np': 80, 'hm': 74, 'nq': 68, 'nk': 61, 'hn': 59, 'xy': 54, 'ntsh': 46, 'ntxh': 42, 'f': 34, 'dl': 33, 'ntx': 29, 'npl': 25, 'dh': 13, 'nrh': 12, 'ml': 8, 'nqh': 8, 'plh': 3, 'nch': 3, 'rh': 2, 'dlh': 2, 'ndl': 2, 'nkh': 2, 'hny': 2, 'nth': 1, 'nph': 1}) 58
all possible rhymes: Counter({'o': 1314, 'u': 1076, 'ua': 1071, 'e': 956, 'a': 932, 'i': 853, 'au': 674, 'aw': 583, 'ia': 563, 'oo': 505, 'w': 417, 'ee': 413, 'ai': 262, 'aa': 140}) 14
all possible tones: Counter({'b': 1985, 'j': 1697, '': 1560, 'v': 1424, 's': 1181, 'g': 967, 'm': 931, 'd': 14}) 8


In [201]:
def add_unattested_data(df0):
    """Append the unattested (reversed) order of every phrase as a negative example.

    For each attested phrase ``word1 word2 word3 word4`` whose swapped order
    (same ``word1``, with ``word2`` and ``word4`` exchanged) does not occur in
    ``df0``, a new row with ``word2`` and ``word4`` swapped is added.

    Parameters
    ----------
    df0 : pandas.DataFrame
        Must contain the columns ``word1``, ``word2`` and ``word4``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        The original rows with ``attested=True`` followed by the generated
        rows with ``attested=False``, re-indexed from 0.
    """
    attested = df0.copy()
    attested['attested'] = True

    # One set lookup per row replaces a full-frame scan per row.
    triples = list(zip(attested['word1'], attested['word2'], attested['word4']))
    seen = set(triples)
    # A phrase with word2 == word4 is its own reverse, so it is never added.
    lacks_reverse = [(w1, w4, w2) not in seen for (w1, w2, w4) in triples]

    # A boolean mask is positional, so this works for any index
    # (not only the default RangeIndex).
    unattested = (attested.loc[lacks_reverse]
                  .rename(columns={'word2': 'word4', 'word4': 'word2'}))
    unattested['attested'] = False

    print(f'len of attested {len(attested)}, len of unattested {len(unattested)}')
    # DataFrame.append was removed in pandas 2.0. pd.concat aligns on column
    # names in the same way, so the renamed columns land in the swapped slots.
    return pd.concat([attested, unattested], ignore_index=True)


def add_onehot_features(df0, features='ton'):
    """Return a copy of ``df0`` extended with phonological feature columns.

    Features are built for word positions 1, 2 and 4 (word3 repeats word1).
    ``features`` is a string checked by substring, so several groups can be
    combined, e.g. ``'ton_rhy_ons_panphon'``:

    * ``'ton'``     -> ``w{i}_ton_{tone}`` booleans, one per tone in the
      module-level ``tones`` inventory (no tone is named ``0``; ``d`` is
      folded into ``m``).
    * ``'rhy'``     -> ``w{i}_rhy_{rhyme}`` booleans, one per entry of ``rhymes``.
    * ``'ons'``     -> ``w{i}_ons_{onset}`` booleans, one per entry of ``onsets``.
    * ``'panphon'`` -> ``w{i}_{+|0|-}{feature}`` counts from Panphon's
      bag-of-features of the Epitran (``hmn-Latn``) transliteration.

    Column order is: for each word position, tones, rhymes, onsets, then the
    Panphon counts.  Relies on the module-level ``rpa`` pattern and the
    ``tones`` / ``rhymes`` / ``onsets`` inventories.
    """
    df = df0.copy()
    want_ton = 'ton' in features
    want_rhy = 'rhy' in features
    want_ons = 'ons' in features
    want_panphon = 'panphon' in features

    if want_panphon:
        epi = epitran.Epitran('hmn-Latn')
        ft = panphon.FeatureTable()

    # New columns are gathered here and attached with a single concat at the
    # end; inserting hundreds of columns one at a time fragments the frame.
    new_blocks = []

    for i in (1, 2, 4):
        words = df[f'word{i}']
        onehot = {}

        if want_ton or want_rhy or want_ons:
            # Parse each syllable once per word position instead of once per
            # generated column.
            matches = [rpa.match(syl) for syl in words]

            def parsed(group):
                # Series holding one named regex group for every syllable.
                return pd.Series([m.group(group) for m in matches],
                                 index=df.index, dtype=object)

        if want_ton:
            syl_ton = parsed("ton")
            for ton in tones:
                if ton == '':
                    onehot[f'w{i}_ton_0'] = syl_ton == ''
                elif ton == 'd':
                    # -d is covered by the -m column below.
                    continue
                elif ton == 'm':
                    onehot[f'w{i}_ton_m'] = words.str.endswith('m') | words.str.endswith('d')
                else:
                    # The remaining tones are detected by the final letter.
                    onehot[f'w{i}_ton_{ton}'] = words.str.endswith(ton)
        if want_rhy:
            syl_rhy = parsed("rhy")
            for rhy in rhymes:
                onehot[f'w{i}_rhy_{rhy}'] = syl_rhy == rhy
        if want_ons:
            syl_ons = parsed("ons")
            for ons in onsets:
                onehot[f'w{i}_ons_{ons}'] = syl_ons == ons
        if onehot:
            new_blocks.append(pd.DataFrame(onehot, index=df.index))

        if want_panphon:
            bags = [ft.bag_of_features(epi.transliterate(syl)) for syl in words]
            # Names are generated feature-major with the values +, 0, - per
            # feature; presumably this mirrors bag_of_features' layout - verify
            # against the installed panphon version.
            panphon_names = [f'w{i}_{sign}{n}' for n in ft.names for sign in ('+', '0', '-')]
            new_blocks.append(pd.DataFrame(bags, index=df.index, columns=panphon_names))

    return pd.concat([df] + new_blocks, axis=1)

In [202]:
use_features = 'ton_rhy_ons_panphon'  ## change me

# Attested phrases plus their unattested reversed orders.
phrases_with_negatives = add_unattested_data(df)
# Encode the words as features, then drop the raw spellings so only the
# label and the numeric/boolean features remain.
word_columns = ['word1', 'word2', 'word3', 'word4']
expanded_df = add_onehot_features(phrases_with_negatives, features=use_features)
expanded_df = expanded_df.drop(columns=word_columns)
expanded_df

len of attested 3253, len of unattested 3169


Unnamed: 0,attested,w1_ton_g,w1_ton_v,w1_ton_j,w1_ton_0,w1_ton_b,w1_ton_s,w1_ton_m,w1_rhy_ua,w1_rhy_ai,...,w4_-tense,w4_+long,w4_0long,w4_-long,w4_+hitone,w4_0hitone,w4_-hitone,w4_+hireg,w4_0hireg,w4_-hireg
0,True,True,False,False,False,False,False,False,True,False,...,1,0,1,2,0,3,0,0,3,0
1,True,False,True,False,False,False,False,False,False,True,...,1,0,2,3,0,4,1,0,4,1
2,True,False,True,False,False,False,False,False,False,False,...,1,0,1,2,0,3,0,0,3,0
3,True,False,True,False,False,False,False,False,False,False,...,0,0,2,2,1,3,0,1,3,0
4,True,False,False,True,False,False,False,False,False,False,...,0,0,2,2,1,3,0,1,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6417,False,False,False,False,True,False,False,False,False,False,...,1,0,1,2,1,2,0,1,2,0
6418,False,False,False,False,False,False,False,True,False,False,...,0,0,1,2,1,2,0,0,2,1
6419,False,False,False,False,True,False,False,False,False,False,...,0,0,1,2,1,2,0,0,2,1
6420,False,True,False,False,False,False,False,False,False,False,...,0,0,2,2,2,2,0,1,2,1


There are 3253 attested words, 3169 of which do not have the alternate order attested, for a total of 6422 rows
Without Panphon features, there are 238 columns, for 237 features and 1 label column.
With Panphon features, there are $237 + 72\times3 + 1 = 454$ columns

In [203]:
# Split the frame into the feature matrix and the boolean label vector.
label_column = 'attested'
y = expanded_df[label_column].to_numpy()
X = expanded_df.drop(columns=[label_column]).to_numpy()
print(X.shape, y.shape)

(6422, 453) (6422,)


In [204]:
# Inspect the full feature vector of the first phrase as a name -> value dict.
expanded_df.iloc[0].to_dict()

{'attested': True,
 'w1_ton_g': True,
 'w1_ton_v': False,
 'w1_ton_j': False,
 'w1_ton_0': False,
 'w1_ton_b': False,
 'w1_ton_s': False,
 'w1_ton_m': False,
 'w1_rhy_ua': True,
 'w1_rhy_ai': False,
 'w1_rhy_o': False,
 'w1_rhy_e': False,
 'w1_rhy_i': False,
 'w1_rhy_u': False,
 'w1_rhy_au': False,
 'w1_rhy_aw': False,
 'w1_rhy_ee': False,
 'w1_rhy_w': False,
 'w1_rhy_ia': False,
 'w1_rhy_a': False,
 'w1_rhy_oo': False,
 'w1_rhy_aa': False,
 'w1_ons_v': True,
 'w1_ons_x': False,
 'w1_ons_qh': False,
 'w1_ons_k': False,
 'w1_ons_t': False,
 'w1_ons_txh': False,
 'w1_ons_': False,
 'w1_ons_nt': False,
 'w1_ons_p': False,
 'w1_ons_s': False,
 'w1_ons_y': False,
 'w1_ons_nq': False,
 'w1_ons_n': False,
 'w1_ons_tsh': False,
 'w1_ons_th': False,
 'w1_ons_ny': False,
 'w1_ons_c': False,
 'w1_ons_kh': False,
 'w1_ons_l': False,
 'w1_ons_ts': False,
 'w1_ons_tx': False,
 'w1_ons_h': False,
 'w1_ons_m': False,
 'w1_ons_r': False,
 'w1_ons_nr': False,
 'w1_ons_d': False,
 'w1_ons_z': False,
 'w1

In [205]:
# The raw words of that same first phrase, for comparison with its features.
df.iloc[0]

word1    vuag
word2      ub
word3    vuag
word4      no
Name: 0, dtype: object

# Train Decision Tree
First try the entire dataset, without splitting into train and val

In [206]:
from sklearn.tree import DecisionTreeClassifier

# Unpruned entropy-based tree fit on every row (no held-out data yet).
clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
clf.fit(X, y);


How accurate is this classifier?  -> 100%! This is overfitting, of course

In [207]:
# Accuracy on the same data the tree was fit on (training accuracy).
clf.score(X, y)

1.0

Try split on gini impurity index instead of entropy -- no change

In [208]:
# Same experiment with the Gini criterion; fit() returns the estimator itself.
clf_gini = DecisionTreeClassifier(criterion='gini', random_state=0)
clf_gini.fit(X, y).score(X, y)

1.0

What if we do a train/test split? -> 94.0% - 96.2%, mean 95.4%
With Panphon features: 93.9% - 96.1%, mean 94.9%. Apply feature selection techniques to improve.

In [209]:
from sklearn.model_selection import train_test_split
from scipy.stats import describe

# Repeat a 70/30 split with different seeds and collect held-out accuracy.
N_SPLITS = 50
accs = []
for rnd in range(N_SPLITS):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=rnd)
    clf_train = DecisionTreeClassifier(criterion='entropy', random_state=rnd)
    clf_train.fit(X_train, y_train)
    accs.append(clf_train.score(X_test, y_test))

print(describe(accs))

DescribeResult(nobs=50, minmax=(0.9392838609237156, 0.9605604566683965), mean=0.9494862480539699, variance=2.168261790903827e-05, skewness=0.20993469006447252, kurtosis=-0.5410236734389335)


# Feature Selection
Now that we can get 100% training accuracy with all features, the natural thing to ask is how many features we can remove and still get close to 100% accuracy.
DecisionTreeClassifier conveniently provides the `feature_importances_` attribute, which directly tells us which features the tree never uses. With the 237 one-hot features alone, 103 of the features are completely useless; with the Panphon features included (453 features, as in the run below), 286 of them are.

In [210]:
# Features the fitted tree never split on have an importance of exactly 0.
unused_mask = clf.feature_importances_ == 0
# Skip the first column ('attested'), which is the label rather than a feature.
all_feature_names = expanded_df.columns.to_numpy()[1:]

print("number of features that are completely useless")
print(unused_mask.sum())
print("useless features: ")
print(all_feature_names[unused_mask])

number of features that are completely useless
286
useless features: 
['w1_ton_g' 'w1_ton_m' 'w1_rhy_ua' 'w1_rhy_ai' 'w1_rhy_aw' 'w1_rhy_ee'
 'w1_rhy_w' 'w1_rhy_ia' 'w1_ons_v' 'w1_ons_qh' 'w1_ons_t' 'w1_ons_txh'
 'w1_ons_' 'w1_ons_nt' 'w1_ons_p' 'w1_ons_nq' 'w1_ons_n' 'w1_ons_tsh'
 'w1_ons_th' 'w1_ons_ny' 'w1_ons_c' 'w1_ons_l' 'w1_ons_tx' 'w1_ons_h'
 'w1_ons_m' 'w1_ons_nr' 'w1_ons_np' 'w1_ons_nrh' 'w1_ons_hl' 'w1_ons_ch'
 'w1_ons_hm' 'w1_ons_ntx' 'w1_ons_nts' 'w1_ons_ntsh' 'w1_ons_hn'
 'w1_ons_q' 'w1_ons_dh' 'w1_ons_pl' 'w1_ons_dl' 'w1_ons_ml' 'w1_ons_plh'
 'w1_ons_nk' 'w1_ons_nc' 'w1_ons_npl' 'w1_ons_nch' 'w1_ons_ntxh'
 'w1_ons_f' 'w1_ons_xy' 'w1_ons_ph' 'w1_ons_rh' 'w1_ons_nth' 'w1_ons_dlh'
 'w1_ons_nph' 'w1_ons_ndl' 'w1_ons_nkh' 'w1_ons_nqh' 'w1_ons_hny'
 'w1_+syl' 'w1_0syl' 'w1_0son' 'w1_-son' 'w1_-cons' 'w1_+cont' 'w1_0cont'
 'w1_0delrel' 'w1_+lat' 'w1_0lat' 'w1_+strid' 'w1_-strid' 'w1_-voi'
 'w1_+sg' 'w1_0sg' 'w1_-sg' 'w1_+cg' 'w1_0cg' 'w1_-cg' 'w1_+ant' 'w1_-ant'
 'w1_+cor' 'w1_

In [211]:
from sklearn.feature_selection import SelectFromModel


# Sweep over the number of retained features.  Each printed row is:
#   n_features, accuracy on the full data, mean held-out accuracy over 10 splits
# clf_best ends up as the full-data tree of the setting with the best mean
# held-out accuracy.
# NOTE(review): features are ranked by `clf`, which was fit on all of X, so the
# held-out rows also influenced the selection - the held-out accuracy may be
# optimistic.
clf_best = None
best_accs = 0
for t in (100, 90, 80, 70, 60, 50, 40, 35, 30, 25, 20, 15, 10):
    # threshold=-inf disables the importance cut-off, so exactly the top
    # `max_features` features by importance are kept.
    model = SelectFromModel(clf, prefit=True, threshold=-np.inf, max_features=t)
    X_new = model.transform(X)
    # First field: number of features kept.
    print(X_new.shape[1], end=', ')


    clf_new = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X_new, y)
    # Second field: accuracy of a tree refit on the reduced features, scored on the same data.
    print(f"{clf_new.score(X_new, y):.3f}", end=', ')

    accs = []
    for rnd in range(10):
        X_new_train, X_new_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=rnd)
        clf_train = DecisionTreeClassifier(criterion='entropy', random_state=rnd).fit(X_new_train, y_train)
        accs.append(clf_train.score(X_new_test, y_test))
    # Third field: mean held-out accuracy across the 10 random 70/30 splits.
    mean_accs = np.mean(accs)
    if mean_accs > best_accs:
        best_accs = mean_accs
        clf_best = clf_new
    print(f"{mean_accs:.3f}")



100, 1.000, 0.952
90, 1.000, 0.952
80, 1.000, 0.952
70, 0.999, 0.953
60, 0.999, 0.953
50, 0.998, 0.953
40, 0.990, 0.958
35, 0.990, 0.957
30, 0.987, 0.957
25, 0.984, 0.956
20, 0.979, 0.954
15, 0.952, 0.938
10, 0.907, 0.903


In [221]:
# Quick check of the estimator type.
# NOTE(review): this cell's execution count (221) is out of sequence with its neighbours.
clf.__class__.__name__

'DecisionTreeClassifier'

## Visualize Tree

In [212]:
!pip install pydot
!pip install graphviz
# note: also need to install graphviz on your system: https://www.graphviz.org/
from sklearn.tree import export_graphviz
import pydot
import re



In [213]:
# Number of input features of the best reduced tree.  `n_features_` was
# removed in scikit-learn 1.2 in favour of `n_features_in_`; fall back to the
# old name on versions that predate the new attribute.
clf_best.n_features_in_ if hasattr(clf_best, 'n_features_in_') else clf_best.n_features_

40

In [214]:
# Export the full-feature tree `clf` (not the reduced `clf_best`) to a Graphviz .dot file.
# Feature names are every column after the first one ('attested', the label).
feature_names = expanded_df.columns.to_list()[1:]
# presumably listed in ascending class order (False -> 'FAKE', True -> 'ATT') - confirm
class_names = ['FAKE', 'ATT']

d = 15 # max depth. Use None if unlimited

# `d or ""` leaves the depth out of the file name when d is None.
fname = f'../out/tree_{use_features}_{d or ""}.dot'
export_graphviz(clf, 
                out_file=fname, 
                impurity=False, 
                feature_names=feature_names,
                class_names=class_names,
                max_depth=d)

# Read the .dot file back as text so the node labels can be post-processed
# before the file is rewritten below.
f = pydot.graph_from_dot_file(fname)[0].to_string()
# Optional label clean-ups, currently disabled:
# print(len(f), '\n', f[:1000])
# f = re.sub(r'(\\nvalue = \[.*?\])', '', f)  # get rid of nvalue = [anychar, lazy]
# f = f.replace(' <= 0.5', '?')  # change decision to a question
# f = f.replace('headlabel="True"', 'headlabel="No"')  # change to yes no rather than <=0.5 true false
# f = f.replace('headlabel="False"', 'headlabel="Yes"')
# f = f.replace(R'samples = 1\nclass = ', R'***\n')  # change text of leaf node
# print("============================")
# print(len(f), '\n', f[:1000])

# Overwrite the .dot file with the (possibly edited) text.
with open(fname, 'w') as file:
    file.write(f)

In [215]:
import graphviz
from IPython.display import Image
# Render the .dot file written above to PNG with the `dot` engine; render() returns the output path.
savepath = graphviz.render('dot', 'png', fname)
# Displayed by URL, so the image is linked rather than embedded in the notebook.
Image(url=savepath)


In [216]:
# Show up to 100 rows so the filtered result below is not truncated.
pd.set_option('display.max_rows', 100)
# Phrases whose word4 ends in 'j' and contains 'i', and whose word2 starts with 'k'.
df[(df.word4.str.endswith('j')) & (df.word2.str.startswith('k')) & (df.word4.str.contains('i'))]
# pd.set_option('display.max_rows', 10)

Unnamed: 0,word1,word2,word3,word4
18,tuav,kwv,tuav,tij
24,ua,kwv,ua,tij
32,nws,kwv,nws,tij
37,lawv,kwv,lawv,tij
329,yog,kwv,yog,tij
417,rau,kwv,rau,tij
461,cov,kwv,cov,tij
503,tej,kwv,tej,tij
567,nyias,kwv,nyias,tij
571,tsham,kwv,tsham,tij
