In [11]:
import sys
sys.path.append("..")

import pandas as pd
import numpy as np
import csv
from libraries.hmong_rpa.rpa_regex import RPA_SYLLABLE as rpa

In [2]:
# Load the extracted four-word phrases (columns word1..word4, as shown in the frame displayed below).
df = pd.read_csv("../scripts/elabs_extracted.csv", quoting=csv.QUOTE_ALL)

In [3]:
# Preview the loaded frame (the notebook renders a truncated head/tail view).
df

Unnamed: 0,word1,word2,word3,word4
0,vuag,ub,vuag,no
1,xaiv,ntsej,xaiv,muag
2,qhov,phem,qhov,zoo
3,kev,neej,kev,tsav
4,tej,nom,tej,tswv
...,...,...,...,...
3248,khu,mob,khu,nkees
3249,cam,mus,cam,los
3250,me,tes,me,taw
3251,dig,lwj,dig,liam


# Word order classification

Before training anything though, how would a rule-based classifier do?

Based on the A -> D -> B -> C and 2 -> 1 orderings, the tones should be ordered like this:
[j, b, m, s, s/g, v, g, 0]

We have to ignore the B2 split here and just work with [j, b, m, s, v, g, 0]. (The code below ranks `d` together with `m`.)


In [87]:
def add_rule_prediction(df0):
    """Score each phrase with the tone-order rule and store it in ``rule_pred``.

    Every tone letter has a rank (see ``tone_rank``).  For each row the rank of
    ``word2``'s tone is subtracted from the rank of ``word4``'s tone and the
    difference is reduced to its sign:

    *  1 -> ``word4``'s tone ranks after ``word2``'s
    *  0 -> both tones share a rank (tie)
    * -1 -> ``word4``'s tone ranks before ``word2``'s

    The tone is whatever the ``ton`` group of the ``rpa`` syllable regex
    captures.  ``df0`` is left untouched; a copy with the extra column is
    returned.
    """
    tone_rank = {'j': 1,
                 'b': 2,
                 'm': 3, 'd': 3,
                 's': 4,
                 'v': 5,
                 'g': 6,
                 '' : 7}

    def rank_of(syllable):
        # Rank of the tone captured by the syllable regex.
        return tone_rank[rpa.match(syllable).group("ton")]

    def direction(delta):
        # Collapse a rank difference to -1, 0 or 1.
        if delta == 0:
            return 0
        return 1 if delta > 0 else -1

    scored = df0.copy()
    rank_gap = scored['word4'].apply(rank_of) - scored['word2'].apply(rank_of)
    scored['rule_pred'] = rank_gap.apply(direction)
    return scored

In [93]:
# Tally rule outcomes as {sign: count}; the reporting cell below reads keys 1, 0 and -1.
res = add_rule_prediction(df)['rule_pred'].value_counts().to_dict()

In [97]:
# Share of phrases per rule outcome.  value_counts() omits outcomes that never
# occur, so a missing key is treated as a zero count instead of raising KeyError.
n_phrases = len(df)
frac_correct = res.get(1, 0) / n_phrases
frac_tie = res.get(0, 0) / n_phrases
frac_incorrect = res.get(-1, 0) / n_phrases

print(f'Correct: {frac_correct}')
print(f'Tie: {frac_tie}')
print(f'Incorrect: {frac_incorrect}')
print()
# On a tie the rule has no preference, so a coin flip gets half of the ties right.
print(f'Correct with random guess: {frac_correct + frac_tie / 2}')
print(f'Incorrect with random guess: {frac_incorrect + frac_tie / 2}')

Correct: 0.8383031048263142
Tie: 0.09007070396557024
Incorrect: 0.07162619120811559

Correct with random guess: 0.8833384568090993
Incorrect with random guess: 0.1166615431909007


## Prepare one hot data

Add these boolean columns to the data, one set for each word i = 1, 2, 3, 4 (`0` stands for the unmarked tone):
- wi_ton_b
- wi_ton_0
- wi_ton_s
- wi_ton_j
- wi_ton_v
- wi_ton_m
- wi_ton_g


In [32]:
def add_unattested_data(df0):
    """Return the attested phrases followed by their swapped counterparts.

    For every A-B-A-C row in ``df0`` an A-C-A-B row is generated by exchanging
    ``word2`` and ``word4``.  Original rows are labelled ``attested=True`` and
    the generated rows ``attested=False``.

    Parameters
    ----------
    df0 : pandas.DataFrame
        Expected to contain ``word2`` and ``word4`` columns.  Not modified.

    Returns
    -------
    pandas.DataFrame
        The original rows followed by the swapped rows, with a fresh
        ``0..2n-1`` index and an added boolean ``attested`` column.
    """
    attested = df0.copy()
    attested['attested'] = True
    # for each ABAC phrase, add ACAB phrase: swapping the column *labels* is
    # enough because the concatenation below aligns on column names.
    unattested = attested.rename(columns={'word2': 'word4', 'word4': 'word2'})
    unattested['attested'] = False
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    return pd.concat([attested, unattested], ignore_index=True)

def add_onehot_features(df0):
    """Return a copy of ``df0`` with boolean tone indicator columns appended.

    For each word position ``i`` in 1..4, seven columns ``w{i}_ton_{t}`` are
    added in the order ``b, 0, s, j, v, m, g``:

    * ``w{i}_ton_0`` is True when the ``ton`` group of the ``rpa`` syllable
      regex is empty for ``word{i}``;
    * every other column is True when ``word{i}`` ends with that tone letter.
    """
    out = df0.copy()
    for idx in range(1, 5):
        words = out[f'word{idx}']
        for tone in ('b', '0', 's', 'j', 'v', 'm', 'g'):
            column = f'w{idx}_ton_{tone}'
            if tone != '0':
                out[column] = words.str.endswith(tone)
            else:
                # The unmarked tone has no final letter to test, so ask the regex.
                out[column] = words.apply(lambda syl: rpa.match(syl).group("ton")=='')
    return out

In [33]:
# Build the modelling frame: attested phrases plus their swapped counterparts,
# one-hot tone columns added, and the raw word columns dropped.
expanded_df = add_onehot_features(add_unattested_data(df)).drop(columns=['word1', 'word2', 'word3', 'word4'])
expanded_df

Unnamed: 0,attested,w1_ton_b,w1_ton_0,w1_ton_s,w1_ton_j,w1_ton_v,w1_ton_m,w1_ton_g,w2_ton_b,w2_ton_0,...,w3_ton_v,w3_ton_m,w3_ton_g,w4_ton_b,w4_ton_0,w4_ton_s,w4_ton_j,w4_ton_v,w4_ton_m,w4_ton_g
0,True,False,False,False,False,False,False,True,True,False,...,False,False,True,False,True,False,False,False,False,False
1,True,False,False,False,False,True,False,False,False,False,...,True,False,False,False,False,False,False,False,False,True
2,True,False,False,False,False,True,False,False,False,False,...,True,False,False,False,True,False,False,False,False,False
3,True,False,False,False,False,True,False,False,False,False,...,True,False,False,False,False,False,False,True,False,False
4,True,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6501,False,False,True,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
6502,False,False,False,False,False,False,True,False,False,False,...,False,True,False,False,False,True,False,False,False,False
6503,False,False,True,False,False,False,False,False,False,True,...,False,False,False,False,False,True,False,False,False,False
6504,False,False,False,False,False,False,False,True,False,False,...,False,False,True,False,False,False,True,False,False,False


In [40]:
# Features: every column except the label; target: the `attested` column.
X = expanded_df.drop(columns=['attested']).to_numpy()
y = expanded_df['attested'].to_numpy()
print(X.shape, y.shape)

(6506, 28) (6506,)


## Train Decision Tree
First try the entire dataset, without splitting into train and val

In [41]:
from sklearn.tree import DecisionTreeClassifier

# Fit on the full dataset (no held-out split); random_state is fixed so the fit is repeatable.
clf = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X, y)


How accurate is this classifier on the data it was trained on?  -> 88.9%

In [43]:
# Accuracy on the same data the tree was fitted on (training accuracy, not held-out).
clf.score(X, y)

0.8885644020903781

Try splitting on Gini impurity instead of entropy -- no change in training accuracy

In [76]:
# Same data and seed as the entropy tree; only the split criterion differs.
clf_gini = DecisionTreeClassifier(criterion='gini', random_state=0).fit(X, y)
clf_gini.score(X, y)

0.8885644020903781

What if we do a train/test split? Over 50 random 70/30 splits, test accuracy ranges from 85.3% to 87.8% (mean 86.7%)

In [79]:
from sklearn.model_selection import train_test_split
from scipy.stats import describe

# Repeat a 70/30 train/test split with 50 different seeds and collect the
# held-out accuracy of a tree fitted on each training portion.
accs = []
for rnd in range(50):
    # The same seed drives both the split and the tree.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=rnd)
    clf_train = DecisionTreeClassifier(criterion='entropy', random_state=rnd).fit(X_train, y_train)
    accs.append(clf_train.score(X_test, y_test))

# Summary statistics (count, min/max, mean, variance, ...) of the 50 accuracies.
print(describe(accs))

DescribeResult(nobs=50, minmax=(0.8534836065573771, 0.8775614754098361), mean=0.8672438524590163, variance=3.100090101547201e-05, skewness=-0.2524292005013592, kurtosis=-0.34909426134254007)


## Visualize Tree

In [69]:
# Derive the feature names the same way X was built (every column except the
# label) instead of relying on 'attested' happening to be the first column.
feature_names = expanded_df.drop(columns=['attested']).columns.to_list()
# Class names are given in ascending label order: False (swapped phrase) first, True (attested) second.
class_names = ['FAKE', 'ATT']

In [70]:
!pip install pydot
!pip install graphviz
# note: also need to install graphviz on your system: https://www.graphviz.org/
from sklearn.tree import export_graphviz
import pydot
import re



In [73]:
d = None  # max depth of the exported tree (None exports the full tree)

# One definition of the raw DOT path, shared by the export and the re-read.
PATH = f'../out/tree{d or ""}.dot'

export_graphviz(
    clf,
    out_file=PATH,
    impurity=False,
    feature_names=feature_names,
    class_names=class_names,
    max_depth=d,
)

# Post-process the DOT text so the rendered tree reads as yes/no questions.
f = (
    # drop the "value = [...]" line from every node (lazy match up to the first ']')
    re.sub(r'(\\nvalue = \[.*?\])', '', pydot.graph_from_dot_file(PATH)[0].to_string())
    # turn the threshold test into a question about the feature
    .replace(' <= 0.5', '?')
    # "<= 0.5 is True" means the feature is off, i.e. the answer is "No"
    .replace('headlabel="True"', 'headlabel="No"')
    .replace('headlabel="False"', 'headlabel="Yes"')
    # change text of leaf nodes that hold a single sample
    .replace(R'samples = 1\nclass = ', R'***\n')
)

with open(f'../out/tree_modified{d or ""}.dot', 'w') as file:
    file.write(f)

In [74]:
import graphviz
from IPython.display import Image, display

# Render both DOT files to PNG.  The raw tree path carries the same max-depth
# suffix as the modified one, so it matches the file exported for the current `d`.
savepath = graphviz.render('dot', 'png', f'../out/tree{d or ""}.dot')
savepath_mod = graphviz.render('dot', 'png', f'../out/tree_modified{d or ""}.dot')

# Only a cell's last expression is shown automatically, so display the first image explicitly.
display(Image(url=savepath))
Image(url=savepath_mod)