In [1]:
from data_loader import get_task1_conver, get_task2_conver
from utils import dump_jsonl, load_jsonl


import pickle

def load_shap_values(filepath):
  """Read a pickled object from ``filepath`` and return it.

  The file is opened in binary mode and handed to ``pickle.load``.
  Unpickling can execute arbitrary code, so only point this at files
  that were produced by a trusted run.
  """
  with open(filepath, "rb") as handle:
    return pickle.load(handle)

def save_shap_values(filepath, obj):
  """Pickle ``obj`` into ``filepath`` (opened with mode ``'wb'``).

  Returns ``None``; any existing file at ``filepath`` is overwritten.
  """
  with open(filepath, "wb") as handle:
    pickle.dump(obj, handle)

In [2]:
import pandas as pd

In [3]:
from transformers import AutoTokenizer
# Load the tokenizer published under `model_name` and register the markers
# "usr", "sys" and "rep" as additional special tokens.
model_name = "airesearch/wangchanberta-base-att-spm-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The return value of add_special_tokens is kept in `num_added_toks`.
num_added_toks = tokenizer.add_special_tokens({"additional_special_tokens": ["usr", "sys", "rep"]})

## Load Lexicons

In [4]:
from pythainlp.tokenize import word_tokenize
import numpy as np

In [5]:
import json
# Read ../words.json and merge every value collection it holds into one set.
# presumably the file maps a group key to a list of dictionary words — verify.
with open("../words.json", encoding="utf-8") as fin:
    raw = json.load(fin)

thaidict_royal = set()
for entries in raw.values():
    thaidict_royal.update(entries)

In [6]:
# Lexicon records; the next cell iterates them as (key, values) pairs and
# reads values["tags"].
lexicons_arr = load_jsonl("../lexicons.jsonl")

Loaded 25573 records from ../lexicons.jsonl


In [7]:
from collections import defaultdict

# tags          : every tag seen across the kept lexicon entries
# lexicons      : normalised key -> list of its tags (without "cat:" tags)
# lexicons_keys : first tokenised word of a key -> all keys starting with it
tags = set()
lexicons = {}
lexicons_keys = defaultdict(list)

for raw_key, entry in lexicons_arr:
    # Keys of length 0 or 1 are ignored.
    if len(raw_key) <= 1:
        continue

    key = raw_key.lower()
    if key.endswith("rep"):
        # NOTE(review): this removes every "rep" occurrence in the key, not
        # only the trailing one, and a key that is exactly "rep" becomes ""
        # (word_tokenize("") may then yield no words) — confirm intended.
        key = key.replace("rep", "")

    first_word = word_tokenize(key)[0]
    lexicons_keys[first_word].append(key)

    kept_tags = [t for t in entry["tags"] if not t.startswith("cat:")]
    lexicons[key] = kept_tags
    tags.update(kept_tags)

In [8]:
# Number of (lexicon key, tag) assignments whose tag is "transliterated".
cc = sum(
    1
    for entry_tags in lexicons.values()
    for t in entry_tags
    if t == "transliterated"
)
cc

452

## Calculate Shapley

In [9]:
# Table layout for the reported metrics: group title -> {feature key -> row label}.
# The feature keys are the names looked up in the per-conversation feature
# dictionaries; the row labels are the strings written into the LaTeX tables.
# Entries that are commented out are excluded from the tables.
metric_names = {
    "Reference" : {
#         "all": "All words",
        "pertoken": "Average per token"
    },
    "Linguistic Complexity" : {
#         "nunique": "Vocabulary size",
        "nthai": "Thai words",
        "nnotthai": "Non-Thai words",
        "nlongword": "Long words",
        "ndict": "Dictionary words",
        "transliterated": "Transliteration",
    },
    "Pronoun": {
        "pronoun": "All pronoun",
        "pronoun_1st": ">> 1st person pronoun",
        "pronoun_2nd": ">> 2nd person pronoun",
        "pronoun_3rd": ">> 3rd person pronoun",
        "pronoun_misspelling": ">> Pronoun in non-standard spelling",
    },
    
    "Sentence-ending Particles": {
        "particles": "All particles",
        "particles_SARP": ">> Socially-related particles",
        "particles_notSARP": ">> Non-socially-related particles",
        "particles_misspelling": ">> Particle in non-standard spelling",
    },
    
    "Sentiment-related": {
        "sentiment": "Sentiment words",
        "sentiment_positive": ">> Positive words",
        "sentiment_negative": ">> Negative words",
    },
    
    "Spelling Variation": {
        "misspelling": "All spelling variation",
        "misspelling_common": ">> Common misspelt words",
        "misspelling_intention": ">> Morphophonemic variation",
        "misspelling_shorten": ">> Simplified variation",
        "nrepeat": ">> Repeated characters",
        "nemoji": ">> Emoji",
#         "abbr": "Abbreviation",
#         "slang": "Slang",
#         "swear": "Swear words"
    }
}

In [10]:
def map_token_2_words(words, shap_tokens, debug=False):
    """Greedily align SHAP tokens to words and sum the SHAP values per word.

    Args:
        words: sequence of word strings, in order.
        shap_tokens: sequence of ``(token, value)`` pairs, in order.
        debug: when true, print match diagnostics and the final alignment.

    Returns:
        ``(tokens, values)``: two parallel lists. Each entry of ``tokens`` is
        the concatenation of the consecutive SHAP tokens that spelled out a
        word (or a run of merged words); the matching entry of ``values`` is
        the sum of those tokens' values.

    At each step up to ``max_span`` consecutive tokens are concatenated and
    compared against the pending word text:
      * exact match  -> emit the group and move on to the next word;
      * the concatenation starts with the pending text -> append the next
        word to the pending text and retry from the same token;
      * otherwise    -> drop the current token and restart the pending text
        from the current word.
    The loop stops when either the words or the tokens run out.
    """
    tokens = [tok for tok, _ in shap_tokens]
    values = np.array([val for _, val in shap_tokens])

    max_span = 10  # maximum number of tokens tried for a single match

    merged_words = []
    merged_tokens = []
    merged_values = []

    tok_start = 0
    word_pos = 0
    pending = ""
    while word_pos < len(words):
        pending += words[word_pos]
        if tok_start >= len(tokens):
            break

        # Grow a candidate string token by token until it equals `pending`.
        candidate = ""
        tok_end = None
        for pos in range(tok_start, min(tok_start + max_span, len(tokens))):
            candidate += tokens[pos]
            if candidate == pending:
                tok_end = pos + 1
                break

        if tok_end is not None:
            if debug:
                print("MATCHED", pending)
            merged_words.append(pending)
            merged_tokens.append("".join(tokens[tok_start:tok_end]))
            merged_values.append(values[tok_start:tok_end].sum())
            tok_start = tok_end
            pending = ""
            word_pos += 1
            continue

        if debug:
            print("NOT MATCHED", pending, candidate)

        if candidate.startswith(pending):
            # The tokens cover more than this word: merge in the next word.
            word_pos += 1
        else:
            # The current token cannot begin this word: skip the token.
            tok_start += 1
            pending = ""

    if debug:
        print(merged_words)
        print(merged_tokens)

    return merged_tokens, merged_values

def get_shap_lexicons(df, raw_shap_values):
    """Group word-aligned SHAP values by label.

    Args:
        df: DataFrame with ``"label"`` and ``"text"`` columns.
        raw_shap_values: object that supports ``[:, :]`` slicing and whose
            slice exposes ``.data`` (tokens) and ``.values`` (scores).

    Returns:
        dict mapping each distinct label to a list with one
        ``map_token_2_words`` result per row carrying that label.
    """
    label_values = df["label"].unique()

    sliced = raw_shap_values[:, :]
    token_texts = sliced.data
    token_scores = sliced.values

    per_label = {}
    for label in label_values:
        aligned = []
        # NOTE(review): `idx` is the DataFrame index label yielded by
        # iterrows(), and it is used as a position into the SHAP arrays.
        # That is only right when labels equal positions — confirm against
        # how the frame passed in was built (e.g. concat without reindexing).
        for idx, row in df.iterrows():
            if row["label"] != label:
                continue

            # Words of the (re-)preprocessed text, with blank items removed.
            words = [
                piece.strip()
                for piece in word_tokenize(preprocess(row["text"]))
                if len(piece.strip()) > 0
            ]

            # SHAP (token, value) pairs, with blank tokens removed.
            pairs = [
                (tok.strip(), score)
                for tok, score in zip(token_texts[idx], token_scores[idx])
                if len(tok.strip()) > 0
            ]

            aligned.append(map_token_2_words(words, pairs, debug=False))
        per_label[label] = aligned
    return per_label

In [11]:
from pythainlp.util import countthai
import re 
import emoji

def get_lexicon_feats(token, ref_text):
    """Return the list of feature names that apply to ``token``.

    Args:
        token: the word being classified.
        ref_text: text starting at ``token``; a lexicon entry indexed under
            ``token`` only counts when ``ref_text`` starts with that entry.

    Returns:
        A list that always begins with ``"all"``, followed (in this order)
        by ``"nrepeat"``, the tags of matching lexicon entries, ``"ndict"``,
        ``"nlongword"``, ``"nnotthai"`` or ``"nthai"``,
        ``"particles_notSARP"`` and ``"nemoji"`` where applicable.
    """
    feats = ["all"]

    # The literal marker "rep" counts as a repeated-character feature.
    if token == "rep":
        feats.append("nrepeat")

    # Lexicon entries are indexed by their first word; keep the ones whose
    # full key is a prefix of the remaining text.
    for entry_key in lexicons_keys.get(token, ()):
        if ref_text.startswith(entry_key):
            feats.extend(lexicons[entry_key])

    if token in thaidict_royal:
        feats.append("ndict")

    if len(token) > 7:
        feats.append("nlongword")

    if countthai(token) < 50:
        # Not mostly Thai: count it as non-Thai unless it is one of the
        # markers, has no word characters, or is purely numeric.
        stripped = re.sub(r'\W+', '', token)
        is_marker = token in ["usr", "sys", "rep"]
        if not is_marker and len(stripped) > 0 and not stripped.isnumeric():
            feats.append("nnotthai")
    else:
        feats.append("nthai")

    # A particle without the SARP tag is tagged as non-SARP.
    if "particles" in feats and "particles_SARP" not in feats:
        feats.append("particles_notSARP")

    if emoji.emoji_count(token) > 0:
        feats.append("nemoji")

    return feats


In [12]:
def get_all_shap_feats(shap_lexicons):
    """Per-label mean of the per-conversation total |SHAP| for each metric.

    Args:
        shap_lexicons: dict mapping label -> list of ``(tokens, values)``
            pairs, one pair per conversation.

    Returns:
        dict mapping label -> {metric: (mean_total, n_conversations)}, where
        ``mean_total`` averages, over the conversations that contain the
        metric, the sum of absolute values collected for it. Metrics that
        occur in no conversation map to ``(0, 0)``.
    """
    output = {}
    for label, conversations in shap_lexicons.items():
        per_conversation = []
        for tokens, values in conversations:
            bucket = defaultdict(list)
            for pos, (tok, score) in enumerate(zip(tokens, values)):
                remaining = "".join(tokens[pos:])
                for feat_name in get_lexicon_feats(tok, remaining):
                    bucket[feat_name].append(score)

            # The reference metric covers every token of the conversation.
            bucket["pertoken"] = values
            per_conversation.append(bucket)

        summary = {}
        for group in metric_names:
            for metric in metric_names[group]:
                totals = [
                    np.sum(np.abs(np.array(conv[metric])))
                    for conv in per_conversation
                    if metric in conv
                ]

                if len(totals) == 0:
                    summary[metric] = (0, 0)
                    continue

                mean_total = np.mean(np.array(totals))
                summary[metric] = (mean_total, len(totals))

        output[label] = summary
    return output

def get_shap_feats(shap_lexicons):
    """Per-label root-mean-square SHAP value of the tokens of each metric.

    Args:
        shap_lexicons: dict mapping label -> list of ``(tokens, values)``
            pairs, one pair per conversation.

    Returns:
        dict mapping label -> {metric: (rms, n_values)}, where the values of
        a metric are pooled across all conversations of the label before the
        RMS is taken. Metrics with no values map to ``(0, 0)``.
    """
    output = {}
    for label, conversations in shap_lexicons.items():
        per_conversation = []
        for tokens, values in conversations:
            # One feature -> values bucket per conversation.
            bucket = defaultdict(list)
            for pos, (tok, score) in enumerate(zip(tokens, values)):
                remaining = "".join(tokens[pos:])
                for feat_name in get_lexicon_feats(tok, remaining):
                    bucket[feat_name].append(score)

            # The reference metric covers every token of the conversation.
            bucket["pertoken"] = values
            per_conversation.append(bucket)

        summary = {}
        for group in metric_names:
            for metric in metric_names[group]:
                pooled = []
                for conv in per_conversation:
                    if metric in conv:
                        pooled.extend(conv[metric])

                if len(pooled) == 0:
                    summary[metric] = (0, 0)
                    continue

                pooled = np.array(pooled)
                rms = np.sqrt(np.mean(pooled**2))
                summary[metric] = (rms, len(pooled))

        output[label] = summary
    return output

In [13]:
from collections import defaultdict
from pythainlp.tokenize import word_tokenize
import itertools
from tqdm import tqdm

In [14]:
from itertools import groupby
from data_loader import preprocess

def run_preprocess(train, val, test):
    """Apply ``preprocess`` to the ``"text"`` column of each split.

    The three frames are modified in place and returned, in the same order,
    as a ``(train, val, test)`` tuple.
    """
    for frame in (train, val, test):
        frame["text"] = frame["text"].apply(preprocess)
    return train, val, test

In [27]:
def run_lexicons(df, shap_path):
    """Aggregate stored SHAP values by lexicon feature for one dataset.

    Args:
        df: a ``(train, val, test)`` triple of DataFrames. Each frame gets a
            ``"split"`` column added in place.
        shap_path: path of the pickled SHAP values to load.

    Returns:
        ``(shap_feats, all_shap_feats)``: the results of ``get_shap_feats``
        and ``get_all_shap_feats`` on the word-aligned SHAP values.

    Raises:
        AssertionError: if the combined frame and the loaded SHAP values
            differ in length.
    """
    train, val, test = df
    for frame, split_name in ((train, "train"), (val, "val"), (test, "test")):
        frame["split"] = split_name

    # Row order is train, test, val.
    # NOTE(review): the original index labels are kept (no ignore_index),
    # while get_shap_lexicons indexes the SHAP arrays with those labels —
    # confirm the labels line up with the stored SHAP order.
    combined = pd.concat([train, test, val])
    shap_values = load_shap_values(shap_path)

    assert len(combined) == len(shap_values)

    shap_lexicons = get_shap_lexicons(combined, shap_values)
    shap_feats = get_shap_feats(shap_lexicons)
    all_shap_feats = get_all_shap_feats(shap_lexicons)

    return shap_feats, all_shap_feats

In [22]:
# Task1 data, "closeness" target, skipping the "4. Don't like each other" label.
# shap_feats1 holds the (shap_feats, all_shap_feats) pair from run_lexicons.
df = get_task1_conver(
    "../Task1/annotated_conersations.jsonl",
    "closeness",
    skips=["4. Don't like each other"],
    only_user=True,
)
df = run_preprocess(*df)
shap_feats1 = run_lexicons(df, "./ShapleyValues/task1_clse_regressor.pkl")

Loaded 1234 records from ../Task1/annotated_conersations.jsonl
N 1096 60 60


In [57]:
"DONE"

'DONE'

In [58]:
# Task1 data, "authority" target, skipping the "3. Not respect" label.
# shap_feats2 holds the (shap_feats, all_shap_feats) pair from run_lexicons.
df = get_task1_conver(
    "../Task1/annotated_conersations.jsonl",
    "authority",
    skips=["3. Not respect"],
    only_user=True,
)
df = run_preprocess(*df)
shap_feats2 = run_lexicons(df, "./ShapleyValues/task1_auth_regressor.pkl")

Loaded 1234 records from ../Task1/annotated_conersations.jsonl
N 1098 61 61


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


In [59]:
# Task2 data, "closeness" target, skipping the "4. Don't like each other" label.
# shap_feats3 holds the (shap_feats, all_shap_feats) pair from run_lexicons.
df = get_task2_conver(
    "../Task2/annotated/annotated.jsonl",
    "closeness",
    skips=["4. Don't like each other"],
    only_user=True,
)
df = run_preprocess(*df)
shap_feats3 = run_lexicons(df, "./ShapleyValues/task2_clse_regressor.pkl")

Loaded 2486 records from ../Task2/annotated/annotated.jsonl
N 1495 186 186


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


In [60]:
# Task2 data, "authority" target, no labels skipped.
# shap_feats4 holds the (shap_feats, all_shap_feats) pair from run_lexicons.
df = get_task2_conver(
    "../Task2/annotated/annotated.jsonl",
    "authority",
    skips=[],
    only_user=True,
)
df = run_preprocess(*df)
shap_feats4 = run_lexicons(df, "./ShapleyValues/task2_auth_regressor.pkl")

Loaded 2486 records from ../Task2/annotated/annotated.jsonl
N 1876 234 234


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


In [61]:
# Task3 data (read with the Task1 loader), "closeness" target, skipping the
# "4. Don't like each other" label.
# shap_feats5 holds the (shap_feats, all_shap_feats) pair from run_lexicons.
df = get_task1_conver(
    "../Task3/annotated/annotated.jsonl",
    "closeness",
    skips=["4. Don't like each other"],
    only_user=True,
)
df = run_preprocess(*df)
shap_feats5 = run_lexicons(df, "./ShapleyValues/task3_clse_regressor.pkl")

Loaded 1221 records from ../Task3/annotated/annotated.jsonl
N 1090 60 60


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


In [62]:
# Task3 data (read with the Task1 loader), "authority" target, no labels skipped.
# shap_feats6 holds the (shap_feats, all_shap_feats) pair from run_lexicons.
df = get_task1_conver(
    "../Task3/annotated/annotated.jsonl",
    "authority",
    skips=[],
    only_user=True,
)
df = run_preprocess(*df)
shap_feats6 = run_lexicons(df, "./ShapleyValues/task3_auth_regressor.pkl")

Loaded 1221 records from ../Task3/annotated/annotated.jsonl
N 1099 61 61


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


In [63]:
def print_shap(shap_feats, print_labels):
    """Render per-label SHAP feature scores as LaTeX table rows.

    Args:
        shap_feats: dict mapping label -> {metric key: score}. A score may be
            a plain number or a ``(value, count)`` pair as produced by
            ``get_shap_feats``; for a pair only ``value`` is printed.
        print_labels: labels to print, one column per label, in order.

    Returns:
        A string with one row per metric in ``metric_names``. Every group
        except ``"Reference"`` is preceded by a ``\\multicolumn`` title row
        and followed by an empty spacer row; every group ends with
        ``\\hline``. A label missing from ``shap_feats`` prints ``-``, a
        metric missing for a present label prints ``0.00``, and all other
        scores are printed multiplied by 100 with three decimals.
    """
    print_text = ""
    for g in metric_names:
        if g != "Reference":
            # Backslashes are escaped explicitly: "\m" and "\h" are invalid
            # escape sequences in a normal string literal.
            print_text += "\\multicolumn{5}{l}{\\textit{" + g + "}} \\\\" + "\n"

        for m in metric_names[g]:
            s = f"{metric_names[g][m]} "
            for l in print_labels:
                if l not in shap_feats:
                    s += "& - "
                elif m not in shap_feats[l]:
                    s += "& 0.00 "
                else:
                    entry = shap_feats[l][m]
                    # (value, count) pairs cannot be formatted with ".3f";
                    # print the value component.
                    score = entry[0] if isinstance(entry, (tuple, list)) else entry
                    s += f"& {score*100:.3f} "

            s += "\\\\"
            print_text += s + "\n"

        if g != "Reference":
            print_text += "&  & &  & \\\\" + "\n"
        print_text += "\\hline" + "\n"
    return print_text

# Label strings, in column order, for the closeness and authority ("respect") tables.
clse_print_labels = ['1. Close', '2. Know each other', "3. Don't know each other", "4. Don't like each other"]
auth_print_labels = ['0. Very respect', '1. Respect', '2. Normal', '3. Not respect']

In [64]:
# sections = [
#     "Setting 1: Private Conversations with Self-Reported Labels",
#     "Setting 2: Public Conversations with Labels from 3rd Party ",
#     "Setting 3: Private Conversations with Labels from 3rd Party ",
# ]

# table_contents = [
#     (shap_feats1, shap_feats2),
#     (shap_feats3, shap_feats4),
#     (shap_feats5, shap_feats6),
# ]

# printed_text = ""
# for section, (t1, t2) in zip(sections, table_contents):
    
#     printed_text += "\subsection{"+section+"}"
    
#     printed_text += '''
# \subsubsection{Closeness}
# \\begin{longtable}[h]{
#         p{\dimexpr 0.40\linewidth-2\\tabcolsep}|
#         p{\dimexpr 0.15\linewidth-2\\tabcolsep}
#         p{\dimexpr 0.15\linewidth-2\\tabcolsep}
#         p{\dimexpr 0.15\linewidth-2\\tabcolsep}
#         p{\dimexpr 0.15\linewidth-2\\tabcolsep}
#     }
#         \hline

#         Lexical Features & Close & Know each other & Don't know each other &  Don't like each other\\\\
#         \hline
#         \endfirsthead
        
#         \endhead
#             '''
    
#     s = print_shap(t1, clse_print_labels)
#     printed_text += "\n            ".join(s.split("\n"))
#     printed_text += '''
# \end{longtable}
# \clearpage

# '''
#     printed_text += '''
# \subsubsection{Respect}
# \\begin{longtable}[h]{
#         p{\dimexpr 0.40\linewidth-2\\tabcolsep}|
#         p{\dimexpr 0.16\linewidth-2\\tabcolsep}
#         p{\dimexpr 0.15\linewidth-2\\tabcolsep}
#         p{\dimexpr 0.15\linewidth-2\\tabcolsep}
#         p{\dimexpr 0.15\linewidth-2\\tabcolsep}
#     }
#         \hline

#         Lexical Features & Very respect & Respect & Normal &  Not respect\\\\
#         \hline
#         \endfirsthead
        
#         \endhead
#             '''
    
#     s = print_shap(t2, auth_print_labels)
#     printed_text += "\n            ".join(s.split("\n"))
#     printed_text += '''
# \end{longtable}
# \clearpage

# '''


In [65]:
# print(printed_text)

In [66]:
# "particles": "All particles",

In [67]:
"DONE"

'DONE'

#### Print version 2

In [87]:
def outputs_to_dict(outputs):
    """Collapse per-label ``(value, count)`` statistics into one per feature.

    Args:
        outputs: a pair ``(shap_feats, all_shap_feats)``; each element maps
            label -> {feature: (value, count)}.

    Returns:
        ``(coefs, all_coefs)``: for each element of ``outputs``, a dict
        mapping feature -> ``(value, count)`` where ``value`` is the
        count-weighted average of the per-label values and ``count`` is the
        sum of the per-label counts. A feature whose total count is zero
        maps to ``(0, 0)``.
    """
    def pool_over_labels(per_label):
        # Fold the labels in one at a time, keeping a running weighted mean.
        pooled = {}
        for label_stats in per_label.values():
            for feat, (val1, n1) in label_stats.items():
                val2, n2 = pooled.get(feat, (0, 0))
                if n1 + n2 == 0:
                    pooled[feat] = (0, 0)
                else:
                    pooled[feat] = ((val1*n1 + val2*n2)*1.0/(n1+n2), n1 + n2)
        return pooled

    coefs = pool_over_labels(outputs[0])
    all_coefs = pool_over_labels(outputs[1])
    return coefs, all_coefs

# Build the LaTeX longtable for the closeness models (shap_feats1/3/5 are
# shown as Setting 1/2/3). The result is accumulated in `printed_text`.
#
# All LaTeX backslashes are escaped explicitly. Previously "\tabcolsep" sat in
# a normal string literal, so Python turned "\t" into a TAB character and the
# column spec came out as "2<TAB>abcolsep"; the other commands relied on
# invalid escape sequences such as "\h" and "\s".
printed_text = ""
printed_text += "\\subsection{Closeness}" + "\n"
outputs = [
    outputs_to_dict(shap_feats1),
    outputs_to_dict(shap_feats3),
    outputs_to_dict(shap_feats5)
]

printed_text += "\\begin{longtable}[h]{" + "\n"
printed_text += "    p{\\dimexpr 0.40\\linewidth-2\\tabcolsep}|c|c|c|c|c|c|" + "\n"
printed_text += "}" + "\n"
printed_text += "    \\hline" + "\n"
printed_text += "    Lexical Features & " + "\n"
printed_text += "    \\multicolumn{2}{|c|}{Setting 1} & " + "\n"
printed_text += "    \\multicolumn{2}{|c|}{Setting 2} & " + "\n"
printed_text += "    \\multicolumn{2}{|c|}{Setting 3} \\\\" + "\n"
printed_text += "    \\cline{2-7}" + "\n"
printed_text += "    & " + "\n"
printed_text += "    Per \\newline token & Total & " + "\n"
printed_text += "    Per \\newline token & Total & " + "\n"
printed_text += "    Per \\newline token & Total \\\\" + "\n"

printed_text += "    \\hline" + "\n"
printed_text += "    \\endhead" + "\n"
printed_text += "" + "\n"

for g in metric_names:
    if g in ["Conversational Statistics"]:
        continue

    printed_text += "    \\multicolumn{7}{l}{\\textit{" + g + "}} \\\\" + "\n"
    printed_text += "    \\hline" + "\n"

    for m in metric_names[g]:
        s = f"        {metric_names[g][m]} "
        for out, all_out in outputs:
            if m not in out:
                # Each setting spans two columns (per token, total), so a
                # missing metric needs two placeholder cells.
                s += "& - & - "
            else:
                val, n = out[m]
                all_val, _ = all_out[m]
                ref, _ = out["pertoken"]

                # Shade the pair when the per-token score exceeds the
                # setting's "pertoken" reference by more than 0.1 (x100 scale).
                if (val - ref)*100 > 0.1:
                    s += "& \\cellcolor{gray!25} " + f"{val*100:.2f}" + " & \\cellcolor{gray!25} " + f"{all_val*100:.2f}"
                else:
                    s += f"& {val*100:.2f} & {all_val*100:.2f}"

        s += "\\\\"
        printed_text += s + "\n"

    printed_text += "        & & & & & &\\\\" + "\n"
    printed_text += "    \\hline" + "\n"
    printed_text += "" + "\n"

printed_text += "\\label{closeness_wangchanberta_shapley_value}" + "\n"
printed_text += "\\end{longtable}" + "\n"
printed_text += "\\clearpage" + "\n"

In [88]:
# Append the LaTeX longtable for the respect/authority models (shap_feats2/4/6
# are shown as Setting 1/2/3) to the existing `printed_text`.
#
# All LaTeX backslashes are escaped explicitly. Previously "\tabcolsep" sat in
# a normal string literal, so Python turned "\t" into a TAB character; the
# other commands relied on invalid escape sequences such as "\h" and "\s".
printed_text += "\\subsection{Respect}" + "\n"
outputs = [
    outputs_to_dict(shap_feats2),
    outputs_to_dict(shap_feats4),
    outputs_to_dict(shap_feats6)
]

printed_text += "\\begin{longtable}[h]{" + "\n"
printed_text += "    p{\\dimexpr 0.40\\linewidth-2\\tabcolsep}|c|c|c|c|c|c|" + "\n"
printed_text += "}" + "\n"
printed_text += "    \\hline" + "\n"
printed_text += "    Lexical Features & " + "\n"
printed_text += "    \\multicolumn{2}{|c|}{Setting 1} & " + "\n"
printed_text += "    \\multicolumn{2}{|c|}{Setting 2} & " + "\n"
printed_text += "    \\multicolumn{2}{|c|}{Setting 3} \\\\" + "\n"
printed_text += "    \\cline{2-7}" + "\n"
printed_text += "    & " + "\n"
printed_text += "    Per \\newline token & Total & " + "\n"
printed_text += "    Per \\newline token & Total & " + "\n"
printed_text += "    Per \\newline token & Total \\\\" + "\n"

printed_text += "    \\hline" + "\n"
printed_text += "    \\endhead" + "\n"
printed_text += "" + "\n"

for g in metric_names:
    if g in ["Conversational Statistics"]:
        continue

    printed_text += "    \\multicolumn{7}{l}{\\textit{" + g + "}} \\\\" + "\n"
    printed_text += "    \\hline" + "\n"

    for m in metric_names[g]:
        s = f"        {metric_names[g][m]} "
        for out, all_out in outputs:
            if m not in out:
                # Each setting spans two columns (per token, total), so a
                # missing metric needs two placeholder cells.
                s += "& - & - "
            else:
                val, n = out[m]
                all_val, _ = all_out[m]
                ref, _ = out["pertoken"]

                # Shade the pair when the per-token score exceeds the
                # setting's "pertoken" reference by more than 0.1 (x100 scale).
                if (val - ref)*100 > 0.1:
                    s += "& \\cellcolor{gray!25} " + f"{val*100:.2f}" + " & \\cellcolor{gray!25} " + f"{all_val*100:.2f}"
                else:
                    s += f"& {val*100:.2f} & {all_val*100:.2f}"

        s += "\\\\"
        printed_text += s + "\n"

    printed_text += "        & & & & & &\\\\" + "\n"
    printed_text += "    \\hline" + "\n"
    printed_text += "" + "\n"
printed_text += "\\label{respect_wangchanberta_shapley_value}" + "\n"
printed_text += "\\end{longtable}" + "\n"

In [89]:
print(printed_text)

\subsection{Closeness}
\begin{longtable}[h]{
    p{\dimexpr 0.40\linewidth-2	abcolsep}|c|c|c|c|c|c|
}
    \hline
    Lexical Features & 
    \multicolumn{2}{|c|}{Setting 1} & 
    \multicolumn{2}{|c|}{Setting 2} & 
    \multicolumn{2}{|c|}{Setting 3} \\
    \cline{2-7}
    & 
    Per \newline token & Total & 
    Per \newline token & Total & 
    Per \newline token & Total \\
    \hline
    \endhead

    \multicolumn{7}{l}{\textit{Reference}} \\
    \hline
        Average per token & 1.97 & 77.89& 2.73 & 37.52& 2.65 & 108.07\\
        & & & & & &\\
    \hline

    \multicolumn{7}{l}{\textit{Linguistic Complexity}} \\
    \hline
        Thai words & \cellcolor{gray!25} 2.09 & \cellcolor{gray!25} 68.47& 2.74 & 40.69& 2.65 & 90.46\\
        Non-Thai words & 2.06 & 3.40& \cellcolor{gray!25} 3.25 & \cellcolor{gray!25} 6.48& 2.71 & 4.17\\
        Long words & \cellcolor{gray!25} 2.43 & \cellcolor{gray!25} 10.90& 2.75 & 7.71& \cellcolor{gray!25} 2.86 & \cellcolor{gray!25} 13.25\\
        Dict