In [1]:
from data_loader import get_task1_conver, get_task2_conver
from utils import dump_jsonl, load_jsonl


import pickle

def load_shap_values(filepath):
  """Read and return the object pickled at ``filepath``.

  NOTE(review): ``pickle.load`` can execute arbitrary code while loading, so
  only point this at files written by a trusted run of this project.
  """
  with open(filepath, 'rb') as handle:
    return pickle.load(handle)

def save_shap_values(filepath, obj):
  """Pickle ``obj`` into ``filepath``, replacing any existing file."""
  with open(filepath, 'wb') as handle:
    pickle.dump(obj, handle)

In [2]:
import pandas as pd

In [3]:
from transformers import AutoTokenizer
# Name of the pretrained checkpoint whose tokenizer is loaded below.
model_name = "airesearch/wangchanberta-base-att-spm-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Register "usr", "sys" and "rep" as additional special tokens; the value
# returned by add_special_tokens is kept in num_added_toks.
num_added_toks = tokenizer.add_special_tokens({"additional_special_tokens": ["usr", "sys", "rep"]})

## Load Lexicons

In [5]:
from pythainlp.tokenize import word_tokenize
import numpy as np

In [9]:
import json
# Gather every element of every value stored in ../words.json into one set.
with open("../words.json", encoding="utf-8") as fin:
    raw = json.load(fin)
thaidict_royal = {entry for key in raw for entry in raw[key]}

In [10]:
# Lexicon records read from disk; the loop that builds `lexicons` unpacks each
# record as a (key, values) pair and reads values["tags"].
lexicons_arr = load_jsonl("../lexicons.jsonl")

Loaded 25573 records from ../lexicons.jsonl


In [11]:
from collections import defaultdict

# tags:          every tag seen in the lexicon file that is not a "cat:" tag
# lexicons:      normalised key -> list of its non-"cat:" tags
# lexicons_keys: first word-token of a key -> keys that start with that token
tags = set()
lexicons = {}
lexicons_keys = defaultdict(list)

for key, values in lexicons_arr:
    # Entries of at most one character are ignored.
    if len(key) <= 1:
        continue

    key = key.lower()
    if key.endswith("rep"):
        # Drop only the trailing "rep" marker; str.replace would also delete
        # any earlier occurrence of "rep" inside the key.
        key = key[:-len("rep")]

    w = word_tokenize(key)
    if not w:
        # Nothing is left once the marker is removed, so there is no first
        # token to index the entry under.
        continue

    # Several raw entries can normalise to the same key.  Register the key
    # once so that get_lexicon_feats does not add its tags more than once.
    if key not in lexicons:
        lexicons_keys[w[0]].append(key)

    tag = [t for t in values["tags"] if not t.startswith("cat:")]
    lexicons[key] = tag
    tags.update(tag)

## Calculate Shapley Values

In [12]:
# Table layout: section title -> {feature key -> row caption}.
# get_shap_feats reports one value per feature key and print_shap emits one
# table row per key, in the order written here.  "all", "pertoken" and
# "nrepeat" are produced directly by the code; the remaining keys are
# presumably tag names from the lexicon file -- TODO confirm against the data.
metric_names = {
    "Reference" : {
        "all": "All words",
        "pertoken": "Average per token"
    },
    
    "Function Words": {

        "pronoun": "Pronoun",
        "pronoun_1st": ">> 1st person pronoun",
        "pronoun_2nd": ">> 2nd person pronoun",
        "pronoun_3rd": ">> 3rd person pronoun",
        "pronoun_misspelling": ">> Pronoun in non-standard spelling",

        "particles": "Particles",
        "particles_SARP": ">> Socially-related particles",
        "particles_misspelling": ">> Particle in non-standard spelling",
    },
    
    "Sentiment-related": {
    
        "sentiment": "Sentiment words",
        "sentiment_positive": ">> Positive words",
        "sentiment_negative": ">> Negative words",
    },
    
    "Internet Language": {
        "misspelling": "Spelling variation",
        "misspelling_common": ">> Common misspelt words",
        "misspelling_intention": ">> Semantic variation",
        "misspelling_shorten": ">> Simplified variation",
        "nrepeat": ">> Repeated characters",

        "abbr": "Abbreviation",
        "slang": "Slang",
        "swear": "Swear words",
        "transliterated": "Transliteration",
    }
}

In [13]:
def map_token_2_words(words, shap_tokens, debug=False, windows=10):
    """Re-group SHAP tokens so that they line up with ``words``.

    Both sequences are walked left to right.  For the current word, up to
    ``windows`` consecutive tokens are concatenated until the text equals the
    word.  When that fails, either the next word is appended to the current
    one (the token text starts with the word, so the tokeniser produced a
    coarser split) or the current token is skipped.

    Args:
        words: word strings in text order.
        shap_tokens: ``(token_text, shap_value)`` pairs in text order.
        debug: when true, print a trace of the alignment.
        windows: maximum number of consecutive tokens joined while looking
            for a match.  The default, 10, is the value that used to be
            hard-coded.

    Returns:
        ``(newtokens, newvalues)``: for every matched span, the joined token
        text and the sum of the SHAP values inside the span.  Words and
        tokens that could not be aligned are left out.
    """
    tokens = [tok for tok, _ in shap_tokens]
    values = np.array([val for _, val in shap_tokens])

    newwords = []
    newtokens = []
    newvalues = []

    sidx = 0  # index of the first token that has not been consumed yet
    widx = 0  # index of the word currently being matched
    w = ""    # text of the word(s) merged so far for the current match

    # Stop as soon as either the words or the tokens are exhausted.
    while widx < len(words) and sidx < len(tokens):
        w += words[widx]

        # Grow a candidate span of tokens until it spells out `w`.
        s = ""
        matched = False
        for tidx in range(sidx, min(sidx + windows, len(tokens))):
            s += tokens[tidx]
            if s == w:
                matched = True
                break

        if matched:
            if debug:
                print("MATCHED", w)
            newwords.append(w)
            newtokens.append("".join(tokens[sidx:tidx + 1]))
            newvalues.append(values[sidx:tidx + 1].sum())
            sidx = tidx + 1
            w = ""
            widx += 1
            continue

        if debug:
            print("NOT MATCHED", w, s)

        if s.startswith(w):
            # The tokens cover more text than `w`: merge the next word in.
            widx += 1
        else:
            # The token at `sidx` does not belong to this word: drop it and
            # start matching the current word afresh.
            sidx += 1
            w = ""

    if debug:
        print(newwords)
        print(newtokens)

    return newtokens, newvalues


def get_lexicon_feats(token, ref_text):
    """Return the feature names triggered by ``token``.

    ``token`` is looked up in ``lexicons_keys``.  When it is absent the result
    is an empty list.  Otherwise the result starts with ``"all"`` and is
    followed by the tags of every candidate key that ``ref_text`` starts with.
    """
    if token in lexicons_keys:
        hits = [lex for lex in lexicons_keys[token] if ref_text.startswith(lex)]
        return ["all"] + [tag for lex in hits for tag in lexicons[lex]]
    return []

def get_shap_lexicons(df, raw_shap_values):
    """Align each row's SHAP tokens with its words, grouped by label.

    Args:
        df: frame with ``"label"`` and ``"text"`` columns.
        raw_shap_values: object that supports ``[:, :]`` and whose slice
            exposes ``.data`` and ``.values``.
            NOTE(review): assumed to hold one entry per row of ``df`` in the
            same row order -- confirm against the code that produced it.

    Returns:
        dict mapping every distinct label to a list with one
        ``map_token_2_words`` result per row carrying that label.
    """
    explanation = raw_shap_values[:, :]
    shap_data = explanation.data
    shap_values = explanation.values

    shap_lexicons = {}
    for label in df["label"].unique():
        feats = []
        # Use the row position, not the index label yielded by iterrows():
        # index labels are not guaranteed to be positions (for example after
        # pd.concat without ignore_index), which would pair a row with the
        # SHAP entry of a different row.
        for pos, (_, row) in enumerate(df.iterrows()):
            if row["label"] != label:
                continue

            words = word_tokenize(preprocess(row["text"]))
            words = [w.strip() for w in words if len(w.strip()) > 0]

            # Drop tokens that are empty once surrounding whitespace is removed.
            shap_tokens = [
                (tok.strip(), val)
                for tok, val in zip(shap_data[pos], shap_values[pos])
                if len(tok.strip()) > 0
            ]
            feats.append(map_token_2_words(words, shap_tokens, debug=False))
        shap_lexicons[label] = feats
    return shap_lexicons

In [14]:
def get_shap_feats(shap_lexicons):
    """Aggregate per-sample SHAP values into one score per feature and label.

    For every sample the SHAP value of each token is added to every feature
    returned by ``get_lexicon_feats`` for that token (plus ``"nrepeat"`` when
    the token is ``"rep"``), and ``"pertoken"`` is set to the mean SHAP value
    over the sample's tokens.  For every feature listed in ``metric_names``
    the reported score is the root mean square of the per-sample values over
    the samples in which the feature occurred, or 0 when it never occurred.

    Args:
        shap_lexicons: dict mapping a label to a list of ``(tokens, values)``
            pairs.

    Returns:
        dict mapping each label to ``{feature key: score}``.
    """
    output = {}
    for label in shap_lexicons:
        all_shap_feats = []
        for tokens, values in shap_lexicons[label]:
            shap_feats = defaultdict(int)
            for tidx, (t, v) in enumerate(zip(tokens, values)):
                # The lexicon lookup needs the text from this token onwards.
                feats = get_lexicon_feats(t, "".join(tokens[tidx:]))

                if t == "rep":
                    feats.append("nrepeat")

                for f in feats:
                    shap_feats[f] += v

            # A sample with no aligned tokens has no per-token average;
            # leaving the key unset keeps it out of the statistic below
            # instead of dividing by zero.
            if len(values) > 0:
                shap_feats["pertoken"] = sum(values) / len(values)
            all_shap_feats.append(shap_feats)

        rms_feats = {}
        for g in metric_names:
            for m in metric_names[g]:
                # Only samples in which the feature actually occurred count.
                observed = [feats[m] for feats in all_shap_feats if m in feats]

                if len(observed) == 0:
                    rms_feats[m] = 0
                    continue

                observed = np.array(observed)
                rms_feats[m] = np.sqrt(np.mean(observed**2))

        output[label] = rms_feats
    return output

In [15]:
from collections import defaultdict
from pythainlp.tokenize import word_tokenize
import itertools
from tqdm import tqdm

In [17]:
from itertools import groupby
from data_loader import preprocess

def run_preprocess(train, val, test):
    """Apply ``preprocess`` to the ``"text"`` column of each split.

    The three frames are modified in place and returned in the order given.
    """
    for frame in (train, val, test):
        frame["text"] = frame["text"].apply(preprocess)
    return train, val, test

In [25]:
def run_lexicons(df, shap_path):
    """Compute the per-label lexical SHAP features for one dataset.

    Args:
        df: ``(train, val, test)`` frames; each gets a ``"split"`` column
            added in place.
        shap_path: pickle file holding the SHAP values for the frames
            concatenated in train, test, val order.

    Returns:
        The result of ``get_shap_feats`` for the concatenated frame.
    """
    train, val, test = df
    for frame, split_name in ((train, "train"), (val, "val"), (test, "test")):
        frame["split"] = split_name

    combined = pd.concat([train, test, val])
    shap_values = load_shap_values(shap_path)

    # One stored explanation is expected per row.
    assert len(combined) == len(shap_values)

    return get_shap_feats(get_shap_lexicons(combined, shap_values))

In [27]:
# Setting 1, closeness: load the splits, preprocess the text, aggregate SHAP values.
df = get_task1_conver(
    "../Task1/annotated_conersations.jsonl",
    "closeness",
    skips=["4. Don't like each other"],
    only_user=True,
)
df = run_preprocess(*df)
shap_feats1 = run_lexicons(df, "./ShapleyValues/task1_clse.pkl")

Loaded 1234 records from ../Task1/annotated_conersations.jsonl
N 1096 60 60


In [28]:
# Setting 1, authority (respect): same pipeline with the "3. Not respect" label skipped.
df = get_task1_conver(
    "../Task1/annotated_conersations.jsonl",
    "authority",
    skips=["3. Not respect"],
    only_user=True,
)
df = run_preprocess(*df)
shap_feats2 = run_lexicons(df, "./ShapleyValues/task1_auth.pkl")

Loaded 1234 records from ../Task1/annotated_conersations.jsonl
N 1098 61 61


In [29]:
# Setting 2, closeness: load the splits, preprocess the text, aggregate SHAP values.
df = get_task2_conver(
    "../Task2/annotated/annotated.jsonl",
    "closeness",
    skips=["4. Don't like each other"],
    only_user=True,
)
df = run_preprocess(*df)
shap_feats3 = run_lexicons(df, "./ShapleyValues/task2_clse.pkl")

Loaded 2463 records from ../Task2/annotated/annotated.jsonl
N 1495 186 186


In [30]:
# Setting 2, authority (respect): no label is skipped.
df = get_task2_conver(
    "../Task2/annotated/annotated.jsonl",
    "authority",
    skips=[],
    only_user=True,
)
df = run_preprocess(*df)
shap_feats4 = run_lexicons(df, "./ShapleyValues/task2_auth.pkl")

Loaded 2463 records from ../Task2/annotated/annotated.jsonl
N 1642 205 205


In [31]:
# Setting 3, closeness.
# NOTE(review): the Task3 file is read with get_task1_conver -- presumably it
# shares the Task1 format; confirm this is intended.
df = get_task1_conver(
    "../Task3/annotated/annotated.jsonl",
    "closeness",
    skips=["4. Don't like each other"],
    only_user=True,
)
df = run_preprocess(*df)
shap_feats5 = run_lexicons(df, "./ShapleyValues/task3_clse.pkl")

Loaded 1221 records from ../Task3/annotated/annotated.jsonl
N 1090 60 60


In [32]:
# Setting 3, authority (respect): no label is skipped.
# NOTE(review): the Task3 file is read with get_task1_conver -- presumably it
# shares the Task1 format; confirm this is intended.
df = get_task1_conver(
    "../Task3/annotated/annotated.jsonl",
    "authority",
    skips=[],
    only_user=True,
)
df = run_preprocess(*df)
shap_feats6 = run_lexicons(df, "./ShapleyValues/task3_auth.pkl")

Loaded 1221 records from ../Task3/annotated/annotated.jsonl
N 1099 61 61


In [48]:
def print_shap(shap_feats, print_labels):
    """Render the body rows of one LaTeX table.

    Args:
        shap_feats: ``{label: {feature key: score}}``.
        print_labels: labels to print, one column each, in column order.

    Returns:
        A string with one line per row, each terminated by a newline.  Every
        group of ``metric_names`` other than "Reference" starts with a
        ``\\multicolumn`` title row and ends with an empty spacer row; every
        group is closed by ``\\hline``.  A label missing from ``shap_feats``
        is printed as ``-`` and a feature missing for a label as ``0.00``.
    """
    # All backslashes are written as "\\" so that no literal relies on
    # Python passing an unrecognised escape such as "\m" or "\h" through
    # unchanged, which is deprecated.
    lines = []
    for group, metrics in metric_names.items():
        titled = group != "Reference"
        if titled:
            lines.append("\\multicolumn{5}{l}{\\textit{" + group + "}} \\\\")

        for metric, caption in metrics.items():
            cells = [f"{caption} "]
            for label in print_labels:
                if label not in shap_feats:
                    cells.append("& - ")
                elif metric not in shap_feats[label]:
                    cells.append("& 0.00 ")
                else:
                    cells.append(f"& {shap_feats[label][metric]:.2f} ")
            cells.append("\\\\")
            lines.append("".join(cells))

        if titled:
            lines.append("&  & &  & \\\\")
        lines.append("\\hline")
    return "".join(line + "\n" for line in lines)

# Column order of the closeness and respect tables.  print_shap prints "-" in
# the column of any label that is missing from the results it is given.
clse_print_labels = ['1. Close', '2. Know each other', "3. Don't know each other", "4. Don't like each other"]
auth_print_labels = ['0. Very respect', '1. Respect', '2. Normal', '3. Not respect']

In [None]:
# One LaTeX subsection title per experimental setting, in the same order as
# table_contents below.
sections = [
    "Setting 1: Private Conversations with Self-Reported Labels",
    "Setting 2: Public Conversations with Labels from 3rd Party ",
    "Setting 3: Private Conversations with Labels from 3rd Party ",
]

# (closeness results, respect results) for each setting.
table_contents = [
    (shap_feats1, shap_feats2),
    (shap_feats3, shap_feats4),
    (shap_feats5, shap_feats6),
]

# The LaTeX fragments are raw strings: every backslash reaches the output as
# written, and no literal depends on Python passing unrecognised escapes such
# as "\s" or "\d" through unchanged (deprecated behaviour).
printed_text = ""
for section, (t1, t2) in zip(sections, table_contents):

    printed_text += r"\subsection{" + section + "}"

    printed_text += r'''
\subsubsection{Closeness}
\begin{longtable}[h]{
        p{\dimexpr 0.40\linewidth-2\tabcolsep}|
        p{\dimexpr 0.15\linewidth-2\tabcolsep}
        p{\dimexpr 0.15\linewidth-2\tabcolsep}
        p{\dimexpr 0.15\linewidth-2\tabcolsep}
        p{\dimexpr 0.15\linewidth-2\tabcolsep}
    }
        \hline

        Lexical Features & Close & Know each other & Don't know each other &  Don't like each other\\
        \hline
        \endfirsthead
        
        \endhead
            '''

    # Indent the table rows to the depth of the surrounding LaTeX source.
    s = print_shap(t1, clse_print_labels)
    printed_text += "\n            ".join(s.split("\n"))
    printed_text += r'''
\end{longtable}
\clearpage

'''
    # NOTE(review): the second column below is 0.16\linewidth whereas the
    # closeness table uses 0.15 -- kept as is; confirm whether it is intended.
    printed_text += r'''
\subsubsection{Respect}
\begin{longtable}[h]{
        p{\dimexpr 0.40\linewidth-2\tabcolsep}|
        p{\dimexpr 0.16\linewidth-2\tabcolsep}
        p{\dimexpr 0.15\linewidth-2\tabcolsep}
        p{\dimexpr 0.15\linewidth-2\tabcolsep}
        p{\dimexpr 0.15\linewidth-2\tabcolsep}
    }
        \hline

        Lexical Features & Very respect & Respect & Normal &  Not respect\\
        \hline
        \endfirsthead
        
        \endhead
            '''

    s = print_shap(t2, auth_print_labels)
    printed_text += "\n            ".join(s.split("\n"))
    printed_text += r'''
\end{longtable}
\clearpage

'''

# Uncomment to emit the generated LaTeX source:
# print(printed_text)