## Load Conversations

In [178]:
import sys
sys.path.append('.')

import pandas as pd
from utils import load_jsonl, dump_jsonl
# import numpy as np

In [179]:
def get_conversations(in_dir, col_label, skips=None):
    """Load annotated conversations and flatten them into one row per chunk.

    Parameters
    ----------
    in_dir : str
        Path handed to ``load_jsonl`` (despite the name, callers in this
        notebook pass a ``.jsonl`` file path).
    col_label : str
        Key of each conversation record that holds the label to keep.
    skips : iterable, optional
        Label values to drop. ``None`` (the default) drops nothing.

    Returns
    -------
    pandas.DataFrame
        One row per chunk of at most 100 messages with the columns
        ``user`` (texts of the first speaker), ``sys`` (texts of every other
        speaker), ``label`` and ``nturn`` (number of messages in the chunk).
    """
    skips = () if skips is None else skips
    conversations = load_jsonl(f"{in_dir}")

    def to_message_str(messages, users):
        # Split the message texts by role: "SYS" texts first, the rest second.
        s = []
        u = []
        for m in messages:
            if users[m['user_id']] == "SYS":
                s.append(m['text'])
            else:
                u.append(m['text'])

        return s, u

    chunk_size = 100
    newdata = []
    for row in conversations:
        messages = row["messages"]
        messages.sort(key=lambda x: x["date_created"], reverse=False)
        if not messages:
            # No messages means no chunks, so there is nothing to emit.
            continue

        # The label is the same for every chunk of a conversation, so decide
        # once whether the conversation is kept before doing any other work.
        label = row[col_label]
        if pd.isna(label) or label in skips:
            continue

        # The first speaker (by time) is "USR"; every later speaker is "SYS".
        users = {}
        for m in messages:
            if m["user_id"] not in users:
                users[m["user_id"]] = "USR" if len(users) == 0 else "SYS"

        for i in range(0, len(messages), chunk_size):
            sub_messages = messages[i:i+chunk_size]
            s, u = to_message_str(sub_messages, users)
            newdata.append({
                "user": u,
                "sys": s,
                "label": label,
                "nturn": len(sub_messages)
            })

    return pd.DataFrame(newdata)

In [180]:
# Build one DataFrame per task from its annotated conversations, labelled by
# the "closeness" field; conversations labelled "5. Don't like each other"
# are skipped.
# NOTE(review): the "conersations" spelling is part of the file names that the
# load log reports, so it must not be "corrected" here without renaming the
# files. The same three loads are repeated in the "Run Analysis" section.
df1 = get_conversations("./Task1/annotated_conersations.jsonl", "closeness", skips = ["5. Don't like each other"])
df2 = get_conversations("./Task2/annotated_conersations.jsonl", "closeness", skips = ["5. Don't like each other"])
df3 = get_conversations("./Task3/annotated_conersations.jsonl", "closeness", skips = ["5. Don't like each other"])

Loaded 1234 records from ./Task1/annotated_conersations.jsonl
Loaded 2486 records from ./Task2/annotated_conersations.jsonl
Loaded 1221 records from ./Task3/annotated_conersations.jsonl


In [181]:
df1.head()

Unnamed: 0,user,sys,label,nturn
0,"[ดีๆวาย, เป็นยังไงบ้างช่วงนี้, รวยๆ เฮงๆ , เหม...","[ว่าไงปราง, เหมือนเดิม ขายของ นอนตื่น ขายของ ช...",2. Close,35
1,"[วันนี้เรามาพูดถึงเรื่องภาพยนตร์กัน, ปกฅิแล้วค...","[โอเคค่ะ, ที่บ้านค่ะ, ดูแทบทุกวันเลย, ดูได้ทุก...",1. Very Close,39
2,"[เธอ, สรุป อ้วนจะซื้อรถเมื่อไหร่, แล้วอ้วนดูรถ...","[อะไรหมู, กลางปี, ว่าจะเอา ativ ตัวใหม่, ว่าจ...",2. Close,40
3,"[มึง, แมวมึงเป็นไงกันบ้างอ่ะ, ตอนนี้มีกี่ตัวนะ...","[ว่า, เด็กๆหรอ, ซนปกติเลยมึง, 5ตัว, แค่บ้านกูน...",2. Close,35
4,"[อ้วน, อยากได้แมว, อยากเลี้ยง, เอาไว้ตอนตัวไม่...","[ไรหมู, เอาไปทำไมแมว, จะหาจากไหน, เดี๋ยวถามให้...",2. Close,39


In [182]:
df1.label.unique()

array(['2. Close', '1. Very Close', '3. Know each other',
       "4. Don't know each other"], dtype=object)

In [183]:
df1.groupby("label").count()

Unnamed: 0_level_0,user,sys,nturn
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1. Very Close,102,102,102
2. Close,449,449,449
3. Know each other,230,230,230
4. Don't know each other,435,435,435


In [184]:
# df.groupby("label").count()

In [185]:
df2.groupby("label").count()

Unnamed: 0_level_0,user,sys,nturn
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1. Very Close,42,42,42
2. Close,180,180,180
3. Know each other,158,158,158
4. Don't know each other,1487,1487,1487


In [186]:
df3.groupby("label").count()

Unnamed: 0_level_0,user,sys,nturn
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1. Very Close,66,66,66
2. Close,289,289,289
3. Know each other,739,739,739
4. Don't know each other,115,115,115


## Load Lexicons

In [187]:
from pythainlp.tokenize import word_tokenize

In [188]:
# download https://github.com/chameleonTK/thai-dictionary

In [189]:
import json
# Collect every entry of the Royal Institute word list into one flat set so
# that dictionary membership can be tested per token.
with open("./Lexicons/thai-dictionary/RoyalInstituteDictionary/words.json", encoding="utf-8") as fin:
    raw = json.load(fin)
    thaidict_royal = set()
    # `raw` maps a key to a collection of words; only the words are kept.
    for k in raw:
        for entry in raw[k]:
            thaidict_royal.add(entry)

In [190]:
lexicons_arr = load_jsonl("lexicons.jsonl")

Loaded 22991 records from lexicons.jsonl


In [191]:
from collections import defaultdict
# Index the lexicon for prefix matching:
#   lexicons      : normalised key -> list of tags (tags starting with "cat:" dropped)
#   lexicons_keys : first token of a key -> list of normalised keys starting with it
#   tags          : every tag kept in `lexicons`
tags = set()
lexicons = {}
lexicons_keys = defaultdict(list)

for key, values  in lexicons_arr:
    if len(key) <= 1:
        continue

    key = key.lower()
    if key.endswith("rep"):
        # Strip only the trailing marker; str.replace would also delete any
        # "rep" that occurs inside the key itself.
        key = key[:-len("rep")]

    w = word_tokenize(key)
    if not w:
        # The key was nothing but the marker: there is no first token to index.
        continue

    # Two raw entries can normalise to the same key (e.g. with and without the
    # "rep" suffix). Index the key once so a single match is not counted
    # twice; as before, the tags of the last entry win.
    if key not in lexicons:
        lexicons_keys[w[0]].append(key)

    tag = [t for t in values["tags"] if not t.startswith("cat:")]
    lexicons[key] = tag
    tags.update(tag)

In [192]:
from pythainlp.util import countthai

In [193]:
import re
from itertools import groupby
import emoji
from pythainlp.util import countthai
from tqdm import tqdm

def rm_reptitive(text):
    """Collapse every run of three or more identical characters.

    A run of length >= 3 becomes the character repeated three times followed
    by the marker " rep " (with its surrounding spaces); shorter runs are
    copied unchanged.
    """
    pieces = []
    for char, run in groupby(text):
        run_length = sum(1 for _ in run)
        if run_length < 3:
            pieces.append(char * run_length)
        else:
            pieces.append(f"{char*3} rep ")
    return "".join(pieces)

def remove_space(sent):
    """Return the tokens of `sent` that are not empty or whitespace-only."""
    return [token for token in sent if token.strip()]

import re
def notthai(w):
    """Tell whether token `w` counts as a non-Thai word.

    Returns False when `countthai(w)` exceeds 50, when `w` is one of the
    markers "usr", "sys" or "rep", or when nothing but digits (or nothing at
    all) is left after removing non-word characters. Returns True otherwise.
    """
    if countthai(w) > 50:
        return False

    if w in ("usr", "sys", "rep"):
        return False

    # Drop punctuation/symbols; what is left must be non-empty and non-numeric.
    stripped = re.sub(r'\W+', '', w)
    return len(stripped) > 0 and not stripped.isnumeric()

def analyse_conv_per_person(texts):
    """Compute word statistics and lexicon tag counts for one speaker.

    `texts` is a list of utterance strings. Each one is lower-cased, has its
    repeated characters collapsed by `rm_reptitive`, is tokenised with
    `word_tokenize` and has whitespace-only tokens removed.

    Returns a dict with the fixed keys nsent, nword, ndict, nunique,
    nlongword, nrepeat, nthai, nnotthai and nemoji, followed by one key per
    lexicon tag that matched at least once (tags that never matched are
    absent, not zero).
    """
    # Normalise and tokenise each utterance.
    texts = [rm_reptitive(t.lower()) for t in texts]
    words = [remove_space(word_tokenize(t)) for t in texts]
    tokens = [w for sent in words for w in sent]

    # Word statistics over all tokens.
    ndict = sum(1 for w in tokens if w in thaidict_royal)
    nlongword = sum(1 for w in tokens if len(w) > 7)
    nthai = sum(1 for w in tokens if countthai(w) > 50)
    nnotthai = sum(1 for w in tokens if notthai(w))
    uwords = set(tokens)

    # Lexicon: at every token that starts some lexicon key, test each candidate
    # key as a prefix of the rest of the utterance (tokens joined without
    # spaces) and count the tags of every key that matches.
    lexcat = {}
    for sent in words:
        for widx, w in enumerate(sent):
            if w not in lexicons_keys:
                continue

            remainder = "".join(sent[widx:])
            for candidate in lexicons_keys[w]:
                if remainder.startswith(candidate):
                    for tag in lexicons[candidate]:
                        lexcat[tag] = lexcat.get(tag, 0) + 1

    # Stylistic words: "rep" tokens are the markers left by rm_reptitive.
    nrepeat = tokens.count("rep")
    nemoji = emoji.emoji_count(" ".join(texts))

    return {
        "nsent": len(texts),
        "nword": len(tokens),
        "ndict": ndict,
        "nunique": len(uwords),
        "nlongword": nlongword,
        "nrepeat": nrepeat,
        "nthai": nthai,
        "nnotthai": nnotthai,
        "nemoji": nemoji,
        **lexcat
    }
  


def analyse_conversation(df):
    """Analyse both speakers of every row of `df`.

    Returns a list with one `(sys_metrics, user_metrics)` tuple per row, each
    element being the dict produced by `analyse_conv_per_person` for the
    row's "sys" and "user" utterances respectively.
    """
    return [
        (analyse_conv_per_person(row["sys"]), analyse_conv_per_person(row["user"]))
        for _, row in tqdm(df.iterrows(), total=len(df))
    ]

# analyse_conv_per_person(["เมิงงงงงงงมันโง่เหมือนควายยยยยยยยย"])

In [194]:
# df1["user"]

## Run Analysis

In [195]:
import pickle
def save_obj_values(filepath, obj):
    """Pickle `obj` into the file at `filepath`, overwriting it."""
    with open(filepath, 'wb') as fout:
        fout.write(pickle.dumps(obj))
    
def load_obj_values(filepath):
    """Return the object unpickled from the file at `filepath`.

    NOTE(review): unpickling can execute arbitrary code, so only load files
    this notebook wrote itself.
    """
    with open(filepath, 'rb') as fin:
        return pickle.load(fin)

In [196]:
# Result containers, indexed as [factor][setting]. Each factor has three slots
# (one per task/setting) that the next cell fills in.
# analysis_values holds, per setting, the list returned by analyse_conversation.
analysis_values = {
    "closeness": [None, None, None],
    # "authority": [None, None, None],
}

# analysis_labels holds, per setting, the "label" column of the same DataFrame,
# so values and labels line up element by element.
analysis_labels = {
    "closeness": [None, None, None],
    # "authority": [None, None, None],
}

In [197]:
# Reload the three task DataFrames, then analyse them in order and store the
# metrics and the matching labels in slot 0, 1 and 2 of the "closeness" factor.
df1 = get_conversations("./Task1/annotated_conersations.jsonl", "closeness", skips = ["5. Don't like each other"])
df2 = get_conversations("./Task2/annotated_conersations.jsonl", "closeness", skips = ["5. Don't like each other"])
df3 = get_conversations("./Task3/annotated_conersations.jsonl", "closeness", skips = ["5. Don't like each other"])

for setting_idx, task_df in enumerate((df1, df2, df3)):
    metrics = analyse_conversation(task_df)
    analysis_values["closeness"][setting_idx] = metrics
    analysis_labels["closeness"][setting_idx] = task_df["label"].values

Loaded 1234 records from ./Task1/annotated_conersations.jsonl
Loaded 2486 records from ./Task2/annotated_conersations.jsonl
Loaded 1221 records from ./Task3/annotated_conersations.jsonl


100%|█████████████████████████████████████████████████████████████████████████████████████| 1216/1216 [00:02<00:00, 548.69it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1867/1867 [00:01<00:00, 1107.63it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 1209/1209 [00:02<00:00, 537.45it/s]


In [156]:
# save_obj_values("analysis_values.pkl", analysis_values)
# save_obj_values("analysis_labels.pkl", analysis_labels)

In [71]:
# Replace the in-memory results with the pickled ones.
# NOTE(review): the cell that writes these two files is commented out, so on a
# fresh kernel this only works if the pickles already exist on disk, and it
# discards whatever the analysis cell above just computed. Confirm the files
# are in sync with the current lexicons and data before trusting the tables.
analysis_values = load_obj_values("analysis_values.pkl")
analysis_labels = load_obj_values("analysis_labels.pkl")

In [72]:
# cc = 0
# pp = 0
# for m in metrics:
#     m1, m2 = m
#     if "pronoun" in m1:
#         pp += 1
#     if "pronoun_plural" in m1:
# #         print(m1["pronoun_plural"])
#         cc += m1["pronoun_plural"]
    
# #     if "pronoun_plural" in m2:
# #         print(m2["pronoun_plural"])

# cc/pp

## Appendix A: Descriptive Analytics

In [98]:
# Layout of the report tables: table section -> {metric key -> row caption}.
# The print_* functions iterate over this dict in order, emitting one
# \multicolumn heading per section and one table row per metric key.
# Metric keys are either the fixed statistics returned by
# analyse_conv_per_person, lexicon tags, or "particles_notSARP", which is
# derived as particles - particles_SARP by print_lexi_stat / to_features.
metric_names = {
    "Corpus Statistics" : {
        "nsent": "Number of utterance",
        "nword": "Number of word",
        "nunique": "Vocabulary size",
        "nthai": "Thai words",
        "nnotthai": "Non-Thai words",
        "nlongword": "Long words",
        "ndict": "Dictionary words",
        "transliterated": "Transliteration",
    },
    "Pronoun": {
        "pronoun": "All pronoun",
        "pronoun_1st": ">> 1st person pronoun",
        "pronoun_2nd": ">> 2nd person pronoun",
        "pronoun_3rd": ">> 3rd person pronoun",
        "pronoun_singular": ">> Singular pronoun",
        "pronoun_plural": ">> Plural pronoun",
        "pronoun_misspelling": ">> Pronoun in non-standard spelling",
    },
    
    "Sentence-ending Particles": {
        "particles": "All particles",
        "particles_SARP": ">> Socially-related particles",
        "particles_notSARP": ">> Non-socially-related particles",
        "particles_misspelling": ">> Particle in non-standard spelling",
    },
    
#     "Sentiment-related": {
#         "sentiment": "Sentiment words",
#         "sentiment_positive": ">> Positive words",
#         "sentiment_negative": ">> Negative words",
#     },
    
    "Spelling Variation": {
        "misspelling": "All spelling variation",
        "misspelling_common": ">> Common misspelt words",
        "misspelling_intention": ">> Morphophonemic variation",
        "misspelling_shorten": ">> Simplified variation",
        "nrepeat": ">> Repeated characters",
#         "nemoji": ">> Emoji",
#         "abbr": "Abbreviation",
#         "slang": "Slang",
#         "swear": "Swear words"
    }
}

In [99]:
# Closeness labels in the column order of the descriptive tables (the table
# headers name these columns Intimate, Close, Acquainted, Unfamiliar).
clse_print_labels = [
    "1. Very Close", 
    "2. Close", 
    "3. Know each other", 
    "4. Don't know each other", 
    # "5. Don't like each other"
]
# auth_print_labels = ['0. Very respect', '1. Respect', '2. Normal', '3. Not respect']

In [100]:
def print_lexi_stat(analysis_values, analysis_labels, factor, setting, print_labels):
    """Average the user-side metrics per label and render LaTeX table rows.

    Parameters
    ----------
    analysis_values, analysis_labels : dict
        Containers indexed as ``[factor][setting]``; the values are lists of
        ``(sys_metrics, user_metrics)`` tuples and the labels are aligned
        with them one to one.
    factor, setting
        Keys selecting which list of values/labels to summarise.
    print_labels : list
        Labels to print, in column order.

    Returns
    -------
    (pandas.DataFrame, str)
        The per-(label, metric) means, and the LaTeX rows laid out according
        to the global ``metric_names``. A missing (label, metric) pair is
        printed as "-".

    NOTE(review): a metric contributes a row only for conversations whose
    metrics dict contains it, and analyse_conv_per_person omits lexicon tags
    that never matched. Tag means are therefore taken over the conversations
    where the tag occurs at least once, not over all conversations. Confirm
    this is the intended denominator.
    """
    values = analysis_values[factor][setting]
    labels = analysis_labels[factor][setting]
    assert(len(values)==len(labels))

    rows = []
    for (ms, mu), l in zip(values, labels):
        # Only the user-side metrics are reported; raw counts, no normalisation.
        for m, v in mu.items():
            rows.append({
                "label": l,
                "metric": m,
                "value": v
            })

        if "particles_SARP" in mu:
            rows.append({
                "label": l,
                "metric": "particles_notSARP",
                "value": mu["particles"] - mu["particles_SARP"]
            })

    df = pd.DataFrame(rows).groupby(["label", "metric"]).mean().reset_index()

    # (label, metric) -> mean, so each table cell is a dictionary lookup
    # instead of a scan over the whole frame.
    means = {
        (lab, met): val
        for lab, met, val in zip(df["label"], df["metric"], df["value"])
    }

    # Raw strings keep the LaTeX backslashes literal without relying on
    # Python's handling of unknown escape sequences such as "\m" or "\h".
    printed_text = ""
    for g in metric_names:
        printed_text += r"\multicolumn{5}{l}{\textit{" + g + r"}} \\" + "\n"
        printed_text += r"\hline" + "\n"
        for m in metric_names[g]:
            s = f"{metric_names[g][m]} "
            for l in print_labels:
                if (l, m) in means:
                    s += f"& {means[(l, m)]:.2f} "
                else:
                    s += "& - "

            s += r"\\"
            printed_text += "    " + s + "\n"

        printed_text += r"    &  & &  & \\" + "\n"
        printed_text += r"\hline" + "\n"

    return df, printed_text

In [101]:
df, printed_text1 = print_lexi_stat(analysis_values, analysis_labels, "closeness", 0, clse_print_labels)

In [102]:
# df, printed_text2 = print_lexi_stat(analysis_values, analysis_labels, "authority", 0, auth_print_labels)

In [103]:
df, printed_text3 = print_lexi_stat(analysis_values, analysis_labels, "closeness", 1, clse_print_labels)

In [104]:
# df, printed_text4 = print_lexi_stat(analysis_values, analysis_labels, "authority", 1, auth_print_labels)

In [105]:
df, printed_text5 = print_lexi_stat(analysis_values, analysis_labels, "closeness", 2, clse_print_labels)

In [106]:
# set(analysis_labels["authority"][2])

In [107]:
# df, printed_text6 = print_lexi_stat(analysis_values, analysis_labels, "authority", 2, auth_print_labels)

In [108]:
# Assemble one LaTeX longtable per setting from the descriptive statistics.
sections = [
    "Setting 1: Private Conversations with Self-Reported Labels (Private-Self)",
    "Setting 2: Public Conversations with Labels from 3rd Party (Public-Perceived)",
    "Setting 3: Private Conversations with Labels from 3rd Party (Private-Perceived)",
]

table_contents = [
    (printed_text1, ""),
    (printed_text3, ""),
    (printed_text5, ""),
]

# LaTeX fragments are raw strings so every backslash is literal; the previous
# normal literals relied on unknown escapes ("\s", "\d", "\l", "\h", "\e"),
# which newer Python versions warn about.
printed_text = ""
for section, (t1, t2) in zip(sections, table_contents):

    printed_text += r"\subsection{" + section + "}" + "\n\n"

    printed_text += r'''\subsubsection{Closeness}
\begin{longtable}[h]{
    p{\dimexpr 0.40\linewidth-2\tabcolsep}|
    >{\centering}p{\dimexpr 0.15\linewidth-2\tabcolsep}
    >{\centering}p{\dimexpr 0.15\linewidth-2\tabcolsep}
    >{\centering}p{\dimexpr 0.15\linewidth-2\tabcolsep}
    >{\centering\arraybackslash}p{\dimexpr 0.15\linewidth-2\tabcolsep}
}
    \hline

    Linguistic Features & Intimate & Close & Acquainted &  Unfamiliar\\
    \hline
    \endfirsthead

    \endhead
    '''

    # Indent the table body by four spaces so it lines up with the header.
    printed_text += ("\n"+t1).replace("\n", "\n    ")
    printed_text += "\n"
    printed_text += r"\end{longtable}"
    printed_text += "\n\n"

In [109]:
print(printed_text)

\subsection{Setting 1: Private Conversations with Self-Reported Labels (Private-Self)}

\subsubsection{Closeness}
\begin{longtable}[h]{
    p{\dimexpr 0.40\linewidth-2\tabcolsep}|
    >{\centering}p{\dimexpr 0.15\linewidth-2\tabcolsep}
    >{\centering}p{\dimexpr 0.15\linewidth-2\tabcolsep}
    >{\centering}p{\dimexpr 0.15\linewidth-2\tabcolsep}
    >{\centering\arraybackslash}p{\dimexpr 0.15\linewidth-2\tabcolsep}
}
    \hline

    Linguistic Features & Intimate & Close & Acquainted &  Unfamiliar\\
    \hline
    \endfirsthead

    \endhead
    
    \multicolumn{5}{l}{\textit{Corpus Statistics}} \\
    \hline
        Number of utterance & 19.08 & 19.05 & 17.67 & 17.94 \\
        Number of word & 99.74 & 95.60 & 99.47 & 97.55 \\
        Vocabulary size & 68.38 & 66.06 & 66.21 & 64.34 \\
        Thai words & 95.25 & 91.18 & 94.70 & 91.84 \\
        Non-Thai words & 0.38 & 0.56 & 0.50 & 0.60 \\
        Long words & 3.83 & 3.61 & 4.41 & 3.97 \\
        Dictionary words & 78.00 & 74.88 & 7

## Regression w/ Orthogonal Polynomial Coding
https://medium.com/@wyess/demystifying-statistical-analysis-3-the-one-way-anova-expressed-in-linear-regression-99269e84edd9

In [110]:
def closeness_to_vec(label):
    """Map a closeness label to its contrast codes.

    Returns a new dict with the keys "x1", "x2" and "x3" for the four ordered
    closeness levels, and None for "5. Don't like each other" or any other
    value.
    """
    contrast_codes = (
        ("1. Very Close", (-3, 1, -1)),
        ("2. Close", (-1, -1, 3)),
        ("3. Know each other", (1, -1, -3)),
        ("4. Don't know each other", (3, 1, 1)),
    )
    for known_label, (c1, c2, c3) in contrast_codes:
        if label == known_label:
            return {"x1": c1, "x2": c2, "x3": c3}
    # "5. Don't like each other" and unknown labels are excluded from the fit.
    return None

In [111]:
clse_coef_labels = ["b", "a1", "a2", "a3"]
auth_coef_labels = ["b", 'a1', 'a2', "a3"]

In [112]:
def to_features(analysis_values, analysis_labels, factor, setting, to_vec_func, skips=()):
    """Build the long-format regression table from the user-side metrics.

    Parameters
    ----------
    analysis_values, analysis_labels : dict
        Containers indexed as ``[factor][setting]``; the values are lists of
        ``(sys_metrics, user_metrics)`` tuples aligned with the labels.
    factor, setting
        Keys selecting which list of values/labels to use.
    to_vec_func : callable
        Maps a label to a dict of predictor columns, or to None to drop the
        conversation.
    skips : iterable, optional
        Labels to drop before coding.

    Returns
    -------
    pandas.DataFrame
        One row per (conversation, metric) with the columns ``metric``,
        ``value`` and the predictor columns returned by ``to_vec_func``.
        ``nsent`` and ``nword`` are raw counts; every other metric, including
        the derived ``particles_notSARP``, is a percentage of ``nword``.
    """
    values = analysis_values[factor][setting]
    labels = analysis_labels[factor][setting]
    assert(len(values)==len(labels))

    rows = []
    for (ms, mu), l in zip(values, labels):
        if l in skips:
            continue

        x = to_vec_func(l)
        if x is None:
            continue

        for m in mu:
            if m in ["nsent", "nword"]:
                v = mu[m]
            elif mu["nword"] == 0:
                # A rate per word is undefined for a speaker with no words.
                continue
            else:
                v = mu[m]*100/mu["nword"]

            rows.append({
                "metric": m,
                "value": v,
                **x
            })

        if "particles_SARP" in mu and mu["nword"] != 0:
            # Put the derived metric on the same scale as "particles" and
            # "particles_SARP"; it used to be stored as a raw count.
            particles_notSARP = (mu["particles"] - mu["particles_SARP"])*100/mu["nword"]
            rows.append({
                "metric": "particles_notSARP",
                "value": particles_notSARP,
                **x
            })

    feats = pd.DataFrame(rows)
    return feats

In [113]:
import statsmodels.api as sm
import numpy as np

def print_anova_test(feats, n=3):
    """Fit one OLS model per metric and collect coefficients and p-values.

    For every distinct value of ``feats["metric"]`` (except those containing
    "cat:") the metric's ``value`` is regressed on a constant plus the
    columns x1..xn. Returns a DataFrame with one row per fitted metric and
    the columns ``feat``, ``f_value``, ``coef0..coefn`` and ``pval0..pvaln``,
    where index 0 is the constant.

    Note that ``f_value`` stores the *p-value* of the F-test, and that the
    test's restriction matrix is the full identity, so it covers the constant
    as well as the n predictors. A metric whose fit raises is reported with
    print() and left out of the result.
    """
    records = []
    for metric in feats["metric"].unique():
        if "cat:" in metric:
            continue

        try:
            subset = feats[feats["metric"] == metric]
            design = sm.add_constant(subset[[f"x{i+1}" for i in range(n)]])
            fitted = sm.OLS(subset["value"], design).fit()

            n_terms = n + 1  # the n predictors plus the constant

            t_res = fitted.t_test(np.identity(n_terms))
            f_res = fitted.f_test(np.identity(n_terms))

            record = {"feat": metric, "f_value": f_res.pvalue}
            record.update({f"coef{i}": t_res.effect[i] for i in range(n_terms)})
            record.update({f"pval{i}": t_res.pvalue[i] for i in range(n_terms)})
            records.append(record)
        except Exception as e:
            print("error", metric, e)

    return pd.DataFrame(records)

In [114]:
def print_weights(outputs):
    """Render the regression coefficients as LaTeX table rows.

    ``outputs`` is the DataFrame produced by ``print_anova_test`` (columns
    ``feat``, ``coef0..coef3``, ``pval0..pval3``). Rows follow the global
    ``metric_names`` layout and the columns are b, a1, a2, a3. Every
    coefficient except b is shaded and starred when its p-value is below
    0.05. A metric that is absent from ``outputs`` (never observed, or its
    fit failed) is printed as "-" instead of raising KeyError.
    """
    coef_labels = ["b", "a1", "a2", "a3"]

    # (column label, metric) -> (coefficient, p-value)
    coefs = {}
    for _, row in outputs.iterrows():
        for i, label in enumerate(coef_labels):
            coefs[(label, row["feat"])] = (row[f"coef{i}"], row[f"pval{i}"])

    # Raw strings keep the LaTeX backslashes literal without relying on
    # Python's handling of unknown escapes ("\m", "\c", "\h").
    printed_text = ""
    for g in metric_names:
        printed_text += r"\multicolumn{5}{l}{\textit{" + g + r"}} \\" + "\n"
        for m in metric_names[g]:
            s = f"{metric_names[g][m]} "
            for l in coef_labels:
                if (l, m) not in coefs:
                    s += "& - "
                    continue

                val, pval = coefs[(l, m)]
                if l != "b" and pval < 0.05:
                    s += r"& \cellcolor{gray!25} " + f"{val:.2f}*"
                else:
                    s += f"& {val:.2f} "
            s += r"\\"
            printed_text += s + "\n"
        printed_text += r"&  & &\\" + "\n"
        printed_text += r"\hline" + "\n"
    return printed_text

In [115]:
feats = to_features(analysis_values, analysis_labels, "closeness", 0, closeness_to_vec, skips=[])
clse1_poly_outputs = print_anova_test(feats, n=3)
# print_weights(outputs, clse_coef_labels, clse_print_labels, "2. Know each other")
printed_text1 = print_weights(clse1_poly_outputs)



In [116]:
feats = to_features(analysis_values, analysis_labels, "closeness", 1, closeness_to_vec)
clse2_poly_outputs = print_anova_test(feats, n=3)
# clse_coef_labels = ["b", "2. Know each other", "4. Don't like each other"]
# print_weights(outputs, clse_coef_labels, clse_print_labels, "3. Don't know each other")
printed_text3 = print_weights(clse2_poly_outputs)

In [117]:
feats = to_features(analysis_values, analysis_labels, "closeness", 2, closeness_to_vec, skips=[])
clse3_poly_outputs = print_anova_test(feats, n=3)
# print_weights(outputs, clse_coef_labels, clse_print_labels, "2. Know each other")
printed_text5 = print_weights(clse3_poly_outputs)

In [118]:
# Assemble one LaTeX longtable per setting from the polynomial-coding results.
sections = [
    "Setting 1: Private Conversations with Self-Reported Labels",
    "Setting 2: Public Conversations with Labels from 3rd Party ",
    "Setting 3: Private Conversations with Labels from 3rd Party ",
]

table_contents = [
    (printed_text1, ""),
    (printed_text3, ""),
    (printed_text5, ""),
]

# LaTeX fragments are raw strings so every backslash is literal; the previous
# normal literals relied on unknown escapes ("\s", "\d", "\l", "\h", "\e",
# "\c"), which newer Python versions warn about.
printed_text = ""
for section, (t1, t2) in zip(sections, table_contents):

    printed_text += r"\subsection{" + section + "}" + "\n\n"

    printed_text += r'''\subsubsection{Closeness}
\begin{longtable}[h]{
    p{\dimexpr 0.40\linewidth-2\tabcolsep}|c|c|c|c
}
    \hline

      

    Lexical Features & Grand Mean & Linear & Quadratic & Cubic \\
    \hline
    \endfirsthead

    \endhead
    '''

    # Indent the table body by four spaces so it lines up with the header.
    printed_text += ("\n"+t1).replace("\n", "\n    ")
    printed_text += "\n"
    printed_text += r"\end{longtable}"

    printed_text += r"\clearpage"
    printed_text += "\n\n"

# print(printed_text)

## Regression w/ Effect Coding

In [119]:
# def closeness_to_vec_effect(label):
#     if label == "1. Close":
#         return {"x1": 1, "x2": 0}
#     elif label == "2. Know each other": ## Base category
#         return {"x1": -1, "x2": -1}
#     elif label == "3. Don't know each other": 
#         return {"x1": 0, "x2": 1}
#     elif label == "4. Don't like each other":
#         return None
#     else:
#         return None

def closeness_to_vec_effect(label):
    """Map a closeness label to its effect codes.

    "1. Very Close" is coded -1 on x1, x2 and x3; each of the other three
    levels is coded 1 on its own column and 0 elsewhere. Returns a new dict,
    or None for "5. Don't like each other" or any other value.
    """
    effect_codes = (
        ("1. Very Close", (-1, -1, -1)),
        ("2. Close", (1, 0, 0)),
        ("3. Know each other", (0, 1, 0)),
        ("4. Don't know each other", (0, 0, 1)),
    )
    for known_label, (c1, c2, c3) in effect_codes:
        if label == known_label:
            return {"x1": c1, "x2": c2, "x3": c3}
    # "5. Don't like each other" and unknown labels are excluded from the fit.
    return None

In [129]:
def print_weights_effect(outputs, coef_labels):
    """Render the effect-coding coefficients as LaTeX table rows.

    Parameters
    ----------
    outputs : pandas.DataFrame
        Result of ``print_anova_test`` (columns ``feat``, ``coef<i>``,
        ``pval<i>``).
    coef_labels : dict
        Maps a coefficient index ``i`` to its column caption, in column
        order. A ``None`` key marks a column with no coefficient; it is
        printed as "*".

    Returns
    -------
    (str, dict)
        The LaTeX rows laid out according to the global ``metric_names``,
        and ``coef_labels`` unchanged. Every coefficient except the
        "Grand Mean" column is shaded and starred when its p-value is below
        0.05. A metric that is absent from ``outputs`` is printed as "-"
        instead of raising KeyError.
    """
    # (column caption, metric) -> (coefficient, p-value)
    coefs = {}
    for _, row in outputs.iterrows():
        for i, label in coef_labels.items():
            if i is None:
                continue
            coefs[(label, row["feat"])] = (row[f"coef{i}"], row[f"pval{i}"])

    # Raw strings keep the LaTeX backslashes literal without relying on
    # Python's handling of unknown escapes ("\m", "\c", "\h").
    printed_text = ""
    for g in metric_names:
        printed_text += r"\multicolumn{5}{l}{\textit{" + g + r"}} \\" + "\n"
        for m in metric_names[g]:
            s = f"{metric_names[g][m]} "
            for i, l in coef_labels.items():
                if i is None:
                    s += "& * "
                    continue

                if (l, m) not in coefs:
                    s += "& - "
                    continue

                val, pval = coefs[(l, m)]
                if l != "Grand Mean" and pval < 0.05:
                    s += r"& \cellcolor{gray!25} " + f"{val:.2f}* "
                else:
                    s += f"& {val:.2f} "
            s += r"\\"
            printed_text += s + "\n"
        printed_text += r"&  & &\\" + "\n"
        printed_text += r"\hline" + "\n"
    return printed_text, coef_labels

In [130]:
feats = to_features(analysis_values, analysis_labels, "closeness", 0, closeness_to_vec_effect, skips=[])
clse1_effect_outputs = print_anova_test(feats, n=3)

coef_labels = {0:"Grand Mean", 1: "Close", 2: "Acquainted", 3: "Unfamiliar"}
printed_text1 = print_weights_effect(clse1_effect_outputs, coef_labels)



In [131]:
feats

Unnamed: 0,metric,value,x1,x2,x3
0,nsent,20.000000,1,0,0
1,nword,135.000000,1,0,0
2,ndict,72.592593,1,0,0
3,nunique,65.185185,1,0,0
4,nlongword,5.185185,1,0,0
...,...,...,...,...,...
27294,misspelling_intention,12.500000,1,0,0
27295,slang,6.944444,1,0,0
27296,misspelling_shorten,18.055556,1,0,0
27297,misspelling_common,2.777778,1,0,0


In [132]:
# for u in feats["metric"].unique():
#     print(u)
#     print(feats[feats["metric"]==u][['x1', 'x2', 'x3']].corr())

In [133]:
feats = to_features(analysis_values, analysis_labels, "closeness", 1, closeness_to_vec_effect, skips=[])
clse2_effect_outputs = print_anova_test(feats, n=3)
printed_text3 = print_weights_effect(clse2_effect_outputs, coef_labels)

In [134]:
feats = to_features(analysis_values, analysis_labels, "closeness", 2, closeness_to_vec_effect, skips=[])
clse3_effect_outputs = print_anova_test(feats, n=3)
printed_text5 = print_weights_effect(clse3_effect_outputs, coef_labels)


In [135]:
columns = ["Grand Mean", "Close", "Acquainted", "Unfamiliar"]

In [136]:
# Assemble one LaTeX longtable per setting from the effect-coding results.
sections = [
    "Setting 1: Private Conversations with Self-Reported Labels",
    "Setting 2: Public Conversations with Labels from 3rd Party ",
    "Setting 3: Private Conversations with Labels from 3rd Party ",
]

# Each printed_text* here is the (text, coef_labels) tuple returned by
# print_weights_effect, so the first element of every pair is itself a tuple.
table_contents = [
    (printed_text1, ""),
    (printed_text3, ""),
    (printed_text5, ""),
]

# LaTeX fragments are raw strings so every backslash is literal; the previous
# normal literals relied on unknown escapes ("\s", "\d", "\l", "\h", "\e",
# "\c"), which newer Python versions warn about.
printed_text = ""
for section, (v1, v2) in zip(sections, table_contents):

    printed_text += r"\subsection{" + section + "}" + "\n\n"

    printed_text += r'''\subsubsection{Closeness}
\begin{longtable}[h]{
    p{\dimexpr 0.40\linewidth-2\tabcolsep}|
    p{\dimexpr 0.15\linewidth-2\tabcolsep}
    p{\dimexpr 0.15\linewidth-2\tabcolsep}
    p{\dimexpr 0.15\linewidth-2\tabcolsep}
    p{\dimexpr 0.15\linewidth-2\tabcolsep}
}
    \hline
    '''

    printed_text += "Lexical Features & " + " & ".join(columns) + r"\\"

    printed_text += r'''
    \hline
    \endfirsthead

    \endhead
    '''

    t1, _ = v1
    # Indent the table body by four spaces so it lines up with the header.
    printed_text += ("\n"+t1).replace("\n", "\n    ")
    printed_text += "\n"
    printed_text += r"\end{longtable}"
    printed_text += "\n\n"

    printed_text += r"\clearpage"
    printed_text += "\n\n"

print(printed_text)

\subsection{Setting 1: Private Conversations with Self-Reported Labels}

\subsubsection{Closeness}
\begin{longtable}[h]{
    p{\dimexpr 0.40\linewidth-2\tabcolsep}|
    p{\dimexpr 0.15\linewidth-2\tabcolsep}
    p{\dimexpr 0.15\linewidth-2\tabcolsep}
    p{\dimexpr 0.15\linewidth-2\tabcolsep}
    p{\dimexpr 0.15\linewidth-2\tabcolsep}
}
    \hline
    Lexical Features & Grand Mean & Close & Acquainted & Unfamiliar\\
    \hline
    \endfirsthead

    \endhead
    
    \multicolumn{5}{l}{\textit{Corpus Statistics}} \\
    Number of utterance & 18.43 & \cellcolor{gray!25} 0.61* & \cellcolor{gray!25} -0.77* & -0.49 \\
    Number of word & 98.09 & -2.49 & 1.38 & -0.54 \\
    Vocabulary size & 70.81 & \cellcolor{gray!25} 1.01* & -0.61 & -0.82 \\
    Thai words & 94.91 & -0.00 & 0.49 & \cellcolor{gray!25} -0.97* \\
    Non-Thai words & 0.47 & 0.09 & -0.06 & 0.08 \\
    Long words & 3.84 & \cellcolor{gray!25} -0.35* & \cellcolor{gray!25} 0.38* & -0.06 \\
    Dictionary words & 78.65 & \cellcol

### Print Results

In [190]:
sections = [
    "Setting 1: Private Conversations with Self-Reported Labels",
    "Setting 2: Public Conversations with Labels from 3rd Party ",
    "Setting 3: Private Conversations with Labels from 3rd Party ",
]

In [191]:
# def outputs_to_dict(outputs, coef_labels):
#     coefs = {}
#     for label in coef_labels.keys():
#         for _, row in outputs.iterrows():
#             coef_col, pval_col = coef_labels[label]
#             if coef_col is None:
#                 coefs[(label, row["feat"])] = (None, None)
#             elif coef_col not in row:
#                 continue
#             else:
#                 coefs[(label, row["feat"])] = (row[coef_col], row[pval_col])

#     return coefs

In [192]:
# # feats[feats["metric"]=="nsent"].groupby(["x1", "x2"]).count()
# labelmap = {
#     '1. Close': "1. Close", 
#     '2. Know each other': "2. Acquainted", 
#     "3. Don't know each other": "3. Unfamiliar",
    
    
#     '0. Very respect': "0. Highly Respectful", 
#     '1. Respect': "1. Respectful", 
#     '2. Normal': "2. Normal", 
#     '3. Not respect': "3. Disrespectful"
# }

# def label2newlabel(label):
#     if type(label) is list or type(label) is np.ndarray:
#         return [labelmap[l] if l in labelmap else l for l in label ]
    
#     if label in labelmap:
#         return labelmap[label]
#     return label


In [336]:

## DISABLED CELL: every line below is commented out, so running this cell defines nothing.
## It holds the draft of get_printed_text_by_section(title, coef_labels, outputs, section_label),
## which builds the LaTeX source of one `longtable` of regression results and returns it as a string.
##
## What the draft does, in order:
##   1. Emits \subsubsection{title} and \label{section_label}.
##   2. Emits the longtable column spec: one wide first column plus one centered column per
##      entry of coef_labels. Only 3, 4 or 5 coefficient columns are handled; any other count
##      reaches assert(False).
##   3. Emits the header row "Lexical Features & <coef labels>" followed by \endhead.
##   4. For each (section name, results) pair from zip(sections, outputs), emits an italic
##      section row and then one row per metric. `results` is indexed by (coef_label, metric)
##      and yields (val, pval); cells are rendered as:
##        - key missing            -> "-"
##        - "Grand Mean" column    -> val with 2 decimals, never starred
##        - val is None            -> "-" (base category)
##        - pval < 0.05            -> gray cell, val with 2 decimals followed by "*"
##        - otherwise              -> val with 2 decimals
##
## NOTE(review) -- to resolve before re-enabling:
##   - `sections`, `metric_names` and `g` are not parameters; they would be read from notebook
##     globals that this cell does not define.
##   - \multicolumn{4} is hard-coded, but the table has len(coef_labels) + 1 columns (4 to 6),
##     so the section row only spans the full width in the 3-coefficient case.
##   - Literals such as "\subsubsection{", "\dimexpr", "\hline", "\endhead", "\multicolumn",
##     "\cellcolor" rely on Python leaving unrecognized escapes untouched; newer Python versions
##     warn about invalid escape sequences. Raw strings (r"...") would avoid that.

# def get_printed_text_by_section(title, coef_labels, outputs, section_label):
#     printed_text = ""
#     printed_text += "\subsubsection{"+title+"}"+"\n"
#     printed_text += "\label{"+section_label+"}"+"\n"
    
#     printed_text += "\\begin{longtable}[h]{"+"\n"

    
#     ncol = len(coef_labels)
#     if ncol==5:
#         printed_text += "    p{\dimexpr 0.35\linewidth-2\\tabcolsep}|"+"\n"
#         printed_text += "    >{\\centering}p{\dimexpr 0.13\linewidth-2\\tabcolsep}"+"\n"
#         printed_text += "    >{\\centering}p{\dimexpr 0.13\linewidth-2\\tabcolsep}"+"\n"
#         printed_text += "    >{\\centering}p{\dimexpr 0.13\linewidth-2\\tabcolsep}"+"\n"
#         printed_text += "    >{\\centering}p{\dimexpr 0.13\linewidth-2\\tabcolsep}"+"\n"
#         printed_text += "    >{\\centering\\arraybackslash}p{\dimexpr 0.13\linewidth-2\\tabcolsep}"+"\n"
#     elif ncol==4:
#         printed_text += "    p{\dimexpr 0.40\linewidth-2\\tabcolsep}|"+"\n"
#         printed_text += "    >{\\centering}p{\dimexpr 0.15\linewidth-2\\tabcolsep}"+"\n"
#         printed_text += "    >{\\centering}p{\dimexpr 0.15\linewidth-2\\tabcolsep}"+"\n"
#         printed_text += "    >{\\centering}p{\dimexpr 0.15\linewidth-2\\tabcolsep}"+"\n"
#         printed_text += "    >{\\centering\\arraybackslash}p{\dimexpr 0.15\linewidth-2\\tabcolsep}"+"\n"
#     elif ncol==3:
#         printed_text += "    p{\dimexpr 0.40\linewidth-2\\tabcolsep}|"+"\n"
#         printed_text += "    >{\\centering}p{\dimexpr 0.15\linewidth-2\\tabcolsep}"+"\n"
#         printed_text += "    >{\\centering}p{\dimexpr 0.15\linewidth-2\\tabcolsep}"+"\n"
#         printed_text += "    >{\\centering\\arraybackslash}p{\dimexpr 0.15\linewidth-2\\tabcolsep}"+"\n"
#     else:
#         assert(False)
        
#     printed_text += "}"+"\n"
#     printed_text += "    \hline"+"\n"
#     printed_text += "    Lexical Features & "+" & ".join(coef_labels)+"\\\\"+"\n"
#     printed_text += "    \hline"+"\n"
# #     printed_text += "    \endfirsthead"+"\n"
# #     printed_text += ""+"\n"
#     printed_text += "    \endhead"+"\n"
#     printed_text += ""+"\n"
    
#     for sec, results in zip(sections, outputs):
#         printed_text += "    \multicolumn{4}{l}{\\textit{"+sec+"}} \\\\"+"\n"
#         printed_text += "    \hline"+"\n"
        
#         for m in metric_names[g]:
#             s = f"        {metric_names[g][m]} "
#             for l in coef_labels:   
#                 k = (l, m)
                
#                 if k not in results:
#                     s += f"& - "
#                 else:
#                     val, pval = results[k]
#                     if l=="Grand Mean":
#                         s += f"& {val:.2f} "
#                     elif val is None:
#                         s += f"& - " ## Base Category
#                     elif pval < 0.05:
#                         s += "& \cellcolor{gray!25} "+f"{val:.2f}* "
#                     else:
#                         s += f"& {val:.2f} "
#             s += "\\\\"
#             # print(s)
#             printed_text += s+"\n"
            
        
# #         printed_text += "        &  & &\\\\"+"\n"
#         printed_text += "    \hline"+"\n"
#         printed_text += ""+"\n"
    
#     printed_text += "\end{longtable}"+"\n"
    
    
#     return printed_text

In [337]:
## DISABLED CELL: every line below is commented out, so running this cell does nothing.
## When enabled it accumulates `printed_text`, the LaTeX source for the regression tables:
## one \subsection per metric group in `metric_names` ("Corpus Statistics" is skipped), and
## inside each group four tables produced by get_printed_text_by_section:
##   1. Closeness, effect coding                (4 coefficient columns)
##   2. Closeness, orthogonal polynomial coding (3 coefficient columns)
##   3. Respect, effect coding                  (5 coefficient columns)
##   4. Respect, orthogonal polynomial coding   (3 coefficient columns)
## A \clearpage is emitted after tables 2 and 4.
##
## Each coef_labels dict maps a column header to a (coefficient, p-value) pair of field names
## that is handed to outputs_to_dict together with one of the regression output objects.
## (None, None) is presumably the base category, which the table builder renders as "-";
## the meaning of ("-", "-") depends on outputs_to_dict, which is not visible here -- TODO confirm.
## The respect/effect table uses coef_labels1 for the first output and coef_labels2 for the
## other two; both dicts list the same headers in the same order, so coef_labels1.keys() is
## valid as the shared header row.
##
## NOTE(review) -- to resolve before re-enabling:
##   - Depends on names defined elsewhere in the notebook: metric_names, outputs_to_dict,
##     get_printed_text_by_section, clse{1,2,3}_{effect,poly}_outputs and
##     auth{1,2,3}_{effect,poly}_outputs.
##   - metric_names_code[g] raises KeyError for any group in metric_names that is not listed in
##     metric_names_code (e.g. "Sentiment-related", which is commented out of the mapping).
##   - "\subsection{" and "\clearpage" contain unrecognized backslash escapes; newer Python
##     versions warn about them. Raw strings (r"...") would avoid that.

# metric_names_code = {
# #     "Corpus Statistics": "corp_stat",
#     "Pronoun": "pronoun",
#     "Sentence-ending Particles": "particle",
# #     "Sentiment-related": "sentiment",
#     "Spelling Variation": "spelling",
# }

# printed_text = "" 
# for g in metric_names:
#     if g in ["Corpus Statistics"]:
#         continue
        
#     printed_text += "\subsection{"+g+"}"+"\n"+"\n"
    
#     ###### Closeness Effect Coding #######
#     coef_labels = {
#         "Grand Mean": ("coef0", "pval0"),
#         "Close": ("coef1", "pval1"),
#         "Acquainted": (None, None),
#         "Unfamiliar": ("coef2", "pval2"),
#     }
        
#     outputs = [
#         outputs_to_dict(clse1_effect_outputs, coef_labels),
#         outputs_to_dict(clse2_effect_outputs, coef_labels),
#         outputs_to_dict(clse3_effect_outputs, coef_labels),
#     ]
    
    
    

#     printed_text += get_printed_text_by_section(
#         "Regression Analysis on Closeness with Effect Coding",
#         coef_labels.keys(), 
#         outputs,
#         "tab:closeness_effect_"+metric_names_code[g]
#     )
    
#     ###### Closeness Polynomial Coding #######
    
#     coef_labels = {
#         "Grand Mean": ("coef0", "pval0"),
#         "Linear": ("coef1", "pval1"),
#         "Quadratic": ("coef2", "pval2"),
#     }
    
#     outputs = [
#         outputs_to_dict(clse1_poly_outputs, coef_labels),
#         outputs_to_dict(clse2_poly_outputs, coef_labels),
#         outputs_to_dict(clse3_poly_outputs, coef_labels),
#     ]
    
#     printed_text += get_printed_text_by_section(
#         "Regression Analysis on Closeness with Orthogonal Polynomial Coding",
#         coef_labels.keys(), 
#         outputs,
#         "tab:closeness_polynomial_"+metric_names_code[g]
#     )
    
#     printed_text += "\clearpage"+"\n\n"
    
    
#     ###### Respect Effect Coding #######
    
        
#     coef_labels1 = {
#         "Grand Mean": ("coef0", "pval0"),
#         "Highly Respectful": ("coef1", "pval1"),
#         "Respectful": ("coef2", "pval2"),
#         "Normal": (None, None),
#         "Disrespectful": ("-", "-"),
#     }
    
#     coef_labels2 = {
#         "Grand Mean": ("coef0", "pval0"),
#         "Highly Respectful": ("-", "-"),
#         "Respectful": ("coef2", "pval2"),
#         "Normal": (None, None),
#         "Disrespectful": ("coef1", "pval1"),
        
#     }
    
#     outputs = [
#         outputs_to_dict(auth1_effect_outputs, coef_labels1),
#         outputs_to_dict(auth2_effect_outputs, coef_labels2),
#         outputs_to_dict(auth3_effect_outputs, coef_labels2),
#     ]
    
    
#     printed_text += get_printed_text_by_section(
#         "Regression Analysis on Respect with Effect Coding",
#         coef_labels1.keys(), 
#         outputs,
#         "tab:respect_effect_"+metric_names_code[g]
#     )
    
#     ###### Respect Polynomial Coding #######
    
#     coef_labels = {
#         "Grand Mean": ("coef0", "pval0"),
#         "Linear": ("coef1", "pval1"),
#         "Quadratic": ("coef2", "pval2"),
#     }
    
#     outputs = [
#         outputs_to_dict(auth1_poly_outputs, coef_labels),
#         outputs_to_dict(auth2_poly_outputs, coef_labels),
#         outputs_to_dict(auth3_poly_outputs, coef_labels),
#     ]
    
#     printed_text += get_printed_text_by_section(
#         "Regression Analysis on Respect with Orthogonal Polynomial Coding",
#         coef_labels.keys(), 
#         outputs,
#         "tab:respect_polynomial_"+metric_names_code[g]
#     )
    
#     printed_text += "\clearpage"+"\n\n"
    
# #     break
    

In [194]:
## Disabled: `printed_text` is only assigned inside commented-out code in this part of the
## notebook, so un-commenting this line alone would raise NameError on a fresh kernel.
# print(printed_text)