In [23]:
import sys
sys.path.append('..')

In [24]:
from utils import dump_jsonl, load_jsonl

In [25]:
import requests
import time
from tqdm.notebook import tqdm

In [26]:
def closeness_to_eng(label):
    if label == 'สนิทกันมาก':
        return "1. Close"
    elif label == 'สนิท':  # it was missed in the first annotation setting;
        return "1. Close"
    elif label == 'แค่คนรู้จักกัน':
        return "2. Know each other"
    elif label == 'ไม่รู้จักกัน':
        return "3. Don't know each other"
    elif label == 'ไม่ชอบหน้ากัน':
        return "4. Don't like each other"
    else:
        return None

def authority_to_eng(label):
    if label == "ให้เกียรติมาก":
        return "0. Very respect"
    elif label == "ให้เกียรติ":
        return "1. Respect"
    elif label == "ปกติ (สัมพันธ์แบบเท่าเทียมกัน)":
        return "2. Normal"
    elif label == "ไม่ให้เกียรติ":
        return "3. Not respect"
    else:
        return None

In [27]:
from itertools import groupby

def get_conversation(row):
    conversation = ""
    
    row["messages"].sort(key=lambda x: x["date_created"], reverse=False)
    
    users = {}
    for m in row["messages"]:
        if m["user_id"] not in users:
            username = "USR" if len(users.keys())==0 else "SYS"
            users[m["user_id"]] = username
            
    for m in row["messages"]:
        conversation += f"{users[m['user_id']]} {m['text']} \n"
#         conversation.append((users[m['user_id']], m['text']))
    
    return conversation

annotated_conversations = load_jsonl("./raw_data/annotated_conersations.jsonl")
for conv in annotated_conversations:
    conv["user_ids"] = set([m["user_id"] for m in conv["messages"]])

    conv["userA_id"] = conv["messages"][0]["user_id"]
    conv["userA_name"] = None
    conv["userB_id"] = list(conv["user_ids"]-{conv["userA_id"]})[0]
    conv["userB_name"] = None
    conv["revisit"] = len(conv["user_ids"])==2
    
    conv["conversations"] = get_conversation(conv)
    # conv["user_ids"] = None
    conv["messages"] = None

Loaded 1234 records from ./raw_data/annotated_conersations.jsonl


In [28]:
import pandas as pd
df = pd.DataFrame(annotated_conversations)

## Data Validation

In [29]:
import pandas as pd
validated_labels = pd.read_csv("./raw_data/validated_labels.csv", names=["room_id", "relationship", "closeness", "authority"])

In [30]:
validated_labels

Unnamed: 0,room_id,relationship,closeness,authority
0,กรอก Room ID,คุณเรียกความสัมพันธ์นี้อย่างไร ? (How do you c...,ให้คะแนนระดับความสนิทสนม ความไว้เนื้อเชื่อใจ (...,ให้คะแนนระดับการให้เกียรติ การให้ความเคารพ (Au...
1,2616,เพื่อน (Friend),สนิท,ปกติ (สัมพันธ์แบบเท่าเทียมกัน)
2,2504,เพื่อน (Friend),แค่คนรู้จักกัน,ให้เกียรติมาก
3,2505,เพื่อนร่วมงาน (Work colleague),สนิท,ปกติ (สัมพันธ์แบบเท่าเทียมกัน)
4,2495,เพื่อน (Friend),สนิท,ปกติ (สัมพันธ์แบบเท่าเทียมกัน)
...,...,...,...,...
246,760,เพื่อน (Friend),สนิท,ให้เกียรติ
247,763,เพื่อน (Friend),สนิท,ให้เกียรติ
248,768,เพื่อน (Friend),สนิท,ให้เกียรติ
249,2567,เพื่อน (Friend),สนิท,ปกติ (สัมพันธ์แบบเท่าเทียมกัน)


In [31]:
N = 0
matched = {
    "relationship":0,
    "closeness": 0,
    "authority": 0
}
for conv in annotated_conversations:
    d = validated_labels[validated_labels["room_id"]==str(conv["room_id"])]
    if len(d)!=0:
        N += 1
#         print(conv.keys())
#         print(conv["relationship"])
#         print(conv["closeness"])
#         print(conv["authority"])
        
        
        auth = authority_to_eng(d["authority"].values[0])
        clos = closeness_to_eng(d["closeness"].values[0])
        
        if clos is None or auth is None:
            print(d, auth, clos)
            assert(False)
        
        if conv["authority"]==auth:
            matched["authority"] += 1
#         else:
#             print(conv["authority"], auth)
    
        if conv["closeness"]==clos:
            matched["closeness"] += 1
#         else:
#             print(conv["closeness"], clos)

In [32]:
N, matched["closeness"]/N, matched["authority"]/N

(250, 0.796, 0.74)

In [33]:
# !uv pip install scikit-learn

In [34]:
from sklearn.metrics import cohen_kappa_score

In [35]:
from collections import defaultdict

# ordinal_weights
def get_weights(categories):
    weights = defaultdict(dict)
    if len(categories)==2:
        mat = [[1.00, 0.00], 
               [0.00, 1.00]]  
    elif len(categories)==3:
        mat = [[1.00, 0.67, 0.00], 
               [0.67, 1.00, 0.67], 
               [0.00, 0.67, 1.00]]
    elif len(categories)==4:
        mat = [[1.00, 0.83, 0.50, 0.00], 
               [0.83, 1.00, 0.83, 0.50], 
               [0.50, 0.83, 1.00, 0.83], 
               [0.00, 0.50, 0.83, 1.00]]
    elif len(categories)==5:
        mat = [[1, 0.9, 0.7, 0.4, 0.0], 
               [0.9, 1, 0.9, 0.7, 0.4], 
               [0.7, 0.9, 1, 0.9, 0.7], 
               [0.4, 0.7, 0.9, 1, 0.9],
               [0.0, 0.4, 0.7, 0.9, 1]]
    else:
        # Lazy to implement in case of len(categories) > 5
        raise Exception("No Implementation")
        
    for i, l in enumerate(categories): 
        for j, k in enumerate(categories): 
            weights[l][k] = mat[i][j]
    return weights


# (df["y1"], df["y2"], categories)
def cal_agreement(merged, categories, cat_column="label"):
    merged = merged.dropna()    
    cnt_matrix = defaultdict(dict)
    acc_matrix = defaultdict(dict)
    
    for l in categories: 
        for k in categories: 
            d = merged
            d = d[d[f"{cat_column}_x"]==k]
            d = d[d[f"{cat_column}_y"]==l]
            cnt_matrix[l][k] = len(d)
    
    for l in categories: 
        d = merged
        d = d[d[f"{cat_column}_x"]==l]
        acc_matrix["x"][l] = len(d)
        
        d = merged
        d = d[d[f"{cat_column}_y"]==l]
        acc_matrix["y"][l] = len(d)
    
    weights = get_weights(categories)
    
    N = len(merged)
    Pa = 0
    for l in categories: 
        for k in categories: 
            Pa += weights[l][k]*cnt_matrix[l][k]/N
    
    Pe = 0
    for l in categories: 
        for k in categories: 
            Pe += weights[l][k]*(acc_matrix["x"][l]/N)*(acc_matrix["y"][k]/N)
    
    if Pe==1:
        raise Exception("Divide by zero")
    
    kappa = (Pa-Pe)/(1-Pe)
    return kappa

In [36]:
df = []
for conv in annotated_conversations:
    d = validated_labels[validated_labels["room_id"]==str(conv["room_id"])]
    if len(d)==0:
        continue
        
    N += 1
#         print(conv.keys())
#         print(conv["relationship"])
#         print(conv["closeness"])
#         print(conv["authority"])
    
    
    auth = authority_to_eng(d["authority"].values[0])
    clos = closeness_to_eng(d["closeness"].values[0])
    
    if clos is None or auth is None:
        print(d, auth, clos)
        assert(False)
    
    label_x = conv["authority"]
    # if conv["authority"] in ['1. Respect', '2. Normal']:
    #     label_x = '2. Normal'
    
    label_y = auth
    # if auth in ['1. Respect', '2. Normal']:
    #     label_y = '2. Normal'
        
    df.append({
        "label_x": label_x,
        "label_y": label_y
    })


df = pd.DataFrame(df)

categories = ['0. Very respect', '1. Respect', '2. Normal']
# categories = ['0. Very respect', '2. Normal']
cal_agreement(df, categories)

0.2235880945154796

In [37]:
len(annotated_conversations), len(df)

(1234, 250)

In [38]:
df.label_y.value_counts()

label_y
2. Normal          196
1. Respect          52
0. Very respect      2
Name: count, dtype: int64

In [39]:
df.label_x.value_counts()

label_x
2. Normal          202
1. Respect          37
0. Very respect     11
Name: count, dtype: int64

In [40]:
df["cnt"] = "1"
# df[df["label_x"]!=df["label_y"]]
df.groupby(["label_x", "label_y"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,cnt
label_x,label_y,Unnamed: 2_level_1
0. Very respect,0. Very respect,1
0. Very respect,1. Respect,2
0. Very respect,2. Normal,8
1. Respect,0. Very respect,1
1. Respect,1. Respect,16
1. Respect,2. Normal,20
2. Normal,1. Respect,34
2. Normal,2. Normal,168


In [43]:
y1 = []
y2 = []
df = []
labels = set()
for conv in annotated_conversations:
    d = validated_labels[validated_labels["room_id"]==str(conv["room_id"])]
    if len(d)==0:
        continue
        
    N += 1
#         print(conv.keys())
#         print(conv["relationship"])
#         print(conv["closeness"])
#         print(conv["authority"])
    
    
    auth = authority_to_eng(d["authority"].values[0])
    clos = closeness_to_eng(d["closeness"].values[0])

    labels.add(d["closeness"].values[0])
    if clos is None or auth is None:
        print(d, auth, clos)
        assert(False)
    

    df.append({
        "label_x": conv["closeness"],
        "label_y": clos
    })


df = pd.DataFrame(df)

categories = ['1. Close', '2. Know each other', "3. Don't know each other"]
cal_agreement(df, categories)

0.8226983726646814

In [44]:
labels

{'สนิท', 'สนิทกันมาก', 'แค่คนรู้จักกัน', 'ไม่รู้จักกัน'}