In [116]:
import sys
sys.path.append('..')

In [117]:
from utils import dump_jsonl, load_jsonl, CLOSENESS_LABELS, AUTHORITY_LABELS

In [118]:
ls raw_data/annotated/v0.1/closeness_v0.1

user0.jsonl  user1.jsonl  user2.jsonl  user3.jsonl


In [119]:
import pandas as pd

# Annotator ids; each id names one `<id>.jsonl` file per annotation task.
# "user0" is the annotator every other user is compared against below.
users = ["user0", "user1", "user2", "user3"]

def authority_to_degree(label):
    """Translate an authority annotation label into its ordinal degree.

    Args:
        label: The raw (Thai) label string taken from an annotation record.

    Returns:
        3, 2 or 1 for the three known labels, or ``None`` when ``label``
        matches none of them.
    """
    # (label, degree) pairs listed from the highest degree to the lowest.
    # Rough glosses: "B shows respect", "B acts normally",
    # "B does not show respect".
    known_labels = (
        ('B ให้เกียรติ', 3),
        ('B ทำตัวปกติ', 2),
        ('B ไม่ให้เกียรติกัน', 1),
    )
    for known, degree in known_labels:
        if label == known:
            return degree
    return None

def closeness_to_degree(label):
    """Translate a closeness annotation label into its ordinal degree.

    Args:
        label: The raw (Thai) label string taken from an annotation record.

    Returns:
        An integer from 5 (closest) down to 1 for the five known labels, or
        ``None`` when ``label`` matches none of them.
    """
    # (label, degree) pairs listed from the highest degree to the lowest.
    # Rough glosses: "very close", "close", "just acquaintances",
    # "do not know each other", "dislike each other".
    known_labels = (
        ('สนิทกันมาก', 5),
        ('สนิทกัน', 4),
        ('แค่คนรู้จักกัน', 3),
        ('ไม่รู้จักกัน', 2),
        ('ไม่ชอบหน้ากัน', 1),
    )
    for known, degree in known_labels:
        if label == known:
            return degree
    return None
    
    
def load_data(filename, column_name, to_degree):
    """Read one annotator's JSONL file into a DataFrame of labelled items.

    Records whose ``"label"`` list is empty are skipped. For every other
    record only the first label is kept.

    Args:
        filename: Path of the JSONL file, passed straight to ``load_jsonl``.
        column_name: Name of the column that receives the raw label; the
            numeric degree is stored under ``f"{column_name}_degree"``.
        to_degree: Callable that converts a raw label into its degree.

    Returns:
        A ``pandas.DataFrame`` with the columns ``text``, ``tweet``,
        ``column_name`` and ``f"{column_name}_degree"``.
    """
    degree_column = f"{column_name}_degree"

    records = []
    for entry in load_jsonl(filename):
        # Nothing to record for items the annotator left unlabelled.
        if len(entry["label"]) == 0:
            continue

        text, tweet = entry["text"], entry["tweet"]
        first_label = entry["label"][0]
        records.append({
            "text": text,
            "tweet": tweet,
            column_name: first_label,
            degree_column: to_degree(first_label),
        })

    return pd.DataFrame(records)

In [120]:
from collections import defaultdict

# ordinal_weights
def get_weights(categories, ndigits=2):
    """Build the ordinal agreement-weight table for an ordered category list.

    The weight between the categories at positions ``i`` and ``j`` is

        1 - C(|i - j| + 1, 2) / C(q, 2)

    where ``q = len(categories)`` and ``C(n, 2) = n * (n - 1) / 2``. Identical
    categories therefore weigh 1 and the two extreme categories weigh 0.

    With the default ``ndigits=2`` the result for 3, 4 and 5 categories is
    exactly the table that used to be hard-coded here (0.67, 0.83, 0.9, ...),
    so previously computed agreement scores are unchanged. Any larger number
    of categories is now supported as well.

    Args:
        categories: Category values in their ordinal order. Must support
            ``len()`` and iteration, and the values must be hashable.
        ndigits: Number of decimals each weight is rounded to. Pass ``None``
            to keep the unrounded weights.

    Returns:
        A ``defaultdict(dict)`` such that ``weights[a][b]`` is the weight
        between categories ``a`` and ``b``.

    Raises:
        Exception: If fewer than two categories are given; the formula is
            undefined there because ``C(q, 2)`` is 0.
    """
    q = len(categories)
    if q < 2:
        raise Exception("No Implementation")

    # Largest possible penalty, reached by the two extreme categories: C(q, 2).
    max_penalty = q * (q - 1) / 2

    weights = defaultdict(dict)
    for i, l in enumerate(categories):
        for j, k in enumerate(categories):
            gap = abs(i - j)
            # C(gap + 1, 2): the penalty grows faster than the rank distance.
            penalty = gap * (gap + 1) / 2
            weight = 1 - penalty / max_penalty
            if ndigits is not None:
                weight = round(weight, ndigits)
            weights[l][k] = weight
    return weights

    
def cal_agreement(df1, df2, column, categories, cat_column):
    """Compute the weighted kappa agreement between two annotators.

    The two frames are inner-joined on ``column``. ``cat_column`` must exist
    in both frames, so after the join the first annotator's rating is in
    ``f"{cat_column}_x"`` and the second annotator's in ``f"{cat_column}_y"``.
    Items for which either rating is missing are ignored.

    The score is ``(Pa - Pe) / (1 - Pe)`` where ``Pa`` is the weighted
    observed agreement and ``Pe`` the weighted agreement expected from the
    two annotators' marginal category frequencies. The weights come from
    ``get_weights(categories)``.

    NOTE: if a key value occurs more than once in either frame, the join
    yields one row per combination, and every such row is counted.

    Args:
        df1: Ratings of the first annotator.
        df2: Ratings of the second annotator.
        column: Name of the key column both frames are joined on.
        categories: Category values in their ordinal order.
        cat_column: Name of the column holding the rating in both frames.

    Returns:
        The weighted kappa as a float.

    Raises:
        ZeroDivisionError: If no item is rated by both annotators, so there
            is nothing to compare.
        Exception: If the expected agreement ``Pe`` equals 1, which makes
            kappa undefined.
    """
    col_x = f"{cat_column}_x"
    col_y = f"{cat_column}_y"

    merged = pd.merge(df1, df2, on=column)
    # Only a missing rating makes an item unusable; a missing value in an
    # unrelated column (e.g. the text) must not remove it from the comparison.
    merged = merged.dropna(subset=[col_x, col_y])

    N = len(merged)
    if N == 0:
        # Fail up front with an explanation instead of a bare division error.
        raise ZeroDivisionError(
            f"no items left to compare after merging on {column!r} "
            f"and dropping missing {cat_column!r} ratings"
        )

    weights = get_weights(categories)
    ratings_x = merged[col_x]
    ratings_y = merged[col_y]

    # cnt_matrix[l][k]: items rated k by the first and l by the second annotator.
    cnt_matrix = {
        l: {k: int(((ratings_x == k) & (ratings_y == l)).sum()) for k in categories}
        for l in categories
    }
    # Per-annotator totals for each category (the marginals).
    acc_x = {l: int((ratings_x == l).sum()) for l in categories}
    acc_y = {l: int((ratings_y == l).sum()) for l in categories}

    # Weighted observed agreement.
    Pa = 0
    for l in categories:
        for k in categories:
            Pa += weights[l][k]*cnt_matrix[l][k]/N

    # Weighted agreement expected by chance from the marginals.
    Pe = 0
    for l in categories:
        for k in categories:
            Pe += weights[l][k]*(acc_x[l]/N)*(acc_y[k]/N)

    if Pe==1:
        raise Exception("Divide by zero")

    kappa = (Pa-Pe)/(1-Pe)
    return kappa

# Test Agreement

In [121]:
# Per-annotator DataFrames keyed by user id: one dict for the authority task
# and one for the closeness task.
auth_df = {}
clos_df = {}
for u in users:
    # Each annotator has one JSONL file per task under the v0.1 directory.
    auth_df[u] = load_data(f"raw_data/annotated/v0.1/authority_v0.1/{u}.jsonl", "authority", authority_to_degree)
    clos_df[u] = load_data(f"raw_data/annotated/v0.1/closeness_v0.1/{u}.jsonl", "closeness", closeness_to_degree)

Loaded 20 records from raw_data/annotated/v0.1/authority_v0.1/user0.jsonl
Loaded 20 records from raw_data/annotated/v0.1/closeness_v0.1/user0.jsonl
Loaded 20 records from raw_data/annotated/v0.1/authority_v0.1/user1.jsonl
Loaded 20 records from raw_data/annotated/v0.1/closeness_v0.1/user1.jsonl
Loaded 20 records from raw_data/annotated/v0.1/authority_v0.1/user2.jsonl
Loaded 20 records from raw_data/annotated/v0.1/closeness_v0.1/user2.jsonl
Loaded 20 records from raw_data/annotated/v0.1/authority_v0.1/user3.jsonl
Loaded 20 records from raw_data/annotated/v0.1/closeness_v0.1/user3.jsonl


In [122]:
import numpy as np

In [123]:
# Agreement on the authority task: compare every other annotator against
# "user0", matching items by their "tweet" value.
print("Authority")
alliaa = []
for u in users[1:]:
    # Authority degrees take the three ordinal values 1..3.
    iaa = cal_agreement(auth_df["user0"], auth_df[u], column="tweet", categories=[1,2,3], cat_column="authority_degree")
    print(u, iaa)
    alliaa.append(iaa)
    
# Individual scores followed by their mean.
print(alliaa, np.mean(alliaa))

Authority
user1 0.5152998776009788
user2 0.5152998776009788
user3 0.5119356512714061
[0.5152998776009788, 0.5152998776009788, 0.5119356512714061] 0.5141784688244546


In [124]:
# Agreement on the closeness task: compare every other annotator against
# "user0", matching items by their "tweet" value.
print("Closeness")
alliaa = []
for u in users[1:]:
    # Closeness degrees take the five ordinal values 1..5.
    iaa = cal_agreement(clos_df["user0"], clos_df[u], column="tweet", categories=[1, 2, 3, 4, 5], cat_column="closeness_degree")
    print(u, iaa)
    alliaa.append(iaa)
    
# Individual scores followed by their mean.
print(alliaa, np.mean(alliaa))

Closeness
user1 0.6288156288156285
user2 0.6653116531165313
user3 0.5922330097087379
[0.6288156288156285, 0.6653116531165313, 0.5922330097087379] 0.6287867638802993
