In [1]:
import sys
sys.path.append('..')

In [2]:
from utils import dump_jsonl, load_jsonl, CLOSENESS_LABELS, AUTHORITY_LABELS

In [3]:
ls annotated/v0.1/closeness_v0.1

user0.jsonl  user1.jsonl  user2.jsonl  user3.jsonl


In [4]:
import pandas as pd

# Annotator identifiers; each one is used below to build the path of that
# annotator's "<user>.jsonl" annotation file.
users = ["user0", "user1", "user2", "user3"]

def authority_to_degree(label):
    """Translate an authority annotation label into its numeric degree.

    The three recognised label strings map to 3, 2 and 1 respectively (in the
    order they are listed below). Any other value yields None.
    """
    # Scanned with `==` rather than hashed, so any kind of value may be passed.
    known_labels = (
        ('B ‡πÉ‡∏´‡πâ‡πÄ‡∏Å‡∏µ‡∏¢‡∏£‡∏ï‡∏¥', 3),
        ('B ‡∏ó‡∏≥‡∏ï‡∏±‡∏ß‡∏õ‡∏Å‡∏ï‡∏¥', 2),
        ('B ‡πÑ‡∏°‡πà‡πÉ‡∏´‡πâ‡πÄ‡∏Å‡∏µ‡∏¢‡∏£‡∏ï‡∏¥‡∏Å‡∏±‡∏ô', 1),
    )
    for text, degree in known_labels:
        if label == text:
            return degree
    return None

def closeness_to_degree(label):
    """Translate a closeness annotation label into its numeric degree.

    The four recognised label strings map to 4, 3, 2 and 1 respectively (in
    the order they are listed below). Any other value yields None.
    """
    # Scanned with `==` rather than hashed, so any kind of value may be passed.
    known_labels = (
        ('‡∏™‡∏ô‡∏¥‡∏ó‡∏Å‡∏±‡∏ô‡∏°‡∏≤‡∏Å', 4),
        ('‡πÅ‡∏Ñ‡πà‡∏Ñ‡∏ô‡∏£‡∏π‡πâ‡∏à‡∏±‡∏Å‡∏Å‡∏±‡∏ô', 3),
        ('‡πÑ‡∏°‡πà‡∏£‡∏π‡πâ‡∏à‡∏±‡∏Å‡∏Å‡∏±‡∏ô', 2),
        ('‡πÑ‡∏°‡πà‡∏ä‡∏≠‡∏ö‡∏´‡∏ô‡πâ‡∏≤‡∏Å‡∏±‡∏ô', 1),
    )
    for text, degree in known_labels:
        if label == text:
            return degree
    return None
    
    
def load_data(filename, column_name, to_degree):
    """Read an annotation JSONL file into a DataFrame of labelled rows.

    Args:
        filename: path handed to ``load_jsonl``.
        column_name: name of the output column holding the first label of
            each row; ``"<column_name>_degree"`` holds its converted value.
        to_degree: callable applied to the first label of each row.

    Returns:
        A DataFrame with the columns ``text``, ``tweet``, ``column_name`` and
        ``"<column_name>_degree"``. Rows whose ``label`` list is empty are
        left out.
    """
    degree_column = f"{column_name}_degree"
    records = [
        {
            "text": entry["text"],
            "tweet": entry["tweet"],
            column_name: entry["label"][0],
            degree_column: to_degree(entry["label"][0]),
        }
        for entry in load_jsonl(filename)
        if len(entry["label"]) != 0  # unlabelled rows carry no rating
    ]
    return pd.DataFrame(records)

In [5]:
from collections import defaultdict

# ordinal_weights
def get_weights(categories):
    """Build the ordinal agreement-weight table for an ordered category list.

    The weight between the categories at positions ``i`` and ``j`` is
    ``1 - d*(d+1) / (q*(q-1))`` with ``d = |i - j|`` and
    ``q = len(categories)`` (equivalently ``1 - C(d+1, 2) / C(q, 2)``),
    rounded to two decimals. For 3, 4 and 5 categories this yields exactly
    the tables that used to be hard-coded here (0.67, 0.83/0.50,
    0.9/0.7/0.4), so existing results are unchanged; other sizes of two or
    more categories are now supported as well.

    Args:
        categories: category values in ordinal order. Each value is used as a
            dict key, so it must be hashable.

    Returns:
        A ``defaultdict(dict)`` where ``weights[a][b]`` is the weight between
        categories ``a`` and ``b``: 1.0 for identical positions, 0.0 for the
        two extremes.

    Raises:
        Exception: ``"No Implementation"`` when fewer than two categories are
            given (the formula's denominator would be zero).
    """
    q = len(categories)
    if q < 2:
        raise Exception("No Implementation")

    # Twice C(q, 2); the matching factor 2 is dropped from the numerator too.
    max_penalty = q * (q - 1)

    weights = defaultdict(dict)
    for i, l in enumerate(categories):
        for j, k in enumerate(categories):
            d = abs(i - j)
            # Two-decimal rounding keeps parity with the historical tables.
            weights[l][k] = round(1 - d * (d + 1) / max_penalty, 2)
    return weights

    
def cal_agreement(df1, df2, column, categories, cat_column):
    """Compute the weighted kappa agreement between two annotators.

    The two frames are inner-joined on ``column``; the ratings are then read
    from ``"<cat_column>_x"`` (``df1``) and ``"<cat_column>_y"`` (``df2``),
    the suffixes ``pd.merge`` gives to column names present in both frames.

    Args:
        df1: first annotator's frame; must contain ``column`` and
            ``cat_column``.
        df2: second annotator's frame with the same two columns.
        column: name of the key column used to match items across frames.
        categories: ordered category values, passed on to ``get_weights``.
        cat_column: name of the rating column to compare.

    Returns:
        ``(Pa - Pe) / (1 - Pe)`` where ``Pa`` is the weighted observed
        agreement and ``Pe`` the weighted agreement expected from the two
        annotators' marginal distributions.

    Raises:
        ZeroDivisionError: when no item has a rating from both annotators.
        Exception: ``"Divide by zero"`` when ``Pe`` equals 1; anything
            ``get_weights`` raises for the given ``categories``.
    """
    x_col = f"{cat_column}_x"
    y_col = f"{cat_column}_y"

    # Inner join: only items present in both frames are compared, so the
    # result may have fewer rows than either input.
    merged = pd.merge(df1, df2, on=column)
    # Discard a pair only when one of its two ratings is missing; a missing
    # value in an unrelated column must not throw away a valid rating pair.
    merged = merged.dropna(subset=[x_col, y_col])

    weights = get_weights(categories)

    N = len(merged)
    if N == 0:
        raise ZeroDivisionError(
            f"No items rated by both annotators after merging on {column!r}"
        )

    ratings_x = merged[x_col]
    ratings_y = merged[y_col]

    # cnt_matrix[l][k]: number of items rated k by df1 and l by df2.
    cnt_matrix = {
        l: {k: int(((ratings_x == k) & (ratings_y == l)).sum()) for k in categories}
        for l in categories
    }
    # Marginal counts per category for each annotator.
    acc_matrix = {
        "x": {l: int((ratings_x == l).sum()) for l in categories},
        "y": {l: int((ratings_y == l).sum()) for l in categories},
    }

    # Weighted observed agreement.
    Pa = 0
    for l in categories:
        for k in categories:
            Pa += weights[l][k]*cnt_matrix[l][k]/N

    # Weighted agreement expected by chance from the marginals.
    Pe = 0
    for l in categories:
        for k in categories:
            Pe += weights[l][k]*(acc_matrix["x"][l]/N)*(acc_matrix["y"][k]/N)

    if Pe==1:
        raise Exception("Divide by zero")

    kappa = (Pa-Pe)/(1-Pe)
    return kappa

# Test Agreement

In [6]:
# Load every annotator's authority and closeness annotations into
# per-annotator DataFrames, keyed by the annotator id from `users`.
auth_df = {}
clos_df = {}
for u in users:
    auth_df[u] = load_data(f"annotated/v0.1/authority_v0.1/{u}.jsonl", "authority", authority_to_degree)
    clos_df[u] = load_data(f"annotated/v0.1/closeness_v0.1/{u}.jsonl", "closeness", closeness_to_degree)

Loaded 20 records from annotated/v0.1/authority_v0.1/user0.jsonl
Loaded 20 records from annotated/v0.1/closeness_v0.1/user0.jsonl
Loaded 20 records from annotated/v0.1/authority_v0.1/user1.jsonl
Loaded 20 records from annotated/v0.1/closeness_v0.1/user1.jsonl
Loaded 20 records from annotated/v0.1/authority_v0.1/user2.jsonl
Loaded 20 records from annotated/v0.1/closeness_v0.1/user2.jsonl
Loaded 20 records from annotated/v0.1/authority_v0.1/user3.jsonl
Loaded 20 records from annotated/v0.1/closeness_v0.1/user3.jsonl


In [7]:
import numpy as np

In [10]:
# Inspect the authority annotations loaded for user0.
auth_df["user0"]

Unnamed: 0,text,tweet,authority,authority_degree
0,A: #‡∏Ñ‡∏•‡∏±‡∏ö‡πÄ‡∏Æ‡πâ‡∏≤‡∏™‡πåtoxic ‡πÑ‡∏Æ‡πÇ‡∏ã‡πÅ‡∏ö‡∏ö‡∏ô‡∏µ‡πâ ‡πÄ‡∏£‡∏≤‡∏Å‡πá‡∏°‡∏≥‡πÑ‡∏î‡πâ‡∏ô‡∏∞ ‡πÄ‡∏ú...,https://twitter.com/i/web/status/1456862455305...,B ‡∏ó‡∏≥‡∏ï‡∏±‡∏ß‡∏õ‡∏Å‡∏ï‡∏¥,2
1,A: ‡∏´‡∏ô‡∏π‡∏°‡∏µ‡∏û‡∏µ‡πà‡∏Ñ‡∏ô‡∏´‡∏ô‡∏∂‡πà‡∏á‡πÄ‡∏õ‡πá‡∏ô‡πÑ‡∏≠‡∏î‡∏≠‡∏•‡∏Ñ‡πà‡∏∞‡πÅ‡∏°‡πà ‡∏´‡∏ô‡∏π‡∏à‡∏∞‡∏Ç‡∏≤‡∏¢‡∏´‡∏µ‡πÄ‡∏´...,https://twitter.com/i/web/status/1488876213754...,B ‡πÑ‡∏°‡πà‡πÉ‡∏´‡πâ‡πÄ‡∏Å‡∏µ‡∏¢‡∏£‡∏ï‡∏¥‡∏Å‡∏±‡∏ô,1
2,A: ‡∏Ñ‡∏ß‡∏≤‡∏°‡∏Ñ‡∏¥‡∏î‡πÄ‡∏´‡πá‡∏ô‡∏™‡πà‡∏ß‡∏ô‡∏ï‡∏±‡∏ß : #‡∏Ñ‡∏•‡∏±‡∏ö‡πÄ‡∏Æ‡∏≤‡∏™‡πåtoxic ‡∏°‡∏µ‡∏Ñ‡∏≥‡∏û‡∏π...,https://twitter.com/i/web/status/1457174314113...,B ‡πÉ‡∏´‡πâ‡πÄ‡∏Å‡∏µ‡∏¢‡∏£‡∏ï‡∏¥,3
3,A: ‡∏™‡∏µ‡∏°‡πà‡∏ß‡∏á‡∏Ñ‡∏∑‡∏≠‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏à‡∏≤‡∏Å‡∏™‡πÄ‡∏õ‡∏ã‡∏ß‡∏±‡∏ô‡∏ô‡∏µ‡πâ ‡∏™‡∏µ‡∏ô‡πâ‡∏≥‡πÄ‡∏á‡∏¥‡∏ô‡∏Ñ‡∏∑‡∏≠‡∏ó‡∏µ...,https://twitter.com/i/web/status/1494612254544...,B ‡πÉ‡∏´‡πâ‡πÄ‡∏Å‡∏µ‡∏¢‡∏£‡∏ï‡∏¥,3
4,A: ‡πÄ‡∏Ç‡∏¥‡∏ô‡πÄ‡∏ß‡∏Å‡∏±‡∏™‡∏à‡∏π‡∏ö‡∏û‡∏µ‡∏ó‡∏ó‡∏∏‡∏Å‡∏Ñ‡∏£‡∏±‡πâ‡∏á‡πÄ‡∏•‡∏¢‡∏≠‡πà‡∏∞ ‡∏ô‡∏±‡∏á‡∏ä‡∏≠‡∏ö‡πÉ‡∏ä‡πâ‡∏à‡∏°‡∏π‡∏Å...,https://twitter.com/i/web/status/1546146399682...,B ‡πÉ‡∏´‡πâ‡πÄ‡∏Å‡∏µ‡∏¢‡∏£‡∏ï‡∏¥,3
5,A: ‡∏≠‡∏¢‡∏≤‡∏Å‡∏°‡∏≤‡πÅ‡∏ä‡∏£‡πå‡∏õ‡∏™‡∏Å.‡∏ô‡∏¥‡∏î‡∏´‡∏ô‡πà‡∏≠‡∏¢ ‡πÄ‡∏£‡∏≤‡πÄ‡∏Ñ‡∏¢‡∏Å‡∏î‡∏î‡∏±‡∏ô‡∏ï‡∏±‡∏ß‡πÄ‡∏≠‡∏á‡∏°‡∏≤‡∏Å...,https://twitter.com/i/web/status/1520391472457...,B ‡πÉ‡∏´‡πâ‡πÄ‡∏Å‡∏µ‡∏¢‡∏£‡∏ï‡∏¥,3
6,A: ‡∏ü‡∏±‡∏á‡πÑ‡∏õ‡∏à‡∏î‡πÑ‡∏õ‡∏¢‡∏±‡∏á‡∏á‡∏á‡πÄ‡∏•‡∏¢ ü§îü§î #‡∏´‡∏±‡∏ß‡∏´‡∏ô‡πâ‡∏≤‡∏ß‡∏á‡∏ó‡∏µ‡πà‡∏Ç‡∏∂‡πâ‡∏ô‡∏ï‡πâ‡∏ô‡∏ß‡πà...,https://twitter.com/i/web/status/1494319446323...,B ‡∏ó‡∏≥‡∏ï‡∏±‡∏ß‡∏õ‡∏Å‡∏ï‡∏¥,2
7,A: ‡∏£‡∏≥‡∏Ñ‡∏≤‡∏ç‡∏û‡∏ß‡∏Å‡∏ó‡∏µ‡πà‡∏ö‡πà‡∏ô‡∏ß‡πà‡∏≤ ‡∏™‡∏±‡∏á‡∏Ñ‡∏°‡πÑ‡∏î‡πâ‡∏õ‡∏£‡∏∞‡πÇ‡∏¢‡∏ä‡∏ô‡πå‡∏≠‡∏∞‡πÑ‡∏£‡∏à‡∏≤‡∏Å‡∏Å‡∏≤...,https://twitter.com/i/web/status/1500489552485...,B ‡∏ó‡∏≥‡∏ï‡∏±‡∏ß‡∏õ‡∏Å‡∏ï‡∏¥,2
8,A: ‡∏Ñ‡∏ß‡∏≤‡∏°‡∏≠‡∏≥‡∏°‡∏´‡∏¥‡∏ï‡∏ß‡∏á‡∏Å‡∏≤‡∏£‡∏ö‡∏±‡∏ô‡πÄ‡∏ó‡∏¥‡∏á‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏û‡∏£‡∏∞‡∏™‡∏∂‡∏Å‡πÉ‡∏´‡∏°‡πà - ‡∏≠‡∏¥...,https://twitter.com/i/web/status/1487700593460...,B ‡∏ó‡∏≥‡∏ï‡∏±‡∏ß‡∏õ‡∏Å‡∏ï‡∏¥,2
9,A: #DSI‡πÅ‡∏ï‡∏á‡πÇ‡∏° ‡∏≠‡∏µ‡∏Å‡πÄ‡∏£‡∏∑‡πà‡∏≠‡∏á‡∏´‡∏ô‡∏∂‡πà‡∏á‡∏ó‡∏µ‡πà‡∏™‡∏á‡∏™‡∏±‡∏¢ ‡πÄ‡∏ß‡∏•‡∏≤‡∏ï‡∏Å‡∏Å‡∏±‡∏ö‡πÄ...,https://twitter.com/i/web/status/1509109934428...,B ‡∏ó‡∏≥‡∏ï‡∏±‡∏ß‡∏õ‡∏Å‡∏ï‡∏¥,2


In [8]:
# Agreement on the authority degree between user0 and each other annotator,
# with items matched on the "tweet" column; the last line prints the
# individual scores and their mean.
print("Authority")
alliaa = []
for u in users[1:]:
    iaa = cal_agreement(auth_df["user0"], auth_df[u], column="tweet", categories=[1,2,3], cat_column="authority_degree")
    print(u, iaa)
    alliaa.append(iaa)
    
print(alliaa, np.mean(alliaa))

Authority
user1 0.5152998776009788
user2 0.5152998776009788
user3 0.5119356512714061
[0.5152998776009788, 0.5152998776009788, 0.5119356512714061] 0.5141784688244546


In [9]:
# Agreement on the closeness degree between user0 and each other annotator,
# with items matched on the "tweet" column; the last line prints the
# individual scores and their mean.
print("Closeness")
alliaa = []
for u in users[1:]:
    iaa = cal_agreement(clos_df["user0"], clos_df[u], column="tweet", categories=[1,2,3,4], cat_column="closeness_degree")
    print(u, iaa)
    alliaa.append(iaa)
    
print(alliaa, np.mean(alliaa))

Closeness
user1 0.6161425159235672
user2 0.6734563971992363
user3 0.6373333333333332
[0.6161425159235672, 0.6734563971992363, 0.6373333333333332] 0.6423107488187122
