In [12]:
import numpy as np
import csv
from collections import defaultdict
from transformers import BertTokenizer, BertModel
import torch
from skbio.stats.distance import mantel
from scipy.stats import entropy
from operator import itemgetter
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from tqdm.notebook import tqdm

In [4]:
# Load the averaged annotation file into a list of rows (lists of strings).
# The csv module asks for files to be opened with newline='' so that line
# breaks inside quoted fields reach the parser untouched.
with open('../aggregate-avg-all.csv', newline='', mode='r') as csvfile:
    all_data = list(csv.reader(csvfile, delimiter=',', quotechar='"'))

# The first row holds the column names: build name -> index and index -> name.
f2i = {f: i for i, f in enumerate(all_data[0])}
i2f = {i: f for (f, i) in f2i.items()}

# remove fields row
all_data = all_data[1:]

In [38]:
# Display the column-name -> column-index mapping built from the CSV header row.
f2i

{'_unit_id': 0,
 '_golden': 1,
 '_unit_state': 2,
 '_trusted_judgments': 3,
 '_last_judgment_at': 4,
 'cannot_decide_reason': 5,
 'sim_score': 6,
 'sim_score:variance': 7,
 'a': 8,
 'b': 9,
 'cannot_decide_reason_gold': 10,
 'cluster_a': 11,
 'cluster_b': 12,
 'id_a': 13,
 'id_b': 14,
 'lemma': 15,
 'sim_score_gold': 16,
 'time_a': 17,
 'time_b': 18}

In [37]:
# Display the first data row (the header row was already stripped from all_data).
all_data[0]

['2540617972',
 'false',
 'finalized',
 '5',
 '11/13/2019 17:50:08',
 '',
 '2.2',
 '0.98',
 'undead be born . [[virus]] can not be traced to rotten meat , obscene sexual rite of autumnal haystack , nor the french itch . maybe the devil have made this devil thing .',
 "crack at the tests - - different ones , of course . he was under the weather when he took them the first time - - tail end of some [[virus]] business . he did n ' t miss by much , and the appeal board made a special ruling .",
 '',
 '0',
 '0',
 '1',
 '3',
 'virus',
 '',
 '1970-80',
 '1950-60']

In [6]:
def _target_form(sentence):
    """Return the word wrapped in ``[[...]]`` in `sentence`, or None if absent.

    The sentence is split on whitespace. A token counts as the target only if
    it both starts with ``[[`` and ends with ``]]``; when several tokens
    qualify, the last one wins.
    """
    form = None
    for token in sentence.split():
        if token.startswith('[[') and token.endswith(']]'):
            form = token[2:-2]
    return form


snippets = defaultdict(dict)    # lemma -> {snippet id: (target form, lower-cased text)}
judgements = defaultdict(dict)  # lemma -> {(id_a, id_b): sim_score}
variances = defaultdict(dict)   # lemma -> {(id_a, id_b): sim_score:variance}

for datum in all_data:
    lemma = datum[f2i['lemma']]
    id_a = int(datum[f2i['id_a']])
    id_b = int(datum[f2i['id_b']])

    # Snippets are keyed by their integer id, so membership has to be tested
    # on the id (testing the snippet text against the keys never matches).
    for id_, text in ((id_a, datum[f2i['a']]), (id_b, datum[f2i['b']])):
        if id_ not in snippets[lemma]:
            text = text.lower()
            snippets[lemma][id_] = (_target_form(text), text)

    judgements[lemma][(id_a, id_b)] = float(datum[f2i['sim_score']])
    variances[lemma][(id_a, id_b)] = float(datum[f2i['sim_score:variance']])

In [7]:
# One symmetric matrix of human similarity scores per lemma. Rows and columns
# are indexed directly by snippet id, and the matrix side is the number of
# snippets stored for the lemma; pairs without a judgement stay at 0.
sim_matrices = {}

for w, pair_scores in judgements.items():
    side = len(snippets[w])
    matrix = np.zeros((side, side))

    for (id_a, id_b), score in pair_scores.items():
        value = float(score)
        matrix[id_a, id_b] = value
        matrix[id_b, id_a] = value

    sim_matrices[w] = matrix


In [10]:
# Print the 'virus' snippets in ascending id order (id followed by the text),
# then the human similarity matrix for 'leaf'.
virus_snippets = snippets['virus']
for i in sorted(virus_snippets):
    s = virus_snippets[i]
    print(i, s[1])
print(sim_matrices['leaf'])

0 gye and barnard wanted to make no such mistake . furthermore , they knew that with the isolation of the [[virus]] the battle would be only half won . a cure must be found later .
1 undead be born . [[virus]] can not be traced to rotten meat , obscene sexual rite of autumnal haystack , nor the french itch . maybe the devil have made this devil thing .
2 as the others entered she opened one eye in blissful content . " i ' m afraid , " she announced , " that i ' m getting the village [[virus]] . i do n ' t want to move , i do n ' t want to stir , i do n ' t even want to lift a finger - - ever .
3 crack at the tests - - different ones , of course . he was under the weather when he took them the first time - - tail end of some [[virus]] business . he did n ' t miss by much , and the appeal board made a special ruling .
4 solomon ' s software , recently acquired by data - fellows , offers a pair of virus scanners : dr . solomon ' s anti - [[virus]] deluxe for windows 95 / nt and dr . solom

In [None]:
# Tokenizer and model are loaded from the same checkpoint name so the two can
# never drift apart when switching models.
MODEL_NAME = 'bert-large-uncased'  # alternative: 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# output_hidden_states=True makes the model return the hidden states of every
# layer, which the usage-vector extraction below reads.
lm = BertModel.from_pretrained(
    MODEL_NAME,
    output_hidden_states=True)

In [None]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors `a` and `b`.

    This is the dot product divided by the product of the two Euclidean
    norms. Zero-length vectors get no special treatment.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / norm_product


def canberra_distance(a, b):
    """Return the Canberra distance between `a` and `b`.

    The distance is the sum over coordinates of |a_i - b_i| / (|a_i| + |b_i|).
    A coordinate where both inputs are zero contributes 0 (the usual 0/0 = 0
    convention) instead of turning the whole sum into nan.

    Parameters
    ----------
    a, b : array_like of numbers, broadcastable against each other.

    Returns
    -------
    float
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    numerator = np.abs(a - b)
    denominator = np.abs(a) + np.abs(b)

    # Divide only where the denominator is non-zero; other terms stay at 0.
    terms = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0)
    return np.sum(terms)

In [None]:
def _mark_target(tokens, form, lemma):
    """Drop the ``[[ ]]`` markers from a word-piece list and locate the target.

    Two layouts are recognised:
      * ``[ [ <form> ] ]``            - the target is a single word piece;
      * ``[ [ <lemma> ##suffix ] ]``  - the target is the lemma plus one
        continuation piece.

    Returns ``(clean_tokens, target_pos)`` where ``target_pos`` is the index in
    ``clean_tokens`` of the first piece of the target, or None when neither
    layout occurs. If several targets are marked, the last one wins.
    """
    clean_tokens = []
    target_pos = None
    i = 0
    while i < len(tokens):
        # Slices never raise, so a '[' near the end of the list is harmless.
        opening = tokens[i:i + 3]
        if opening == ['[', '[', form]:
            target_pos = len(clean_tokens)
            clean_tokens.append(form)
            i += 5  # skip "[ [ form ] ]"
        elif (opening == ['[', '[', lemma]
              and i + 3 < len(tokens)
              and tokens[i + 3].startswith('##')):
            target_pos = len(clean_tokens)
            clean_tokens.append(lemma)
            clean_tokens.append(tokens[i + 3])
            i += 6  # skip "[ [ lemma ##suffix ] ]"
        else:
            clean_tokens.append(tokens[i])
            i += 1
    return clean_tokens, target_pos


def _usage_vector(sentence, form, lemma):
    """Return the target word's hidden states from all layers, concatenated.

    The sentence is tokenized, the ``[[ ]]`` markers are removed, and the
    model is run on the result. The returned 1-D array is the concatenation
    of every layer's hidden state at the position of the target's first piece.

    Raises
    ------
    ValueError
        If no marked target can be located in the tokenized sentence.
    """
    tokens, target_pos = _mark_target(tokenizer.tokenize(sentence), form, lemma)
    if target_pos is None:
        raise ValueError(
            'no [[target]] found for lemma {!r} in: {!r}'.format(lemma, sentence))

    # Ask for [CLS]/[SEP] explicitly so the layout does not depend on the
    # library default. [CLS] is prepended, which moves every token, and hence
    # the target, one position to the right.
    token_ids = tokenizer.encode(tokens, add_special_tokens=True)
    target_index = target_pos + 1

    with torch.no_grad():
        outputs = lm(torch.tensor([token_ids]))
        # outputs[2] holds one hidden-state tensor per layer for a batch of one
        # sentence; stack them and drop the batch axis.
        hidden_states = np.stack(
            [layer.detach().cpu().numpy() for layer in outputs[2]]).squeeze(1)

    # Concatenate (rather than sum) the per-layer vectors of the target token.
    return hidden_states[:, target_index, :].reshape(-1)


bert_sim_matrices = {}
for lemma in tqdm(judgements):
    bert_sim_matrices[lemma] = np.zeros_like(sim_matrices[lemma])

    # Each snippet takes part in many pairs; encode it once per lemma.
    usage_vectors = {}

    # Iterate over the pairs of *this* lemma (not of a name left over from an
    # earlier cell).
    for (id_a, id_b) in judgements[lemma]:
        for id_ in (id_a, id_b):
            if id_ not in usage_vectors:
                form, sentence = snippets[lemma][id_]
                usage_vectors[id_] = _usage_vector(sentence, form, lemma)

        sim_score = cosine_similarity(usage_vectors[id_a], usage_vectors[id_b])
        bert_sim_matrices[lemma][id_a, id_b] = sim_score
        bert_sim_matrices[lemma][id_b, id_a] = sim_score

In [None]:
# Mantel test between the human and the BERT similarity matrix of each lemma.
coeffs = {}      # lemma -> correlation coefficient
sig_coeffs = {}  # subset of coeffs whose p-value is below 0.05

mantel_options = dict(
    method='spearman',  # pearson
    permutations=999,
    alternative='two-sided',  # greater, less
)

for w in bert_sim_matrices:
    coeff, p_value, n = mantel(
        sim_matrices[w], bert_sim_matrices[w], **mantel_options)

    print(w)
    print('spearman: {:.2f}    p: {:.2f}'.format(coeff, p_value))

    coeffs[w] = coeff

    # Significant lemmas are collected separately and flagged in the output.
    if p_value < 0.05:
        sig_coeffs[w] = coeff
        print('---')

# Mean coefficient over all lemmas, then over the significant ones only.
for collected in (coeffs, sig_coeffs):
    print(np.mean(list(collected.values())))


---

In [14]:
def ann_entropy(scores, n_classes=5):
    """Return the entropy of the label distribution in `scores`.

    Parameters
    ----------
    scores : sequence of int
        Labels assigned to one item. Any sequence is accepted (list, tuple,
        NumPy array).
    n_classes : int, optional
        Number of label classes; the classes counted are 0 .. n_classes - 1.
        Defaults to 5.

    Returns
    -------
    float
        Entropy as computed by ``scipy.stats.entropy`` on the relative
        frequency of each class.

    Raises
    ------
    ZeroDivisionError
        If `scores` is empty.
    """
    scores = list(scores)
    total = len(scores)
    p_distr = np.array([scores.count(c) / total for c in range(n_classes)])
    return entropy(p_distr)

In [34]:
# Load the file that keeps every individual judgement. newline='' is what the
# csv module expects; the sim_score cells contain embedded line breaks.
with open('../aggregate-all-all.csv', newline='', mode='r') as csvfile:
    agg_data = list(csv.reader(csvfile, delimiter=',', quotechar='"'))

field2index = {f: i for i, f in enumerate(agg_data[0])}
index2field = {i: f for (f, i) in field2index.items()}

# remove fields row
agg_data = agg_data[1:]

print('{} rows.\n'.format(len(agg_data)))
print('Fields:\n{}'.format(list(index2field.values())))

# Per lemma: variance and entropy of the judgements of every row.
variances_bylemma = defaultdict(list)
entropies_bylemma = defaultdict(list)
for row in agg_data:
    lemma = row[field2index['lemma']]
    # The judgements of one row are stored newline-separated in one cell.
    row_judgements = [int(s) for s in row[field2index['sim_score']].split('\n')]
    variances_bylemma[lemma].append(np.var(row_judgements))
    entropies_bylemma[lemma].append(ann_entropy(row_judgements))


mean_variance_bylemma = {}
median_variance_bylemma = {}
max_variance_bylemma = {}
min_variance_bylemma = {}
median_entropy_bylemma = {}
mean_entropy_bylemma = {}
min_entropy_bylemma = {}

# The loop variable is deliberately not called `variances`: that name is the
# per-pair variance dict built earlier in the notebook and must not be
# overwritten here.
for lemma, lemma_variances in variances_bylemma.items():
    lemma_entropies = entropies_bylemma[lemma]

    mean_variance_bylemma[lemma] = np.mean(lemma_variances)
    median_variance_bylemma[lemma] = np.median(lemma_variances)
    max_variance_bylemma[lemma] = np.max(lemma_variances)

    # Smallest variance, ignoring rows whose variance is exactly 0.
    v = np.ma.masked_equal(lemma_variances, 0.0, copy=False)
    min_variance_bylemma[lemma] = np.min(v)

    median_entropy_bylemma[lemma] = np.median(lemma_entropies)
    mean_entropy_bylemma[lemma] = np.mean(lemma_entropies)

    # Smallest entropy, ignoring infinite values.
    # NOTE(review): it is unclear when an infinite entropy would occur here;
    # possibly 0.0 was meant to be masked, as for the variance - confirm.
    h = np.ma.masked_equal(lemma_entropies, np.inf, copy=False)
    min_entropy_bylemma[lemma] = np.min(h)

3285 rows.

Fields:
['_unit_id', '_golden', '_unit_state', '_trusted_judgments', '_last_judgment_at', 'cannot_decide_reason', 'sim_score', 'a', 'b', 'cannot_decide_reason_gold', 'cluster_a', 'cluster_b', 'id_a', 'id_b', 'lemma', 'sim_score_gold', 'time_a', 'time_b']


In [None]:
from scipy.stats import spearmanr


def _paired_by_lemma(first, second):
    """Return the values of two lemma-keyed dicts as lists aligned on lemma.

    Only lemmas present in both dicts are used. The dicts are filled from
    different files, so their insertion orders (and key sets) need not agree;
    pairing by key keeps every coefficient next to its own lemma's statistic.
    """
    shared = [w for w in first if w in second]
    return [first[w] for w in shared], [second[w] for w in shared]


# Rank correlation between model/human agreement and annotation variance.
print('Median ', spearmanr(*_paired_by_lemma(coeffs, median_variance_bylemma)))
print('Mean   ', spearmanr(*_paired_by_lemma(coeffs, mean_variance_bylemma)))
print('Max    ', spearmanr(*_paired_by_lemma(coeffs, max_variance_bylemma)))

# Same, against annotation entropy.
print('Median ', spearmanr(*_paired_by_lemma(coeffs, median_entropy_bylemma)))
print('Mean   ', spearmanr(*_paired_by_lemma(coeffs, mean_entropy_bylemma)))

In [None]:
# Grouped bar chart: per-lemma Mantel coefficient next to annotation variance.
# NOTE(review): this cell used to read `variance_bylemma`, a name that none of
# the visible cells defines (NameError on a fresh kernel). The per-lemma median
# variance is plotted instead - presumably the intended summary; switch to
# mean_variance_bylemma if the mean was meant.
variance_summary = median_variance_bylemma

# X, Y = zip(*sorted(variance_summary.items(), key=itemgetter(1)))
fig = go.Figure(data=[
    go.Bar(name='Agreement', x=list(coeffs.keys()), y=list(coeffs.values())),
    go.Bar(name='Variance', x=list(variance_summary.keys()), y=list(variance_summary.values()))
])

fig.show()

In [36]:
# Write one line per lemma to median_var.txt, ordered by ascending median
# variance: the lemma padded to 10 characters, then the value with 3 decimals.
with open('median_var.txt', 'w') as f:
    ranked = sorted(median_variance_bylemma.items(), key=itemgetter(1))
    for w, var in ranked:
        f.write('{:10} {:.3f}'.format(w, var) + '\n')

In [None]:
# List the lemmas by ascending mean annotation entropy.
ranked_entropy = sorted(mean_entropy_bylemma.items(), key=lambda pair: pair[1])
for w, var in ranked_entropy:
    line = '{:10} {:.2f}'.format(w, var)
    print(line)

In [None]:
# Inter-annotator agreement for 'virus': build a coder x item table of labels
# and pass it to krippendorff_alpha with the interval metric.
# NOTE(review): annotations_bylemma, krippendorff_alpha and interval_metric are
# not defined in the visible cells - confirm they exist on a fresh kernel.
virus_annotations = annotations_bylemma['virus']  # triples - 0:coder 1:item 2:label
A_bycoder = sorted(virus_annotations, key=itemgetter(0))
A_byitem = sorted(virus_annotations, key=itemgetter(1, 0))

# Number coders and items in order of first appearance in the sorted lists.
coder2idx = {}
for (coder, _item, _label) in A_bycoder:
    coder2idx.setdefault(coder, len(coder2idx))

item2idx = {}
for (_coder, item, _label) in A_byitem:
    item2idx.setdefault(item, len(item2idx))

missing = '*' # indicator for missing values

# One row per coder, one column per item; cells without a label keep the
# missing-value marker, labels are stored as strings.
table = [['*'] * len(item2idx) for _ in range(len(coder2idx))]
for (coder, item, label) in A_byitem:
    table[coder2idx[coder]][item2idx[item]] = str(label)

agr_data = tuple(tuple(l) for l in table)

print("interval metric: %.4f" % krippendorff_alpha(agr_data, interval_metric, missing_items=missing))