# Is there a correlation between annotation quality and lexical data?

To investigate:

* MRC familiarity
* Polysemy types

Annotation quality is measured by:

* performance on expert annotations
* inter-annotator agreement
* contradiction rate


**To do**

* Find aggregated labels for evaluation and comparison
* Measure the average and standard deviation of the ratings for correct and incorrect labels
* Count polysemy types for correct vs. incorrect labels and for expected vs. unexpected agreement


Aggregated labels (relations): ../aggregated_labels/run-all--group_experiment-all--batch-all--cleaned_contradictions_batch_0.5-vote_majority_vote-relations.csv


Aggregated labels (levels): ../aggregated_labels/run-all--group_experiment-all--batch-all--cleaned_contradictions_batch_0.5-vote_majority_vote-levels.csv

In [1]:
from utils_data import load_experiment_data, load_gold_data
from calculate_iaa import get_agreement
from utils_analysis import sort_by_key

from sklearn.metrics import precision_recall_fscore_support as p_r_f1
import csv
from collections import defaultdict, Counter
from statistics import stdev
from scipy.stats import spearmanr

# progress bar
from tqdm import tqdm

In [30]:
def load_aggregated_labels(category='relations'):
    """Read the aggregated majority-vote labels for one category.

    The file name is built from a fixed directory, a fixed experiment
    identifier and `category`, which fills the last part of the file name
    (the notes in this file mention 'relations' and 'levels').

    Returns:
        list of dicts, one per CSV row, keyed by the header row.
    """
    exp = 'run-all--group_experiment-all--batch-all'
    agg_dir = '../aggregated_labels/'
    # agg_dir already ends in '/', so the resulting path contains '//'.
    path = f'{agg_dir}/{exp}--cleaned_contradictions_batch_0.5-vote_majority_vote-{category}.csv'
    with open(path) as csv_file:
        reader = csv.DictReader(csv_file)
        rows = [row for row in reader]
    return rows


def load_lexical_data():
    """Load the lexical data table and group its rows by word.

    Reads the vocabulary CSV and passes the rows to `sort_by_key` with
    'word' as the only key.

    Returns:
        The result of `sort_by_key`; the rest of this file uses it as a
        mapping from a word to the list of rows for that word.
    """
    # this is what we originally sampled from
    with open('../data_lexical_info/vocab/all_lodce_mrc.csv') as infile:
        rows = [row for row in csv.DictReader(infile)]
    return sort_by_key(rows, ['word'])

def get_outcome_dict(gold, agg_dict):
    """Split concepts by whether the aggregated label matches the gold label.

    Args:
        gold: iterable of dicts with the keys 'relation', 'property',
            'concept' and 'answer'. Items whose answer is 'NOGOLD' are
            looked up but not counted.
        agg_dict: mapping from the triple string 'relation-property-concept'
            to a list holding exactly one dict with the keys 'majority_vote'
            and 'concept'.

    Returns:
        defaultdict(set) mapping 'correct' / 'incorrect' to the set of
        concepts with at least one such triple. The outcome is decided per
        triple but stored per concept, so a concept with several triples
        can appear in both sets.

    Raises:
        AssertionError: if a triple does not map to exactly one aggregated
            row.

    Side effects:
        Prints the number of concepts per outcome.
    """
    keys = ('relation', 'property', 'concept')
    outcome_dict = defaultdict(set)
    for item in gold:
        triple = '-'.join(item[k] for k in keys)
        agg_rows = agg_dict[triple.strip()]
        assert len(agg_rows) == 1, 'too many values'
        agg_row = agg_rows[0]
        # Only the aggregated label is lower-cased; the gold answer is
        # compared as given.
        label = agg_row['majority_vote'].lower()
        gold_label = item['answer']
        if gold_label == 'NOGOLD':
            continue
        outcome = 'correct' if label == gold_label else 'incorrect'
        outcome_dict[outcome].add(agg_row['concept'])

    for outcome, concepts in outcome_dict.items():
        print(outcome, len(concepts))
    return outcome_dict
        
def get_lexical_info(concepts, word_info_dict, info='fam'):
    """Look up one lexical attribute for every concept.

    Args:
        concepts: sized collection of concept strings; each is stripped of
            surrounding whitespace before the lookup.
        word_info_dict: mapping from a word to a list of row dicts; only the
            first row is read.
        info: name of the column to read from the row.

    Returns:
        dict mapping the stripped concept to its value. Values made up only
        of digits are converted to float, any other non-empty value is kept
        as a string, and concepts with an empty value or without an entry in
        `word_info_dict` are left out.

    Side effects:
        Prints each concept that is not in `word_info_dict` and a summary of
        the concepts with an empty value.
    """
    values_by_concept = {}
    no_rating = []
    for raw_concept in concepts:
        word = raw_concept.strip()
        if word not in word_info_dict:
            print(word, 'not found')
            continue
        raw_value = word_info_dict[word][0][info]
        if raw_value.isdigit():
            values_by_concept[word] = float(raw_value)
        elif raw_value == '':
            no_rating.append(word)
        else:
            # non-numeric values (e.g. category names) are kept as text
            values_by_concept[word] = raw_value
    print(f'No value for {len(no_rating)} out of {len(concepts)}: {no_rating}')
    return values_by_concept

def get_av_ratings(outcome_dict, word_info_dict, info):
    """Compute average and standard deviation of a rating per outcome group.

    Args:
        outcome_dict: mapping from an outcome name to a collection of
            concepts.
        word_info_dict: lexical data passed on to `get_lexical_info`.
        info: name of the rating column; its values must be numeric for the
            concepts involved, otherwise summing them fails.

    Returns:
        dict mapping each outcome to a dict with the keys
        'av' (mean rating, None if no concept has a value),
        'stdv' (sample standard deviation, None for fewer than two values)
        and 'n_concepts' (number of concepts that have a value).
    """
    result = dict()
    for outcome, concepts in outcome_dict.items():
        concept_value_dict = get_lexical_info(concepts, word_info_dict, info=info)
        values = list(concept_value_dict.values())
        # Guard on emptiness rather than on the sum: a group whose ratings
        # are all 0 has a valid average of 0.0, not a missing one.
        av = sum(values) / len(values) if values else None
        # stdev needs at least two data points
        sd = stdev(values) if len(values) > 1 else None
        result[outcome] = {
            'av': av,
            'stdv': sd,
            'n_concepts': len(values),
        }
    return result


def get_polysemy_type_count(outcome_dict, word_info_dict):
    """Count the polysemy types of the concepts in each outcome group.

    Args:
        outcome_dict: mapping from an outcome name to a collection of
            concepts.
        word_info_dict: lexical data passed on to `get_lexical_info`, read
            for the 'polysemy_type' column.

    Returns:
        dict mapping each outcome to a Counter of polysemy types; concepts
        without a value are not counted.
    """
    return {
        outcome: Counter(
            get_lexical_info(concepts, word_info_dict, info='polysemy_type').values()
        )
        for outcome, concepts in outcome_dict.items()
    }
        

def get_concept_agreement_dict(crowd, iaa_m = 'Proportional'):
    """Collect per-triple agreement scores for every concept.

    The annotations are grouped by (relation, property, concept); agreement
    is computed for each group and filed under the group's concept.

    Args:
        crowd: annotation dicts with at least the keys 'relation',
            'property' and 'concept'.
        iaa_m: key used to pick one measure from the result of
            `get_agreement` ('Proportional' and 'Krippendorff' are the
            values used in this file).

    Returns:
        defaultdict(list) mapping a concept to the list of agreement scores
        of its triples.

    Raises:
        AssertionError: if a group does not contain exactly one concept.
    """
    by_triple = sort_by_key(crowd, ['relation', 'property', 'concept'])
    agreement_by_concept = defaultdict(list)

    # tqdm only adds a progress bar; the keys of the grouping are not needed
    for _, annotations in tqdm(by_triple.items()):
        distinct_concepts = {a['concept'] for a in annotations}
        assert len(distinct_concepts) == 1, 'More than one concept detected'
        concept = annotations[0]['concept']
        scores = get_agreement(annotations, v = False)
        agreement_by_concept[concept].append(scores[iaa_m])
    return agreement_by_concept


def get_ratings_agreement(concept_agreement_dict, word_info_dict, rating_type = 'fam'):
    """Pair each concept's lexical rating with its mean agreement score.

    Args:
        concept_agreement_dict: mapping from a concept to a non-empty list
            of agreement scores.
        word_info_dict: mapping from a word to a list of row dicts; only the
            first row is read.
        rating_type: name of the column to read from the row.

    Returns:
        Tuple (ratings, agreements) of two equally long, aligned lists.
        Values made up only of digits are converted to float, any other
        non-empty value is kept as a string. Concepts that are missing from
        `word_info_dict` or have an empty value are skipped.
    """
    ratings = []
    agreements = []
    for concept, scores in concept_agreement_dict.items():
        if concept not in word_info_dict:
            continue
        value = word_info_dict[concept][0][rating_type]
        # Only an empty field counts as "no rating". Testing the converted
        # rating for truthiness would also drop a legitimate rating of 0.
        if value == '':
            continue
        rating = float(value) if value.isdigit() else value
        ratings.append(rating)
        agreements.append(sum(scores) / len(scores))
    return ratings, agreements

In [11]:
# load crowd:
# NOTE(review): the '*' / 'experiment*' values look like glob-style
# wildcards that select everything — confirm in load_experiment_data.
run = '*'
group = 'experiment*'
n_q = '*'
batch = '*'
crowd = load_experiment_data(run, group, n_q, batch)

Discarded 655.0 annotations.


In [5]:
# test iaa
# Agreement over the complete crowd data; the call itself prints the
# measures shown in the cell output.
iaa = get_agreement(crowd)

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)


Krippendorff's alpha: 0.3318404604764501
Average Cohen's Kappa (pairwise): 0.3152848933711451
Proportional agreement (pairwise): 0.6743882516665843



In [12]:
# load aggregated labels
# Uses the default category ('relations'); the rows are then grouped by
# relation, property and concept so they can be looked up per triple.
agg = load_aggregated_labels()
agg_dict = sort_by_key(agg, ['relation', 'property', 'concept'])

In [39]:
# load lexical data

word_info_dict = load_lexical_data()
# Check that every word maps to exactly one row: the lookups in this file
# only ever read the first row (index 0) of an entry.
check = [(len(v)== 1) for k, v in word_info_dict.items()]
print(all(check))

True


In [40]:
# load gold
gold = load_gold_data()
# show the first two gold items to inspect their fields
gold[:2]

[{'quid': 'impossible-shovel-roll',
  'relation': 'impossible',
  'property': 'roll',
  'concept': 'shovel',
  'answer': 'false',
  'expected_agreement': 'disagreement',
  'disagreement_cnt': 4,
  'workerid': 'gold',
  'completionurl': 'gold'},
 {'quid': 'unusual-carrot-red',
  'relation': 'unusual',
  'property': 'red',
  'concept': 'carrot',
  'answer': 'NOGOLD',
  'expected_agreement': 'disagreement',
  'disagreement_cnt': 5,
  'workerid': 'gold',
  'completionurl': 'gold'}]

## 1.) Compare lexical data for correct vs. incorrect aggregated labels (based on the gold standard)

In [41]:
# Get correct vs incorrect
    
# number of gold items, followed by the per-outcome concept counts that
# get_outcome_dict prints itself
print(len(gold))
outcome_dict = get_outcome_dict(gold, agg_dict)

154
correct 18
incorrect 12


In [42]:
# familiarity ('fam') ratings per outcome group
info = 'fam'
result = get_av_ratings(outcome_dict, word_info_dict, info)
# one line per outcome: average, standard deviation, number of rated concepts
for l, resdict in result.items():
    print(l, resdict)

No value for 7 out of 18: ['buttercup', 'washer', 'recliner', 'acaridae', 'rhino', 'freebooter', 'stock']
No value for 6 out of 12: ['buttercup', 'washer', 'recliner', 'acaridae', 'rhino', 'freebooter']
correct {'av': 482.72727272727275, 'stdv': 117.3942851327022, 'n_concepts': 11}
incorrect {'av': 537.6666666666666, 'stdv': 27.7320512524167, 'n_concepts': 6}


In [43]:
# 'imag' ratings per outcome group (presumably imageability — TODO confirm)
info = 'imag'
result = get_av_ratings(outcome_dict, word_info_dict, info)
# one line per outcome: average, standard deviation, number of rated concepts
for l, resdict in result.items():
    print(l, resdict)

No value for 7 out of 18: ['buttercup', 'washer', 'recliner', 'acaridae', 'rhino', 'freebooter', 'stock']
No value for 6 out of 12: ['buttercup', 'washer', 'recliner', 'acaridae', 'rhino', 'freebooter']
correct {'av': 545.8181818181819, 'stdv': 112.41069182405931, 'n_concepts': 11}
incorrect {'av': 565.8333333333334, 'stdv': 38.47813231780704, 'n_concepts': 6}


In [44]:
# age of acquisition ('aoa') ratings per outcome group
info = 'aoa'
result = get_av_ratings(outcome_dict, word_info_dict, info)
# one line per outcome: average, standard deviation, number of rated concepts
for l, resdict in result.items():
    print(l, resdict)

No value for 16 out of 18: ['carrot', 'buttercup', 'cruiser', 'plaice', 'washer', 'leopard', 'shovel', 'recliner', 'acaridae', 'pineapple', 'rhino', 'freebooter', 'tire', 'arrow', 'stock', 'pen']
No value for 11 out of 12: ['carrot', 'buttercup', 'washer', 'pineapple', 'recliner', 'acaridae', 'shovel', 'rhino', 'freebooter', 'tire', 'pen']
correct {'av': 332.0, 'stdv': 100.40916292848975, 'n_concepts': 2}
incorrect {'av': 403.0, 'stdv': None, 'n_concepts': 1}


In [47]:
# concreteness ('conc') ratings per outcome group
info = 'conc'
result = get_av_ratings(outcome_dict, word_info_dict, info)
# one line per outcome: average, standard deviation, number of rated concepts
for l, resdict in result.items():
    print(l, resdict)

No value for 7 out of 18: ['buttercup', 'washer', 'recliner', 'acaridae', 'rhino', 'freebooter', 'stock']
No value for 6 out of 12: ['buttercup', 'washer', 'recliner', 'acaridae', 'rhino', 'freebooter']
correct {'av': 571.4545454545455, 'stdv': 89.45877669224372, 'n_concepts': 11}
incorrect {'av': 601.8333333333334, 'stdv': 35.45372570924904, 'n_concepts': 6}


In [48]:
# 'wiki_frequency' values per outcome group
info = 'wiki_frequency'
result = get_av_ratings(outcome_dict, word_info_dict, info)
# one line per outcome: average, standard deviation, number of rated concepts
for l, resdict in result.items():
    print(l, resdict)

No value for 0 out of 18: []
No value for 0 out of 12: []
correct {'av': 17407.277777777777, 'stdv': 28800.587432337423, 'n_concepts': 18}
incorrect {'av': 10775.583333333334, 'stdv': 20855.596739390137, 'n_concepts': 12}


In [50]:
# polysemy

result = get_polysemy_type_count(outcome_dict, word_info_dict)
for l, resdict in result.items():
    # number of concepts in this outcome group that have a polysemy type
    total = sum(resdict.values())
    print(l, total)
    print('---------')
    # most frequent type first: type, count, share of the group total
    for t, c in resdict.most_common():
        print(t, c, c/total)
    print()

No value for 3 out of 18: ['recliner', 'acaridae', 'rhino']
No value for 3 out of 12: ['recliner', 'acaridae', 'rhino']
correct 15
---------
poly 5 0.3333333333333333
homonyms_only_different_pos 5 0.3333333333333333
mon 4 0.26666666666666666
homonyms_also_same_pos 1 0.06666666666666667

incorrect 9
---------
poly 3 0.3333333333333333
homonyms_only_different_pos 3 0.3333333333333333
mon 2 0.2222222222222222
homonyms_also_same_pos 1 0.1111111111111111



## 2.) Check if there is a correlation between agreement and lexical scores



In [51]:

# Per-concept agreement lists using the 'Krippendorff' measure. Agreement is
# computed once per triple, so this cell takes several minutes to run.
concept_agreement_dict_krippendorff = get_concept_agreement_dict(crowd, iaa_m = 'Krippendorff')


  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
100%|██████████| 17917/17917 [05:46<00:00, 51.75it/s]


In [52]:
# Per-concept agreement lists using the 'Proportional' measure. This repeats
# the per-triple agreement computation and again takes several minutes.
concept_agreement_dict_prop = get_concept_agreement_dict(crowd, iaa_m = 'Proportional')

100%|██████████| 17917/17917 [06:06<00:00, 48.89it/s]


In [53]:
# Familiarity
rating_type = 'fam'
# proportional agreement: rating paired with the mean agreement per concept
ratings, agreements = get_ratings_agreement(concept_agreement_dict_prop, word_info_dict, rating_type = rating_type)
# number of concepts with a rating (both lists) vs. number of all concepts
print(len(ratings), len(agreements), len(concept_agreement_dict_prop))
spear = spearmanr(ratings, agreements)
print(spear)

# same correlation based on the Krippendorff agreement scores
ratings, agreements = get_ratings_agreement(concept_agreement_dict_krippendorff, word_info_dict, rating_type = rating_type)
spear = spearmanr(ratings, agreements)
print(spear)

540 540 1317
SpearmanrResult(correlation=-0.041178658718134996, pvalue=0.3395305264243659)
SpearmanrResult(correlation=-0.011746238121894018, pvalue=0.7853646141389479)


In [54]:
# Concreteness
rating_type = 'conc'
# proportional agreement: rating paired with the mean agreement per concept
ratings, agreements = get_ratings_agreement(concept_agreement_dict_prop, word_info_dict, rating_type = rating_type)
# number of concepts with a rating (both lists) vs. number of all concepts
print(len(ratings), len(agreements), len(concept_agreement_dict_prop))
spear = spearmanr(ratings, agreements)
print(spear)

# same correlation based on the Krippendorff agreement scores
ratings, agreements = get_ratings_agreement(concept_agreement_dict_krippendorff, word_info_dict, rating_type = rating_type)
spear = spearmanr(ratings, agreements)
print(spear)

516 516 1317
SpearmanrResult(correlation=0.020607244105233933, pvalue=0.640484887862392)
SpearmanrResult(correlation=0.019089933399731078, pvalue=0.6652852646805701)


In [55]:
# Age of acquisition
rating_type = 'aoa'

# proportional agreement: rating paired with the mean agreement per concept
ratings, agreements = get_ratings_agreement(concept_agreement_dict_prop, word_info_dict, rating_type = rating_type)
# number of concepts with a rating (both lists) vs. number of all concepts
print(len(ratings), len(agreements), len(concept_agreement_dict_prop))
spear = spearmanr(ratings, agreements)
print(spear)

# same correlation based on the Krippendorff agreement scores
ratings, agreements = get_ratings_agreement(concept_agreement_dict_krippendorff, word_info_dict, rating_type = rating_type)
spear = spearmanr(ratings, agreements)
print(spear)

247 247 1317
SpearmanrResult(correlation=0.04734565567809899, pvalue=0.45885325838194546)
SpearmanrResult(correlation=-0.011377984972510823, pvalue=0.8587875626466843)


In [56]:
# imag
# (presumably imageability — TODO confirm against the lexical data source)
rating_type = 'imag'

# proportional agreement: rating paired with the mean agreement per concept
ratings, agreements = get_ratings_agreement(concept_agreement_dict_prop, word_info_dict, rating_type = rating_type)
# number of concepts with a rating (both lists) vs. number of all concepts
print(len(ratings), len(agreements), len(concept_agreement_dict_prop))
spear = spearmanr(ratings, agreements)
print(spear)

# same correlation based on the Krippendorff agreement scores
ratings, agreements = get_ratings_agreement(concept_agreement_dict_krippendorff, word_info_dict, rating_type = rating_type)
spear = spearmanr(ratings, agreements)
print(spear)

518 518 1317
SpearmanrResult(correlation=0.0865525024481382, pvalue=0.048971631146406555)
SpearmanrResult(correlation=0.07398745880382133, pvalue=0.09253702943729658)


In [57]:
# Wiki frequency
rating_type = 'wiki_frequency'

# proportional agreement: value paired with the mean agreement per concept
ratings, agreements = get_ratings_agreement(concept_agreement_dict_prop, word_info_dict, rating_type = rating_type)
# number of concepts with a value (both lists) vs. number of all concepts
print(len(ratings), len(agreements), len(concept_agreement_dict_prop))
spear = spearmanr(ratings, agreements)
print(spear)

# same correlation based on the Krippendorff agreement scores
ratings, agreements = get_ratings_agreement(concept_agreement_dict_krippendorff, word_info_dict, rating_type = rating_type)
spear = spearmanr(ratings, agreements)
print(spear)

1298 1298 1317
SpearmanrResult(correlation=0.03092789807077539, pvalue=0.26551266529519524)
SpearmanrResult(correlation=-1.789438725001037e-05, pvalue=0.9994861035850755)


In [58]:
# Polysemy
# Polysemy type is categorical, so instead of a correlation the mean
# proportional agreement is reported per polysemy type.

rating_type = 'polysemy_type'
ratings, agreements = get_ratings_agreement(concept_agreement_dict_prop, word_info_dict, rating_type = rating_type)
print(len(ratings), len(agreements), len(concept_agreement_dict_prop))
polysemy_agreement = defaultdict(list)

# group the per-concept mean agreement by the concept's polysemy type
for r, a in zip(ratings, agreements):
    polysemy_agreement[r].append(a)
    
# average agreement per polysemy type
for r, a in polysemy_agreement.items():
    av_a = sum(a)/len(a)
    print(r, av_a)


1024 1024 1317
poly 0.682615581132293
homonyms_also_same_pos 0.6822993776439703
mon 0.6904455146699301
homonyms_only_different_pos 0.6826598227289749


In [59]:
# Polysemy
# Same per-type average as for proportional agreement, here based on the
# Krippendorff agreement scores.

rating_type = 'polysemy_type'
ratings, agreements = get_ratings_agreement(concept_agreement_dict_krippendorff, word_info_dict, rating_type = rating_type)
print(len(ratings), len(agreements), len(concept_agreement_dict_krippendorff))
polysemy_agreement = defaultdict(list)

# group the per-concept mean agreement by the concept's polysemy type
for r, a in zip(ratings, agreements):
    polysemy_agreement[r].append(a)
    
# average agreement per polysemy type
for r, a in polysemy_agreement.items():
    av_a = sum(a)/len(a)
    print(r, av_a)


1024 1024 1317
poly 0.2048169649560948
homonyms_also_same_pos 0.21450596233026287
mon 0.21948861129198657
homonyms_only_different_pos 0.20457653550357413


### What about the words we excluded?