# Is there a correlation between annotation quality and lexical data?

To investigate:

* MRC familiarity
* Polysemy types

Annotation quality measured by:

* performance on expert annotations
* agreement 
* contradiction rate


**To do**

* Find aggregated labels for evaluation and comparison
* Measure the mean and standard deviation of the lexical ratings for correctly and incorrectly labelled concepts
* Count polysemy types for correct vs. incorrect labels and for expected vs. unexpected agreement


Aggregated labels (relations): ../aggregated_labels/run-all--group_experiment-all--batch-all--cleaned_contradictions_batch_0.5-vote_majority_vote-relations.csv


Aggregated labels (levels): ../aggregated_labels/run-all--group_experiment-all--batch-all--cleaned_contradictions_batch_0.5-vote_majority_vote-levels.csv

In [31]:
from utils_data import load_experiment_data, load_gold_data
from calculate_iaa import get_agreement
from utils_analysis import sort_by_key

from sklearn.metrics import precision_recall_fscore_support as p_r_f1
import csv
from collections import defaultdict, Counter
from statistics import stdev
from scipy.stats import spearmanr

# progress bar
from tqdm import tqdm

In [46]:
def load_aggregated_labels(category='relations'):
    """Load the majority-vote aggregated labels for one label category.

    Reads the file
    ``run-all--group_experiment-all--batch-all--cleaned_contradictions_batch_0.5-vote_majority_vote-<category>.csv``
    from ``../aggregated_labels`` (relative to the working directory).

    Args:
        category: File-name suffix selecting which aggregation to read,
            e.g. 'relations' (default) or 'levels'.

    Returns:
        A list with one dict per CSV row, keyed by the header columns.

    Raises:
        OSError: If the file for `category` cannot be opened.
    """
    agg_dir = '../aggregated_labels'
    exp = 'run-all--group_experiment-all--batch-all'
    path = f'{agg_dir}/{exp}--cleaned_contradictions_batch_0.5-vote_majority_vote-{category}.csv'
    # newline='' leaves line-ending handling to the csv module, so quoted
    # fields that contain line breaks are read back unchanged.
    with open(path, newline='') as infile:
        return list(csv.DictReader(infile))


def load_lexical_data():
    """Load the lexical overview table and group its rows by lemma.

    Reads ``../data_lexical_info/vocab/lemma_concept_property_overview_ids.csv``
    (relative to the working directory) and passes the rows to
    ``sort_by_key(rows, ['lemma'])``.

    Returns:
        Whatever ``sort_by_key`` returns. The rest of this notebook uses it
        as a mapping ``lemma -> list of row dicts``. A lemma can map to more
        than one row.

    Raises:
        OSError: If the CSV file cannot be opened.
    """
    path = '../data_lexical_info/vocab/lemma_concept_property_overview_ids.csv'
    # newline='' leaves line-ending handling to the csv module, so quoted
    # fields that contain line breaks are read back unchanged.
    with open(path, newline='') as infile:
        dicts = list(csv.DictReader(infile))
    return sort_by_key(dicts, ['lemma'])

def get_outcome_dict(gold, agg_dict):
    """Split gold-annotated concepts by whether the aggregated label is right.

    For each gold entry the key ``relation-property-concept`` is looked up in
    `agg_dict`. Exactly one aggregated row must exist for it. The row's
    lower-cased ``majority_vote`` is compared with the gold ``answer``.
    Entries whose answer is ``'NOGOLD'`` are looked up and checked like the
    others, but are not counted.

    Args:
        gold: Iterable of dicts with the keys 'relation', 'property',
            'concept' and 'answer'.
        agg_dict: Mapping from triple string to a list of aggregated rows,
            each with the keys 'majority_vote' and 'concept'.

    Returns:
        A ``defaultdict(set)`` with the keys 'correct' and/or 'incorrect',
        each holding the set of concepts with that outcome. A concept may
        appear in both sets if it occurs in several triples.

    Raises:
        KeyError: If a triple is missing from a plain-dict `agg_dict`.
        AssertionError: If a triple does not map to exactly one row.

    Side effects:
        Prints the number of concepts per outcome.
    """
    triple_fields = ('relation', 'property', 'concept')
    outcome_dict = defaultdict(set)

    for gold_entry in gold:
        triple = '-'.join(gold_entry[field] for field in triple_fields)
        candidates = agg_dict[triple.strip()]
        assert len(candidates) == 1, 'too many values'
        aggregated = candidates[0]
        label = aggregated['majority_vote'].lower()
        gold_label = gold_entry['answer']
        if gold_label == 'NOGOLD':
            # No expert answer to evaluate against.
            continue
        bucket = 'correct' if label == gold_label else 'incorrect'
        outcome_dict[bucket].add(aggregated['concept'])

    for outcome, concepts in outcome_dict.items():
        print(outcome, len(concepts))
    return outcome_dict
        
def get_lexical_info(concepts, word_info_dict, info='fam'):
    """Look up one lexical attribute for each concept.

    Args:
        concepts: Iterable of concept strings. Surrounding whitespace is
            stripped before the lookup.
        word_info_dict: Mapping ``lemma -> list of row dicts``. Only the
            first row of each lemma is consulted.
        info: Name of the column to read, e.g. 'fam' or 'polysemy_type'.

    Returns:
        A dict ``concept -> value``. Values that parse as numbers, including
        decimals and negative numbers, are stored as ``float``. Any other
        non-empty value is kept as the original string. Concepts that are
        missing from `word_info_dict`, or whose value is empty, are left out.

    Side effects:
        Prints how many known concepts have an empty value. Concepts that
        are absent from `word_info_dict` are not listed there.
    """
    concept_value_dict = dict()
    no_rating = []
    for c in concepts:
        c = c.strip()
        if c not in word_info_dict:
            continue
        value = word_info_dict[c][0][info]
        if not value:
            # Empty cell (or None from a short CSV row): nothing to record.
            no_rating.append(c)
            continue
        try:
            # float() also accepts decimals and signs, which isdigit() rejects.
            concept_value_dict[c] = float(value)
        except ValueError:
            # Categorical value such as a polysemy type.
            concept_value_dict[c] = value
    print(f'No value for {len(no_rating)} out of {len(concepts)}: {no_rating}')
    return concept_value_dict

def get_av_ratings(outcome_dict, word_info_dict, info):
    """Compute mean and standard deviation of a rating per outcome group.

    Args:
        outcome_dict: Mapping ``outcome -> collection of concepts``.
        word_info_dict: Lexical lookup passed on to ``get_lexical_info``.
        info: Name of the numeric rating column to summarise.

    Returns:
        A dict ``outcome -> {'av', 'stdv', 'n_concepts'}``. 'av' is ``None``
        when no concept has a value. 'stdv' is ``None`` when fewer than two
        concepts have a value. 'n_concepts' counts the concepts with a value.

    Raises:
        TypeError: If the looked-up values are not all numeric.
    """
    result = dict()
    for outcome, concepts in outcome_dict.items():
        concept_value_dict = get_lexical_info(concepts, word_info_dict, info=info)
        values = list(concept_value_dict.values())
        n_values = len(values)
        # Test for "no data" on the count, not on the sum: a sum of zero or
        # below is still a valid basis for an average.
        av = sum(values) / n_values if n_values else None
        # The sample standard deviation needs at least two data points.
        sd = stdev(values) if n_values > 1 else None
        result[outcome] = {'av': av, 'stdv': sd, 'n_concepts': n_values}
    return result


def get_polysemy_type_count(outcome_dict, word_info_dict):
    """Count polysemy types per outcome group.

    Args:
        outcome_dict: Mapping ``outcome -> collection of concepts``.
        word_info_dict: Lexical lookup passed on to ``get_lexical_info``.

    Returns:
        A dict ``outcome -> Counter`` that tallies the 'polysemy_type' value
        of every concept in that group for which a value was found.
    """
    return {
        outcome: Counter(
            get_lexical_info(concepts, word_info_dict, info='polysemy_type').values()
        )
        for outcome, concepts in outcome_dict.items()
    }
        

def get_concept_agreement_dict(crowd, iaa_m = 'Proportional'):
    """Collect per-triple agreement scores, grouped by concept.

    The annotations are grouped by (relation, property, concept) with
    ``sort_by_key``. For each group, ``get_agreement`` is called and the
    entry named `iaa_m` is appended to the list of the group's concept.

    Args:
        crowd: Annotation dicts with at least the keys 'relation',
            'property' and 'concept'.
        iaa_m: Key of the agreement measure to read from the result of
            ``get_agreement``. This notebook uses 'Proportional' and
            'Krippendorff'.

    Returns:
        A ``defaultdict(list)`` mapping ``concept -> list of scores``, with
        one score per triple the concept occurs in.

    Raises:
        AssertionError: If one group contains more than one concept.
    """
    per_concept = defaultdict(list)
    grouped = sort_by_key(crowd, ['relation', 'property', 'concept'])

    for _, annotations in tqdm(grouped.items()):
        distinct_concepts = {a['concept'] for a in annotations}
        assert len(distinct_concepts) == 1, 'More than one concept detected'
        score = get_agreement(annotations, v = False)[iaa_m]
        per_concept[annotations[0]['concept']].append(score)
    return per_concept


def get_ratings_agreement(concept_agreement_dict, word_info_dict, rating_type = 'fam'):
    """Pair each concept's lexical rating with its mean agreement score.

    Args:
        concept_agreement_dict: Mapping ``concept -> list of agreement
            scores``.
        word_info_dict: Mapping ``lemma -> list of row dicts``. Only the
            first row of each lemma is consulted.
        rating_type: Name of the column to read from that row.

    Returns:
        A tuple ``(ratings, agreements)`` of two equally long, index-aligned
        lists. Values that parse as numbers, including decimals and 0, are
        stored as ``float``. Any other non-empty value is kept as a string.
        Concepts with no lexical entry, an empty value, or no agreement
        scores are skipped.
    """
    ratings = []
    agreements = []
    for c, ag in concept_agreement_dict.items():
        if not ag or c not in word_info_dict:
            continue
        value = word_info_dict[c][0][rating_type]
        if not value:
            # Empty cell: this concept has no rating.
            continue
        try:
            # float() also accepts decimals and signs, which isdigit() rejects.
            rating = float(value)
        except ValueError:
            # Categorical value such as a polysemy type.
            rating = value
        # A rating of 0.0 is falsy but valid, so append without a truth test.
        ratings.append(rating)
        agreements.append(sum(ag) / len(ag))
    return ratings, agreements

In [4]:
# Load the crowd annotations.
# NOTE(review): '*' presumably acts as a wildcard (all runs, all question
# counts, all batches) and 'experiment*' as a prefix match on the group name -
# confirm against load_experiment_data.
run = '*'
group = 'experiment*'
n_q = '*'
batch = '*'
crowd = load_experiment_data(run, group, n_q, batch)

Discarded 655.0 annotations.


In [5]:
# Sanity check: inter-annotator agreement over the complete crowd data.
# The call prints Krippendorff's alpha, average pairwise Cohen's kappa and
# pairwise proportional agreement.
iaa = get_agreement(crowd)

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)


Krippendorff's alpha: 0.3318404604764501
Average Cohen's Kappa (pairwise): 0.3152848933711451
Proportional agreement (pairwise): 0.6743882516665843



In [6]:
# Load the aggregated labels (default category 'relations') and group them
# by (relation, property, concept) so they can be looked up per triple.
agg = load_aggregated_labels()
agg_dict = sort_by_key(agg, ['relation', 'property', 'concept'])

In [7]:
# Load the lexical overview and report whether every lemma maps to exactly
# one row (prints True only if no lemma has several rows).
word_info_dict = load_lexical_data()
check = [len(rows) == 1 for rows in word_info_dict.values()]
print(all(check))

False


In [8]:
# Load the gold (expert) annotations and show the first two entries.
gold = load_gold_data()
gold[:2]

[{'quid': 'impossible-shovel-roll',
  'relation': 'impossible',
  'property': 'roll',
  'concept': 'shovel',
  'answer': 'false',
  'expected_agreement': 'disagreement',
  'disagreement_cnt': 4,
  'workerid': 'gold',
  'completionurl': 'gold'},
 {'quid': 'unusual-carrot-red',
  'relation': 'unusual',
  'property': 'red',
  'concept': 'carrot',
  'answer': 'NOGOLD',
  'expected_agreement': 'disagreement',
  'disagreement_cnt': 5,
  'workerid': 'gold',
  'completionurl': 'gold'}]

## 1.) Evaluate against correct vs incorrect labels based on gold standard

In [9]:
# Get correct vs incorrect: compare the aggregated majority vote with the
# gold answer for every gold triple (entries marked 'NOGOLD' are not counted).
    
print(len(gold))
outcome_dict = get_outcome_dict(gold, agg_dict)

154
correct 18
incorrect 12


In [10]:
# Familiarity ('fam'): mean and standard deviation per outcome group.
info = 'fam'
result = get_av_ratings(outcome_dict, word_info_dict, info)
for l, resdict in result.items():
    print(l, resdict)

No value for 11 out of 18: ['freebooter', 'pin', 'acaridae', 'rhino', 'stock', 'leopard', 'tire', 'recliner', 'wine', 'buttercup', 'washer']
No value for 8 out of 12: ['freebooter', 'acaridae', 'rhino', 'tire', 'recliner', 'wine', 'buttercup', 'washer']
correct {'av': 462.7142857142857, 'stdv': 140.8376302528486, 'n_concepts': 7}
incorrect {'av': 527.5, 'stdv': 27.79088579612628, 'n_concepts': 4}


In [11]:
# Age of acquisition ('aoa'): mean and standard deviation per outcome group.
info = 'aoa'
result = get_av_ratings(outcome_dict, word_info_dict, info)
for l, resdict in result.items():
    print(l, resdict)

No value for 18 out of 18: ['freebooter', 'pin', 'cruiser', 'acaridae', 'pen', 'rhino', 'shovel', 'stock', 'leopard', 'tire', 'recliner', 'plaice', 'pineapple', 'wine', 'carrot', 'arrow', 'buttercup', 'washer']
No value for 12 out of 12: ['freebooter', 'acaridae', 'pen', 'rhino', 'shovel', 'tire', 'recliner', 'pineapple', 'wine', 'carrot', 'buttercup', 'washer']
correct {'av': None, 'stdv': None, 'n_concepts': 0}
incorrect {'av': None, 'stdv': None, 'n_concepts': 0}


In [12]:
# Concreteness ('conc'): mean and standard deviation per outcome group.
info = 'conc'
result = get_av_ratings(outcome_dict, word_info_dict, info)
for l, resdict in result.items():
    print(l, resdict)

No value for 11 out of 18: ['freebooter', 'pin', 'acaridae', 'rhino', 'stock', 'leopard', 'tire', 'recliner', 'wine', 'buttercup', 'washer']
No value for 8 out of 12: ['freebooter', 'acaridae', 'rhino', 'tire', 'recliner', 'wine', 'buttercup', 'washer']
correct {'av': 558.1428571428571, 'stdv': 111.72351076269872, 'n_concepts': 7}
incorrect {'av': 606.75, 'stdv': 37.91547617178681, 'n_concepts': 4}


In [53]:
# Wiki frequency: mean and standard deviation per outcome group.
info = 'wiki_frequency'
result = get_av_ratings(outcome_dict, word_info_dict, info)
for l, resdict in result.items():
    print(l, resdict)

No value for 0 out of 18: []
No value for 0 out of 12: []
correct {'av': 13073.222222222223, 'stdv': 25424.774061245058, 'n_concepts': 18}
incorrect {'av': 5893.25, 'stdv': 8407.676082820768, 'n_concepts': 12}


In [13]:
# polysemy: distribution of polysemy types within each outcome group

result = get_polysemy_type_count(outcome_dict, word_info_dict)
for l, resdict in result.items():
    # total = number of concepts in this group that have a polysemy type
    total = sum(resdict.values())
    print(l, total)
    print('---------')
    # most frequent type first; third column is the share within the group
    for t, c in resdict.most_common():
        print(t, c, c/total)
    print()

No value for 3 out of 18: ['acaridae', 'rhino', 'recliner']
No value for 3 out of 12: ['acaridae', 'rhino', 'recliner']
correct 15
---------
homonyms_only_different_pos 5 0.3333333333333333
poly 5 0.3333333333333333
mon 4 0.26666666666666666
homonyms_also_same_pos 1 0.06666666666666667

incorrect 9
---------
homonyms_only_different_pos 3 0.3333333333333333
poly 3 0.3333333333333333
mon 2 0.2222222222222222
homonyms_also_same_pos 1 0.1111111111111111



## 2.) Check if there is a correlation between agreement and lexical ratings



In [68]:

# Per-concept agreement using the 'Krippendorff' measure (one score per triple).
concept_agreement_dict_krippendorff = get_concept_agreement_dict(crowd, iaa_m = 'Krippendorff')

# The correlation cells below read `concept_agreement_dict`, which no visible
# cell defines. Bind it here so the notebook runs top to bottom.
# NOTE(review): the recorded outputs (mean agreement of about 0.2) look
# Krippendorff-based - confirm that this is the intended measure.
concept_agreement_dict = concept_agreement_dict_krippendorff


100%|██████████| 17917/17917 [05:48<00:00, 51.48it/s]


In [None]:
# Per-concept agreement using the 'Proportional' measure (one score per triple).
concept_agreement_dict_prop = get_concept_agreement_dict(crowd, iaa_m = 'Proportional')

In [62]:
# Familiarity: Spearman rank correlation between the rating and the mean
# per-concept agreement.
rating_type = 'fam'
ratings, agreements = get_ratings_agreement(concept_agreement_dict, word_info_dict, rating_type = rating_type)
spear = spearmanr(ratings, agreements)
print(spear)

SpearmanrResult(correlation=-0.03162221896472617, pvalue=0.5136133225296674)


In [63]:
# Concreteness: Spearman rank correlation between the rating and the mean
# per-concept agreement.
rating_type = 'conc'
ratings, agreements = get_ratings_agreement(concept_agreement_dict, word_info_dict, rating_type = rating_type)
spear = spearmanr(ratings, agreements)
print(spear)

SpearmanrResult(correlation=0.02748585030854983, pvalue=0.5789276736119442)


In [52]:
# Age of acquisition: Spearman rank correlation between the rating and the
# mean per-concept agreement.
rating_type = 'aoa'
ratings, agreements = get_ratings_agreement(concept_agreement_dict, word_info_dict, rating_type = rating_type)
spear = spearmanr(ratings, agreements)
print(spear)

SpearmanrResult(correlation=0.05386632909258574, pvalue=0.45334531474777173)


In [64]:
# Wiki frequency: Spearman rank correlation between the frequency and the
# mean per-concept agreement.
rating_type = 'wiki_frequency'
ratings, agreements = get_ratings_agreement(concept_agreement_dict, word_info_dict, rating_type = rating_type)
spear = spearmanr(ratings, agreements)
print(spear)

SpearmanrResult(correlation=0.0011168563996093548, pvalue=0.9678109521590175)


In [67]:
# Polysemy: the type is categorical, so instead of a correlation the mean
# agreement is reported per polysemy type.

rating_type = 'polysemy_type'
ratings, agreements = get_ratings_agreement(concept_agreement_dict, word_info_dict, rating_type = rating_type)

# polysemy type -> mean agreement of every concept with that type
polysemy_agreement = defaultdict(list)

for r, a in zip(ratings, agreements):
    polysemy_agreement[r].append(a)
    
# average of the per-concept means within each type
for r, a in polysemy_agreement.items():
    av_a = sum(a)/len(a)
    print(r, av_a)





poly 0.2048169649560948
homonyms_also_same_pos 0.21450596233026287
mon 0.21948861129198657
homonyms_only_different_pos 0.20457653550357413


Counter({(1, 10): 1, (2, 15): 1, (3, 25): 1})