# Annotation analysis

This notebook

* illustrates the steps of the analysis currently implemented
* is used to analyze and monitor workers batch by batch

In [2]:
from load_data import load_experiment_data
from analyze_workers import get_worker_analysis
from analyze_pairs import get_pair_analysis
from analyze_pairs import show_pairs_of_worker
from analyze_pairs import compare_runs
from calculate_iaa import get_agreement
from clean_annotations import filter_annotations
from analyze_relations_and_examples import agreement_relations_across_runs

# Analyze worker contradictions:

* Workers with highest contradiction to annotations ratio appear on top
* Use the pandas `DataFrame.sort_values` method to sort on other values:

`df.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')`


### Current batch

In [3]:
# Selection of the batch to monitor. '*' presumably acts as a wildcard
# (it is rendered as '-all-' in the analysis name below).
run, batch = 4, 13
n_q, group = '*', 'experiment2'

# Load the annotations of this selection and compute the per-worker statistics.
data_dict_list = load_experiment_data(run, group, n_q, batch, remove_not_val = True)
name = f'run{run}-group_{group}-batch{batch}'.replace('*', '-all-')
df, filepath = get_worker_analysis(data_dict_list, name)

# Drop the 'annotations' column, then show only the columns needed for monitoring.
df.drop(columns=['annotations'], inplace=True)
df.loc[:, ['workerid', 'contradiction_poss_contradiction_ratio', 'n_contradictions', 'n_fails']]

Discarded 0.0 annotations.


Unnamed: 0,workerid,contradiction_poss_contradiction_ratio,n_contradictions,n_fails
9,5e9713200afe041a361c8b80,0.233333,21,1
0,5ac8a74fe1099600016aaaf2,0.2,18,0
8,5e94fa6625f03e0bd5d1b12c,0.044444,4,1
1,5bdc45e99ab26e0001a473fe,0.033333,3,0
3,5c72a71f595f3f00015b5849,0.033333,3,0
2,5c19f2319bce3100018a67ba,0.011111,1,0
4,5cbf3b3330b52100182bbf89,0.0,0,1
5,5ce2cde71d1060001a811155,0.0,0,0
6,5dc6d78cac77c74d18692044,0.0,0,1
7,5e1103e212e922000bc4bdf9,0.0,0,0


### All annotations

* Workers with highest contradiction to annotations ratio appear on top
* Use the pandas `DataFrame.sort_values` method to sort on other values:
* Check how the behavior of a specific worker with many contradictions compares to other workers

`df.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')`


In [4]:
# Select everything: all runs, batches and question counts of every experiment group.
run, batch = '*', '*'
n_q, group = '*', 'experiment*'

# Load the full annotation set and compute the per-worker statistics.
data_dict_list = load_experiment_data(run, group, n_q, batch, remove_not_val = True)
name = f'run{run}-group_{group}-batch{batch}'.replace('*', '-all-')
df_total, filepath = get_worker_analysis(data_dict_list, name)

# Drop the 'annotations' column, then show the first ten rows of the monitoring columns.
df_total.drop(columns=['annotations'], inplace=True)
df_total.loc[:, ['workerid', 'contradiction_poss_contradiction_ratio', 'n_contradictions', 'n_fails']].head(10)

Discarded 655.0 annotations.


Unnamed: 0,workerid,contradiction_poss_contradiction_ratio,n_contradictions,n_fails
3,5c2c7eae8a20f50001e9a8c0,0.770115,67,0
425,5bfed16ca1f050000102693c,0.712644,62,1
673,5d6ef000391fb4001a97337d,0.69697,69,1
943,111,0.65,39,0
368,5c91321c0375460001034cb8,0.633333,38,0
901,5d76a2dc2a476a0001ce6843,0.625,45,1
431,5e2f554c4c58fa515ad361d0,0.6,3,0
562,5e87276d17cdf1028850af59,0.529412,54,1
320,5d4434c68225f9001642fcd3,0.517241,30,0
641,5c042ba4bb85e60001c05c83,0.505376,47,0


In [6]:
# Good workers: the last 20 rows of the worker table
# (the workers with the highest contradiction ratio are listed on top).
df_total.loc[:, ['workerid', 'contradiction_poss_contradiction_ratio', 'n_contradictions', 'n_fails']].tail(20)

Unnamed: 0,workerid,contradiction_poss_contradiction_ratio,n_contradictions,n_fails
22,5bae3351a91ee200011a4220,0.0,0,0
25,5d6aeb25a6a4220015010bd6,0.0,0,0
26,5e1b28d4d370f435a62cfbe2,0.0,0,0
30,5c2e82582fe63c0001c40448,0.0,0,0
32,5c900d755c6c02001669786b,0.0,0,0
33,5ca87fb236a495001765efef,0.0,0,0
42,5c0a4b3cd26cd500010383fe,0.0,0,0
77,5ea0448e208a8a03174218a0,0.0,0,0
46,5e6a97d6d5745d0522c116ed,0.0,0,0
51,5c0eb7a0e4ed910001c2f002,0.0,0,0


### Get all contradictions and fails of a particular worker 

* Check if a specific worker continuously contradicts themselves (and fails checks)

In [5]:
# Id copied from the current-batch table above: a worker with many
# contradictions in the target batch.
worker = '5e9713200afe041a361c8b80'

# Boolean-mask selection: keep the rows of the overall worker table
# whose 'workerid' equals the chosen worker.
df_total[df_total['workerid'] == worker]


Unnamed: 0,average_time_question,contradiction_annotation_ratio,contradiction_poss_contradiction_ratio,contradictory_pairs_ratio,fail_annotation_ratio,n_annotations,n_contradictions,n_fails,n_possible_contradictions,workerid,...,"(impossible, typical_of_concept)","(implied_category, impossible)","(impossible, typical_of_property)","(affording_activity, impossible)","(afforded_usual, rare)","(afforded_usual, impossible)","(afforded_usual, unusual)","(afforded_unusual, impossible)","(afforded_unusual, rare)","(afforded_unusual, unusual)"
79,12.781158,0.276316,0.233333,0.5,0.013158,76,21,1,90,5e9713200afe041a361c8b80,...,0.022222,0.022222,0.022222,0.022222,,,,,,


# Analyze pairs

* Compares the number of annotator contradictions among pairs


### Current batch:

In [8]:
# Selection of the batch to monitor. '*' presumably acts as a wildcard
# (it is rendered as '-all-' in the analysis name below).
run, batch = '4', '13'
n_q, group = '*', 'experiment2'

# Load the annotations of this selection and compute the per-pair statistics.
data_dict_list = load_experiment_data(run, group, n_q, batch, remove_not_val = True)
name = f'run{run}-group_{group}-batch{batch}'.replace('*', '-all-')
df, filepath = get_pair_analysis(data_dict_list, name)

# Drop the 'annotations_with_contradiction' column before displaying the table.
df.drop(columns=['annotations_with_contradiction'], inplace=True)
df


Discarded 0.0 annotations.


Unnamed: 0,average_time_pair,contradiction_annotation_ratio,contradiction_poss_contradiction_ratio,n_annotations,n_contradictions,n_possible_contradictions,n_workers,n_workers_contradicting,pair,ratio_workers_contradicting,...,"(affording_activity, impossible)","(impossible, typical_of_concept)","(impossible, typical_of_property)","(implied_category, impossible)","(typical_of_property, unusual)","(afforded_usual, unusual)","(afforded_unusual, unusual)","(afforded_usual, rare)","(afforded_unusual, rare)","(afforded_usual, impossible)"
0,93.71455,0.07,0.058333,100,7,120,10,3,wheels-motorbus,0.3,...,,,,,,,,,,
5,93.71455,0.14,0.116667,100,14,120,10,3,sweet-plum,0.3,...,1.0,1.0,1.0,1.0,,,,,,
3,93.71455,0.06,0.05,100,6,120,10,2,hot-wok,0.2,...,1.0,1.0,1.0,1.0,,,,,,
4,93.71455,0.05,0.041667,100,5,120,10,2,black-dog,0.2,...,,,,,1.0,,,,,
6,93.71455,0.12,0.08,100,12,150,10,2,lay_eggs-howler,0.2,...,,,1.0,,2.0,2.0,1.0,1.0,1.0,1.0
7,93.71455,0.06,0.05,100,6,120,10,1,square-dining,0.1,...,,,,,,,,,,
1,234.286375,0.0,0.0,40,0,0,10,0,yellow-tomato,0.0,...,,,,,,,,,,
2,93.71455,0.0,0.0,100,0,150,10,0,fly-robin,0.0,...,,,,,,,,,,
8,937.1455,0.0,0.0,10,0,0,10,0,_check2-_check2,0.0,...,,,,,,,,,,
9,937.1455,0.0,0.0,10,0,0,10,0,_test4-_test,0.0,...,,,,,,,,,,


### Get rows with pairs annotated by specific workers:

* Check if a worker is the only one contradicting or if others also had issues with the pairs

In [9]:
# Worker to inspect (same id as the worker looked up in the worker analysis above).
worker = '5e9713200afe041a361c8b80'

# `df` is the pair-analysis frame assigned by the preceding code cell (current batch).
show_pairs_of_worker(worker, df)

Worker 5e9713200afe041a361c8b80 contradicted themselves in the following pairs:

wheels-motorbus 	 total workers contradicting themselves: 3
sweet-plum 	 total workers contradicting themselves: 3
hot-wok 	 total workers contradicting themselves: 2
black-dog 	 total workers contradicting themselves: 2
square-dining 	 total workers contradicting themselves: 1


### Compare to entire annotation set

In [7]:
# Select everything: all runs, batches and question counts of every experiment group.
run, batch = '*', '*'
n_q, group = '*', 'experiment*'

# Load the full annotation set and compute the per-pair statistics.
data_dict_list = load_experiment_data(run, group, n_q, batch, remove_not_val = True)
name = f'run{run}-group_{group}-batch{batch}'.replace('*', '-all-')
df, filepath = get_pair_analysis(data_dict_list, name)

# Drop the 'annotations_with_contradiction' column, then show the first ten rows.
df.drop(columns=['annotations_with_contradiction'], inplace=True)
df.head(10)

Discarded 655.0 annotations.


Unnamed: 0,average_time_pair,contradiction_annotation_ratio,contradiction_poss_contradiction_ratio,n_annotations,n_contradictions,n_possible_contradictions,n_workers,n_workers_contradicting,pair,ratio_workers_contradicting,...,"(impossible, typical_of_concept)","(implied_category, impossible)","(impossible, typical_of_property)","(affording_activity, impossible)","(afforded_usual, rare)","(afforded_usual, impossible)","(afforded_usual, unusual)","(afforded_unusual, impossible)","(afforded_unusual, rare)","(afforded_unusual, unusual)"
460,91.48726,0.206667,0.155,150,31,200,15,14,roll-bucket,0.933333,...,,1.0,,,1.0,,,1.0,8.0,8.0
340,74.64338,0.146667,0.11,150,22,200,15,13,roll-tray,0.866667,...,,,1.0,,,,,4.0,3.0,9.0
391,67.06886,0.18,0.135,150,27,200,15,13,roll-glass,0.866667,...,,,,,,,1.0,2.0,5.0,9.0
127,50.681993,0.299363,0.235,157,47,200,15,13,roll-nut,0.866667,...,2.0,2.0,1.0,,1.0,2.0,4.0,2.0,5.0,11.0
523,83.71511,0.2,0.133333,100,20,150,10,8,roll-windshield,0.8,...,,1.0,1.0,,1.0,1.0,1.0,4.0,5.0,5.0
362,68.401,0.233333,0.175,150,35,200,15,12,roll-hammer,0.8,...,1.0,2.0,3.0,,,2.0,1.0,5.0,5.0,8.0
414,77.292245,0.235669,0.185,157,37,200,15,12,roll-peg,0.8,...,1.0,2.0,,,3.0,2.0,2.0,5.0,8.0,8.0
113,83.456113,0.226667,0.17,150,34,200,15,12,roll-ferrule,0.8,...,,3.0,,,1.0,1.0,1.0,5.0,6.0,7.0
318,62.03672,0.293333,0.22,150,44,200,15,12,roll-bolt,0.8,...,1.0,1.0,2.0,,3.0,2.0,4.0,4.0,5.0,6.0
322,81.44398,0.24,0.18,150,36,200,15,12,roll-car,0.8,...,1.0,2.0,,,1.0,1.0,2.0,4.0,5.0,11.0


In [10]:
# Look up a single pair in the pair-analysis table.
pair = 'red-lipstick'

# Boolean-mask selection: rows whose 'pair' value equals the requested pair
# (an empty frame is shown when the pair does not occur).
df[df['pair'] == pair]

Unnamed: 0,average_time_pair,contradiction_annotation_ratio,contradiction_poss_contradiction_ratio,n_annotations,n_contradictions,n_possible_contradictions,n_workers,n_workers_contradicting,pair,ratio_workers_contradicting,...,"(impossible, typical_of_concept)","(implied_category, impossible)","(impossible, typical_of_property)","(affording_activity, impossible)","(afforded_usual, rare)","(afforded_usual, impossible)","(afforded_usual, unusual)","(afforded_unusual, impossible)","(afforded_unusual, rare)","(afforded_unusual, unusual)"


### Compare contradictions between runs:

In [8]:
# Both selections cover all batches and all question counts.
batch = '*'
n_q = '*'

# Compute the pair analysis once per (run, group) selection, in this order:
# run 3 of experiment1 first, run 4 of experiment2 second.
pair_frames = []
for run, group in (('3', 'experiment1'), ('4', 'experiment2')):
    data_dict_list = load_experiment_data(run, group, n_q, batch, remove_not_val = True)
    name = f'run{run}-group_{group}-batch{batch}'.replace('*', '-all-')
    pair_df, filepath = get_pair_analysis(data_dict_list, name)
    print(f'analysis can be found at: {filepath}')
    pair_frames.append(pair_df)

df1, df2 = pair_frames

# Labels passed to compare_runs for the two frames.
name1, name2 = '3', '4'
compare_runs(name1, name2, df1, df2, comp = 'all')
# Alternative (disabled): compare_runs(name1, name2, df1, df2, comp = 'pairs')

Discarded 655.0 annotations.
analysis can be found at: ../analyses/pairs/run3-group_experiment1-batch-all-.csv
Discarded 0.0 annotations.
analysis can be found at: ../analyses/pairs/run4-group_experiment2-batch-all-.csv
Set 3 as a contradiction ratio of 0.2528971962616822
Set 4 as a contradiction ratio of 0.2034383954154728
The ratio is based on the number of workers annotating a pair.
A worker always annotates a full set.


(0.2528971962616822, 0.2034383954154728)

# Inter-annotator agreement

* Total agreement
* Agreement by relations
* agreement after removing annotators with many contradictions/annotators who fail checks/contradictory annotations

## Total iaa:

In [9]:
# Agreement is reported for three subsets of the data, each given as (run, group):
# everything, all runs of experiment1, and run 4 of experiment2.
batch = '*'
n_q = '*'
selections = [('*', 'experiment*'), ('*', 'experiment1'), ('4', 'experiment2')]

for run, group in selections:
    # The header names both the run and the group. The run alone is ambiguous:
    # two of the selections use run '*' and would otherwise print the same header.
    print(f"Run {run}, group {group}:")
    data_dict_list = load_experiment_data(run, group, n_q, batch, remove_not_val = True)
    agreement = get_agreement(data_dict_list)

# Display the scores returned for the last selection as the cell result.
agreement

Run *, all experiment groups:
Discarded 655.0 annotations.
Krippendorff's alpha: 0.2378609178333677
Proportional agreement (pairwise): 0.6243037902425451

Run *:
Discarded 655.0 annotations.
Krippendorff's alpha: 0.23567624334318582
Proportional agreement (pairwise): 0.6230672746100898

Run 4:
Discarded 0.0 annotations.
Krippendorff's alpha: 0.25548048351340946
Proportional agreement (pairwise): 0.6389832752546688



{'Krippendorff': 0.25548048351340946, 'Proportional': 0.6389832752546688}

## IAA of collapsed relations



In [10]:
# Both selections cover all batches and all question counts.
batch = '*'
n_q = '*'

# First the full annotation set, then run 4 of experiment2.
for run, group in (('*', 'experiment*'), ('4', 'experiment2')):
    print(f"Run {run}:")
    data_dict_list = load_experiment_data(run, group, n_q, batch, remove_not_val = True)

    # Agreement on the labels as loaded, as the reference point.
    get_agreement(data_dict_list)

    # Agreement again for each way of collapsing the relations.
    for collapse_relations in ('pos_neg', 'levels', 'similar_relations'):
        print(f'collapsing {collapse_relations}')
        agreement_collapsed = get_agreement(data_dict_list, collapse_relations = collapse_relations)

# Display the scores returned by the last call as the cell result.
agreement_collapsed

Run *:
Discarded 655.0 annotations.
Krippendorff's alpha: 0.2378609178333677
Proportional agreement (pairwise): 0.6243037902425451

collapsing pos_neg
pos_neg
Krippendorff's alpha: 0.3431240569115509
Proportional agreement (pairwise): 0.734049218498038

collapsing levels
levels
Krippendorff's alpha: 0.36048524149912353
Proportional agreement (pairwise): 0.6220320269738886

collapsing similar_relations
similar_relations
Krippendorff's alpha: 0.3713240612838109
Proportional agreement (pairwise): 0.6201418103436581

Run 4:
Discarded 0.0 annotations.
Krippendorff's alpha: 0.25548048351340946
Proportional agreement (pairwise): 0.6389832752546688

collapsing pos_neg
pos_neg
Krippendorff's alpha: 0.3732484191333858
Proportional agreement (pairwise): 0.7348484848484849

collapsing levels
levels
Krippendorff's alpha: 0.3845637299085969
Proportional agreement (pairwise): 0.6545679012345681

collapsing similar_relations
similar_relations
Krippendorff's alpha: 0.34007629757251134
Proportional agreement (pairwise): 0.6412412412412413

{'Krippendorff': 0.34007629757251134, 'Proportional': 0.6412412412412413}

## IAA by relation

In [13]:
# Runs to compare and the experiment-group pattern to load them from.
experiment_name = 'experiment*'
runs = ['4', '3', '1']

# Compute the agreement scores per relation for each run; the helper also returns
# the path that is reported below.
path, df = agreement_relations_across_runs(runs, experiment_name)
print(f'Results written to: {path}')

# Show the resulting table of scores per relation.
df

Discarded 0.0 annotations.
Discarded 655.0 annotations.
Discarded 0.0 annotations.
Results written to: ../analyses/iaa/relations_runs4-3-1.csv


Unnamed: 0,relation,4_Krippendorff,4_Proportional,3_Krippendorff,3_Proportional,1_Krippendorff,1_Proportional
0,typical_of_property,0.285893,0.717834,0.234555,0.753711,0.326482,0.682547
1,affording_activity,0.320818,0.66541,0.196397,0.655055,0.149149,0.672259
2,creative,0.076448,0.609178,0.058862,0.543404,0.04002,0.524894
3,rare,0.202142,0.628666,0.12997,0.575412,,
4,variability_open,0.10491,0.555982,0.215107,0.606619,0.143751,0.580081
5,afforded_unusual,0.160385,0.685625,0.067595,0.542798,0.10846,0.557623
6,variability_subcategories,,,,,0.15727,0.579637
7,unusual,0.206405,0.612104,0.157012,0.575154,0.120147,0.561644
8,test_false,0.362976,0.782759,0.258187,0.702768,,
9,_check,0.801358,0.932895,0.60797,0.885378,0.904637,0.964443


## Agreement after removing workers or annotations:

In [11]:
# Select everything: all runs, batches and question counts of every experiment group.
run = '*'
batch = '*'
n_q = '*'
group = 'experiment*'

# Baseline: agreement over the full annotation set, before any filtering.
print('Total Agreement')
data_dict_list = load_experiment_data(run, group, n_q, batch, remove_not_val = True)
get_agreement(data_dict_list)
print('-------')
print()

# NOTE(review): the import cell only binds `filter_annotations` (from the module
# `clean_annotations`). The bare name `clean_annotations` that this cell used to call
# is never imported, so it raised NameError on a fresh kernel. The calls below
# assume filter_annotations accepts (run, group, n_q, batch, category) and returns
# (annotations_clean, annotations_removed) - TODO confirm against clean_annotations.py.
# To inspect the discarded part, pass annotations_removed to get_agreement as well.

print('Removing pair annotations with many contradictions:')
category = 'pairs_contradictions'
print(f'analyze {category}')
annotations_clean, annotations_removed = filter_annotations(run, group, n_q, batch, category)
print('agreement clean')
get_agreement(annotations_clean)


category = 'worker_contradictions'
print(f'analyze {category}')
annotations_clean, annotations_removed = filter_annotations(run, group, n_q, batch, category)
print('agreement clean')
get_agreement(annotations_clean)
print()

category = 'worker_checks'
print(f'analyze {category}')
annotations_clean, annotations_removed = filter_annotations(run, group, n_q, batch, category)
print('agreement clean')
# Last expression of the cell: the returned scores are displayed as the cell result.
get_agreement(annotations_clean)

Total Agreement
Discarded 655.0 annotations.
Krippendorff's alpha: 0.2378609178333677
Proportional agreement (pairwise): 0.6243037902425451

-------

Removing pair annotations with many contradictions:
analyze pairs_contradictions
Discarded 655.0 annotations.
Found 49513 clean annotations.
Round 22603 to remove.

agreement clean
Krippendorff's alpha: 0.3415365612632997
Proportional agreement (pairwise): 0.6575207312935043

analyze worker_contradictions
Discarded 655.0 annotations.
Found 14524 clean annotations.
Round 57525 to remove.

agreement clean
Krippendorff's alpha: 0.38885888521500367
Proportional agreement (pairwise): 0.3914403402461456


analyze worker_checks
Discarded 655.0 annotations.
Found 59171 clean annotations.
Round 12878 to remove.

agreement clean
Krippendorff's alpha: 0.26648729510750124
Proportional agreement (pairwise): 0.6174040877502266



{'Krippendorff': 0.26648729510750124, 'Proportional': 0.6174040877502266}