In [1]:
import pandas as pd
import glob
from scipy.stats import pearsonr
from scipy import stats
import numpy as np

class HIT(object):
    """One worker assignment parsed from a row of an MTurk batch-result file.

    The raw answer (``row['Answer.Q1']``) is a ``'|'``-separated list of items,
    each of the form ``<item id>__<ignored>__<integer answer>``.

    Attributes set by the constructor:
        hit_id, worker_id, assignment_id, status: copied from the row.
        working_time: ``WorkTimeInSeconds`` converted to minutes (2 decimals).
        ans: the raw ``Answer.Q1`` string.
        quality_score: number of trap items answered as expected.
        annotation_res: dict mapping every non-trap item id to its score
            (z-normalized floats by default, raw ints otherwise).
        task_id: index label of the first ``df_s`` row whose ``text_0``
            (apostrophes removed) occurs in the raw answer, or -1 if none does.
    """

    def __init__(self, row, df_s, apply_znorm=True):
        """Parse one assignment.

        Args:
            row: mapping-like record (e.g. a pandas row) providing the keys
                'HITId', 'WorkerId', 'AssignmentId', 'WorkTimeInSeconds',
                'Answer.Q1' and 'AssignmentStatus'.
            df_s: DataFrame with a 'text_0' column used to look up the task id.
            apply_znorm: when True (default, the original behaviour) the
                annotation scores are z-normalized per assignment; pass False
                to keep the raw integer scores.
        """
        self.hit_id = row['HITId']
        self.worker_id = row['WorkerId']
        self.assignment_id = row['AssignmentId']
        self.working_time = round(row['WorkTimeInSeconds'] / 60, 2)
        self.ans = row['Answer.Q1']
        self.status = row['AssignmentStatus']

        self.quality_score = 0
        self.annotation_res = {}
        for item in self.ans.split('|'):
            item_id, _, raw_answer = item.split('__')
            answer = int(raw_answer)
            if 'TRAP-SAME' in item_id:
                # Trap item expected to receive a very high answer.
                if answer >= 90:
                    self.quality_score += 1
            elif 'TRAP-DIFFERENT' in item_id:
                # Trap item expected to receive a very low answer.
                if answer <= 10:
                    self.quality_score += 1
            else:
                self.annotation_res[item_id] = answer

        self.task_id = -1
        # Only the 'text_0' column is needed, so iterate it directly instead of
        # materialising every full row with iterrows().
        for idx, text in df_s['text_0'].items():
            if text.replace('\'', '') in self.ans:
                self.task_id = idx
                break

        if apply_znorm:
            self.normalize()

    def normalize(self):
        """Replace the annotation scores with their z-scores (scipy.stats.zscore).

        Values are stored as plain Python floats so that the dict's repr (which
        ends up in a CSV cell) stays parseable by ``ast.literal_eval``; NumPy 2
        prints NumPy scalars as ``np.float64(...)``.
        If every score is identical the standard deviation is zero and the
        z-scores are NaN.
        """
        if not self.annotation_res:
            # Nothing to normalize; avoids warnings from zscore on empty input.
            return
        keys = list(self.annotation_res.keys())
        zscores = stats.zscore(list(self.annotation_res.values()))
        self.annotation_res = {key: float(z) for key, z in zip(keys, zscores)}
        
def calculate_correlation(data):
    """Average leave-one-out Pearson correlation between annotators.

    For every annotator, the Pearson correlation between their scores and the
    element-wise mean of all *other* annotators' scores is computed; the
    function returns the mean of those correlations. With exactly three
    annotators this is identical to the original hard-coded formula.

    Args:
        data: sequence of at least two objects exposing an ``annotation_res``
            dict (item id -> numeric score). Items are aligned using the keys
            of ``data[0]``; every other annotator must contain the same keys.

    Returns:
        The mean of the per-annotator Pearson correlations.

    Raises:
        ValueError: if fewer than two annotators are given.
        KeyError: if an annotator lacks one of the first annotator's item ids.
    """
    if len(data) < 2:
        raise ValueError('calculate_correlation needs at least two annotators')

    # Use the first annotator's key order so all score vectors are aligned.
    keys = list(data[0].annotation_res.keys())
    vectors = [
        np.array([annotator.annotation_res[key] for key in keys])
        for annotator in data
    ]

    correlations = []
    for position, own_scores in enumerate(vectors):
        others = vectors[:position] + vectors[position + 1:]
        pearson, _ = pearsonr(own_scores, np.mean(others, axis=0))
        correlations.append(pearson)
    return sum(correlations) / len(correlations)

In [14]:
LANG = 'ES'
# Glob pattern matching all downloaded MTurk batch files for this language.
path = f'mturk/annotation_result/{LANG}/clean/*'
# Table handed to every HIT so it can resolve its task id.
df_s = pd.read_csv(f'mturk/images/{LANG}/turker.csv')

hits = []
# glob.glob returns matches in arbitrary order; sorting makes the order of
# `hits` (and of everything derived from it) reproducible across runs.
for file in sorted(glob.glob(path)):
    df = pd.read_csv(file)
    hits.extend(HIT(row, df_s) for _, row in df.iterrows())

cols = ['task_id', 'hit_id', 'worker_id', 'assignment_id', 'status', 'quality_score', 'working_time(min)', 'annotation_res']
# Assignments whose trap-question score is below this value are discarded.
MIN_QUALITY_SCORE = 7

# Collect one record per assignment and build the frame in a single call:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration.
records = [
    {
        'task_id': hit.task_id,
        'hit_id': hit.hit_id,
        'worker_id': hit.worker_id,
        'assignment_id': hit.assignment_id,
        'status': hit.status,
        'working_time(min)': hit.working_time,
        'quality_score': hit.quality_score,
        'annotation_res': hit.annotation_res,
    }
    for hit in hits
]
res = pd.DataFrame(records, columns=cols)

res = res[res['quality_score'] >= MIN_QUALITY_SCORE]

res = res.sort_values(by=['task_id'], ignore_index=True)
res.to_csv(f'mturk/annotation_result/{LANG}/analysis_clean.csv', columns=cols, index=False)

# Group the kept assignments by task: task_id -> list of result rows.
# (Loop variables are named so that the builtin `id` is not rebound at
# notebook scope.)
data = {}
for row_idx, row in res.iterrows():
    data.setdefault(row['task_id'], []).append(row)

# Inter-annotator correlation per task, then the mean over all tasks.
corrs = {}
for task_id, task_rows in data.items():
    assert len(task_rows) == 3, (
        f'task {task_id}: expected 3 assignments, found {len(task_rows)}'
    )
    corrs[task_id] = calculate_correlation(task_rows)
print(corrs)
print(np.mean(list(corrs.values())))

{0: 0.6109349386889638, 1: 0.48939003387575947, 2: 0.6263745391110125, 3: 0.6399141902001321, 4: 0.6545092245074118, 5: 0.608827895976754}
0.6049918037266723


In [12]:
# Display the result table for inspection.
# NOTE(review): the saved output lists 'RU-...' item ids although LANG is set
# to 'ES' in the preceding cell, and this cell's execution count is lower than
# that cell's, so the output may be stale; re-run to confirm.
res

Unnamed: 0,task_id,hit_id,worker_id,assignment_id,status,quality_score,working_time(min),annotation_res
0,0,31HLTCK4BLVA8YJX6TBP08SKWWYGVH,A1LUWOV426JV1N,3JRJSWSMQHLVZVJG6QKW5UBFWM63E8,Submitted,10,48.2,{'RU-510-REFERENCE-SYSTEM-BERT': -0.9448209096...
1,0,31HLTCK4BLVA8YJX6TBP08SKWWYGVH,A2Z596CRYWL6XC,374TNBHA8BVLC169428MP5W6OPCYQ6,Submitted,10,23.28,{'RU-551-SYSTEM-REFERENCE-BERT': -0.9987305264...
2,0,30F94FBDNRKZ4P7XLCJWYFD2K39TBR,A3U1K9X1LXA5JA,34S9DKFK73P094THOAG5LB11LXDNY0,Submitted,9,52.95,{'RU-551-SYSTEM-REFERENCE-BERT': -0.4227588500...
3,1,37OPIVELUU34G6BAF42BPLQ8QMPAHO,A1C7XI68SED8JE,3VHHR074H3HR6KUVPOAQV6DFAHO7LR,Submitted,10,41.52,{'RU-675-REFERENCE-SYSTEM-PG': -0.490055262093...
4,1,3QXFBUZ4ZKGAVJUX131K4MQ1IPPGUS,A1LUWOV426JV1N,3ZR9AIQJUB9HQ1NQYXQ0UA1HGLM04Y,Submitted,10,45.87,{'RU-616-SYSTEM-REFERENCE-BERT': -0.7471368199...
5,1,3QXFBUZ4ZKGAVJUX131K4MQ1IPPGUS,A2Z596CRYWL6XC,3X31TUMD7XMDVYFT08THUWM27YCL12,Submitted,10,62.18,{'RU-661-SYSTEM-REFERENCE-PG': -0.794415406157...
6,2,34R3P23QHS11NJEDG9Y37ANZI5DWH6,A1LUWOV426JV1N,3RU7GD8VPOT3G3TL5J187EIV292PS2,Submitted,10,44.73,{'RU-375-REFERENCE-SYSTEM-PG': -0.148259427955...
7,2,34R3P23QHS11NJEDG9Y37ANZI5DWH6,A2Z596CRYWL6XC,3AMW0RGHOD260Q1GXNKM8K8PF9HPNB,Submitted,10,33.18,{'RU-366-SYSTEM-REFERENCE-PG': -0.223484736467...
8,2,3WA2XVDZEMHR9U85ZQDTODR93SBE61,A11T4227GSO6EP,3OONKJ5DKCJMEH0IFX2YSP35P9WBO5,Approved,10,67.53,{'RU-438-REFERENCE-SYSTEM-BERT': -0.4195432414...
9,3,3MZ3TAMYTLNWBI8BNX6IKNGXRQDRI2,A11T4227GSO6EP,3RJSC4XJ10UZMKRQC29AZ27VLCK051,Submitted,10,55.82,{'RU-225-REFERENCE-SYSTEM-PG': 0.1539515758145...


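## Merge Human Annotations

Average the per-annotator scores of every annotated item and split the averages into focus and coverage tables.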

In [45]:
import pandas as pd
from ast import literal_eval
from scipy import stats
from scipy.stats import pearsonr
LANG = 'ES'

source = f'mturk/annotation_result/{LANG}/analysis_clean.csv'
df = pd.read_csv(source)

# task_id -> list of the 'annotation_res' cells of all rows for that task,
# in file order. Only these two columns are needed, so they are iterated
# directly; the builtin `id` is no longer rebound at notebook scope.
data = {}
for task_id, annotation in zip(df['task_id'], df['annotation_res']):
    data.setdefault(task_id, []).append(annotation)

In [46]:
# Number of annotations every task must have before they are averaged.
N_ANNOTATORS = 3

# List of (item id, mean score over the task's annotators).
merge = []
for task_id, serialized in data.items():
    # Fail with a clear message instead of an IndexError (too few) or
    # silently ignoring extra annotations (too many).
    assert len(serialized) == N_ANNOTATORS, (
        f'task {task_id}: expected {N_ANNOTATORS} annotations, found {len(serialized)}'
    )
    annotations = [literal_eval(cell) for cell in serialized]
    # Items are enumerated from the first annotation; the others must
    # contain the same item ids.
    for item_id in annotations[0]:
        overall = sum(annotation[item_id] for annotation in annotations) / len(annotations)
        merge.append((item_id, overall))
print(len(merge))

540


In [47]:
focus, coverage = [], []
# Item ids consist of five '-'-separated fields; the third and fourth decide
# which list the item goes to. Any other combination is skipped.
targets = {
    ('SYSTEM', 'REFERENCE'): coverage,
    ('REFERENCE', 'SYSTEM'): focus,
}
for ids, score in merge:
    data_name, data_id, first_role, second_role, model_name = ids.split('-')
    target = targets.get((first_role, second_role))
    if target is not None:
        target.append((data_id, data_name, model_name, score))
print('focus:', len(focus))
print('coverage:', len(coverage))

focus: 270
coverage: 270


In [48]:
# Build each table in one call from the collected tuples; DataFrame.append
# was removed in pandas 2.0 and copied the frame on every iteration.
# Each tuple is (data_id, data_name, model_name, score); data_name is dropped.
score_columns = ['id', 'model', 'score']
df_focus = pd.DataFrame(
    [(data_id, model_name, score) for data_id, _, model_name, score in focus],
    columns=score_columns,
)
df_coverage = pd.DataFrame(
    [(data_id, model_name, score) for data_id, _, model_name, score in coverage],
    columns=score_columns,
)

In [49]:
# Write both averaged score tables next to the per-language annotation results.
focus_csv = f'mturk/annotation_result/{LANG}/human_focus_final.csv'
coverage_csv = f'mturk/annotation_result/{LANG}/human_coverage_final.csv'
df_focus.to_csv(focus_csv, index=False)
df_coverage.to_csv(coverage_csv, index=False)