In [1]:
import pandas as pd
import numpy as np

from glob import glob

In [2]:
# Directory containing the `*_anon.csv` batch-result files that the cells below load.
results_path = './Results_Anon'

In [3]:
# Load a single batch-result file so its schema can be inspected; the bare
# last expression makes the notebook display the column index.
example = pd.read_csv(results_path + '/rexrex_Batch_3501801_batch_results_anon.csv')
example.columns

Index([u'Unnamed: 0', u'HITId', u'HITTypeId', u'Title', u'Description',
       u'Keywords', u'Reward', u'CreationTime', u'MaxAssignments',
       u'RequesterAnnotation', u'AssignmentDurationInSeconds',
       u'AutoApprovalDelayInSeconds', u'Expiration', u'NumberOfSimilarHITs',
       u'LifetimeInSeconds', u'AssignmentId', u'WorkerId', u'AssignmentStatus',
       u'AcceptTime', u'SubmitTime', u'AutoApprovalTime', u'ApprovalTime',
       u'RejectionTime', u'RequesterFeedback', u'WorkTimeInSeconds',
       u'LifetimeApprovalRate', u'Last30DaysApprovalRate',
       u'Last7DaysApprovalRate', u'Input.premise', u'Input.hypothesis',
       u'Input.label', u'Input.prompt', u'Answer.semantic-similarity.label',
       u'Approve', u'Reject'],
      dtype='object')

In [4]:
# Display the prompt text of the row labelled 0 together with the distinct raw
# answer strings, to see which label values need to be parsed.
example['Input.prompt'][0], example['Answer.semantic-similarity.label'].unique()

('Assume Text 1 is meant to refer to something (which could be a person, an animal, or a thing) in a picture. Could Text 2 refer to the same thing? (That is, could Text 2 be a clarification of Text 1, in some situation?)',
 array(['1 - Not At All', '3', '4 - Yes, absolutely', '2'], dtype=object))

In [5]:
def prep_tuple_list(df, group_by='HITId', summariser='avg', random_state=None):
    """Summarise the annotations of each group into one tuple per group.

    Side effect: writes an integer ``'anno_label'`` column into ``df`` in
    place, parsed from the first character of
    ``'Answer.semantic-similarity.label'`` (e.g. ``'4 - Yes, absolutely'``
    becomes ``4``). The reporting cell further down reads that column from
    the same frame, so the mutation is kept deliberately.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Answer.semantic-similarity.label'``,
        ``'Input.hypothesis'``, ``'Input.premise'``, ``'Input.label'`` and
        the column(s) named by ``group_by``.
    group_by : str
        Column to group rows by.
    summariser : {'avg', 'majority', 'rand'}
        How the group's labels are collapsed into one value: the mean, the
        most frequent label (ties go to the smallest label, as
        ``np.bincount(...).argmax()`` returns the first maximum), or one
        randomly sampled label.
    random_state : int, numpy.random.RandomState or None
        Only used by ``summariser='rand'``. ``None`` (the default) draws from
        numpy's global random state; an int seeds one generator that is
        shared across all groups, which makes the result reproducible.

    Returns
    -------
    list of tuple
        ``(group, premise, hypothesis, true_label, summary_label, std_label,
        n_rows)`` per group. Premise, hypothesis and true label are taken
        from the first row of the group.

    Raises
    ------
    ValueError
        If ``summariser`` is not one of the supported names.
    """
    if summariser not in ('avg', 'majority', 'rand'):
        # Fail early with a clear message instead of an UnboundLocalError
        # when the result tuple is assembled.
        raise ValueError(
            "unknown summariser {!r}; expected 'avg', 'majority' or 'rand'".format(summariser))

    # One generator for the whole call, so different groups do not all
    # receive the very same seeded draw.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    # str() keeps this working when pandas has parsed an all-numeric label
    # column as integers rather than strings.
    df['anno_label'] = df['Answer.semantic-similarity.label'].apply(lambda x: int(str(x)[0]))

    tuples = []
    for group, this_slice in df.groupby(group_by):
        labels = this_slice['anno_label']
        if summariser == 'avg':
            avg_label = labels.mean()
        elif summariser == 'majority':
            avg_label = np.bincount(labels.values).argmax()
        else:  # 'rand'
            avg_label = labels.sample(random_state=rng).values[0]
        std_label = labels.std()
        n_rows = len(this_slice)
        hyp = this_slice['Input.hypothesis'].tolist()[0]
        prem = this_slice['Input.premise'].tolist()[0]
        true_label = this_slice['Input.label'].tolist()[0]
        tuples.append((group, prem, hyp, true_label, avg_label, std_label, n_rows))
    return tuples

def score_res(df, summariser='avg', break_point=3):
    """Return the fraction of groups whose summarised annotation matches the true label.

    The per-group tuples come from ``prep_tuple_list(df, summariser=summariser)``.
    A group counts as correct when its summary label is at or above
    ``break_point`` and its true label is 1, or when the summary is below
    ``break_point`` and its true label is 0. Any other combination counts as
    incorrect.
    """
    rows = prep_tuple_list(df, summariser=summariser)
    n_correct = 0
    for row in rows:
        gold, summary = row[3], row[4]
        if summary >= break_point and gold == 1:
            n_correct += 1
        elif summary < break_point and gold == 0:
            n_correct += 1
    return n_correct / len(rows)

In [6]:
# Threshold on the 1-4 answer scale: a label at or above it is read as "yes"
# (expected true label 1), below it as "no" (expected true label 0).
break_point = 3

# sorted() makes the report order reproducible; glob() alone returns the
# matching paths in arbitrary order.
for respath in sorted(glob(results_path + '/*csv')):
    print('-' * 60)
    print(respath)
    df = pd.read_csv(respath)
    print()
    print(df['Input.prompt'][0])
    print(df['Answer.semantic-similarity.label'].unique())
    print('')
    # NOTE: score_res() (through prep_tuple_list) adds the integer
    # 'anno_label' column to df in place; the statistics below rely on it.
    print('    avg: {:.2}'.format(score_res(df, break_point=break_point)))
    print('    maj: {:.2}'.format(score_res(df, summariser='majority', break_point=break_point)))
    print('    rnd: {:.2}'.format(score_res(df, summariser='rand', break_point=break_point)))

    is_neg = df['Input.label'] == 0
    is_pos = df['Input.label'] == 1
    print('Avg 0: {:.2}  Avg 1: {:.2}'.format(df[is_neg]['anno_label'].mean(),
                                                        df[is_pos]['anno_label'].mean()))
    # Per-annotation accuracy: every single answer is scored on its own,
    # without summarising per group first.
    correctdf = df[((df['anno_label'] >= break_point) & is_pos) |
                   ((df['anno_label'] < break_point) & is_neg)]
    print('    raw: {:.2}'.format(len(correctdf) / len(df)))
    # WorkerId is untouched by the scoring above, so the frame already in
    # memory can be reused instead of reading the CSV a second time.
    print('# diff annotators', len(df['WorkerId'].unique()))

------------------------------------------------------------
./Results_Anon/capcap_Batch_3501811_batch_results_anon.csv

Is Text 2 likely to be describing the same situation as Text 1?
['3' '2' '4 - Yes, absolutely' '1 - Not At All']

    avg: 0.63
    maj: 0.6
    rnd: 0.63
Avg 0: 2.5  Avg 1: 2.9
    raw: 0.59
# diff annotators 41
------------------------------------------------------------
./Results_Anon/capdeep_Batch_3501826_batch_results_anon.csv

Is Text 2 likely to be a longer description of the situation described by Text 1?
['4 - Yes, absolutely' '3' '1 - Not At All' '2']

    avg: 0.6
    maj: 0.48
    rnd: 0.52
Avg 0: 2.7  Avg 1: 2.9
    raw: 0.54
# diff annotators 33
------------------------------------------------------------
./Results_Anon/capobj_Batch_3501814_batch_results_anon.csv

Using what you know about the world, in the situation described by Text 1, is Text 2 likely to be true?
['4 - Yes, absolutely' '1 - Not At All' '2' '3']

    avg: 0.58
    maj: 0.55
    rnd: 0