# Simple notebook that transforms the SNLI dev and test datasets to contain probability distributions
The probability distribution for each example is the count of each annotation type (entailment, neutral, or contradiction) divided by the total number of annotations for that example (i.e., 5).

In [1]:
from collections import Counter
import pandas as pd

### Helper functions

In [2]:
def gen_dist(x):
    """Return the annotator-label distribution for a single example.

    `x` is any object exposing an `annotator_labels` attribute (for
    instance a DataFrame row). The result is the list
    `[p_entailment, p_neutral, p_contradiction]`, where each entry is the
    number of times that label occurs in `annotator_labels` divided by the
    total number of labels in `annotator_labels`.
    """
    labels = x.annotator_labels
    tally = Counter(labels)
    total = len(labels)

    # Counter returns 0 for a label that never occurs, so a class nobody
    # picked contributes 0.0 to the distribution.
    distribution = []
    for class_name in ('entailment', 'neutral', 'contradiction'):
        distribution.append(tally[class_name] / total)
    return distribution


def gold_to_int(gold):
    """Map a gold-label value to its integer class id.

    Returns 0 for 'entailment', 1 for 'neutral', 2 for 'contradiction',
    and -1 for any other value.
    """
    class_names = ('entailment', 'neutral', 'contradiction')
    for class_id, class_name in enumerate(class_names):
        if gold == class_name:
            return class_id
    # Anything that is not one of the three classes.
    return -1

In [3]:
# Directory holding the original SNLI 1.0 JSONL files; the generated
# `*_probs.jsonl` files are written to the same directory.
ROOT_DATA_PATH = '../data/snli_1.0'

#### Prepare the `dev` dataset

In [4]:
# Read the original dev split (one JSON object per line).
dev_snli_df = pd.read_json(f'{ROOT_DATA_PATH}/snli_1.0_dev.jsonl', lines=True)

# Generate the probability distributions: gen_dist needs the whole row
# (it reads `annotator_labels`), so apply it row-wise. The function is
# passed directly; wrapping it in a lambda adds nothing.
dev_snli_df['label'] = dev_snli_df.apply(gen_dist, axis=1)

# Generate an integer gold label. Only the `gold_label` column is needed,
# so map over that column instead of building a Series for every row.
dev_snli_df['label_g'] = dev_snli_df['gold_label'].map(gold_to_int)

# Remove any completely ambiguous examples, i.e. rows whose gold label is
# not one of the three classes (gold_to_int returned -1).
dev_snli_df = dev_snli_df.query('label_g != -1')

# Drop the annotation, ID and parse columns.
dev_snli_df = dev_snli_df.drop(columns=['annotator_labels', 'captionID',
                                        'pairID', 'sentence1_binary_parse',
                                        'sentence1_parse', 'sentence2_binary_parse',
                                        'sentence2_parse'])

# Rename the premise and hypothesis columns.
dev_snli_df = dev_snli_df.rename(columns={'sentence1': 'premise', 'sentence2': 'hypothesis'})

# Write the new data as JSON lines. The encoding is given explicitly so the
# output does not depend on the platform's default text encoding.
with open(f'{ROOT_DATA_PATH}/snli_1.0_dev_probs.jsonl', 'w', encoding='utf-8') as f:
    f.write(dev_snli_df.to_json(orient='records', lines=True))

#### Prepare the `test` dataset

In [5]:
# Read the original test split (one JSON object per line).
test_snli_df = pd.read_json(f'{ROOT_DATA_PATH}/snli_1.0_test.jsonl', lines=True)

# Generate the probability distributions: gen_dist needs the whole row
# (it reads `annotator_labels`), so apply it row-wise. The function is
# passed directly; wrapping it in a lambda adds nothing.
test_snli_df['label'] = test_snli_df.apply(gen_dist, axis=1)

# Generate an integer gold label. Only the `gold_label` column is needed,
# so map over that column instead of building a Series for every row.
test_snli_df['label_g'] = test_snli_df['gold_label'].map(gold_to_int)

# Remove any completely ambiguous examples, i.e. rows whose gold label is
# not one of the three classes (gold_to_int returned -1).
test_snli_df = test_snli_df.query('label_g != -1')

# Drop the annotation, ID and parse columns.
test_snli_df = test_snli_df.drop(columns=['annotator_labels', 'captionID',
                                          'pairID', 'sentence1_binary_parse',
                                          'sentence1_parse', 'sentence2_binary_parse',
                                          'sentence2_parse'])

# Rename the premise and hypothesis columns.
test_snli_df = test_snli_df.rename(columns={'sentence1': 'premise', 'sentence2': 'hypothesis'})

# Write the new data as JSON lines. The encoding is given explicitly so the
# output does not depend on the platform's default text encoding.
with open(f'{ROOT_DATA_PATH}/snli_1.0_test_probs.jsonl', 'w', encoding='utf-8') as f:
    f.write(test_snli_df.to_json(orient='records', lines=True))