# Inspecting the e-SNLI Dataset

In [40]:
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

## Original e-SNLI Data 

In [177]:
# load e-SNLI with Hugging Face `datasets`; the result is a DatasetDict with one entry per split
dataset = load_dataset("esnli")

Reusing dataset esnli (/Users/ggbetz/.cache/huggingface/datasets/esnli/plain_text/0.0.2/a160e6a02bbb8d828c738918dafec4e7d298782c334b5109af632fec6d779bbc)
100%|██████████| 3/3 [00:00<00:00, 442.61it/s]


In [184]:
# overview of the available splits, their features and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'explanation_1', 'explanation_2', 'explanation_3'],
        num_rows: 549367
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label', 'explanation_1', 'explanation_2', 'explanation_3'],
        num_rows: 9842
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label', 'explanation_1', 'explanation_2', 'explanation_3'],
        num_rows: 9824
    })
})

In [185]:
# the inspection below works on the train split, converted to a pandas DataFrame
df_esnli = dataset["train"].to_pandas()

In [186]:
# duplicates? -> yes
# `counts` holds, for every distinct full row, how often that row occurs;
# the second value_counts() summarises how many rows occur once, twice, ...
counts = df_esnli.value_counts()
counts.value_counts()

1    549085
2       141
dtype: int64

In [187]:
# remove rows that are exact duplicates in all columns (one copy of each is kept)
df_esnli = df_esnli.drop_duplicates()

In [188]:
# does every record have an explanation? -> no
# count the empty explanation fields per record, then subtract from the 3 available fields
n_empty = df_esnli[['explanation_1','explanation_2','explanation_3']].eq("").sum(axis=1)
df_esnli["n_explanations"] = 3 - n_empty

In [189]:
# how many records have 0, 1, 2 or 3 non-empty explanations?
df_esnli.n_explanations.value_counts()

1    549201
0        25
Name: n_explanations, dtype: int64

In [190]:
# keep only the records that come with at least one explanation
df_esnli = df_esnli[df_esnli["n_explanations"] > 0]

In [191]:
# number of records per premise (default = three times),
# followed by a summary of how common each frequency is
counts = df_esnli.groupby("premise").size()
counts.value_counts()

3     141405
15      7827
2        771
6        473
14        84
12        45
5         45
9         27
18        25
4          9
30         9
21         4
27         3
1          2
13         2
33         1
24         1
11         1
dtype: int64

In [192]:
# attach to every record the number of records that share its premise
df_esnli["premise_counts"] = df_esnli["premise"].progress_apply(lambda premise: counts.loc[premise])

100%|██████████| 549201/549201 [00:01<00:00, 287217.11it/s]


In [193]:
# count how often each (premise, hypothesis, label) combination occurs in the dataset;
# exact duplicates were dropped above, so combinations that occur more than once
# are records that differ only in their explanations
df_esnli.value_counts(subset=["premise","hypothesis","label"]).value_counts()

1    548394
2       396
3         5
dtype: int64

In [194]:
# drop records whose premise occurs less than 3 times
df_esnli = df_esnli[df_esnli["premise_counts"] >= 3]

### Process records whose premise occurs more than 3 times

In [212]:
# get all rows whose premise occurs more than 3 times;
# reset_index() returns a new frame and keeps the previous row labels in an "index" column
df_esnli_tmp = df_esnli[df_esnli["premise_counts"] > 3].reset_index()
len(df_esnli_tmp)

123442

In [214]:
# number of records per premise (rows) and label (columns);
# premise/label combinations that do not occur are counted as 0
df2 = df_esnli_tmp.groupby(["premise","label"]).size().unstack().fillna(0)
df2

label,0,1,2
premise,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"2 Men in a classroom, one standing and one sitting while participating in a discussion.",5.0,5.0,5.0
2 Older women and a young girl with a red bike.,5.0,5.0,5.0
2 artists creating a new piece on a table,5.0,5.0,5.0
2 boys in the foreground in a karate competition and coaches in background looking on with another coach sitting at table.,5.0,5.0,5.0
2 female babies eating chips.,5.0,5.0,5.0
...,...,...,...
two young women are both looking at their cellphones,5.0,5.0,5.0
"woman in helmet showing three small children something in her hands, standing in front of a stone wall.",5.0,5.0,5.0
women (wearing scarf) driving motorized scooter and man (in the backseat) riding in the busy streets.,5.0,5.0,5.0
women in a black shirt standing with a bottle of water in hand in front of wall with Greek writing on it.,5.0,5.0,5.0


In [215]:
# number of premises whose per-label record counts are not all equal
df2.nunique(axis=1).gt(1).sum()

186

In [216]:
# df2.min(axis=1) is the smallest per-label record count of each premise, i.e. the number
# of records per label that can be kept for that premise in the preprocessed esnli dataset
df2.min(axis=1)

premise
2 Men in a classroom, one standing and one sitting while participating in a discussion.                                       5.0
2 Older women and a young girl with a red bike.                                                                               5.0
2 artists creating a new piece on a table                                                                                     5.0
2 boys in the foreground in a karate competition and coaches in background looking on with another coach sitting at table.    5.0
2 female babies eating chips.                                                                                                 5.0
                                                                                                                             ... 
two young women are both looking at their cellphones                                                                          5.0
woman in helmet showing three small children something in her hands, standing in f

In [217]:
# sanity check: df2 has exactly one row per distinct premise
assert len(df2)==len(set(df_esnli_tmp.premise))

In [219]:
# Smallest per-label record count of every premise. It is computed once up front:
# evaluating df2.min(axis=1) inside the lambda would redo the full row-wise
# reduction for every single record.
premise_min_counts = df2.min(axis=1).astype("int64")
# attach that per-premise minimum to each record
df_esnli_tmp["min_label_counts"] = df_esnli_tmp.premise.progress_apply(lambda x: premise_min_counts[x])

100%|██████████| 123442/123442 [00:14<00:00, 8742.75it/s]


In [220]:
# peek at the first records, now including the min_label_counts column
df_esnli_tmp.head()

Unnamed: 0,index,premise,hypothesis,label,explanation_1,explanation_2,explanation_3,n_explanations,premise_counts,min_label_counts
0,57,Woman in white in foreground and a man slightl...,The man with the sign is caucasian.,1,Not all men are Caucasian.,,,1,15,5
1,58,Woman in white in foreground and a man slightl...,They are protesting outside the capital.,2,One cannot be walking by a sign for John's Piz...,,,1,15,5
2,59,Woman in white in foreground and a man slightl...,A woman in white.,0,Woman in white in foreground implies that woma...,,,1,15,5
3,60,Woman in white in foreground and a man slightl...,A man is advertising for a restaurant.,0,walking with a sign for John's Pizza and Gyro ...,,,1,15,5
4,61,Woman in white in foreground and a man slightl...,The woman is wearing black.,2,The woman is in either white or black.,,,1,15,5


In [221]:
# number of records whose premise has fewer than 3 * min_label_counts records
df_esnli_tmp.premise_counts.lt(3*df_esnli_tmp.min_label_counts).sum()

0

In [222]:
# list the records that violate premise_counts >= 3 * min_label_counts, if there are any
df_esnli_tmp[df_esnli_tmp.premise_counts.lt(3*df_esnli_tmp.min_label_counts)]

Unnamed: 0,index,premise,hypothesis,label,explanation_1,explanation_2,explanation_3,n_explanations,premise_counts,min_label_counts


In [223]:
# consistency check: count premises whose stored premise_counts differs from the actual group size
premise_groups = df_esnli_tmp.groupby(["premise"])
sum(premise_groups.size() != premise_groups.first()["premise_counts"])

0

In [224]:
# make sure that for each premise, we have the same number of records for labels 0,1,2:
# every (premise, label) group is cut down to its first min_label_counts rows
df_esnli_tmp = (
    df_esnli_tmp
    .groupby(["premise","label"],as_index=False)
    .progress_apply(lambda group: group.iloc[:group["min_label_counts"].iloc[0]])
)
df_esnli_tmp

100%|██████████| 25667/25667 [00:12<00:00, 2113.46it/s]


Unnamed: 0,Unnamed: 1,index,premise,hypothesis,label,explanation_1,explanation_2,explanation_3,n_explanations,premise_counts,min_label_counts
0,113486,503395,"2 Men in a classroom, one standing and one sit...",There are two men in a classroom talking.,0,The 2 men are participating in a discussion in...,,,1,15,5
0,113492,503401,"2 Men in a classroom, one standing and one sit...",Two men are participating in class.,0,The two men are participating in a discussion ...,,,1,15,5
0,113494,503403,"2 Men in a classroom, one standing and one sit...",2 men are sitting and standing in a classroom ...,0,"""2 men sitting and standing and taking part in...",,,1,15,5
0,113496,503405,"2 Men in a classroom, one standing and one sit...",Two people discuss something as one stands and...,0,"""Two people discuss something as one stands an...",,,1,15,5
0,113497,503406,"2 Men in a classroom, one standing and one sit...",Two men are in a classroom participating in a ...,0,"""Two men are in a classroom participating in a...",,,1,15,5
...,...,...,...,...,...,...,...,...,...,...,...
25666,122653,546380,"young adults posing for a photo at night, some...",A group of people are having a sleepover.,2,Posing for a photo at night implies being outs...,,,1,15,5
25666,122655,546382,"young adults posing for a photo at night, some...",Elderly people pose for a photo.,2,Young adults are not elderly adults.,,,1,15,5
25666,122657,546384,"young adults posing for a photo at night, some...",The beach is aflame with exploding missiles.,2,Adults posing doesn't suggest that a beach is ...,,,1,15,5
25666,122660,546387,"young adults posing for a photo at night, some...",They are warm,2,It is chilly so the adults are not likely warm.,,,1,15,5


In [225]:
def reorder_premise_group(pg):
    """Interleave the rows of a frame by label.

    Within every label the rows are renumbered 0, 1, 2, ... in their existing
    order. The result is then sorted by that running number first and by label
    second, so the rows come out as (label a, 0), (label b, 0), ...,
    (label a, 1), (label b, 1), ...

    Args:
        pg: DataFrame with a "label" column.

    Returns:
        DataFrame indexed by a (label, running number) MultiIndex, ordered as
        described above.
    """
    def renumber(label_rows):
        # discard the incoming row labels and number the rows of one label from 0
        return label_rows.reset_index(drop=True)

    renumbered = pg.groupby("label").apply(renumber)
    # level 1 is the running number; remaining levels (label) break ties
    return renumbered.sort_index(level=1)
# apply the reordering separately to the records of each premise
df_esnli_tmp = df_esnli_tmp.groupby(["premise"],as_index=False).progress_apply(reorder_premise_group)
df_esnli_tmp

100%|██████████| 8555/8555 [00:23<00:00, 365.61it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,index,premise,hypothesis,label,explanation_1,explanation_2,explanation_3,n_explanations,premise_counts,min_label_counts
Unnamed: 0_level_1,label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0,0,503395,"2 Men in a classroom, one standing and one sit...",There are two men in a classroom talking.,0,The 2 men are participating in a discussion in...,,,1,15,5
0,1,0,503396,"2 Men in a classroom, one standing and one sit...",Two men are sitting in a class room next to ea...,1,They can be in a room together and not next to...,,,1,15,5
0,2,0,503397,"2 Men in a classroom, one standing and one sit...",Two men are attacking each other in a classroom.,2,Men attacking each other are not participating...,,,1,15,5
0,0,1,503401,"2 Men in a classroom, one standing and one sit...",Two men are participating in class.,0,The two men are participating in a discussion ...,,,1,15,5
0,1,1,503398,"2 Men in a classroom, one standing and one sit...","There are two men in a classroom, and one of t...",1,Nothing indicates that there is an oak roll to...,,,1,15,5
...,...,...,...,...,...,...,...,...,...,...,...,...
8554,1,3,546391,"young adults posing for a photo at night, some...",People smile for the camera.,1,We don't know that people smile.,,,1,15,5
8554,2,3,546387,"young adults posing for a photo at night, some...",They are warm,2,It is chilly so the adults are not likely warm.,,,1,15,5
8554,0,4,546388,"young adults posing for a photo at night, some...",People are taking a photo outside in the cold.,0,In a both sentence adults posing a photo outs...,,,1,15,5
8554,1,4,546392,"young adults posing for a photo at night, some...",The young adults are outside,1,The young adults are posing for a photo and it...,,,1,15,5


In [226]:
# distribution of min_label_counts over the remaining records
df_esnli_tmp.min_label_counts.value_counts()

5     117420
2       2658
4       1500
6        432
3        270
10       270
1        261
7         84
9         81
11        33
8         24
Name: min_label_counts, dtype: int64

In [227]:
# equal number of labels? -> yes
df_esnli_tmp.label.value_counts()

0    41011
1    41011
2    41011
Name: label, dtype: int64

### Process records whose premise occurs exactly 3 times

In [228]:
# records whose premise occurs exactly 3 times, as an independent copy
df_esnli_tmp2 = df_esnli[df_esnli["premise_counts"] == 3].copy()
df_esnli_tmp2.head()

Unnamed: 0,premise,hypothesis,label,explanation_1,explanation_2,explanation_3,n_explanations,premise_counts
0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,1,the person is not necessarily training his horse,,,1,3
1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",2,One cannot be on a jumping horse cannot be a d...,,,1,3
2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",0,a broken down airplane is outdoors,,,1,3
3,Children smiling and waving at camera,They are smiling at their parents,1,Just because they are smiling and waving at a ...,,,1,3
4,Children smiling and waving at camera,There are children present,0,The children must be present to see them smili...,,,1,3


In [229]:
# equal number of labels? -> no
df_esnli_tmp2.label.value_counts()

0    141668
2    141367
1    141180
Name: label, dtype: int64

In [230]:
# determine premises with incomplete labels (at least one label is missing):
# a premise is complete when its records carry 3 distinct labels
labels_complete = df_esnli_tmp2.groupby(["premise"]).progress_apply(
    lambda group: len(set(group["label"])) == 3
)
# copy the per-premise flag onto every record of that premise
df_esnli_tmp2["complete"] = df_esnli_tmp2["premise"].progress_apply(
    lambda premise: labels_complete.loc[premise]
)

100%|██████████| 141405/141405 [00:04<00:00, 31430.55it/s]
100%|██████████| 424215/424215 [00:01<00:00, 290877.37it/s]


In [231]:
# keep only the records of premises that have all three labels
df_esnli_tmp2 = df_esnli_tmp2[df_esnli_tmp2["complete"]]

In [232]:
# show the remaining records of premises that occur exactly 3 times
df_esnli_tmp2

Unnamed: 0,premise,hypothesis,label,explanation_1,explanation_2,explanation_3,n_explanations,premise_counts,complete
0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,1,the person is not necessarily training his horse,,,1,3,True
1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",2,One cannot be on a jumping horse cannot be a d...,,,1,3,True
2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",0,a broken down airplane is outdoors,,,1,3,True
3,Children smiling and waving at camera,They are smiling at their parents,1,Just because they are smiling and waving at a ...,,,1,3,True
4,Children smiling and waving at camera,There are children present,0,The children must be present to see them smili...,,,1,3,True
...,...,...,...,...,...,...,...,...,...
549362,Four dirty and barefooted children.,four kids won awards for 'cleanest feet',2,in a both sentence dirty and cleanest are not ...,,,1,3,True
549363,Four dirty and barefooted children.,"four homeless children had their shoes stolen,...",1,the children are not necessarily homeless,,,1,3,True
549364,A man is surfing in a bodysuit in beautiful bl...,A man in a bodysuit is competing in a surfing ...,1,the man is not necessarily competing,,,1,3,True
549365,A man is surfing in a bodysuit in beautiful bl...,A man in a business suit is heading to a board...,2,That is either a business suit or bodysuit.,,,1,3,True


### Merge

In [233]:
# stack both processed parts, restricted to the original e-SNLI columns,
# and renumber the rows consecutively from 0
columns = ['premise', 'hypothesis', 'label', 'explanation_1', 'explanation_2',
       'explanation_3']
df_esnli_final = pd.concat(
    [df_esnli_tmp2[columns], df_esnli_tmp[columns]]
).reset_index(drop=True)

In [234]:
# show the merged frame
df_esnli_final

Unnamed: 0,premise,hypothesis,label,explanation_1,explanation_2,explanation_3
0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,1,the person is not necessarily training his horse,,
1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",2,One cannot be on a jumping horse cannot be a d...,,
2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",0,a broken down airplane is outdoors,,
3,Children smiling and waving at camera,They are smiling at their parents,1,Just because they are smiling and waving at a ...,,
4,Children smiling and waving at camera,There are children present,0,The children must be present to see them smili...,,
...,...,...,...,...,...,...
539794,"young adults posing for a photo at night, some...",People smile for the camera.,1,We don't know that people smile.,,
539795,"young adults posing for a photo at night, some...",They are warm,2,It is chilly so the adults are not likely warm.,,
539796,"young adults posing for a photo at night, some...",People are taking a photo outside in the cold.,0,In a both sentence adults posing a photo outs...,,
539797,"young adults posing for a photo at night, some...",The young adults are outside,1,The young adults are posing for a photo and it...,,


### Sanity Checks

In [235]:
# equal number of labels? -> yes
df_esnli_final.label.value_counts()

1    179933
2    179933
0    179933
Name: label, dtype: int64

In [236]:
# every consecutive block of three rows must share a single premise
# and contain three distinct labels
n_rows = len(df_esnli_final)
for offset in tqdm(range(0, n_rows, 3)):
    block = df_esnli_final.iloc[offset:offset + 3]
    assert len(set(block["premise"])) == 1
    assert len(set(block["label"])) == 3

100%|██████████| 179933/179933 [00:19<00:00, 9327.99it/s]


## Checking the transformed DeepA2-ESNLI dataset

In [41]:
from deepa2datasets.config import data_dir
from datasets import Dataset

In [47]:
# absolute location of the processed train split below data_dir; the last line shows whether it exists
path = (data_dir / "processed" / "esnli" / "train" / "train.parquet").resolve()
path.exists()

True

In [49]:
# read the parquet file into a Hugging Face Dataset (from_parquet is given the path as a string)
da2_dataset = Dataset.from_parquet(str(path))

Using custom data configuration default-3f29edf1a53ed96a


Downloading and preparing dataset parquet/default to /Users/ggbetz/.cache/huggingface/datasets/parquet/default-3f29edf1a53ed96a/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121...


100%|██████████| 1/1 [00:00<00:00, 6132.02it/s]
100%|██████████| 1/1 [00:00<00:00, 569.80it/s]


Dataset parquet downloaded and prepared to /Users/ggbetz/.cache/huggingface/datasets/parquet/default-3f29edf1a53ed96a/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121. Subsequent calls will reuse this data.


In [50]:
# features and number of rows of the transformed dataset
da2_dataset

Dataset({
    features: ['argdown_reconstruction', 'argument_source', 'conclusion', 'conclusion_formalized', 'conclusion_statements', 'context', 'distractors', 'entity_placeholders', 'erroneous_argdown', 'gist', 'intermediary_conclusion', 'intermediary_conclusion_formalized', 'metadata', 'misc_placeholders', 'predicate_placeholders', 'premises', 'premises_formalized', 'reason_statements', 'source_paraphrase', 'title'],
    num_rows: 31200
})

In [51]:
# inspect one example record (row 50) of the transformed dataset
da2_dataset[50]

{'argdown_reconstruction': '(1) # 6 tries her best to help her team to victory.\n(2) if a sports arena is empty then it is wrong that # 6 tries her best to help her team to victory.\n--\nwith modus tollens from (1) (2)\n--\n(3) it is wrong that a sports arena is empty.',
 'argument_source': 'The team cannot be helped to victory if the arena is empty. if she is helping her team then it is inferred she is playing a team sport it is wrong that a sports arena is empty. # 6 tries her best to help her team to victory.',
 'conclusion': [{'explicit': True,
   'ref_reco': 3,
   'text': 'it is wrong that a sports arena is empty.'}],
 'conclusion_formalized': [{'form': '¬{q}', 'ref_reco': 3}],
 'conclusion_statements': [{'ref_reco': 3,
   'starts_at': 135,
   'text': 'it is wrong that a sports arena is empty.'}],
 'context': None,
 'distractors': ['if she is helping her team then it is inferred she is playing a team sport'],
 'entity_placeholders': None,
 'erroneous_argdown': '(1) # 6 tries her b