In [30]:
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pandas as pd
import numpy as np
import glob

In [31]:
# Read the tab-separated file into a DataFrame and preview its first rows.
df = pd.read_csv("merged.tsv", delimiter="\t")
df.head()

Unnamed: 0.1,Unnamed: 0,premise,hypothesis,labels,preds,label,case
0,8,"He made no remark , but the matter remained in...","the matter remained in his thoughts , for he s...",1,1,entailment,c: a S clause
1,12,"He made no remark , but the matter remained in...",he stood in front of the fire afterwards with ...,1,1,entailment,c: a S clause
2,20,No woman would ever send a reply-paid telegram .,No woman would ever send a reply paid telegram .,1,1,entailment,c: a S clause
3,56,"Well , sir , it did not appear to be a matter...",you have heard the facts,1,1,entailment,c: a S clause
4,64,"Well , sir , it did not appear to be a matter...","Well , sir , it did not appear to be a matter ...",1,1,entailment,c: a S clause


### 'but'

Does the presence of 'but' throw off the model? *It isn't particularly over-represented in the premises, but it is in the hypotheses. If the sentence has 'but' in both the premise and the hypothesis, it's likely to be misclassified. This might be worth exploring further.*

In [32]:
# Maps the integer codes used in the `labels` / `preds` columns to readable names.
num_to_label = {1: "entailment", 2: "contradiction"}

for label in [1, 2]:
    for pred in [1, 2]:
        # Build one combined mask from `df` itself. Indexing an already-filtered
        # frame with a mask computed on the full frame makes pandas reindex the
        # mask and emit a "Boolean Series key will be reindexed" UserWarning.
        df_label_pred = df[(df.labels == label) & (df.preds == pred)]

        # Literal substring test (same semantics as the `in` operator), so 'but'
        # also matches inside longer words. Missing values count as "no match".
        premise_has_but = int(
            df_label_pred.premise.str.contains('but', regex=False, na=False).sum())
        hypothesis_has_but = int(
            df_label_pred.hypothesis.str.contains('but', regex=False, na=False).sum())

        print("{}, {}: {} in premise, {} in hypothesis, {} total".format(
            num_to_label[label], num_to_label[pred], premise_has_but, hypothesis_has_but, len(df_label_pred)))

  


entailment, entailment: 182 in premise, 9 in hypothesis, 437 total
entailment, contradiction: 12 in premise, 0 in hypothesis, 29 total
contradiction, entailment: 84 in premise, 54 in hypothesis, 339 total
contradiction, contradiction: 174 in premise, 48 in hypothesis, 980 total


### Length

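Does something about the length of the sentences matter? *It does seem so, but the direction depends on the gold label: the contradiction pairs the model gets wrong are noticeably longer (in characters) in both the premise and the hypothesis, whereas the few misclassified entailment pairs are actually shorter than the correctly classified ones.*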

In [33]:
# Mean premise / hypothesis length (in characters) for every (gold label, prediction) pair.
for label in [1, 2]:
    for pred in [1, 2]:
        # Build one combined mask from `df` itself. Indexing an already-filtered
        # frame with a mask computed on the full frame makes pandas reindex the
        # mask and emit a "Boolean Series key will be reindexed" UserWarning.
        df_label_pred = df[(df.labels == label) & (df.preds == pred)]

        # .mean() yields NaN for an empty group, where the manual
        # sum / len(...) would raise ZeroDivisionError.
        premise_mean_length = float(df_label_pred.premise.str.len().mean())
        hypothesis_mean_length = float(df_label_pred.hypothesis.str.len().mean())

        print("{}, {}: {} in premise, {} in hypothesis".format(
            num_to_label[label], num_to_label[pred], premise_mean_length, hypothesis_mean_length))

  after removing the cwd from sys.path.


entailment, entailment: 187.38443935926773 in premise, 49.31578947368421 in hypothesis
entailment, contradiction: 121.03448275862068 in premise, 30.79310344827586 in hypothesis
contradiction, entailment: 203.83480825958702 in premise, 89.84070796460178 in hypothesis
contradiction, contradiction: 145.98673469387754 in premise, 59.38367346938775 in hypothesis


### For each cue word
As a sanity check, we'll check the error rate for each cue word. *It seems that 'nor' often leads to incorrect predictions, though this might be because of badly formed sentences.*

In [40]:
# Error rate per negation cue word, split by gold label.
for cue_word in ['no ', 'not ', 'never ', 'nor ']:
    print("\n{}:".format(cue_word))
    # Literal substring match (regex=False): the trailing space is part of the
    # pattern, but the cue can still match the end of a longer word.
    # na=False keeps the mask strictly boolean if a premise is missing.
    df_filtered = df[df.premise.str.contains(cue_word, regex=False, na=False)]

    for label in [1, 2]:
        incorrect_label = 2 if label == 1 else 1

        # Masks are computed on the frame being indexed; a mask built from the
        # unfiltered `df` triggers pandas' "Boolean Series key will be
        # reindexed" UserWarning.
        df_label = df_filtered[df_filtered.labels == label]
        total = len(df_label)
        num_incorrect = len(df_label[df_label.preds == incorrect_label])
        print("{}: {} / {} incorrect".format(
            num_to_label[label], num_incorrect, total))


no :
entailment: 10 / 141 incorrect
contradiction: 94 / 358 incorrect

not :
entailment: 18 / 322 incorrect
contradiction: 241 / 921 incorrect

never :
entailment: 2 / 45 incorrect
contradiction: 38 / 128 incorrect

nor :
entailment: 2 / 15 incorrect
contradiction: 19 / 31 incorrect


  
  # Remove the CWD from sys.path while we load stuff.


### For different numbers of cue words
I'm wondering if more cue words means more errors. *Nothing very striking is popping out. It doesn't look like 2 cue words is much more likely to lead to mis-classification than 1.*

In [42]:
def how_many_cue_words(premise):
    """Return the total number of cue-word occurrences in ``premise``.

    Each cue word is counted with ``str.count`` (non-overlapping literal
    substring matches) and the four counts are added together. The trailing
    space is part of every pattern.
    """
    return sum(premise.count(cue) for cue in ('no ', 'not ', 'never ', 'nor '))

# Count cue words once per premise; the row-wise df.apply(..., axis=1) was
# loop-invariant and does not need to be recomputed on every iteration.
cue_word_counts = df.premise.map(how_many_cue_words)

# Only counts 0-4 are reported; premises with 5 or more cue words are skipped.
for num_cue_words in range(5):
    print("\n{} cue words:".format(num_cue_words))

    df_filtered = df[cue_word_counts == num_cue_words]

    for label in [1, 2]:
        incorrect_label = 2 if label == 1 else 1

        # Masks are computed on the frame being indexed; a mask built from the
        # unfiltered `df` triggers pandas' "Boolean Series key will be
        # reindexed" UserWarning.
        df_label = df_filtered[df_filtered.labels == label]
        total = len(df_label)
        num_incorrect = len(df_label[df_label.preds == incorrect_label])
        print("{}: {} / {} incorrect".format(
            num_to_label[label], num_incorrect, total))


0 cue words:
entailment: 1 / 7 incorrect
contradiction: 9 / 16 incorrect

1 cue words:
entailment: 17 / 317 incorrect
contradiction: 244 / 996 incorrect

2 cue words:
entailment: 7 / 107 incorrect
contradiction: 64 / 242 incorrect

3 cue words:
entailment: 3 / 11 incorrect
contradiction: 6 / 19 incorrect

4 cue words:
entailment: 1 / 13 incorrect
contradiction: 11 / 29 incorrect


  from ipykernel import kernelapp as app
