In [1]:
import pandas as pd
import json
import models
import torch 
import config
import mutils

import checklist
from checklist.editor import Editor
import numpy as np
import copy

device = "cuda" if torch.cuda.is_available() else "cpu"
seed = 42

## Performance scores

In [2]:
with open('../eval_results/final_model_results.json') as json_file:
        nli_dict = json.load(json_file)

In [3]:
results_table = pd.DataFrame.from_dict(nli_dict, orient='index')
results_table = results_table[['dev_acc', 'test_acc', 'micro', 'macro']]

Performance scores similar to Table 3 in the paper:

In [4]:
results_table

Unnamed: 0,dev_acc,test_acc,micro,macro
base,65.57,65.25,79.16,77.75
lstm,79.45,78.81,76.98,76.25
bilstm,78.51,78.55,79.63,78.88
bilstmpool,82.98,82.79,81.37,80.5


The performance scores for the LSTM and the BiLSTM with max pooling correspond to the results of Conneau et al. (2017) within a 3% margin for the NLI task and a 6% margin for SentEval. Also corresponding to Conneau et al. the BiLSTM max pooling model performs the best, both for NLI and for SentEval. Interestingly however, the base model outperforms the LSTM for SentEval, and performs equal to BiLSTM.

Performance scores for the NLI tasks:

In [5]:
with open('eval_results/final_task_results.json') as json_file:
        sent_eval_scores = json.load(json_file)
        
sent_eval_results = pd.DataFrame.from_dict(sent_eval_scores, orient='index')
sent_eval_results

Unnamed: 0,MR,CR,SUBJ,MPQA,SST2,TREC,SICKEntailment,MRPC
base,74.88,78.46,90.11,84.85,78.21,67.31,80.8,71.2
lstm,72.13,77.73,85.96,84.84,76.26,61.45,82.2,73.11
bilstm,72.4,79.26,89.52,85.06,78.67,73.06,83.4,72.55
bilstmpool,75.76,81.78,91.57,85.64,79.93,75.26,84.4,74.31


The base model performs particularly well on SUBJ and MPQA. LSTM performs very low on TREC. BiLSTMPool performs the best on all tasks. The hardest task appears to be TREC.

## Error analysis for NLI

In this Section, I will try out various difficult NLI testcases. I will create 100 example sentences per testcase, and report on the performance of each model. 

### Load models

In [6]:
base_nli, _ = mutils.load_model('base', 'model_checkpoints/base_model_final', None)
lstm_nli, lstm_lstm = mutils.load_model('lstm', 'model_checkpoints/lstm_nli_model_final', 'model_checkpoints/lstm_lstm_model_final')
bilstm_nli, bilstm_lstm = mutils.load_model('bilstm', 'model_checkpoints/bilstm_nli_model_final', 'model_checkpoints/bilstm_lstm_model_final')
bilstmpool_nli, bilstmpool_lstm = mutils.load_model('bilstmpool', 'model_checkpoints/bilstmpool_nli_model_final', 'model_checkpoints/bilstmpool_lstm_model_final')

load bilstm encoder
load bilstm maxpool encoder


In [7]:
with open('train_word_dict.json') as json_file:
    word_dict = json.load(json_file)

embedding_model = mutils.load_embeddings(word_dict)

In [8]:
label_dict = {0: 0, 1:0, 2:0}

results_dict = {'base': copy.deepcopy(label_dict),
               'lstm' : copy.deepcopy(label_dict),
               'bilstm': copy.deepcopy(label_dict),
               'bilstmpool': copy.deepcopy(label_dict)}

model_tuples = [('base', base_nli),('lstm', lstm_nli), ('bilstm', bilstm_nli), ('bilstmpool', bilstmpool_nli)]

def predict_all_models(result_dict, s1, s2):
    for model, nli in model_tuples:
        prediction = mutils.predict(model, embedding_model, nli, s1, s2)
        result_dict[model][prediction] += 1
    return result_dict

def print_performances(result_dict, nsamples, correct_label):
    performances = {}
    for model, _ in model_tuples:
        model_scores = copy.deepcopy(result_dict[model])
        model_scores['correct'] = str(round(model_scores[correct_label] / nsamples * 100,1)) + '%'
        performances[model] = model_scores
    performance_table = pd.DataFrame.from_dict(performances, orient='index')
    print(performance_table)

def eval_test(eval_sents, nsamples, correct_label):
    result_dict = copy.deepcopy(results_dict)
    for n, sent in enumerate(eval_sents):
        try:
            s1, s2 = sent.split(';')
            
        except:
            nsamples = nsamples - 1
            continue
        if n == 0:
            print(f"{nsamples} samples of the following structure where the correct label is {correct_label}:")
            print(f"Premise: '{s1}' \nHypothesis '{s2}'")
        result_dict = predict_all_models(result_dict, s1, s2)
    print_performances(result_dict, nsamples, correct_label)

### Negation

In [9]:
nsamples = 100
editor = Editor()

In [10]:
negation = editor.template('{first_name} is from {country}; {first_name} is not from {country} ')

np.random.seed(seed) 
negation_sents = np.random.choice(negation.data, nsamples)
correct_label = 2 #contradiction

In [11]:
eval_test(negation_sents, nsamples, correct_label)

100 samples of the following structure where the correct label is 2:
Premise: 'Scott is from Poland' 
Hypothesis ' Scott is not from Poland '
             0   1   2 correct
base        43  56   1    1.0%
lstm         1   0  99   99.0%
bilstm       1   0  99   99.0%
bilstmpool   3   0  97   97.0%


## Negetion in one part of the sentence, entailment in the relevant part

In [13]:
negation_entailment = editor.template('{first_name} does not live in {city} but they are from {country}; {first_name} is from {country} ')
negation_entailment_sents = np.random.choice(negation_entailment.data, nsamples)
correct_label = 0 #entailment

eval_test(negation_entailment_sents, nsamples, correct_label)

100 samples of the following structure where the correct label is 0:
Premise: 'Dave does not live in Milwaukee but they are from France' 
Hypothesis ' Dave is from France '
             0   1   2 correct
base         2  66  32    2.0%
lstm        88   0  12   88.0%
bilstm      18   7  75   18.0%
bilstmpool  88   0  12   88.0%


### TODO: interpret...

## Active - passive / Subject - object

In [14]:
# a list of verbs to use in the test cases
passive_verbs = ['kissed', 'killed', 'hurt', 'touched', 'ignored', 'silenced', 'hit', 'greeted']
english_firstname = editor.lexicons.female_from.United_Kingdom + editor.lexicons.male_from.United_Kingdom

active_passive = editor.template('{first_name} {verb} {first}; {first} was {verb} by {first_name}', first=english_firstname, verb=passive_verbs)
active_passive_sents = np.random.choice(active_passive.data, nsamples)

correct_label = 0 #entailment

eval_test(active_passive_sents, nsamples, correct_label)

100 samples of the following structure where the correct label is 0:
Premise: 'Marilyn silenced Joseph' 
Hypothesis ' Joseph was silenced by Marilyn'
              0   1  2 correct
base         97   3  0   97.0%
lstm        100   0  0  100.0%
bilstm      100   0  0  100.0%
bilstmpool   69  30  1   69.0%


In [19]:
# a list of verbs to use in the test cases
passive_verbs = ['kissed', 'killed', 'hurt', 'touched', 'ignored', 'silenced', 'hit', 'greeted']
english_firstname = editor.lexicons.female_from.United_Kingdom + editor.lexicons.male_from.United_Kingdom


active_passive = editor.template('{first_name} {verb} {first}; {first_name} was {verb} by {first}', first=english_firstname, verb=passive_verbs)
active_passive_sents = np.random.choice(active_passive.data, nsamples)

correct_label = 1 #neutral

eval_test(active_passive_sents, nsamples, correct_label)

100 samples of the following structure where the correct label is 1:
Premise: 'Alex greeted Louisa' 
Hypothesis ' Alex was greeted by Louisa'
              0  1  2 correct
base         99  1  0    1.0%
lstm        100  0  0    0.0%
bilstm      100  0  0    0.0%
bilstmpool  100  0  0    0.0%


## Short vs long distances

In [16]:
nsamples = 100
verbs = ['hit', 'kicked', 'stopped', 'touched', 'missed', 'smashed']
# lists of sentence fillers to increase the distance between the agent and the predicate
# for active sentences
precedents = ['nearly falling down', 'missing the past three games', 'celebrating a perfect streak', 'suffering from a knee injury', 'appearing so fit']
# for passive sentences
ball_precedents = ['lying there for a while', 'a boring match', 'three nerve-wrecking minutes', 'some time']

In [17]:
long_distance = editor.template("{first_name}, after {filler}, finnaly {verb} the ball; {first_name} {verb} the ball", verb=verbs, filler=precedents)
long_d_sents = np.random.choice(long_distance.data, nsamples)

correct_label = 0 #entailment

eval_test(long_d_sents, nsamples, correct_label)

100 samples of the following structure where the correct label is 0:
Premise: 'Ron, after celebrating a perfect streak, finnaly kicked the ball' 
Hypothesis ' Ron kicked the ball'
              0   1   2 correct
base         16  37  47   16.0%
lstm         98   0   2   98.0%
bilstm       97   3   0   97.0%
bilstmpool  100   0   0  100.0%


## Synonyms

In [18]:
synonym_words = [('author', 'writer'), ('surgeon', 'doctor'), ('server', 'waiter'), ('chef','cook'), 
                 ('educator','teacher'), ('professor','academic'), ('person','human'), ('actor','performer'),
                 ('musician', 'artist'), ('hairdresser', 'hairstylist')]

synonyms = editor.template("{first_name}, is {a:occupation[0]}; {first_name} {a:occupation[1]}", occupation=synonym_words, filler=precedents)
synonym_sents = np.random.choice(synonyms.data, nsamples)

correct_label = 0 #entailment

eval_test(synonym_sents, nsamples, correct_label)


100 samples of the following structure where the correct label is 0:
Premise: 'Kate, is a hairdresser' 
Hypothesis ' Kate a hairstylist'
             0   1   2 correct
base         7  85   8    7.0%
lstm        48  29  23   48.0%
bilstm      67  23  10   67.0%
bilstmpool  83  15   2   83.0%


# Sent embeddings evaluation