In [1]:
import json
import torch
import torch.nn as nn
import random
from tqdm import tqdm
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config

In [2]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [3]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
fact_checking_model = GPT2LMHeadModel.from_pretrained('gpt2')
fact_checking_model.cuda()
checkpoint = torch.load('save_fever5')
fact_checking_model.load_state_dict(checkpoint['model_state_dict'])
_ = fact_checking_model.eval()

In [4]:
def get_text_up_to_question(text):
    _claim_yn = 'The evidence supports the claim:\n'
    return text[:text.find(_claim_yn) + len(_claim_yn)]

In [5]:
def get_answer_from_text(text):
    _claim_yn = 'The evidence supports the claim:\n'
    pos = text.find(_claim_yn) + len(_claim_yn)
    return text[pos]

In [6]:
def generate_answer(fact_checking_model, text):
    prompt = get_text_up_to_question(text)
    tokens = tokenizer.encode(prompt, return_tensors='pt')
    _length = 1
    tokens_length = tokens.shape[1]
    if tokens_length + _length >= 1024:
        raise RuntimeError('Text is longer than 1024')
    output = fact_checking_model.generate(
             tokens.cuda(),
             max_length=tokens_length + _length, 
             pad_token_id=50256
    )
    to_return = tokenizer.decode(output[0], skip_special_tokens=True)
    perplexity = float(model(output, labels=output)[0])
    return get_answer_from_text(to_return), perplexity

In [7]:
def get_best_answer(fact_checking_model, text):
    prompt_y = get_text_up_to_question(text) + 'Y'
    prompt_n = get_text_up_to_question(text) + 'N'
    tokens_y = tokenizer.encode(prompt_y, return_tensors='pt').cuda()
    tokens_n = tokenizer.encode(prompt_n, return_tensors='pt').cuda()
    perplexity_y = float(model(tokens_y, labels=tokens_y)[0])
    perplexity_n = float(model(tokens_n, labels=tokens_n)[0])
    if perplexity_y < perplexity_n:
        return 'Y', perplexity_y
    return 'N', perplexity_n

# Question Answering part

In [8]:
_question_prompt = '\nQ: '
_answer_prompt = '\nA: '
    
def get_text_up_to_question_number(text, number):
    pos = text.find(_answer_prompt)
    for _ in range(number):
        pos = text.find(_answer_prompt, pos + 1)
    return text[0:pos + 1]
    
def get_answers_number(text):
    return text.count(_answer_prompt)

def get_answer_number(text, number):
    pos = text.find(_answer_prompt)
    for _ in range(number):
        pos = text.find(_answer_prompt, pos + 1)
    end = text.find('\n', pos + len(_answer_prompt))
    return text[pos + len(_answer_prompt):end]

def get_question_number(text, number):
    pos = text.find(_question_prompt)
    for _ in range(number):
        pos = text.find(_question_prompt, pos + 1)
    end = text.find('\n', pos + len(_question_prompt))
    return text[pos + len(_question_prompt):end]

def get_all_answers(dev_dict, dev_index):
    answers = [[item['input_text'] for item in dev_dict['data'][dev_index]['answers']]]
    answers += [[item['input_text'] for item in dev_dict['data'][dev_index]['additional_answers'][str(index)]] for index in range(3)]
    return [list(set([answers[j][i] for j in range(len(answers))])) for i in range(len(answers[0]))]

In [9]:
def get_text_from_data_item(item, max_num_questions=0, question_number=-1, last_question=True):
    text = 'In the text below two people are discussing a story.\n\n'
    text += 'Story:\n' + item['story'] + '\n\n'
    text += 'Discussion:\n'
    text += '\n'.join(['Q: ' + q['input_text'] 
                       + '\nA: ' + a['input_text'] 
                       for q, a in zip(item['questions'][max(0,question_number-max_num_questions):question_number+1], 
                                       item['answers'][max(0,question_number-max_num_questions):question_number+1]) 
                      ])
    if not last_question:
        text = '\n'.join(text.split('\n')[:-1]) + '\n'
    return text

In [10]:
def generate_multiple_answers(model, prompt, num_replicas=25):
    model.train()
    outputs = []
    with torch.no_grad():
        tokens = tokenizer.encode(prompt, return_tensors='pt')
        tokens = tokens.repeat(num_replicas,1)
        _length = 50
        tokens_length = tokens.shape[1]
        if tokens_length + _length > 1024:
            return ''

        
        output = model.generate(
             tokens.cuda(),
             max_length=tokens_length + _length,
             pad_token_id=50256
        )
        for index in range(num_replicas):
            text = tokenizer.decode(output[index, :], skip_special_tokens=True)
            offset = len(prompt)
            start = offset + 1
            end = text.find('\n', start)
            outputs.append(text[start:end].split(':')[-1].strip())

    return outputs

In [11]:
def get_answer_from_text(text):
    _claim_yn = 'The evidence supports the claim:\n'
    pos = text.find(_claim_yn) + len(_claim_yn)
    return text[pos]

In [12]:
def generate_multiple_y_n_answers(model, prompt, num_replicas=20):
    model.train()
    outputs_count = {}
    with torch.no_grad():
        tokens = tokenizer.encode(prompt, return_tensors='pt')
        tokens = tokens.repeat(num_replicas,1)
        _length = 50
        tokens_length = tokens.shape[1]
        if tokens_length + _length > 1024:
            return ''

        
        output = model.generate(
             tokens.cuda(),
             max_length=tokens_length + _length,
             pad_token_id=50256
        )
        for index in range(num_replicas):
            text = tokenizer.decode(output[index, :], skip_special_tokens=True)
            answer = get_answer_from_text(text)
            outputs_count.setdefault(answer, 0)
            outputs_count[answer] += 1

    total = sum(v for v in outputs_count.values())
    return [(k, v / total) for k, v in outputs_count.items()]

In [13]:
def generate_first_answer(model, prompt):
    model.eval()
    tokens = tokenizer.encode(prompt, return_tensors='pt')
    _length = 50
    tokens_length = tokens.shape[1]
    if tokens_length + _length > 1024:
        return ''
    
    output = model.generate(
             tokens.cuda(),
             max_length=tokens_length + _length,
             pad_token_id=50256
    )
    output = tokenizer.decode(output[0], skip_special_tokens=True)
    offset = len(prompt)
    start = offset + 1
    end = output.find('\n', start)
    return output[start:end].split(':')[-1].strip()

In [14]:
config = GPT2Config(attn_pdrop=0.08, resid_pdrop=0.08, embd_pdrop=0.08)

In [15]:
model = GPT2LMHeadModel(config).from_pretrained('gpt2')
#model = GPT2LMHeadModel.from_pretrained('gpt2')
model.cuda()
checkpoint = torch.load('save_small' + str(6))
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [16]:
model.config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.9.1",
  "use_cache": true,
  "vocab_size": 50257
}

In [17]:
dev_dict = json.load(open('../data/coqa-dev-v1.0.json', encoding='utf8'))
dev_list = json.load(open('../data/qa_dev_list.json', encoding='utf8'))

In [18]:
def get_text_from_data_item(item, max_num_questions=0, question_number=-1, last_question=True):
    text = 'In the text below two people are discussing a story.\n\n'
    text += 'Story:\n' + item['story'] + '\n\n'
    text += 'Discussion:\n'
    text += '\n'.join(['Q: ' + q['input_text'] 
                       + '\nA: ' + a['input_text'] 
                       for q, a in zip(item['questions'][max(0,question_number-max_num_questions):question_number+1], 
                                       item['answers'][max(0,question_number-max_num_questions):question_number+1]) 
                      ])
    if not last_question:
        text = '\n'.join(text.split('\n')[:-1]) + '\n'
    return text

In [19]:
doc=0
number = 1
small_text = get_text_from_data_item(dev_dict['data'][doc], 
                                     max_num_questions=5, 
                                     question_number=number,
                                     last_question=False)
first_answer = generate_first_answer(model, small_text)
answers = generate_multiple_answers(model, small_text)

In [20]:
def get_description_from_data_item(item):
    return item['story']

def get_dialogue_from_data_item(item, max_num_questions=0, question_number=-1, last_question=True):
    text = ''
    text += ' '.join([q['input_text'] + ' ' + a['input_text'] + '.'
                       for q, a in zip(item['questions'][max(0,question_number-max_num_questions):question_number+1], 
                                       item['answers'][max(0,question_number-max_num_questions):question_number+1]) 
                      ])
    if not last_question:
        text = '?'.join(text.split('?')[:-1]) + '?'
    return text

In [21]:
def create_claim_from_description_and_dialogue(description, dialogue):
    if dialogue[-1] == '.':
        dialogue = dialogue[:-1]    
    text = 'Evidence:\n'
    text += description.replace('\n\n', '\n') + '\n\n'
    text += 'Claim:\n'
    text += dialogue + '\n\n'
    text += 'The evidence supports the claim:\n'
    return text

In [22]:
device = 'cuda'

In [23]:
import numpy as np
from sentence_transformers import SentenceTransformer


sentence_model = SentenceTransformer('msmarco-distilbert-base-v3')
sentence_model = sentence_model.to(device)

In [24]:
def get_embeddings_from_text(text):
    outputs = sentence_model.encode(text)
    return outputs

def group_similar_answers_and_get_scores(answers):
    answers_dict = {}
    threshold = 0.7
    embeddings = get_embeddings_from_text(answers)
    embeddings = np.array([e/np.linalg.norm(e) for e in embeddings])
    similarity_matrix = np.matmul(embeddings, embeddings.transpose())
    superseded = set()
    superseded_from = {}
    for i in range(len(answers)):
        for j in range(len(answers)):
            if i > j:
                continue
            if i != j and answers[i] == answers[j]:
                continue
            if similarity_matrix[i][j] > threshold :
                answers_dict.setdefault(i, 0)
                answers_dict[i] += 1
                if i != j:
                    superseded.add(j)
                    superseded_from.setdefault(i, [])
                    superseded_from[i].append(j)

    answers_and_scores = [(index, score/len(answers))
                          for index, score in answers_dict.items() 
                          if index not in superseded]
    
    new_scores_dict = {}
    total_score = sum(item[1] for item in answers_and_scores)
    for answer_index, score in answers_and_scores:
        answer_group = [answers[answer_index]]
        if answer_index in superseded_from:
            answer_group += [answers[i] for i in superseded_from[answer_index]]
        answer_group = tuple(set(answer_group))
        if answer_group in new_scores_dict:
            new_scores_dict[answer_group] += score / total_score
        else:
            new_scores_dict[answer_group] = score / total_score
    
    
    return sorted(list(new_scores_dict.items()), key=lambda x: -x[1])

In [25]:
statement_model = GPT2LMHeadModel.from_pretrained('gpt2')
statement_model.cuda()
checkpoint = torch.load('save_statement' + str(2))
statement_model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [26]:
def generate_statement_from_dialogue(model, prompt):
    tokens = tokenizer.encode(prompt, return_tensors='pt')
    _length = 50
    tokens_length = tokens.shape[1]
    if tokens_length + _length > 1024:
        return ''
    output = model.generate(
             tokens.cuda(),
             max_length=tokens_length + _length,
             #temperature=0,
             pad_token_id=50256
    )
    output = tokenizer.decode(output[0], skip_special_tokens=True)
    offset = len(prompt)
    start = offset
    end = output.find('\n', start)
    return output[start:end].split(':')[-1].strip()

In [27]:
doc = 0
number = 6
small_text = get_text_from_data_item(dev_dict['data'][doc], 
                                     max_num_questions=5, 
                                     question_number=number,
                                     last_question=False)

In [28]:
def get_statement_prompt(item, max_num_questions=0, question_number=-1, use_answer=None):
    text = 'Discussion:\n'
    text += '\n'.join(['Q: ' + q['input_text'] 
                       + '\nA: ' + a['input_text'] 
                       for q, a in zip(item['questions'][max(0,question_number-max_num_questions):question_number+1], 
                                       item['answers'][max(0,question_number-max_num_questions):question_number+1]) 
                      ])
    if use_answer:
        text = '\n'.join(text.split('\n')[:-1]) + '\n' + 'A: ' + use_answer + '\n'
    text += '\nStatement:\n'
    return text

In [29]:
def select_answer(description, answers, doc, number):
    description = description.replace('\n\n', '\n')
    #print(description)
    best_probability = 0
    
    best_answer = ''
    for answer_tuple in answers:
        answer_sample = answer_tuple[0][0]
        #answer_sample = 'yes'
        answer_score = answer_tuple[1]
        #print(answer_score, answer_sample)
        statement_prompt = get_statement_prompt(dev_dict['data'][doc], 
                                 max_num_questions=5,
                                 question_number=number,
                                 use_answer=answer_sample)
        #print(statement_prompt)
        statement = generate_statement_from_dialogue(statement_model, statement_prompt)
        #print(statement)
        try:
            text = create_claim_from_description_and_dialogue(description, statement)
        except IndexError:
            continue
        
        y_n_tuples = generate_multiple_y_n_answers(fact_checking_model, text)
        #print(y_n_tuples)
        for y_n, yn_score in y_n_tuples:
            if answer_sample.lower() in ['no', 'no.', 'yes', 'yes.']:
                continue
            #if yn_score * answer_score > best_probability and y_n == 'Y':
            #if 'not' in statement and yn_score > best_probability and y_n == 'N':
            #    #best_probability = yn_score * answer_score
            #    best_probability = yn_score
            #    best_answer = answer_tuple[0]
            #el
            if yn_score * answer_score > best_probability and y_n == 'Y':
                best_probability = yn_score * answer_score
                #best_probability = yn_score
                best_answer = answer_tuple[0]
    if not best_answer:
        return answers[0]
    return best_answer, best_probability

In [38]:
doc = 0
number = 0
description = get_description_from_data_item(dev_dict['data'][doc])
small_text = get_text_from_data_item(dev_dict['data'][doc], 
                                     max_num_questions=5, 
                                     question_number=number,
                                     last_question=False)
print(small_text)
answers = generate_multiple_answers(model, small_text)
answers = group_similar_answers_and_get_scores(answers)
select_answer(description, answers, doc, number)

In the text below two people are discussing a story.

Story:
Once upon a time, in a barn near a farm house, there lived a little white kitten named Cotton. Cotton lived high up in a nice warm place above the barn where all of the farmer's horses slept. But Cotton wasn't alone in her little home above the barn, oh no. She shared her hay bed with her mommy and 5 other sisters. All of her sisters were cute and fluffy, like Cotton. But she was the only white one in the bunch. The rest of her sisters were all orange with beautiful white tiger stripes like Cotton's mommy. Being different made Cotton quite sad. She often wished she looked like the rest of her family. So one day, when Cotton found a can of the old farmer's orange paint, she used it to paint herself like them. When her mommy and sisters found her they started laughing. 

"What are you doing, Cotton?!" 

"I only wanted to be more like you". 

Cotton's mommy rubbed her face on Cotton's and said "Oh Cotton, but your fur is so pret

(('White', 'white'), 0.5999999999999998)

In [31]:
description = "Once upon a time, in a barn near a farm house, there lived a little white kitten named Cotton. Cotton lived high up in a nice warm place above the barn where all of the farmer's horses slept. But Cotton wasn't alone in her little home above the barn, oh no. She shared her hay bed with her mommy and 5 other sisters. All of her sisters were cute and fluffy, like Cotton. But she was the only white one in the bunch. The rest of her sisters were all orange with beautiful white tiger stripes like Cotton's mommy. Being different made Cotton quite sad. She often wished she looked like the rest of her family. So one day, when Cotton found a can of the old farmer's orange paint, she used it to paint herself like them. When her mommy and sisters found her they started laughing. "
statement = "Cotton was not happy that she looked different than the rest of her family."
text = create_claim_from_description_and_dialogue(description, statement)
generate_multiple_y_n_answers(fact_checking_model, text)

[('N', 0.95), ('Y', 0.05)]

#### TODO
* Try accuracy of only Y/N


* Try to invert no answers with N FEVER

* Try to understand how FEVER Y/N relates to Yes and No answers. It seems that N for a "no" answer means approval (Y)

# Computing accuracy after fact checking

In [32]:
def compute_accuracy_of_model(model):
    correct_predictions = []
    total_number_of_questions = 0
    correct_answers = 0
    wrong_predictions = []

    false_positives = []
    dlist = dev_list[:10]
    for index, text in tqdm(enumerate(dlist), total=len(dlist)):

        all_answers = get_all_answers(dev_dict, index)
        total_questions = len(all_answers)        
        
        for number in range(total_questions):
            small_text = get_text_from_data_item(dev_dict['data'][index], 
                                                 max_num_questions=8,
                                                 question_number=number,
                                                 last_question=False)

            all_predictions = generate_multiple_answers(model, small_text, num_replicas=20)
            all_predictions = group_similar_answers_and_get_scores(all_predictions)
            predictions = all_predictions[0]
            #predictions = select_answer(description, predictions, index, number)
            #print(predictions)
            for prediction in predictions[0]:
                it_was_answered = False
                if not prediction:
                    prediction = 'unknown'
                prediction = prediction.replace('.', '').replace('"', '')
                it_was_answered = False
                for label in all_answers[number]:
                    label = label.replace('.', '').replace('"', '')
                    if prediction.lower() != 'unknown' and label.lower() == 'unknown':
                        false_positives.append(prediction)
                    
                    label_embeddings = get_embeddings_from_text(label)
                    prediction_embeddings = get_embeddings_from_text(prediction)
                    
                    if np.dot(label_embeddings, prediction_embeddings) \
                       /np.linalg.norm(label_embeddings) \
                       /np.linalg.norm(prediction_embeddings) \
                                       > 0.7:
                        #print(label, prediction)
                        correct_answers += 1
                        it_was_answered = True
                        correct_predictions.append({
                            'index': index,
                            'number': number,
                            'all_predictions' : all_predictions,
                            'chosen predictions': predictions,
                            'label': label,
                        })
                        break
                    else:
                        wrong_predictions.append({'label': label, 'prediction': prediction})
                        
                if it_was_answered:
                    break

            total_number_of_questions += 1

    return correct_answers/total_number_of_questions, wrong_predictions, false_positives, correct_predictions

In [33]:
accuracy, wrong_predictions, false_positives, correct_predictions = compute_accuracy_of_model(model)

100%|██████████| 10/10 [02:12<00:00, 13.29s/it]


In [34]:
import json

filename = 'dict10_25replicas_correct_predictions.json'
json.dump(correct_predictions, open(filename, 'w'))

## Results with alternative answers

# Results

#### Using dev_dict 10, 20 multiple answers
0.7588652482269503
(without fact checking and 20 multiple answers it is 0.7163120567375887)
(without fact checking and 25 multiple answers it is 0.7659574468085106)

#### Using dev_dict 10, 20 multiple answers (NEW FEVER and qa_to_statement model)
fever 3
0.6950354609929078

#### Using dev_dict 10, 20 multiple answers (NEW FEVER and qa_to_statement model)
fever 5
USING MSMARCO 0.7 THRESHOLD: 
0.6879432624113475

USING MSMARCO 0.5 THRESHOLD:
0.7730496453900709

## USING GPT2Config(attn_pdrop=0.08, resid_pdrop=0.08, embd_pdrop=0.08)
#### Baseline USING MSMARCO 0.7 THRESHOLD 20 multiple answers, dict 10:
0.7021276595744681
0.7375886524822695
0.7021276595744681


## USING GPT2Config(attn_pdrop=0.11, resid_pdrop=0.11, embd_pdrop=0.11)
#### Baseline USING MSMARCO 0.7 THRESHOLD 20 multiple answers, dict 10:
0.7092198581560284
0.7375886524822695
0.7304964539007093

## USING GPT2Config(attn_pdrop=0.13, resid_pdrop=0.13, embd_pdrop=0.13)
#### Baseline USING MSMARCO 0.7 THRESHOLD 20 multiple answers, dict 10:
0.7021276595744681
0.7092198581560284
0.7163120567375887,

## USING GPT2Config(attn_pdrop=0.2, resid_pdrop=0.2, embd_pdrop=0.2)
#### Baseline USING MSMARCO 0.7 THRESHOLD 20 multiple answers, dict 10:
0.6879432624113475

## Using attn_dropout=0.05
#### Baseline USING MSMARCO 0.7 THRESHOLD 20 multiple answers, dict 10:
0.7021276595744681
0.6950354609929078


## Using attn_dropout=0.09
#### Baseline USING MSMARCO 0.7 THRESHOLD 20 multiple answers, dict 10:
0.6950354609929078


## Using attn_dropout=0.2
#### Baseline USING MSMARCO 0.7 THRESHOLD 20 multiple answers, dict 10:
0.7092198581560284

0.6950354609929078

## Using attn_dropout=0.1
#### Baseline USING MSMARCO 0.7 THRESHOLD 10 multiple answers, dict 10:
0.6950354609929078

#### Baseline USING MSMARCO 0.7 THRESHOLD 20 multiple answers, dict 10:
0.7163120567375887

#### Baseline USING MSMARCO 0.7 THRESHOLD 23 multiple answers, dict 10:
0.723404255319149

#### Baseline USING MSMARCO 0.7 THRESHOLD 25 multiple answers, dict 10:
0.7375886524822695

#### Baseline USING MSMARCO 0.7 THRESHOLD 15x2 multiple answers, dict 10:
0.6737588652482269

#### Baseline USING MSMARCO 0.7 THRESHOLD 25 multiple answers, dict 50:
0.65375


#### Baseline USING MSMARCO 0.5 THRESHOLD 20 multiple answers, dict 10:
0.7872340425531915



#### Using dev_dict 10, 20 multiple answers (using only yn_score for answer selection)
0.46808510638297873

#### Using dev_dict 10, 20 multiple answers (using only yn_score for answer selection, only for yn questions)
0.6595744680851063

#### Using dev_dict 10, 10 multiple answers (yn_score * answer_score, only for yn questions)
0.7021276595744681

#### Using dev_dict 10, 20 multiple answers (yn_score * answer_score, only for yn questions)
0.7304964539007093

#### Using dev_dict 50, 20 multiple answers (yn_score * answer_score, only for yn questions)
0.6525

#### Using dev_dict 10, 20 multiple answers (yn_score * answer_score, excluding yn questions)
0.7021276595744681


#### Using dev_dict 10, 25 multiple answers (yn_score * answer_score, only for yn questions)
0.6666666666666666
0.6808510638297872


#### Using dev_dict 10, 15 multiple answers, 15 FEVER (yn_score * answer_score, only for yn questions)
0.6737588652482269

#### Using dev_dict 10, 25 multiple answers, 20 FEVER (yn_score * answer_score, only for yn questions)
0.6737588652482269

In [35]:
# Baseline:

(0.7163120567375887,
 [{'label': 'orange and white', 'prediction': 'orange'},
  {'label': 'orange with white tiger stripes', 'prediction': 'orange'},
  {'label': 'the farmer', 'prediction': "her mommy's"},
  {'label': "the farmer's", 'prediction': "her mommy's"},
  {'label': "the old farmer's", 'prediction': "her mommy's"},
  {'label': 'the farmer', 'prediction': 'her mommy'},
  {'label': "the farmer's", 'prediction': 'her mommy'},
  {'label': "the old farmer's", 'prediction': 'her mommy'},
  {'label': 'rubbed her face', 'prediction': 'they laughed'},
  {'label': 'no', 'prediction': 'yes'},
  {'label': 'No', 'prediction': 'yes'},
  {'label': 'no', 'prediction': 'Yes'},
  {'label': 'No', 'prediction': 'Yes'},
  {'label': "Asta's papa", 'prediction': 'Asta'},
  {'label': 'unknown', 'prediction': 'yes'},
  {'label': 'unknown', 'prediction': 'Yes'},
  {'label': 'yes', 'prediction': 'No'},
  {'label': 'Yes', 'prediction': 'No'},
  {'label': 'next door', 'prediction': 'Shanghai'},
  {'label': 'mother', 'prediction': 'He is her grandmother'},
  {'label': 'her brother', 'prediction': 'He is her grandmother'},
  {'label': 'grandmother', 'prediction': 'He is her grandmother'},
  {'label': 'mother', 'prediction': 'She is her grandmother'},
  {'label': 'her brother', 'prediction': 'She is her grandmother'},
  {'label': '-a thermos with hot soup and a stainless-steel container with rice, vegetables and either chicken, meat or shrimp, sometimes with a kind of pancake',
   'prediction': 'rice, vegetables and either chicken, meat or shrimp'},
  {'label': 'more nutrients',
   'prediction': 'rice, vegetables and either chicken, meat or shrimp'},
  {'label': 'food',
   'prediction': 'rice, vegetables and either chicken, meat or shrimp'},
  {'label': 'a thermos with hot soup and a stainless-steel container with rice, vegetables and either chicken, meat or shrimp, sometimes with a kind of pancake This',
   'prediction': 'rice, vegetables and either chicken, meat or shrimp'},
  {'label': '-a thermos with hot soup and a stainless-steel container with rice, vegetables and either chicken, meat or shrimp, sometimes with a kind of pancake',
   'prediction': 'rice, vegetables and chicken, meat or shrimp'},
  {'label': 'more nutrients',
   'prediction': 'rice, vegetables and chicken, meat or shrimp'},
  {'label': 'food',
   'prediction': 'rice, vegetables and chicken, meat or shrimp'},
  {'label': 'a thermos with hot soup and a stainless-steel container with rice, vegetables and either chicken, meat or shrimp, sometimes with a kind of pancake This',
   'prediction': 'rice, vegetables and chicken, meat or shrimp'},
  {'label': 'she has decided the narrator needs more nutrients',
   'prediction': 'heart surgery'},
  {'label': 'needs more nutrients before heart surger',
   'prediction': 'heart surgery'},
  {'label': 'has become an almost-daily practice',
   'prediction': 'heart surgery'},
  {'label': 'I am having heart surgery soon, so her mother has decided I need more nutrients',
   'prediction': 'heart surgery'},
  {'label': 'an iPad', 'prediction': 'communication between us'},
  {'label': 'rice, vegetables and either chicken, meat or shrimp, sometimes with a kind of pancake',
   'prediction': 'food'},
  {'label': 'hot soup and a container with rice, vegetables and either chicken, meat or shrimp, sometimes with a kind of pancake',
   'prediction': 'food'},
  {'label': 'hot soup, rice, vegetables, chicken, meat, or shrimp, sometimes with a kind of pancake',
   'prediction': 'food'},
  {'label': 'soup and a stainless-steel container with rice, vegetables and either chicken, meat or shrimp, sometimes with a kind of pancake',
   'prediction': 'food'},
  {'label': 'Nicole',
   'prediction': 'I point to the screen, and she pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'I am now working on some more Chinese words',
   'prediction': 'I point to the screen, and she pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'use the iPad',
   'prediction': 'I point to the screen, and she pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'go with her to her house',
   'prediction': 'I point to the screen, and she pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'Nicole',
   'prediction': 'she pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'I am now working on some more Chinese words',
   'prediction': 'she pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'use the iPad',
   'prediction': 'she pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'go with her to her house',
   'prediction': 'she pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'Nicole',
   'prediction': 'I pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right'},
  {'label': 'I am now working on some more Chinese words',
   'prediction': 'I pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right'},
  {'label': 'use the iPad',
   'prediction': 'I pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right'},
  {'label': 'go with her to her house',
   'prediction': 'I pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right'},
  {'label': 'Nicole',
   'prediction': 'I point to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty I am not used to iPads, so she indicated I should go wit'},
  {'label': 'I am now working on some more Chinese words',
   'prediction': 'I point to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty I am not used to iPads, so she indicated I should go wit'},
  {'label': 'use the iPad',
   'prediction': 'I point to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty I am not used to iPads, so she indicated I should go wit'},
  {'label': 'go with her to her house',
   'prediction': 'I point to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty I am not used to iPads, so she indicated I should go wit'},
  {'label': 'Nicole',
   'prediction': 'I point to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'I am now working on some more Chinese words',
   'prediction': 'I point to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'use the iPad',
   'prediction': 'I point to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'go with her to her house',
   'prediction': 'I point to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'Nicole',
   'prediction': 'she pointed to the screen, which displayed a message from her daughter telling her mother that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'I am now working on some more Chinese words',
   'prediction': 'she pointed to the screen, which displayed a message from her daughter telling her mother that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'use the iPad',
   'prediction': 'she pointed to the screen, which displayed a message from her daughter telling her mother that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'go with her to her house',
   'prediction': 'she pointed to the screen, which displayed a message from her daughter telling her mother that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'cop-turned-actor', 'prediction': 'Dapper'},
  {'label': 'Actor', 'prediction': 'Dapper'},
  {'label': 'he was an actor', 'prediction': 'Dapper'},
  {'label': 'cop-turned-actor', 'prediction': 'dapper'},
  {'label': 'Actor', 'prediction': 'dapper'},
  {'label': 'he was an actor', 'prediction': 'dapper'},
  {'label': 'Farina was cast in a film',
   'prediction': 'He joined the cast of Law & Order'},
  {'label': 'he got into acting',
   'prediction': 'He joined the cast of Law & Order'},
  {'label': 'got into acting',
   'prediction': 'He joined the cast of Law & Order'},
  {'label': 'he was a consultant',
   'prediction': 'He joined the cast of Law & Order'},
  {'label': 'Farina was cast in a film',
   'prediction': 'He joined the cast of the long-running Law & Order'},
  {'label': 'he got into acting',
   'prediction': 'He joined the cast of the long-running Law & Order'},
  {'label': 'got into acting',
   'prediction': 'He joined the cast of the long-running Law & Order'},
  {'label': 'he was a consultant',
   'prediction': 'He joined the cast of the long-running Law & Order'},
  {'label': 'Farina was cast in a film',
   'prediction': 'He was on Law & Order'},
  {'label': 'he got into acting', 'prediction': 'He was on Law & Order'},
  {'label': 'got into acting', 'prediction': 'He was on Law & Order'},
  {'label': 'he was a consultant', 'prediction': 'He was on Law & Order'},
  {'label': 'He joined a TV show cast',
   'prediction': 'he joined the cast of the long-running Law & Order'},
  {'label': 'Detective Joe Fontana', 'prediction': 'Jerry Orbach'},
  {'label': 'no', 'prediction': 'Yes'},
  {'label': 'No', 'prediction': 'Yes'},
  {'label': 'no', 'prediction': 'Yes'},
  {'label': 'No', 'prediction': 'Yes'},
  {'label': 'no', 'prediction': 'yes'},
  {'label': 'No', 'prediction': 'yes'},
  {'label': "go to Quentin's house", 'prediction': 'walk to the bus stop'},
  {'label': 'go in for cookies and milk',
   'prediction': 'walk to the bus stop'},
  {'label': 'they go in for cookies and milk',
   'prediction': 'walk to the bus stop'},
  {'label': 'get cookies and milk', 'prediction': 'walk to the bus stop'},
  {'label': "go to Quentin's house",
   'prediction': 'walk home from the bus stop'},
  {'label': 'go in for cookies and milk',
   'prediction': 'walk home from the bus stop'},
  {'label': 'they go in for cookies and milk',
   'prediction': 'walk home from the bus stop'},
  {'label': 'get cookies and milk',
   'prediction': 'walk home from the bus stop'},
  {'label': 'no', 'prediction': 'Yes'},
  {'label': 'No', 'prediction': 'Yes'},
  {'label': 'right before bedtime', 'prediction': 'every morning'},
  {'label': 'right before bedtime', 'prediction': 'every morning'},
  {'label': 'before bedtime', 'prediction': 'every morning'},
  {'label': 'right before bedtime', 'prediction': 'every day'},
  {'label': 'right before bedtime', 'prediction': 'every day'},
  {'label': 'before bedtime', 'prediction': 'every day'},
  {'label': 'no one answered',
   'prediction': 'she thought something might be wrong'},
  {'label': 'no answer', 'prediction': 'she thought something might be wrong'},
  {'label': "Quinton's mother", 'prediction': 'her mother'},
  {'label': 'yes', 'prediction': 'tomorrow'},
  {'label': 'after lunch', 'prediction': 'tomorrow'},
  {'label': 'New York', 'prediction': 'New Jersey'},
  {'label': 'New York', 'prediction': 'New Jersey'},
  {'label': 'In the southwest of the city', 'prediction': 'Staten Island'},
  {'label': 'the southernmost part of both the city and state of New York',
   'prediction': 'Staten Island'},
  {'label': 'southernmost part of both the city and state of New York',
   'prediction': 'Staten Island'},
  {'label': 'Arthur Kill and the Kill Van Kull',
   'prediction': 'Conference House Park'},
  {'label': 'the Arthur Kill and the Kill Van Kull',
   'prediction': 'Conference House Park'},
  {'label': 'the Arthur Kill and the Kill Van Kull,',
   'prediction': 'Conference House Park'},
  {'label': 'a non-Hispanic White majority', 'prediction': 'White'},
  {'label': 'non-Hispanic White', 'prediction': 'White'},
  {'label': 'a non-Hispanic White majority', 'prediction': 'white'},
  {'label': 'non-Hispanic White', 'prediction': 'white'},
  {'label': 'The North Shore', 'prediction': 'the East Shore'},
  {'label': 'North Shore', 'prediction': 'the East Shore'},
  {'label': 'The North Shore', 'prediction': 'The East Shore'},
  {'label': 'North Shore', 'prediction': 'The East Shore'},
  {'label': 'firefighter', 'prediction': 'Fire'},
  {'label': 'a firefighter', 'prediction': 'Fire'},
  {'label': 'Firefighter', 'prediction': 'Fire'},
  {'label': 'A violent storm', 'prediction': 'Glass, wood, and plaster'},
  {'label': 'Glass, wood, plaster, and maybe the washing machine',
   'prediction': 'Glass, wood, and plaster'},
  {'label': 'no', 'prediction': 'Yes'},
  {'label': 'No', 'prediction': 'Yes'},
  {'label': 'Eppes', 'prediction': 'RJ'},
  {'label': 'The flashlight',
   'prediction': "Light of his father's flashlight"},
  {'label': 'the suspect', 'prediction': 'Giordano'},
  {'label': 'Maryland', 'prediction': 'Gaithersburg'},
  {'label': 'suspect in the recent disappearance of an American woman',
   'prediction': 'being held in an Aruban jail'},
  {'label': 'he is the suspect in the disappearance of an American woman',
   'prediction': 'being held in an Aruban jail'},
  {'label': 'he is a suspect in the recent disappearance of an American woman',
   'prediction': 'being held in an Aruban jail'},
  {'label': 'suspect in the recent disappearance of an American woman',
   'prediction': 'in an Aruban jail'},
  {'label': 'he is the suspect in the disappearance of an American woman',
   'prediction': 'in an Aruban jail'},
  {'label': 'he is a suspect in the recent disappearance of an American woman',
   'prediction': 'in an Aruban jail'},
  {'label': 'suspect in the recent disappearance of an American woman',
   'prediction': 'he is being held in an aruban jail'},
  {'label': 'he is the suspect in the disappearance of an American woman',
   'prediction': 'he is being held in an aruban jail'},
  {'label': 'he is a suspect in the recent disappearance of an American woman',
   'prediction': 'he is being held in an aruban jail'},
  {'label': 'suspect in the recent disappearance of an American woman',
   'prediction': 'he is being held in an Aruban jail'},
  {'label': 'he is the suspect in the disappearance of an American woman',
   'prediction': 'he is being held in an Aruban jail'},
  {'label': 'he is a suspect in the recent disappearance of an American woman',
   'prediction': 'he is being held in an Aruban jail'},
  {'label': 'suspect in the recent disappearance of an American woman',
   'prediction': 'In an Aruba jail'},
  {'label': 'he is the suspect in the disappearance of an American woman',
   'prediction': 'In an Aruba jail'},
  {'label': 'he is a suspect in the recent disappearance of an American woman',
   'prediction': 'In an Aruba jail'},
  {'label': 'suspect in the recent disappearance of an American woman',
   'prediction': 'he is being held in an Aruba jail'},
  {'label': 'he is the suspect in the disappearance of an American woman',
   'prediction': 'he is being held in an Aruba jail'},
  {'label': 'he is a suspect in the recent disappearance of an American woman',
   'prediction': 'he is being held in an Aruba jail'},
  {'label': 'suspect in the recent disappearance of an American woman',
   'prediction': 'He is being held in an Aruban jail'},
  {'label': 'he is the suspect in the disappearance of an American woman',
   'prediction': 'He is being held in an Aruban jail'},
  {'label': 'he is a suspect in the recent disappearance of an American woman',
   'prediction': 'He is being held in an Aruban jail'},
  {'label': 'Aruban Solicitor General Taco Stein', 'prediction': 'Taco Stein'},
  {'label': 'Solicitor General', 'prediction': 'Taco Stein'},
  {'label': 'Robyn Gardne', 'prediction': 'Robyn Gardner'},
  {'label': 'snorkeling', 'prediction': 'swimming'},
  {'label': 'Giordano', 'prediction': 'Gardner'},
  {'label': 'No, Gardner was nowhere to be found', 'prediction': 'no'},
  {'label': 'locals say is not a popular snorkeling spot', 'prediction': 'no'},
  {'label': '50-year-old', 'prediction': '50'},
  {'label': 'Two', 'prediction': 'unknown'},
  {'label': '2, Giordano told authorities that he had been snorkeling with Gardner',
   'prediction': 'unknown'},
  {'label': 'Steam it',
   'prediction': 'it lost nearly all its healthy qualities'},
  {'label': 'They steam it',
   'prediction': 'it lost nearly all its healthy qualities'},
  {'label': 'prune it',
   'prediction': 'it lost nearly all its healthy qualities'},
  {'label': 'steam it',
   'prediction': 'it lost nearly all its healthy qualities'},
  {'label': 'Steam it',
   'prediction': 'they lose nearly all of its healthy qualities'},
  {'label': 'They steam it',
   'prediction': 'they lose nearly all of its healthy qualities'},
  {'label': 'prune it',
   'prediction': 'they lose nearly all of its healthy qualities'},
  {'label': 'steam it',
   'prediction': 'they lose nearly all of its healthy qualities'},
  {'label': 'Steam it',
   'prediction': 'it lost nearly all of its healthy qualities'},
  {'label': 'They steam it',
   'prediction': 'it lost nearly all of its healthy qualities'},
  {'label': 'prune it',
   'prediction': 'it lost nearly all of its healthy qualities'},
  {'label': 'steam it',
   'prediction': 'it lost nearly all of its healthy qualities'},
  {'label': 'Steam it',
   'prediction': 'it loses nearly all its healthy qualities'},
  {'label': 'They steam it',
   'prediction': 'it loses nearly all its healthy qualities'},
  {'label': 'prune it',
   'prediction': 'it loses nearly all its healthy qualities'},
  {'label': 'steam it',
   'prediction': 'it loses nearly all its healthy qualities'},
  {'label': 'Steam it',
   'prediction': 'It loses nearly all of its healthy qualities'},
  {'label': 'They steam it',
   'prediction': 'It loses nearly all of its healthy qualities'},
  {'label': 'prune it',
   'prediction': 'It loses nearly all of its healthy qualities'},
  {'label': 'steam it',
   'prediction': 'It loses nearly all of its healthy qualities'},
  {'label': 'Steam it',
   'prediction': 'it loses nearly all of its healthy qualities'},
  {'label': 'They steam it',
   'prediction': 'it loses nearly all of its healthy qualities'},
  {'label': 'prune it',
   'prediction': 'it loses nearly all of its healthy qualities'},
  {'label': 'steam it',
   'prediction': 'it loses nearly all of its healthy qualities'},
  {'label': 'It may prevent heart disease',
   'prediction': 'it loses nearly all of its healthy qualities'},
  {'label': 'may prevent heart disease',
   'prediction': 'it loses nearly all of its healthy qualities'},
  {'label': 'Prevent heart disease',
   'prediction': 'it loses nearly all of its healthy qualities'},
  {'label': 'may prevent heart disease',
   'prediction': 'it loses nearly all of its healthy qualities'},
  {'label': 'It may prevent heart disease',
   'prediction': "it's healthy qualities"},
  {'label': 'may prevent heart disease',
   'prediction': "it's healthy qualities"},
  {'label': 'Prevent heart disease', 'prediction': "it's healthy qualities"},
  {'label': 'may prevent heart disease',
   'prediction': "it's healthy qualities"},
  {'label': 'Leaves fell into the hot water', 'prediction': 'by accident'},
  {'label': 'unknown', 'prediction': 'bad digestion'},
  {'label': 'bloody', 'prediction': 'it was laid over a patch of sand'},
  {'label': 'bloody',
   'prediction': 'it was laid over a patch of sand and grass'},
  {'label': 'bloody', 'prediction': 'sprawled over a patch of sand'},
  {'label': 'propped up, back to back',
   'prediction': 'soldiers kneeling by a bloody body sprawled over a patch of sand and grass'},
  {'label': 'what appears to be two bodies propped up,',
   'prediction': 'soldiers kneeling by a bloody body sprawled over a patch of sand and grass'},
  {'label': 'two bodies propped up',
   'prediction': 'soldiers kneeling by a bloody body sprawled over a patch of sand and grass'},
  {'label': 'propped up, back to back',
   'prediction': 'soldiers kneeling by a bloody body'},
  {'label': 'what appears to be two bodies propped up,',
   'prediction': 'soldiers kneeling by a bloody body'},
  {'label': 'two bodies propped up',
   'prediction': 'soldiers kneeling by a bloody body'},
  {'label': 'propped up, back to back',
   'prediction': 'two US soldiers kneeling by a bloody body sprawled over a patch of sand and grass'},
  {'label': 'what appears to be two bodies propped up,',
   'prediction': 'two US soldiers kneeling by a bloody body sprawled over a patch of sand and grass'},
  {'label': 'two bodies propped up',
   'prediction': 'two US soldiers kneeling by a bloody body sprawled over a patch of sand and grass'},
  {'label': 'in front of a military vehicle', 'prediction': 'a post'},
  {'label': 'military vehicle', 'prediction': 'a post'},
  {'label': 'a military vehicle', 'prediction': 'a post'},
  {'label': 'souvenirs or trophies',
   'prediction': 'taking or retaining individual souvenirs'},
  {'label': 'Holmes is charged with the premeditated deaths of three civilians',
   'prediction': 'murder'},
  {'label': 'The premeditated deaths of three civilians, possessing a dismembered human finger, wrongfully possessing photographs of human casualties, and smoking hashish',
   'prediction': 'murder'},
  {'label': 'the premeditated deaths of three civilians',
   'prediction': 'murder'}],
 ['yes', 'Yes', 'bad digestion'])

(0.7163120567375887,
 [{'label': 'orange and white', 'prediction': 'orange'},
  {'label': 'orange with white tiger stripes', 'prediction': 'orange'},
  {'label': 'the farmer', 'prediction': "her mommy's"},
  {'label': "the farmer's", 'prediction': "her mommy's"},
  {'label': "the old farmer's", 'prediction': "her mommy's"},
  {'label': 'the farmer', 'prediction': 'her mommy'},
  {'label': "the farmer's", 'prediction': 'her mommy'},
  {'label': "the old farmer's", 'prediction': 'her mommy'},
  {'label': 'rubbed her face', 'prediction': 'they laughed'},
  {'label': 'no', 'prediction': 'yes'},
  {'label': 'No', 'prediction': 'yes'},
  {'label': 'no', 'prediction': 'Yes'},
  {'label': 'No', 'prediction': 'Yes'},
  {'label': "Asta's papa", 'prediction': 'Asta'},
  {'label': 'unknown', 'prediction': 'yes'},
  {'label': 'unknown', 'prediction': 'Yes'},
  {'label': 'yes', 'prediction': 'No'},
  {'label': 'Yes', 'prediction': 'No'},
  {'label': 'next door', 'prediction': 'Shanghai'},
  {'label'

In [36]:
# With answer selection
(0.6879432624113475,
 [{'label': 'orange and white', 'prediction': 'orange'},
  {'label': 'orange with white tiger stripes', 'prediction': 'orange'},
  {'label': 'the farmer', 'prediction': "Cotton's mommy's"},
  {'label': "the farmer's", 'prediction': "Cotton's mommy's"},
  {'label': "the old farmer's", 'prediction': "Cotton's mommy's"},
  {'label': 'the farmer', 'prediction': "Cotton's"},
  {'label': "the farmer's", 'prediction': "Cotton's"},
  {'label': "the old farmer's", 'prediction': "Cotton's"},
  {'label': 'the farmer', 'prediction': "Cotton's mommy"},
  {'label': "the farmer's", 'prediction': "Cotton's mommy"},
  {'label': "the old farmer's", 'prediction': "Cotton's mommy"},
  {'label': 'rubbed her face', 'prediction': 'they laughed'},
  {'label': 'no', 'prediction': 'yes'},
  {'label': 'No', 'prediction': 'yes'},
  {'label': 'no', 'prediction': 'Yes'},
  {'label': 'No', 'prediction': 'Yes'},
  {'label': "Asta's papa", 'prediction': 'Asta'},
  {'label': 'unknown', 'prediction': 'Yes'},
  {'label': 'An elderly Chinese lady and a little boy',
   'prediction': 'a lady'},
  {'label': 'elderly Chinese lady', 'prediction': 'a lady'},
  {'label': 'an elderly Chinese lady', 'prediction': 'a lady'},
  {'label': 'An elderly Chinese lady and a little boy',
   'prediction': 'a Chinese lady'},
  {'label': 'next door', 'prediction': 'Shanghai'},
  {'label': 'mother', 'prediction': 'She is her grandmother'},
  {'label': 'her brother', 'prediction': 'She is her grandmother'},
  {'label': 'she has decided the narrator needs more nutrients',
   'prediction': 'she has been having heart surgery'},
  {'label': 'needs more nutrients before heart surger',
   'prediction': 'she has been having heart surgery'},
  {'label': 'has become an almost-daily practice',
   'prediction': 'she has been having heart surgery'},
  {'label': 'I am having heart surgery soon, so her mother has decided I need more nutrients',
   'prediction': 'she has been having heart surgery'},
  {'label': 'she has decided the narrator needs more nutrients',
   'prediction': 'heart surgery'},
  {'label': 'needs more nutrients before heart surger',
   'prediction': 'heart surgery'},
  {'label': 'has become an almost-daily practice',
   'prediction': 'heart surgery'},
  {'label': 'I am having heart surgery soon, so her mother has decided I need more nutrients',
   'prediction': 'heart surgery'},
  {'label': 'she has decided the narrator needs more nutrients',
   'prediction': 'she has had heart surgery'},
  {'label': 'needs more nutrients before heart surger',
   'prediction': 'she has had heart surgery'},
  {'label': 'has become an almost-daily practice',
   'prediction': 'she has had heart surgery'},
  {'label': 'I am having heart surgery soon, so her mother has decided I need more nutrients',
   'prediction': 'she has had heart surgery'},
  {'label': 'she has decided the narrator needs more nutrients',
   'prediction': 'she has heart surgery'},
  {'label': 'needs more nutrients before heart surger',
   'prediction': 'she has heart surgery'},
  {'label': 'has become an almost-daily practice',
   'prediction': 'she has heart surgery'},
  {'label': 'I am having heart surgery soon, so her mother has decided I need more nutrients',
   'prediction': 'she has heart surgery'},
  {'label': 'she has decided the narrator needs more nutrients',
   'prediction': 'she was having heart surgery'},
  {'label': 'needs more nutrients before heart surger',
   'prediction': 'she was having heart surgery'},
  {'label': 'has become an almost-daily practice',
   'prediction': 'she was having heart surgery'},
  {'label': 'I am having heart surgery soon, so her mother has decided I need more nutrients',
   'prediction': 'she was having heart surgery'},
  {'label': 'an iPad', 'prediction': 'communication between us'},
  {'label': 'rice, vegetables and either chicken, meat or shrimp, sometimes with a kind of pancake',
   'prediction': 'food'},
  {'label': 'hot soup and a container with rice, vegetables and either chicken, meat or shrimp, sometimes with a kind of pancake',
   'prediction': 'food'},
  {'label': 'hot soup, rice, vegetables, chicken, meat, or shrimp, sometimes with a kind of pancake',
   'prediction': 'food'},
  {'label': 'soup and a stainless-steel container with rice, vegetables and either chicken, meat or shrimp, sometimes with a kind of pancake',
   'prediction': 'food'},
  {'label': 'Nicole',
   'prediction': 'I point to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was all right'},
  {'label': 'I am now working on some more Chinese words',
   'prediction': 'I point to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was all right'},
  {'label': 'use the iPad',
   'prediction': 'I point to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was all right'},
  {'label': 'go with her to her house',
   'prediction': 'I point to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was all right'},
  {'label': 'Nicole',
   'prediction': 'I pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty I am not used to iPads, so she indicated I should go wit'},
  {'label': 'I am now working on some more Chinese words',
   'prediction': 'I pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty I am not used to iPads, so she indicated I should go wit'},
  {'label': 'use the iPad',
   'prediction': 'I pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty I am not used to iPads, so she indicated I should go wit'},
  {'label': 'go with her to her house',
   'prediction': 'I pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty I am not used to iPads, so she indicated I should go wit'},
  {'label': 'Nicole',
   'prediction': 'she points to the screen, which displayed a message from her daughter telling her mother that her mother wanted to know if the food was all right and whether it was too salty I am not used to iPads, so I insisted I should g'},
  {'label': 'I am now working on some more Chinese words',
   'prediction': 'she points to the screen, which displayed a message from her daughter telling her mother that her mother wanted to know if the food was all right and whether it was too salty I am not used to iPads, so I insisted I should g'},
  {'label': 'use the iPad',
   'prediction': 'she points to the screen, which displayed a message from her daughter telling her mother that her mother wanted to know if the food was all right and whether it was too salty I am not used to iPads, so I insisted I should g'},
  {'label': 'go with her to her house',
   'prediction': 'she points to the screen, which displayed a message from her daughter telling her mother that her mother wanted to know if the food was all right and whether it was too salty I am not used to iPads, so I insisted I should g'},
  {'label': 'Nicole',
   'prediction': 'she pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'I am now working on some more Chinese words',
   'prediction': 'she pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'use the iPad',
   'prediction': 'she pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'go with her to her house',
   'prediction': 'she pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'Nicole',
   'prediction': 'I pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'I am now working on some more Chinese words',
   'prediction': 'I pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'use the iPad',
   'prediction': 'I pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'go with her to her house',
   'prediction': 'I pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'Nicole',
   'prediction': 'she pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'I am now working on some more Chinese words',
   'prediction': 'she pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'use the iPad',
   'prediction': 'she pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'go with her to her house',
   'prediction': 'she pointed to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'Nicole',
   'prediction': 'she points to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'I am now working on some more Chinese words',
   'prediction': 'she points to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'use the iPad',
   'prediction': 'she points to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'go with her to her house',
   'prediction': 'she points to the screen, which displayed a message from her daughter telling me that her mother wanted to know if the food was all right and whether it was too salty'},
  {'label': 'cop-turned-actor', 'prediction': 'dapper'},
  {'label': 'Actor', 'prediction': 'dapper'},
  {'label': 'he was an actor', 'prediction': 'dapper'},
  {'label': 'Farina was cast in a film',
   'prediction': 'he joined the cast of the long-running Law & Order'},
  {'label': 'he got into acting',
   'prediction': 'he joined the cast of the long-running Law & Order'},
  {'label': 'got into acting',
   'prediction': 'he joined the cast of the long-running Law & Order'},
  {'label': 'he was a consultant',
   'prediction': 'he joined the cast of the long-running Law & Order'},
  {'label': 'Farina was cast in a film',
   'prediction': 'He joined the cast of the long-running Law & Order'},
  {'label': 'he got into acting',
   'prediction': 'He joined the cast of the long-running Law & Order'},
  {'label': 'got into acting',
   'prediction': 'He joined the cast of the long-running Law & Order'},
  {'label': 'he was a consultant',
   'prediction': 'He joined the cast of the long-running Law & Order'},
  {'label': 'Farina was cast in a film',
   'prediction': 'He was on Law & Order'},
  {'label': 'he got into acting', 'prediction': 'He was on Law & Order'},
  {'label': 'got into acting', 'prediction': 'He was on Law & Order'},
  {'label': 'he was a consultant', 'prediction': 'He was on Law & Order'},
  {'label': 'He joined a TV show cast',
   'prediction': 'he joined the cast of the long-running Law & Order'},
  {'label': 'Detective Joe Fontana', 'prediction': 'Jerry Orbach'},
  {'label': 'no', 'prediction': 'yes'},
  {'label': 'No', 'prediction': 'yes'},
  {'label': 'no', 'prediction': 'Yes'},
  {'label': 'No', 'prediction': 'Yes'},
  {'label': 'An expensive car', 'prediction': 'a car'},
  {'label': 'an expensive car', 'prediction': 'a car'},
  {'label': 'flashy clothes and an expensive car', 'prediction': 'a car'},
  {'label': 'no', 'prediction': 'Yes'},
  {'label': 'No', 'prediction': 'Yes'},
  {'label': 'school', 'prediction': 'to and from school'},
  {'label': 'no', 'prediction': 'yes'},
  {'label': 'No', 'prediction': 'yes'},
  {'label': "go to Quentin's house", 'prediction': 'walk to the bus stop'},
  {'label': 'go in for cookies and milk',
   'prediction': 'walk to the bus stop'},
  {'label': 'they go in for cookies and milk',
   'prediction': 'walk to the bus stop'},
  {'label': 'get cookies and milk', 'prediction': 'walk to the bus stop'},
  {'label': "go to Quentin's house",
   'prediction': 'walk home from the bus stop'},
  {'label': 'go in for cookies and milk',
   'prediction': 'walk home from the bus stop'},
  {'label': 'they go in for cookies and milk',
   'prediction': 'walk home from the bus stop'},
  {'label': 'get cookies and milk',
   'prediction': 'walk home from the bus stop'},
  {'label': "go to Quentin's house",
   'prediction': 'walk to and from the bus stop'},
  {'label': 'go in for cookies and milk',
   'prediction': 'walk to and from the bus stop'},
  {'label': 'they go in for cookies and milk',
   'prediction': 'walk to and from the bus stop'},
  {'label': 'get cookies and milk',
   'prediction': 'walk to and from the bus stop'},
  {'label': 'no', 'prediction': 'Yes'},
  {'label': 'No', 'prediction': 'Yes'},
  {'label': 'no one answered', 'prediction': 'she saw something was wrong'},
  {'label': 'no answer', 'prediction': 'she saw something was wrong'},
  {'label': 'no one answered',
   'prediction': 'she thought something might be wrong'},
  {'label': 'no answer', 'prediction': 'she thought something might be wrong'},
  {'label': 'no', 'prediction': 'Yes'},
  {'label': 'No', 'prediction': 'Yes'},
  {'label': "Quinton's mother", 'prediction': 'the bus driver'},
  {'label': 'yes', 'prediction': 'Tomorrow'},
  {'label': 'after lunch', 'prediction': 'Tomorrow'},
  {'label': 'yes', 'prediction': 'tomorrow'},
  {'label': 'after lunch', 'prediction': 'tomorrow'},
  {'label': 'New York', 'prediction': 'New Jersey'},
  {'label': 'New York', 'prediction': 'New Jersey'},
  {'label': 'In the southwest of the city', 'prediction': 'in the southwest'},
  {'label': 'the southernmost part of both the city and state of New York',
   'prediction': 'in the southwest'},
  {'label': 'southernmost part of both the city and state of New York',
   'prediction': 'in the southwest'},
  {'label': 'Arthur Kill and the Kill Van Kull',
   'prediction': 'Conference House Park'},
  {'label': 'the Arthur Kill and the Kill Van Kull',
   'prediction': 'Conference House Park'},
  {'label': 'the Arthur Kill and the Kill Van Kull,',
   'prediction': 'Conference House Park'},
  {'label': 'no', 'prediction': 'Yes'},
  {'label': 'a non-Hispanic White majority', 'prediction': 'White'},
  {'label': 'non-Hispanic White', 'prediction': 'White'},
  {'label': 'A violent storm', 'prediction': 'Glass, wood, and plaster'},
  {'label': 'Glass, wood, plaster, and maybe the washing machine',
   'prediction': 'Glass, wood, and plaster'},
  {'label': 'no', 'prediction': 'Yes'},
  {'label': 'No', 'prediction': 'Yes'},
  {'label': 'The flashlight',
   'prediction': "Light of his father's flashlight"},
  {'label': 'the suspect', 'prediction': 'Gary Giordano'},
  {'label': 'Maryland', 'prediction': 'Gaithersburg'},
  {'label': 'Montgomery County', 'prediction': 'Montgomery'},
  {'label': 'Gaithersburg', 'prediction': 'Montgomery'},
  {'label': 'Maryland', 'prediction': 'Alabama'},
  {'label': 'suspect in the recent disappearance of an American woman',
   'prediction': 'being held in an Aruban jail'},
  {'label': 'he is the suspect in the disappearance of an American woman',
   'prediction': 'being held in an Aruban jail'},
  {'label': 'he is a suspect in the recent disappearance of an American woman',
   'prediction': 'being held in an Aruban jail'},
  {'label': 'suspect in the recent disappearance of an American woman',
   'prediction': 'He is being held in an jail'},
  {'label': 'he is the suspect in the disappearance of an American woman',
   'prediction': 'He is being held in an jail'},
  {'label': 'he is a suspect in the recent disappearance of an American woman',
   'prediction': 'He is being held in an jail'},
  {'label': 'suspect in the recent disappearance of an American woman',
   'prediction': 'he is being held in an aruban jail'},
  {'label': 'he is the suspect in the disappearance of an American woman',
   'prediction': 'he is being held in an aruban jail'},
  {'label': 'he is a suspect in the recent disappearance of an American woman',
   'prediction': 'he is being held in an aruban jail'},
  {'label': 'suspect in the recent disappearance of an American woman',
   'prediction': 'he is being held in an Aruban jail'},
  {'label': 'he is the suspect in the disappearance of an American woman',
   'prediction': 'he is being held in an Aruban jail'},
  {'label': 'he is a suspect in the recent disappearance of an American woman',
   'prediction': 'he is being held in an Aruban jail'},
  {'label': 'suspect in the recent disappearance of an American woman',
   'prediction': 'he is being held in an jail'},
  {'label': 'he is the suspect in the disappearance of an American woman',
   'prediction': 'he is being held in an jail'},
  {'label': 'he is a suspect in the recent disappearance of an American woman',
   'prediction': 'he is being held in an jail'},
  {'label': 'suspect in the recent disappearance of an American woman',
   'prediction': 'he is currently being held in an Aruban jail'},
  {'label': 'he is the suspect in the disappearance of an American woman',
   'prediction': 'he is currently being held in an Aruban jail'},
  {'label': 'he is a suspect in the recent disappearance of an American woman',
   'prediction': 'he is currently being held in an Aruban jail'},
  {'label': 'suspect in the recent disappearance of an American woman',
   'prediction': 'He is being held in an Aruban jail'},
  {'label': 'he is the suspect in the disappearance of an American woman',
   'prediction': 'He is being held in an Aruban jail'},
  {'label': 'he is a suspect in the recent disappearance of an American woman',
   'prediction': 'He is being held in an Aruban jail'},
  {'label': 'Aruban Solicitor General Taco Stein', 'prediction': 'Taco Stein'},
  {'label': 'Solicitor General', 'prediction': 'Taco Stein'},
  {'label': 'at least eight more days', 'prediction': 'eight'},
  {'label': 'eight more days', 'prediction': 'eight'},
  {'label': 'Robyn Gardne', 'prediction': 'Robyn Gardner'},
  {'label': 'snorkeling', 'prediction': 'snorkeling with Gardner'},
  {'label': 'Giordano', 'prediction': 'Gardner'},
  {'label': 'No, Gardner was nowhere to be found', 'prediction': 'no'},
  {'label': '50-year-old', 'prediction': '50'},
  {'label': 'may be 30 feet tall', 'prediction': 'three or four feet'},
  {'label': '30 feet tall', 'prediction': 'three or four feet'},
  {'label': '30 feet tall', 'prediction': 'three or four feet'},
  {'label': 'may be 30 feet tall', 'prediction': 'Three or four feet'},
  {'label': '30 feet tall', 'prediction': 'Three or four feet'},
  {'label': '30 feet tall', 'prediction': 'Three or four feet'},
  {'label': 'may be 30 feet tall', 'prediction': 'Three or four feet tall'},
  {'label': '30 feet tall', 'prediction': 'Three or four feet tall'},
  {'label': '30 feet tall', 'prediction': 'Three or four feet tall'},
  {'label': 'may be 30 feet tall', 'prediction': 'Three or four feet tall'},
  {'label': '30 feet tall', 'prediction': 'Three or four feet tall'},
  {'label': '30 feet tall', 'prediction': 'Three or four feet tall'},
  {'label': 'may be 30 feet tall', 'prediction': 'three or four feet tall'},
  {'label': '30 feet tall', 'prediction': 'three or four feet tall'},
  {'label': '30 feet tall', 'prediction': 'three or four feet tall'},
  {'label': 'Steam it', 'prediction': 'they added it to the steamed'},
  {'label': 'They steam it', 'prediction': 'they added it to the steamed'},
  {'label': 'prune it', 'prediction': 'they added it to the steamed'},
  {'label': 'steam it', 'prediction': 'they added it to the steamed'},
  {'label': 'Steam it', 'prediction': 'steamed it'},
  {'label': 'They steam it', 'prediction': 'steamed it'},
  {'label': 'prune it', 'prediction': 'steamed it'},
  {'label': 'steam it', 'prediction': 'steamed it'},
  {'label': 'Leaves fell into the hot water',
   'prediction': 'by steamed leaves'},
  {'label': 'By accident', 'prediction': 'by steamed leaves'},
  {'label': 'Leaves from a wild tea tree fell into a hot water pot',
   'prediction': 'by steamed leaves'},
  {'label': 'by accident', 'prediction': 'by steamed leaves'},
  {'label': 'Leaves fell into the hot water',
   'prediction': 'steamed right after the leaves are picked'},
  {'label': 'By accident',
   'prediction': 'steamed right after the leaves are picked'},
  {'label': 'Leaves from a wild tea tree fell into a hot water pot',
   'prediction': 'steamed right after the leaves are picked'},
  {'label': 'by accident',
   'prediction': 'steamed right after the leaves are picked'},
  {'label': 'Leaves fell into the hot water',
   'prediction': 'steamed right after the leaves are picked'},
  {'label': 'By accident',
   'prediction': 'steamed right after the leaves are picked'},
  {'label': 'Leaves from a wild tea tree fell into a hot water pot',
   'prediction': 'steamed right after the leaves are picked'},
  {'label': 'by accident',
   'prediction': 'steamed right after the leaves are picked'},
  {'label': 'Leaves fell into the hot water',
   'prediction': 'steamed right after the leaves were picked'},
  {'label': 'By accident',
   'prediction': 'steamed right after the leaves were picked'},
  {'label': 'Leaves from a wild tea tree fell into a hot water pot',
   'prediction': 'steamed right after the leaves were picked'},
  {'label': 'by accident',
   'prediction': 'steamed right after the leaves were picked'},
  {'label': 'Leaves fell into the hot water',
   'prediction': 'steamed right after the leaves were picked'},
  {'label': 'By accident',
   'prediction': 'steamed right after the leaves were picked'},
  {'label': 'Leaves from a wild tea tree fell into a hot water pot',
   'prediction': 'steamed right after the leaves were picked'},
  {'label': 'by accident',
   'prediction': 'steamed right after the leaves were picked'},
  {'label': 'unknown', 'prediction': 'he drank it all'},
  {'label': 'propped up, back to back',
   'prediction': 'soldiers kneeling by a bloody body sprawled over a patch of sand and grass'},
  {'label': 'what appears to be two bodies propped up,',
   'prediction': 'soldiers kneeling by a bloody body sprawled over a patch of sand and grass'},
  {'label': 'two bodies propped up',
   'prediction': 'soldiers kneeling by a bloody body sprawled over a patch of sand and grass'},
  {'label': 'propped up, back to back',
   'prediction': 'a soldier kneeling by a bloody body'},
  {'label': 'what appears to be two bodies propped up,',
   'prediction': 'a soldier kneeling by a bloody body'},
  {'label': 'two bodies propped up',
   'prediction': 'a soldier kneeling by a bloody body'},
  {'label': 'propped up, back to back',
   'prediction': 'soldiers kneeling by a bloody body'},
  {'label': 'what appears to be two bodies propped up,',
   'prediction': 'soldiers kneeling by a bloody body'},
  {'label': 'two bodies propped up',
   'prediction': 'soldiers kneeling by a bloody body'},
  {'label': 'propped up, back to back',
   'prediction': 'two US soldiers kneeling by a bloody body sprawled over a patch of sand and grass'},
  {'label': 'what appears to be two bodies propped up,',
   'prediction': 'two US soldiers kneeling by a bloody body sprawled over a patch of sand and grass'},
  {'label': 'two bodies propped up',
   'prediction': 'two US soldiers kneeling by a bloody body sprawled over a patch of sand and grass'},
  {'label': 'in front of a military vehicle', 'prediction': 'a post'},
  {'label': 'military vehicle', 'prediction': 'a post'},
  {'label': 'a military vehicle', 'prediction': 'a post'},
  {'label': 'souvenirs or trophies',
   'prediction': 'taking or retaining personal souvenirs'},
  {'label': 'Holmes is charged with the premeditated deaths of three civilians',
   'prediction': 'murder'},
  {'label': 'The premeditated deaths of three civilians, possessing a dismembered human finger, wrongfully possessing photographs of human casualties, and smoking hashish',
   'prediction': 'murder'},
  {'label': 'the premeditated deaths of three civilians',
   'prediction': 'murder'}],
 ['Yes', 'he drank it all'])

(0.6879432624113475,
 [{'label': 'orange and white', 'prediction': 'orange'},
  {'label': 'orange with white tiger stripes', 'prediction': 'orange'},
  {'label': 'the farmer', 'prediction': "Cotton's mommy's"},
  {'label': "the farmer's", 'prediction': "Cotton's mommy's"},
  {'label': "the old farmer's", 'prediction': "Cotton's mommy's"},
  {'label': 'the farmer', 'prediction': "Cotton's"},
  {'label': "the farmer's", 'prediction': "Cotton's"},
  {'label': "the old farmer's", 'prediction': "Cotton's"},
  {'label': 'the farmer', 'prediction': "Cotton's mommy"},
  {'label': "the farmer's", 'prediction': "Cotton's mommy"},
  {'label': "the old farmer's", 'prediction': "Cotton's mommy"},
  {'label': 'rubbed her face', 'prediction': 'they laughed'},
  {'label': 'no', 'prediction': 'yes'},
  {'label': 'No', 'prediction': 'yes'},
  {'label': 'no', 'prediction': 'Yes'},
  {'label': 'No', 'prediction': 'Yes'},
  {'label': "Asta's papa", 'prediction': 'Asta'},
  {'label': 'unknown', 'prediction'

## Repeat the test with save number 7 <= there was an error with the evaluation function

## Todo

* properly train FEVER. Why is it so good on fever dev data but louse on coqa? (leakage?)

* Is the FEVER system skewed to say Y? Should you have a threshold for the "correct" answer?

* maybe you can add some syntethic corereference to the training data for the statements