# Evaluating the models for the OWQA task
---



>The next cell contains functions adapted from the official evaluation script for the SQuAD 1.1 dataset.
>Taken from:
>https://github.com/abisee/cs224n-win18-squad/blob/master/code/evaluate.py


In [6]:
############################################ START ##############################################
##### Taken from: https://github.com/abisee/cs224n-win18-squad/blob/master/code/evaluate.py #####

from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys
%matplotlib inline
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and
# Matplotlib 3.8 removed the old name, so use whichever one is installed.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')


def normalize_answer(s):
    """Return *s* lower-cased with every run of whitespace collapsed to one space.

    Leading and trailing whitespace is dropped as part of the collapse.
    Punctuation and articles are left untouched.
    """
    lowered = s.lower()
    words = lowered.split()
    return ' '.join(words)


def topn_score(predictions, ground_truth):
    """Return 1 if any prediction equals the ground truth after normalisation, else 0.

    Both sides of each comparison go through ``normalize_answer``. An empty
    ``predictions`` iterable yields 0.
    """
    matched = any(
        normalize_answer(candidate) == normalize_answer(ground_truth)
        for candidate in predictions
    )
    return 1 if matched else 0

def em_score(prediction, ground_truth):
    """Return True when prediction and ground truth are equal after ``normalize_answer``."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth


##### Taken from: https://github.com/abisee/cs224n-win18-squad/blob/master/code/evaluate.py #####
############################################# END ##############################################

# My Evaluation Functions
----


### Function to evaluate the model performance on EM, Top-2 and Top-3 scores

In [None]:
def _in_context_tokens(context, positions):
    """Return the tokens of *context* at *positions*, skipping positions past its end."""
    return [context[pos] for pos in positions if pos < len(context)]


def evaluate(data):
    """Compute exact-match, top-2 and top-3 scores (in percent) for one data split.

    Three parallel, per-example sequences are read from ``data``:

    * ``data['start_wordloc'][i]`` -- the index of its largest entry is used as
      the gold token position (presumably a one-hot vector over context
      positions -- TODO confirm against the caller).
    * ``data['start_pred'][i]`` -- one score per position; the three
      highest-scoring positions are the ranked candidates.
    * ``data['contexts_tokens'][i]`` -- the tokens those positions index into.

    A candidate position that lies beyond the end of its context cannot be
    looked up and therefore never counts as a hit. Each rank is checked on its
    own, so an out-of-range second or third candidate does not invalidate an
    in-range, correct first candidate.

    Returns:
        dict with keys ``'em'``, ``'top2'`` and ``'top3'``; each value is the
        percentage of all examples whose gold token is matched by the best
        candidate, by one of the best two, or by one of the best three.
        All three are ``0.0`` for an empty split.
    """
    # Local import so the function does not rely on the notebook having
    # imported numpy under the name ``np`` beforehand.
    import numpy as np

    total = len(data['start_wordloc'])
    if total == 0:
        # Nothing to score; avoid dividing by zero below.
        return {'em': 0.0, 'top2': 0.0, 'top3': 0.0}

    em = top2 = top3 = 0
    for i in range(total):
        context = data['contexts_tokens'][i]
        ground_truth = context[np.argmax(data['start_wordloc'][i])]

        # Sort the scores once; reversing the ascending order puts the best
        # position first, and the slice keeps at most three candidates.
        ranked = np.argsort(data['start_pred'][i])[::-1][:3]

        best = _in_context_tokens(context, ranked[:1])
        if best:
            em += em_score(best[0], ground_truth)
        top2 += topn_score(_in_context_tokens(context, ranked[:2]), ground_truth)
        top3 += topn_score(_in_context_tokens(context, ranked[:3]), ground_truth)

    return {
        'em': 100.0 * em / total,
        'top2': 100.0 * top2 / total,
        'top3': 100.0 * top3 / total,
    }

In [None]:
def print_eval(model, evaluation):
    """Print one indented ``name  =  value`` line per metric.

    Names come from ``model.metrics_names`` and are paired positionally with
    the entries of ``evaluation``; pairing stops at the shorter of the two.
    """
    indent = ' ' * 25
    for metric_name, metric_value in zip(model.metrics_names, evaluation):
        print(indent, metric_name, ' = ', metric_value)

---
### Main Function called in the models

Pretty-prints the training plots and the evaluation results for the training and development sets.

In [None]:
def _print_banner(title, pad, leading_blank=False):
    """Print *title*, indented by *pad* spaces, between two 110-character rules."""
    rule = '_' * 110
    print('\n' + rule if leading_blank else rule, '\n')
    print(' ' * pad, title)
    print(rule, '\n')


def _print_scores(scores):
    """Print the EM / Top2-EM / Top3-EM values of a score dict on one line."""
    print(' '*12, 'EM: ', scores['em'], ';   Top2-EM: ', scores['top2'],
          ';   Top3-EM: ', scores['top3'], '\n')


def evaluation_main(train_data, dev_data, model, history_dict):
    """Plot the training history and print the evaluation of both data splits.

    Args:
        train_data: training split; its 'contexts', 'questions' and
            'start_wordloc' entries are passed to ``model.evaluate`` and the
            whole dict is passed to ``evaluate``.
        dev_data: development split, used the same way as ``train_data``.
        model: object providing ``evaluate(inputs, targets, verbose=...)`` and
            ``metrics_names`` (presumably a compiled Keras model -- TODO confirm).
        history_dict: training history handed to ``plot_loss`` and ``plot_acc``.

    Returns:
        None. All results are printed or shown as plots.
    """
    train_evaluation = model.evaluate([train_data['contexts'], train_data['questions']],
                                      train_data['start_wordloc'], verbose=1)
    dev_evaluation = model.evaluate([dev_data['contexts'], dev_data['questions']],
                                    dev_data['start_wordloc'], verbose=1)

    train_scores = evaluate(train_data)
    dev_scores = evaluate(dev_data)

    _print_banner('TRAINING LOSS AND ACCURACY PLOTS', 37, leading_blank=True)
    # The file defines no ``plot`` function; draw the loss and accuracy curves
    # with the two plotting helpers that do exist.
    plot_loss(history_dict)
    plot_acc(history_dict)

    _print_banner('EVALUATION ON TRAINING SET', 40)
    _print_scores(train_scores)
    print_eval(model, train_evaluation)

    _print_banner('EVALUATION ON DEVELOPMENT SET', 39)
    _print_scores(dev_scores)
    print_eval(model, dev_evaluation)
    

# Plots
---




### Plotting Predictions

In [None]:
def plot_predictions(output):
    """Show a small line chart of the values in *output* over 1-based token positions."""
    token_positions = range(1, len(output) + 1)  # x axis starts at 1, not 0

    plt.figure(figsize=(5, 3))
    plt.plot(token_positions, output)

    # Axis decoration; these calls are independent of one another.
    plt.grid(True)
    plt.ylabel('Softmax Output')
    plt.xlabel('Tokens')
    plt.show()

---

### Plotting One-word Answers Loss/Accuracy

In [None]:
def plot_loss(history_dict):
    """Plot the training and validation loss curves against the epoch number.

    Reads ``history_dict['loss']`` and ``history_dict['val_loss']``; the epoch
    axis is 1-based and sized by the training-loss series.
    """
    # (values, colour, legend label) for each curve, in drawing order.
    curves = (
        (history_dict['loss'], 'coral', 'Training loss'),
        (history_dict['val_loss'], 'purple', 'Validation loss'),
    )
    epochs = range(1, len(curves[0][0]) + 1)

    for values, colour, legend_label in curves:
        plt.plot(epochs, values, colour, label=legend_label)

    plt.title("Training and validation loss")
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    plt.show()
    
def plot_acc(history_dict):
    """Plot the training and validation accuracy curves against the epoch number.

    The series are read from ``history_dict['acc']`` and
    ``history_dict['val_acc']``. When one of those keys is absent, the
    spelled-out ``'accuracy'`` / ``'val_accuracy'`` key is used instead, so
    histories recorded under either naming can be plotted.

    Raises:
        KeyError: if neither spelling of a required key is present.
    """
    def pick(primary, alternative):
        # Prefer the original key; fall back to the long-form name.
        if primary in history_dict:
            return history_dict[primary]
        if alternative in history_dict:
            return history_dict[alternative]
        raise KeyError(primary)

    train_acc = pick('acc', 'accuracy')
    val_acc = pick('val_acc', 'val_accuracy')

    eps = range(1, len(train_acc) + 1)  # 1-based epoch axis

    plt.plot(eps, train_acc, 'coral', label='Training acc')
    plt.plot(eps, val_acc, 'purple', label='Validation acc')
    plt.title('Training and validation acc')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.grid(True)
    plt.show()