# Evaluating the models for the QA task
---



>The next cell contains functions from the official evaluation script for the SQuAD v1.1 dataset.
>Taken from:
>https://github.com/abisee/cs224n-win18-squad/blob/master/code/evaluate.py



In [6]:

############################################ START ##############################################
##### Taken from: https://github.com/abisee/cs224n-win18-squad/blob/master/code/evaluate.py #####

from __future__ import print_function

import argparse
import json
import re
import string
import sys
from collections import Counter

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and
# 3.8 removed the old names, so use the legacy name only where it still exists.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')


def normalize_answer(s):
    """Normalise an answer string for comparison.

    The text is lower-cased, stripped of every character listed in
    ``string.punctuation``, stripped of the standalone words "a", "an" and
    "the", and finally has all runs of whitespace collapsed to single spaces
    (leading/trailing whitespace is removed as well).
    """
    punctuation = set(string.punctuation)

    lowered = s.lower()
    without_punctuation = ''.join(
        char for char in lowered if char not in punctuation
    )
    # Articles are replaced by a space so neighbouring words stay separated;
    # the extra whitespace is squeezed out in the final step.
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation)
    return ' '.join(without_articles.split())


def f1_score(prediction, ground_truth):
    """Return the token-level F1 between a prediction and a ground truth.

    Both strings are normalised with ``normalize_answer`` and split on
    whitespace. The overlap is counted as a multiset intersection, so a
    repeated token is only credited as often as it occurs in both strings.
    Returns 0 when no token is shared.
    """
    predicted_bag = Counter(normalize_answer(prediction).split())
    truth_bag = Counter(normalize_answer(ground_truth).split())

    overlap = sum((predicted_bag & truth_bag).values())
    if not overlap:
        return 0

    # Summing a bag's counts gives the number of tokens in that string.
    precision = 1.0 * overlap / sum(predicted_bag.values())
    recall = 1.0 * overlap / sum(truth_bag.values())
    return (2 * precision * recall) / (precision + recall)


def em_score(prediction, ground_truth):
    """Return True when both strings are equal after ``normalize_answer``."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth


##### Taken from: https://github.com/abisee/cs224n-win18-squad/blob/master/code/evaluate.py #####
############################################# END ##############################################

# My Evaluation Functions
----


### Function to evaluate the model performance on F1 and EM scores

In [None]:
def evaluate(data):
    """Compute exact-match and F1 percentages for predicted answer spans.

    Args:
        data: mapping that is read through these keys, all indexed by
            example number:
            - 'answers': only its length is used (number of examples);
            - 'contexts_tokens': the token sequence of each context;
            - 'start_wordloc' / 'end_wordloc': ground-truth span locations;
            - 'start_pred' / 'end_pred': predicted span locations.
            The four location entries are reduced with ``np.argmax``, so each
            row is presumably a one-hot / probability vector over token
            positions -- TODO confirm against the caller.

    Returns:
        dict with keys 'em' and 'f1', each a percentage averaged over all
        examples. Both are 0.0 when ``data['answers']`` is empty.
    """
    total = len(data['answers'])
    if total == 0:
        # Nothing to score; avoids a ZeroDivisionError when averaging.
        return {'em': 0.0, 'f1': 0.0}

    em = f1 = 0
    for i in range(total):
        tokens = data['contexts_tokens'][i]
        n_tokens = len(tokens)

        # NOTE(review): the slice excludes the token at the end index --
        # confirm that end locations are meant to be exclusive.
        start_truth = np.argmax(data['start_wordloc'][i])
        end_truth = np.argmax(data['end_wordloc'][i])
        ground_truth = " ".join(tokens[start_truth:end_truth])

        start_pred = np.argmax(data['start_pred'][i])
        end_pred = np.argmax(data['end_pred'][i])

        # Only score a prediction whose start AND end both fall inside this
        # context; anything else contributes zero to both metrics.
        if start_pred < n_tokens and end_pred < n_tokens:
            prediction = " ".join(tokens[start_pred:end_pred])
            em += em_score(prediction, ground_truth)
            f1 += f1_score(prediction, ground_truth)

    return {'em': 100.0 * em / total, 'f1': 100.0 * f1 / total}

def print_eval(model, evaluation):
    """Print each metric name of ``model`` next to its evaluated value.

    Names come from ``model.metrics_names`` and are paired positionally with
    the entries of ``evaluation``; pairing stops at the shorter of the two.
    """
    indent = ' ' * 25
    named_values = zip(model.metrics_names, evaluation)
    for metric_name, metric_value in named_values:
        print(indent, metric_name, ' = ', metric_value)

---
### Main Function called in the models

Pretty prints all the model results and performance.

In [None]:
def evaluation_main(train_data, dev_data, model, history_dict):
    """Evaluate ``model`` on both data splits and print a formatted report.

    For each split the model's own ``evaluate`` method is run on the
    'contexts' / 'questions' inputs against the 'start_wordloc' /
    'end_wordloc' targets, and the EM / F1 percentages are computed with
    ``evaluate``. The report shows the training-history plots first (via
    ``plot_multioutput``), then one section per split.

    Args:
        train_data: mapping holding the training split.
        dev_data: mapping holding the development split.
        model: object exposing ``evaluate(inputs, targets, verbose=...)`` and
            ``metrics_names``.
        history_dict: training history passed on to ``plot_multioutput``.
    """
    def model_metrics(split):
        # Metrics reported by the model itself for one split.
        return model.evaluate([split['contexts'], split['questions']],
                              [split['start_wordloc'], split['end_wordloc']],
                              verbose=1)

    train_evaluation = model_metrics(train_data)
    dev_evaluation = model_metrics(dev_data)

    train_scores = evaluate(train_data)
    dev_scores = evaluate(dev_data)

    rule = '_' * 110

    print('\n' + rule, '\n')
    print(' ' * 37, 'TRAINING LOSS AND ACCURACY PLOTS')
    print(rule, '\n')
    plot_multioutput(history_dict)

    # (title padding, title, EM/F1 scores, model metrics) for each section.
    sections = (
        (' ' * 40, 'EVALUATION ON TRAINING SET', train_scores, train_evaluation),
        (' ' * 39, 'EVALUATION ON DEVELOPMENT SET', dev_scores, dev_evaluation),
    )
    for padding, title, scores, metrics in sections:
        print(rule, '\n')
        print(padding, title)
        print(rule, '\n')
        print(' ' * 25, 'EM: ', scores['em'], ';     F1: ', scores['f1'], '\n')
        print_eval(model, metrics)
    

# Plots
---




### Plotting the Softmax Activation of the Predictions

In [None]:
def plot_predictions(output):
    """Show a line plot of ``output`` against 1-based positions.

    The x axis is labelled 'Tokens' and the y axis 'Softmax Output'.
    """
    positions = range(1, len(output) + 1)

    plt.figure(figsize=(5, 3))
    plt.plot(positions, output)

    plt.grid(True)
    plt.xlabel('Tokens')
    plt.ylabel('Softmax Output')

    plt.show()

---

### Plotting Multi-Output Loss/Accuracy

In [None]:
def plot_multioutput(history_dict):
    """Plot start/end loss and accuracy curves side by side.

    Reads the per-epoch series 'loss', '(val_)start_loss', '(val_)end_loss',
    '(val_)start_acc' and '(val_)end_acc' from ``history_dict``. 'loss' is
    only used to determine the number of epochs on the x axis. The left
    subplot shows the four loss curves, the right one the four accuracy
    curves.
    """
    fig = plt.figure()
    fig.set_figheight(5)
    fig.set_figwidth(16)

    loss_axes = fig.add_subplot(1, 2, 1)
    acc_axes = fig.add_subplot(1, 2, 2)

    epochs = range(1, len(history_dict['loss']) + 1)

    # Each entry: (series values, line colour, legend label).
    loss_curves = [
        (history_dict['start_loss'], 'deepskyblue', 'Start Training loss'),
        (history_dict['end_loss'], 'dodgerblue', 'End Training loss'),
        (history_dict['val_start_loss'], 'mediumslateblue', 'Start Validation loss'),
        (history_dict['val_end_loss'], 'blueviolet', 'End Validation loss'),
    ]
    acc_curves = [
        (history_dict['start_acc'], 'deepskyblue', 'Start Training acc'),
        (history_dict['end_acc'], 'dodgerblue', 'End Training acc'),
        (history_dict['val_start_acc'], 'mediumslateblue', 'Start Validation acc'),
        (history_dict['val_end_acc'], 'blueviolet', 'End Validation acc'),
    ]

    for axes, curves in ((loss_axes, loss_curves), (acc_axes, acc_curves)):
        for values, colour, legend_label in curves:
            axes.plot(epochs, values, colour, label=legend_label)

    panels = (
        (loss_axes, "Training and validation loss", 'Loss'),
        (acc_axes, "Training and validation accuracy", 'Accuracy'),
    )
    for axes, title, y_label in panels:
        axes.set_title(title)
        axes.set_xlabel('Epochs')
        axes.set_ylabel(y_label)

    for axes in (loss_axes, acc_axes):
        axes.legend(loc="upper left")

    plt.grid(True)
    plt.show()
