# Evaluate predictions on HotpotQA
- The model predicts whether a sentence is a supporting fact for answering a question
- This notebook rearranges the predictions and evaluates the performance just like the hotpot evaluation script

In [1]:
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import precision_score, recall_score

In [2]:
# Toy data for the cells below: four rows of two binary labels each,
# ground truth (gt) and the corresponding predictions (pred).
gt = [[0,0],[0,1],[1,0],[1,1]]
pred = [[1,1],[1,1],[1,0],[1,1]]

In [3]:
def exact_match(gt, pred):
    """Return the fraction of rows whose prediction equals the ground truth.

    Rows are compared with ``==``, so a row only counts as correct when
    every label in it matches.

    Args:
        gt: Sequence of ground-truth rows.
        pred: Sequence of predicted rows, aligned with ``gt``.

    Returns:
        Number of identical rows divided by the total number of rows.

    Raises:
        ValueError: If ``gt`` and ``pred`` differ in length or are empty.
    """
    # Explicit exceptions instead of ``assert`` so the checks survive ``-O``.
    if len(gt) != len(pred):
        raise ValueError(
            f"gt and pred must have the same length, got {len(gt)} and {len(pred)}"
        )
    total_size = len(pred)
    # An empty input would otherwise surface as a bare ZeroDivisionError.
    if total_size == 0:
        raise ValueError("gt and pred must not be empty")
    num_correct = sum(1 for gt_row, pred_row in zip(gt, pred) if gt_row == pred_row)
    return num_correct / total_size

In [4]:
# Only the last two rows of gt and pred are identical, so 2 of 4 rows match.
exact_match(gt, pred)

0.5

In [5]:
def evaluate(gt, pred):
    """Compute row-averaged precision, recall, F1 and the exact-match rate.

    Each row of ``gt`` / ``pred`` is a sequence of binary labels in which
    ``1`` marks the positive class. Precision, recall and F1 are computed
    per row and then averaged over all rows. A precision or recall that is
    undefined for a row (nothing predicted positive / nothing truly
    positive) counts as 0, and so does the F1 when both are 0.

    The confusion counts are tallied directly rather than through two
    scikit-learn calls per row. For 0/1 labels this yields the same values
    while avoiding the undefined-metric warning on rows without positives
    and the duplicated per-row work.

    Args:
        gt: Sequence of ground-truth label rows.
        pred: Sequence of predicted label rows, aligned with ``gt``.

    Returns:
        Tuple ``(precision, recall, f1, exact_match)``, each averaged over
        the number of rows.

    Raises:
        ValueError: If ``gt`` and ``pred`` differ in length, are empty, or
            contain a pair of rows with different lengths.
    """
    # Explicit exceptions instead of ``assert`` so the checks survive ``-O``.
    if len(gt) != len(pred):
        raise ValueError(
            f"gt and pred must have the same length, got {len(gt)} and {len(pred)}"
        )
    total_size = len(pred)
    if total_size == 0:
        raise ValueError("gt and pred must not be empty")

    total_precision = 0.0
    total_recall = 0.0
    total_f1 = 0.0
    total_correct = 0
    for row_index, (gt_row, pred_row) in enumerate(zip(gt, pred)):
        if len(gt_row) != len(pred_row):
            raise ValueError(
                f"row {row_index}: gt has {len(gt_row)} labels, "
                f"pred has {len(pred_row)}"
            )
        if gt_row == pred_row:
            total_correct += 1

        # Confusion counts for the positive class (label 1).
        tp = fp = fn = 0
        for gt_label, pred_label in zip(gt_row, pred_row):
            if pred_label == 1:
                if gt_label == 1:
                    tp += 1
                else:
                    fp += 1
            elif gt_label == 1:
                fn += 1

        # Undefined ratios (zero denominator) are scored as 0.
        p = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        r = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        total_precision += p
        total_recall += r
        total_f1 += 2 * (p * r) / (p + r) if (p + r) > 0 else 0.0

    return (
        total_precision / total_size,
        total_recall / total_size,
        total_f1 / total_size,
        total_correct / total_size,
    )

In [6]:
# Row-averaged precision, recall and F1 plus the exact-match rate on the toy data.
evaluate(gt, pred)

  'recall', 'true', average, warn_for)


(0.625, 0.75, 0.6666666666666666, 0.5)

In [7]:
def reorganize_predictions(predictions, document_lengths):
    """Split a flat prediction sequence into consecutive per-document chunks.

    Args:
        predictions: Flat, sliceable sequence of predictions.
        document_lengths: Number of predictions belonging to each document,
            in order.

    Returns:
        A list with one slice of ``predictions`` per entry in
        ``document_lengths``; the i-th slice has ``document_lengths[i]``
        items when enough predictions remain.
    """
    grouped = []
    offset = 0
    for length in document_lengths:
        # Each chunk starts where the previous one ended.
        end = offset + length
        grouped.append(predictions[offset:end])
        offset = end
    return grouped

In [9]:
# Split 20 flat predictions into two chunks of length 1 and 19.
reorganize_predictions(list(range(20)), [1,19])

[[0], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]