## Fine tuning of BERT model for question answering

Please run the below code on Google CLoud Instance. The training takes nearly 8-10 hours. Due to lack of GPU computation service, the current code was run on 102GB RAM and 16VCPU instances. 

Create a folder SQUAD and upload the train-v1.1.json and dev-v1.1.json of SQuAD 1.0

In [None]:
## Download the pretrained BERT Model
## Uncomment the below lines if you don't have the 'uncased_L-12_H-768_A-12' folder 

# ! wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
# !unzip uncased_L-12_H-768_A-12.zip


In [28]:
## Download the bert repository for using the bert modules
## Replace the optimization, modelling, tokenization and run_squad files with the files provided in
## zip folder for submission

import sys
# !test -d bert_repo || git clone https://github.com/google-research/bert bert_repo

if not 'bert_repo' in sys.path:
    sys.path += ['bert_repo']
    
import tokenization
import run_squad
import modeling
import optimization

In [2]:
## libraries

import zipfile
import sys
import datetime
import tensorflow as tf
import os
import json
import six
from collections import Counter
import string
import re

#import tensorflow_hub as hub


In [16]:
def is_whitespace(c):
    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
        return True
    return False

def read_json_file(input_file):
    with tf.io.gfile.GFile(input_file, "r") as reader:
        data = json.load(reader)["data"]
        return data
    
    

    
BERT_MODEL='./uncased_L-12_H-768_A-12'    
VOCAB_FILE = os.path.join(BERT_MODEL, 'vocab.txt')

## Defining tokenizer
tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=True)

In [17]:
## Reading data from training files

def read_squad_train(input_file):
    
    ## Loading the json file
    data=read_json_file(input_file)

    instances = []
    for row in data:
        for paragraph in row["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            
            #print (paragraph_text)
            #print (len(paragraph_text))
            
            ## Assigning a new token as soon as it sees a space
            for character in paragraph_text:
                if is_whitespace(character):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(character)
                    else:
                        doc_tokens[-1] += character
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)
                
#             print (char_to_word_offset)
#             print (len(char_to_word_offset))

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False

                if not is_impossible:
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]
                    answer_offset = answer["answer_start"]
                    answer_length = len(orig_answer_text)
                    start_position = char_to_word_offset[answer_offset]
                    end_position = char_to_word_offset[answer_offset + answer_length -1]

                    actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = " ".join(tokenization.whitespace_tokenize(orig_answer_text))
                    
                    if actual_text.find(cleaned_answer_text) == -1:
                        print ("Could not find answer") 
                        continue
                else:
                    start_position = -1
                    end_position = -1
                    orig_answer_text = ""

                instance = run_squad.SquadExample(qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens,
                orig_answer_text=orig_answer_text, start_position=start_position,end_position=end_position,
                is_impossible=is_impossible)
                instances.append(instance)
    return instances

In [18]:
train_examples = read_squad_train("./SQUAD/train-v1.1.json")
print("Total train examples are ",len(train_examples))
train_examples=train_examples[:25000]
print("Total train examples used ",len(train_examples))

Total train examples are  87599
Total train examples used  25000


In [7]:
print("question_text : ",train_examples[0].question_text)
print("doc_tokens : ",train_examples[0].doc_tokens)
print("start_position : ",train_examples[0].start_position)
print("end_position : ",train_examples[0].end_position)

question_text :  To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
doc_tokens :  ['Architecturally,', 'the', 'school', 'has', 'a', 'Catholic', 'character.', 'Atop', 'the', 'Main', "Building's", 'gold', 'dome', 'is', 'a', 'golden', 'statue', 'of', 'the', 'Virgin', 'Mary.', 'Immediately', 'in', 'front', 'of', 'the', 'Main', 'Building', 'and', 'facing', 'it,', 'is', 'a', 'copper', 'statue', 'of', 'Christ', 'with', 'arms', 'upraised', 'with', 'the', 'legend', '"Venite', 'Ad', 'Me', 'Omnes".', 'Next', 'to', 'the', 'Main', 'Building', 'is', 'the', 'Basilica', 'of', 'the', 'Sacred', 'Heart.', 'Immediately', 'behind', 'the', 'basilica', 'is', 'the', 'Grotto,', 'a', 'Marian', 'place', 'of', 'prayer', 'and', 'reflection.', 'It', 'is', 'a', 'replica', 'of', 'the', 'grotto', 'at', 'Lourdes,', 'France', 'where', 'the', 'Virgin', 'Mary', 'reputedly', 'appeared', 'to', 'Saint', 'Bernadette', 'Soubirous', 'in', '1858.', 'At', 'the', 'end', 'of', 'the', 'main', 'drive', '(and', 'i

In [14]:
train_examples[4]

qas_id: 5733be284776f4190066117e, question_text: What sits on top of the Main Building at Notre Dame?, doc_tokens: [Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.], start_position: 14, end_position: 20, is_impossible: False

## Creating train tf records

In [None]:
## Create an OUTPUT Directory and give the path to it below

OUTPUT_DIR="OUTPUT"
MAX_SEQ_LENGTH = 256
train_writer = run_squad.FeatureWriter(filename=os.path.join(OUTPUT_DIR, "train.tf_record"), is_training=True)

def append_feature(feature):
    train_features.append(feature)
    train_writer.process_feature(feature)
    
train_features = []
run_squad.convert_examples_to_features(train_examples, tokenizer, MAX_SEQ_LENGTH, 128, 64, True, output_fn=append_feature)
train_writer.close()

In [None]:
print("tokens: ", train_features[0].tokens)
print("token_to_orig_map: ", train_features[0].token_to_orig_map)
print("start_position: ",train_features[0].start_position)
print("end_position: ",train_features[0].end_position)

## Model Definition

In [25]:
## Pretrained model files

CONFIG_FILE = os.path.join(BERT_MODEL, 'bert_config.json')
INIT_CHECKPOINT = os.path.join(BERT_MODEL, 'bert_model.ckpt')


## Model parameters
TRAIN_BATCH_SIZE = 128
LEARNING_RATE = 3e-5
NUM_TRAIN_EPOCHS = 2.0
WARMUP_PROPORTION = 0.05
EVAL_BATCH_SIZE = 8
SAVE_CHECKPOINTS_STEPS = 100
ITERATIONS_PER_LOOP = 20



In [26]:
num_train_steps = int(len(train_examples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

print ("Train Steps: ",num_train_steps)

Train Steps:  468


In [None]:
model_fn = run_squad.model_fn_builder(bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE),
    init_checkpoint=INIT_CHECKPOINT, learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps,
    use_tpu=False, use_one_hot_embeddings=True)

estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
    use_tpu=False, model_fn=model_fn,config=tf.compat.v1.estimator.tpu.RunConfig(),
    train_batch_size=TRAIN_BATCH_SIZE, predict_batch_size=EVAL_BATCH_SIZE, eval_batch_size=EVAL_BATCH_SIZE)

In [25]:
print('***** Started training at {} *****'.format(datetime.datetime.now()))
print('  Num examples = {}'.format(len(train_examples)))
print('  Batch size = {}'.format(TRAIN_BATCH_SIZE))

train_input_fn = run_squad.input_fn_builder(input_file=train_writer.filename,
    seq_length=MAX_SEQ_LENGTH, is_training=True, drop_remainder=True)

estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print('***** Finished training at {} *****'.format(datetime.datetime.now()))

***** Started training at 2020-11-12 18:07:34.133945 *****
  Num examples = 20000
  Batch size = 64
INFO:tensorflow:  Num steps = 625
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
Instructions for updating:
Use `tf.data.Dataset.map(map_func, num_parallel_calls)` followed by `tf.data.Dataset.batch(batch_size, drop_remainder)`. Static tf.data optimizations will take care of using the fused implementation.
Instructions for updating:
Use `tf.cast` instead.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Running train on CPU/GPU
INFO:tensorflow:*** Features ***
INFO:tensorflow:  name = end_positions, shape = (64,)
INFO:tensorflow:  name = input_ids, shape = (64, 256)
INFO:tensorflow:  name = input_mask, shape = (64, 256)
INFO:tensorflow:  name = segment_ids, shape = (64, 256)
INFO:tensorflow:  name = start_positions, shape = (64,)
INFO:tensorflow:  name = unique_ids, shape = (64,)
Inst

## On Evaluation/Testing set

In [None]:
## Reading data from testing files

def read_squad_test(input_file):
    ## Loading the json file
    data=read_json_file(input_file)


    instances = []
    for row in input_data:
        for paragraph in row["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            
            ## Assigning a new token as soon as it sees a space
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False

                instance = run_squad.SquadExample(qas_id=qas_id,
                question_text=question_text,doc_tokens=doc_tokens, orig_answer_text=orig_answer_text,
                start_position=start_position, end_position=end_position,is_impossible=is_impossible)
                instances.append(instance)
    return instances

In [None]:
eval_examples = read_squad_test("SQUAD/dev-v1.1.json")
eval_examples=eval_examples[:4000]

print("type of eval ",type(eval_examples))
print("-"*60)
print(eval_examples[0].qas_id)
print("-"*60)
print(eval_examples[0].doc_tokens)

## Writing Test TF records

In [None]:
OUTPUT_DIR="OUTPUT"
eval_writer = run_squad.FeatureWriter(
    filename=os.path.join(OUTPUT_DIR, "eval.tf_record"),
    is_training=False)


def append_feature(feature):
    eval_features.append(feature)
    eval_writer.process_feature(feature)

In [21]:
eval_features = []

run_squad.convert_examples_to_features(examples=eval_examples, tokenizer=tokenizer, max_seq_length=MAX_SEQ_LENGTH,
        doc_stride=128, max_query_length=64,
        is_training=False, output_fn=append_feature)

eval_writer.close()

INFO:tensorflow:*** Example ***
INFO:tensorflow:unique_id: 1000000000
INFO:tensorflow:example_index: 0
INFO:tensorflow:doc_span_index: 0
INFO:tensorflow:tokens: [CLS] which nfl team represented the afc at super bowl 50 ? [SEP] super bowl 50 was an american football game to determine the champion of the national football league ( nfl ) for the 2015 season . the american football conference ( afc ) champion denver broncos defeated the national football conference ( nfc ) champion carolina panthers 24 – 10 to earn their third super bowl title . the game was played on february 7 , 2016 , at levi ' s stadium in the san francisco bay area at santa clara , california . as this was the 50th super bowl , the league emphasized the " golden anniversary " with various gold - themed initiatives , as well as temporarily suspend ##ing the tradition of naming each super bowl game with roman nu ##meral ##s ( under which the game would have been known as " super bowl l " ) , so that the logo could promi

In [23]:

predict_input_fn = run_squad.input_fn_builder(
    input_file=eval_writer.filename,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)

INFO:tensorflow:***** Running predictions *****
INFO:tensorflow:  Num orig examples = 4000
INFO:tensorflow:  Num features = 4334


In [None]:
all_results = []
for result in estimator.predict(predict_input_fn, yield_single_examples=True):
    if len(all_results) % 1000 == 0:
        print("Processing example: %d" % (len(all_results)))
    unique_id = int(result["unique_ids"])
    start_logits = [float(x) for x in result["start_logits"].flat]
    end_logits = [float(x) for x in result["end_logits"].flat]
    all_results.append(
          run_squad.RawResult(
              unique_id=unique_id,
              start_logits=start_logits,
              end_logits=end_logits))

INFO:tensorflow:Could not find trained model in model_dir: /tmp/tmp3pr3llw5, running initialization to predict.
Instructions for updating:
Use `tf.data.Dataset.map(map_func, num_parallel_calls)` followed by `tf.data.Dataset.batch(batch_size, drop_remainder)`. Static tf.data optimizations will take care of using the fused implementation.
Instructions for updating:
Use `tf.cast` instead.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Running infer on CPU/GPU
INFO:tensorflow:*** Features ***
INFO:tensorflow:  name = input_ids, shape = (None, 256)
INFO:tensorflow:  name = input_mask, shape = (None, 256)
INFO:tensorflow:  name = segment_ids, shape = (None, 256)
INFO:tensorflow:  name = unique_ids, shape = (None,)
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.
INFO:tensorflow:**** Trainable Variables ****
INFO:tensorflow:  name = bert/embeddings/word_embeddings:0, shape = (30522, 768), *INIT_FROM_CKPT*
INFO

In [87]:
Prediction_DIR='./Predictions/'
output_prediction_file = os.path.join(Prediction_DIR, "predictions.json")
output_nbest_file = os.path.join(Prediction_DIR, "nbest_predictions.json")
output_null_log_odds_file = os.path.join(Prediction_DIR, "null_odds.json")

In [None]:
def get_final_text(pred_text, orig_text, do_lower_case):
    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

  # We first tokenize `orig_text`, strip whitespace from the result
  # and `pred_text`, and check if they are the same length. If they are
  # NOT the same length, the heuristic has failed. If they are the same
  # length, we assume the characters are one-to-one aligned.
    tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if True:
            tf.compat.v1.logging.info(
              "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if True:
            tf.compat.v1.logging.info("Length not equal after stripping spaces: '%s' vs '%s'",
                      orig_ns_text, tok_ns_text)
        return orig_text

  # We then project the characters in `pred_text` back to `orig_text` using
  # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if True:
            tf.logging.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if True:
            tf.logging.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text

In [None]:
def write_predictions(all_examples, all_features, all_results, n_best_size,
                      max_answer_length, do_lower_case, output_prediction_file,
                      output_nbest_file, output_null_log_odds_file):
    tf.compat.v1.logging.info("Writing predictions to: %s" % (output_prediction_file))
    tf.compat.v1.logging.info("Writing nbest to: %s" % (output_nbest_file))

    example_index_to_features = collections.defaultdict(list)
    for feature in all_features:
        example_index_to_features[feature.example_index].append(feature)

    unique_id_to_result = {}
    for result in all_results:
        unique_id_to_result[result.unique_id] = result

    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
      "PrelimPrediction",
      ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])

    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    scores_diff_json = collections.OrderedDict()

    for (example_index, example) in enumerate(all_examples):
        features = example_index_to_features[example_index]

        prelim_predictions = []
    # keep track of the minimum score of null start+end of position 0
        score_null = 1000000  # large and positive
        min_null_feature_index = 0  # the paragraph slice with min mull score
        null_start_logit = 0  # the start logit at the slice with min null score
        null_end_logit = 0  # the end logit at the slice with min null score
        for (feature_index, feature) in enumerate(features):
            result = unique_id_to_result[feature.unique_id]
            start_indexes = run_squad._get_best_indexes(result.start_logits, n_best_size)
            end_indexes = run_squad._get_best_indexes(result.end_logits, n_best_size)
            
            if False:
                feature_null_score = result.start_logits[0] + result.end_logits[0]
                if feature_null_score < score_null:
                    score_null = feature_null_score
                    min_null_feature_index = feature_index
                    null_start_logit = result.start_logits[0]
                    null_end_logit = result.end_logits[0]
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if start_index >= len(feature.tokens):
                        continue
                    if end_index >= len(feature.tokens):
                        continue
                    if start_index not in feature.token_to_orig_map:
                        continue
                    if end_index not in feature.token_to_orig_map:
                        continue
                    if not feature.token_is_max_context.get(start_index, False):
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue
                    prelim_predictions.append(
                        _PrelimPrediction(
                            feature_index=feature_index,
                            start_index=start_index,
                            end_index=end_index,
                            start_logit=result.start_logits[start_index],
                            end_logit=result.end_logits[end_index]))

        if False:
            prelim_predictions.append(
                _PrelimPrediction(
                    feature_index=min_null_feature_index,
                    start_index=0,
                    end_index=0,
                    start_logit=null_start_logit,
                    end_logit=null_end_logit))
        prelim_predictions = sorted(
            prelim_predictions,
            key=lambda x: (x.start_logit + x.end_logit),
            reverse=True)

        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
            "NbestPrediction", ["text", "start_logit", "end_logit"])

        seen_predictions = {}
        nbest = []
        for pred in prelim_predictions:
            if len(nbest) >= n_best_size:
                break
            feature = features[pred.feature_index]
            if pred.start_index > 0:  # this is a non-null prediction
                tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
                orig_doc_start = feature.token_to_orig_map[pred.start_index]
                orig_doc_end = feature.token_to_orig_map[pred.end_index]
                orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
                tok_text = " ".join(tok_tokens)

                # De-tokenize WordPieces that have been split off.
                tok_text = tok_text.replace(" ##", "")
                tok_text = tok_text.replace("##", "")

                # Clean whitespace
                tok_text = tok_text.strip()
                tok_text = " ".join(tok_text.split())
                orig_text = " ".join(orig_tokens)

                final_text = get_final_text(tok_text, orig_text, do_lower_case)
                if final_text in seen_predictions:
                    continue

                seen_predictions[final_text] = True
            else:
                final_text = ""
                seen_predictions[final_text] = True

            nbest.append(
                _NbestPrediction(
                    text=final_text,
                    start_logit=pred.start_logit,
                    end_logit=pred.end_logit))

    # if we didn't inlude the empty option in the n-best, inlcude it
        if False:
            if "" not in seen_predictions:
                nbest.append(
                    _NbestPrediction(
                        text="", start_logit=null_start_logit,
                        end_logit=null_end_logit))
    # In very rare edge cases we could have no valid predictions. So we
    # just create a nonce prediction in this case to avoid failure.
        if not nbest:
            nbest.append(
                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))

        assert len(nbest) >= 1

        total_scores = []
        best_non_null_entry = None
        for entry in nbest:
            total_scores.append(entry.start_logit + entry.end_logit)
            if not best_non_null_entry:
                if entry.text:
                    best_non_null_entry = entry

        probs = run_squad._compute_softmax(total_scores)

        nbest_json = []
        for (i, entry) in enumerate(nbest):
            output = collections.OrderedDict()
            output["text"] = entry.text
            output["probability"] = probs[i]
            output["start_logit"] = entry.start_logit
            output["end_logit"] = entry.end_logit
            nbest_json.append(output)

        assert len(nbest_json) >= 1

        if not False:
            all_predictions[example.qas_id] = nbest_json[0]["text"]
        else:
          # predict "" iff the null score - the score of best non-null > threshold
            score_diff = score_null - best_non_null_entry.start_logit - (
                best_non_null_entry.end_logit)
            scores_diff_json[example.qas_id] = score_diff
            if score_diff > 0.0:
                all_predictions[example.qas_id] = ""
            else:
                all_predictions[example.qas_id] = best_non_null_entry.text

        all_nbest_json[example.qas_id] = nbest_json

    with tf.io.gfile.GFile(output_prediction_file, "w") as writer:
        writer.write(json.dumps(all_predictions, indent=4) + "\n")

    with tf.io.gfile.GFile(output_nbest_file, "w") as writer:
        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")

    if False:
        with tf.io.gfile.GFile(output_null_log_odds_file, "w") as writer:
            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")

In [None]:
import collections
import json

write_predictions(eval_examples, eval_features, all_results,
                      20, 30,
                      True, output_prediction_file,
                      output_nbest_file, output_null_log_odds_file)

## Evaluation on Testing Dataset

In [26]:
import re
from collections import Counter
def normalize_answer(s):
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
  
def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def exact_match_score(prediction, ground_truth):
    return (normalize_answer(prediction) == normalize_answer(ground_truth))


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)


def evaluate(dataset, predictions):
    f1 = exact_match = total = 0
    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                total += 1
                if qa['id'] not in predictions:
                    continue
                ground_truths = list(map(lambda x: x['text'], qa['answers']))
                prediction = predictions[qa['id']]
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(
                    f1_score, prediction, ground_truths)

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'f1 Score': f1}

def evaluate_squad(data_file,pred_file):
    with open(data_file) as dataset_file:
        dataset_json = json.load(dataset_file)
        dataset = dataset_json['data']
    with open(pred_file) as prediction_file:
        predictions = json.load(prediction_file)
      
    print(evaluate(dataset, predictions))


In [27]:
evaluate_squad("./SQUAD/dev-v1.1.json","./first_results/Predictions/predictions.json")

{'f1 Score': 0.797341178243337}


## References
https://github.com/google-research/bert

https://github.com/allenai/bi-att-flow/blob/master/squad/