<a href="https://colab.research.google.com/github/flogothetis/SQuAD-QueryAnswering-BERT-Keras/blob/main/SQuAD_QuestionAnswering_Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
import json 
import numpy as np
import tensorflow as tf
import re
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel,BertConfig
import os
import string 

# Default BERT configuration object.
# NOTE(review): `configuration` is not referenced again in the visible part of this notebook.
configuration = BertConfig()  # default parameters and configuration for BERT
# Fixed sequence length of the model inputs: longer examples are dropped during
# preprocessing and shorter ones are padded up to this length.
max_len = 384

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers-3.4.0-py3-none-any.whl (1.3MB)
[K     |▎                               | 10kB 17.4MB/s eta 0:00:01[K     |▌                               | 20kB 2.1MB/s eta 0:00:01[K     |▉                               | 30kB 2.6MB/s eta 0:00:01[K     |█                               | 40kB 3.0MB/s eta 0:00:01[K     |█▎                              | 51kB 2.4MB/s eta 0:00:01[K     |█▋                              | 61kB 2.7MB/s eta 0:00:01[K     |█▉                              | 71kB 3.0MB/s eta 0:00:01[K     |██                              | 81kB 3.3MB/s eta 0:00:01[K     |██▍                             | 92kB 3.4MB/s eta 0:00:01[K     |██▋                             | 102kB 3.3MB/s eta 0:00:01[K     |██▉                             | 112kB 3.3MB/s eta 0:00:01[K     |███▏                            | 122kB 3.3M

## BERT tokenizer and model for fine-tuning

In [2]:
# Fetch the pretrained tokenizer and save its files locally so that the fast
# WordPiece tokenizer below can be built from the saved `vocab.txt`.
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
# `exist_ok=True` makes the cell safe to re-run without a separate existence check.
os.makedirs(save_path, exist_ok=True)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from the saved vocabulary file. The path is derived
# from `save_path` so the save and load locations cannot drift apart.
tokenizer = BertWordPieceTokenizer(os.path.join(save_path, "vocab.txt"), lowercase=True)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




## Preprocessing of data
- Represent each example as a `SQuADEntry` object

In [3]:
class SQuADEntry:
    """One SQuAD question/answer pair plus the features derived from it.

    The constructor only stores the raw fields. Calling the instance runs the
    preprocessing and fills in the remaining attributes. Whenever an example
    cannot be converted, ``validExample`` is set to ``False`` and the feature
    attributes keep the values they had at that point.
    """

    def __init__(self, context, question, basic_answer, more_answers, start_idx):
        """Store the raw example.

        Args:
            context: Paragraph text in which the answer is located.
            question: Question text.
            basic_answer: Answer text used to build the training target.
            more_answers: All accepted answer texts for the question.
            start_idx: Character offset of ``basic_answer`` inside ``context``.
        """
        self.context = context
        self.question = question
        self.basic_answer = basic_answer
        self.more_answers = more_answers
        self.start_idx = start_idx
        self.end_idx = None
        # Holds the character offset until __call__ replaces it with a token index.
        self.start_idx_token = start_idx
        self.end_idx_token = None
        self.offsets = None
        self.input_ids = None
        self.attention_mask = None
        self.token_type_ids = None
        self.validExample = True

    def __call__(self):
        """Clean the texts, locate the answer tokens and build the model inputs.

        Reads the module-level ``tokenizer`` and ``max_len``. Returns ``None``;
        all results are stored on the instance. The example is marked invalid
        when the answer span lies outside the context, when no context token
        overlaps the answer, or when the combined input exceeds ``max_len``.
        """
        # Clean context, answer and question: collapse every run of whitespace
        # into a single space and strip the ends.
        # NOTE(review): `start_idx` is applied unchanged to the cleaned context,
        # so it may be shifted if whitespace before the answer was collapsed —
        # confirm against the dataset.
        self.context = " ".join(str(self.context).split())
        self.question = " ".join(str(self.question).split())
        self.basic_answer = " ".join(str(self.basic_answer).split())

        # Character span of the answer; `end_idx` is exclusive.
        self.end_idx = self.start_idx + len(self.basic_answer)
        # An answer ending on the last character has end_idx == len(context)
        # and is still valid, so only spans that run past the end (or start
        # before the beginning) are rejected.
        if self.start_idx < 0 or self.end_idx > len(self.context):
            self.validExample = False
            return

        context_encoding = tokenizer.encode(self.context)
        # A token belongs to the answer when its character range overlaps the
        # answer span. Tokens with an empty range never overlap.
        answer_id_token = [
            idx
            for idx, (start, end) in enumerate(context_encoding.offsets)
            if max(start, self.start_idx) < min(end, self.end_idx)
        ]
        if not answer_id_token:
            self.validExample = False
            return

        # Training targets: first and last token index of the answer.
        self.start_idx_token = answer_id_token[0]
        self.end_idx_token = answer_id_token[-1]
        self.offsets = context_encoding.offsets

        # Work on the question. Its first token id is dropped when the two
        # encodings are concatenated (presumably the duplicate leading special
        # token — TODO confirm).
        question_encoding = tokenizer.encode(self.question)
        question_ids = question_encoding.ids[1:]

        # Model inputs: context ids followed by question ids.
        self.input_ids = context_encoding.ids + question_ids
        # Drop examples that do not fit into the fixed input length.
        if len(self.input_ids) > max_len:
            self.validExample = False
            return
        self.attention_mask = [1] * len(self.input_ids)
        # Segment ids: 0 for the context part, 1 for the question part.
        self.token_type_ids = [0] * len(context_encoding.ids) + [1] * len(question_ids)

                                




## Load data from JSON

In [4]:
def create_squad_examples(raw_data):
    """Build a preprocessed ``SQuADEntry`` for every answered question.

    Args:
        raw_data: Parsed SQuAD JSON: a dict whose ``"data"`` list holds
            articles, each with ``"paragraphs"`` that carry a ``"context"``
            and a list of ``"qas"``.

    Returns:
        List of ``SQuADEntry`` objects that have already been called, i.e.
        preprocessed. Entries that failed preprocessing are included and have
        ``validExample`` set to ``False``. Questions with an empty answer list
        are skipped.
    """
    squad_examples = []
    for article in raw_data["data"]:
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                answers = qa["answers"]
                if not answers:
                    # No reference answer: there is nothing to train on or
                    # score against, and indexing answers[0] would fail.
                    continue
                first_answer = answers[0]
                # Arguments: context, question, basic_answer, more_answers, start_idx
                squad_eg = SQuADEntry(
                    context,
                    qa["question"],
                    first_answer["text"],
                    [answer["text"] for answer in answers],
                    first_answer["answer_start"],
                )
                squad_eg()
                squad_examples.append(squad_eg)
    return squad_examples

# Download the SQuAD v1.1 train and dev splits.
train_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
train_path = tf.keras.utils.get_file("train.json", train_data_url)
eval_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
eval_path = tf.keras.utils.get_file("eval.json", eval_data_url)

# Parse the downloaded JSON files. The encoding is given explicitly so the
# result does not depend on the platform's default text encoding.
with open(train_path, encoding="utf-8") as f:
    raw_train_data = json.load(f)

with open(eval_path, encoding="utf-8") as f:
    raw_eval_data = json.load(f)


Downloading data from https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
Downloading data from https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json


## Creating training examples

In [5]:
def createTrainingExamples(sqaud_examples):
    """Turn preprocessed examples into padded model inputs and label arrays.

    Only examples whose ``validExample`` flag is set are used; the number of
    skipped examples is printed. Reads the module-level ``max_len``.

    Args:
        sqaud_examples: Iterable of preprocessed ``SQuADEntry`` objects.

    Returns:
        Tuple ``(X, Y)`` where ``X`` is
        ``[input_ids, attention_mask, token_type_ids]`` (each padded at the
        end to ``max_len``) and ``Y`` is
        ``[start_idx_token, end_idx_token]`` as NumPy arrays.
    """
    feature_keys = ("input_ids", "attention_mask", "token_type_ids")
    label_keys = ("start_idx_token", "end_idx_token")
    dict_training = {key: [] for key in feature_keys + label_keys}

    count = 0
    for x in sqaud_examples:
        if x.validExample:
            dict_training["input_ids"].append(x.input_ids)
            dict_training["attention_mask"].append(x.attention_mask)
            dict_training["token_type_ids"].append(x.token_type_ids)
            dict_training["start_idx_token"].append(x.start_idx_token)
            dict_training["end_idx_token"].append(x.end_idx_token)
        else:
            count += 1
    # Number of examples skipped because preprocessing marked them invalid.
    print(count)

    # Pad the variable-length feature lists directly. Wrapping ragged lists in
    # np.array first would create an object array, which newer NumPy rejects.
    for key in feature_keys:
        dict_training[key] = tf.keras.preprocessing.sequence.pad_sequences(
            dict_training[key], padding='post', maxlen=max_len
        )
    # Labels are one integer per example, so a plain array conversion is safe.
    for key in label_keys:
        dict_training[key] = np.array(dict_training[key])

    X = [dict_training[key] for key in feature_keys]
    Y = [dict_training[key] for key in label_keys]
    return X, Y




In [None]:
# Preprocess both splits, then convert them into padded inputs (X*) and
# start/end token labels (Y*).
# NOTE: `sQUADEval` is also read as a global by ExactMatchCallback.
sQUADTrain  = create_squad_examples(raw_train_data)
XTrain, YTrain = createTrainingExamples(sQUADTrain)
sQUADEval  = create_squad_examples(raw_eval_data)
XEval, YEval = createTrainingExamples(sQUADEval)

In [8]:
# These counts cover every entry in the lists, including entries flagged invalid
# during preprocessing; createTrainingExamples only keeps the valid ones.
print(f"{len(sQUADTrain)} training points created.")
print(f"{len(sQUADEval)} evaluation points created.")


87599 training points created.
10570 evaluation points created.


## Create model


In [28]:
def createModel():
    """Build and compile the BERT-based span-prediction model.

    The model takes three int32 inputs of length ``max_len`` (token ids,
    attention mask, token type ids), runs them through the pretrained
    ``bert-base-uncased`` encoder and applies two bias-free dense layers that
    produce one score per token for the answer start and the answer end.

    Returns:
        A compiled ``tf.keras.Model`` with two outputs: softmax
        distributions over the ``max_len`` positions for the start and the
        end token, each trained with sparse categorical cross-entropy.
    """
    # Pretrained BERT encoder
    bert_encoder = TFBertModel.from_pretrained('bert-base-uncased')

    # Model inputs
    inputIds = tf.keras.layers.Input((max_len,), dtype=tf.int32)
    inputAttention = tf.keras.layers.Input((max_len,), dtype=tf.int32)
    inputTokenType = tf.keras.layers.Input((max_len,), dtype=tf.int32)

    # First element of the encoder output: one hidden vector per token.
    bertOut = bert_encoder(
        inputIds, attention_mask=inputAttention, token_type_ids=inputTokenType
    )[0]

    # One scalar score per token, flattened to shape (batch, max_len).
    start_logits = tf.keras.layers.Dense(1, use_bias=False)(bertOut)
    start_logits = tf.keras.layers.Flatten()(start_logits)
    end_logits = tf.keras.layers.Dense(1, use_bias=False)(bertOut)
    end_logits = tf.keras.layers.Flatten()(end_logits)

    # Normalise the scores into probabilities over token positions.
    start_prob = tf.keras.layers.Activation(tf.keras.activations.softmax)(start_logits)
    end_prob = tf.keras.layers.Activation(tf.keras.activations.softmax)(end_logits)

    model = tf.keras.Model(
        inputs=[inputIds, inputAttention, inputTokenType],
        outputs=[start_prob, end_prob],
    )
    # The outputs are already probabilities, hence from_logits=False.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    model.compile(optimizer=optimizer, loss=[loss, loss])
    return model





## Run on TPUs

In [29]:
# Set to False to build the model on the default device instead of a TPU.
use_tpu = True
if use_tpu:
    # Connect to the TPU cluster and initialise it before creating the strategy.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # `tf.distribute.experimental.TPUStrategy` is deprecated. Use the stable
    # class when this TensorFlow version provides it, else fall back.
    tpu_strategy_cls = (
        getattr(tf.distribute, "TPUStrategy", None)
        or tf.distribute.experimental.TPUStrategy
    )
    strategy = tpu_strategy_cls(tpu)

    # The model has to be created inside the strategy scope.
    with strategy.scope():
        model = createModel()
else:
    model = createModel()

model.summary()





INFO:tensorflow:Initializing the TPU system: grpc://10.30.125.34:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.30.125.34:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
tf_bert_model_1 (TFBertModel)   ((None, 384, 768), ( 109482240   input_4[0][0]                    
                                                                 input_5[0][0]         

## Train and evaluate model on validation data

In [30]:
def normalizeText(text):
    """Canonicalise an answer string for exact-match comparison.

    Steps, in order: lower-case the text, replace the standalone words
    "a", "an" and "the" by a space, delete every character found in
    ``string.punctuation`` and collapse whitespace runs to single spaces.

    Args:
        text: String to normalise.

    Returns:
        The normalised string.
    """
    lowered = text.lower()
    # Articles are removed before punctuation, so word boundaries next to
    # punctuation marks still count.
    without_articles = re.sub(r"\b(a|an|the)\b", " ", lowered, flags=re.UNICODE)
    strip_table = str.maketrans("", "", string.punctuation)
    without_punctuation = without_articles.translate(strip_table)
    return " ".join(without_punctuation.split())

class ExactMatchCallback(tf.keras.callbacks.Callback):
    """Print the exact-match score on the evaluation data after every epoch.

    A prediction counts as a match when the normalised predicted answer text
    equals one of the normalised reference answers of the example.
    """

    def __init__(self, x_eval, y_eval, squad_examples=None):
        """Store the evaluation data.

        Args:
            x_eval: Model inputs of the evaluation examples.
            y_eval: Labels of the evaluation examples; only the length of the
                first element is used, as the number of examples.
            squad_examples: Optional sequence of preprocessed examples that
                ``x_eval`` was built from. When omitted, the module-level
                ``sQUADEval`` is used.
        """
        # Let the Keras base class set up its own attributes.
        super().__init__()
        self.x_eval = x_eval
        self.y_eval = y_eval
        self.squad_examples = squad_examples

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the evaluation inputs and print the exact-match ratio."""
        pred_start, pred_end = self.model.predict(self.x_eval)
        # Resolved at call time so the global fallback behaves as before.
        examples = sQUADEval if self.squad_examples is None else self.squad_examples
        # Predictions line up with the valid examples only, in order.
        valid_examples = [example for example in examples if example.validExample]
        counter = 0
        for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
            squadV = valid_examples[idx]
            # start and end are max_len-dimensional probability vectors.
            token_idx_start = np.argmax(start)
            token_idx_end = np.argmax(end)
            # A start position beyond the context tokens cannot be mapped
            # back to text, so it counts as a miss.
            if token_idx_start >= len(squadV.offsets):
                continue
            # Start character index in the context
            start_char_idx = squadV.offsets[token_idx_start][0]

            # Predicted answer text; an end position beyond the context
            # tokens means "until the end of the context".
            if token_idx_end < len(squadV.offsets):
                end_char_idx = squadV.offsets[token_idx_end][1]
                predictedAnswer = squadV.context[start_char_idx:end_char_idx]
            else:
                predictedAnswer = squadV.context[start_char_idx:]
            # Normalise texts before comparing with the reference answers.
            available_answers = [normalizeText(answer) for answer in squadV.more_answers]
            predictedAnswer = normalizeText(predictedAnswer)
            if predictedAnswer in available_answers:
                counter += 1
        total = len(self.y_eval[0])
        # Avoid a ZeroDivisionError when there are no evaluation examples.
        acc = counter / total if total else 0.0
        print()
        print("Exact Match: ", acc)



In [31]:
# Fine-tune for three epochs; the callback prints the exact-match score on the
# evaluation data after each epoch.
history = model.fit(
    x=XTrain,
    y=YTrain,
    batch_size=64,
    epochs=3,
    verbose=1,
    callbacks=[ExactMatchCallback(XEval, YEval)],
)


Epoch 1/3




















Exact Match:  0.7742716097183235
Epoch 2/3
Exact Match:  0.7890814054786565
Epoch 3/3
Exact Match:  0.7794018004065434
