# Question Answering BERT Testing Notebook

## Libraries

In [1]:
import json
import os
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from transformers import BertTokenizer, TFBertModel, BertConfig
from tokenizers import BertWordPieceTokenizer

## Global Parameters

In [2]:
# Fixed token length of every model input: it sizes the Keras Input layers in
# create_model() and is the length WikiElement.preprocess() pads sequences to.
MAX_LEN = 512
# Hugging Face model identifier handed to the from_pretrained() calls below.
MODEL_NAME = "dbmdz/bert-base-turkish-cased"
# Default-constructed BERT configuration object.
# NOTE(review): not referenced by any other visible cell — confirm it is still needed.
configuration = BertConfig()

## Create Model

In [3]:
def create_model():
    """Build and compile the BERT-based extractive question-answering model.

    The pretrained encoder named by ``MODEL_NAME`` is wrapped with two
    bias-free ``Dense(1)`` heads that score every token position as a
    possible answer start or answer end. Each head's per-token scores are
    flattened and passed through a softmax over the sequence positions.

    Returns:
        keras.Model: compiled model taking
        ``[input_ids, token_type_ids, attention_mask]`` (each of shape
        ``(MAX_LEN,)``, dtype int32) and producing
        ``[start_probs, end_probs]``.
    """
    ## BERT encoder
    encoder = TFBertModel.from_pretrained(MODEL_NAME)

    # QA model
    # NOTE: the creation order of the inputs and heads is kept as-is, because
    # Keras auto-generates layer names from it and previously saved weights
    # are restored into this exact topology.
    input_ids = layers.Input(shape=(MAX_LEN,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(MAX_LEN,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(MAX_LEN,), dtype=tf.int32)
    # Index 0 of the encoder output is used as the per-token embedding
    # (presumably the last hidden state — confirm against the transformers docs).
    embedding = encoder.bert(input_ids,
                             token_type_ids=token_type_ids,
                             attention_mask=attention_mask)[0]

    # One scalar score per token for the answer start, flattened per example.
    start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
    start_logits = layers.Flatten()(start_logits)

    # One scalar score per token for the answer end, flattened per example.
    end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
    end_logits = layers.Flatten()(end_logits)

    # Normalise the scores into probability distributions over positions.
    start_probs = layers.Activation(keras.activations.softmax)(start_logits)
    end_probs = layers.Activation(keras.activations.softmax)(end_logits)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[start_probs, end_probs],
    )

    # The outputs are already probabilities, hence from_logits=False.
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # releases warn about or reject.
    optimizer = keras.optimizers.Adam(learning_rate=5e-5)
    model.compile(optimizer=optimizer, loss=[loss, loss])
    return model

## Load Tokenizer

In [4]:
# Fetch the pretrained (cased) tokenizer so its vocabulary can be written to
# disk and reused by the WordPiece tokenizer created below.
slow_tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)
# NOTE(review): `splitted_model` is not referenced by any other visible cell.
splitted_model =  MODEL_NAME.split("/")
save_path = "bert_base_turkish_cased/"

# exist_ok=True replaces the separate existence check, so the cell is safe to
# re-run and has no check-then-create race.
os.makedirs(save_path, exist_ok=True)

slow_tokenizer.save_pretrained(save_path)
# Build the tokenizer used by the rest of the notebook from the saved vocabulary.
tokenizer = BertWordPieceTokenizer(os.path.join(save_path, "vocab.txt"), lowercase=False)

## Create model and Load Weights

In [5]:
# Build the compiled QA model via create_model() and print its layer summary.
model = create_model()
model.summary()

Some layers from the model checkpoint at dbmdz/bert-base-turkish-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at dbmdz/bert-base-turkish-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 110617344   input_1[0][0]                    
______________________________________________________________________________________________

In [6]:
# Restore weights from the HDF5 file into the model built above
# (presumably the fine-tuned QA weights, judging by the file name — confirm).
model.load_weights("bert_base_turkish_cased/weights/dbmdz-bert-base-turkish-cased_seqlen512_epochs10.h5")

## Testing

In [7]:
class WikiElement:
    """A single (question, context) pair prepared as input for the QA model.

    Attributes:
        question: the question text.
        context: the passage the answer is extracted from.
        skip: False initially; set to True by ``preprocess()`` when the
            encoded pair is longer than ``MAX_LEN`` and no model inputs
            were produced.

    After a successful ``preprocess()`` the instance also carries
    ``input_ids``, ``token_type_ids``, ``attention_mask`` (lists of length
    ``MAX_LEN``) and ``context_token_to_char`` (the context token offsets).
    """

    def __init__(self, question, context):
        self.question = question
        self.context = context
        # Lets callers detect an over-long pair instead of hitting a missing
        # attribute later on.
        self.skip = False

    def preprocess(self):
        """Tokenize the pair and build padded, fixed-length model inputs.

        Uses the notebook-level ``tokenizer`` and ``MAX_LEN``. When the
        encoded pair exceeds ``MAX_LEN`` tokens, ``skip`` is set to True and
        the method returns without setting any input attribute.
        """
        self.skip = False

        # tokenize context
        tokenized_context = tokenizer.encode(self.context)

        # tokenize question
        tokenized_question = tokenizer.encode(self.question)

        # create inputs: the question's first token is dropped (presumably the
        # leading special token the tokenizer prepends — confirm) so that it
        # is not repeated after the context.
        question_ids = tokenized_question.ids[1:]
        input_ids = tokenized_context.ids + question_ids
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(question_ids)
        attention_mask = [1] * len(input_ids)

        # padding for equal length sequences
        padding_length = MAX_LEN - len(input_ids)
        if padding_length < 0:
            # Too long for the model: flag it and leave the inputs unset.
            self.skip = True
            return

        # A zero-length pad is a no-op, so an exact fit needs no special case.
        pad = [0] * padding_length
        input_ids = input_ids + pad
        attention_mask = attention_mask + pad  # len(input) [1] + padding [0]
        token_type_ids = token_type_ids + pad  # context [0] + question [1] + padding [0]

        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask
        self.context_token_to_char = tokenized_context.offsets

In [8]:
def create_input_targets(element):
    """Pack one preprocessed element into the model's input list.

    Args:
        element: object exposing ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` attributes (as set by
            ``WikiElement.preprocess()``).

    Returns:
        list of three ``np.ndarray``, ordered
        ``[input_ids, token_type_ids, attention_mask]``. Each attribute is
        wrapped in a one-element list first, so each array has a leading
        dimension of 1.

    Raises:
        AttributeError: if ``element`` lacks one of the three attributes.
    """
    feature_names = ("input_ids", "token_type_ids", "attention_mask")
    return [np.array([getattr(element, name)]) for name in feature_names]

In [15]:
def predict_answer(question, context, qa_model=None):
    """Predict the answer span for ``question`` inside ``context``.

    Args:
        question: the question text.
        context: the passage to extract the answer from.
        qa_model: optional object with a Keras-style ``predict(x)`` returning
            ``(start_probs, end_probs)``. Defaults to the notebook-level
            ``model``, so the function works when the notebook is run top to
            bottom.

    Returns:
        dict with keys ``"question"``, ``"predicted_answer"`` and
        ``"context"``. The answer is an empty string when the predicted start
        position falls outside the context tokens.

    Raises:
        ValueError: if the encoded question and context do not fit into the
            model's fixed input length.
    """
    predictor = model if qa_model is None else qa_model

    # Use the arguments that were passed in rather than notebook globals.
    element = WikiElement(question, context)
    element.preprocess()
    # preprocess() leaves the inputs unset when the pair is too long.
    if getattr(element, "input_ids", None) is None:
        raise ValueError(
            "question and context are too long for the model input; "
            "shorten the context"
        )

    x = create_input_targets(element)

    pred_start, pred_end = predictor.predict(x)

    # A single example is predicted, so the flat argmax is the token index.
    start = int(np.argmax(pred_start))
    end = int(np.argmax(pred_end))
    offsets = element.context_token_to_char

    if start >= len(offsets):
        # Start points at a question or padding position: no answer in context.
        pred_ans = ""
    elif end < len(offsets):
        pred_ans = element.context[offsets[start][0]:offsets[end][1]]
    else:
        # End lies beyond the context tokens: take everything from the start.
        pred_ans = element.context[offsets[start][0]:]

    print("question: {}\n\npredicted_answer: {}\n\ncontext: {}".format(element.question, pred_ans, element.context))

    result = {"question": element.question,
              "predicted_answer": pred_ans,
              "context": element.context}

    return result

In [9]:
# Sample Turkish passage (about the noble gases) and a question about it,
# used as the test input for the prediction cell.
my_context = "Soy gaz veya asal gaz, standart şartlar altında her biri, diğer elementlere kıyasla daha düşük kimyasal reaktifliğe sahip, kokusuz, renksiz, tek atomlu gaz olan kimyasal element grubudur. Helyum (He), neon (Ne), argon (Ar), kripton (Kr), ksenon (Xe) ve radon (Rn) doğal olarak bulunan altı soy gazdır ve tamamı ametaldir. Her biri periyodik tablonun sırasıyla ilk altı periyodunda, 18. grubunda (8A) yer alır. Grupta yer alan oganesson (Og) için ise önceleri soy gaz olabileceği ihtimali üzerinde durulsa da günümüzde metalik görünümlü reaktif bir katı olduğu öngörülmektedir."
my_question = "Doğal olarak bulunan altı soy gaz nelerdir?"

In [16]:
# Run the QA prediction on the sample pair; predict_answer prints the
# question, predicted answer and context, and returns them as a dict.
result = predict_answer(my_question, my_context)

question: Doğal olarak bulunan altı soy gaz nelerdir?

predicted_answer: Helyum (He), neon (Ne), argon (Ar), kripton (Kr), ksenon (Xe) ve radon (Rn)

context: Soy gaz veya asal gaz, standart şartlar altında her biri, diğer elementlere kıyasla daha düşük kimyasal reaktifliğe sahip, kokusuz, renksiz, tek atomlu gaz olan kimyasal element grubudur. Helyum (He), neon (Ne), argon (Ar), kripton (Kr), ksenon (Xe) ve radon (Rn) doğal olarak bulunan altı soy gazdır ve tamamı ametaldir. Her biri periyodik tablonun sırasıyla ilk altı periyodunda, 18. grubunda (8A) yer alır. Grupta yer alan oganesson (Og) için ise önceleri soy gaz olabileceği ihtimali üzerinde durulsa da günümüzde metalik görünümlü reaktif bir katı olduğu öngörülmektedir.


In [13]:
# Save the complete model to a directory; the log output of this cell
# reports assets being written under that path.
model.save("bert_base_turkish_cased/dbmdz-bert-base-turkish-cased_seqlen512_epochs10")

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: bert_base_turkish_cased/dbmdz-bert-base-turkish-cased_seqlen512_epochs10\assets


Hata için şu GitHub konusuna bakın: [huggingface/transformers#3627](https://github.com/huggingface/transformers/issues/3627)

**TODO:** Modeli tam model dosyası olarak kaydet; yoksa yüklenmesi oldukça uzun sürüyor.
**TODO:** Arka planda çalışan bir servis olsun (Bert, Electra, Albert).



In [14]:
# Reload the model saved to this directory by the model.save(...) cell,
# keeping it under the separate name `model2`.
model2 = keras.models.load_model('bert_base_turkish_cased/dbmdz-bert-base-turkish-cased_seqlen512_epochs10')