# Text Extraction with BERT

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
! pip install transformers
! pip install tokenizers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/81/91/61d69d58a1af1bd81d9ca9d62c90a6de3ab80d77f27c5df65d9a2c1f5626/transformers-4.5.0-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.2MB 6.4MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/08/cd/342e584ee544d044fb573ae697404ce22ede086c9e87ce5960772084cad0/sacremoses-0.0.44.tar.gz (862kB)
[K     |████████████████████████████████| 870kB 25.3MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 46.3MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.44-cp37-none-any.whl size=886084 sha256=99050f451e8

In [3]:
import json
import os
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from transformers import BertTokenizer, TFBertModel, BertConfig
from tokenizers import BertWordPieceTokenizer

In [4]:
class WikiElement:
    def __init__(self, question, context, answer, answer_start, answer_end):
        self.question = question
        self.context = context
        self.answer = answer
        self.answer_start = answer_start
        self.answer_end = answer_end
        
    def preprocess(self):
        # create context vector with answers marked
        context_vector = [0] * len(self.context)
        for index in range(self.answer_start, self.answer_end):
            context_vector[index] = 1
            
        # tokenize context   
        tokenized_context = tokenizer.encode(self.context)
        
        # find answer token indices 
        answer_token_index = []
        for index, (start, end)  in enumerate(tokenized_context.offsets):
            if sum(context_vector[start:end]) > 0: # if token is answer
                answer_token_index.append(index)
        
        if len(answer_token_index) == 0:
            return
        
        # start and end token index
        start_token_index = answer_token_index[0]
        end_token_index = answer_token_index[-1]
        
        # tokenize question
        tokenized_question = tokenizer.encode(self.question)

        # create inputs       
        input_ids = tokenized_context.ids + tokenized_question.ids[1:]
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(tokenized_question.ids[1:])
            
        attention_mask = [1] * len(input_ids)
        
        # padding for equal lenght sequence
        padding_length = max_len - len(input_ids)
        if padding_length > 0: # pad
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length) # len(input) [1] + padding [0]
            token_type_ids = token_type_ids + ([0] * padding_length) # context [0] + question [1] + padding [0]
        elif padding_length < 0:
            return
        
        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask
        self.start_token_index = start_token_index
        self.end_token_index = end_token_index
        self.context_token_to_char = tokenized_context.offsets
            
      
    def class_print(self):
        print("Question: {}\nAnswer: {}\nAnswer Start: {}\nAnswer End: {}\nContext: {}".format(self.question, 
                                                                                              self.answer,  
                                                                                              self.answer_start, 
                                                                                              self.answer_end,
                                                                                              self.context))

In [5]:
def read_json(file_name):
    with open(file_name, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)
    return data

def json_to_list(json_dataset):
    dataset = []
    for paragraph_element in json_dataset["data"]:
        for question_element in paragraph_element["qas"]:
            dataset.append(WikiElement(question_element["question"],
                                       paragraph_element["text"],
                                       question_element["answer"],
                                       question_element["answer_start"],
                                       question_element["answer_end"]))
    print("Number of questions: ", len(dataset))
    return dataset

def create_input_targets(dataset):
    dataset_dict = {
        "input_ids": [],
        "token_type_ids": [],
        "attention_mask": [],
        "start_token_index": [],
        "end_token_index": [],
    }
    
    for item in dataset:
        for key in dataset_dict:
            dataset_dict[key].append(getattr(item, key))
            
    for key in dataset_dict:
        dataset_dict[key] = np.array(dataset_dict[key])
        
    x = [
        dataset_dict["input_ids"],
        dataset_dict["token_type_ids"],
        dataset_dict["attention_mask"],
    ]
    
    y = [dataset_dict["start_token_index"], dataset_dict["end_token_index"]]
    return x, y

def find_max_length(dataset):
    max_ = 0
    index = 0
    i = 0
    for element in dataset:
        tokenized_question = tokenizer.encode(element.question)
        tokenized_context = tokenizer.encode(element.context)
        input_ids = tokenized_context.ids + tokenized_question.ids[1:]
        
        if len(input_ids) > max_:
            max_ = len(input_ids)
            index = i
        i += 1
        
    print("Max length: {}, Index: {}".format(max_, index))
    return max_

def train_test_split(dataset):
    random.shuffle(dataset) 
    cut = int(len(dataset)*0.1)
    train, test = dataset[:-cut], dataset[-cut:] 
    
    return train, test

def create_model():
    ## BERT encoder
    encoder = TFBertModel.from_pretrained("dbmdz/bert-base-turkish-cased")
    
    # QA model
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
    embedding = encoder(input_ids,
                        token_type_ids=token_type_ids, 
                        attention_mask=attention_mask)[0]
    
    start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
    start_logits = layers.Flatten()(start_logits)
    
    end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
    end_logits = layers.Flatten()(end_logits)
    
    start_probs = layers.Activation(keras.activations.softmax)(start_logits)
    end_probs = layers.Activation(keras.activations.softmax)(end_logits)
    
    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[start_probs, end_probs],
    )
    
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    optimizer = keras.optimizers.Adam(lr=5e-5)
    model.compile(optimizer=optimizer, loss=[loss, loss])
    return model

## Load BERT Tokenizer

In [6]:
path = "/content/gdrive/MyDrive/Graduation Thesis/"

In [7]:
slow_tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
save_path = path + "bert_base_turkish_cased/"

if not os.path.exists(save_path):
    os.makedirs(save_path)
    
slow_tokenizer.save_pretrained(save_path)
tokenizer = BertWordPieceTokenizer(path + "bert_base_turkish_cased/vocab.txt", lowercase=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=251003.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=60.0, style=ProgressStyle(description_w…




## Load Dataset

In [8]:
file_path = path + "wiki_dataV1.json"
json_dataset = read_json(file_path)
json_dataset["data"][0]["qas"][1]

{'answer': 'Bulgaristan',
 'answer_end': 196,
 'answer_start': 185,
 'id': 1,
 'question': "Türkiye'nin kuzeybatısındaki komşusu kimdir?"}

In [9]:
raw_dataset = json_to_list(json_dataset)
raw_dataset[0].class_print()

Number of questions:  964
Question: Türkiye'nin topraklarının büyük bölümü nerededir?
Answer: Anadolu
Answer Start: 69
Answer End: 76
Context: Türkiye Cumhuriyeti ya da kısaca Türkiye, topraklarının büyük bölümü Anadolu'da, küçük bir bölümü ise Balkan Yarımadası'nın güneydoğu uzantısı olan Trakya'da yer alan ülke. Kuzeybatıda Bulgaristan, batıda Yunanistan, kuzeydoğuda Gürcistan, doğuda Ermenistan, İran ve Azerbaycan'ın ekslav toprağı Nahçıvan, güneydoğuda ise Irak ve Suriye komşusudur. Güneyini Kıbrıs adası ve Akdeniz. Batısını Ege Denizi ve kuzeyini Karadeniz çevreler. Marmara Denizi ise İstanbul Boğazı ve Çanakkale Boğazı ile birlikte Anadolu'yu Trakya'dan yani Asya'yı Avrupa'dan ayırır. Türkiye, Avrupa ve Asya'nın kavşak noktasında yer alması sayesinde önemli bir jeostratejik güce sahiptir.


In [10]:
max_len = find_max_length(raw_dataset)

dataset = []
for data in raw_dataset:
    data.preprocess()
    dataset.append(data)
    
print("Dataset len: ", len(dataset))

Max length: 954, Index: 705
Dataset len:  964


In [11]:
train, test = train_test_split(dataset)

x_train, y_train = create_input_targets(train)
x_test, y_test = create_input_targets(test)

print(len(x_train[0]), len(x_test[0]))

868 96


In [12]:
configuration = BertConfig()  # default parameters and configuration for BERT

In [13]:
use_tpu = True
if use_tpu:
    # Create distribution strategy
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

    # Create model
    with strategy.scope():
        model = create_model()
else:
    model = create_model()

model.summary()

INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


INFO:tensorflow:Initializing the TPU system: grpc://10.44.212.106:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.44.212.106:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=385.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=545150592.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at dbmdz/bert-base-turkish-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at dbmdz/bert-base-turkish-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported


Cause: while/else statement not yet supported


Cause: while/else statement not yet supported




Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 954)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 954)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 954)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 110617344   input_1[0][0]                    
                                                                 input_3[0][0]                

In [14]:
model.fit(
    x_train,
    y_train,
    epochs=3,  # For demonstration, 3 epochs are recommended
    verbose=2,
    batch_size=64,
)

Epoch 1/3
























14/14 - 140s - loss: 9.9618 - activation_loss: 4.7476 - activation_1_loss: 5.2141
Epoch 2/3
14/14 - 8s - loss: 4.5823 - activation_loss: 2.0470 - activation_1_loss: 2.5353
Epoch 3/3
14/14 - 8s - loss: 3.2116 - activation_loss: 1.4730 - activation_1_loss: 1.7387


<tensorflow.python.keras.callbacks.History at 0x7fd2de5668d0>

In [24]:
pred_start, pred_end = model.predict(x_test)
count = 0

for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
  element = test[idx]
  offsets = element.context_token_to_char
  start = np.argmax(start)
  end = np.argmax(end)

  if start >= len(offsets):
    continue

  pred_char_start = offsets[start][0]

  if end < len(offsets):
    pred_char_end = offsets[end][1]
    pred_ans = element.context[pred_char_start:pred_char_end]
  else:
    pred_ans = element.context[pred_char_start:]
  '''print(f"Question: {element.question}")
  print(f"Prediction: {pred_ans}\nTrue Answer: {element.answer}")
  print(f"Context: {element.context}")
  print("\n")'''
  if pred_ans == element.answer:
    count += 1

acc = count / len(y_test[0])

print(f"accuracy:={acc:.2f}")

accuracy:=0.49


In [None]:
model.save(path + "bert_model_v1")