# Text Extraction with Electra

> Fine-tuning a Turkish ELECTRA model for extractive question answering (answer-span extraction).



Resource: [Text Extraction with BERT](https://keras.io/examples/nlp/text_extraction_with_bert/) (Keras example)

In [5]:
# Colab only: mount Google Drive under /content/gdrive so files stored there can be
# read and written through the normal file system.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [6]:
! pip install transformers
! pip install tokenizers

Collecting transformers
  Using cached https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl
Collecting huggingface-hub==0.0.8
  Using cached https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
  Using cached https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 6.8MB/s 
Installing collected packages: huggingface-hub, tokenizers, sacremoses, transformers
Successfully in

In [7]:
import json
import os
import random
import numpy as np
import collections
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import transformers
from transformers import TFElectraModel, ElectraTokenizerFast

In [8]:
def tokens_to_offsets(original_text, tokenized_text):
  """Rebuild a (start, end) character span for every token of an encoding.

  The spans are reconstructed by walking through the tokens in order and
  advancing a cursor by each token's length, skipping the spaces that follow
  a token in `original_text`.

  Args:
    original_text: the string that was tokenized.
    tokenized_text: mapping with "input_ids" and "special_tokens_mask"
      (one mask entry per id, 1 for special tokens).

  Returns:
    A list with one (start, end) tuple per token; special tokens get (0, 0).

  Relies on the notebook-level `tokenizer` for `convert_ids_to_tokens`.
  NOTE(review): tokens whose text differs from the source characters (for
  example an unknown-token placeholder) still shift the following spans.
  """
  offsets = []
  special_tokens_mask = tokenized_text['special_tokens_mask']
  tokens = tokenizer.convert_ids_to_tokens(tokenized_text["input_ids"])
  text_length = len(original_text)
  index = 0
  for token, is_special in zip(tokens, special_tokens_mask):
    if is_special == 1:
      offsets.append((0, 0))
      continue

    # WordPiece marks word-internal pieces with a leading "##". The marker is not
    # part of the original text, so it must not be counted as characters.
    if token.startswith("##") and len(token) > 2:
      token = token[2:]

    new_index = index + len(token)
    offsets.append((index, new_index))

    # Move the cursor past the spaces separating this token from the next one.
    while new_index < text_length and original_text[new_index] == ' ':
      new_index += 1
    index = new_index
  return offsets

In [9]:
class WikiElement:
    """A single question/answer pair together with the paragraph it refers to.

    Set by the constructor:
        question, context, answer: the raw strings.
        answer_start, answer_end: character span of the answer inside
            `context`, used as the half-open range [answer_start, answer_end).

    Set by a successful `preprocess()`:
        input_ids, token_type_ids, attention_mask: lists of length `max_len`.
        start_token_index, end_token_index: first and last context token that
            overlaps the answer span.
        context_token_to_char: the tokenizer's offset mapping for the context.
    """

    def __init__(self, question, context, answer, answer_start, answer_end):
        self.question = question
        self.context = context
        self.answer = answer
        self.answer_start = answer_start
        self.answer_end = answer_end

    def preprocess(self):
        """Tokenize the element and build fixed-length model inputs.

        Uses the notebook-level `tokenizer` and `max_len`.

        Returns:
            1 when the inputs were built and stored on the instance, 0 when the
            element must be dropped (no context token overlaps the answer span,
            or context + question is longer than `max_len`).
        """
        # tokenize context
        tokenized_context = tokenizer(self.context, return_offsets_mapping=True)
        context_offsets = tokenized_context['offset_mapping']

        # Clamp the answer span to the context instead of marking characters one by
        # one: a span that runs past the end of the context used to raise IndexError
        # and a negative start silently wrapped around to the end of the text.
        span_start = max(self.answer_start, 0)
        span_end = min(self.answer_end, len(self.context))

        # find answer token indices: a token belongs to the answer when its
        # character range overlaps the span (empty (0, 0) ranges never overlap)
        answer_token_index = [
            index
            for index, (start, end) in enumerate(context_offsets)
            if max(start, span_start) < min(end, span_end)
        ]

        if len(answer_token_index) == 0:
            return 0

        # start and end token index
        start_token_index = answer_token_index[0]
        end_token_index = answer_token_index[-1]

        # tokenize question
        tokenized_question = tokenizer(self.question, return_special_tokens_mask=True)

        # create inputs: full context encoding followed by the question encoding
        # without its first token
        question_ids = tokenized_question['input_ids'][1:]
        input_ids = tokenized_context['input_ids'] + question_ids
        token_type_ids = [0] * len(tokenized_context['input_ids']) + [1] * len(question_ids)

        attention_mask = [1] * len(input_ids)

        # padding for equal length sequences
        padding_length = max_len - len(input_ids)
        if padding_length > 0: # pad
            # assumes id 0 is the tokenizer's padding token — TODO confirm
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length) # len(input) [1] + padding [0]
            token_type_ids = token_type_ids + ([0] * padding_length) # context [0] + question [1] + padding [0]
        elif padding_length < 0:
            # too long for the model input; the caller drops the element
            return 0

        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask
        self.start_token_index = start_token_index
        self.end_token_index = end_token_index
        self.context_token_to_char = context_offsets
        return 1

    def class_print(self):
        """Print the question, answer, answer span and context of this element."""
        print("Question: {}\nAnswer: {}\nAnswer Start: {}\nAnswer End: {}\nContext: {}".format(self.question,
                                                                                              self.answer,
                                                                                              self.answer_start,
                                                                                              self.answer_end,
                                                                                              self.context))

In [20]:
def read_json(file_name):
    """Return the parsed content of the UTF-8 encoded JSON file `file_name`."""
    with open(file_name, "r", encoding="utf-8") as json_file:
        raw_text = json_file.read()
    return json.loads(raw_text)

def json_to_list(json_dataset):
    """Flatten the nested dataset into one WikiElement per question.

    Every entry of json_dataset["data"] contributes its "text" as the context
    of each item listed under its "qas" key. The number of elements created is
    printed before the list is returned.
    """
    elements = [
        WikiElement(
            qa["question"],
            paragraph["text"],
            qa["answer"],
            qa["answer_start"],
            qa["answer_end"],
        )
        for paragraph in json_dataset["data"]
        for qa in paragraph["qas"]
    ]
    print("Number of questions: ", len(elements))
    return elements

def create_input_targets(dataset):
    """Stack the per-element attributes of `dataset` into model inputs and targets.

    Returns:
        x: [input_ids, token_type_ids, attention_mask], each a NumPy array with
           one row per element.
        y: [start_token_index, end_token_index], each a NumPy array with one
           entry per element.
    """
    input_keys = ("input_ids", "token_type_ids", "attention_mask")
    target_keys = ("start_token_index", "end_token_index")

    # one array per attribute, built from the value stored on every element
    stacked = {
        key: np.array([getattr(item, key) for item in dataset])
        for key in input_keys + target_keys
    }

    x = [stacked[key] for key in input_keys]
    y = [stacked[key] for key in target_keys]
    return x, y

def find_max_length(dataset):
    """Return the longest combined token count over all elements of `dataset`.

    For each element the length is the context encoding plus the question
    encoding without its first token. The maximum and the index of the first
    element reaching it are printed. Uses the notebook-level `tokenizer`.
    """
    longest, longest_at = 0, 0
    for position, element in enumerate(dataset):
        question_ids = tokenizer(element.question)['input_ids']
        context_ids = tokenizer(element.context)['input_ids']
        combined = context_ids + question_ids[1:]

        # strict comparison keeps the first element in case of ties
        if len(combined) > longest:
            longest, longest_at = len(combined), position

    print("Max length: {}, Index: {}".format(longest, longest_at))
    return longest

def train_test_split(dataset, test_fraction=0.1):
    """Shuffle `dataset` in place and split it into a train and a test list.

    Args:
        dataset: list of elements; its order is changed by the shuffle.
        test_fraction: share of the elements that goes to the test list
            (rounded down). Defaults to 0.1.

    Returns:
        (train, test): two new lists that together contain every element once.
    """
    random.shuffle(dataset)
    cut = int(len(dataset) * test_fraction)
    # Split on a positive index. Slicing with -cut breaks when cut == 0, because
    # dataset[:-0] is empty and dataset[-0:] is the whole list, which would leave
    # the train split empty for small datasets.
    split_at = len(dataset) - cut
    train, test = dataset[:split_at], dataset[split_at:]

    return train, test

def create_model():
    """Build and compile the span-extraction model on top of the ELECTRA encoder.

    Uses the notebook-level `MODEL_NAME` (pretrained checkpoint) and `max_len`
    (fixed input length).

    Returns:
        A compiled keras.Model taking [input_ids, token_type_ids, attention_mask]
        and returning [start_probs, end_probs], each a softmax over the
        `max_len` token positions.
    """
    ## Electra encoder
    encoder = TFElectraModel.from_pretrained(MODEL_NAME)

    # QA model
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
    # first element of the encoder output; presumably the per-token hidden
    # states — TODO confirm against the transformers version in use
    embedding = encoder.electra(input_ids,
                        token_type_ids=token_type_ids,
                        attention_mask=attention_mask)[0]

    # one score per token for "answer starts here" ...
    start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
    start_logits = layers.Flatten()(start_logits)

    # ... and one score per token for "answer ends here"
    end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
    end_logits = layers.Flatten()(end_logits)

    start_probs = layers.Activation(keras.activations.softmax)(start_logits)
    end_probs = layers.Activation(keras.activations.softmax)(end_logits)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[start_probs, end_probs],
    )

    # the outputs are already probabilities, hence from_logits=False; the targets
    # are integer token positions
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    # `learning_rate` is the documented argument name; `lr` is a deprecated alias
    optimizer = keras.optimizers.Adam(learning_rate=5e-5)
    model.compile(optimizer=optimizer, loss=[loss, loss])
    return model

In [11]:
import pickle

def save_data_as_file(data, file_name):
  """Pickle `data` into "<path><file_name>.dat".

  `path` is the notebook-level base directory and must be defined before the call.
  """
  target = path + file_name + ".dat"
  with open(target, "wb") as out_file:
    pickle.dump(data, out_file)

def read_saved_data(file_name):
  """Return the object unpickled from "<path><file_name>.dat".

  `path` is the notebook-level base directory. Unpickling can execute arbitrary
  code, so only files written by this notebook should be read this way.
  """
  source = path + file_name + ".dat"
  with open(source, "rb") as in_file:
    return pickle.load(in_file)


## Load Tokenizer

In [12]:
# Base directory of the project on the mounted Google Drive.
path = "/content/gdrive/MyDrive/Q&A projesi/"
models_path = f"{path}models/"

# Hugging Face hub id of the pretrained checkpoint.
MODEL_NAME = "dbmdz/electra-base-turkish-cased-discriminator"

# "<organisation>-<model>/": the hub id with its slash replaced, used as a folder name.
model_save_name = "-".join(MODEL_NAME.split("/")[:2]) + "/"
save_path = f"{models_path}{model_save_name}"

In [13]:
# Load the pretrained tokenizer; the checkpoint is cased, so lower-casing is disabled.
tokenizer = ElectraTokenizerFast.from_pretrained(MODEL_NAME, do_lower_case=False)

# exist_ok=True makes the call a no-op when the folder is already there, which
# replaces the separate exists() check and keeps the cell safe to re-run.
os.makedirs(save_path, exist_ok=True)

# Keep a copy of the tokenizer files next to the model artefacts; the returned
# file paths are displayed as the cell output.
tokenizer.save_pretrained(save_path)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=251003.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=83.0, style=ProgressStyle(description_w…




('/content/gdrive/MyDrive/Q&A projesi/models/dbmdz-electra-base-turkish-cased-discriminator/tokenizer_config.json',
 '/content/gdrive/MyDrive/Q&A projesi/models/dbmdz-electra-base-turkish-cased-discriminator/special_tokens_map.json',
 '/content/gdrive/MyDrive/Q&A projesi/models/dbmdz-electra-base-turkish-cased-discriminator/vocab.txt',
 '/content/gdrive/MyDrive/Q&A projesi/models/dbmdz-electra-base-turkish-cased-discriminator/added_tokens.json',
 '/content/gdrive/MyDrive/Q&A projesi/models/dbmdz-electra-base-turkish-cased-discriminator/tokenizer.json')

## Load Dataset

In [None]:
# Read the question-answering dataset (JSON) from the project folder.
file_path = path + "json_dataset/Wiki_Dataset_Final.json"
json_dataset = read_json(file_path)
# Display one raw question entry of the first paragraph to inspect its fields.
json_dataset["data"][0]["qas"][1]

{'answer': 'Bulgaristan',
 'answer_end': 196,
 'answer_start': 185,
 'id': 1,
 'question': "Türkiye'nin kuzeybatısındaki komşusu kimdir?"}

In [None]:
# Turn the nested JSON into a flat list with one WikiElement per question,
# then print the first element as a sanity check.
raw_dataset = json_to_list(json_dataset)
raw_dataset[0].class_print()

Number of questions:  5025
Question: Türkiye'nin topraklarının büyük bölümü nerededir?
Answer: Anadolu
Answer Start: 69
Answer End: 76
Context: Türkiye Cumhuriyeti ya da kısaca Türkiye, topraklarının büyük bölümü Anadolu'da, küçük bir bölümü ise Balkan Yarımadası'nın güneydoğu uzantısı olan Trakya'da yer alan ülke. Kuzeybatıda Bulgaristan, batıda Yunanistan, kuzeydoğuda Gürcistan, doğuda Ermenistan, İran ve Azerbaycan'ın ekslav toprağı Nahçıvan, güneydoğuda ise Irak ve Suriye komşusudur. Güneyini Kıbrıs adası ve Akdeniz. Batısını Ege Denizi ve kuzeyini Karadeniz çevreler. Marmara Denizi ise İstanbul Boğazı ve Çanakkale Boğazı ile birlikte Anadolu'yu Trakya'dan yani Asya'yı Avrupa'dan ayırır. Türkiye, Avrupa ve Asya'nın kavşak noktasında yer alması sayesinde önemli bir jeostratejik güce sahiptir.


In [None]:
# Fixed sequence length; preprocess() pads shorter inputs up to it and rejects longer ones.
max_len = 384

# Keep only the elements whose preprocessing succeeded (preprocess() returned 1).
dataset = [element for element in raw_dataset if element.preprocess() == 1]

print("Dataset len: ", len(dataset))

Dataset len:  4154


In [None]:
# Split the preprocessed elements into train and test lists.
# NOTE(review): train_test_split shuffles `dataset` in place and no random seed is
# set in this notebook, so every run produces a different split.
train, test = train_test_split(dataset)

# Convert both splits into the arrays consumed by the model.
x_train, y_train = create_input_targets(train)
x_test, y_test = create_input_targets(test)

# Sanity check: number of examples in the inputs and targets of each split.
print(len(x_train[0]), len(y_train[0]), len(x_test[0]), len(y_test[0]))

3739 3739 415 415


In [None]:
# Cache both splits on Drive. The sequence length in the file names is taken from
# max_len (instead of a hard-coded 384) so the names follow the same
# "<split>_<max_len>_electra" pattern that is used when the splits are read back.
save_data_as_file(test, "test_" + str(max_len) + "_electra")
save_data_as_file(train, "train_" + str(max_len) + "_electra")

## Training

In [14]:
# Sequence length of the cached splits to train on; it selects the files
# "train_<max_len>_electra.dat" and "test_<max_len>_electra.dat".
# NOTE(review): the preprocessing cell earlier in this notebook sets max_len = 384,
# while this cell switches to 512 and reads files for that length. On a fresh
# top-to-bottom run those files only exist if they were produced beforehand
# with max_len = 512 — confirm which length is intended.
max_len = 512
train = read_saved_data("train_" + str(max_len) + "_electra")
test = read_saved_data("test_" + str(max_len) + "_electra")

In [24]:
# Rebuild the model inputs and targets from the splits loaded from disk.
x_train, y_train = create_input_targets(train)
x_test, y_test = create_input_targets(test)

# Sanity check: number of examples in the inputs and targets of each split.
print(len(x_train[0]), len(y_train[0]), len(x_test[0]), len(y_test[0]))

4266 4266 474 474


In [15]:
# NOTE(review): `configuration` is not referenced by any other cell shown in this
# notebook; the model is built from MODEL_NAME via from_pretrained instead.
configuration = transformers.ElectraConfig()  # default parameters and configuration for ELECTRA

In [None]:
# Set to False to build the model without a TPU distribution strategy.
use_tpu = True
if use_tpu:
    # Create distribution strategy
    # NOTE(review): tf.distribute.experimental.TPUStrategy is reported as deprecated
    # in newer TensorFlow releases in favour of tf.distribute.TPUStrategy — confirm
    # against the installed version.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

    # Create model
    # The model is built and compiled inside the strategy scope.
    with strategy.scope():
        model = create_model()
else:
    model = create_model()

model.summary()

In [33]:
# Load Weights from Drive
# (optional: uncomment to continue from previously saved weights instead of the
# pretrained checkpoint)
# model.load_weights(path + "models/bertV1_weights.h5")

# Fine-tune on the training split. y_train is [start indices, end indices], one
# target array per model output.
model.fit(
    x_train,
    y_train,
    epochs=3,  # For demonstration, 3 epochs are recommended
    verbose=2,
    batch_size=64,
)

Epoch 1/3
67/67 - 16s - loss: 0.0358 - activation_2_loss: 0.0163 - activation_3_loss: 0.0195
Epoch 2/3
67/67 - 16s - loss: 0.0325 - activation_2_loss: 0.0151 - activation_3_loss: 0.0174
Epoch 3/3
67/67 - 16s - loss: 0.0397 - activation_2_loss: 0.0166 - activation_3_loss: 0.0231


<tensorflow.python.keras.callbacks.History at 0x7f8fed595910>

In [34]:
# Evaluate on the test split: exact match and token-level F1 between the predicted
# answer text and the reference answer.
pred_start, pred_end = model.predict(x_test)
count = 0        # number of predictions equal to the reference answer
results = []     # one dict per evaluated example, used for inspection/export
total_f1 = 0     # running sum of the per-example F1 scores
for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
  element = test[idx]
  # character offsets of the context tokens, stored by WikiElement.preprocess()
  offsets = element.context_token_to_char
  # most probable start / end token position
  start = np.argmax(start)
  end = np.argmax(end)

  # The offsets only cover the context tokens. A start position beyond them cannot
  # be mapped back to the context, so the example is skipped; it still counts in
  # the denominators below and therefore scores 0 on both metrics.
  if start >= len(offsets):
    continue

  pred_char_start = offsets[start][0]

  if end < len(offsets):
    pred_char_end = offsets[end][1]
    pred_ans = element.context[pred_char_start:pred_char_end]
  else:
    # end position lies beyond the context tokens: take the rest of the context
    pred_ans = element.context[pred_char_start:]

  # Token-level F1 on whitespace-separated tokens; the Counter intersection keeps
  # the number of tokens shared by prediction and reference (with multiplicity).
  pred_tokens = pred_ans.split()
  true_tokens = element.answer.split()
  common = collections.Counter(true_tokens) & collections.Counter(pred_tokens)
  num_same = sum(common.values())

  if len(true_tokens) == 0 or len(pred_tokens) == 0:
    # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
    f1 =  int(true_tokens == pred_tokens)
  elif num_same == 0:
    f1 =  0
  else:
    precision = 1.0 * num_same / len(pred_tokens)
    recall = 1.0 * num_same / len(true_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
  total_f1 += f1

  results.append({
      "question": element.question,
      "true answer": element.answer,
      "predicted answer": pred_ans,
      "context": element.context,
      "f1 score": f1,
  })

  # print(f"Question: {element.question}")
  # print(f"Prediction: {pred_ans}\nTrue Answer: {element.answer}")
  # print(f"Context: {element.context}")
  # print("\n")
  # exact match: the predicted text equals the reference answer string
  if pred_ans == element.answer:
    count += 1

# Both metrics are averaged over the full test split size.
acc = count / len(y_test[0])
F1 = total_f1 / len(y_test[0])

print(f"exact match:={acc:.2f} f1:={F1:.2f}")

exact match:=0.64 f1:=0.81


In [None]:
# Display the question of the first evaluated example.
results[0]["question"]

'Başlıca İslam mezhepleri nelerdir?'

In [None]:
# Write the evaluation results to Drive, one result dict per line.
results_dir = save_path + "test-results/"
# open() does not create missing folders, so make sure the target exists first.
os.makedirs(results_dir, exist_ok=True)
# NOTE(review): the file name says "10epoch" while the training cell runs 3 epochs —
# confirm the name matches the run being exported.
results_file = results_dir + model_save_name.split("/")[0] + "_10epoch_result.txt"
# The results contain Turkish text; write them as UTF-8 explicitly (the dataset is
# read with the same encoding) instead of relying on the platform default.
with open(results_file, "w", encoding="utf-8") as f:
  for result in results:
    f.write('%s\n' %result)

### Save Weights to Google Drive

In [35]:
# Store the fine-tuned weights on Drive. The target folder is created first in case
# the weight writer does not create missing folders itself.
# NOTE(review): the file name mentions 15 epochs while the training cell runs 3 —
# confirm the name matches the run being saved.
weights_dir = save_path + "weights/"
os.makedirs(weights_dir, exist_ok=True)
model.save_weights(weights_dir + model_save_name.split("/")[0] + "_seqlen512_bacth64_epochs15_weights.h5")