<a href="https://colab.research.google.com/github/derektorquette/pln-com-deep-learning-ia-expert/blob/main/perguntas_respostas_com_BART.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Etapa 1: Instalação e importação das bibliotecas

- SQUAD Link: https://rajpurkar.github.io/SQuAD-explorer/

In [1]:
#!pip install tf-models-nightly==2.9.0.dev20220528
#!pip install tf-nightly==2.9.0.dev20220201

In [None]:
# Install the package that ships the `official.nlp` helpers imported below.
# NOTE(review): two different tensorflow versions are installed one after the
# other here, so only the last install takes effect, while the recorded output
# of `tf.__version__` in this notebook shows '2.2.0' — confirm which pin is intended.
!pip install tf-models-official==2.2
# !pip install tensorflow==2.2
!pip install tensorflow==2.8.0rc0 # update
!pip install tensorflow==2.16

In [None]:
import tensorflow as tf
tf.__version__

'2.2.0'

In [None]:
import tensorflow_hub as hub
from official.nlp.bert.tokenization import FullTokenizer
from official.nlp.bert.input_pipeline import create_squad_dataset
from official.nlp.data.squad_lib import generate_tf_record_from_json_file
#from official.nlp.bert.data.squad_lib import generate_tf_record_from_json_file
from official.nlp.data.squad_lib import generate_tf_record_from_json_file
from official.nlp import optimization
from official.nlp.data.squad_lib import read_squad_examples
from official.nlp.data.squad_lib import FeatureWriter
from official.nlp.data.squad_lib import convert_examples_to_features
from official.nlp.data.squad_lib import write_predictions

In [None]:
import numpy as np
import math
import random
import time
import json
import collections
import os
from google.colab import drive

# Etapa 2: Pré-processamento

In [None]:
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Convert the SQuAD v1.1 training JSON into a TFRecord file and keep the returned
# metadata: it is saved to disk in the next cell and later read for 'max_seq_length'.
# Positional arguments, in order: input JSON, vocabulary file, output TFRecord path.
input_meta_data = generate_tf_record_from_json_file(
    "/content/drive/My Drive/Cursos - recursos/BERT/train-v1.1.json",
    "/content/drive/My Drive/Cursos - recursos/BERT/vocab.txt",
    "/content/drive/My Drive/Cursos - recursos/BERT/train-v1.1.tf_record"
)

In [None]:
# Persist the preprocessing metadata as pretty-printed JSON terminated by a newline.
meta_data_json = json.dumps(input_meta_data, indent=4)
with tf.io.gfile.GFile("/content/drive/My Drive/Cursos - recursos/BERT/train_meta_data", "w") as writer:
  writer.write(f"{meta_data_json}\n")

In [None]:
# Batch size used when building the training dataset.
BATCH_SIZE = 4
# Build the training dataset from the TFRecord file generated earlier; the
# sequence length comes from the metadata returned by the record generation.
train_dataset = create_squad_dataset(
    "/content/drive/My Drive/Cursos - recursos/BERT/train-v1.1.tf_record",
    input_meta_data['max_seq_length'], # 384
    BATCH_SIZE,
    is_training = True
)

# Etapa 3: Construção do modelo

## Camada Squad

In [None]:
class BertSquardLayer(tf.keras.layers.Layer):
  """Span head: projects every token representation onto two logits.

  The two values per token are returned separately as the start logits
  and the end logits.
  """

  def __init__(self):
    super().__init__()
    initializer = tf.keras.initializers.TruncatedNormal(stddev=0.02)
    self.final_dense = tf.keras.layers.Dense(units=2,
                                             kernel_initializer=initializer)

  def call(self, inputs):
    """Return `(start_logits, end_logits)`, each of shape (batch_size, seq_len)."""
    logits = self.final_dense(inputs)  # (batch_size, seq_len, 2)
    # Splitting along the last axis yields the two (batch_size, seq_len) tensors.
    start_logits, end_logits = tf.unstack(logits, axis=-1)
    return start_logits, end_logits

## Modelo completo

In [None]:
class BERTSquad(tf.keras.Model):
  """Question-answering model: a trainable TF-Hub BERT encoder followed by
  the span head defined in `BertSquardLayer`."""

  def __init__(self, name="bert_squad"):
    super().__init__(name=name)
    self.bert_layer = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
        trainable=True)
    self.squad_layer = BertSquardLayer()

  def apply_bert(self, inputs):
    """Run the encoder on the input dict and return its second output
    (the per-token sequence output)."""
    bert_inputs = [inputs[key] for key in ("input_word_ids",
                                           "input_mask",
                                           "input_type_ids")]
    _pooled_output, sequence_output = self.bert_layer(bert_inputs)
    return sequence_output

  def call(self, inputs):
    """Return `(start_logits, end_logits)` for a batch of encoded inputs."""
    token_states = self.apply_bert(inputs)
    span_start, span_end = self.squad_layer(token_states)
    return span_start, span_end

# Etapa 4: Treinamento

## Criação da IA

In [None]:
# Training hyper-parameters.
TRAIN_DATA_SIZE = 88641  # not referenced by any other visible cell
BATCH_SIZE = 4
NB_EPOCHS = 3
INIT_LR = 5e-5

# Number of batches taken from the training set for each epoch, and the 10%
# share of that number which is handed to the optimizer as warm-up steps.
NB_BATCHES_TRAIN = 2000
WARMUP_STEPS = int(0.1 * NB_BATCHES_TRAIN)

In [None]:
train_dataset_light = train_dataset.take(NB_BATCHES_TRAIN)

In [None]:
bert_squad = BERTSquad()

In [None]:
# The training loop runs NB_EPOCHS passes over NB_BATCHES_TRAIN batches, so the
# optimizer performs NB_BATCHES_TRAIN * NB_EPOCHS updates in total. The learning
# rate schedule must span all of them: with only NB_BATCHES_TRAIN decay steps the
# rate has already decayed to its end value (0 by default) when epoch 1 finishes
# and the remaining epochs no longer change the weights.
optimizer = optimization.create_optimizer(init_lr=INIT_LR,
                                          num_train_steps=NB_BATCHES_TRAIN * NB_EPOCHS,
                                          num_warmup_steps=WARMUP_STEPS)

In [None]:
def squad_loss_fn(labels, model_outputs):
  """Average of the start-position and end-position cross-entropy losses.

  Args:
    labels: mapping with the keys 'start_positions' and 'end_positions'.
    model_outputs: pair `(start_logits, end_logits)` produced by the model.

  Returns:
    Scalar loss: the mean of the two batch-averaged cross-entropies.
  """
  start_positions, end_positions = labels['start_positions'], labels['end_positions']
  start_logits, end_logits = model_outputs

  cross_entropy = tf.keras.backend.sparse_categorical_crossentropy
  start_loss = tf.reduce_mean(cross_entropy(start_positions, start_logits, from_logits=True))
  end_loss = tf.reduce_mean(cross_entropy(end_positions, end_logits, from_logits=True))

  return (start_loss + end_loss) / 2

In [None]:
train_loss = tf.keras.metrics.Mean(name="train_loss")

In [None]:
bert_squad.compile(optimizer, squad_loss_fn)

In [None]:
# Checkpoints live on Drive; only the most recent one is kept.
checkpoint_path = "/content/drive/My Drive/Cursos - recursos/BERT/Q&A/"
ckpt = tf.train.Checkpoint(bert_squad=bert_squad)
ckpt_manager = tf.train.CheckpointManager(checkpoint=ckpt,
                                          directory=checkpoint_path,
                                          max_to_keep=1)
# Resume from the latest checkpoint when one exists in the directory.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

Latest checkpoint restored!!


## Treinamento personalizado

In [None]:
# Custom training loop: one pass over `train_dataset_light` per epoch, with the
# running mean loss reported every 50 batches and a checkpoint every 500.
for epoch in range(NB_EPOCHS):
  epoch_number = epoch + 1
  print(f"Start of epoch {epoch_number}")
  start = time.time()

  # The running mean is restarted so each epoch reports its own average.
  train_loss.reset_states()

  for batch, (inputs, targets) in enumerate(train_dataset_light):
    # Record the forward pass so gradients can be taken w.r.t. the weights.
    with tf.GradientTape() as tape:
      loss = squad_loss_fn(targets, bert_squad(inputs))

    trainable_vars = bert_squad.trainable_variables
    gradients = tape.gradient(loss, trainable_vars)
    optimizer.apply_gradients(zip(gradients, trainable_vars))

    train_loss(loss)

    if batch % 50 == 0:
      print(f"Epoch {epoch_number} Batch {batch} Loss {train_loss.result():.4f}")

    if batch % 500 == 0:
      ckpt_save_path = ckpt_manager.save()
      print(f"Saving checkpoint for epoch {epoch_number} at {ckpt_save_path}")

  print(f"Time taken for 1 epoch: {time.time() - start} secs\n")

Start of epoch 1
Epoch 1 Batch 0 Loss 0.5710
Saving checkpoint for epoch 1 at /content/drive/My Drive/Cursos - recursos/BERT/Q&A/ckpt-25
Epoch 1 Batch 50 Loss 0.9684
Epoch 1 Batch 100 Loss 1.0702
Epoch 1 Batch 150 Loss 1.0109
Epoch 1 Batch 200 Loss 1.0355
Epoch 1 Batch 250 Loss 1.0123
Epoch 1 Batch 300 Loss 0.9947
Epoch 1 Batch 350 Loss 0.9826
Epoch 1 Batch 400 Loss 0.9500
Epoch 1 Batch 450 Loss 0.9253
Epoch 1 Batch 500 Loss 0.9003
Saving checkpoint for epoch 1 at /content/drive/My Drive/Cursos - recursos/BERT/Q&A/ckpt-26
Epoch 1 Batch 550 Loss 0.8746
Epoch 1 Batch 600 Loss 0.8666
Epoch 1 Batch 650 Loss 0.8447
Epoch 1 Batch 700 Loss 0.8243
Epoch 1 Batch 750 Loss 0.8015
Epoch 1 Batch 800 Loss 0.7919
Epoch 1 Batch 850 Loss 0.7923
Epoch 1 Batch 900 Loss 0.7848
Epoch 1 Batch 950 Loss 0.7674
Epoch 1 Batch 1000 Loss 0.7545
Saving checkpoint for epoch 1 at /content/drive/My Drive/Cursos - recursos/BERT/Q&A/ckpt-27
Epoch 1 Batch 1050 Loss 0.7468
Epoch 1 Batch 1100 Loss 0.7327
Epoch 1 Batch 115

# Etapa 5: Avaliação

## Preparação da avaliação

Obter o conjunto dev na sessão

In [None]:
# Read the SQuAD v1.1 dev file as evaluation examples (not in training mode,
# and without the v2-style negative examples).
eval_examples = read_squad_examples(
    "/content/drive/My Drive/Cursos - recursos/BERT/dev-v1.1.json",
    is_training=False,
    version_2_with_negative=False)

Definição da função que gerará o arquivo tf_record

In [None]:
# Writer for the evaluation features file (eval.tf_record). It receives every
# feature through _append_feature and is closed once conversion has finished.
eval_writer = FeatureWriter(
    filename=os.path.join("/content/drive/My Drive/Cursos - recursos/BERT/",
                          "eval.tf_record"),
    is_training=False)

Criação do tokenizador

In [None]:
# Load the pretrained BERT module (frozen) to read the vocabulary file and the
# lower-casing flag it ships with, then build the matching tokenizer.
my_bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=False)
bert_assets = my_bert_layer.resolved_object
vocab_file = bert_assets.vocab_file.asset_path.numpy()
do_lower_case = bert_assets.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

Criação da função que adiciona características na lista eval_features

In [None]:
def _append_feature(feature, is_padding):
    """Output callback used while converting the evaluation examples.

    Every feature is written through `eval_writer`; only features that are
    not padding are additionally collected in `eval_features`.
    """
    is_real_feature = not is_padding
    if is_real_feature:
        eval_features.append(feature)
    eval_writer.process_feature(feature)

Criação das características e geração do arquivo tf_record

In [None]:
# Filled by _append_feature while the examples are being converted.
eval_features = []
# Convert the dev examples into model features; each produced feature is handed
# to _append_feature.
# NOTE(review): max_seq_length=384 and batch_size=4 are hard-coded here and are
# repeated when the evaluation dataset is built — keep the values in sync.
dataset_size = convert_examples_to_features(
    examples=eval_examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
    output_fn=_append_feature,
    batch_size=4)

In [None]:
eval_writer.close()

Carregamento da base de dados

In [None]:
# Batch size used for evaluation.
BATCH_SIZE = 4

# Build the evaluation dataset from the eval.tf_record file; the sequence
# length is given as a literal instead of being read from input_meta_data.
eval_dataset = create_squad_dataset(
    "/content/drive/My Drive/Cursos - recursos/BERT/eval.tf_record",
    384, # input_meta_data['max_seq_length'],
    BATCH_SIZE,
    is_training=False)

## Fazendo as previsões com as funções do Google

In [None]:
# Per-feature model output: the feature's unique id plus its start and end logits.
RawResult = collections.namedtuple(
    "RawResult", "unique_id start_logits end_logits")

In [None]:
def get_raw_results(predictions):
  """Yield one `RawResult` per example of a prediction batch.

  Args:
    predictions: mapping with the keys 'unique_ids', 'start_logits' and
      'end_logits'; the three values are iterated in parallel.

  Yields:
    RawResult holding the id converted with `.numpy()` and both logit rows
    converted to plain Python lists.
  """
  columns = (predictions['unique_ids'],
             predictions['start_logits'],
             predictions['end_logits'])
  for example_id, start_row, end_row in zip(*columns):
    yield RawResult(unique_id=example_id.numpy(),
                    start_logits=start_row.numpy().tolist(),
                    end_logits=end_row.numpy().tolist())

In [None]:
# Run the model over the evaluation dataset and collect one RawResult per feature.
all_results = []
for count, inputs in enumerate(eval_dataset):
  x, _ = inputs
  # The ids are not a model input: take them out of the feature dict and keep
  # them to pair every prediction with its feature.
  unique_ids = x.pop("unique_ids")
  start_logits, end_logits = bert_squad(x, training=False)
  output_dict = dict(
      unique_ids=unique_ids,
      start_logits=start_logits,
      end_logits=end_logits
  )
  all_results.extend(get_raw_results(output_dict))

  # Progress is reported every 100 batches only; printing every batch index
  # floods the cell output.
  # NOTE(review): the total of 2709 is hard-coded — presumably the number of
  # evaluation batches for dev-v1.1 at this batch size; confirm.
  if count % 100 == 0:
    print("{}/{}".format(count, 2709))

In [None]:
len(all_results)

In [None]:
all_results[0]

In [None]:
output_prediction_file = "/content/drive/My Drive/Cursos - recursos/BERT/predictions.json"
output_nbest_file = "/content/drive/My Drive/Cursos - recursos/BERT/nbest_predictions.json"
output_null_log_odds_file = "/content/drive/My Drive/Cursos - recursos/BERT/null_odds.json"

In [None]:
# Turn the collected raw results into answers and write the three output files
# defined above (predictions, n-best predictions, null log-odds).
# NOTE(review): the unnamed positional values are presumably n_best_size=20,
# max_answer_length=30 and do_lower_case=True — confirm against the signature
# of squad_lib.write_predictions.
write_predictions(
    eval_examples,
    eval_features,
    all_results,
    20,
    30,
    True,
    output_prediction_file,
    output_nbest_file,
    output_null_log_odds_file,
    verbose=False
)

## Fazendo previsões personalizadas

### Criação do dicionário de inputs

In [None]:
# Build the tokenizer from the vocabulary file and lower-casing flag bundled
# with the TF-Hub BERT module.
# NOTE(review): this repeats the tokenizer creation from the evaluation section
# line for line — presumably so this section can be run on its own; confirm
# before removing either copy.
my_bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=False)
vocab_file = my_bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = my_bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [None]:
def is_whitespace(c):
  """Return True when `c` is a space, tab, carriage return, newline or the
  character with code point 0x202F; False for any other single character."""
  return c in (" ", "\t", "\r", "\n") or ord(c) == 0x202F

In [None]:
is_whitespace("a")

In [None]:
is_whitespace("\n")

In [None]:
is_whitespace(" ")

In [None]:
def whitespace_split(text):
  """Split `text` into words on the characters accepted by `is_whitespace`.

  Consecutive whitespace characters act as a single separator, and leading or
  trailing whitespace produces no empty words.

  Returns:
    List of the words in their original order.
  """
  words = []
  current_word = []
  for char in text:
    if is_whitespace(char):
      # A separator closes the word being built, if there is one.
      if current_word:
        words.append("".join(current_word))
        current_word = []
    else:
      current_word.append(char)
  # Flush the last word when the text does not end with whitespace.
  if current_word:
    words.append("".join(current_word))
  return words

In [None]:
whitespace_split("My dog likes strawberries.")

In [None]:
tokenizer.tokenize("My")

In [None]:
t = tokenizer.tokenize("strawberries")
t

In [None]:
len(t)

In [None]:
[1] * len(t)

In [None]:
def tokenize_context(text_words):
  """Tokenize each word with the module-level `tokenizer`.

  Args:
    text_words: sequence of words.

  Returns:
    A pair `(tokens, token_word_ids)`: the tokens of all words concatenated in
    order, and, for each token, the index of the word it was produced from.
  """
  tokens = []
  token_word_ids = []
  for word_index, word in enumerate(text_words):
    pieces = tokenizer.tokenize(word)
    tokens.extend(pieces)
    # Every piece of this word points back to the same word index.
    token_word_ids.extend([word_index] * len(pieces))
  return tokens, token_word_ids

In [None]:
tokenize_context(whitespace_split("My dog likes strawberries."))

In [None]:
def get_ids(tokens):
    """Map tokens to their ids using the module-level `tokenizer`."""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return token_ids

def get_mask(tokens):
    """Return an integer array holding 0 where the token is "[PAD]" and 1 elsewhere."""
    is_real_token = np.char.not_equal(tokens, "[PAD]")
    return is_real_token.astype(int)

def get_segments(tokens):
    """Return one segment id (0 or 1) per token.

    Ids start at 0 and switch to the other value after every "[SEP]" token;
    the "[SEP]" itself still receives the id of the segment it closes.
    """
    segment_ids = []
    segment = 0
    for token in tokens:
        segment_ids.append(segment)
        segment = 1 - segment if token == "[SEP]" else segment
    return segment_ids

In [None]:
my_context = '''Neoclassical economics views inequalities in the distribution of income as arising from differences in value added by labor, capital and land. Within labor income distribution is due to differences in value added by different classifications of workers. In this perspective, wages and profits are determined by the marginal value added of each economic actor (worker, capitalist/business owner, landlord). Thus, in a market economy, inequality is a reflection of the productivity gap between highly-paid professions and lower-paid professions.'''

Neoclassical economics views inequalities in the distribution of income as arising from differences in value added by labor, capital and land. Within labor income distribution is due to differences in value added by different classifications of workers. In this perspective, wages and profits are determined by the marginal value added of each economic actor (worker, capitalist/business owner, landlord). Thus, in a market economy, inequality is a reflection of the productivity gap between highly-paid professions and lower-paid professions.

In [None]:
my_question = '''What are examples of economic actors?'''
#my_question = '''What philosophy of thought addresses wealth inequality?'''
#my_question = '''In a market economy, what is inequality a reflection of?'''

In [None]:
def create_input_dict(question, context, max_seq_length=384):
  """Encode a question/context pair as a single-example batch for the model.

  The token layout is `[CLS] question [SEP] context [SEP]`, followed by
  "[PAD]" tokens up to `max_seq_length`.

  Args:
    question: question text.
    context: passage in which the answer is searched.
    max_seq_length: length the token sequence is padded to. A longer sequence
      is neither padded nor truncated.

  Returns:
    A tuple `(input_dict, context_words, context_tok_to_word_id, question_len)`:
      input_dict: int32 tensors of shape (1, sequence_length) under the keys
        "input_word_ids", "input_mask" and "input_type_ids".
      context_words: the context split on whitespace.
      context_tok_to_word_id: for each context token, the index of its word
        in `context_words`.
      question_len: number of tokens before the first "[SEP]" (the leading
        "[CLS]" included), so the context tokens start at `question_len + 1`.
  """
  # Tokenize the `question` argument (not a notebook-level variable). The
  # leading "[CLS]" matches the layout of the features the model was
  # fine-tuned on, which start with that token.
  question_tok = ["[CLS]"] + tokenizer.tokenize(question)

  context_words = whitespace_split(context)
  context_tok, context_tok_to_word_id = tokenize_context(context_words)

  input_tok = question_tok + ["[SEP]"] + context_tok + ["[SEP]"]
  # A negative count yields an empty list, so nothing is added when the
  # sequence is already longer than max_seq_length.
  input_tok += ["[PAD]"] * (max_seq_length - len(input_tok))

  # Each feature gets a leading batch dimension of size 1.
  input_dict = {
      "input_word_ids": tf.expand_dims(tf.cast(get_ids(input_tok), tf.int32), 0),
      "input_mask": tf.expand_dims(tf.cast(get_mask(input_tok), tf.int32), 0),
      "input_type_ids": tf.expand_dims(tf.cast(get_segments(input_tok), tf.int32), 0),
  }

  return input_dict, context_words, context_tok_to_word_id, len(question_tok)

In [None]:
my_input_dict, my_context_words, context_tok_to_word_id, question_tok_len = create_input_dict(my_question, my_context)

In [None]:
my_input_dict["input_word_ids"]

In [None]:
my_input_dict["input_mask"]

In [None]:
my_input_dict["input_type_ids"]

In [None]:
print(my_context_words)

In [None]:
print(context_tok_to_word_id)

In [None]:
question_tok_len

### Previsões

In [None]:
start_logits, end_logits = bert_squad(my_input_dict, training = False)

In [None]:
start_logits[0]

In [None]:
question_tok_len

In [None]:
# Keep only the logits of the context tokens: skip the question tokens and the
# first [SEP], and stop before the final [SEP] and the [PAD] tail. An argmax
# over this slice is then always a valid index into context_tok_to_word_id.
start_logits_context = start_logits.numpy()[
    0, question_tok_len + 1:question_tok_len + 1 + len(context_tok_to_word_id)]

In [None]:
start_logits_context

In [None]:
# Keep only the logits of the context tokens: skip the question tokens and the
# first [SEP], and stop before the final [SEP] and the [PAD] tail. An argmax
# over this slice is then always a valid index into context_tok_to_word_id.
end_logits_context = end_logits.numpy()[
    0, question_tok_len + 1:question_tok_len + 1 + len(context_tok_to_word_id)]

In [None]:
print(context_tok_to_word_id)

In [None]:
start_word_id = context_tok_to_word_id[np.argmax(start_logits_context)]
start_word_id

In [None]:
end_word_id = context_tok_to_word_id[np.argmax(end_logits_context)]
end_word_id

### Resposta final

In [None]:
# The answer is the run of context words from the predicted start word to the
# predicted end word (inclusive), joined with single spaces.
answer_words = my_context_words[start_word_id:end_word_id + 1]
predicted_answer = ' '.join(answer_words)
print(f"The answer to:\n{my_question}\nis:\n{predicted_answer}")

In [None]:
# Show the question as a heading and the context as a block quote with the
# predicted answer highlighted. Both names come from the public IPython.display
# module, and `display` is imported explicitly instead of relying on the
# notebook injecting it.
from IPython.display import HTML, display

display(HTML(f'<h2>{my_question.upper()}</h2>'))
# Only highlight a non-empty answer: replacing the empty string would insert
# a <mark></mark> pair between every character of the context.
if predicted_answer:
  marked_text = my_context.replace(predicted_answer, f"<mark>{predicted_answer}</mark>")
else:
  marked_text = my_context
display(HTML(f"""<blockquote> {marked_text} </blockquote>"""))