## Get the answer by google search

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -qq transformers
!pip install -qq nlp

[K     |████████████████████████████████| 2.9 MB 5.4 MB/s 
[K     |████████████████████████████████| 3.3 MB 36.0 MB/s 
[K     |████████████████████████████████| 895 kB 36.7 MB/s 
[K     |████████████████████████████████| 636 kB 47.5 MB/s 
[K     |████████████████████████████████| 56 kB 4.5 MB/s 
[K     |████████████████████████████████| 1.7 MB 5.4 MB/s 
[K     |████████████████████████████████| 243 kB 42.6 MB/s 
[?25h

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import transformers
from tqdm import tqdm
import tensorflow.keras.backend as K
import tensorflow as tf
from keras.layers import Lambda
import nlp
import keras
import string
import math
import random

eli5 = nlp.load_dataset('eli5')

Downloading:   0%|          | 0.00/17.8k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

Downloading and preparing dataset eli5/LFQA_reddit (download: 6.03 MiB, generated: 1.26 GiB, post-processed: Unknown sizetotal: 1.26 GiB) to /root/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/58e61a99404336f0891b4457a02232489b50131bdca9c1691054aeee2f6f1a6e...


Downloading:   0%|          | 0.00/3.50k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/576M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/21.1M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/286M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.65M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/330M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/18.7M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/36.2M [00:00<?, ?B/s]

Dataset eli5 downloaded and prepared to /root/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/58e61a99404336f0891b4457a02232489b50131bdca9c1691054aeee2f6f1a6e. Subsequent calls will reuse this data.


#### Handle Eli5 dataset

> After observing: There are some answers are not relevant to the questions and there are some questions are too short (one or two words)

> I just get the questions which have more than two words
> Just get maximum 2 answers with highest score each question.
> Just get answer which more than 6 words

> After cleaning and check validation

- Number of all questions: 263186
- Number of all answes: 425285
- Number of training questions: 688469
- Number of training answers: 688469

> Too large to train. It takes 6-7 hours per epoch

> I cut off to 200,000 questions by random sampling

In [19]:
def is_valid_question(text):
  if len(text.split()) > 3:
    return True  
  if len(text.split()) == 3:
    if ('?' in text) or (text[0].lower() in ['w', 'h', 'i', 'a', 'd']):
      return True
  return False

def is_valid_answer(text):
  if len(text.split()) > 8:
    return True
  return False

def clean_text(text):
  text = text.replace("\n","")
  text = ' '.join([x for x in text.split() if x != "" and x not in string.punctuation and "URL" not in x and "@" not in x and "www" not in x])
  return text

In [28]:
def create_eli5_q_a_dict(data, n_samples = None):
  list_dict = []

  for idx in tqdm(range(len(data))):
    item = data[idx]
    q_a = {}
    
    q_a['question'] = clean_text(item['title'])
    if is_valid_question(q_a['question']) == False:
      continue
    
    q_a['answers'] = []

    for ans in item['answers']['text']:
      tmp = clean_text(ans)
      if is_valid_answer(tmp) == True:
        q_a['answers'].append(tmp)
        if len(q_a['answers']) == 2:
          break

    if len(q_a['answers']) > 0:
      list_dict.append(q_a)

  if n_samples is not None:
    shuffle_indices = random.sample(range(len(list_dict)), n_samples)
    list_dict = np.array(list_dict)
    list_dict = list_dict[shuffle_indices]

  return list_dict

In [29]:
eli5_q_a_training_dict = create_eli5_q_a_dict(eli5['train_eli5'], n_samples=200000)
eli5_q_a_valid_dict = create_eli5_q_a_dict(eli5['validation_eli5'])

100%|██████████| 272634/272634 [01:12<00:00, 3784.29it/s]
100%|██████████| 9812/9812 [00:02<00:00, 3801.11it/s]


#### Handle FQA Covid 19 dataset

In [33]:
import pandas as pd
covid19_df1 = pd.read_csv("/content/drive/MyDrive/FAQ_Bank.csv")
covid19_df1.dropna(inplace=True)

covid19_questions_data1 = covid19_df1[covid19_df1['language'] == 'en']['question'].apply(lambda x: clean_text(x))
covid19_answers_data1 = covid19_df1[covid19_df1['language'] == 'en']['answer'].apply(lambda x: clean_text(x))
covid19_questions_data1 = covid19_questions_data1.values
covid19_answers_data1 = covid19_answers_data1.values

covid19_df2 = pd.read_csv("/content/drive/MyDrive/faq_covidbert.csv")

covid19_questions_data2 = covid19_df2[covid19_df2['lang']=='en']['question'].apply(lambda x: clean_text(x))
covid19_answers_data2 = covid19_df2[covid19_df2['lang']=='en']['answer'].apply(lambda x: clean_text(x))
covid19_questions_data2 = covid19_questions_data2.values
covid19_answers_data2 = covid19_answers_data2.values

covid19_questions = np.concatenate([covid19_questions_data1, covid19_questions_data2])
covid19_answers = np.concatenate([covid19_answers_data1, covid19_answers_data2])

In [47]:
n_samples = len(covid19_questions)
shuffle_indices = random.sample(range(n_samples), n_samples)

covid19_questions = covid19_questions[shuffle_indices]
covid19_answers = covid19_answers[shuffle_indices]

covid19_train_questions = covid19_questions[:8000]
covid19_train_anwers = covid19_answers[:8000]

covid19_vali_questions = covid19_questions[8000:]
covid19_valid_answes = covid19_answers[8000:]

In [34]:
def create_covid19_q_a_dict(covid19_questions, covid19_answers):
  list_dict = []

  for idx in range(len(covid19_questions)):
    q_a_dict = {}
    q_a_dict['question'] = covid19_questions[idx]
    q_a_dict['answers'] = [covid19_answers[idx]]

    list_dict.append(q_a_dict)
  
  return np.array(list_dict)

In [51]:
covid19_train_q_a_dict = create_covid19_q_a_dict(covid19_train_questions, covid19_train_anwers)
covid19_valid_q_a_dict = create_covid19_q_a_dict(covid19_vali_questions, covid19_valid_answes)
print('Number of questions in train set:', len(covid19_train_q_a_dict))
print('Number of questions in valid set:', len(covid19_valid_q_a_dict))

Number of questions in train set: 8000
Number of questions in valid set: 1362


#### Combine two datasets

In [54]:
q_a_training_dict = np.concatenate([eli5_q_a_training_dict, covid19_train_q_a_dict])
q_a_valid_dict = np.concatenate([eli5_q_a_valid_dict, covid19_valid_q_a_dict])

# after concat, shuffle data:

n_samples = len(q_a_training_dict)
shuffle_indices = random.sample(range(n_samples), n_samples)
q_a_training_dict = q_a_training_dict[shuffle_indices]

n_samples = len(q_a_valid_dict)
shuffle_indices = random.sample(range(n_samples), n_samples)
q_a_valid_dict = q_a_valid_dict[shuffle_indices]

print('Length of training dict', len(q_a_training_dict))
print('Length of valid dict', len(q_a_valid_dict))

Length of training dict 208000
Length of valid dict 10934


In [61]:
def create_all_training_q_a_list(data): 
  all_questions = []
  all_answers = []
  answer_question_mapping_index = []

  for idx in tqdm(range(len(data))):
    item = data[idx]
    
    all_questions.append(item['question'])  
    for ans in item['answers']:
      all_answers.append(ans)
      answer_question_mapping_index.append(idx)

  return all_questions, all_answers, answer_question_mapping_index

def create_pairs_dataset(data):
  questions = []
  answers = []

  # 1: positive, 0: negative
  labels = []

  for idx in tqdm(range(len(data))):
    item = data[idx]
    for ans in item['answers']:
      questions.append(item['question'])
      answers.append(ans)
      labels.append(1)
    
    neg_idx = np.random.randint(0, len(data),1)[0]
    if neg_idx != idx:
      questions.append(item['question'])
      answers.append(data[neg_idx]['answers'][0])
      labels.append(0)

  return questions, answers, labels

In [63]:
all_questions, all_answers, answer_question_mapping_index = create_all_training_q_a_list(q_a_training_dict)
train_questions, train_answers, train_labels = create_pairs_dataset(q_a_training_dict)
valid_questions, valid_answers, valid_labels = create_pairs_dataset(q_a_valid_dict)

100%|██████████| 208000/208000 [00:00<00:00, 563010.18it/s]
100%|██████████| 208000/208000 [00:07<00:00, 26020.75it/s]
100%|██████████| 10934/10934 [00:00<00:00, 33447.20it/s]


In [70]:
print('Number of all questions:',len(all_questions))
print('Number of all answes:',len(all_answers))
print('Number of training questions:', len(train_questions))
print('Number of training answers:', len(train_answers))

Number of all questions: 208000
Number of all answes: 331119
Number of training questions: 539119
Number of training answers: 539119


In [71]:
train_labels = np.array(train_labels)
valid_labels = np.array(valid_labels)

In [72]:
transformer_layer = transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased')
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_layer_norm', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [73]:
def bert_encode(texts, tokenizer, max_len=512):
  all_tokens = []

  for idx in tqdm(range(len(texts))):
    text = texts[idx]

    text = ' '.join([x for x in text.split()[:100]])
    text = tokenizer.tokenize(text)
    text = text[:max_len-2]
    input_sequence = ["[CLS]"] + text + ["[SEP]"]

    pad_len = max_len - len(input_sequence)
    tokens = tokenizer.convert_tokens_to_ids(input_sequence)
    tokens += [0]*pad_len
    all_tokens.append(tokens)

  return np.array(all_tokens)

def build_siamese_model(transformer, max_len=512, embedding_dims = 128):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]

    #cls_token = sequence_output[:, 0, :]
    out = tf.reduce_mean(sequence_output, axis=1)
    out = Dense(128, activation = 'relu')(out)
    
    model = Model(inputs=input_word_ids, outputs=out)    
    return model

In [74]:
train_questions_tokens = bert_encode(train_questions, tokenizer, max_len=100)
train_answers_tokens = bert_encode(train_answers, tokenizer, max_len=100)

valid_questions_tokens = bert_encode(valid_questions, tokenizer, max_len=100)
valid_answers_tokens = bert_encode(valid_answers, tokenizer, max_len=100)

all_questions_tokens = bert_encode(all_questions, tokenizer, max_len=100)
all_answers_tokens = bert_encode(all_answers, tokenizer, max_len=100)

100%|██████████| 539119/539119 [05:03<00:00, 1777.75it/s]
100%|██████████| 539119/539119 [20:26<00:00, 439.48it/s]
100%|██████████| 27845/27845 [00:16<00:00, 1646.29it/s]
100%|██████████| 27845/27845 [01:01<00:00, 449.74it/s]
100%|██████████| 208000/208000 [01:54<00:00, 1821.38it/s]
100%|██████████| 331119/331119 [12:22<00:00, 445.73it/s]


In [75]:
np.save("/content/drive/MyDrive/train_questions_tokens.npy", train_questions_tokens)
np.save("/content/drive/MyDrive/train_answers_tokens.npy", train_answers_tokens)
np.save("/content/drive/MyDrive/valid_questions_tokens.npy", valid_questions_tokens)
np.save("/content/drive/MyDrive/valid_answers_tokens.npy", valid_answers_tokens)
np.save("/content/drive/MyDrive/all_questions_tokens.npy", all_questions_tokens)
np.save("/content/drive/MyDrive/all_answers_tokens.npy", all_answers_tokens)
np.save("/content/drive/MyDrive/train_labels.npy", train_labels)
np.save("/content/drive/MyDrive/valid_labels.npy", valid_labels)
np.save("/content/drive/MyDrive/answer_question_mapping_index.npy", answer_question_mapping_index)

In [None]:
train_questions_tokens = np.load("/content/drive/MyDrive/train_questions_tokens.npy")
train_answers_tokens = np.load("/content/drive/MyDrive/train_answers_tokens.npy")
valid_questions_tokens = np.load("/content/drive/MyDrive/valid_questions_tokens.npy")
valid_answers_tokens = np.load("/content/drive/MyDrive/valid_answers_tokens.npy")
all_questions_tokens = np.load("/content/drive/MyDrive/all_questions_tokens.npy")
all_answers_tokens = np.load("/content/drive/MyDrive/all_answers_tokens.npy")

In [None]:
shuffle_indices = random.sample(range(1000), 1000)
train_answers_tokens = train_answers_tokens[shuffle_indices]
train_questions_tokens = train_questions_tokens[shuffle_indices]
labels = labels[shuffle_indices] 
answer_question_mapping_index = np.array(answer_question_mapping_index[:1000])

In [None]:
class HardNegativeMiningCallback(keras.callbacks.Callback):
  def __init__(self, all_question_tokens, all_answer_tokens, answer_question_mapping_index):
    self.all_question_tokens = all_question_tokens
    self.all_answer_tokens = all_answer_tokens
    self.answer_question_mapping_index = np.array(answer_question_mapping_index)

  def on_epoch_end(self, epoch, logs=None):
    if epoch == 1:
      question_embeddings = self.model.layers[2].predict(self.all_question_tokens)
      answer_embeddings = self.model.layers[2].predict(self.all_answer_tokens)

      dist_matrix = tf.reduce_sum((tf.expand_dims(answer_embeddings[:100], 1)-tf.expand_dims(question_embeddings, 0))**2,2)

      for i in tqdm(range(100, answer_embeddings.shape[0], 100)):
        tmp = tf.reduce_sum((tf.expand_dims(answer_embeddings[i:i+100], 1)-tf.expand_dims(question_embeddings, 0))**2,2)
        dist_matrix = tf.concat([dist_matrix, tmp], axis=0)

      min_dist_indices = keras.backend.eval(tf.argmin(dist_matrix, axis=1))

      hard_negative_indices = (min_dist_indices != self.answer_question_mapping_index).nonzero()[0]
      hard_negative_pairs = [[i, min_dist_indices[i]] for i in hard_negative_indices]
      hard_negative_pairs = np.array(hard_negative_pairs)

      print('\nFound: {} hard negative pairs'.format(len(hard_negative_pairs)))

      hard_negative_question_tokens = self.all_question_tokens[hard_negative_pairs[:,1]]
      hard_negative_answer_tokens = self.all_answer_tokens[hard_negative_pairs[:,0]]

      global train_questions_tokens
      global train_answers_tokens
      global labels
      train_questions_tokens = np.concatenate([train_questions_tokens, hard_negative_question_tokens], axis=0)
      train_answers_tokens = np.concatenate([train_answers_tokens, hard_negative_answer_tokens], axis=0)
      labels = np.concatenate([labels, np.zeros(shape=(len(hard_negative_pairs)))])

      shuffle_indices = random.sample(range(len(labels)), len(labels))
      train_answers_tokens = train_answers_tokens[shuffle_indices]
      train_questions_tokens = train_questions_tokens[shuffle_indices]
      labels = labels[shuffle_indices] 

      print('Train on {} samples'.format(len(train_questions_tokens)))

class DistilBertRetriever():
  def __init__(self):
    pass
  
  def euclidean_distance(self, vectors):
    featsA, featsB = vectors
    # compute the sum of squared distances between the vectors
    sumSquared = K.sum(K.square(featsA - featsB), axis=1,keepdims=True)
    # return the euclidean distance between the vectors
    return K.sqrt(K.maximum(sumSquared, K.epsilon()))

  def contrastive_loss(self, y, preds, margin=1):
    # explicitly cast the true class label data type to the predicted
    # class label data type (otherwise we run the risk of having two
    # separate data types, causing TensorFlow to error out)
    y = tf.cast(y, preds.dtype)
    # calculate the contrastive loss between the true labels and
    # the predicted labels
    squaredPreds = K.square(preds)
    squaredMargin = K.square(K.maximum(margin - preds, 0))
    loss = K.mean(y * squaredPreds + (1 - y) * squaredMargin)
    # return the computed contrastive loss to the calling function
    return loss

  def build(self):
    A = Input(shape=100)
    B = Input(shape=100)
    featureExtractor = build_siamese_model(transformer_layer, max_len=100)
    featsA = featureExtractor(A)
    featsB = featureExtractor(B)
    # finally, construct the siamese network
    distance = Lambda(self.euclidean_distance)([featsA, featsB])
    model = Model(inputs=[A, B], outputs=distance)
    model.compile(loss=self.contrastive_loss, optimizer=Adam(learning_rate=0.0001))
    return model

### Using cls token

In [None]:
retriever = DistilBertRetriever().build()

mycallback = HardNegativeMiningCallback(all_questions_tokens, all_answers_tokens, 
                                        answer_question_mapping_index[:1000])

for i in range(5):
  retriever.fit([train_questions_tokens, train_answers_tokens], train_labels,
            validation_data=([valid_questions_tokens,valid_answers_tokens], valid_labels[:1000]),
            batch_size=32,
            epochs=1,
            verbose=1,
            callbacks = [mycallback],
            shuffle = False)

### Using mean of sequence

In [None]:
retriever = DistilBertRetriever().build()

mycallback = HardNegativeMiningCallback(all_questions_tokens, all_answers_tokens, 
                                        answer_question_mapping_index[:1000])

for i in range(5):
  retriever.fit([train_questions_tokens, train_answers_tokens], labels,
            validation_data=([valid_questions_tokens,valid_answers_tokens], valid_labels[:1000]),
            batch_size=32,
            epochs=1,
            verbose=1,
            callbacks = [mycallback],
            shuffle = False)



100%|██████████| 9/9 [00:00<00:00, 469.10it/s]



Found: 971 hard negative pairs
Train on 1971 samples


100%|██████████| 9/9 [00:00<00:00, 558.97it/s]



Found: 560 hard negative pairs
Train on 2531 samples


100%|██████████| 9/9 [00:00<00:00, 607.39it/s]



Found: 417 hard negative pairs
Train on 2948 samples


100%|██████████| 9/9 [00:00<00:00, 520.85it/s]



Found: 286 hard negative pairs
Train on 3234 samples


100%|██████████| 9/9 [00:00<00:00, 476.50it/s]


Found: 194 hard negative pairs
Train on 3428 samples





### Using 2 epochs per hard negative

In [None]:
retriever = DistilBertRetriever().build()

mycallback = HardNegativeMiningCallback(all_questions_tokens, all_answers_tokens, 
                                        answer_question_mapping_index[:1000])

for i in range(5):
  retriever.fit([train_questions_tokens, train_answers_tokens], labels,
            validation_data=([valid_questions_tokens,valid_answers_tokens], valid_labels[:1000]),
            batch_size=32,
            epochs=2,
            verbose=1,
            callbacks = [mycallback],
            shuffle = False)

Epoch 1/2
Epoch 2/2


100%|██████████| 9/9 [00:00<00:00, 555.29it/s]



Found: 966 hard negative pairs
Train on 1966 samples
Epoch 1/2
Epoch 2/2


100%|██████████| 9/9 [00:00<00:00, 385.96it/s]



Found: 467 hard negative pairs
Train on 2433 samples
Epoch 1/2
Epoch 2/2


100%|██████████| 9/9 [00:00<00:00, 565.06it/s]



Found: 255 hard negative pairs
Train on 2688 samples
Epoch 1/2
Epoch 2/2


100%|██████████| 9/9 [00:00<00:00, 564.80it/s]



Found: 132 hard negative pairs
Train on 2820 samples
Epoch 1/2
Epoch 2/2


100%|██████████| 9/9 [00:00<00:00, 551.67it/s]


Found: 111 hard negative pairs
Train on 2931 samples





### Combine all data and train

In [None]:
train_questions_tokens = np.concatenate([train_questions_tokens, covid19_train_questions_tokens])
train_answers_tokens = np.concatenate([train_answers_tokens, covid19_train_anwers_tokens])
valid_questions_tokens = np.concatenate([valid_questions_tokens, covid19_valid_questions_tokens])
valid_answers_tokens = np.concatenate([valid_answers_tokens, covid19_valid_answes_tokens])
all_questions_tokens = np.concatenate([all_questions_tokens, covid19_questions_tokens])
all_answers_tokens = np.concatenate([all_answers_tokens, covid19_answers_tokens])

train_labels = np.concatenate([train_labels, covid19_train_labels])
valid_labels = np.concatenate([valid_labels, covid19_valid_labels])

In [None]:
np.save("/content/drive/MyDrive/all_train_questions_tokens.npy", train_questions_tokens)
np.save("/content/drive/MyDrive/all_train_answers_tokens.npy", train_answers_tokens)
np.save("/content/drive/MyDrive/all_valid_questions_tokens.npy", valid_questions_tokens)
np.save("/content/drive/MyDrive/all_valid_answers_tokens.npy", valid_answers_tokens)
np.save("/content/drive/MyDrive/all_all_questions_tokens.npy", all_questions_tokens)
np.save("/content/drive/MyDrive/all_all_answers_tokens.npy", all_answers_tokens)

In [None]:
answer_question_mapping_index = np.array(answer_question_mapping_index)

In [None]:
answer_question_mapping_index = np.concatenate([answer_question_mapping_index, np.arange(200000, 209362)])

In [None]:
n_sample = len(train_answers_tokens)
shuffle_indices = random.sample(range(n_sample), n_sample)
train_answers_tokens = train_answers_tokens[shuffle_indices]
train_questions_tokens = train_questions_tokens[shuffle_indices]
train_labels = train_labels[shuffle_indices]

In [None]:
retriever = DistilBertRetriever().build()

mycallback = HardNegativeMiningCallback(all_questions_tokens, all_answers_tokens, 
                                        answer_question_mapping_index)

for i in range(5):
  retriever.fit([train_questions_tokens, train_answers_tokens], train_labels,
            validation_data=([valid_questions_tokens,valid_answers_tokens], valid_labels),
            batch_size=64,
            epochs=2,
            verbose=1,
            callbacks = [mycallback],
            shuffle = False)

Epoch 1/2
 724/8376 [=>............................] - ETA: 5:01:46 - loss: 0.0834

KeyboardInterrupt: ignored