In [5]:
# Mount Google Drive at /content/drive; later cells read CSV and .npy files from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [1]:
!pip install -qq transformers
!pip install -qq nlp

[K     |████████████████████████████████| 2.9 MB 2.8 MB/s 
[K     |████████████████████████████████| 636 kB 64.4 MB/s 
[K     |████████████████████████████████| 3.3 MB 82.9 MB/s 
[K     |████████████████████████████████| 895 kB 63.0 MB/s 
[K     |████████████████████████████████| 56 kB 6.3 MB/s 
[K     |████████████████████████████████| 1.7 MB 3.7 MB/s 
[K     |████████████████████████████████| 243 kB 55.1 MB/s 
[?25h

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import transformers
from tqdm import tqdm
import tensorflow.keras.backend as K
import tensorflow as tf
from keras.layers import Lambda
import nlp
import keras
import string
import math
import random

# Load the ELI5 dataset through the `nlp` library (downloads it on first use).
eli5 = nlp.load_dataset('eli5')

Downloading:   0%|          | 0.00/17.8k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

Downloading and preparing dataset eli5/LFQA_reddit (download: 6.03 MiB, generated: 1.26 GiB, post-processed: Unknown sizetotal: 1.26 GiB) to /root/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/58e61a99404336f0891b4457a02232489b50131bdca9c1691054aeee2f6f1a6e...


Downloading:   0%|          | 0.00/3.50k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/576M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/21.1M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/286M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.65M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/330M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/18.7M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/36.2M [00:00<?, ?B/s]

Dataset eli5 downloaded and prepared to /root/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/58e61a99404336f0891b4457a02232489b50131bdca9c1691054aeee2f6f1a6e. Subsequent calls will reuse this data.


#### Handle Eli5 dataset

> Observations: some answers are not relevant to their questions, and some questions are too short (one or two words).

> Keep only the questions that have more than two words.
> Keep at most 2 answers (the highest-scored ones) for each question.
> Keep only the answers that have more than 6 words.

> After cleaning and check validation

- Number of all questions: 263186
- Number of all answers: 425285
- Number of training questions: 688469
- Number of training answers: 688469

> Too large to train. It takes 6-7 hours per epoch

> I cut off to 200,000 questions by random sampling

In [None]:
def is_valid_question(text):
  """Return True when `text` is long enough to be kept as a question.

  A text with more than three whitespace-separated words is always accepted.
  A text with exactly three words is accepted only if it contains a '?' or
  its first character (case-insensitive) is one of w, h, i, a, d.
  Anything shorter is rejected.
  """
  word_count = len(text.split())
  if word_count > 3:
    return True
  if word_count != 3:
    return False
  # Three-word texts need a hint that they really are a question.
  return ('?' in text) or (text[0].lower() in ('w', 'h', 'i', 'a', 'd'))

def is_valid_answer(text):
  """Return True when `text` has more than eight whitespace-separated words."""
  return len(text.split()) > 8

def clean_text(text):
  """Collapse `text` to one space-separated line and drop noisy tokens.

  The text is split on any whitespace (spaces, tabs, newlines). A token is
  discarded when it is a substring of `string.punctuation` (e.g. a lone ","
  or "?") or when it contains "URL", "@" or "www". The remaining tokens are
  joined with single spaces.

  Args:
    text: the raw string to clean.

  Returns:
    The cleaned string ("" when nothing survives the filter).
  """
  # str.split() already treats newlines as separators. The previous
  # text.replace("\n", "") removed them first and thereby glued the last word
  # of one line to the first word of the next ("hello\nworld" -> "helloworld").
  kept_tokens = [
      token for token in text.split()
      if token not in string.punctuation
      and "URL" not in token
      and "@" not in token
      and "www" not in token
  ]
  return ' '.join(kept_tokens)

def create_eli5_q_a_dict(data, n_samples = None):
  """Turn an ELI5 split into a collection of question/answers records.

  Each record is a dict with a cleaned 'question' (taken from the item's
  'title') and a list 'answers' holding up to two cleaned answers, in the
  order they appear in item['answers']['text']. Items whose question fails
  `is_valid_question`, or that have no answer passing `is_valid_answer`,
  are left out.

  Args:
    data: indexable dataset split; data[i] must provide 'title' and
      'answers' -> 'text'.
    n_samples: when given, that many records are drawn at random (without
      replacement) via `random.sample`.

  Returns:
    A Python list of records when `n_samples` is None, otherwise a NumPy
    object array containing the sampled records.
  """
  records = []

  for row_idx in tqdm(range(len(data))):
    row = data[row_idx]

    question = clean_text(row['title'])
    if not is_valid_question(question):
      continue

    kept_answers = []
    for raw_answer in row['answers']['text']:
      cleaned = clean_text(raw_answer)
      if not is_valid_answer(cleaned):
        continue
      kept_answers.append(cleaned)
      # Two answers per question is the cap.
      if len(kept_answers) == 2:
        break

    if kept_answers:
      records.append({'question': question, 'answers': kept_answers})

  if n_samples is None:
    return records

  picked = random.sample(range(len(records)), n_samples)
  return np.array(records)[picked]

In [None]:
# Training set: a random sample of 200,000 filtered ELI5 records.
# Validation set: every filtered record of the validation split (no sampling).
eli5_q_a_training_dict = create_eli5_q_a_dict(eli5['train_eli5'], n_samples=200000)
eli5_q_a_valid_dict = create_eli5_q_a_dict(eli5['validation_eli5'])

100%|██████████| 272634/272634 [00:48<00:00, 5571.88it/s]
100%|██████████| 9812/9812 [00:01<00:00, 5766.37it/s]


#### Handle FAQ Covid 19 dataset

In [None]:
import pandas as pd
# First FAQ source: drop rows with any missing value, then keep the English rows.
covid19_df1 = pd.read_csv("/content/drive/MyDrive/FAQ_Bank.csv")
covid19_df1.dropna(inplace=True)

# The same language mask is applied to both columns, so questions and answers stay row-aligned.
covid19_questions_data1 = covid19_df1[covid19_df1['language'] == 'en']['question'].apply(lambda x: clean_text(x))
covid19_answers_data1 = covid19_df1[covid19_df1['language'] == 'en']['answer'].apply(lambda x: clean_text(x))
covid19_questions_data1 = covid19_questions_data1.values
covid19_answers_data1 = covid19_answers_data1.values

# Second FAQ source; its language column is named 'lang' instead of 'language'.
# NOTE(review): unlike covid19_df1 there is no dropna() here, and clean_text calls
# string methods on each cell, so a missing value would raise — confirm the file has none.
covid19_df2 = pd.read_csv("/content/drive/MyDrive/faq_covidbert.csv")

covid19_questions_data2 = covid19_df2[covid19_df2['lang']=='en']['question'].apply(lambda x: clean_text(x))
covid19_answers_data2 = covid19_df2[covid19_df2['lang']=='en']['answer'].apply(lambda x: clean_text(x))
covid19_questions_data2 = covid19_questions_data2.values
covid19_answers_data2 = covid19_answers_data2.values

# Stack both sources into one question array and one answer array.
covid19_questions = np.concatenate([covid19_questions_data1, covid19_questions_data2])
covid19_answers = np.concatenate([covid19_answers_data1, covid19_answers_data2])

# random.sample over the full range yields a random permutation of the indices.
n_samples = len(covid19_questions)
shuffle_indices = random.sample(range(n_samples), n_samples)

# Apply the same permutation to both arrays so each question keeps its answer.
covid19_questions = covid19_questions[shuffle_indices]
covid19_answers = covid19_answers[shuffle_indices]

# First 8000 shuffled pairs are used for training, the remainder for validation.
# The variable names below are misspelled ('anwers', 'vali', 'answes') but a later cell refers to them as written.
covid19_train_questions = covid19_questions[:8000]
covid19_train_anwers = covid19_answers[:8000]

covid19_vali_questions = covid19_questions[8000:]
covid19_valid_answes = covid19_answers[8000:]

In [None]:
def create_covid19_q_a_dict(covid19_questions, covid19_answers):
  """Pair each question with its single answer as a record.

  Args:
    covid19_questions: indexable sequence of question strings.
    covid19_answers: indexable sequence of answers, read at the same
      positions as the questions.

  Returns:
    NumPy array of dicts shaped {'question': q, 'answers': [a]}; the answer
    is wrapped in a one-element list.
  """
  # Positions are driven by the questions, so a shorter answer sequence raises IndexError.
  records = [
      {'question': covid19_questions[pos], 'answers': [covid19_answers[pos]]}
      for pos in range(len(covid19_questions))
  ]
  return np.array(records)

# Convert the COVID train/validation splits into the same record format used for ELI5.
covid19_train_q_a_dict = create_covid19_q_a_dict(covid19_train_questions, covid19_train_anwers)
covid19_valid_q_a_dict = create_covid19_q_a_dict(covid19_vali_questions, covid19_valid_answes)
print('Number of questions in train set:', len(covid19_train_q_a_dict))
print('Number of questions in valid set:', len(covid19_valid_q_a_dict))

Number of questions in train set: 8000
Number of questions in valid set: 1362


#### Combine two datasets

In [None]:
# Merge the ELI5 and COVID records into one training array and one validation array.
q_a_training_dict = np.concatenate([eli5_q_a_training_dict, covid19_train_q_a_dict])
q_a_valid_dict = np.concatenate([eli5_q_a_valid_dict, covid19_valid_q_a_dict])

# after concat, shuffle data:

# A random permutation of all indices mixes the two sources together.
n_samples = len(q_a_training_dict)
shuffle_indices = random.sample(range(n_samples), n_samples)
q_a_training_dict = q_a_training_dict[shuffle_indices]

# Same shuffle for the validation records (n_samples and shuffle_indices are reused).
n_samples = len(q_a_valid_dict)
shuffle_indices = random.sample(range(n_samples), n_samples)
q_a_valid_dict = q_a_valid_dict[shuffle_indices]

print('Length of training dict', len(q_a_training_dict))
print('Length of valid dict', len(q_a_valid_dict))

Length of training dict 208000
Length of valid dict 10934


In [None]:
def create_all_training_q_a_list(data):
  """Flatten question/answers records into parallel lookup lists.

  Args:
    data: indexable collection of dicts with 'question' and 'answers' keys.

  Returns:
    A tuple (all_questions, all_answers, answer_question_mapping_index):
    every question in order, every answer in order, and for each answer the
    position in `data` of the record it came from.
  """
  question_list = []
  answer_list = []
  owner_positions = []

  for pos in tqdm(range(len(data))):
    record = data[pos]
    question_list.append(record['question'])

    for answer in record['answers']:
      answer_list.append(answer)
      # Remember which question this answer belongs to.
      owner_positions.append(pos)

  return question_list, answer_list, owner_positions

def create_pairs_dataset(data):
  """Create labelled (question, answer) pairs for contrastive training.

  For every record, each of its answers forms a positive pair (label 1).
  In addition one record position is drawn at random with
  `np.random.randint`; when it differs from the current position, the
  question is paired with that record's first answer as a negative
  (label 0). When the draw lands on the current record no negative is added.

  Args:
    data: indexable collection of dicts with 'question' and 'answers' keys.

  Returns:
    A tuple (questions, answers, labels) of equally long Python lists.
  """
  pair_questions, pair_answers, pair_labels = [], [], []
  total = len(data)

  for pos in tqdm(range(total)):
    record = data[pos]

    # Positive pairs: the question with each of its own answers.
    for own_answer in record['answers']:
      pair_questions.append(record['question'])
      pair_answers.append(own_answer)
      pair_labels.append(1)

    # Negative pair: the first answer of a randomly drawn other record.
    other_pos = np.random.randint(0, total, 1)[0]
    if other_pos == pos:
      continue
    pair_questions.append(record['question'])
    pair_answers.append(data[other_pos]['answers'][0])
    pair_labels.append(0)

  return pair_questions, pair_answers, pair_labels

In [None]:
# Only the first 500 training records and the first 100 validation records are used in this cell.
# all_* lists feed the hard-negative mining; train_*/valid_* lists are the labelled pairs.
all_questions, all_answers, answer_question_mapping_index = create_all_training_q_a_list(q_a_training_dict[:500])
train_questions, train_answers, train_labels = create_pairs_dataset(q_a_training_dict[:500])
valid_questions, valid_answers, valid_labels = create_pairs_dataset(q_a_valid_dict[:100])

print('Number of all questions:',len(all_questions))
print('Number of all answes:',len(all_answers))
print('Number of training questions:', len(train_questions))
print('Number of training answers:', len(train_answers))

# Labels become NumPy arrays so they can be indexed and concatenated later.
train_labels = np.array(train_labels)
valid_labels = np.array(valid_labels)

100%|██████████| 500/500 [00:00<00:00, 370521.55it/s]
100%|██████████| 500/500 [00:00<00:00, 51492.92it/s]
100%|██████████| 100/100 [00:00<00:00, 56527.01it/s]

Number of all questions: 500
Number of all answes: 824
Number of training questions: 1323
Number of training answers: 1323





In [9]:
# Pretrained DistilBERT encoder and its matching tokenizer; `transformer_layer` is read as a global by DistilBertRetriever.build().
transformer_layer = transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased')
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_transform', 'vocab_projector', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [3]:
def bert_encode(texts, tokenizer, max_len=512):
  """Convert texts into fixed-length rows of token ids.

  Each text is cut to its first 100 whitespace-separated words, tokenized,
  truncated to `max_len - 2` tokens, wrapped in "[CLS]" ... "[SEP]" and
  right-padded with id 0 up to `max_len`.

  Args:
    texts: indexable sequence of strings.
    tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.
    max_len: length of every output row.

  Returns:
    NumPy array with one row of `max_len` ids per text.
  """
  encoded_rows = []

  for pos in tqdm(range(len(texts))):
    # Cap the raw text at 100 words before tokenizing.
    shortened = ' '.join(texts[pos].split()[:100])
    pieces = tokenizer.tokenize(shortened)[:max_len-2]
    sequence = ["[CLS]"] + pieces + ["[SEP]"]

    ids = tokenizer.convert_tokens_to_ids(sequence)
    # Fill the remainder of the row with the padding id 0.
    ids += [0] * (max_len - len(sequence))
    encoded_rows.append(ids)

  return np.array(encoded_rows)

def build_siamese_model(transformer, max_len=512, embedding_dims = 128):
    """Build the shared encoder branch of the siamese network.

    The branch feeds token ids through `transformer`, averages the first
    element of its output over the sequence axis and projects the result with
    a ReLU dense layer.

    Args:
        transformer: callable Keras-compatible transformer; element [0] of its
            output is used (presumably the per-token hidden states — confirm).
        max_len: number of token ids per input row.
        embedding_dims: width of the output embedding. This was previously
            ignored in favour of a hard-coded 128; the default keeps that size.

    Returns:
        A Keras Model mapping (batch, max_len) int32 ids to embeddings of
        width `embedding_dims`.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]

    # Alternative pooling that uses only the first position: sequence_output[:, 0, :]
    # NOTE(review): the mean covers every position and no attention mask is passed,
    # so padded positions contribute as well — confirm this is intended.
    out = tf.reduce_mean(sequence_output, axis=1)
    out = Dense(embedding_dims, activation = 'relu')(out)

    model = Model(inputs=input_word_ids, outputs=out)
    return model

In [4]:
class HardNegativeMiningCallback(keras.callbacks.Callback):
  """Keras callback that extends the training set with hard negative pairs.

  At the end of the configured epoch every question and every answer is
  embedded with the shared encoder (`self.model.layers[2]`). For each answer
  the closest question (squared Euclidean distance) is looked up; when that
  question is not the one the answer belongs to, the (question, answer) pair
  is appended to the global training arrays with label 0.

  Args:
    all_question_tokens: NumPy array of encoded questions, one row each.
    all_answer_tokens: NumPy array of encoded answers, one row each.
    answer_question_mapping_index: for each answer, the row of its own
      question in `all_question_tokens`.
    mining_epoch: index of the epoch (as passed to `on_epoch_end`) after
      which mining runs. Defaults to 1, the previously hard-coded value.
    embed_batch_size: rows embedded per `predict` call (previously 512).
    distance_chunk_size: answers compared against all questions at once
      (previously 5); keeps the intermediate difference tensor small.
  """

  def __init__(self, all_question_tokens, all_answer_tokens, answer_question_mapping_index,
               mining_epoch=1, embed_batch_size=512, distance_chunk_size=5):
    # Let the Keras base class initialise its own attributes first.
    super().__init__()
    self.all_question_tokens = all_question_tokens
    self.all_answer_tokens = all_answer_tokens
    self.answer_question_mapping_index = np.array(answer_question_mapping_index)
    self.mining_epoch = mining_epoch
    self.embed_batch_size = embed_batch_size
    self.distance_chunk_size = distance_chunk_size

  def _embed(self, tokens):
    """Embed `tokens` chunk by chunk and return one NumPy array of embeddings."""
    encoder = self.model.layers[2]
    step = self.embed_batch_size
    # Collect the chunks and join them once instead of re-copying a growing tensor.
    chunks = [encoder.predict(tokens[start:start + step], batch_size=step)
              for start in tqdm(range(0, len(tokens), step))]
    return np.concatenate(chunks, axis=0)

  def _nearest_question_indices(self, question_embeddings, answer_embeddings):
    """Return, for every answer, the row index of its closest question."""
    # Converted once so the question matrix is not rebuilt for every chunk.
    questions = tf.expand_dims(tf.constant(question_embeddings), 0)
    step = self.distance_chunk_size
    nearest = []
    for start in tqdm(range(0, len(answer_embeddings), step)):
      answers = tf.expand_dims(answer_embeddings[start:start + step], 1)
      squared_dist = tf.reduce_sum((answers - questions)**2, 2)
      nearest.append(keras.backend.eval(tf.argmin(squared_dist, axis=1)))
    return np.concatenate(nearest, axis=0)

  def on_epoch_end(self, epoch, logs=None):
    """Mine hard negatives after epoch `self.mining_epoch` and grow the training arrays."""
    # The training arrays live at module level; they are rebound here so that
    # the next fit() call sees them. The fit() call that is currently running
    # keeps the arrays it was started with.
    global train_questions_tokens
    global train_answers_tokens
    global train_labels

    if epoch != self.mining_epoch:
      return

    question_embeddings = self._embed(self.all_question_tokens)
    answer_embeddings = self._embed(self.all_answer_tokens)
    min_dist_indices = self._nearest_question_indices(question_embeddings, answer_embeddings)

    # An answer is a hard negative when its nearest question is not its own question.
    hard_negative_indices = (min_dist_indices != self.answer_question_mapping_index).nonzero()[0]

    print('\nFound: {} hard negative pairs'.format(len(hard_negative_indices)))

    # Nothing to add; the old code failed here when indexing an empty pair array.
    if len(hard_negative_indices) == 0:
      return

    hard_negative_question_tokens = self.all_question_tokens[min_dist_indices[hard_negative_indices]]
    hard_negative_answer_tokens = self.all_answer_tokens[hard_negative_indices]

    train_questions_tokens = np.concatenate([train_questions_tokens, hard_negative_question_tokens], axis=0)
    train_answers_tokens = np.concatenate([train_answers_tokens, hard_negative_answer_tokens], axis=0)
    train_labels = np.concatenate([train_labels, np.zeros(shape=(len(hard_negative_indices)))])

    print('Train on {} samples'.format(len(train_questions_tokens)))

class DistilBertRetriever():
  """Factory for the siamese question/answer retriever.

  `build()` returns a compiled Keras model that takes two batches of token
  ids, embeds both with one shared encoder and outputs the distance between
  the two embeddings. It is trained with a contrastive loss.
  """

  def __init__(self):
    pass

  def euclidean_distance(self, vectors):
    """Return the Euclidean distance between two embedding batches, shape (batch, 1)."""
    featsA, featsB = vectors
    # compute the sum of squared distances between the vectors
    sumSquared = K.sum(K.square(featsA - featsB), axis=1,keepdims=True)
    # clamp to epsilon before the square root to avoid sqrt(0)
    return K.sqrt(K.maximum(sumSquared, K.epsilon()))

  def cosine_distance(self, vectors):
    """Return the cosine distance (1 - cosine similarity) per row, shape (batch,).

    The earlier version computed `1 - abs(tf.losses.cosine_similarity(...))`.
    That function returns the negated similarity, and taking its absolute
    value made vectors pointing in opposite directions come out at distance 0.
    For non-negative embeddings the result is unchanged.
    """
    x, y = vectors
    x = K.l2_normalize(x, axis=1)
    y = K.l2_normalize(y, axis=1)
    # Dot product of unit vectors is the cosine similarity.
    return 1 - K.sum(x * y, axis=1)

  def contrastive_loss(self, y, preds, margin=1):
    """Contrastive loss over predicted distances.

    Label 1 penalises the squared distance; label 0 penalises the squared
    amount by which the distance falls short of `margin`.
    """
    # explicitly cast the true class label data type to the predicted
    # class label data type (otherwise we run the risk of having two
    # separate data types, causing TensorFlow to error out)
    y = tf.cast(y, preds.dtype)
    # calculate the contrastive loss between the true labels and
    # the predicted labels
    squaredPreds = K.square(preds)
    squaredMargin = K.square(K.maximum(margin - preds, 0))
    loss = K.mean(y * squaredPreds + (1 - y) * squaredMargin)
    # return the computed contrastive loss to the calling function
    return loss

  def build(self, max_len=100):
    """Create and compile the siamese model.

    Args:
      max_len: number of token ids per input row; defaults to the
        previously hard-coded 100.

    Returns:
      A compiled Keras Model with inputs [A, B] and the Euclidean distance
      between their embeddings as output. Uses the module-level
      `transformer_layer` as encoder backbone.
    """
    # Token ids are integers, so declare the inputs as int32 like the encoder's own input.
    A = Input(shape=(max_len,), dtype=tf.int32)
    B = Input(shape=(max_len,), dtype=tf.int32)
    # One encoder instance is applied to both inputs, so the weights are shared.
    featureExtractor = build_siamese_model(transformer_layer, max_len=max_len)
    featsA = featureExtractor(A)
    featsB = featureExtractor(B)
    # finally, construct the siamese network
    distance = Lambda(self.euclidean_distance)([featsA, featsB])
    model = Model(inputs=[A, B], outputs=distance)
    model.compile(loss=self.contrastive_loss, optimizer=Adam(learning_rate=0.00003))
    return model

In [None]:
# Encode every text list to rows of 100 token ids (the length DistilBertRetriever.build() uses).
train_questions_tokens = bert_encode(train_questions, tokenizer, max_len=100)
train_answers_tokens = bert_encode(train_answers, tokenizer, max_len=100)

valid_questions_tokens = bert_encode(valid_questions, tokenizer, max_len=100)
valid_answers_tokens = bert_encode(valid_answers, tokenizer, max_len=100)

# Full question/answer pools used by the hard-negative mining callback.
all_questions_tokens = bert_encode(all_questions, tokenizer, max_len=100)
all_answers_tokens = bert_encode(all_answers, tokenizer, max_len=100)

100%|██████████| 1323/1323 [00:00<00:00, 2577.76it/s]
100%|██████████| 1323/1323 [00:02<00:00, 555.32it/s]
100%|██████████| 256/256 [00:00<00:00, 1833.06it/s]
100%|██████████| 256/256 [00:00<00:00, 583.82it/s]
100%|██████████| 500/500 [00:00<00:00, 2539.09it/s]
100%|██████████| 824/824 [00:01<00:00, 566.03it/s]


## Load tokens and train

In [6]:
# Load arrays saved on Drive. These assignments replace any same-named
# variables created by the encoding cells earlier in the notebook.
# NOTE(review): the cell that wrote these .npy files is not part of this notebook — confirm how they were produced.
train_questions_tokens = np.load("/content/drive/MyDrive/train_questions_tokens.npy")
train_answers_tokens = np.load("/content/drive/MyDrive/train_answers_tokens.npy")
valid_questions_tokens = np.load("/content/drive/MyDrive/valid_questions_tokens.npy")
valid_answers_tokens = np.load("/content/drive/MyDrive/valid_answers_tokens.npy")
all_questions_tokens = np.load("/content/drive/MyDrive/all_questions_tokens.npy")
all_answers_tokens = np.load("/content/drive/MyDrive/all_answers_tokens.npy")
train_labels = np.load("/content/drive/MyDrive/train_labels.npy")
valid_labels = np.load("/content/drive/MyDrive/valid_labels.npy")
answer_question_mapping_index = np.load("/content/drive/MyDrive/answer_question_mapping_index.npy")

In [None]:
# Shuffle the training pairs. One permutation is applied to all three arrays
# so every question row keeps its answer row and label.
n_samples = len(train_questions_tokens)
shuffle_indices = random.sample(range(n_samples), n_samples)
train_answers_tokens = train_answers_tokens[shuffle_indices]
train_questions_tokens = train_questions_tokens[shuffle_indices]
train_labels = train_labels[shuffle_indices] 

### Using cls token

In [13]:
# Quick check of how many encoded answers are in the mining pool.
len(all_answers_tokens)

331119

In [10]:
# Build and compile the siamese retriever. The following training cell builds it again before fitting.
retriever = DistilBertRetriever().build()

In [None]:
# Fresh model for this experiment.
# NOTE(review): this cell is identical to the other training cells apart from the loop/epoch/shuffle
# settings; the variant named in the heading is presumably selected by editing the pooling line in
# build_siamese_model before running — confirm.
retriever = DistilBertRetriever().build()

mycallback = HardNegativeMiningCallback(all_questions_tokens, 
                                        all_answers_tokens, 
                                        answer_question_mapping_index)

# Each fit() call runs 2 epochs. The callback mines hard negatives at the end of epoch index 1 and
# rebinds the global training arrays, which the next loop iteration then passes to fit().
for i in range(5):
  retriever.fit([train_questions_tokens, train_answers_tokens], train_labels,
            validation_data=([valid_questions_tokens,valid_answers_tokens], valid_labels),
            batch_size=32,
            epochs=2,
            verbose=1,
            callbacks = [mycallback],
            shuffle = True)

Epoch 1/2
  688/16848 [>.............................] - ETA: 1:36:46 - loss: 0.1305

### Using mean of sequence

In [None]:
# Fresh model for the mean-pooling experiment.
retriever = DistilBertRetriever().build()

mycallback = HardNegativeMiningCallback(all_questions_tokens, all_answers_tokens, 
                                        answer_question_mapping_index)

# NOTE(review): with epochs=1 the only epoch index passed to the callback is 0, while the callback
# defined above mines on epoch index 1. The saved output below shows mining, so it was presumably
# produced with a different version of the callback — confirm before re-running.
for i in range(3):
  retriever.fit([train_questions_tokens, train_answers_tokens], train_labels,
            validation_data=([valid_questions_tokens,valid_answers_tokens], valid_labels),
            batch_size=32,
            epochs=1,
            verbose=1,
            callbacks = [mycallback],
            shuffle = True)



100%|██████████| 3/3 [00:03<00:00,  1.21s/it]
100%|██████████| 6/6 [00:04<00:00,  1.37it/s]
100%|██████████| 633/633 [00:00<00:00, 3197.55it/s]



Found: 2445 hard negative pairs
Train on 7613 samples


100%|██████████| 3/3 [00:02<00:00,  1.22it/s]
100%|██████████| 6/6 [00:04<00:00,  1.37it/s]
100%|██████████| 633/633 [00:00<00:00, 3264.72it/s]



Found: 1451 hard negative pairs
Train on 9064 samples


100%|██████████| 3/3 [00:02<00:00,  1.22it/s]
100%|██████████| 6/6 [00:04<00:00,  1.37it/s]
100%|██████████| 633/633 [00:00<00:00, 3231.66it/s]



Found: 994 hard negative pairs
Train on 10058 samples


### Using 2 epochs per hard negative

In [None]:
# Fresh model; three rounds of 2 epochs, with hard negatives mined at the end of each round.
retriever = DistilBertRetriever().build()

mycallback = HardNegativeMiningCallback(all_questions_tokens, all_answers_tokens, 
                                        answer_question_mapping_index)

# shuffle = False: Keras does not reshuffle the samples between epochs here. The callback appends
# mined pairs to the end of the training arrays, so they stay at the end of each pass.
for i in range(3):
  retriever.fit([train_questions_tokens, train_answers_tokens], train_labels,
            validation_data=([valid_questions_tokens,valid_answers_tokens], valid_labels),
            batch_size=32,
            epochs=2,
            verbose=1,
            callbacks = [mycallback],
            shuffle = False)

Epoch 1/2
Epoch 2/2


100%|██████████| 3/3 [00:03<00:00,  1.19s/it]
100%|██████████| 6/6 [00:04<00:00,  1.37it/s]
100%|██████████| 633/633 [00:00<00:00, 3328.97it/s]



Found: 2038 hard negative pairs
Train on 7203 samples
Epoch 1/2
Epoch 2/2


100%|██████████| 3/3 [00:02<00:00,  1.22it/s]
100%|██████████| 6/6 [00:04<00:00,  1.37it/s]
100%|██████████| 633/633 [00:00<00:00, 3227.81it/s]



Found: 3166 hard negative pairs
Train on 10369 samples
Epoch 1/2
Epoch 2/2


100%|██████████| 3/3 [00:02<00:00,  1.22it/s]
100%|██████████| 6/6 [00:04<00:00,  1.37it/s]
100%|██████████| 633/633 [00:00<00:00, 3287.23it/s]



Found: 3164 hard negative pairs
Train on 13533 samples


### Using cosine distance

In [None]:
# NOTE(review): the heading names cosine distance, but DistilBertRetriever.build() as defined above
# wires in euclidean_distance. The switch was presumably made by editing build() before this run — confirm.
retriever = DistilBertRetriever().build()

mycallback = HardNegativeMiningCallback(all_questions_tokens, all_answers_tokens, 
                                        answer_question_mapping_index)

# Three rounds of 2 epochs; the callback grows the global training arrays between rounds.
for i in range(3):
  retriever.fit([train_questions_tokens, train_answers_tokens], train_labels,
            validation_data=([valid_questions_tokens,valid_answers_tokens], valid_labels),
            batch_size=32,
            epochs=2,
            verbose=1,
            callbacks = [mycallback],
            shuffle = True)

Epoch 1/2
Epoch 2/2


100%|██████████| 3/3 [00:03<00:00,  1.21s/it]
100%|██████████| 6/6 [00:04<00:00,  1.37it/s]
100%|██████████| 633/633 [00:00<00:00, 3045.49it/s]



Found: 3168 hard negative pairs
Train on 8336 samples
Epoch 1/2


KeyboardInterrupt: ignored

In [None]:
# Embed the first 512 questions/answers with the shared encoder (layer index 2 of the siamese model).
question_embeddings = retriever.layers[2].predict(all_questions_tokens[:512], batch_size = 512)
answer_embeddings = retriever.layers[2].predict(all_answers_tokens[:512], batch_size = 512)

# Embed the remaining questions in chunks of 512 and append each chunk to the running tensor.
for i in tqdm(range(512, len(all_questions_tokens), 512)):
  batch_question_embeddings = retriever.layers[2].predict(all_questions_tokens[i:i+512], batch_size = 512)
  question_embeddings = tf.concat([question_embeddings, batch_question_embeddings], axis=0)

# Same for the answers.
for i in tqdm(range(512, len(all_answers_tokens), 512)):
  batch_answer_embeddings = retriever.layers[2].predict(all_answers_tokens[i:i+512], batch_size = 512)
  answer_embeddings = tf.concat([answer_embeddings, batch_answer_embeddings], axis=0)

# Squared Euclidean distance between the first 5 answers and every question.
dist_matrix = tf.reduce_sum((tf.expand_dims(answer_embeddings[:5], 1)-tf.expand_dims(question_embeddings, 0))**2,2)

# NOTE(review): this loop keeps every row, so dist_matrix grows to (number of answers) x (number of
# questions) values. The saved output ends in ResourceExhaustedError; if only the nearest question per
# answer is needed, keep the per-chunk argmin instead of the full matrix (as the callback does).
for i in tqdm(range(5, answer_embeddings.shape[0], 5)):
  tmp = tf.reduce_sum((tf.expand_dims(answer_embeddings[i:i+5], 1)-tf.expand_dims(question_embeddings, 0))**2,2)
  dist_matrix = tf.concat([dist_matrix, tmp], axis=0)


100%|██████████| 406/406 [05:38<00:00,  1.20it/s]
100%|██████████| 646/646 [08:59<00:00,  1.20it/s]
  2%|▏         | 1032/66223 [00:23<25:01, 43.40it/s]


ResourceExhaustedError: ignored