<a href="https://colab.research.google.com/github/dinhngoc267/NSEEN/blob/master/BioSyn_TAC2017_Evaluate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive; the dataset and checkpoint paths used later in this
# notebook all live under '/content/drive'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install tf-models-official pinned to 2.4.0; the `official.nlp.bert.tokenization`
# import in the next cell relies on this package being present.
!pip install -q tf-models-official==2.4.0

[K     |████████████████████████████████| 1.1MB 5.6MB/s 
[K     |████████████████████████████████| 174kB 20.1MB/s 
[K     |████████████████████████████████| 37.6MB 107kB/s 
[K     |████████████████████████████████| 358kB 36.7MB/s 
[K     |████████████████████████████████| 645kB 36.7MB/s 
[K     |████████████████████████████████| 706kB 25.8MB/s 
[K     |████████████████████████████████| 102kB 8.8MB/s 
[K     |████████████████████████████████| 51kB 4.3MB/s 
[K     |████████████████████████████████| 1.2MB 38.0MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [None]:
import keras

import os 

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

import tensorflow as tf 
import tensorflow_hub as hub 
import tensorflow_datasets as tfds 

import official.nlp.bert.tokenization as tokenization
import glob
from tqdm import tqdm

import pickle 
from sklearn.feature_extraction.text import TfidfVectorizer 

from keras.layers import Layer
import keras.backend as K
import copy

import xml.etree.ElementTree as elemTree
  
import re
from string import punctuation

In [None]:
# TF Hub handle of the uncased English BERT encoder (L-12, H-768, A-12).
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
# Wrapped as a trainable Keras layer; this single object is shared by the
# re-ranker model and by the retrieval helpers defined below.
bert_layer = hub.KerasLayer(module_url, trainable=True,name='bert_layer')

In [None]:
# Build the WordPiece tokenizer from the vocabulary and casing flag that are
# packaged with the hub module, so tokenization matches the loaded BERT weights.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

def bert_encode(texts, tokenizer, max_len=25):
    """Turn raw strings into fixed-length BERT inputs.

    Each text is tokenized, truncated to ``max_len - 2`` word pieces, wrapped
    in ``[CLS]`` / ``[SEP]`` and right-padded with zeros up to ``max_len``.

    Parameters
    ----------
    texts: iterable of str
    tokenizer: object exposing ``tokenize(text)`` and
      ``convert_tokens_to_ids(tokens)``, both returning lists
    max_len: int
      Total sequence length including the two special tokens.

    Returns
    -------
    (token_ids, attention_masks, segment_ids): tuple of np.ndarray, one row
    per input text. Segment ids are all zero (single-sentence input).
    """
    token_rows, mask_rows, segment_rows = [], [], []

    for raw_text in texts:
        # Reserve two positions for the [CLS] and [SEP] markers.
        word_pieces = tokenizer.tokenize(raw_text)[:max_len - 2]
        sequence = ["[CLS]"] + word_pieces + ["[SEP]"]
        n_real = len(sequence)
        n_pad = max_len - n_real

        token_rows.append(tokenizer.convert_tokens_to_ids(sequence) + [0] * n_pad)
        # 1 marks a real token, 0 marks padding.
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

class DenseEncoder():
  """Keras model that maps BERT inputs to the embedding at sequence position 0.

  `bert_encode` places the [CLS] token at position 0, so that is the vector
  this encoder returns for each input row.
  """

  def __init__(self, bert_layer, max_len=25):
    """
    Parameters
    ----------
    bert_layer: callable Keras layer returning (pooled_output, sequence_output)
    max_len: int
      Fixed token length of every input sequence.
    """
    def _int_input(input_name):
      # All three BERT inputs share the same shape and dtype.
      return tf.keras.Input(shape=(max_len,), dtype=tf.int32, name=input_name)

    input_word_ids = _int_input("input_word_ids")
    input_mask = _int_input("input_mask")
    segment_ids = _int_input("segment_ids")

    # Only the per-token output is needed; the pooled output is discarded.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    first_token_embedding = sequence_output[:, 0, :]

    self.model = tf.keras.models.Model(
        inputs=[input_word_ids, input_mask, segment_ids],
        outputs=first_token_embedding)

  def get_model(self):
    """Return the underlying Keras model."""
    return self.model

  def get_dense_embedding(self,input):
    """Run the model on `input` (the tuple produced by `bert_encode`)."""
    return self.model.predict([input])

class SparseEncoder(object):
  """TF-IDF encoder over character uni- and bi-grams."""

  def __init__(self):
    self.encoder = TfidfVectorizer(ngram_range=(1,2), analyzer='char')

  def fit(self, train_corpus):
    """Fit the vectorizer on `train_corpus` and return this encoder."""
    self.encoder.fit(train_corpus)
    return self

  def transform(self, mentions):
    """Encode `mentions` and return the result densified via `.toarray()`."""
    sparse_matrix = self.encoder.transform(mentions)
    return sparse_matrix.toarray()

  def __call__(self, mentions):
    """Alias for `transform`."""
    return self.transform(mentions)


class QueryDataset():
  """Mentions and their gold concept IDs read from `*.concept` files."""

  def __init__(self, data_dir):
    """
    Parameters
    ----------
    data_dir: str
      Directory containing the `*.concept` annotation files.
    """
    self.data = self.load_data(data_dir = data_dir)

  def load_data(self, data_dir):
    """
    Read every `*.concept` file in `data_dir`.

    Each non-blank line is split on "||"; field 3 is taken as the mention and
    field 4 as the concept ID (both stripped of surrounding whitespace).

    Parameters
    ----------
    data_dir: a path of data

    Returns
    -------
    data: np.array of shape (n_mentions, 2) holding (mention, CUI) rows.
      An empty directory yields shape (0, 2) so `data[:, 0]` stays valid.

    Raises
    ------
    IndexError: if a non-blank line has fewer than five "||"-separated fields.
    """
    data = []
    # glob returns files in arbitrary order; sort so the row order (and thus
    # any per-query output) is reproducible across runs and machines.
    concept_files = sorted(glob.glob(os.path.join(data_dir,"*.concept")))
    for concept_file in tqdm(concept_files):
      with open(concept_file, "r", encoding="utf-8") as f:
        for concept in f:
          # Skip blank lines (e.g. a trailing newline), as DictionaryDataset does.
          if not concept.strip():
            continue
          fields = concept.split("||")
          mention = fields[3].strip()
          cui = fields[4].strip()
          data.append((mention, cui))

    return np.array(data, dtype=str).reshape(-1, 2)

class DictionaryDataset():
  """
  Concept dictionary loaded from a "cui||name" text file.
  """
  def __init__(self, dictionary_path):
    """
    Parameters
    ----------
    dictionary_path: str
      The path of the dictionary
    """
    self.data = self.load_data(dictionary_path)

  def load_data(self, dictionary_path):
    """
    Parse the dictionary file into an array of (name, cui) rows.

    Blank lines are ignored. Every other line must split on "||" into exactly
    two parts, otherwise the tuple unpacking raises ValueError.
    """
    with open(dictionary_path, mode='r', encoding='utf-8') as f:
      raw_lines = f.readlines()

    pairs = []
    for raw_line in tqdm(raw_lines):
      entry = raw_line.strip()
      if entry == "":
        continue
      cui, name = entry.split("||")
      # Stored name-first: column 0 is the surface form, column 1 the ID.
      pairs.append((name, cui))

    return np.array(pairs)

class Scalar(Layer):
  """Layer that multiplies its input by a single learnable scalar `W`.

  `build_model` uses it (under the name 'sparse_weight') to weight the sparse
  score before it is added to the dense score.
  """
  def __init__(self,name=None):
    super(Scalar, self).__init__(name=name)
  def build(self,input_shape) :
    # The scalar starts at 0, so the scaled input contributes nothing initially.
    self.W = K.variable(0)
    # Registered by hand as the layer's only trainable weight.
    self._trainable_weights=[self.W]
    super().build(input_shape)
  def call(self,inputs):
    # Element-wise scaling of the inputs by the scalar.
    return self.W*inputs

class MedDRAPreprocess():
  """
  Build an "id||name|name|..." dictionary file from the MedDRA hlgt, hlt, pt
  and llt files.
  """
  def __init__(self, hlgt_path, hlt_path, pt_path, llt_path):
      """
      Parameters
      ----------
      hlgt_path, hlt_path, pt_path, llt_path: str
        Paths of the four "$"-separated MedDRA files.
      """
      self.hlgt_path = hlgt_path
      self.hlt_path = hlt_path
      self.pt_path =pt_path
      self.llt_path = llt_path

  @staticmethod
  def _merge_file(dictionary, path, id_field, dedupe):
      """Add the names of one "$"-separated file to `dictionary` in place.

      Parameters
      ----------
      dictionary: dict mapping id -> "|"-joined names (updated in place)
      path: str, file to read
      id_field: int, index of the field used as the dictionary key
      dedupe: bool, drop repeated names for an id when True
      """
      with open(path, "r") as f:
          lines = f.readlines()
      for line in tqdm(lines):
          # A blank line has no "$" fields and would raise IndexError below.
          if not line.strip():
              continue
          fields = line.split("$")
          _id = fields[id_field]
          _name = fields[1]
          if _id not in dictionary:
              dictionary[_id] = _name
          elif dedupe:
              names = dictionary[_id].split("|")
              names.append(_name)
              # dict.fromkeys removes duplicates while keeping first-seen
              # order; a plain set() made the output order vary between runs.
              dictionary[_id] = "|".join(dict.fromkeys(names))
          else:
              dictionary[_id] = dictionary[_id] + "|" + _name

  def load_dictionary(self):
      """
      Merge the four files into a list of [id, names] pairs.

      ! hlgt, hlt, pt => need to extract id and name
      format id$name$$$$$$$$
      ! llt => need to extract pt_id and name
      format id$name$pt_id$$$$$$$

      llt names are attached to their pt_id and de-duplicated against the
      names already collected for that id.

      Returns
      -------
      list of [id, "name1|name2|..."] in first-seen id order.
      """
      dictionary = {}
      # hlgt, hlt and pt are keyed by their own id (field 0).
      for path in (self.hlgt_path, self.hlt_path, self.pt_path):
          self._merge_file(dictionary, path, id_field=0, dedupe=False)
      # llt is keyed by the pt_id it belongs to (field 2).
      self._merge_file(dictionary, self.llt_path, id_field=2, dedupe=True)

      list_dictionary = [[k,v] for k,v in dictionary.items()]
      return list_dictionary

  def make_ID_mention_map(self, out):
      """Write the merged dictionary to `out`, one "id||names" line per id."""
      dictionary = self.load_dictionary()
      with open(out, 'w') as outfile:
          for row in dictionary:
              outfile.write("||".join(row))
              outfile.write('\n')

In [None]:
# Load the TAC2017 train/test mentions and the matching MedDRA dictionaries
# from Google Drive.
training_query = QueryDataset('/content/drive/My Drive/TAC2017/train')
training_dictionary = DictionaryDataset('/content/drive/My Drive/MedDRA/dictionary/train_dictionary.txt')
test_query = QueryDataset('/content/drive/My Drive/TAC2017/test')
test_dictionary = DictionaryDataset('/content/drive/My Drive/MedDRA/dictionary/test_dictionary.txt')

100%|██████████| 80/80 [00:54<00:00,  1.46it/s]
100%|██████████| 76817/76817 [00:00<00:00, 772773.14it/s]
100%|██████████| 99/99 [01:02<00:00,  1.58it/s]
100%|██████████| 78059/78059 [00:00<00:00, 661042.63it/s]


In [None]:
def find_min_idx(array):
  """
  Returns: the index of the first smallest element of `array`
  (0 when `array` is empty).
  """
  # min() keeps the earliest position on ties, matching a strict `<` scan.
  return min(range(len(array)), key=lambda position: array[position], default=0)

def find_topk_candidate(array, topk):
  """
  Collect the indices of the `topk` largest values of `array`.

  `array` must support list-based indexing (e.g. a NumPy array). The result
  is in slot order, not sorted by value.
  """
  kept = []

  for position, value in enumerate(array):
    # Fill the first `topk` slots unconditionally.
    if len(kept) < topk:
      kept.append(position)
      continue
    # Afterwards, replace the weakest kept entry when a larger value shows up.
    weakest_slot = find_min_idx(array[kept])
    if value > array[kept[weakest_slot]]:
      kept[weakest_slot] = position
  return np.array(kept)

def sort_candidate_descening(array_value, array_idx):
  """
  Selection-sort `array_value` into descending order, applying the same swaps
  to `array_idx`.

  Both sequences are modified in place; `array_idx` is returned.
  """
  size = len(array_value)
  for target in range(size - 1):
    # Locate the first largest value in the unsorted tail.
    best = target
    for probe in range(target + 1, size):
      if array_value[probe] > array_value[best]:
        best = probe
    if best == target:
      continue
    array_value[target], array_value[best] = array_value[best], array_value[target]
    array_idx[target], array_idx[best] = array_idx[best], array_idx[target]
  return array_idx

In [None]:
# Character n-gram TF-IDF encoder; it is fitted inside the helpers that receive it.
sparse_encoder = SparseEncoder()

In [None]:
# Helper functions that compute, for every query, the indices of its sparse (TF-IDF) and dense (BERT) candidates in the dictionary

def get_sparse_candidate_indices(query_text, corpus, sparse_encoder, topk=20):
  """
  Rank dictionary entries for every query by TF-IDF inner product.

  Parameters:
    query_text: list of query string
    corpus: list of text in dictionary
    sparse_encoder: sparse encoder (fitted here on `corpus`) exposing fit/transform
    topk: number of candidates kept per query

  Returns:
    np.ndarray: dictionary indices of the `topk` best-scoring entries per
    query, best first. Shape: (len(query), topk)
  """
  sparse_encoder.fit(corpus)
  query_vectors = sparse_encoder.transform(query_text)
  dictionary_vectors = sparse_encoder.transform(corpus)

  score_matrix = np.dot(query_vectors, dictionary_vectors.transpose())

  ranked_rows = []
  for query_scores in score_matrix:
    # argpartition isolates the top-k without ordering them ...
    unordered_topk = np.argpartition(query_scores, -topk)[-topk:]
    topk_values = copy.deepcopy(query_scores[unordered_topk])
    # ... so order them by score, best first.
    ranked_rows.append(sort_candidate_descening(topk_values, unordered_topk))

  return np.array(ranked_rows)

def get_dense_candidate_indices(query_text, corpus, bert_layer,topk=20):
  """
  Rank dictionary entries for every query by BERT embedding inner product.

  Parameters:
    query_text: list of query string
    corpus: list of text in dictionary
    bert_layer: bert-pre-trained
    topk: number of candidates kept per query

  Returns:
    np.ndarray: dictionary indices of the `topk` best-scoring entries per
    query, best first. Shape: (len(query), topk)
  """
  # Uses the module-level `tokenizer`; sequences are fixed at 25 tokens.
  query_tokens = bert_encode(query_text, tokenizer, max_len=25)
  dictionary_tokens = bert_encode(corpus, tokenizer, max_len=25)

  encoder = DenseEncoder(bert_layer, max_len=25)
  query_embeddings = encoder.get_dense_embedding(query_tokens)            # [None, 768]
  dictionary_embeddings = encoder.get_dense_embedding(dictionary_tokens)  # [None, 768]

  score_matrix = np.dot(query_embeddings, tf.transpose(dictionary_embeddings, perm=[1,0]))

  ranked_rows = []
  for query_scores in score_matrix:
    # Unordered top-k first, then sort those k by score (descending).
    unordered_topk = np.argpartition(query_scores, -topk)[-topk:]
    topk_values = copy.deepcopy(query_scores[unordered_topk])
    ranked_rows.append(sort_candidate_descening(topk_values, unordered_topk))

  return np.array(ranked_rows)

def get_query_candidates_indices(query_text, corpus, sparse_encoder, bert_layer, topk=20):
  """
  Build a mixed candidate list per query: the best sparse candidates followed
  by the best dense candidates that are not already present.

  The first ``topk // 2`` slots hold the top sparse candidates; the remaining
  slots are filled, in dense-rank order, with dense candidates that do not
  duplicate an entry already chosen for that query.

  Parameters:
    query_text: list of query string
    corpus: list of text in dictionary
    sparse_encoder: sparse encoder passed to get_sparse_candidate_indices
    bert_layer: BERT layer passed to get_dense_candidate_indices
    topk: number of candidates of a query

  Returns:
    np.ndarray (float64, as before): matrix of indices of candidate in
    dictionary base on dense and sparse score. Shape: (len(query), topk)

  Raises:
    ValueError: if fewer than `topk` distinct candidates could be collected.
  """
  n_sparse = topk // 2
  # Fetch more dense candidates than needed so duplicates of the sparse picks
  # can be skipped; a pool of at least `topk` always leaves enough.
  dense_pool_size = max(50, topk)

  sparse_candidate_indices = get_sparse_candidate_indices(query_text, corpus, sparse_encoder,topk)
  dense_candidate_indices = get_dense_candidate_indices(query_text, corpus, bert_layer, topk=dense_pool_size)

  n_queries = sparse_candidate_indices.shape[0]
  candidates_indices = np.empty((n_queries, topk))

  for idx in range(n_queries):
    merged = [int(i) for i in sparse_candidate_indices[idx][:n_sparse]]
    # Track membership in a set of real picks only; the output buffer itself
    # is uninitialised memory and must never be compared against.
    taken = set(merged)
    # Walk the whole dense pool, not just its first `topk` entries, so every
    # slot gets filled even when dense and sparse picks overlap.
    for dense_idx in dense_candidate_indices[idx]:
      if len(merged) == topk:
        break
      dense_idx = int(dense_idx)
      if dense_idx not in taken:
        merged.append(dense_idx)
        taken.add(dense_idx)

    if len(merged) != topk:
      raise ValueError(
          "could only collect {} of {} candidates for query {}".format(len(merged), topk, idx))
    candidates_indices[idx, :] = merged

  return candidates_indices

def get_dense_candidate_embeddings(candidate_indices, dictionary, bert_layer):
  """
  Embed the dictionary names of every query's candidates with BERT.

  Parameters:
    candidate_indices: matrix of candidate indices (one row per query)
    dictionary: object whose `.data[:, 0]` holds the dictionary names
    bert_layer: bert-pretrained packaged in layer

  Returns:
    np.ndarray: matrix of dense embeddings of candidates of queries.
    Shape: (len(query), topk, 768)
  """
  # Look up the surface form of each candidate, one row per query.
  candidates_raw_text = np.array([
      dictionary.data[row.astype(int).tolist(), 0]
      for row in candidate_indices
  ])

  # Tokenize each query's candidates with the module-level `tokenizer`.
  candidate_tokens = [
      bert_encode(row_texts, tokenizer, max_len=25)
      for row_texts in candidates_raw_text
  ]

  dense_encoder = DenseEncoder(bert_layer, max_len=25)
  # One forward pass per query row.
  candidate_embeddings = [
      dense_encoder.get_dense_embedding(row_tokens)
      for row_tokens in candidate_tokens
  ]

  return np.array(candidate_embeddings)

#candidate_embeddings = get_query_candidates_embeddings(candidates_indices, training_dictionary, bert_layer)
def get_sparse_candidate_score(candidates_indices, sparse_score_matix):
  """
  Pick, for each query, the sparse scores of its own candidates.

  Parameters:
    candidates_indices: matrix of candidate indices, one row per query
    sparse_score_matix: sparse score of all queries against the whole
      dictionary, one row per query

  Returns:
    np.ndarray: sparse score of every candidate of every query.
    Shape: (len(query), topk)
  """
  # Row i of the score matrix is indexed with row i of the candidate matrix.
  per_query_scores = [
      sparse_score_matix[query_no][candidate_row.astype(int).tolist()]
      for query_no, candidate_row in enumerate(candidates_indices)
  ]
  return np.array(per_query_scores)

def get_labels_of_candidates(true_labels, candidates_indices, dictionary):
  """
  Mark which candidates of each query carry a correct concept ID.

  A candidate counts as correct when at least one of its "|"-separated IDs is
  exactly equal to one of the gold IDs of the query. IDs are compared as whole
  tokens; a substring test would, for instance, accept "100" for gold "1001".

  Parameters:
    true_labels: per-query gold label, either a "|"-separated ID string or an
      iterable of ID strings
    candidates_indices: matrix of candidate indices, one row per query
    dictionary: object whose `.data[:, 1]` holds the "|"-separated IDs of each
      dictionary entry

  Returns:
    list of list of int: 1 where the candidate is correct, else 0.
    Shape: (len(query), topk)
  """
  label_candidates = []
  for idx, row in enumerate(candidates_indices):
    gold = true_labels[idx]
    # Normalise the gold label to a set of individual IDs.
    gold_ids = set(gold.split('|')) if isinstance(gold, str) else set(gold)

    predicted_labels = dictionary.data[row.astype(int).tolist(), 1]
    row_label = [
        1 if gold_ids.intersection(labels.split('|')) else 0
        for labels in predicted_labels
    ]
    label_candidates.append(row_label)

  return label_candidates

def get_sparse_query_score(sparse_encoder, corpus, query_text):
  """
  Score every query against every dictionary entry with the sparse encoder.

  The encoder is (re)fitted on `corpus` before encoding.

  Returns:
    Matrix of inner products. Shape: (len(query_text), len(corpus))
  """
  sparse_encoder.fit(corpus)
  query_vectors = sparse_encoder.transform(query_text)
  dictionary_vectors = sparse_encoder.transform(corpus)
  return np.dot(query_vectors, dictionary_vectors.transpose())

In [None]:
# Define model ReRanker
def marginal_loss(output, target):
  """Negative log of the softmax probability mass placed on positive candidates.

  `output` is softmax-normalised over its last axis, multiplied element-wise
  by `target` and summed per row. Rows whose sum is not positive are dropped;
  the mean of -log(sum) over the remaining rows is returned (0 when no row
  remains).

  NOTE(review): Keras calls a compiled loss as loss(y_true, y_pred). With this
  parameter order `output` would receive y_true and `target` y_pred -- confirm
  how the training code invokes this function.
  """
  predict = tf.nn.softmax(tf.cast(output, dtype=tf.float32))
  loss = predict*target
  loss = K.sum(loss,axis=-1)                  # sum all positive scores

  loss = loss[loss > 0]                     # filter sets with at least one positives
  loss = -K.log(loss)                   # for negative log likelihood
  # NOTE(review): len() on a tensor needs a known first dimension; presumably
  # this runs eagerly -- verify.
  if len(loss) == 0:
      loss = K.sum(loss)                     # will return zero loss
  else:
      loss = K.mean(loss)
  return loss


def build_model(bert_layer, topk=20, max_len=25):
  """
  Build and compile the re-ranker.

  The model embeds the query with `bert_layer` (vector at sequence position
  0), takes its inner product with each pre-computed candidate embedding, and
  adds the candidates' sparse scores scaled by the learnable 'sparse_weight'.

  Parameters:
    bert_layer: Keras layer returning (pooled_output, sequence_output)
    topk: number of candidates per query
    max_len: token length of the query inputs

  Inputs of the returned model:
    [query_word_ids, query_mask, query_segment_ids]: int32, shape (max_len,)
    candidates_dense_embeddings: float32, shape (topk, 768)
    candidate_sparse_score: float32, shape (topk,)

  Returns:
    Compiled tf.keras Model whose output is the combined score per candidate.
    Shape: (batch, topk)
  """
  query_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="query_word_ids")
  query_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="query_mask")
  query_segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="query_segment_ids")

  pooled_output, sequence_output = bert_layer([query_word_ids, query_mask, query_segment_ids])
  query_dense_embeddings = sequence_output[:, 0, :] # [None, 768]

  # Dense score: inner product of each candidate embedding with the query.
  candidates_dense_embeddings = tf.keras.Input(shape=(topk,768,), dtype=tf.float32, name="candidates_dense_embeddings")
  candidates_dense_score = keras.layers.Dot(axes=(2,1),name="dense_score")([candidates_dense_embeddings,query_dense_embeddings])

  # One sparse score per candidate. The previous code sized this input from
  # candidates_dense_score.shape[0], which is the (unknown) batch dimension,
  # so the declared shape was (None, None) rather than (None, topk).
  candidate_sparse_score = tf.keras.Input(shape=(topk,), dtype=tf.float32, name="candidate_sparse_score")
  scaling_sparse_score = Scalar(name='sparse_weight')(candidate_sparse_score)

  score = scaling_sparse_score + candidates_dense_score

  # Per-variable learning-rate multipliers, matched on variable-name prefix.
  lr_multiplier = {
    'bert_layer':1,          # BERT weights use the base learning rate
    'sparse_weight':0.5e+3   # the sparse weight learns 500x faster
    }

  opt = AdamLRM(lr=1e-5, lr_multiplier=lr_multiplier)

  model =  tf.keras.models.Model(inputs = [[query_word_ids, query_mask, query_segment_ids],candidates_dense_embeddings, candidate_sparse_score], outputs=score )
  model.compile(optimizer=opt,loss=marginal_loss)
  return model

def retreival(query, dictionary,bert_layer, sparse_weight, max_len=25, topk=20):
  """
  Retrieve the `topk` dictionary entries with the highest combined score
  (sparse score * sparse_weight + dense score) for every query.

  Parameters:
    query: list/array of query strings
    dictionary: object whose `.data[:, 0]` holds the dictionary names
    bert_layer: BERT layer used for the dense embeddings
    sparse_weight: scalar multiplier applied to the sparse score
    max_len: token length used when encoding text for BERT
    topk: number of candidates returned per query

  Returns:
    np.ndarray: dictionary indices, best first. Shape: (len(query), topk)
  """
  dictionary_names = dictionary.data[:,0]

  # Sparse side: TF-IDF fitted on the dictionary names.
  tfidf_encoder = SparseEncoder()
  tfidf_encoder.fit(dictionary_names)
  query_sparse_embeddings = tfidf_encoder.transform(query)
  dictionary_sparse_embeddings = tfidf_encoder.transform(dictionary_names)
  query_sparse_score = np.dot(query_sparse_embeddings, dictionary_sparse_embeddings.transpose())

  # Dense side: BERT embeddings, tokenized with the module-level `tokenizer`.
  query_tokens = bert_encode(query, tokenizer, max_len)
  dictionary_tokens = bert_encode(dictionary_names, tokenizer, max_len)
  dense_encoder = DenseEncoder(bert_layer, max_len)
  query_dense_embeddings = dense_encoder.get_dense_embedding(query_tokens)
  dictionary_dense_embeddings = dense_encoder.get_dense_embedding(dictionary_tokens)
  query_dense_score = np.dot(query_dense_embeddings, tf.transpose(dictionary_dense_embeddings, perm=[1,0]))

  query_score = query_sparse_score*sparse_weight + query_dense_score

  ranked_rows = []
  for combined_scores in query_score:
    # Unordered top-k first, then sort those k by score (descending).
    unordered_topk = np.argpartition(combined_scores, -topk)[-topk:]
    topk_values = copy.deepcopy(combined_scores[unordered_topk])
    ranked_rows.append(sort_candidate_descening(topk_values, unordered_topk))

  return np.array(ranked_rows)

def evaluate(true_labels,candidate_indices, dictionary):
  """
  Compute top-1 / top-5 / top-10 / top-20 accuracy of the ranked candidates.

  A query is a hit at cutoff k when any of its first k candidates is labelled
  correct by `get_labels_of_candidates`.

  Parameters:
    true_labels: gold label of every query
    candidate_indices: ranked candidate indices, one row per query
    dictionary: dictionary the indices refer to

  Returns:
    [top_1, top_5, top_10, top_20] as fractions in [0, 1]; all zeros when
    there are no queries.
  """
  label_candidates = get_labels_of_candidates(true_labels,candidate_indices, dictionary)
  n = len(label_candidates)
  if n == 0:
    # Nothing to score; avoid dividing by zero.
    return [0.0, 0.0, 0.0, 0.0]

  cutoffs = (1, 5, 10, 20)
  hits = [sum(1 for row in label_candidates if 1 in row[:k]) for k in cutoffs]
  return [hit_count / n for hit_count in hits]
#candidates_test_indices = retreival(training_query, training_dictionary, bert_layer, sparse_weight[0])

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensorflow.python.framework import ops
from tensorflow.python.keras import backend_config
from tensorflow.python.keras.optimizer_v2 import optimizer_v2
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.training import training_ops
from tensorflow.python.util.tf_export import keras_export


@keras_export('keras.optimizers.AdamLRM')
class AdamLRM(optimizer_v2.OptimizerV2):
  """Adam optimizer with per-variable learning-rate multipliers.

  `lr_multiplier` maps a variable-name prefix to a factor; every variable
  whose name starts with that prefix is updated with
  ``learning_rate * factor``. Variables matching no prefix use the base
  learning rate. If several prefixes match, the last one in the mapping wins.
  """

  def __init__(self,
               learning_rate=0.001,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=1e-7,
               amsgrad=False,
               lr_multiplier=None,
               name='AdamLRM',
               **kwargs):
    """
    Parameters
    ----------
    learning_rate: base learning rate (a legacy `lr=` keyword overrides it)
    beta_1, beta_2: decay rates of the first / second moment estimates
    epsilon: numerical-stability constant; falls back to the Keras backend
      epsilon when falsy
    amsgrad: use the AMSGrad variant when True
    lr_multiplier: dict {variable-name prefix: multiplier}; None means no
      multipliers
    name: optimizer name
    **kwargs: forwarded to OptimizerV2
    """
    super(AdamLRM, self).__init__(name, **kwargs)
    # Copy so a dict shared with the caller is never aliased or mutated.
    lr_multiplier = dict(lr_multiplier or {})
    self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
    self._set_hyper('decay', self._initial_decay)
    self._set_hyper('beta_1', beta_1)
    self._set_hyper('beta_2', beta_2)
    self.epsilon = epsilon or backend_config.epsilon()
    self.amsgrad = amsgrad
    self._lrm_names = list(lr_multiplier.keys())
    # Each multiplier is stored as its own hyperparameter named 'lrm_<prefix>'.
    for k,v in lr_multiplier.items():
      self._set_hyper(f'lrm_{k}', v)

  def _create_slots(self, var_list):
    # Create slots for the first and second moments.
    # Separate for-loops to respect the ordering of slot variables from v1.
    for var in var_list:
      self.add_slot(var, 'm')
    for var in var_list:
      self.add_slot(var, 'v')
    if self.amsgrad:
      for var in var_list:
        self.add_slot(var, 'vhat')

  def _prepare_local(self, var_device, var_dtype, apply_state):
    """Cache the per-(device, dtype) coefficients used by the apply methods."""
    super(AdamLRM, self)._prepare_local(var_device, var_dtype, apply_state)

    local_step = math_ops.cast(self.iterations + 1, var_dtype)
    beta_1_t = array_ops.identity(self._get_hyper('beta_1', var_dtype))
    beta_2_t = array_ops.identity(self._get_hyper('beta_2', var_dtype))
    beta_1_power = math_ops.pow(beta_1_t, local_step)
    beta_2_power = math_ops.pow(beta_2_t, local_step)
    # 'lr' is the bias-corrected step size; it is used by the sparse path.
    lr = (apply_state[(var_device, var_dtype)]['lr_t'] *
          (math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power)))
    apply_state[(var_device, var_dtype)].update(dict(
        lr=lr,
        epsilon=ops.convert_to_tensor(self.epsilon, var_dtype),
        beta_1_t=beta_1_t,
        beta_1_power=beta_1_power,
        one_minus_beta_1_t=1 - beta_1_t,
        beta_2_t=beta_2_t,
        beta_2_power=beta_2_power,
        one_minus_beta_2_t=1 - beta_2_t
    ))

  def set_weights(self, weights):
    params = self.weights
    # If the weights are generated by Keras V1 optimizer, it includes vhats
    # even without amsgrad, i.e, V1 optimizer has 3x + 1 variables, while V2
    # optimizer has 2x + 1 variables. Filter vhats out for compatibility.
    num_vars = int((len(params) - 1) / 2)
    if len(weights) == 3 * num_vars + 1:
      weights = weights[:len(params)]
    super(AdamLRM, self).set_weights(weights)

  def _resource_apply_dense(self, grad, var, apply_state=None):
    """Apply a dense Adam update to `var`, scaled by its multiplier if any."""
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))

    m = self.get_slot(var, 'm')
    v = self.get_slot(var, 'v')

    # Start from the base rate and scale it when the variable name matches a
    # registered prefix.
    lr_t = coefficients['lr_t']
    for k in self._lrm_names:
      if var.name.startswith(k):
        lr_t = coefficients['lr_t'] * self._get_hyper(f'lrm_{k}', var.dtype)

    if not self.amsgrad:
      return training_ops.resource_apply_adam(
          var.handle,
          m.handle,
          v.handle,
          coefficients['beta_1_power'],
          coefficients['beta_2_power'],
          lr_t,
          coefficients['beta_1_t'],
          coefficients['beta_2_t'],
          coefficients['epsilon'],
          grad,
          use_locking=self._use_locking)
    else:
      vhat = self.get_slot(var, 'vhat')
      return training_ops.resource_apply_adam_with_amsgrad(
          var.handle,
          m.handle,
          v.handle,
          vhat.handle,
          coefficients['beta_1_power'],
          coefficients['beta_2_power'],
          lr_t,
          coefficients['beta_1_t'],
          coefficients['beta_2_t'],
          coefficients['epsilon'],
          grad,
          use_locking=self._use_locking)

  def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
    """Apply a sparse Adam update to the rows of `var` selected by `indices`."""
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))

    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, 'm')
    m_scaled_g_values = grad * coefficients['one_minus_beta_1_t']
    m_t = state_ops.assign(m, m * coefficients['beta_1_t'],
                           use_locking=self._use_locking)
    with ops.control_dependencies([m_t]):
      m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, 'v')
    v_scaled_g_values = (grad * grad) * coefficients['one_minus_beta_2_t']
    v_t = state_ops.assign(v, v * coefficients['beta_2_t'],
                           use_locking=self._use_locking)
    with ops.control_dependencies([v_t]):
      v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)

    # Bias-corrected rate, scaled when the variable name matches a prefix.
    lr = coefficients['lr']
    for k in self._lrm_names:
      if var.name.startswith(k):
        lr = coefficients['lr'] * self._get_hyper(f'lrm_{k}', var.dtype)

    if not self.amsgrad:
      v_sqrt = math_ops.sqrt(v_t)
      var_update = state_ops.assign_sub(
          var, lr * m_t / (v_sqrt + coefficients['epsilon']),
          use_locking=self._use_locking)
      return control_flow_ops.group(*[var_update, m_t, v_t])
    else:
      v_hat = self.get_slot(var, 'vhat')
      v_hat_t = math_ops.maximum(v_hat, v_t)
      with ops.control_dependencies([v_hat_t]):
        v_hat_t = state_ops.assign(
            v_hat, v_hat_t, use_locking=self._use_locking)
      v_hat_sqrt = math_ops.sqrt(v_hat_t)
      var_update = state_ops.assign_sub(
          var,
          lr * m_t / (v_hat_sqrt + coefficients['epsilon']),
          use_locking=self._use_locking)
      return control_flow_ops.group(*[var_update, m_t, v_t, v_hat_t])

  def get_config(self):
    """Return a config that `from_config` can turn back into this optimizer."""
    config = super(AdamLRM, self).get_config()
    config.update({
        'learning_rate': self._serialize_hyperparameter('learning_rate'),
        'decay': self._serialize_hyperparameter('decay'),
        'beta_1': self._serialize_hyperparameter('beta_1'),
        'beta_2': self._serialize_hyperparameter('beta_2'),
        'epsilon': self.epsilon,
        'amsgrad': self.amsgrad,
    })
    # Emit the multipliers under the constructor's own keyword. Writing them
    # as top-level keys named after the prefixes made from_config pass e.g.
    # `bert_layer=1` to __init__, which the base class rejects.
    config['lr_multiplier'] = {
        k: self._serialize_hyperparameter(f'lrm_{k}') for k in self._lrm_names
    }
    return config


In [None]:
# Build the re-ranker around the shared `bert_layer`; checkpoints are loaded into it below.
model = build_model(bert_layer,topk=20,max_len=25)

In [None]:
# Restore the checkpoint named 'biosyn_tac2017_epoch_0' from Google Drive.
model.load_weights('/content/drive/My Drive/biosyn_tac2017_epoch_0')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f42421a3fd0>

In [None]:
# Re-compile the model with the same optimizer settings that build_model uses.
lr_multiplier = {
'bert_layer':1, # multiplier 1: BERT weights use the base learning rate
'sparse_weight':0.5e+3   # multiplier 500: the sparse weight uses a much larger learning rate
}

opt = AdamLRM(lr=1e-5, lr_multiplier=lr_multiplier)
model.compile(optimizer=opt,loss=marginal_loss)


In [None]:

# Evaluate the currently loaded checkpoint on the test set.
# Read the learned scalar out of the 'sparse_weight' layer ...
sparse_weight = model.get_layer(name='sparse_weight').get_weights()
print('sparse_weight:', sparse_weight[0])
# ... and use it, together with the shared `bert_layer`, to rank the test dictionary.
candidates_test_indices = retreival(test_query.data[:,0], test_dictionary, bert_layer, sparse_weight[0])
res = evaluate(test_query.data[:,1],candidates_test_indices, test_dictionary)
print('Top 1: {:.2f}% '.format(res[0]*100) + 'Top 5: {:.2f}% '.format(res[1]*100) + 'Top 10: {:.2f}% '.format(res[2]*100) + 'Top 20: {:.2f}% '.format(res[3]*100))


sparse_weight: 3.2980733
Top 1: 61.78% Top 5: 82.86% Top 10: 86.65% Top 20: 89.61% 


In [None]:
# Restore the checkpoint named 'biosyn_tac2017_epoch_1' and re-compile.
model.load_weights('/content/drive/My Drive/biosyn_tac2017_epoch_1')
lr_multiplier = {
'bert_layer':1, # multiplier 1: BERT weights use the base learning rate
'sparse_weight':0.5e+3   # multiplier 500: the sparse weight uses a much larger learning rate
}

opt = AdamLRM(lr=1e-5, lr_multiplier=lr_multiplier)
model.compile(optimizer=opt,loss=marginal_loss)


# Evaluate on the test set using the learned sparse weight and the shared `bert_layer`.
sparse_weight = model.get_layer(name='sparse_weight').get_weights()
print('sparse_weight:', sparse_weight[0])
candidates_test_indices = retreival(test_query.data[:,0], test_dictionary, bert_layer, sparse_weight[0])
res = evaluate(test_query.data[:,1],candidates_test_indices, test_dictionary)
print('Top 1: {:.2f}% '.format(res[0]*100) + 'Top 5: {:.2f}% '.format(res[1]*100) + 'Top 10: {:.2f}% '.format(res[2]*100) + 'Top 20: {:.2f}% '.format(res[3]*100))



































sparse_weight: 6.210837
Top 1: 83.18% Top 5: 91.94% Top 10: 93.08% Top 20: 93.84% 


In [None]:
# Restore the checkpoint named 'biosyn_tac2017_epoch_2' and re-compile.
model.load_weights('/content/drive/My Drive/biosyn_tac2017_epoch_2')
lr_multiplier = {
'bert_layer':1, # multiplier 1: BERT weights use the base learning rate
'sparse_weight':0.5e+3   # multiplier 500: the sparse weight uses a much larger learning rate
}

opt = AdamLRM(lr=1e-5, lr_multiplier=lr_multiplier)
model.compile(optimizer=opt,loss=marginal_loss)


# Evaluate on the test set using the learned sparse weight and the shared `bert_layer`.
sparse_weight = model.get_layer(name='sparse_weight').get_weights()
print('sparse_weight:', sparse_weight[0])
candidates_test_indices = retreival(test_query.data[:,0], test_dictionary, bert_layer, sparse_weight[0])
res = evaluate(test_query.data[:,1],candidates_test_indices, test_dictionary)
print('Top 1: {:.2f}% '.format(res[0]*100) + 'Top 5: {:.2f}% '.format(res[1]*100) + 'Top 10: {:.2f}% '.format(res[2]*100) + 'Top 20: {:.2f}% '.format(res[3]*100))



































sparse_weight: 9.55532
Top 1: 90.10% Top 5: 93.36% Top 10: 94.14% Top 20: 95.03% 


In [None]:
# Restore the checkpoint named 'biosyn_tac2017_epoch_3' and re-compile.
model.load_weights('/content/drive/My Drive/biosyn_tac2017_epoch_3')
lr_multiplier = {
'bert_layer':1, # multiplier 1: BERT weights use the base learning rate
'sparse_weight':0.5e+3   # multiplier 500: the sparse weight uses a much larger learning rate
}

opt = AdamLRM(lr=1e-5, lr_multiplier=lr_multiplier)
model.compile(optimizer=opt,loss=marginal_loss)


# Evaluate on the test set using the learned sparse weight and the shared `bert_layer`.
sparse_weight = model.get_layer(name='sparse_weight').get_weights()
print('sparse_weight:', sparse_weight[0])
candidates_test_indices = retreival(test_query.data[:,0], test_dictionary, bert_layer, sparse_weight[0])
res = evaluate(test_query.data[:,1],candidates_test_indices, test_dictionary)
print('Top 1: {:.2f}% '.format(res[0]*100) + 'Top 5: {:.2f}% '.format(res[1]*100) + 'Top 10: {:.2f}% '.format(res[2]*100) + 'Top 20: {:.2f}% '.format(res[3]*100))



































sparse_weight: 12.900183
Top 1: 90.38% Top 5: 93.63% Top 10: 94.59% Top 20: 95.41% 


In [None]:
# Restore the checkpoint named 'biosyn_tac2017_epoch_4' (no re-compile in this cell).
model.load_weights('/content/drive/My Drive/biosyn_tac2017_epoch_4')

In [None]:

# Evaluate the currently loaded checkpoint on the test set using the learned
# sparse weight and the shared `bert_layer`.
sparse_weight = model.get_layer(name='sparse_weight').get_weights()
print('sparse_weight:', sparse_weight[0])
candidates_test_indices = retreival(test_query.data[:,0], test_dictionary, bert_layer, sparse_weight[0])
res = evaluate(test_query.data[:,1],candidates_test_indices, test_dictionary)
print('Top 1: {:.2f}% '.format(res[0]*100) + 'Top 5: {:.2f}% '.format(res[1]*100) + 'Top 10: {:.2f}% '.format(res[2]*100) + 'Top 20: {:.2f}% '.format(res[3]*100))


sparse_weight: 16.242441
Top 1: 90.29% Top 5: 93.79% Top 10: 94.75% Top 20: 95.62% 


In [None]:
# Restore the checkpoint named 'biosyn_tac2017_epoch_5' and re-compile.
model.load_weights('/content/drive/My Drive/biosyn_tac2017_epoch_5')
lr_multiplier = {
'bert_layer':1, # multiplier 1: BERT weights use the base learning rate
'sparse_weight':0.5e+3   # multiplier 500: the sparse weight uses a much larger learning rate
}

opt = AdamLRM(lr=1e-5, lr_multiplier=lr_multiplier)
model.compile(optimizer=opt,loss=marginal_loss)


# Evaluate on the test set using the learned sparse weight and the shared `bert_layer`.
sparse_weight = model.get_layer(name='sparse_weight').get_weights()
print('sparse_weight:', sparse_weight[0])
candidates_test_indices = retreival(test_query.data[:,0], test_dictionary, bert_layer, sparse_weight[0])
res = evaluate(test_query.data[:,1],candidates_test_indices, test_dictionary)
print('Top 1: {:.2f}% '.format(res[0]*100) + 'Top 5: {:.2f}% '.format(res[1]*100) + 'Top 10: {:.2f}% '.format(res[2]*100) + 'Top 20: {:.2f}% '.format(res[3]*100))



































sparse_weight: 19.590744
Top 1: 90.40% Top 5: 93.57% Top 10: 94.70% Top 20: 95.59% 


In [None]:
# Restore the weights stored under 'biosyn_tac2017_epoch_6' on Drive.
model.load_weights('/content/drive/My Drive/biosyn_tac2017_epoch_6')

# Per-layer multipliers handed to AdamLRM together with the base rate 1e-5.
# NOTE(review): AdamLRM is defined outside this cell; presumably each
# multiplier scales the base rate for the layer of that name -- confirm.
lr_multiplier = dict(
    bert_layer=1,          # multiplier of 1 for the BERT layer
    sparse_weight=0.5e+3,  # multiplier of 500 for the sparse weight (larger learning rate)
)
opt = AdamLRM(lr=1e-5, lr_multiplier=lr_multiplier)
model.compile(loss=marginal_loss, optimizer=opt)

# Evaluate: read the weights of the 'sparse_weight' layer, retrieve candidates
# for the test queries (column 0) and score them against column 1.
sparse_weight = model.get_layer(name='sparse_weight').get_weights()
print('sparse_weight:', sparse_weight[0])
candidates_test_indices = retreival(
    test_query.data[:, 0], test_dictionary, bert_layer, sparse_weight[0]
)
res = evaluate(test_query.data[:, 1], candidates_test_indices, test_dictionary)
# res[0..3] are reported as the Top 1 / 5 / 10 / 20 figures, as percentages.
print(''.join([
    'Top 1: {:.2f}% '.format(res[0] * 100),
    'Top 5: {:.2f}% '.format(res[1] * 100),
    'Top 10: {:.2f}% '.format(res[2] * 100),
    'Top 20: {:.2f}% '.format(res[3] * 100),
]))



































sparse_weight: 22.944056
Top 1: 90.27% Top 5: 93.60% Top 10: 94.77% Top 20: 95.63% 


In [None]:
# Restore the weights stored under 'biosyn_tac2017_epoch_7' on Drive.
model.load_weights('/content/drive/My Drive/biosyn_tac2017_epoch_7')

# Per-layer multipliers handed to AdamLRM together with the base rate 1e-5.
# NOTE(review): AdamLRM is defined outside this cell; presumably each
# multiplier scales the base rate for the layer of that name -- confirm.
lr_multiplier = dict(
    bert_layer=1,          # multiplier of 1 for the BERT layer
    sparse_weight=0.5e+3,  # multiplier of 500 for the sparse weight (larger learning rate)
)
opt = AdamLRM(lr=1e-5, lr_multiplier=lr_multiplier)
model.compile(loss=marginal_loss, optimizer=opt)

# Evaluate: read the weights of the 'sparse_weight' layer, retrieve candidates
# for the test queries (column 0) and score them against column 1.
sparse_weight = model.get_layer(name='sparse_weight').get_weights()
print('sparse_weight:', sparse_weight[0])
candidates_test_indices = retreival(
    test_query.data[:, 0], test_dictionary, bert_layer, sparse_weight[0]
)
res = evaluate(test_query.data[:, 1], candidates_test_indices, test_dictionary)
# res[0..3] are reported as the Top 1 / 5 / 10 / 20 figures, as percentages.
print(''.join([
    'Top 1: {:.2f}% '.format(res[0] * 100),
    'Top 5: {:.2f}% '.format(res[1] * 100),
    'Top 10: {:.2f}% '.format(res[2] * 100),
    'Top 20: {:.2f}% '.format(res[3] * 100),
]))



































sparse_weight: 26.3024
Top 1: 90.26% Top 5: 93.50% Top 10: 94.64% Top 20: 95.54% 


In [None]:
# Restore the weights stored under 'biosyn_tac2017_epoch_8' on Drive.
model.load_weights('/content/drive/My Drive/biosyn_tac2017_epoch_8')

# Per-layer multipliers handed to AdamLRM together with the base rate 1e-5.
# NOTE(review): AdamLRM is defined outside this cell; presumably each
# multiplier scales the base rate for the layer of that name -- confirm.
lr_multiplier = dict(
    bert_layer=1,          # multiplier of 1 for the BERT layer
    sparse_weight=0.5e+3,  # multiplier of 500 for the sparse weight (larger learning rate)
)
opt = AdamLRM(lr=1e-5, lr_multiplier=lr_multiplier)
model.compile(loss=marginal_loss, optimizer=opt)

# Evaluate: read the weights of the 'sparse_weight' layer, retrieve candidates
# for the test queries (column 0) and score them against column 1.
sparse_weight = model.get_layer(name='sparse_weight').get_weights()
print('sparse_weight:', sparse_weight[0])
candidates_test_indices = retreival(
    test_query.data[:, 0], test_dictionary, bert_layer, sparse_weight[0]
)
res = evaluate(test_query.data[:, 1], candidates_test_indices, test_dictionary)
# res[0..3] are reported as the Top 1 / 5 / 10 / 20 figures, as percentages.
print(''.join([
    'Top 1: {:.2f}% '.format(res[0] * 100),
    'Top 5: {:.2f}% '.format(res[1] * 100),
    'Top 10: {:.2f}% '.format(res[2] * 100),
    'Top 20: {:.2f}% '.format(res[3] * 100),
]))



































sparse_weight: 29.655035
Top 1: 90.19% Top 5: 93.49% Top 10: 94.56% Top 20: 95.48% 


In [None]:
# Restore the weights stored under 'biosyn_tac2017_epoch_9' on Drive.
model.load_weights('/content/drive/My Drive/biosyn_tac2017_epoch_9')

# Per-layer multipliers handed to AdamLRM together with the base rate 1e-5.
# NOTE(review): AdamLRM is defined outside this cell; presumably each
# multiplier scales the base rate for the layer of that name -- confirm.
lr_multiplier = dict(
    bert_layer=1,          # multiplier of 1 for the BERT layer
    sparse_weight=0.5e+3,  # multiplier of 500 for the sparse weight (larger learning rate)
)
opt = AdamLRM(lr=1e-5, lr_multiplier=lr_multiplier)
model.compile(loss=marginal_loss, optimizer=opt)

# Evaluate: read the weights of the 'sparse_weight' layer, retrieve candidates
# for the test queries (column 0) and score them against column 1.
sparse_weight = model.get_layer(name='sparse_weight').get_weights()
print('sparse_weight:', sparse_weight[0])
candidates_test_indices = retreival(
    test_query.data[:, 0], test_dictionary, bert_layer, sparse_weight[0]
)
res = evaluate(test_query.data[:, 1], candidates_test_indices, test_dictionary)
# res[0..3] are reported as the Top 1 / 5 / 10 / 20 figures, as percentages.
print(''.join([
    'Top 1: {:.2f}% '.format(res[0] * 100),
    'Top 5: {:.2f}% '.format(res[1] * 100),
    'Top 10: {:.2f}% '.format(res[2] * 100),
    'Top 20: {:.2f}% '.format(res[3] * 100),
]))



































sparse_weight: 33.00549
Top 1: 90.15% Top 5: 93.44% Top 10: 94.55% Top 20: 95.59% 


In [None]:
# Restore the weights stored under 'biosyn_tac2017_epoch_10.h5' on Drive.
model.load_weights('/content/drive/My Drive/biosyn_tac2017_epoch_10.h5')

# Per-layer multipliers handed to AdamLRM together with the base rate 1e-5.
# NOTE(review): AdamLRM is defined outside this cell; presumably each
# multiplier scales the base rate for the layer of that name -- confirm.
lr_multiplier = dict(
    bert_layer=1,          # multiplier of 1 for the BERT layer
    sparse_weight=0.5e+3,  # multiplier of 500 for the sparse weight (larger learning rate)
)
opt = AdamLRM(lr=1e-5, lr_multiplier=lr_multiplier)
model.compile(loss=marginal_loss, optimizer=opt)

# Evaluate: read the weights of the 'sparse_weight' layer, retrieve candidates
# for the test queries (column 0) and score them against column 1.
sparse_weight = model.get_layer(name='sparse_weight').get_weights()
print('sparse_weight:', sparse_weight[0])
candidates_test_indices = retreival(
    test_query.data[:, 0], test_dictionary, bert_layer, sparse_weight[0]
)
res = evaluate(test_query.data[:, 1], candidates_test_indices, test_dictionary)
# res[0..3] are reported as the Top 1 / 5 / 10 / 20 figures, as percentages.
print(''.join([
    'Top 1: {:.2f}% '.format(res[0] * 100),
    'Top 5: {:.2f}% '.format(res[1] * 100),
    'Top 10: {:.2f}% '.format(res[2] * 100),
    'Top 20: {:.2f}% '.format(res[3] * 100),
]))



sparse_weight: 36.36214
Top 1: 90.24% Top 5: 93.46% Top 10: 94.64% Top 20: 95.62% 


In [None]:
# Train for two epochs on the prepared inputs and labels; the value returned
# by fit is left as the cell's last expression so it is displayed.
model.fit(
    x=training_data,
    y=label_candidates,
    batch_size=16,
    epochs=2,
    verbose=1,
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f51cd070310>

In [None]:
# Train for two epochs on the prepared inputs and labels; the value returned
# by fit is left as the cell's last expression so it is displayed.
model.fit(
    x=training_data,
    y=label_candidates,
    batch_size=16,
    epochs=2,
    verbose=1,
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f6ca14c6f50>

In [None]:
# Train for two epochs, then write the weights to Drive as
# 'biosyn_tac2017_epoch_2'.
model.fit(
    x=training_data,
    y=label_candidates,
    batch_size=16,
    epochs=2,
    verbose=1,
)
file_path = ''.join(['/content/drive/My Drive/', 'biosyn_tac2017_epoch_2'])
model.save_weights(file_path)

Epoch 1/2
Epoch 2/2


In [None]:
# Train for two epochs, then write the weights to Drive as
# 'biosyn_tac2017_epoch_3'.
model.fit(
    x=training_data,
    y=label_candidates,
    batch_size=16,
    epochs=2,
    verbose=1,
)
file_path = ''.join(['/content/drive/My Drive/', 'biosyn_tac2017_epoch_3'])
model.save_weights(file_path)

Epoch 1/2
Epoch 2/2


In [None]:
# Train for two epochs, then write the weights to Drive as
# 'biosyn_tac2017_epoch_4'.
model.fit(
    x=training_data,
    y=label_candidates,
    batch_size=16,
    epochs=2,
    verbose=1,
)
file_path = ''.join(['/content/drive/My Drive/', 'biosyn_tac2017_epoch_4'])
model.save_weights(file_path)

Epoch 1/2
Epoch 2/2


In [None]:
# Evaluate the model as it currently stands: read the weights of the
# 'sparse_weight' layer, retrieve candidates for the test queries (column 0)
# and score them against column 1.
sparse_weight = model.get_layer(name='sparse_weight').get_weights()
print('sparse_weight:', sparse_weight[0])
candidates_test_indices = retreival(
    test_query.data[:, 0], test_dictionary, bert_layer, sparse_weight[0]
)
res = evaluate(test_query.data[:, 1], candidates_test_indices, test_dictionary)
# res[0..3] are reported as the Top 1 / 5 / 10 / 20 figures, as percentages.
print(''.join([
    'Top 1: {:.2f}% '.format(res[0] * 100),
    'Top 5: {:.2f}% '.format(res[1] * 100),
    'Top 10: {:.2f}% '.format(res[2] * 100),
    'Top 20: {:.2f}% '.format(res[3] * 100),
]))


sparse_weight: 9.05016


In [None]:
# Write the current weights to Drive as 'biosyn_tac2017_epoch_1'.
file_path = ''.join(['/content/drive/My Drive/', 'biosyn_tac2017_epoch_1'])
model.save_weights(file_path)

In [None]:
# TRAINING:
# Ten rounds of: rebuild the candidate set with the current bert_layer, fit for
# two epochs, then report retrieval accuracy on the test queries.

# Inputs that take no model-dependent argument are built once, before the loop.
# NOTE(review): this assumes bert_encode and get_sparse_query_score are
# deterministic for fixed inputs -- confirm against their definitions.
training_query_tokens = bert_encode(training_query.data[:,0], tokenizer, max_len=25)
sparse_score_matrix = get_sparse_query_score(sparse_encoder, training_dictionary.data[:,0], training_query.data[:,0])

for i in range(10):
  # training data: training_query_token, dense_candidate_embeddings, sparse_candidate_score
  # The candidate indices and their dense embeddings are passed bert_layer, so
  # they are recomputed on every round.
  training_candidates_indices = get_query_candidates_indices(training_query.data[:,0], training_dictionary.data[:,0], sparse_encoder, bert_layer, topk=20)
  dense_candidate_embeddings = get_dense_candidate_embeddings(training_candidates_indices, training_dictionary, bert_layer)
  sparse_candidate_score = get_sparse_candidate_score(training_candidates_indices, sparse_score_matrix)

  # labels:
  label_candidates = get_labels_of_candidates(training_query.data[:,1], training_candidates_indices, training_dictionary)
  label_candidates = np.array(label_candidates)

  training_data = [[training_query_tokens], dense_candidate_embeddings, sparse_candidate_score]
  model.fit(x=training_data, y=label_candidates, batch_size=16, epochs=2, verbose=1)

  # Evaluate:
  sparse_weight = model.get_layer(name='sparse_weight').get_weights()
  print('sparse_weight:', sparse_weight[0])
  # Pass the query-string column (column 0), as every other retreival call in
  # this notebook does, rather than the whole test_query object.
  candidates_test_indices = retreival(test_query.data[:,0], test_dictionary, bert_layer, sparse_weight[0])
  res = evaluate(test_query.data[:,1], candidates_test_indices, test_dictionary)
  print('Top 1: {:.2f}% '.format(res[0]*100) + 'Top 5: {:.2f}% '.format(res[1]*100) + 'Top 10: {:.2f}% '.format(res[2]*100) + 'Top 20: {:.2f}% '.format(res[3]*100))


In [None]:
# Save the model with Model.save to the Drive path 'biosyn'.
model.save(filepath='/content/drive/My Drive/biosyn')

In [None]:
# Write only the weights to 'biosyn_weight.h5' on Drive.
model.save_weights(filepath='/content/drive/My Drive/biosyn_weight.h5')

In [None]:
# Restore the weights from 'biosyn_weight.h5' on Drive.
model.load_weights(filepath='/content/drive/My Drive/biosyn_weight.h5')

In [None]:
# Rebind bert_layer to the layer named 'bert_layer' inside the model.
bert_layer = model.get_layer('bert_layer')

## QUERY


In [None]:
# Ad-hoc lookup: retrieve candidates for a single hand-written query string
# and print the dictionary names (column 0) at the returned indices.
# evaluate() is not called for this query.
sparse_weight = model.get_layer(name='sparse_weight').get_weights()
print('sparse_weight:', sparse_weight[0])
candidates_test_indices = retreival(
    ['ataxia inclu'], test_dictionary, bert_layer, sparse_weight[0]
)
print(test_dictionary.data[candidates_test_indices, 0])

In [None]:
# For every test query, list its retrieved candidates by dictionary name;
# a candidate whose label equals 1 is prefixed with '*'.
label_candidates = get_labels_of_candidates(
    test_query.data[:, 1], candidates_test_indices, test_dictionary
)
for idx, query in enumerate(test_query.data[:, 0]):
  print(query + ':')
  for i, candidate in enumerate(candidates_test_indices[idx]):
    print(('\t*' if label_candidates[idx][i] == 1 else '\t') + test_dictionary.data[candidate, 0])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
	infections congenital
	viral infections
febrile neutropenia:
	*febrile neutropenia
	neutropenia
	neutropenias
	neutropenic
	acute febrile neutrophilic dermatosis
	late onset neutropenia
	congenital neutropenia
	neutropenia neonatal
	chronic neutropenia
	transient neonatal neutropenia
	worsening of neutropenia
	idiopathic neutropenia
	benign ethnic neutropenia
	*neutropenic fever
	cyclic neutropenia
	neutropenia malignant
	neutropenic enterocolitis
	febrile aplasia
	autoimmune neutropenia
	penile necrosis
neutrophil count nadirs:
	neutrophil count
	neutrophil count normal
	band neutrophil count
	neutrophil count abnormal
	neutrophil count abnormal nos
	csf neutrophil count
	*neutrophil count low
	segmented neutrophil count
	absolute neutrophil count
	neutrophil count high
	csf neutrophil count negative
	csf neutrophil count positive
	neutrophil count csf positive
	absolute neutrophil count abnormal
	peritoneal effluent ne