<a href="https://colab.research.google.com/github/dinhngoc267/NSEEN/blob/master/TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive (Colab only); the dataset paths used further down live
# under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import keras

import os 

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

import tensorflow as tf 
import tensorflow_hub as hub 
import tensorflow_datasets as tfds 

import glob
from tqdm import tqdm

import pickle 
from sklearn.feature_extraction.text import TfidfVectorizer 

from keras.layers import Layer
import keras.backend as K
import copy

import xml.etree.ElementTree as elemTree

In [None]:
# Define func to encode raw_text to token
# vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
# do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

def bert_encode(texts, tokenizer, max_len=25):
    """Turn raw strings into fixed-length BERT-style input arrays.

    Each text is tokenized, cut to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with id 0 up to ``max_len``.

    Parameters
    ----------
    texts: iterable of str
    tokenizer: object providing ``tokenize(text)`` (returning a list) and
      ``convert_tokens_to_ids(tokens)`` (returning a list)
    max_len: total length of every encoded row

    Returns
    -------
    (token_ids, masks, segment_ids): three np.array, one row per text.
      The mask is 1 on real tokens and 0 on padding; segment ids are all 0.
    """
    id_rows, mask_rows, segment_rows = [], [], []
    body_budget = max_len - 2  # room left once [CLS] and [SEP] are added

    for raw in texts:
        wordpieces = tokenizer.tokenize(raw)[:body_budget]
        sequence = ["[CLS]"] + wordpieces + ["[SEP]"]
        used = len(sequence)
        padding = [0] * (max_len - used)

        id_rows.append(tokenizer.convert_tokens_to_ids(sequence) + padding)
        mask_rows.append([1] * used + padding)
        segment_rows.append([0] * max_len)

    return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)

class DenseEncoder():
  """Keras model around a BERT layer that outputs the first-position vector.

  ``bert_layer`` is called with ``[word_ids, mask, segment_ids]`` and must
  return a ``(pooled_output, sequence_output)`` pair; position 0 of
  ``sequence_output`` (the [CLS] slot) is used as the embedding.
  """

  def __init__(self, bert_layer, max_len=25):
    def _int_input(name):
      # All three BERT inputs share the same shape and dtype.
      return tf.keras.Input(shape=(max_len,), dtype=tf.int32, name=name)

    word_ids_in = _int_input("input_word_ids")
    mask_in = _int_input("input_mask")
    segments_in = _int_input("segment_ids")
    model_inputs = [word_ids_in, mask_in, segments_in]

    _, token_states = bert_layer(model_inputs)
    first_position = token_states[:, 0, :]

    self.model = tf.keras.models.Model(inputs=model_inputs, outputs=first_position)

  def get_model(self):
    """Return the wrapped Keras model."""
    return self.model

  def get_dense_embedding(self, input):
    """Run ``predict`` on ``[input]`` and return the result.

    ``input`` is whatever ``bert_encode`` produced for the texts to embed
    (that is how the callers in this file use it).
    """
    wrapped = [input]
    return self.model.predict(wrapped)

class SparseEncoder(object):
  """Character uni/bi-gram TF-IDF encoder (thin wrapper over TfidfVectorizer)."""

  def __init__(self):
    self.encoder = TfidfVectorizer(analyzer='char', ngram_range=(1,2))

  def fit(self, train_corpus):
    """Fit the vectorizer on ``train_corpus`` and return ``self`` for chaining."""
    self.encoder.fit(train_corpus)
    return self

  def transform(self, mentions):
    """Encode ``mentions`` and return the result of ``.toarray()`` (dense)."""
    encoded = self.encoder.transform(mentions)
    return encoded.toarray()

  def __call__(self, mentions):
    """Calling the encoder is the same as calling ``transform``."""
    return self.transform(mentions)


class QueryDataset():
  """Mentions and their gold CUIs read from every ``*.concept`` file in a directory."""

  def __init__(self, data_dir):
    self.data = self.load_data(data_dir = data_dir)

  def load_data(self, data_dir):
    """
    Read every ``*.concept`` file under ``data_dir``.

    Each non-blank line is split on ``||``; field 3 is the mention and
    field 4 is the CUI (both stripped of surrounding whitespace).

    Parameters
    ----------
    data_dir: a path of data

    Returns
    -------
    data: np.array[(mention, CUI)]

    Raises
    ------
    IndexError: if a non-blank line has fewer than five ``||``-separated fields
    """
    data = []
    # Sorted so the row order is reproducible; glob's own order depends on
    # the filesystem. process_TAC sorts its inputs the same way.
    concept_files = sorted(glob.glob(os.path.join(data_dir,"*.concept")))
    for concept_file in tqdm(concept_files):
      with open(concept_file, "r", encoding="utf-8") as f:
        for line in f:
          # A blank line (e.g. a trailing newline at end of file) has no
          # fields and would otherwise raise IndexError below.
          if not line.strip():
            continue
          fields = line.split("||")
          mention = fields[3].strip()
          cui = fields[4].strip()
          data.append((mention,cui))

    return np.array(data)

class DictionaryDataset():
  """
  Dictionary of (name, CUI) pairs loaded from a ``CUI||name`` text file.
  """

  def __init__(self, dictionary_path):
    """
    Parameters
    ----------
    dictionary_path: str
      The path of the dictionary
    """
    self.data = self.load_data(dictionary_path)

  def load_data(self, dictionary_path):
    """
    Parse the dictionary file; blank lines are ignored.

    Returns
    -------
    np.array[(name, CUI)]: column 0 is the name, column 1 the CUI.

    Raises
    ------
    ValueError: if a non-blank line does not split into exactly two
      ``||``-separated parts.
    """
    pairs = []
    with open(dictionary_path, mode='r', encoding='utf-8') as handle:
      raw_lines = handle.readlines()
      for raw in tqdm(raw_lines):
        entry = raw.strip()
        if not entry:
          continue
        cui, name = entry.split("||")
        pairs.append((name, cui))

    return np.array(pairs)

class Scalar(Layer):
  """Keras layer that multiplies its input by a single scalar variable ``W``."""
  def __init__(self,name=None):
    super(Scalar, self).__init__(name=name)
  def build(self,input_shape) :
    # The scalar starts at 0.
    self.W = K.variable(0)
    # Registered by hand in the layer's trainable-weight list
    # (presumably so the optimizer updates it - confirm against the Keras version in use).
    self._trainable_weights=[self.W]
    super().build(input_shape)
  def call(self,inputs):
    # Element-wise scaling of the inputs by W.
    return self.W*inputs

class MedDRAPreprocess():
    """
    Make training dictionary pair

    Merges the MedDRA hlgt / hlt / pt / llt files into one ``id -> names``
    mapping, where the names of one id are joined with ``|``.
    """

    def __init__(self, hlgt_path, hlt_path, pt_path, llt_path):
        self.hlgt_path = hlgt_path
        self.hlt_path = hlt_path
        self.pt_path = pt_path
        self.llt_path = llt_path

    @staticmethod
    def _read_records(path):
        """Yield the ``$``-separated fields of every non-blank line of ``path``."""
        with open(path, "r") as f:
            lines = f.readlines()
        for line in tqdm(lines):
            # A blank line has no id/name fields and would raise IndexError
            # in the caller, so skip it here.
            if not line.strip():
                continue
            yield line.split("$")

    def load_dictionary(self):
        """
        Build the merged dictionary.

        ! hlgt, hlt, pt => need to extract id and name
        format id$name$$$$$$$$
        ! llt => need to extract pt_id and name
        format id$name$pt_id$$$$$$$

        Returns
        -------
        list of [id, names]: ``names`` is a ``|``-joined string. hlgt/hlt/pt
        names are appended as they are read. Each llt name is filed under its
        pt_id, and the names of that id are de-duplicated at that point,
        keeping first-seen order.
        """
        dictionary = {}

        # hlgt, hlt and pt share one layout: field 0 is the id, field 1 the name.
        for path in (self.hlgt_path, self.hlt_path, self.pt_path):
            for fields in self._read_records(path):
                _id = fields[0]
                _name = fields[1]
                if _id in dictionary:
                    dictionary[_id] = dictionary[_id] + "|" + _name
                else:
                    dictionary[_id] = _name

        # llt: the name is filed under the pt_id held in field 2.
        for fields in self._read_records(self.llt_path):
            _id = fields[2]
            _name = fields[1]
            if _id in dictionary:
                names = dictionary[_id].split("|")
                names.append(_name)
                # dict.fromkeys removes duplicates but, unlike set(), keeps
                # insertion order, so the output is identical from run to run.
                dictionary[_id] = "|".join(dict.fromkeys(names))
            else:
                dictionary[_id] = _name

        return [[k, v] for k, v in dictionary.items()]

    def make_ID_mention_map(self, out):
        """Write the merged dictionary to ``out``, one ``id||names`` row per line."""
        dictionary = self.load_dictionary()
        # UTF-8 to match DictionaryDataset.load_data, which reads the
        # dictionary file with encoding='utf-8'.
        with open(out, 'w', encoding='utf-8') as outfile:
            for row in dictionary:
                outfile.write("||".join(row))
                outfile.write('\n')

In [None]:
def parse_xml(file):
    """Extract the text and the (id, mention) pairs from one label XML file.

    Parameters
    ----------
    file: path or file object accepted by ``ElementTree.parse``

    Returns
    -------
    text: str
      Concatenation of the text of every ``./Text/Section`` element.
    id_name: list of (ids, mention)
      One entry per ``./Reactions/Reaction`` element. ``mention`` is the
      lower-cased ``str`` attribute; ``ids`` is the ``|``-joined
      ``meddra_pt_id`` of its ``Normalization`` children, or ``'-1'`` when
      there is none.

    Raises
    ------
    ValueError: if two reactions share the same (lower-cased) mention.
    """
    root = elemTree.parse(file).getroot()

    # An empty element such as <Section/> has .text == None; treat it as ''.
    text = "".join(section.text or "" for section in root.findall('./Text/Section'))

    # reaction nodes have all mention and id.
    mention2id = {}
    for reaction in root.findall('./Reactions/Reaction'):
        mention = reaction.attrib['str'].lower()
        cuis = '|'.join(
            normalization.attrib['meddra_pt_id']
            for normalization in reaction.findall('Normalization')
            if 'meddra_pt_id' in normalization.attrib
        )
        if mention in mention2id:
            # Report the id that is already stored for this mention.
            raise ValueError('mention({}) already have id({}) in mention2id dictionary.'.format(mention, mention2id[mention]))
        mention2id[mention] = cuis

    # Mentions without any id get the placeholder '-1'.
    id_name = [(cuis if cuis else '-1', mention) for mention, cuis in mention2id.items()]

    return text, id_name

def process_TAC(input_dir, output_dir):
    """Convert every ``*.xml`` file of ``input_dir`` into a ``.txt`` / ``.concept`` pair.

    For each input file, the lower-cased text is written to ``<name>.txt``
    and one ``-1||-1|-1||-1||mention||cui`` line per mention is written to
    ``<name>.concept`` in ``output_dir``. ``<name>`` is the part of the file
    name before the first dot. A summary line
    ``output_dir num_docs num_queries`` is printed at the end.

    Parameters
    ----------
    input_dir: directory holding the XML files
    output_dir: destination directory, created if missing
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    input_files = sorted(glob.glob(os.path.join(input_dir, "*.xml")))
    num_docs = 0
    num_queries = 0
    for input_file in input_files:
        document_name = os.path.basename(input_file).split(".")[0]
        txtname = document_name + '.txt'
        conceptname = document_name + '.concept'

        text, id_names = parse_xml(input_file)

        # Both outputs are written as UTF-8 explicitly: QueryDataset.load_data
        # reads the .concept files with encoding="utf-8", and the platform
        # default encoding is not guaranteed to match.

        # save text
        with open(os.path.join(output_dir, txtname), 'w', encoding='utf-8') as f:
            f.write(text.lower())

        # save entity
        with open(os.path.join(output_dir, conceptname), 'w', encoding='utf-8') as f:
            for cui, mention in id_names:
                f.write("-1||-1|-1||-1||{}||{}".format(mention, cui))
                f.write("\n")
                num_queries += 1
        num_docs += 1

    print("{} {} {}".format(output_dir, num_docs,num_queries))


In [None]:
# Load the TAC2017 train/test queries and the MedDRA dictionary.
# Note: the training and test dictionaries are read from the same file
# (meddra18.1.txt), so it is parsed twice into two separate objects.
training_query = QueryDataset('/content/drive/My Drive/TAC2017/train')
training_dictionary = DictionaryDataset('/content/drive/My Drive/MedDRA/dictionary/meddra18.1.txt')
test_query = QueryDataset('/content/drive/My Drive/TAC2017/test')
test_dictionary = DictionaryDataset('/content/drive/My Drive/MedDRA/dictionary/meddra18.1.txt')

100%|██████████| 80/80 [00:21<00:00,  3.68it/s]
100%|██████████| 23668/23668 [00:00<00:00, 511251.81it/s]
100%|██████████| 99/99 [00:27<00:00,  3.60it/s]
100%|██████████| 23668/23668 [00:00<00:00, 770060.33it/s]


In [None]:
def find_min_idx(array):
  """
  Returns: the index of the smallest item of ``array``.

  On ties the first occurrence wins; an empty ``array`` yields 0.
  """
  best_idx = 0

  for pos, value in enumerate(array):
    # Strict '<' keeps the earliest position among equal minima.
    if value < array[best_idx]:
      best_idx = pos

  return best_idx

def find_topk_candidate(array, topk):
  """
  Returns: np.array with the indices of the ``topk`` largest items of ``array``.

  The first ``topk`` positions seed the selection; each later item replaces
  the currently weakest kept one when it is strictly larger. The result is
  not sorted by value. ``array`` must support indexing with a list of
  positions (as a NumPy array does).
  """
  kept = []

  for pos, value in enumerate(array):
    if len(kept) < topk:
      kept.append(pos)
      continue

    weakest_slot = find_min_idx(array[kept])
    if value > array[kept[weakest_slot]]:
      kept[weakest_slot] = pos

  return np.array(kept)

def sort_candidate_descening(array_value, array_idx):
  """
  Selection-sort ``array_value`` into descending order, in place, applying
  the same swaps to ``array_idx``.

  Returns: ``array_idx`` (the same object, now reordered).
  """
  size = len(array_value)

  for head in range(size - 1):
    # Position of the largest remaining value; strict '>' keeps the first
    # one among equals.
    largest = head
    for probe in range(head + 1, size):
      if array_value[probe] > array_value[largest]:
        largest = probe

    if largest == head:
      continue

    array_value[head], array_value[largest] = array_value[largest], array_value[head]
    array_idx[head], array_idx[largest] = array_idx[largest], array_idx[head]

  return array_idx

In [None]:
# Shared TF-IDF encoder instance; it is fitted later, inside the functions it is passed to.
sparse_encoder = SparseEncoder()

In [None]:
# define function to get sparse candidates and dense candidates indices of query

def get_sparse_candidate_indices(query_text, corpus, sparse_encoder, topk=20):
  """
  Rank dictionary entries for every query by TF-IDF dot-product score.

  Note that ``sparse_encoder`` is (re)fitted on ``corpus`` by this call.

  Parameters:
    query_text: list of query string
    corpus: list of text in dictionary
    sparse_encoder: encoder exposing fit(corpus) and transform(texts)
    topk: number of candidates kept per query

  Returns:
    np.ndarray: indices into ``corpus`` of the ``topk`` best candidates of
    each query, best first. Shape: (len(query), topk)
  """
  sparse_encoder.fit(corpus)
  query_vecs = sparse_encoder.transform(query_text)
  dictionary_vecs = sparse_encoder.transform(corpus)

  score_matrix = np.dot(query_vecs, dictionary_vecs.transpose())

  ranked_rows = []
  for row_scores in score_matrix:
    # argpartition finds the top-k set cheaply but leaves it unordered ...
    top_unsorted = np.argpartition(row_scores, -topk)[-topk:]
    # ... so the k scores are copied and used to order the indices.
    top_scores = copy.deepcopy(row_scores[top_unsorted])
    ranked_rows.append(sort_candidate_descening(top_scores, top_unsorted))

  return np.array(ranked_rows)

def get_dense_candidate_indices(query_text, corpus, bert_layer,topk=20):
  """
  Rank dictionary entries for every query by dense (BERT) dot-product score.

  Relies on a module-level ``tokenizer`` being defined.

  Parameters:
    query_text: list of query string
    corpus: list of text in dictionary
    bert_layer: bert-pre-trained layer handed to DenseEncoder
    topk: number of candidates kept per query

  Returns:
    np.ndarray: indices into ``corpus`` of the ``topk`` best candidates of
    each query, best first. Shape: (len(query), topk)
  """
  query_inputs = bert_encode(query_text, tokenizer, max_len=25)
  dictionary_inputs = bert_encode(corpus, tokenizer, max_len=25)

  encoder = DenseEncoder(bert_layer, max_len=25)
  query_vecs = encoder.get_dense_embedding(query_inputs)
  dictionary_vecs = encoder.get_dense_embedding(dictionary_inputs)

  score_matrix = np.dot(query_vecs, tf.transpose(dictionary_vecs, perm=[1,0]))

  ranked_rows = []
  for row_scores in score_matrix:
    # Unordered top-k set first, then order it by score.
    top_unsorted = np.argpartition(row_scores, -topk)[-topk:]
    top_scores = copy.deepcopy(row_scores[top_unsorted])
    ranked_rows.append(sort_candidate_descening(top_scores, top_unsorted))

  return np.array(ranked_rows)

def get_query_candidates_indices(query_text, corpus, sparse_encoder, bert_layer, topk=20):
  """
  Build a mixed sparse + dense candidate list for every query.

  Each row starts with the best sparse candidates (up to 10), continues with
  dense candidates that are not in the row yet, and, if the dense list runs
  out, is completed with the next-best sparse candidates. Every row is
  therefore fully populated and free of duplicates.

  Parameters:
    query_text: list of query string
    corpus: list of text in dictionary
    sparse_encoder: sparse encoder (fitted on ``corpus`` by the sparse retrieval step)
    bert_layer: bert-pre-trained layer used for the dense scores
    topk: number of candidates of a query

  Returns:
    np.ndarray: matrix of indices of candidate in dictionary base on dense and
    sparse score. Shape: (len(query), topk). The dtype is float64, as before;
    callers in this file convert rows with ``astype(int)``.
  """
  n_sparse = min(10, topk)  # slots reserved for sparse candidates

  sparse_candidate_indices = get_sparse_candidate_indices(query_text, corpus, sparse_encoder, topk)
  # At least 30 dense candidates (the previous fixed value), and never fewer
  # than topk, so the merge below cannot run past the end of a dense row.
  dense_candidate_indices = get_dense_candidate_indices(query_text, corpus, bert_layer, topk=max(30, topk))

  candidates_indices = np.empty((sparse_candidate_indices.shape[0], topk))
  for idx, sparse_row in enumerate(sparse_candidate_indices):
    merged = [int(candidate) for candidate in sparse_row[:n_sparse]]
    seen = set(merged)

    # Dense candidates have priority; the remaining sparse ones act as a
    # fallback. A sparse row holds topk distinct indices, so the fallback
    # always suffices to fill the row.
    pool = list(dense_candidate_indices[idx]) + list(sparse_row[n_sparse:])
    for candidate in pool:
      if len(merged) == topk:
        break
      candidate = int(candidate)
      if candidate not in seen:
        seen.add(candidate)
        merged.append(candidate)

    candidates_indices[idx, :len(merged)] = merged

  return candidates_indices

def get_dense_candidate_embeddings(candidate_indices, dictionary, bert_layer):
  """
  Embed the dictionary names selected by ``candidate_indices``.

  Relies on a module-level ``tokenizer`` being defined.

  Parameters:
    candidate_indices: matrix of candidate indices (one row per query)
    dictionary: object whose ``data[:, 0]`` column holds the names
    bert_layer: bert-pretrained packaged in layer

  Returns:
    np.ndarray: one block of dense embeddings per query, stacked in query order.
  """
  # 1) look up the raw names of every candidate row
  candidates_raw_text = np.array([
    dictionary.data[row.astype(int).tolist(),0] for row in candidate_indices
  ])

  # 2) tokenize each row of names
  candidate_tokens = [
    bert_encode(names, tokenizer,max_len=25) for names in candidates_raw_text
  ]

  # 3) embed row by row with a single encoder
  dense_encoder = DenseEncoder(bert_layer, max_len=25)
  candidate_embeddings = [
    dense_encoder.get_dense_embedding(tokens) for tokens in candidate_tokens
  ]

  return np.array(candidate_embeddings)

#candidate_embeddings = get_query_candidates_embeddings(candidates_indices, training_dictionary, bert_layer)
def get_sparse_candidate_score(candidates_indices, sparse_score_matix):
  """
  Pick, for every query, the sparse scores of its candidates.

  Parameters:
    candidates_indices: matrix of candidate indices, one row per query
      (rows are cast to int before indexing)
    sparse_score_matix: sparse score of all queries against the dictionary,
      one row per query

  Returns:
    np.ndarray: entry [q, j] is ``sparse_score_matix[q][candidates_indices[q][j]]``.
  """
  per_query_scores = [
    sparse_score_matix[query_pos][row.astype(int).tolist()]
    for query_pos, row in enumerate(candidates_indices)
  ]

  return np.array(per_query_scores)

def get_labels_of_candidates(true_labels, candidates_indices, dictionary):
  """
  Mark which candidates carry a gold label.

  Parameters:
    true_labels: gold label of every query; either a ``|``-joined string of
      CUIs or a collection of CUIs
    candidates_indices: matrix of candidate indices, one row per query
    dictionary: object whose ``data[:, 1]`` column holds ``|``-joined CUIs

  Returns:
    list of list of int: entry [q][j] is 1 when candidate j of query q shares
    at least one CUI with the gold label of q, else 0.
  """
  label_candidates = []
  for idx, row in enumerate(candidates_indices):
    gold = true_labels[idx]
    # Compare whole CUIs. A plain ``in`` on the gold string is a substring
    # test, so e.g. "10" would wrongly match the gold label "100".
    gold_cuis = set(gold.split('|')) if isinstance(gold, str) else set(gold)

    predicted = dictionary.data[row.astype(int).tolist(),1]
    row_label = [
      1 if gold_cuis.intersection(cuis.split('|')) else 0
      for cuis in predicted
    ]
    label_candidates.append(row_label)

  return label_candidates

def get_sparse_query_score(sparse_encoder, corpus, query_text):
  """
  Score every query against every dictionary entry with the sparse encoder.

  ``sparse_encoder`` is fitted on ``corpus`` by this call. The result is the
  dot product of the encoded queries with the transposed encoded corpus,
  i.e. one row per query and one column per corpus entry.
  """
  sparse_encoder.fit(corpus)

  query_vecs = sparse_encoder.transform(query_text)
  dictionary_vecs = sparse_encoder.transform(corpus)

  return np.dot(query_vecs, dictionary_vecs.transpose())

In [None]:
# Define model ReRanker
def marginal_loss(output, target):
  """Marginal negative log-likelihood over the positive candidates of each row.

  A softmax is taken over ``output``, multiplied element-wise by ``target``
  (presumably a 0/1 mask of positive candidates - confirm) and summed along
  the last axis. Rows whose sum is 0 are dropped; the loss is the mean of
  ``-log(sum)`` over the remaining rows, or 0 when no row remains.

  NOTE(review): build_model passes this function as ``loss=`` to
  ``model.compile``. Keras documents the loss call as ``fn(y_true, y_pred)``,
  in which case ``output`` would receive the labels and ``target`` the
  scores - confirm the argument order is what is intended.
  NOTE(review): boolean-mask indexing and ``len(loss)`` presumably require
  eager execution - confirm.
  """
  predict = tf.nn.softmax(tf.cast(output, dtype=tf.float32))
  loss = predict*target
  loss = K.sum(loss,axis=-1)                  # sum all positive scores

  loss = loss[loss > 0]                     # filter sets with at least one positives
  #loss = K.clip(loss, min_value=1e-9, max_value=1) # for numerical stability
  loss = -K.log(loss)                   # for negative log likelihood
  if len(loss) == 0:
      loss = K.sum(loss)                     # will return zero loss
  else:
      loss = K.mean(loss)
  return loss


def build_model(bert_layer, topk=20, max_len=25):  
  """Build and compile the re-ranking model.

  Inputs of the returned model:
    [query_word_ids, query_mask, query_segment_ids]: BERT inputs of the query,
      each of shape (max_len,)
    candidates_dense_embeddings: shape (topk, 768), float32
    candidate_sparse_score: sparse scores of the candidates

  Output: the sparse score scaled by the ``sparse_weight`` Scalar layer plus
  the dot product between the candidate embeddings and the query embedding
  (position 0 of the BERT sequence output).

  The model is compiled with ``marginal_loss`` and an ``AdamLRM`` optimizer.

  NOTE(review): ``AdamLRM`` is not defined or imported in the visible part of
  this file - confirm where it comes from.
  """
  query_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="query_word_ids")
  query_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="query_mask")
  query_segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="query_segment_ids")

  pooled_output, sequence_output = bert_layer([query_word_ids, query_mask, query_segment_ids])
  query_dense_embeddings = sequence_output[:, 0, :] # [None, 768]

  # score: 
  candidates_dense_embeddings = tf.keras.Input(shape=(topk,768,), dtype=tf.float32, name="candidates_dense_embeddings")
  # Dot over the embedding axis of the candidates (axis 2) and of the query (axis 1).
  candidates_dense_score = keras.layers.Dot(axes=(2,1),name="dense_score")([candidates_dense_embeddings,query_dense_embeddings])
  
  # NOTE(review): shape[0] of a symbolic Keras tensor is presumably the batch
  # dimension (None), not topk, so the sparse-score input would get shape
  # (None,) - confirm whether shape=(topk,) was intended.
  batch_size = candidates_dense_score.shape[0] 
  candidate_sparse_score = tf.keras.Input(shape=(batch_size,), dtype=tf.float32, name="candidate_sparse_score")
  scaling_sparse_score = Scalar(name='sparse_weight')(candidate_sparse_score)

  score = scaling_sparse_score + candidates_dense_score

  # Per-layer learning-rate multipliers handed to the optimizer below.
  lr_multiplier = {
    'bert_layer':1, # optimize with a smaller learning rate
    'sparse_weight':0.5e+3   # optimize  with a larger learning rate
    }
    
  opt = AdamLRM(lr=1e-5, lr_multiplier=lr_multiplier)

  model =  tf.keras.models.Model(inputs = [[query_word_ids, query_mask, query_segment_ids],candidates_dense_embeddings, candidate_sparse_score], outputs=score )
  #model.compile(optimizer=tf.keras.optimizers.Adam(lr=1e-5),loss=marginal_loss)
  model.compile(optimizer=opt,loss=marginal_loss)
  return model  

def retreival(query, dictionary,bert_layer, sparse_weight, max_len=25, topk=20):
  """
  Retrieve the ``topk`` dictionary entries of every query using the combined
  score ``sparse_score * sparse_weight + dense_score``.

  A fresh SparseEncoder is fitted on the dictionary names, and a DenseEncoder
  is built from ``bert_layer``. Relies on a module-level ``tokenizer``.

  Parameters:
    query: list of query string
    dictionary: object whose ``data[:, 0]`` column holds the names
    bert_layer: bert-pre-trained layer
    sparse_weight: factor applied to the sparse scores
    max_len: token length used for the BERT inputs
    topk: number of candidates kept per query

  Returns:
    np.ndarray: candidate indices per query, best first. Shape: (len(query), topk)
  """
  names = dictionary.data[:,0]

  # sparse part
  sparse_encoder = SparseEncoder()
  sparse_encoder.fit(names)
  query_sparse_vecs = sparse_encoder.transform(query)
  dictionary_sparse_vecs = sparse_encoder.transform(names)
  query_sparse_score = np.dot(query_sparse_vecs, dictionary_sparse_vecs.transpose())

  # dense part
  query_inputs = bert_encode(query, tokenizer, max_len)
  dictionary_inputs = bert_encode(names, tokenizer, max_len)
  dense_encoder = DenseEncoder(bert_layer, max_len)
  query_dense_vecs = dense_encoder.get_dense_embedding(query_inputs)
  dictionary_dense_vecs = dense_encoder.get_dense_embedding(dictionary_inputs)
  query_dense_score = np.dot(query_dense_vecs, tf.transpose(dictionary_dense_vecs, perm=[1,0]))

  query_score = query_sparse_score*sparse_weight + query_dense_score

  ranked_rows = []
  for row_scores in query_score:
    # Unordered top-k set first, then order it by score.
    top_unsorted = np.argpartition(row_scores, -topk)[-topk:]
    top_scores = copy.deepcopy(row_scores[top_unsorted])
    ranked_rows.append(sort_candidate_descening(top_scores, top_unsorted))

  return np.array(ranked_rows)

def evaluate(true_labels,candidate_indices, dictionary):
  """
  Compute top-1 / top-5 / top-10 / top-20 accuracy of the candidate lists.

  Parameters:
    true_labels: gold label of every query
    candidate_indices: matrix of candidate indices, best candidate first
    dictionary: dictionary the indices refer to

  Returns:
    list of 4 floats: fraction of queries with a correct candidate within the
    first 1, 5, 10 and 20 positions. All zeros when there is no query.
  """
  label_candidates = get_labels_of_candidates(true_labels,candidate_indices, dictionary)
  n = len(label_candidates)
  if n == 0:
    # Nothing to score; avoid dividing by zero.
    return [0.0, 0.0, 0.0, 0.0]

  cutoffs = (1, 5, 10, 20)
  hits = [0] * len(cutoffs)
  for row in label_candidates:
    for slot, cutoff in enumerate(cutoffs):
      if 1 in row[:cutoff]:
        hits[slot] += 1

  return [hit_count / n for hit_count in hits]
#candidates_test_indices = retreival(training_query, training_dictionary, bert_layer, sparse_weight[0])

In [None]:
# Build the re-ranking model.
# NOTE(review): `bert_layer` is not defined in the visible cells - presumably
# loaded elsewhere (tensorflow_hub is imported above); confirm.
model = build_model(bert_layer,topk=20,max_len=25)

In [None]:
# Top-20 TF-IDF candidates for every test mention; this call also fits
# sparse_encoder on the test dictionary names.
sparse_candidate_indices = get_sparse_candidate_indices(test_query.data[:,0],test_dictionary.data[:,0],sparse_encoder,topk=20)

In [None]:
# Top-k accuracy of the sparse (TF-IDF) candidates on the test set.
label_candidates = np.array(
  get_labels_of_candidates(test_query.data[:,1],sparse_candidate_indices, test_dictionary)
)
n = len(label_candidates)

top_1 = top_5 = top_10 = top_20 = 0
for row in label_candidates:
  # int(True) == 1, so each counter goes up only for rows with a hit in range.
  top_1 += int(row[0] == 1)
  top_5 += int(1 in row[0:5])
  top_10 += int(1 in row[0:10])
  top_20 += int(1 in row[0:20])

#res = evaluate(training_query.data[:,1],sparse_candidate_indices, training_dictionary)
#print('Top 1: {:.2f}% '.format(res[0]*100) + 'Top 5: {:.2f}% '.format(res[1]*100) + 'Top 10: {:.2f}% '.format(res[2]*100) + 'Top 20: {:.2f}% '.format(res[3]*100))
print('Top 1: {:.2f}% '.format(top_1/n *100) + 'Top 5: {:.2f}% '.format(top_5/n *100) + 'Top 10: {:.2f}% '.format(top_10/n *100) + 'Top 20: {:.2f}% '.format(top_20/n *100))


Top 1: 39.35% Top 5: 65.17% Top 10: 71.95% Top 20: 78.29% 
