In [None]:
pip install tensorflow-gpu

Collecting tensorflow-gpu
[?25l  Downloading https://files.pythonhosted.org/packages/f1/aa/ae64be5acaac9055329289e6bfd54c1efa28bfe792f9021cea495fe2b89d/tensorflow_gpu-2.4.0-cp36-cp36m-manylinux2010_x86_64.whl (394.7MB)
[K     |████████████████████████████████| 394.7MB 39kB/s 
Installing collected packages: tensorflow-gpu
Successfully installed tensorflow-gpu-2.4.0


In [None]:
# Mount Google Drive: the corpus files and model checkpoints used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
time: 1.63 ms


In [None]:
pip install -U strsimpy

Requirement already up-to-date: strsimpy in /usr/local/lib/python3.6/dist-packages (0.2.0)
time: 2.9 s


In [None]:
pip install faiss-cpu --no-cache

time: 2.8 s


In [None]:
!pip install ipython-autotime
%load_ext autotime

Collecting ipython-autotime
  Downloading https://files.pythonhosted.org/packages/3f/58/a4a65efcce5c81a67b6893ade862736de355a3a718af5533d30c991831ce/ipython_autotime-0.2.0-py2.py3-none-any.whl
Installing collected packages: ipython-autotime
Successfully installed ipython-autotime-0.2.0
time: 252 µs


In [None]:
import tensorflow as tf
from strsimpy.jaro_winkler import JaroWinkler
import xml.etree.ElementTree as ET
import os
import numpy as np
from itertools import combinations
import random
from keras.preprocessing.sequence import pad_sequences
import copy
import re
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from __future__ import absolute_import
from __future__ import print_function
from keras.models import Sequential
from keras.models import Model
from keras.layers import Input, Flatten, Dense, Dropout, Lambda, Embedding, Bidirectional, LSTM, InputLayer, GRU
from keras.optimizers import RMSprop
from keras import backend as K
from keras.optimizers import Adam
import math
import csv
import faiss
import nltk
import matplotlib.pyplot as plt

time: 3.26 s


In [None]:
def load_reference_dict(data_dir):
  """Load the reference dictionary from a text file.

  Every non-empty line is expected to look like ``<id>: <name><TAB><name>...``:
  the entity id, the separator ``': '``, then the tab-separated variant names.

  Args:
    data_dir: Path of the reference file (despite its name, it is passed
      straight to ``open``).

  Returns:
    dict mapping each id to the list of its variant names. When an id occurs
    on several lines, the last line wins.

  Raises:
    ValueError: If a non-empty line does not contain the ``': '`` separator.
  """
  reference_dict = {}
  # The context manager closes the file even when parsing fails.
  with open(data_dir) as tsv_file:
    read_tsv = csv.reader(tsv_file, delimiter="\n")
    for row in read_tsv:
      # Blank lines come back as empty rows; skip them instead of crashing.
      if not row:
        continue
      # Split on the FIRST ': ' only, so names that themselves contain ': '
      # are kept whole instead of being truncated.
      key, separator, names = row[0].partition(': ')
      if not separator:
        raise ValueError(
            "line %d of %s has no ': ' separator" % (read_tsv.line_num, data_dir))
      reference_dict[key] = names.split('\t')

  return reference_dict

def char_to_int_dict(reference_dict, query_set):
  """Build the character -> integer code table used to encode terms.

  Args:
    reference_dict: Mapping id -> list of variant names (strings).
    query_set: Iterable of query strings whose characters must also be covered.

  Returns:
    dict mapping every distinct character to a code in 1..len(vocab),
    assigned in sorted character order. Code 0 is never assigned.
  """
  pieces = [name for names in reference_dict.values() for name in names]
  pieces.extend(query_set)
  alphabet = sorted(set("".join(pieces)))
  return {char: code for code, char in enumerate(alphabet, start=1)}

def get_maxlen_sequence(reference_dict):
  """Find the longest variant name in the reference dictionary.

  Args:
    reference_dict: Mapping id -> list of variant names.

  Returns:
    Tuple ``(length, name)`` of the first name reaching the maximum length,
    or ``(0, "")`` when there is no non-empty name.
  """
  every_name = (name for names in reference_dict.values() for name in names)
  # max() keeps the first maximal element, like the strict '<' comparison would.
  longest = max(every_name, key=len, default="")
  if len(longest) == 0:
    return (0, "")
  return (len(longest), longest)

def split_character(word):
  """Return the items of `word` (its characters, for a string) as a list."""
  return list(word)

def embedding_dataset(data, maxlen, char_to_int):
  """Encode every pair of terms as two padded integer sequences, in place.

  Args:
    data: Mutable sequence of pairs; elements 0 and 1 of each pair are the
      two terms. Each entry is OVERWRITTEN with the padded encoding.
    maxlen: Length passed to ``pad_sequences``.
    char_to_int: Mapping character -> integer code; an unknown character
      raises KeyError.

  Returns:
    The same `data` object, each entry replaced by the result of
    ``pad_sequences`` for that pair (padding applied at the end).

  NOTE(review): `truncating` is not passed, so the pad_sequences default
  applies to terms longer than `maxlen` -- confirm this matches the
  ``[0:120]`` prefix comparison used in create_sematic_pairs.
  """
  for position, pair in enumerate(data):
    first_codes = [char_to_int[ch] for ch in pair[0]]
    second_codes = [char_to_int[ch] for ch in pair[1]]
    data[position] = pad_sequences([first_codes, second_codes], maxlen, padding='post')
  return data

def create_random_list(list_size, population):
  """Draw `list_size` distinct, still-available indices from `population`.

  A position is available while its entry is non-zero. Drawn positions are
  marked as used by setting their entry to 0 (in place).

  Args:
    list_size: Number of indices to draw; a fractional value is rounded up.
    population: Mutable sequence of markers, non-zero meaning "not drawn yet".

  Returns:
    Tuple ``(sample, population)``: the list of drawn indices and the same
    `population` object with those positions zeroed.

  Raises:
    ValueError: If more indices are requested than are still available
      (the previous rejection-sampling loop never terminated in that case).
  """
  wanted = int(np.ceil(list_size))
  if wanted <= 0:
    return [], population

  available = [position for position, marker in enumerate(population) if marker != 0]
  if wanted > len(available):
    raise ValueError(
        "requested %d indices but only %d are still available" % (wanted, len(available)))

  # One draw without replacement replaces the retry-until-unused loop.
  sample = list(np.random.choice(available, size=wanted, replace=False))
  for position in sample:
    population[position] = 0

  return sample, population

#Prepare 3 sub dataset
#1.Semantic

def create_sematic_pairs(reference_dict, max_len=120):
  """Build positive and negative training pairs from the reference dictionary.

  Positive pairs are all 2-combinations of the names sharing one id; negative
  pairs combine one random name from each of two consecutively sampled,
  different ids. A pair is dropped when both names have the same first
  `max_len` characters.

  Args:
    reference_dict: Mapping id -> list of variant names.
    max_len: Prefix length used for the "same name" check (default 120).

  Returns:
    Tuple ``(pairs, labels)``; positives come first with label ``(1, 1)``,
    negatives follow with label ``(0, 0)``.
  """
  sematic_pairs = []
  labels = []

  # Positive pairs: pairwise combinations of the names that share an id.
  for values in reference_dict.values():
    for pair in combinations(values, 2):
      if pair[0][0:max_len] != pair[1][0:max_len]:
        sematic_pairs.append(pair)
        labels.append((1, 1))
  print("There are ",len(sematic_pairs)," positive pairs in sematic set")

  # Negative pairs: sample ids with replacement, as many times as there are
  # names in the dictionary. This count is computed from the argument itself
  # instead of the notebook-level `reference_list`.
  count = 0
  candidates = [(key, values) for key, values in reference_dict.items() if values]
  n_terms = sum(len(values) for _, values in candidates)
  ls = random.choices(candidates, k=n_terms) if candidates else []

  for (id_1, terms_1), (id_2, terms_2) in zip(ls, ls[1:]):
    if id_1 != id_2:
      term_1 = random.choice(terms_1)
      term_2 = random.choice(terms_2)
      if term_1[0:max_len] != term_2[0:max_len]:
        sematic_pairs.append((term_1, term_2))
        labels.append((0, 0))
        count += 1

  print("There are ",count ," negative pairs in sematic set")
  return sematic_pairs, labels


def create_syntactic_pairs(reference_dict):
  """Build (name, normalised name) pairs labelled with their Jaro-Winkler similarity.

  For every name a variant is derived by replacing '-' with a space,
  lower-casing, and stripping possessive "'s" and remaining apostrophes.
  A pair is kept only when the variant differs from the name and is not
  already one of the names listed under the same id.

  Args:
    reference_dict: Mapping id -> list of variant names.

  Returns:
    Tuple ``(pairs, labels)`` where each label is ``(similarity, 1)``.
  """
  syntactic_pairs = []

  for names in reference_dict.values():
    for name in names:
      variant = name.replace('-', ' ')
      if not name.islower():
        variant = variant.lower()
      # The "'s" test looks at the ORIGINAL name, so an upper-case "'S" only
      # loses its apostrophe in the step below.
      if "'s" in name:
        variant = variant.replace("'s", "")
      variant = variant.replace("'", "")

      if variant != name and variant not in names:
        syntactic_pairs.append((name, variant))

  # Score every surviving pair.
  scorer = JaroWinkler()
  labels = [(scorer.similarity(pair[0], pair[1]), 1) for pair in syntactic_pairs]

  print("There are", len(syntactic_pairs), "syntactic pairs")
  return syntactic_pairs, labels
 

def cosine_distance(vects):
  """Cosine distance (1 - cosine similarity) between two batches of vectors.

  Args:
    vects: Pair ``(x, y)`` of tensors compared along axis 1.

  Returns:
    Tensor holding ``1 - cos(x, y)`` for each row.
  """
  left, right = vects
  left_unit = tf.nn.l2_normalize(left, axis=1)
  right_unit = tf.nn.l2_normalize(right, axis=1)

  # tf.losses.cosine_similarity is a loss, i.e. the NEGATED similarity;
  # flipping the sign recovers the plain cosine similarity.
  similarity = -tf.losses.cosine_similarity(left_unit, right_unit, axis=1)
  return 1 - similarity

def cos_dist_output_shape(shapes):
  """Output shape declared for the distance Lambda layer: (batch, 1).

  Args:
    shapes: Pair of input shapes; only the first entry of the first shape is used.
  """
  first_shape, _ = shapes
  batch_dim = first_shape[0]
  return (batch_dim, 1)

def contrastive_loss(y, d):
  """Contrastive loss with a fixed margin of 1.

  Args:
    y: Labels; 1 weights the squared-distance term, 0 the margin term.
    d: Distances.

  Returns:
    Mean over all elements of ``y * d^2 + (1 - y) * max(margin - d, 0)^2``.
  """
  margin = 1
  pull_together = y * K.square(d)
  push_apart = (1 - y) * K.square(K.maximum(margin - d, 0))
  return K.mean(pull_together + push_apart)

def create_base_network(input_shape):
  """Build the encoder: four stacked bidirectional LSTMs followed by a Dense layer.

  Args:
    input_shape: Shape handed to the first LSTM as ``input_shape``.

  Returns:
    A Sequential model whose last layer is ``Dense(128, activation='relu')``.
  """
  encoder = Sequential()
  encoder.add(Bidirectional(LSTM(64, return_sequences=True, input_shape=input_shape)))
  # Two more sequence-to-sequence layers, then one that returns only the final state.
  for keep_sequences in (True, True, False):
    encoder.add(Bidirectional(LSTM(64, return_sequences=keep_sequences)))
  encoder.add(Dense(128, activation='relu'))
  return encoder

def shuffle_data(data, labels):
  """Shuffle `data` and `labels` along axis 0 with one shared random order.

  Args:
    data: Array indexed along axis 0.
    labels: Array with the same leading length as `data`.

  Returns:
    Tuple ``(data, labels)`` reordered by the same permutation (new arrays).
  """
  order = np.random.permutation(data.shape[0])
  return data[order], labels[order]


#Define Siamese network
def Siamese_network(input_shape, dataset, labels, weights_file, save_dir):
  """Build the Siamese model and either train it or load saved weights.

  Both inputs go through the same encoder (create_base_network); the model
  output is the cosine distance between the two encodings.

  Args:
    input_shape: Shape of one encoded term, passed to ``Input`` and the encoder.
    dataset: Training pairs; ``dataset[:, 0]`` and ``dataset[:, 1]`` are the
      two sides fed to the model.
    labels: Training labels; column 0 is used as the target.
    weights_file: Path of weights to load. An empty string means "train".
    save_dir: Forwarded to myCallback.

  Returns:
    Tuple ``(model, history)``. `history` is the object returned by
    ``model.fit`` when training ran, otherwise None.
  """
  base_network = create_base_network(input_shape)
  input_a = Input(shape=input_shape)
  input_b = Input(shape=input_shape)

  processed_a = base_network(input_a)
  processed_b = base_network(input_b)

  distance = Lambda(cosine_distance, output_shape=cos_dist_output_shape)([processed_a, processed_b])
  model = Model([input_a, input_b], distance)

  history = None
  if weights_file == "":
    model.compile(loss=contrastive_loss, optimizer=Adam(learning_rate=0.0001))

    filepath="/content/drive/My Drive/Colab Notebooks/NSEEN-weights-model-23.07.20-train_siamese_net-4_BiLSTM_layers-{epoch:02d}-{loss:.6f}.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, mode='min')
    callbacks_list = [checkpoint, myCallback(save_dir)]

    # Train on the arguments (not on notebook-level globals) and keep the
    # History object so the caller actually receives it.
    history = model.fit([dataset[:, 0], dataset[:, 1]], labels[:, 0],
                        epochs=5, batch_size=1024, callbacks=callbacks_list)
  else:
    model.load_weights(weights_file)
    model.compile(loss=contrastive_loss, optimizer=Adam(learning_rate=0.0001))
  return model, history
import keras
class myCallback(keras.callbacks.Callback):
  """Report top-k retrieval accuracy on the dev queries after every epoch.

  Reads these notebook-level globals at call time: `reference_vecs` and
  `dev_set` (encoded inputs), `reference_list_idx` (position -> (index, id))
  and `evaluate_dev` (column 1 holds the accepted ids of each query).
  """

  def __init__(self, save_dir):
    # Let the Keras base class initialise its own attributes first.
    super().__init__()
    self.max_acc = 0  # best average accuracy seen so far
    self.save_dir = save_dir

  def on_epoch_end(self, epoch, logs=None):
    """Re-embed the reference and dev sets with the current encoder and print hit counts.

    Prints the number of queries whose correct id appears in the top 1, 3, 5
    and 10 nearest reference terms, then the mean of the four accuracies.
    """
    # Layer 2 of the Siamese model is used as the shared encoder.
    model_base = self.model.layers[2]
    reference_embeddings = model_base.predict(reference_vecs, batch_size=512)
    predicted_dev = model_base.predict(dev_set, batch_size=512)

    index = faiss.IndexFlatIP(128)   # build the index

    # Normalise copies, so the inner product equals cosine similarity.
    prev = copy.deepcopy(predicted_dev)
    reference = copy.deepcopy(reference_embeddings)
    faiss.normalize_L2(prev)
    faiss.normalize_L2(reference)

    index.add(reference)
    # I[i][j] is the position in the reference list of the j-th neighbour of query i.
    D, I = index.search(prev, 10)

    n_queries = len(predicted_dev)
    n_hit_1 = 0; n_hit_3 = 0; n_hit_5 = 0; n_hit_10 = 0

    for i in range(n_queries):
      gold_ids = evaluate_dev[i, 1]
      # Rank of the first neighbour carrying an accepted id, if any.
      first_hit = next(
          (rank for rank in range(10)
           if str(reference_list_idx[I[i][rank]][1]) in gold_ids),
          None)
      if first_hit is None:
        continue
      # A hit at rank r counts for every cutoff larger than r.
      n_hit_10 += 1
      if first_hit < 5:
        n_hit_5 += 1
      if first_hit < 3:
        n_hit_3 += 1
      if first_hit < 1:
        n_hit_1 += 1

    print(n_hit_1, n_hit_3, n_hit_5, n_hit_10)
    if n_queries == 0:
      # No queries: report 0 instead of dividing by zero.
      avg_acc = 0.0
    else:
      avg_acc = (float((n_hit_1/n_queries)) + float((n_hit_3/n_queries)) + float((n_hit_5/n_queries)) + float((n_hit_10/n_queries)))/4
    self.max_acc = max(self.max_acc, avg_acc)
    print('Average Accuray:', avg_acc*100,'%')


def flatten_dict(referece_dict):
  """Flatten the reference dictionary into parallel lists.

  Args:
    referece_dict: Mapping id -> list of variant names. (The parameter name
      keeps its historical spelling so existing callers keep working.)

  Returns:
    Tuple ``(reference_list, reference_list_idx)``: all names in dictionary
    order, and for each name the pair ``(position, id)``.
  """
  reference_list = []
  reference_list_idx = []  # (position in reference_list, owning id)
  # Iterate over the ARGUMENT; the old body read the notebook-level
  # `reference_dict` and ignored what was passed in.
  for key, values in referece_dict.items():
    for value in values:
      reference_list_idx.append((len(reference_list), key))
      reference_list.append(value)

  return (reference_list, reference_list_idx)



# Data layout: each entry is (query string, [normalization ids]).
def create_evaluate_data(dir):
  """Load the evaluation queries from a text file.

  Every non-empty line is expected to look like ``<query>: <id><TAB><id>...``.

  Args:
    dir: Path of the query file (passed straight to ``open``).

  Returns:
    List of ``(query, ids)`` tuples in file order.

  Raises:
    ValueError: If a non-empty line does not contain the ``': '`` separator.
  """
  evaluate_data = []

  # The context manager closes the file even when parsing fails.
  with open(dir) as tsv_file:
    read_tsv = csv.reader(tsv_file, delimiter="\n")
    for row in read_tsv:
      # Blank lines come back as empty rows; skip them instead of crashing.
      if not row:
        continue
      # Split on the LAST ': ' so a query that itself contains ': ' is kept
      # whole. Assumes the ids never contain ': ' -- TODO confirm against the
      # query file.
      query, separator, ids = row[0].rpartition(': ')
      if not separator:
        raise ValueError(
            "line %d of %s has no ': ' separator" % (read_tsv.line_num, dir))
      evaluate_data.append((query, ids.split('\t')))
  return evaluate_data

def generator(data, labels, pop, count, max_count, batch_size=1024):
  """Endlessly yield random mini-batches, visiting every sample once per epoch.

  Args:
    data: Array of pairs; ``data[:, 0]`` / ``data[:, 1]`` are the two sides.
    labels: Array of labels; column 0 is yielded as the target.
    pop: Availability markers for create_random_list (non-zero = unused).
    count: Index of the batch to produce first within the epoch.
    max_count: Number of batches per epoch.
    batch_size: Samples per batch; the last batch of an epoch takes the rest.

  Yields:
    ``([left, right], target)`` for each batch.
  """
  total = len(data)
  while True:
    is_last_batch = count == max_count - 1
    if is_last_batch:
      # Remainder of the epoch, computed from the actual batch size.
      n = total - (max_count - 1) * batch_size
    else:
      n = batch_size
    indices, pop = create_random_list(n, pop)

    if is_last_batch:
      # Epoch finished: make every sample of `data` available again.
      pop = [i + 1 for i in range(total)]
      count = -1

    np.random.shuffle(indices)
    samples = data[indices]
    target = labels[indices]
    count += 1
    yield [samples[:,0],samples[:,1]], target[:,0]
  

time: 981 ms


## CHEBI CORPUS

In [None]:
# ===== LOAD THE PREPROCESSED REFERENCE SET AND BUILD THE TRAINING DATA ===========

# Build the reference dictionary (id -> list of variant names) from the
# preprocessed reference file, and load the evaluation queries.
print("Creating reference dictionary...")
reference_dict = load_reference_dict("/content/drive/My Drive/chebi_reference_set.tsv")
print("Size of reference: ", len(reference_dict))
evaluate_dev = create_evaluate_data('/content/drive/My Drive/chebi_query_set.tsv');
evaluate_dev = np.array(evaluate_dev, dtype='object');
# NOTE: dev_set is a view on column 0 of evaluate_dev, so the encoding loop
# further down also overwrites the query strings stored in evaluate_dev.
dev_set = evaluate_dev[:,0]

# Character vocabulary over reference names and queries; n is its size and is
# used below to scale the integer codes.
char_to_int = char_to_int_dict(reference_dict, dev_set)
n = len(char_to_int)
(reference_list, reference_list_idx) = flatten_dict(reference_dict)
reference_vecs = copy.deepcopy(reference_list)

# Build the training pairs (only the semantic set is enabled).
print("Creating training set including: ")
print("1. Creating sematic set...")
(sematic_pairs, sematic_labels) = create_sematic_pairs(reference_dict)
#print("2. Creating syntactic set...")
#(syntactic_pairs, syntactic_labels) = create_syntactic_pairs(reference_dict)
# The character sequences are turned into integer vectors (one code per
# character, via char_to_int) so they can be fed to the model.
print("Concatenating sematic and syntactic sets...")
# NOTE: dataset/labels are aliases of the semantic lists, not copies; the
# syntactic part is commented out.
dataset = sematic_pairs# + syntactic_pairs;
labels = sematic_labels# + syntactic_labels;

print("Embedding the dataset into vec of integers...")
(maxlen, value) = get_maxlen_sequence(reference_dict)

# The measured maximum length is overridden by a fixed sequence length of 120.
maxlen = 120
# Encode every reference name as a list of character codes.
idx = 0;
for value in reference_vecs: 
  reference_vecs[idx] = [char_to_int[c] for c in value];    
  idx +=1;

# Pad at the end to maxlen, scale the codes by the vocabulary size, and add a
# trailing feature axis of size 1.
# NOTE(review): `truncating` is left at the pad_sequences default for names
# longer than maxlen -- confirm this matches the [0:120] prefix check in
# create_sematic_pairs.
reference_vecs = pad_sequences(reference_vecs,maxlen,padding= 'post')
reference_vecs = reference_vecs/n
reference_vecs = np.reshape(reference_vecs,reference_vecs.shape + (1,))

# Build the query set (same encoding as the reference names).

idx = 0;
for value in dev_set: 
  dev_set[idx] = [char_to_int[c] for c in value];    
  idx +=1;
dev_set = pad_sequences(dev_set, maxlen, padding='post')
dev_set = dev_set/n
dev_set = np.reshape(dev_set, dev_set.shape + (1,))

training_set = sematic_pairs #+ syntactic_pairs;
training_labels = sematic_labels #+ syntactic_labels;

# embedding_dataset encodes the pairs in place, so sematic_pairs and dataset
# (same list object) now hold the encoded arrays as well.
training_set = embedding_dataset(training_set, maxlen,char_to_int)
training_set = np.array(training_set, dtype='float32');
training_set = training_set/n
#training_set = training_set / maxlen;
training_labels = np.array(training_labels);
print("Shuffle data...")
(training_set, training_labels) = shuffle_data(training_set, training_labels);

print("Reshape data into the shape of (samples, timestep, features)...")
training_set = np.reshape(training_set, training_set.shape + (1,))

Creating reference dictionary...
Size of reference:  58597
Creating training set including: 
1. Creating sematic set...
There are  938810  positive pairs in sematic set
There are  273792  negative pairs in sematic set
Concatenating sematic and syntactic sets...
Embedding the dataset into vec of integers...
Shuffle data...
Reshape data into the shape of (samples, timestep, features)...
time: 48.9 s


In [None]:
# Number of flattened reference names (one entry per variant name across all ids).
len(reference_list)

273798

time: 3.24 ms


In [None]:
# Train M / Load M
# Build the Siamese model: one shared encoder applied to both inputs, with the
# cosine distance between the two encodings as output.
# input_shape drops the first two axes of training_set (samples and pair side).
input_shape = training_set.shape[2:]
base_network = create_base_network(input_shape);
input_a = Input(shape=input_shape);
input_b = Input(shape=input_shape);

processed_a = base_network(input_a);
processed_b = base_network(input_b);

distance = Lambda(cosine_distance, output_shape=cos_dist_output_shape)([processed_a, processed_b]);
# Later cells and myCallback fetch the shared encoder as model.layers[2].
model = Model([input_a, input_b], distance);

model.compile(loss=contrastive_loss, optimizer=Adam(learning_rate=0.0001))
training_set = K.cast_to_floatx(training_set)
training_labels = K.cast_to_floatx(training_labels)

# A checkpoint is written after every epoch (the file name embeds epoch and
# loss); myCallback prints the retrieval accuracy on the dev queries.
filepath="/content/drive/My Drive/Colab Notebooks/NSEEN-weights-model-05.08-HNM-2-Adam-epoch:{epoch:02d}-{loss:.6f}.hdf5";
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1 , mode='min');
callbacks_list = [checkpoint, myCallback("")];
# Column 0 of the labels is the training target.
model.fit([training_set[:,0], training_set[:,1]],training_labels[:,0], epochs=1,batch_size=1024, callbacks=callbacks_list)


Epoch 00001: saving model to /content/drive/My Drive/Colab Notebooks/NSEEN-weights-model-05.08-HNM-2-Adam-epoch:01-0.157129.hdf5
491 541 546 550
Average Accuray: 41.04938271604938 %


<tensorflow.python.keras.callbacks.History at 0x7f1c9de1d4e0>

time: 7min 21s


In [None]:
# Embed the reference names and the dev queries with the shared encoder, then
# look up the 10 nearest reference names of every query.
model_base = model.layers[2];
reference_embeddings = model_base.predict(reference_vecs, batch_size=512);
predicted_dev = model_base.predict(dev_set,batch_size=512);

index = faiss.IndexFlatIP(128)   # build the index

# Normalise copies so the inner-product index scores by cosine similarity.
prev = copy.deepcopy(predicted_dev);
reference = copy.deepcopy(reference_embeddings);
faiss.normalize_L2(prev)  
faiss.normalize_L2(reference)

index.add(reference)
# D: similarity scores, I: positions in the reference list (10 per query).
D, I = index.search(prev,10);

time: 23.6 s


In [None]:
# Similarity scores of the 10 nearest reference names for every dev query.
# NOTE(review): in the saved output every score is above 0.998, i.e. the
# neighbours are barely separated -- worth checking whether the embeddings
# have collapsed.
D

array([[1.0000001 , 0.9998049 , 0.9994435 , ..., 0.99848306, 0.99846566,
        0.99837995],
       [0.99999994, 0.9995487 , 0.9994616 , ..., 0.9989927 , 0.9987002 ,
        0.9986995 ],
       [1.0000001 , 0.99999833, 0.99999833, ..., 0.999983  , 0.99997956,
        0.99997956],
       ...,
       [0.99999887, 0.99999887, 0.9999898 , ..., 0.99995476, 0.9999503 ,
        0.99994785],
       [0.99999666, 0.99999267, 0.9999875 , ..., 0.9999278 , 0.9999083 ,
        0.9998673 ],
       [0.99997663, 0.9999703 , 0.9999635 , ..., 0.9999386 , 0.9999349 ,
        0.9999341 ]], dtype=float32)

time: 4.45 ms


In [None]:
# Hard negative mining: repeatedly (1) embed all reference names with the
# current encoder, (2) collect pairs of nearest neighbours that belong to
# DIFFERENT ids, (3) add them to the training set as negatives and (4) train
# for 5 more epochs.
n_negative = 270000
n_times = 1
neg_pair = []  # every (i, j) index pair mined so far, across rounds
while n_times < 5:
  print('Hard negative mining ', n_times, ':')

  tmp_neg_pair = []
  # Layer 2 of the Siamese model is used as the shared encoder.
  model_base = model.layers[2]
  print('Embedding reference...')
  reference_embeddings = model_base.predict(reference_vecs, batch_size=1024)
  print('Done!')

  print('Searching...')
  index = faiss.IndexFlatIP(128)   # build the index
  faiss.normalize_L2(reference_embeddings)
  index.add(reference_embeddings)
  # Top 3 neighbours of every reference name among the reference names.
  D, I = index.search(reference_embeddings, 3)
  print('Done')

  for i in range(0, len(reference_embeddings)):
    current_id = reference_list_idx[i][1]
    for j in range(0, 3):
      closest_idx = I[i][j]
      if closest_idx < 0:
        # Negative index: no neighbour was returned for this slot.
        continue

      closest_id = reference_list_idx[closest_idx][1]

      # Keep neighbours with a different id and a different surface string.
      if current_id != closest_id and reference_list[i] != reference_list[closest_idx]:
        # Store each pair with the smaller index first so (a, b) and (b, a) coincide.
        if i < closest_idx:
          tmp_neg_pair.append((i, closest_idx))
        else:
          tmp_neg_pair.append((closest_idx, i))

  # Only pairs that were not mined in an earlier round.
  tmp_neg_pair = set(tmp_neg_pair) - set(neg_pair)
  neg_pair.extend(tmp_neg_pair)

  hard_neg_pairs = []
  hard_neg_labels = []

  for pair in tmp_neg_pair:
    hard_neg_pairs.append((reference_list[pair[0]], reference_list[pair[1]]))
    hard_neg_labels.append((0, 0))
  print('Found ', len(hard_neg_pairs),' hard negative pairs');

  if not hard_neg_pairs:
    # An empty batch cannot be appended to the 4-D training array (np.append
    # would raise), and there is nothing new to learn from: stop mining.
    print('No new hard negative pairs; stopping hard negative mining.')
    break

  n_negative += len(hard_neg_pairs)
  n_times += 1

  print('Concatenating hard negative samples to training samples...')
  # Same encoding as the original training pairs: pad, add feature axis, scale by n.
  hard_neg_pairs = embedding_dataset(hard_neg_pairs, maxlen, char_to_int)
  hard_neg_pairs = np.array(hard_neg_pairs, dtype='float32')
  hard_neg_pairs = np.reshape(hard_neg_pairs, hard_neg_pairs.shape + (1,))
  hard_neg_pairs = hard_neg_pairs/n

  training_set = np.append(training_set, hard_neg_pairs, axis=0)
  hard_neg_labels = np.array(hard_neg_labels)
  training_labels = np.append(training_labels, hard_neg_labels, axis=0)
  print('Done! Training size is: ', training_set.shape[0])

  print("Shuffle data...")
  (training_set, training_labels) = shuffle_data(training_set, training_labels)
  training_set = K.cast_to_floatx(training_set)
  training_labels = K.cast_to_floatx(training_labels)

  # n_times was already incremented, hence the -1 for the round number.
  filepath="/content/drive/My Drive/Colab Notebooks/NSEEN_on_CHEBI_weights_model_HNM_" + str(n_times-1) +"-epoch:{epoch:02d}-{loss:.6f}.hdf5"
  checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1 , mode='min')
  callbacks_list = [checkpoint, myCallback("")]

  model.fit([training_set[:,0],training_set[:,1]], training_labels[:,0], batch_size=512, epochs=5, callbacks=callbacks_list)


Hard negative mining  1 :
Embedding reference...
Done!
Searching...
Done
Found  355383  hard negative pairs
Concatenating hard negative samples to training samples...
Done! Training size is:  1567985
Shuffle data...
Epoch 1/5

Epoch 00001: saving model to /content/drive/My Drive/Colab Notebooks/NSEEN_on_CHEBI_weights_model_HNM_1-epoch:01-0.336919.hdf5
496 538 545 548
Average Accuray: 41.030092592592595 %
Epoch 2/5

Epoch 00002: saving model to /content/drive/My Drive/Colab Notebooks/NSEEN_on_CHEBI_weights_model_HNM_1-epoch:02-0.315992.hdf5
480 536 539 540
Average Accuray: 40.41280864197531 %
Epoch 3/5

Epoch 00003: saving model to /content/drive/My Drive/Colab Notebooks/NSEEN_on_CHEBI_weights_model_HNM_1-epoch:03-0.294493.hdf5
487 540 541 542
Average Accuray: 40.70216049382716 %
Epoch 4/5

Epoch 00004: saving model to /content/drive/My Drive/Colab Notebooks/NSEEN_on_CHEBI_weights_model_HNM_1-epoch:04-0.301558.hdf5
466 513 518 519
Average Accuray: 38.88888888888889 %
Epoch 5/5

Epoch 00

























<br/><br/><br/><br/><br/><br/><br/><br/><br/><br/><br/><br/><br/><br/><br/>

