<a href="https://colab.research.google.com/github/dinhngoc267/NSEEN/blob/master/Processing_XML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
pip install pyjarowinkler



In [0]:
pip install -U strsimpy

Requirement already up-to-date: strsimpy in /usr/local/lib/python3.6/dist-packages (0.1.4)


In [0]:
#Import library
import tensorflow as tf
from pyjarowinkler import distance
from strsimpy.jaro_winkler import JaroWinkler
import xml.etree.ElementTree as ET
import os
import numpy as np
from itertools import combinations
import random
from keras.preprocessing.sequence import pad_sequences
import copy
import re
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from __future__ import absolute_import
from __future__ import print_function
from keras.models import Sequential
from keras.models import Model
from keras.layers import Input, Flatten, Dense, Dropout, Lambda, Embedding, Bidirectional, LSTM, InputLayer
from keras.optimizers import RMSprop
from keras import backend as K

In [0]:
#Parsing XML Data

def create_reference_dict(data_dir):
  trees = []
  #data_dir = '/content/drive/My Drive/Colab Datasets/TAC2017/train_xml'
  for fname in sorted(os.listdir(data_dir)):
    if(fname[-4:] == '.xml'):
      trees.append(ET.parse(os.path.join(data_dir,fname)))

  reference_dict = {}
  for tree in trees:
    root = tree.getroot()
    for reaction in root.iter('Reaction'):
      attribs = reaction.attrib    
      str_name = attribs.get('str')
      child = list(reaction)
      normalization = child[0]
      attribs = normalization.attrib
    
      canonical_name = attribs.get('meddra_pt')
      canonical_id = attribs.get('meddra_pt_id')
      if canonical_id in reference_dict:
        values = reference_dict.get(canonical_id)
        flag = False
        for value in values: 
          if value == str_name:
            flag = True
            break            
        if flag == False:
          values.append(str_name)
      else:
        if canonical_id is not None:
          values = []      
          values.append(canonical_name)
          values.append(str_name)
          reference_dict[canonical_id] = values  
  print(len(reference_dict))
  reference_dict_copy = copy.deepcopy(reference_dict)
  return reference_dict, reference_dict_copy

def char_to_int_dict(reference_dict):
  text = ""
  for key, values in reference_dict.items():
    for value in values:
      text = text + value
  vocab = sorted(set(text))
  return dict((c,i) for i,c in enumerate(vocab))

def get_maxlen_sequence(reference_dict):
  maxlen=0
  for key, values in reference_dict.items():
    for value in values:
      if maxlen < len(value):
        maxlen = len(value)
  return maxlen

def split_character(word): 
  return [char for char in word]  

# convert char sequence to int sequence in reference set and pad sequence
def embedding_reference_dict(reference_dict, maxlen, char_to_int):
  for key, values in reference_dict.items():
    i = 0
    for sequence in values:
      value = [char_to_int[c] for c in sequence]
      #value = split_character(sequence)
      values[i] = value
      i += 1
    reference_dict[key] = pad_sequences(values,maxlen)
  return reference_dict

# Prepare 3 sub dataset
# 1.Semantic
def create_sematic_pairs(reference_dict, n_features):
  # Positive pairs
  sematic_pairs = np.empty((0,2,n_features), dtype='object')
  labels = np.empty((0,2))
  for key, values in reference_dict.items():
   pos_pairs = [i for i in combinations(values,2)]
   for pos_pair in pos_pairs:
    pos_pair = np.array([pos_pair],dtype='object')
    sematic_pairs = np.append(sematic_pairs, pos_pair, axis=0)
    label = [1,1]
    labels = np.append(labels, [label], axis=0)
  
  # Negative sematic pairs
  sampled_list_1 = random.sample(list(reference_dict.items()), 100)
  sampled_list_2 = random.sample(list(reference_dict.items()), 100)
  
  for key_1, values_1 in sampled_list_1:
    for key_2, values_2 in sampled_list_2:
      if key_1 != key_2:
        value_1 = random.sample(list(values_1), 1)
        value_2 = random.sample(list(values_2), 1)
        neg_pair = np.empty((0,n_features))
        neg_pair = np.append(neg_pair,value_1,axis=0)
        neg_pair = np.append(neg_pair,value_2,axis=0)
        neg_pair = np.array([neg_pair])
        sematic_pairs = np.append(sematic_pairs,neg_pair,axis=0)     

        label = [0, 0]
        labels = np.append(labels, [label], axis=0) 

  return sematic_pairs, labels

def create_syntactic_pairs(reference_dict):

# Create Syntatic Variations
#  To capture the most common forms of noise occurring on the same name, we make the following three modifications based on our
#   observation of the most frequent variations in the query names:
#     – Replace all but alphanumerical characters with space, e.g., <infusion-related reactions, infusion related reactions, y>
#     – Converting to upper and lower cases, e.g., <Neuropathy peripheral, neuropathy peripheral, y>,
#     - Swap two words, e.g., <peripheral neuropathy, neuropathy peripheral>
  syntactic_pairs = np.empty((0,2))
  labels = np.empty((0,2))
  for key, values in reference_dict.items():
    for i, string in enumerate(values):    
      #1. Find all sequence contain characters which are not alphanumerical. If '-' character then replace with ' '. Else remove.   
      if re.search('^[A-Za-z0-9 ]+', string):     
        if string.find("-") != -1:
          pair = []
          tmp = string
          pair = np.append(pair, string)
          pair = np.append(pair, tmp.replace("-",""))
          syntactic_pairs = np.append(syntactic_pairs, [pair],axis=0)      

      #2. Convert upper and lower cases.
      if  string.islower() == False:
        pair = []
        tmp = string
        pair = np.append(pair, string)
        pair = np.append(pair, tmp.lower())
        syntactic_pairs = np.append(syntactic_pairs, [pair],axis=0)

      #3. Swap twos words
      if len(string.split()) == 2:
        pair = []
        modified_str = string.split()[1] + " " + string.split()[0]
        pair = np.append(pair, string)
        pair = np.append(pair, modified_str)
        syntactic_pairs = np.append(syntactic_pairs, [pair],axis=0)

  #Calculate the similarity between pair
  jarowinkler = JaroWinkler()
  for pair in syntactic_pairs:
    similarity = jarowinkler.similarity(pair[0], pair[1])

    label = [similarity, 1]
    labels = np.append(labels, [label], axis=0)
  
  return syntactic_pairs, labels

def embedding_syntactic_pairs(syntactic_pairs):
  tmp = copy.deepcopy(syntactic_pairs)
  syntactic_pairs = np.empty((0,2,71))
  for pairs in tmp:
    embeddings = []
    embedding_1 = [char_to_int[c] for c in pairs[0]]
    embedding_2 = [char_to_int[c] for c in pairs[1]]
    #embedding_1 = split_character(pairs[0])
    #embedding_2 = split_character(pairs[1])
    embeddings.append(embedding_1)
    embeddings.append(embedding_2)
    embeddings = pad_sequences(embeddings, 71)
    syntactic_pairs = np.append(syntactic_pairs, [embeddings], axis=0)
  return syntactic_pairs

def concatenate_tensor(tensor1, tensor2):
  return np.concatenate((tensor1, tensor2), axis=0)


def normalize_feature(data, maxvalue):
  for i in data.shape[1]:
    data[:,i] = data[:,i] / maxvalue
  return data

def l1_normalize(v):
  norm = K.sum(v, axis=1, keepdims=True)
  divResult = Lambda(lambda x: x[0]/x[1])([v,norm])
  return divResult

import tensorflow as tf
def cosine_distance(vects):
  x,y=vects
  def l2_normalize(x, axis):
        #norm = K.sqrt(K.sum(K.square(x), axis=axis, keepdims=True))
        #return K.maximum(x, K.epsilon()) / K.maximum(norm, K.epsilon())
        norm = K.sqrt(K.sum(K.square(x), axis=axis, keepdims=True))
        return K.sign(x) * K.maximum(K.abs(x), K.epsilon()) / K.maximum(norm, K.epsilon())
  x = K.l2_normalize(x, axis=-1)
  y = K.l2_normalize(y, axis=-1)
  return K.mean(1 - K.sum((x * y), axis=-1), axis=-1)

def cos_dist_output_shape(shapes):
  shape1, shape2 = shapes
  return (shape1[0], 1)

def contrastive_loss(d, y):
  #d: distance; y: labels
  margin = 1
  #square_pred = K.square(y_pred)
  #margin_square = K.square(K.maximum(margin - y_pred, 0))
  #return K.mean(y_true*square_pred + (1-y_true)*margin_square) 
  return K.mean((1-y)* K.square(d) + y * K.square(K.maximum(margin - d, 0)))

def create_base_network(input_shape):
  model = Sequential()
  #model.add(Embedding(input_dim=76+1, output_dim = 32, input_length=71))
  model.add(Bidirectional(LSTM(64, input_shape=input_shape, return_sequences=True)))
  model.add(Bidirectional(LSTM(64, return_sequences=True)))
  model.add(Bidirectional(LSTM(64, return_sequences=True)))
  model.add(Bidirectional(LSTM(64, return_sequences=False)))
  model.add(Dense(128, activation='relu'))
  
  return model

#def main():
  # prepair training data and labels
reference_dict, reference_dict_copy = create_reference_dict('/content/drive/My Drive/Colab Datasets/TAC2017/train_xml')
maxlen = get_maxlen_sequence(reference_dict)
char_to_int = char_to_int_dict(reference_dict)
reference_dict = embedding_reference_dict(reference_dict,maxlen,char_to_int)
(sematic_pairs, sematic_labels) = create_sematic_pairs(reference_dict,n_features=maxlen)

(syntactic_pairs, syntactic_labels) = create_syntactic_pairs(reference_dict = reference_dict_copy)
print(syntactic_labels)
syntactic_pairs = embedding_syntactic_pairs(syntactic_pairs)

print(sematic_labels.shape)
print(syntactic_labels.shape)
data = np.concatenate((sematic_pairs, syntactic_pairs), axis=0)
labels = np.concatenate((sematic_labels, syntactic_labels), axis=0)
  

  

1486
[[0.96825397 1.        ]
 [0.63492063 1.        ]
 [0.63492063 1.        ]
 ...
 [0.93333333 1.        ]
 [0.66666667 1.        ]
 [0.66666667 1.        ]]
(18800, 2)
(3451, 2)


In [0]:
data = np.reshape(data, data.shape + (1,))

indices = np.arange(data.shape[0]) # Splits the data into a training set and a 
                                   # validation set, but first shuffles the data,
                                   # because you're deadling with data in which samples
                                   # are ordered (all negative first, then all positive)
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
training_data = data[0:15000]
print(training_data.shape)
training_labels = labels[0:15000]
print(training_labels.shape)
validation_data = data[15000:]
validation_labels = labels[15000:]

(15000, 2, 71, 1)
(15000, 2)


In [0]:


# network definition
input_shape = training_data.shape[2:]
print(input_shape)
base_network = create_base_network(input_shape)

input_a = Input(shape=input_shape)
input_b = Input(shape=input_shape)

processed_a = base_network(input_a)
processed_b = base_network(input_b)

distance = Lambda(cosine_distance, output_shape=cos_dist_output_shape)([processed_a, processed_b])

model = Model([input_a, input_b], distance)

  #train
rms = RMSprop()

model.compile(loss=contrastive_loss, optimizer=rms)

# define the checkpoint
#filepath="/content/drive/My Drive/Colab Notebooks/weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
#checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
#callbacks_list = [checkpoint]

#history = model.fit([training_data[:,0], training_data[:,1]], training_labels[:,0],
#                    batch_size=128,
#                    epochs=7,
#                    validation_data=([validation_data[:,0], validation_data[:,1]], validation_labels[:,0]),
#                    callbacks=callbacks_list)



(71, 1)


In [0]:
# load the network weights
filename = "/content/drive/My Drive/Colab Notebooks/weights-improvement-06-0.4771.hdf5"
model.load_weights(filename)
model.compile(loss=contrastive_loss, optimizer=rms)

In [0]:
y_pred = model.predict([training_data[0:100, 0], training_data[0:100, 1]], batch_size=1)

In [0]:
pred = np.empty((0,2), dtype=float)
index = 0
for i in y_pred:
  pred = np.append(pred,[['%.08f' %i, labels[index][1]]], axis=0)
  index += 1

print(pred)

In [0]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 71, 1)        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 71, 1)        0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 128)          346752      input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 1)            0           sequential_1[1][0]         

In [0]:
model.layers[2].summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 71, 128)           33792     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 71, 128)           98816     
_________________________________________________________________
bidirectional_3 (Bidirection (None, 71, 128)           98816     
_________________________________________________________________
bidirectional_4 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
Total params: 346,752
Trainable params: 346,752
Non-trainable params: 0
_________________________________________________________________


In [0]:
count = 0
for key, values in reference_dict.items():
  for value in values:
    count += 1

print("Number of entity names: ", count)

Number of entity names:  4371


In [0]:
reference_array = np.empty((0,2), dtype='object')

for key, values in reference_dict.items():
  record = np.array([key, values])
  reference_array = np.append(reference_array, [record], axis=0)


In [0]:
print(reference_array.shape)


(1486, 2)


In [0]:
print(reference_array[0,0])

10029331


In [0]:
for i, value in enumerate(reference_array):
  print(i)

In [0]:
for i in range(0,4):
  print(i)

0
1
2
3


In [0]:
hard_neg_pairs = np.empty((0,2))
hard_neg_labels = []

index =0
#for record_1 in reference_array:
for i, record_i in enumerate(reference_array):
  for value_i in record_i[1]:
    value_1 = np.reshape(value_i, (1,) + value_i.shape + (1,))
    dist_array = np.empty((0,2), dtype='object') # Cột 1 lưu khoảng cách từ value_1 trong record_1 tới tất cả các value khác trong các record khác. 
                                                 # Cột 2 lưu lại chuỗi biểu diễn value_2
    
    #Comput the distance from the name in entity i to all the names in other enities
    for j, record_2 in enumerate(reference_array):
      if j > i: # Tránh bị trùng lặp (khoảng cách từ 1 đến 2 bằng khoảng cách từ 2 đến 1)
        for value_j in record_2[1]:
          value_2 = np.reshape(value_j, (1,) + value_j.shape + (1,))
          dist= model.predict([value_1, value_2], batch_size=1)
          dist_array = np.append(dist_array, [[dist[0], value_2]], axis=0)  
          
    dist_array = sorted(dist_array, key=lambda dist: dist[0])  

    for k in range(0,4):
      tmp = np.reshape(dist_array[k][1], (1,71))
      hard_neg_pairs = np.append(hard_neg_pairs, [[value_i,tmp]], axis=0)
      hard_neg_labels.append(0)       
  print(index)
  index += 1
    

0
1
2
3


KeyboardInterrupt: ignored

In [0]:
# Hard negatvie mining

hard_neg_pairs = np.empty((0,2))
hard_neg_labels = []
for key_1, values_1 in reference_dict.items():
  for value_1 in values_1:
    value_1 = np.reshape(value_1, (1,) + value_1.shape + (1,))
    dist_arr = []#np.empty((0,1), dtype=float)
    for key_2, values_2 in reference_dict.items():
      if key_1 != key_2:
        for value_2 in values_2:
          value_2 = np.reshape(value_2, (1,) + value_2.shape + (1,))
          dist= model.predict([value_1, value_2], batch_size=1)
          dist_arr.append(dist[0])
    
   


    #indexOfKClosest = getIndexOfKclosest(dist_arr, 0.00000 , 3, len(dist_arr))
    #print(dist_arr[:,0])
    #print(indexOfKClosest)
    #tmp = []
    #for i in indexOfKClosest: 
    #  tmp = np.append(tmp, i)
    #print(tmp)
    #hard_neg = np.append(hard_neg, np.array([tmp]), axis=0)
    #for i in indexOfKClosest:
    #  print("Lan: ", dist_arr[i])



4365


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



4365


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



KeyboardInterrupt: ignored

In [0]:
print("%.08f" %2.8014183e-06)

0.00000280


In [0]:
type(dist_arr)

list

In [0]:
x = 0.0
print(dist_array[:,0].shape)

(6285,)


In [0]:
print(dist_array[:,0])

NameError: ignored

In [0]:
print(getIndexOfKclosest(dist_array[:,0], 0.0001 , 4, len(dist_array[:,0])))

NameError: ignored

In [0]:
a = sorted(dist_array[:,0])

In [0]:
print("%.08f" % dist_array[2,0])

0.08016455


In [0]:
print(dist_arr[26])


0.0041077733


In [0]:
a = training_data[0][0]
a = np.reshape(a, (1,) + a.shape )

In [0]:
a.shape

(1, 71, 1)

In [0]:
b = model.layers[2]
z = b.predict(a, batch_size=1)

In [0]:
z.shape

(1, 128)

In [0]:
def compareFloatNum(a, b): 
      
    # Correct method to compare 
    # floating-ponumbers 
    if (abs(a - b) < 1e-9): 
        return True
    else: 
        return False
def findCrossOver(arr, low, high, x) :  
  
    # Base cases  
    if (float(arr[high]) <= float(x)) : # x is greater than all  
        return high 
          
    if (float(arr[low]) > float(x)) : # x is smaller than all  
        return low  
      
    # Find the middle point  
    mid = (low + high) // 2 # low + (high - low)// 2  
      
    # If x is same as middle element,  
    # then return mid  
    if (float(arr[mid]) <= float(x) and float(arr[mid + 1]) > float(x)) : 
        return mid  
      
    # If x is greater than arr[mid], then  
    # either arr[mid + 1] is ceiling of x  
    # or ceiling lies in arr[mid+1...high]  
    if(float(arr[mid]) < float(x)) : 
        return findCrossOver(arr, mid + 1, high, x) 
      
    return findCrossOver(arr, low, mid - 1, x) 
  
# This function prints k closest elements to x  
# in arr[]. n is the number of elements in arr[]  
def getIndexOfKclosest(arr, x, k, n) : 
    result = []
    # Find the crossover point  
    l = findCrossOver(arr, 0, n - 1, x) 
    r = l + 1 # Right index to search  
    count = 0 # To keep track of count of  
              # elements already printed  
  
    # If x is present in arr[], then reduce  
    # left index. Assumption: all elements  
    # in arr[] are distinct  
    if (float(arr[l]) == float(x)) : 
        l -= 1
  
    # Compare elements on left and right of crossover  
    # point to find the k closest elements  
    while (l >= 0 and r < n and count < k) : 
          
        if (float(x) - float(arr[l]) < float(arr[r]) - float(x)) : 
            result.append(l)
            l -= 1
        else : 
            result.append(r)
            #print(arr[r], end = " ")  
            r += 1
        count += 1
  
    # If there are no more elements on right  
    # side, then print left elements  
    while (count < k and l >= 0) : 
      #result.append(arr[l])
      result.append(l)
      l -= 1
      count += 1
  
    # If there are no more elements on left  
    # side, then print right elements  
    while (count < k and r < n) :  
        #result.append(arr[r]) 
        result.append(r)
        r += 1
        count += 1
    
    return result 

In [0]:
arr =[0.01, 0.02, 0.05, 0.002, 0.7, 0.22] 
                  
n = len(arr) 
x = 0.001
k = 2
      
print(getIndexOfKclosest(arr, x, k, n))

[0, 1]


In [0]:
print(0.999999999999999 == 1)

False


In [0]:
m = Sequential()
m.add(Dense(128, activation='relu', input_shape=(71,1)))
m.set_weights(model.layers[2].get_weights())

ValueError: ignored

In [0]:

  #print(data.shape)
indices = np.arange(data.shape[0]) # Splits the data into a training set and a 
                                   # validation set, but first shuffles the data,
                                   # because you're deadling with data in which samples
                                   # are ordered (all negative first, then all positive)
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
training_data = data[0:15000]
print(training_data.shape)
training_labels = labels[0:15000]
print(training_labels.shape)
validation_data = data[15000:]
validation_labels = labels[15000:]

# network definition
input_shape = training_data.shape[2:]
print(input_shape)
base_network = create_base_network(input_shape)

input_a = Input(shape=input_shape)
input_b = Input(shape=input_shape)

processed_a = base_network(input_a)
processed_b = base_network(input_b)

distance = Lambda(cosine_distance, output_shape=cos_dist_output_shape)([processed_a, processed_b])

model = Model([input_a, input_b], distance)

  #train
rms = RMSprop()

model.compile(loss=contrastive_loss, optimizer=rms, metrics=['acc'])
# define the checkpoint
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

history = model.fit([training_data[:,0], training_data[:,1]], training_labels[:,0],
                    batch_size=128,
                    epochs=20,
                    validation_data=([validation_data[:,0], validation_data[:,1]], validation_labels[:,0]),
                    callbacks_list=callbacks_list)

import matplotlib.pyplot as plt

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label = 'Validation acc')
plt.title('Training and validation accuraccy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
  


#if __name__ == "__main__":
#  main()

(15000, 2, 71, 1, 1)
(15000, 2)
(71, 1, 1)


ValueError: ignored

In [0]:
y_pred = model.predict([training_data[:, 0], training_data[:, 1]], batch_size=1)

NameError: ignored

In [0]:
print(y_pred[0:20])

[1.0000000e+00 1.0000000e+00 7.8428471e-01 1.0000000e+00 4.4756687e-01
 2.0265579e-06 2.7307433e-01 8.9411157e-01 5.9604645e-08 9.6646893e-01
 1.0000000e+00 1.0000000e+00 1.8204451e-03 1.0000000e+00 5.3942204e-05
 8.0746412e-03 3.2198429e-04 9.6096611e-01 3.8743019e-05 9.9839902e-01]


In [0]:
pred = np.empty((0,2), dtype=float)
index = 0
for i in y_pred:
  pred = np.append(pred,[['%.08f' %i, labels[index][1]]], axis=0)
  index += 1

In [0]:
print(pred[0:20])

[['1.00000000' '0.0']
 ['0.17268920' '0.0']
 ['1.00000000' '0.0']
 ['0.99957287' '1.0']
 ['1.00000000' '1.0']
 ['1.00000000' '0.0']
 ['1.00000000' '1.0']
 ['1.00000000' '0.0']
 ['1.00000000' '0.0']
 ['1.00000000' '1.0']
 ['1.00000000' '0.0']
 ['1.00000000' '1.0']
 ['1.00000000' '0.0']
 ['1.00000000' '0.0']
 ['1.00000000' '1.0']
 ['1.00000000' '0.0']
 ['1.00000000' '1.0']
 ['0.00488019' '1.0']
 ['1.00000000' '0.0']
 ['1.00000000' '1.0']]


In [0]:
pos_count = 0
for pair in pred:
  if float(pair[0]) < 0.5 and float(pair[1]) == 1:
    pos_count += 1
print(pos_count)

34


In [0]:
pos_total = 0
for pair in pred:
  if float(pair[1]) == 1.0:
    pos_total += 1
print(pos_total)

58


In [0]:
neg_count = 0
for pair in pred:
  if float(pair[0]) > 0.5 and float(pair[1]) == 0:
    neg_count += 1
print(neg_count)

28


In [0]:
neg_total = 0 
for pair in pred:
  if float(pair[1]) == 0.0:
    neg_total += 1
print(neg_total)

42


In [0]:
#print('%.' y_pred[0:100],'f'))
#('%.08f' % x)
#for i in y_pred[0:100]:
 # print('%0.5f' %i)
#for i in range(y_pred.)
##print(labels[0:20,1])



count = 0
pred = np.empty((0,2))
index = 0
for i in y_pred:
  if i < 0.5:
    pred = np.append(pred, [[1, labels[index][1]]],axis=0)
    count += 1
  else:
    pred = np.append(pred, [[0, labels[index][1]]], axis=0)
  index += 1

rate = 0
for j in pred:
  if j[0] == j[1] == 1:
    rate +=1

total =0
for l in pred:
  if(l[1] == 1):
    total +=1
num = 0
index = 0
for i in labels[0:15000,1]:
  if i == 1 and y_pred[index] < 0.5:
    num += 1
    index +=1
  
print(total)
print(rate)
print(num)
print(rate/total)

8266
6725
6
0.8135736752963949


In [0]:
index=0
for i in y_pred:
  if i < 0.5:
    pred = np.append(pred, [['%0.2f' %i, labels[index][1]]],axis=0)
    count += 1
  else:
    pred = np.append(pred, [['%0.2f' %i, labels[index][1]]], axis=0)
  index += 1
print(pred[0:100])

KeyboardInterrupt: ignored

In [0]:
print(pred[0:100])

[['1.0' '1.0']
 ['1.0' '1.0']
 ['1.0' '1.0']
 ['1.0' '0.0']
 ['1.0' '0.0']
 ['1.0' '0.0']
 ['0.0' '0.0']
 ['1.0' '0.0']
 ['1.0' '0.0']
 ['1.0' '0.0']
 ['1.0' '0.0']
 ['1.0' '0.0']
 ['1.0' '0.0']
 ['0.0' '1.0']
 ['1.0' '0.0']
 ['0.0' '0.0']
 ['1.0' '1.0']
 ['1.0' '1.0']
 ['0.0' '0.0']
 ['1.0' '1.0']
 ['0.0' '0.0']
 ['1.0' '1.0']
 ['1.0' '0.0']
 ['1.0' '0.0']
 ['1.0' '1.0']
 ['1.0' '0.0']
 ['1.0' '1.0']
 ['1.0' '1.0']
 ['1.0' '1.0']
 ['0.0' '0.0']
 ['1.0' '1.0']
 ['1.0' '1.0']
 ['0.0' '0.0']
 ['1.0' '1.0']
 ['0.0' '0.0']
 ['0.0' '0.0']
 ['1.0' '1.0']
 ['0.0' '0.0']
 ['0.0' '0.0']
 ['1.0' '1.0']
 ['0.0' '0.0']
 ['0.0' '0.0']
 ['0.0' '1.0']
 ['0.0' '0.0']
 ['1.0' '1.0']
 ['1.0' '1.0']
 ['1.0' '1.0']
 ['1.0' '0.0']
 ['0.0' '0.0']
 ['1.0' '1.0']
 ['1.0' '1.0']
 ['1.0' '1.0']
 ['0.0' '0.0']
 ['1.0' '1.0']
 ['1.0' '0.0']
 ['1.0' '1.0']
 ['1.0' '0.0']
 ['0.0' '1.0']
 ['0.0' '1.0']
 ['1.0' '0.0']
 ['1.0' '1.0']
 ['1.0' '0.0']
 ['1.0' '0.0']
 ['0.0' '0.0']
 ['1.0' '1.0']
 ['1.0' '1.0']
 ['1.0' '0

In [0]:
root.attrib

{'drug': 'adcetris', 'track': 'TAC2017_ADR'}

In [0]:
# Create Reference Set

reference_dict = {}
for tree in trees:
  root = tree.getroot()
  for reaction in root.iter('Reaction'):
    attribs = reaction.attrib    
    str_name = attribs.get('str')
    child = list(reaction)
    normalization = child[0]
    attribs = normalization.attrib
    
    canonical_name = attribs.get('meddra_pt')
    canonical_id = attribs.get('meddra_pt_id')
    if canonical_id in reference_dict:
      values = reference_dict.get(canonical_id)
      flag = False
      for value in values: 
        if value == str_name:
          flag = True
          break            
      if flag == False:
        values.append(str_name)
    else:
      if canonical_id is not None:
        values = []      
        values.append(canonical_name)
        values.append(str_name)
        reference_dict[canonical_id] = values   

print(len(reference_dict))

1486


In [0]:
#Coppy dict for make syntactic_dict
syntactic_dict = copy.deepcopy(reference_dict)

In [0]:
print(syntactic_dict)

{'10029331': ['Neuropathy peripheral', 'peripheral neuropathy', 'neuropathy', 'residual neuropathy', 'neuropathy peripheral', 'peripheral neuritis'], '10002198': ['Anaphylactic reaction', 'anaphylaxis', 'anaphylactic reactions', 'anaphylactic reaction'], '10051792': ['Infusion related reaction', 'infusion reactions', 'infusion-related reactions', 'infusion related reactions', 'adverse events associated with the infusion', 'infusion reaction'], '10061188': ['Haematotoxicity', 'hematologic toxicities', 'hematologic toxicity'], '10021789': ['Infection', 'infections', 'infection', 'worsening of infections', 'unusual infections', 'infection-related adverse reactions', 'infectious adverse events', 'infection adverse events', 'infectious etiology', 'infectious adverse reactions', 'worsening infection', 'complications of infection'], '10030901': ['Opportunistic infection', 'opportunistic infections', 'infections with opportunistic pathogens', 'infections due to other opportunistic pathogens', 

In [0]:
# get the vocabulary of query entity names
text = ""
maxlen = 0
for key, values in reference_dict.items():
  for value in values:
    text = text + value

vocab = sorted(set(text))
char_to_int = dict((c,i) for i,c in enumerate(vocab))
print(char_to_int)

{' ': 0, '"': 1, '%': 2, "'": 3, '(': 4, ')': 5, '*': 6, ',': 7, '-': 8, '.': 9, '/': 10, '0': 11, '1': 12, '2': 13, '3': 14, '4': 15, '5': 16, '6': 17, '7': 18, '8': 19, '9': 20, ':': 21, '<': 22, '=': 23, '>': 24, 'A': 25, 'B': 26, 'C': 27, 'D': 28, 'E': 29, 'F': 30, 'G': 31, 'H': 32, 'I': 33, 'J': 34, 'K': 35, 'L': 36, 'M': 37, 'N': 38, 'O': 39, 'P': 40, 'Q': 41, 'R': 42, 'S': 43, 'T': 44, 'U': 45, 'V': 46, 'W': 47, 'X': 48, 'Y': 49, 'a': 50, 'b': 51, 'c': 52, 'd': 53, 'e': 54, 'f': 55, 'g': 56, 'h': 57, 'i': 58, 'j': 59, 'k': 60, 'l': 61, 'm': 62, 'n': 63, 'o': 64, 'p': 65, 'q': 66, 'r': 67, 's': 68, 't': 69, 'u': 70, 'v': 71, 'w': 72, 'x': 73, 'y': 74, 'z': 75}


In [0]:
len(char_to_int)

76

In [0]:
# find max len sequence
for key, values in reference_dict.items():
  for value in values:
    if maxlen < len(value):
      maxlen = len(value)
print(maxlen)

71


In [0]:
# convert char sequence to int sequence in reference set and pad sequence
for key, values in reference_dict.items():
  i = 0
  for sequence in values:
    value = [char_to_int[c] for c in sequence]
    values[i] = value
    i += 1
  reference_dict[key] = pad_sequences(values,71)


In [0]:
print(reference_dict)

In [0]:
# Prepare 3 sub dataset

# 1.Semantic
# Number of entities in reference set
# Positive pairs

sematic_pairs = np.empty((0,2,71), dtype='object')
soft_labels = np.empty((0))

for key, values in reference_dict.items():
#for list in embeddings:
  pos_pairs = [i for i in combinations(values,2)]
  for pos_pair in pos_pairs:
    pos_pair = np.array([pos_pair],dtype='object')
    sematic_pairs = np.append(sematic_pairs, pos_pair, axis=0)
    soft_labels = np.append(soft_labels, [1], axis=0)


In [0]:
sematic_pairs.shape

(8804, 2, 71)

In [0]:
print(sematic_pairs[:,0])

[[0 0 0 ... 67 50 61]
 [0 0 0 ... 67 50 61]
 [0 0 0 ... 67 50 61]
 ...
 [0 0 0 ... 64 67 69]
 [0 0 0 ... 64 67 69]
 [0 0 0 ... 50 58 63]]


In [0]:
# Negative sematic pairs
sampled_list_1 = random.sample(list(reference_dict.items()), 100)
sampled_list_2 = random.sample(list(reference_dict.items()), 100)

for key_1, values_1 in sampled_list_1:
  for key_2, values_2 in sampled_list_2:
    if  key_1 != key_2:
      value_1 = random.sample(list(values_1), 1)
      value_2 = random.sample(list(values_2), 1)
      neg_pair = np.empty((0,71))
      neg_pair = np.append(neg_pair,value_1,axis=0)
      neg_pair = np.append(neg_pair,value_2,axis=0)
      neg_pair = np.array([neg_pair])
      sematic_pairs = np.append(sematic_pairs,neg_pair,axis=0)
      

      soft_labels = np.append(soft_labels, [0], axis=0) 

In [0]:
sematic_pairs.shape

(18796, 2, 71)

In [0]:
soft_labels.shape

(18793,)

In [0]:
# Create Syntatic Variations
#  To capture the most common forms of noise occurring on the same name, we make the following three modifications based on our
#   observation of the most frequent variations in the query names:
#     – Replace all but alphanumerical characters with space, e.g., <infusion-related reactions, infusion related reactions, y>
#     – Converting to upper and lower cases, e.g., <Neuropathy peripheral, neuropathy peripheral, y>,
#     - Swap two words, e.g., <peripheral neuropathy, neuropathy peripheral>

import re

#re.sub(r'[^A-Za-z0-9 ]+', '', s)
syntactic_pairs = np.empty((0,2))
for key, values in syntactic_dict.items():
  for i, string in enumerate(values):
    #1. Find all sequence contain characters which are not alphanumerical. If '-' character then replace with ' '. Else remove.   
    if re.search('^[A-Za-z0-9 ]+', string):     
      if string.find("-") != -1:
        pair = []
        tmp = string
        pair = np.append(pair, string)
        pair = np.append(pair, tmp.replace("-",""))
        syntactic_pairs = np.append(syntactic_pairs, [pair],axis=0)      

    #2. Convert upper and lower cases.
    if  string.islower() == False:
      pair = []
      tmp = string
      pair = np.append(pair, string)
      pair = np.append(pair, tmp.lower())
      syntactic_pairs = np.append(syntactic_pairs, [pair],axis=0)

    #3. Swap twos words
    if len(string.split()) == 2:
      pair = []
      modified_str = string.split()[1] + " " + string.split()[0]
      pair = np.append(pair, string)
      pair = np.append(pair, modified_str)
      syntactic_pairs = np.append(syntactic_pairs, [pair],axis=0)


In [0]:
print(type(syntactic_pairs))
print(syntactic_pairs.shape)

<class 'numpy.ndarray'>
(3451, 2)


In [0]:
syntactic_labels = []
jarowinkler = JaroWinkler()
for pair in syntactic_pairs:
  similarity = jarowinkler.similarity(pair[0], pair[1])
  syntactic_labels.append(similarity)

In [0]:
print(syntactic_labels)

[0.9682539682539683, 0.6349206349206349, 0.6349206349206349, 0.5380116959064327, 0.6349206349206349, 0.5842105263157894, 0.8515873015873016, 0.7161531279178338, 0.7676767676767677, 0.7357609710550888, 0.9455555555555555, 0.5892255892255892, 0.9911242603550297, 0.8143589743589743, 0.6122004357298475, 0.9555555555555556, 0.6545454545454544, 0.7627450980392156, 0.9259259259259259, 0.708994708994709, 0.9929251700680273, 0.5842105263157894, 0.6432748538011696, 0.955862977602108, 0.7832988267770876, 0.7777777777777777, 0.7832988267770876, 0.9682539682539683, 0.9210256410256411, 0.5462962962962963, 0.9523809523809524, 0.6342592592592592, 0.6055555555555555, 0.7333333333333334, 0.9841269841269842, 0.9629629629629629, 0.4973544973544974, 0.4973544973544974, 0.9908172635445364, 0.9487179487179488, 0.7459207459207459, 0.806060606060606, 0.7380952380952381, 0.7489035087719298, 0.8080808080808081, 0.9393939393939394, 0.9532019704433498, 0.781904761904762, 0.8061002178649237, 0.9047619047619048, 0.9

In [0]:
tmp = copy.deepcopy(syntactic_pairs)
syntactic_pairs = np.empty((0,2,71))
for pairs in tmp:
  embeddings = []
  embedding_1 = [char_to_int[c] for c in pairs[0]]
  embedding_2 = [char_to_int[c] for c in pairs[1]]
  embeddings.append(embedding_1)
  embeddings.append(embedding_2)
  embeddings = pad_sequences(embeddings, 71)
  syntactic_pairs = np.append(syntactic_pairs, [embeddings], axis=0)
  #syntactic_pairs[index,:]= embeddings
  #index += 1


In [0]:
#syntactic_pairs = np.reshape(syntactic_pairs, syntactic_pairs.shape + (1,))
syntactic_pairs.shape

(3451, 2, 71)

In [0]:
training_data = np.concatenate((sematic_pairs, syntactic_pairs), axis=0)

In [0]:
print(training_data.shape)

(22246, 2, 71)


In [0]:
labels = np.concatenate((soft_labels,syntactic_labels), axis=0)
print(labels.shape)

(22244,)


In [0]:
#Shuffle
indices = np.arange(training_data.shape[0]) # Splits the data into a training set and a 
                                   # validation set, but first shuffles the data,
                                   # because you're deadling with data in which samples
                                   # are ordered (all negative first, then all positive)
np.random.shuffle(indices)
training_data = training_data[indices]
labels = labels[indices]

In [0]:
# Build Siamese Neural Network

from __future__ import absolute_import
from __future__ import print_function
from keras.models import Sequential
from keras.models import Model
from keras.layers import Input, Flatten, Dense, Dropout, Lambda, Embedding, Bidirectional, LSTM
from keras.optimizers import RMSprop
from keras import backend as K

In [0]:
# Define distance similarity function
import tensorflow as tf
def cosine_distance(vects):
  x,y=vects
  s = tf.keras.losses.cosine_similarity(x,y)
  return s

def cos_dist_output_shape(shapes):
  shape1, shape2 = shapes
  return (shape1[0], 1)

def contrastive_loss(y_true, y_pred):
  margin = 1
  square_pred = K.square(y_pred)
  margin_square = K.square(K.maximum(margin - y_pred, 0))
  return K.mean(y_true*square_pred + (1-y_true)*margin_square) 

def create_base_network(input_shape):
  model = Sequential()
  #model.add(Embedding(76, 32))
  model.add(Bidirectional(LSTM(64, return_sequences=True), input_shape=input_shape))
  model.add(Bidirectional(LSTM(64, return_sequences=True)))
  model.add(Bidirectional(LSTM(64, return_sequences=True)))
  model.add(Bidirectional(LSTM(64, return_sequences=False)))
  model.add(Dense(128, activation='relu'))
  
  return model



In [0]:
training_data[:,0] = training_data[:,0]/75


In [0]:
training_data[:,1] = training_data[:,1]/75

In [0]:
input_shape = (71,1)

# network definition
base_network = create_base_network(input_shape)

input_a = Input(shape=input_shape)
input_b = Input(shape=input_shape)

processed_a = base_network(input_a)
processed_b = base_network(input_b)

distance = Lambda(cosine_distance, output_shape=cos_dist_output_shape)([processed_a, processed_b])

model = Model([input_a, input_b], distance)

#train
rms = RMSprop()


model.compile(loss=contrastive_loss, optimizer=rms, metrics=['acc'])
model.fit([training_data[:,0], training_data[:,1]], labels,
          batch_size=128,
          epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x7f029ff4d7f0>

In [0]:
print(training_data)

[[[0.0 0.0 0.0 ... 0.7733333333333333 0.8533333333333334 0.84]
  [0.0 0.0 0.0 ... 0.8533333333333334 0.84 0.9066666666666666]]

 [[0.0 0.0 0.0 ... 0.72 0.8933333333333333 0.9866666666666667]
  [0.0 0.0 0.0 ... 0.84 0.72 0.6666666666666666]]

 [[0.0 0.0 0.0 ... 0.6666666666666666 0.9066666666666666 0.72]
  [0.0 0.0 0.0 ... 0.9066666666666666 0.72 0.9066666666666666]]

 ...

 [[0.0 0.0 0.0 ... 0.7733333333333333 0.8533333333333334 0.84]
  [0.0 0.0 0.0 ... 0.7733333333333333 0.84 0.7466666666666667]]

 [[0.0 0.0 0.0 ... 0.6666666666666666 0.9066666666666666 0.72]
  [0.0 0.0 0.0 ... 0.7733333333333333 0.8533333333333334 0.84]]

 [[0.0 0.0 0.0 ... 0.8533333333333334 0.8266666666666667
   0.6666666666666666]
  [0.0 0.0 0.0 ... 0.6666666666666666 0.7466666666666667 0.72]]]


In [0]:
training_data = np.reshape(training_data, training_data.shape + (1,))
print(training_data[:,0].shape)

(22246, 71, 1)
