In [2]:
import os
import pickle
import random
import re
from ast import literal_eval
from collections import defaultdict
from itertools import combinations

import cvxpy as cp
import gensim
import matplotlib.pyplot as plt
import mysql.connector
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import requests
import tensorflow as tf
from bs4 import BeautifulSoup
from mysql.connector import Error
from nltk.corpus import stopwords
from scipy.spatial import distance
from sklearn.manifold import TSNE
K = 10

In [3]:
# Connection parameters. Every value can be overridden through an environment
# variable so credentials do not have to be edited into the notebook; the
# fallbacks are the original local-development values.
connection_params = {
    'host': os.environ.get('FOODB_HOST', 'localhost'),
    'user': os.environ.get('FOODB_USER', 'default'),
    'database': os.environ.get('FOODB_DATABASE', 'foodb'),
    'password': os.environ.get('FOODB_PASSWORD', '')
}

try:
    connection = mysql.connector.connect(**connection_params)

    if connection.is_connected():
        db_Info = connection.get_server_info()
        print("Connected to MySQL Server version ", db_Info)
        cursor = connection.cursor()

except Error as e:
    # Best-effort: the error is only reported. `connection` stays undefined in
    # that case, so the cells below that use it will fail.
    print("Error while connecting to MySQL", e)

Connected to MySQL Server version  8.0.26


# Definición del modo de funcionamiento

In [4]:
# Run-mode switches read by the cells below.
test = True               # Whether to use a smaller subset of concepts
db_method = 'normal'      # One of 'normal', 'raw', 'raw_negative'; selects how the X/Y data is built
tokenizer_func = 'custom' # NOTE(review): the visible calls pass 'custom' literally instead of this variable

# Query sobre la tabla de health_effects

In [5]:
# Health-effect rows that have some textual definition: `description` is used
# when present and `chebi_definition` is the fallback.
sql_query = '''
    SELECT name, chebi_name, IF(description IS NULL, chebi_definition, description) as definitions 
    FROM foodb.health_effects he
    WHERE description IS NOT NULL OR chebi_definition IS NOT NULL
'''

# Load the query result into a DataFrame and display it
df = pd.read_sql(sql_query, con=connection)
df

Unnamed: 0,name,chebi_name,definitions
0,(+)-inotropic,,An agent that alters the force or energy of mu...
1,(-)-chronotropic,,An agent that may change the heart rate by aff...
2,(-)-inotropic,,An agent that alters the force or energy of mu...
3,11beta-hydroxysteroid-dehydrogenase inhibitor,enzyme inhibitor,A compound or agent that combines with an enzy...
4,12-lipoxygenase inhibitor,enzyme inhibitor,A compound or agent that combines with an enzy...
...,...,...,...
1312,xanthine oxidase inhibitor,xanthine oxidase inhibitor,An EC 1.17.3.* (oxidoreductase acting on CH or...
1313,inhibitor,inhibitor,A substance that diminishes the rate of a chem...
1314,prostaglandin antagonist,prostaglandin antagonist,A compound that inhibits the action of prostag...
1315,platelet aggregation inhibitor,platelet aggregation inhibitor,A drug or agent which antagonizes or impairs a...


# Obtener conceptos a agregar al diccionario

In [6]:
def preprocess_HE_decriptions(connection, test=False):
    '''
    Load the health-effect names and their cleaned, tokenized definitions.

      Input:
        connection - open database connection handed to pd.read_sql
        test - if True, keep only the rows whose name is listed (one name per
               line) in 'Summary/names_to_rev.txt'
      Output:
        a tuple containing 2 elements:
        names - list with the values of the 'name' column
        descriptions - list, aligned with names, holding one list of tokens per
                       definition (only word characters kept, lowercased,
                       English stop words removed)
    '''
    # Query sent to the database: description is preferred, chebi_definition
    # is the fallback, rows with neither are excluded
    sql_query = '''
        SELECT name, chebi_name, IF(description IS NULL, chebi_definition, description) as definitions 
        FROM foodb.health_effects he
        WHERE description IS NOT NULL OR chebi_definition IS NOT NULL
    '''

    # Load the query result
    dataframe = pd.read_sql(sql_query, con=connection)

    # Restrict to the reviewed concepts for quick tests
    if test:
        with open('Summary/names_to_rev.txt', 'r', encoding='utf8') as file:
            concepts = [line.strip() for line in file]

        dataframe = dataframe[dataframe['name'].isin(concepts)]

    # Concept names, used as-is
    names = list(dataframe['name'])

    # Build the stop-word set once; rebuilding the list for every token made
    # the filter needlessly slow
    stop_words = set(stopwords.words('english'))

    # Token lists generated from the definitions
    descriptions = list()
    for d in dataframe['definitions']:
        # Drop leading/trailing whitespace (including the newline)
        txt = d.strip()

        # Keep only word characters and whitespace (removes punctuation)
        txt = ''.join(re.findall(r'[\w\s]+', txt))

        # Collapse whitespace runs into single spaces
        txt = re.sub(r'\s+', ' ', txt)

        # Lowercase
        txt = txt.lower()

        # Tokenize and filter the stop words
        descriptions.append([word for word in txt.split() if word not in stop_words])

    return names, descriptions


def cosine_similarity(vests):
    '''
    Cosine similarity between two batches of vectors.

      Input:
        vests - pair (x, y) of tensors
      Output:
        batch_dot over the last axis of the two inputs after each has been
        L2-normalized along its last axis
    '''
    backend = tf.keras.backend
    first, second = vests
    first_unit = backend.l2_normalize(first, axis=-1)
    second_unit = backend.l2_normalize(second, axis=-1)
    return backend.batch_dot(first_unit, second_unit, axes=-1)

    
def data_generator(batch_size, data_x, data_y, shuffle=True):
    '''
      Endless batch generator over paired samples and labels.

      Input: 
        batch_size - integer describing the batch size
        data_x - list containing samples
        data_y - list containing labels
        shuffle - Shuffle the data order
      Output:
        a tuple containing 2 elements:
        X - list of dim (batch_size) of samples
        Y - list of dim (batch_size) of labels
      Raises:
        ValueError - if data_x is empty or data_x and data_y differ in length
                     (raised when the first batch is requested)
    '''

    data_lng = len(data_x)
    if data_lng != len(data_y):
        raise ValueError('data_x and data_y must have the same length.')
    if data_lng == 0:
        # Without this guard the wrap-around below would index an empty list
        raise ValueError('data_x must contain at least one sample.')

    # Ordered indexes of the sample data
    index_list = list(range(data_lng))

    # If shuffle is set to true, we traverse the list in a random way
    if shuffle:
        random.shuffle(index_list)  # In-place shuffle of the list

    index = 0  # Start with the first element
    while True:
        X = [0] * batch_size
        Y = [0] * batch_size

        for i in range(batch_size):
            # Wrap the index each time that we reach the end of the list, so a
            # batch may mix the tail of one pass with the head of the next
            if index >= data_lng:
                index = 0
                # Re-shuffle the order for the new pass
                if shuffle:
                    random.shuffle(index_list)

            X[i] = data_x[index_list[index]]  # Sample for this slot
            Y[i] = data_y[index_list[index]]  # Label paired with that sample
            index += 1

        yield (X, Y)
            

class custom_tokenizer(object):
    '''
    Minimal word <-> integer id mapper.

    Id 0 is reserved for the empty string and id 1 for '<UNK>'; vocabulary
    words receive ids starting at 2 in the order they are supplied.
    '''

    def __init__(self, split):
        # Separator handed to str.split when a text has to be tokenized
        self.split = split
        # Forward (word -> id) and reverse (id -> word) lookup tables
        self.word_index = dict()
        self.index_word = dict()

    def fit_on_vocab(self, vocab):
        '''Register the special tokens and then every word of vocab.'''
        # Special entries
        self.word_index.update({'': 0, '<UNK>': 1})
        self.index_word.update({0: '', 1: '<UNK>'})

        # Vocabulary words, numbered from 2 onwards
        for offset, word in enumerate(vocab):
            idx = offset + 2
            self.word_index[word] = idx
            self.index_word[idx] = word

    def texts_to_sequences(self, texts, split_bool=True):
        '''
        Convert text into ids.

        With split_bool=True, texts is split with self.split first. A single
        string yields a single id, a list/tuple yields a list of ids, and any
        other type yields None.
        '''
        # NOTE(review): words missing from the vocabulary map to the id of ''
        # (0), not to '<UNK>' (1) -- confirm this is intended.
        def lookup(word):
            return self.word_index.get(word, self.word_index[''])

        tokens = texts.split(self.split) if split_bool else texts

        if isinstance(tokens, str):
            return lookup(tokens)
        if isinstance(tokens, (list, tuple)):
            return [lookup(word) for word in tokens]
        return None

    def sequences_to_texts(self, sequences):
        '''Convert one id, or an iterable of ids, back into words ('' if unknown).'''
        def to_word(idx):
            return self.index_word.get(idx, self.index_word[0])

        if not isinstance(sequences, (int, float)):
            return [to_word(idx) for idx in sequences]
        return to_word(sequences)
        

def get_word_embeddings(d_embed, connection, tokenizer_func='custom', tokenizer_split=' ', 
                        table_to_rev='health_effects', test=False,
                        glove_dir='C:/Users/Chris-Brota/Desktop/glove.6b'):
    '''
    Build the initial embedding matrix and the tokenizer that indexes it.

      Input:
        d_embed - embedding dimension; selects the file glove.6B.{d_embed}d.txt
        connection - open database connection used to read the concepts
        tokenizer_func - 'custom' (custom_tokenizer) or 'tensorflow'
                         (tf.keras.preprocessing.text.Tokenizer)
        tokenizer_split - separator handed to the tokenizer
        table_to_rev - table the extra vocabulary comes from; only
                       'health_effects' is supported
        test - forwarded to preprocess_HE_decriptions
        glove_dir - directory that contains the GloVe text files; defaults to
                    the location that used to be hard-coded
      Output:
        a tuple containing 2 elements:
        embedding_matrix - array of shape (len(tokenizer.word_index) + 1, d_embed);
                           rows of words without a vector stay at zero
        tokenizer - the fitted tokenizer
      Raises:
        ValueError - for an unsupported tokenizer_func or table_to_rev
    '''
    # Fail before the (large) embedding file is read
    if tokenizer_func not in ('tensorflow', 'custom'):
        raise ValueError(f'Unsupported tokenizer_func: {tokenizer_func!r}')
    if table_to_rev != 'health_effects':
        raise ValueError(f'Unsupported table_to_rev: {table_to_rev!r}')

    # word -> vector dictionary; its insertion order defines the vocabulary order
    embeddings_index = dict()

    # Loading the pre-trained embedding into memory
    with open(f'{glove_dir}/glove.6B.{d_embed}d.txt', 'r', 
              encoding='utf8') as file:
        for line in file:
            values = line.split()
            if not values:
                continue  # tolerate blank lines
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    # Concepts of the table of interest
    names, descriptions = preprocess_HE_decriptions(connection, test=test)

    # Extra vocabulary: every description token plus every concept name
    vocab_extra = set()
    for desc in descriptions:
        vocab_extra.update(desc)
    vocab_extra.update(names)

    # Words unknown to the pre-trained embedding get a random vector. Sorting
    # makes the order (and therefore the ids assigned below) independent of
    # set iteration order.
    for word in sorted(vocab_extra):
        if word not in embeddings_index:
            embeddings_index[word] = np.random.uniform(-1, 1, d_embed)

    print(f'Vector de {len(embeddings_index)} palabras cargadas.')

    vocab = list(embeddings_index)
    if tokenizer_func == 'tensorflow':
        tokenizer = tf.keras.preprocessing.text.Tokenizer(split=tokenizer_split)
        tokenizer.fit_on_texts(vocab)
    else:
        tokenizer = custom_tokenizer(split=tokenizer_split)
        tokenizer.fit_on_vocab(vocab)

    # Vocabulary size; the model cells compute the same expression
    vocab_size = len(tokenizer.word_index) + 1

    # Embedding matrix: row i holds the vector of the word with id i
    embedding_matrix = np.zeros((vocab_size, d_embed))
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word, None)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix, tokenizer


def get_XY(connection, tokenizer, tokenizer_func='custom', test=False):
    '''
    Build (concept id, description-word id) training pairs.

      Input:
        connection - open database connection
        tokenizer - fitted tokenizer
        tokenizer_func - 'custom' or 'tensorflow', matching the tokenizer type
        test - forwarded to preprocess_HE_decriptions
      Output:
        a tuple containing 2 elements:
        X - list with the concept token repeated once per description word
        Y - list, aligned with X, with the description-word ids
    '''
    # Preprocessed database contents
    names, descriptions = preprocess_HE_decriptions(connection, test=test)

    # Tokenize the concepts and their descriptions
    if tokenizer_func == 'custom':
        names_tok = tokenizer.texts_to_sequences(names, split_bool=False)
        desc_tok = [tokenizer.texts_to_sequences(sent, split_bool=False)
                    for sent in descriptions]
    elif tokenizer_func == 'tensorflow':
        names_tok = tokenizer.texts_to_sequences(names)
        desc_tok = tokenizer.texts_to_sequences(descriptions)

    # Sanity check
    txt_check = "Conceptos y definición con distinto largo."
    assert len(names_tok) == len(desc_tok), txt_check

    # One-to-one mapping: each description word is paired with its concept
    pairs = [(concept, word)
             for concept, words in zip(names_tok, desc_tok)
             for word in words]
    X = [concept for concept, _ in pairs]
    Y = [word for _, word in pairs]

    return X, Y


def get_raw_XY(connection, tokenizer, tokenizer_func='custom', test=False):
    '''
    Build whole-description inputs with their concept as label.

      Input:
        connection - open database connection
        tokenizer - fitted tokenizer
        tokenizer_func - 'custom' or 'tensorflow', matching the tokenizer type
        test - forwarded to preprocess_HE_decriptions
      Output:
        a tuple containing 2 elements:
        desc_tok - post-padded array of description token ids
        names_tok - tokenized concept names, aligned with desc_tok
    '''
    # Preprocessed database contents
    names, descriptions = preprocess_HE_decriptions(connection, test=test)

    # Tokenize the concepts and their descriptions
    if tokenizer_func == 'custom':
        names_tok = tokenizer.texts_to_sequences(names, split_bool=False)
        desc_tok = [tokenizer.texts_to_sequences(sent, split_bool=False)
                    for sent in descriptions]
    elif tokenizer_func == 'tensorflow':
        names_tok = tokenizer.texts_to_sequences(names)
        desc_tok = tokenizer.texts_to_sequences(descriptions)

    # Sanity check
    txt_check = "Conceptos y definición con distinto largo."
    assert len(names_tok) == len(desc_tok), txt_check

    # Pad every description at the end
    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
    padded_descriptions = pad_sequences(desc_tok, padding="post")

    return padded_descriptions, names_tok


def get_raw_XY_negativeSampling(connection, tokenizer, tokenizer_func='custom', 
                                test=False):
    '''
    Build every (description, concept) combination with a match label.

      Input:
        connection - open database connection
        tokenizer - fitted tokenizer
        tokenizer_func - 'custom' or 'tensorflow', matching the tokenizer type
        test - forwarded to preprocess_HE_decriptions
      Output:
        a tuple containing 3 numpy arrays, aligned entry by entry:
        X - padded description token ids
        Y - concept tokens
        Z - 1 when the description belongs to the concept, 0 otherwise
    '''
    # Preprocessed database contents
    names, descriptions = preprocess_HE_decriptions(connection, test=test)

    # Tokenize the concepts and their descriptions
    if tokenizer_func == 'custom':
        names_tok = tokenizer.texts_to_sequences(names, split_bool=False)
        desc_tok = [tokenizer.texts_to_sequences(sent, split_bool=False)
                    for sent in descriptions]
    elif tokenizer_func == 'tensorflow':
        names_tok = tokenizer.texts_to_sequences(names)
        desc_tok = tokenizer.texts_to_sequences(descriptions)

    # Sanity check
    txt_check = "Conceptos y definición con distinto largo."
    assert len(names_tok) == len(desc_tok), txt_check

    # Pad every description at the end
    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
    desc_tok = pad_sequences(desc_tok, padding="post")

    # Cartesian product concept x description; only the pair sharing the same
    # position is a positive example
    X, Y, Z = [], [], []
    for i, concept in enumerate(names_tok):
        for j, padded_description in enumerate(desc_tok):
            X.append(padded_description)
            Y.append(concept)
            Z.append(int(i == j))

    return np.array(X), np.array(Y), np.array(Z)

# Obtener la matriz de embeddings y el tokenizer

In [7]:
# Build the embedding matrix and the tokenizer from the 100-dimensional GloVe
# vectors. tokenizer_split=None is what custom_tokenizer later hands to
# str.split when it has to split a text.
embedding_matrix, tokenizer = \
    get_word_embeddings(d_embed=100, connection=connection, tokenizer_func='custom',
                        tokenizer_split=None, table_to_rev='health_effects', test=test)

Vector de 400004 palabras cargadas.


# Obtener los vectores de entrada y salida

In [8]:
# Build the training inputs/outputs according to db_method.
# Only 'raw_negative' defines Z_to; in 'normal' mode X_to and Y_to are plain
# Python lists (each pair is added in both directions), so the later cells that
# read X_to.shape need one of the 'raw' modes.
if db_method == 'raw':
    X_to, Y_to = get_raw_XY(connection, tokenizer, test=test)
elif db_method == 'normal':
    X, Y = get_XY(connection, tokenizer, test=test)
    X_to = X + Y
    Y_to = Y + X
elif db_method == 'raw_negative':
    X_to, Y_to, Z_to = get_raw_XY_negativeSampling(connection, tokenizer, test=test)

In [8]:
# Shape of the embedding matrix: (vocabulary rows, embedding dimension)
embedding_matrix.shape

(400007, 100)

# Single CBOW

Cada palabra v/s el concepto objetivo y viceversa.

<img src="imgs/CBOW-Diagrams-CBOW.drawio.png" width=600 height=600 />

In [8]:
def model_CBOW_1(vocab_size, embedding_matrix, embedding_dim=100, embedding_trainable=True,
                 activation_out='sigmoid'):
    '''
    Single-token model: embedding lookup followed by a dense layer over the
    whole vocabulary.

      Input:
        vocab_size - number of rows of the embedding and units of the output layer
        embedding_matrix - initial weights of the embedding layer
        embedding_dim - embedding dimension
        embedding_trainable - whether the embedding weights are trainable
        activation_out - activation of the output layer
      Output:
        the model, compiled with Adam (learning rate 0.001),
        sparse categorical cross-entropy and the accuracy metric

    NOTE(review): sparse categorical cross-entropy is normally paired with a
    softmax output; confirm that the 'sigmoid' default is intended.
    '''
    layers = tf.keras.layers

    # Input layer: one token id per sample
    target_ids = tf.keras.Input(shape=(1,), name='Target_Input')

    # Embedding layer initialised from the given matrix
    embedding_layer = layers.Embedding(vocab_size, embedding_dim,
                                       trainable=embedding_trainable,
                                       weights=[embedding_matrix],
                                       mask_zero=True)
    embedded_target = embedding_layer(target_ids)

    # Feed-forward layer producing one score per vocabulary entry
    projection = layers.Dense(units=vocab_size, kernel_initializer='he_normal',
                              activation=activation_out)
    scores = projection(embedded_target)

    # Model definition and compilation
    cbow = tf.keras.Model(inputs=[target_ids], outputs=[scores])
    cbow.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                 loss='sparse_categorical_crossentropy',
                 metrics=['accuracy'])

    return cbow

In [9]:
# Sigmoid-output variant. vocab_size uses the same expression that sized the
# embedding matrix in get_word_embeddings.
model = model_CBOW_1(vocab_size=len(tokenizer.word_index) + 1, 
                     embedding_matrix=embedding_matrix,
                     activation_out='sigmoid')
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Target_Input (InputLayer)    [(None, 1)]               0         
_________________________________________________________________
embedding (Embedding)        (None, 1, 100)            40000700  
_________________________________________________________________
dense (Dense)                (None, 1, 400007)         40400707  
Total params: 80,401,407
Trainable params: 80,401,407
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Name of the run (also the results sub-folder)
model_name = 'model_CBOW_1'

# Create the results folder
if not os.path.exists(f'Results/{model_name}'):
    os.makedirs(f'Results/{model_name}')

# Save the tokenizer next to the model so both can be reloaded together
with open(f'Results/{model_name}/tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)

# Training: fit is called once per loop iteration (no `epochs` argument) so
# that the model file can be overwritten after every pass
epochs = 100
for epoch in range(epochs):
    # Fit for this iteration
    model.fit(x=X_to, y=Y_to, batch_size=16, shuffle=True)
    # And save
    model.save(f'Results/{model_name}/model_test.h5')





### Versión softmax

In [9]:
# Softmax-output variant of the same architecture
model_2 = model_CBOW_1(vocab_size=len(tokenizer.word_index) + 1, 
                     embedding_matrix=embedding_matrix,
                     activation_out='softmax')
model_2.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Target_Input (InputLayer)    [(None, 1)]               0         
_________________________________________________________________
embedding (Embedding)        (None, 1, 100)            40000700  
_________________________________________________________________
dense (Dense)                (None, 1, 400007)         40400707  
Total params: 80,401,407
Trainable params: 80,401,407
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Name of the run (also the results sub-folder)
model_name = 'model_CBOW_1_softmax_test'

# Create the results folder
if not os.path.exists(f'Results/{model_name}'):
    os.makedirs(f'Results/{model_name}')

# Save the tokenizer next to the model
with open(f'Results/{model_name}/tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)

# Training: one fit call per loop iteration, overwriting the saved model each
# time. Requires the cell that defines model_2 to have been run first.
epochs = 250
for epoch in range(epochs):
    # Fit for this iteration
    model_2.fit(x=X_to, y=Y_to, batch_size=128, shuffle=True)
    # And save
    model_2.save(f'Results/{model_name}/model.h5')

NameError: name 'model_2' is not defined

# Semi - Siamese Networks


<img src="imgs/CBOW-Diagrams-Semi-Siamese.drawio.png" width=600 height=600 />

In [8]:
def semi_siamese_network(vocab_size, embedding_matrix, embedding_dim=100, embedding_trainable=True,
                         activation_out='sigmoid'):
    '''
    Two-input model that embeds a context token and a target token with one
    shared embedding layer, reduces their difference to `1 - norm` and feeds
    that value to a single-unit dense layer.

      Input:
        vocab_size - number of rows of the embedding layer
        embedding_matrix - initial weights of the embedding layer
        embedding_dim - embedding dimension
        embedding_trainable - whether the embedding weights are trainable
        activation_out - activation of the output layer
      Output:
        the model, compiled with Adam (learning rate 0.001) and MSE loss
    '''
    # Input layers: one token id each
    x_cont_in = tf.keras.Input(shape=(1,), name='Input_context')
    x_targ_in = tf.keras.Input(shape=(1,), name='Input_target')
    
    # Embedding layer (a single instance, shared by both inputs)
    embedd = tf.keras.layers.Embedding(vocab_size, embedding_dim, 
                                       trainable=embedding_trainable,
                                       weights=[embedding_matrix],
                                       mask_zero=True)
    # Apply it to both inputs
    x_cont = embedd(x_cont_in)
    x_targ = embedd(x_targ_in)
    
    # Difference layer: 1 minus the norm of the embedding difference.
    # NOTE(review): despite the `l1_norm` / 'L1_distance' names, tf.norm is
    # called without `ord`, so TensorFlow's default norm applies (Euclidean per
    # the TF docs -- confirm for the installed version).
    l1_norm = lambda x: 1 - tf.norm((x[0] - x[1]), axis=-1)
    distance = tf.keras.layers.Lambda(function=l1_norm, 
                                      output_shape=lambda x: x[0], 
                                      name='L1_distance')([x_cont, x_targ])
    
    # Feed-forward output layer
    x_out = tf.keras.layers.Dense(units=1, kernel_initializer='he_normal',
                                  activation=activation_out, name='Output')(distance)
    
    # Model definition
    model = tf.keras.Model(inputs=[x_cont_in, x_targ_in], outputs=x_out)
    
    # Compile the model
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mse'])
    
    return model

In [9]:
# Semi-siamese model with a linear output (activation_out=None)
model_3 = semi_siamese_network(vocab_size=len(tokenizer.word_index) + 1, 
                                 embedding_matrix=embedding_matrix,
                                 activation_out=None)
model_3.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input_context (InputLayer)      [(None, 1)]          0                                            
__________________________________________________________________________________________________
Input_target (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 100)       40000700    Input_context[0][0]              
                                                                 Input_target[0][0]               
__________________________________________________________________________________________________
L1_distance (Lambda)            (None, 1)            0           embedding[0][0]       

In [None]:
# Name of the run (also the results sub-folder)
model_name = 'model_CBOW_1_semiSiamese_test_v2'

# Create the results folder
if not os.path.exists(f'Results/{model_name}'):
    os.makedirs(f'Results/{model_name}')

# Save the tokenizer next to the model
with open(f'Results/{model_name}/tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)

# Training: every (context, target) pair is fitted against a target of 0, one
# fit call per loop iteration, overwriting the saved model each time
epochs = 500
for epoch in range(epochs):
    # Fit for this iteration
    model_3.fit(x={'Input_context': np.array(X_to), 'Input_target': np.array(Y_to)}, 
                y={'Output': np.zeros((len(Y_to), 1))}, batch_size=32, shuffle=True)
    # And save
    model_3.save(f'Results/{model_name}/model.h5')



# Semi siamese without Dense

<img src="imgs/CBOW-Diagrams-Semi-Siamese No Dense.drawio.png" width=600 height=600 />

In [11]:
def semi_siamese_network_v2(vocab_size, embedding_matrix, embedding_dim=100, embedding_trainable=True,
                            activation_out='sigmoid'):
    '''
    Two-input model whose output is directly the norm of the difference between
    the shared-embedding vectors of a context token and a target token (there
    is no dense layer).

      Input:
        vocab_size - number of rows of the embedding layer
        embedding_matrix - initial weights of the embedding layer
        embedding_dim - embedding dimension
        embedding_trainable - whether the embedding weights are trainable
        activation_out - accepted but not used by this function
      Output:
        the model, compiled with Adam (learning rate 0.001) and MSE loss
    '''
    # Input layers: one token id each
    x_cont_in = tf.keras.Input(shape=(1,), name='Input_context')
    x_targ_in = tf.keras.Input(shape=(1,), name='Input_target')
    
    # Embedding layer (a single instance, shared by both inputs)
    embedd = tf.keras.layers.Embedding(vocab_size, embedding_dim, 
                                       trainable=embedding_trainable,
                                       weights=[embedding_matrix],
                                       mask_zero=True)
    # Apply it to both inputs
    x_cont = embedd(x_cont_in)
    x_targ = embedd(x_targ_in)
    
    # Difference layer, which is also the model output (layer name 'Output').
    # NOTE(review): despite the `l1_norm` name, tf.norm is called without
    # `ord`, so TensorFlow's default norm applies (Euclidean per the TF docs --
    # confirm for the installed version).
    l1_norm = lambda x: tf.norm((x[0] - x[1]), axis=-1)
    distance = tf.keras.layers.Lambda(function=l1_norm, 
                                      output_shape=lambda x: x[0], 
                                      name='Output')([x_cont, x_targ])
    
    # Model definition
    model = tf.keras.Model(inputs=[x_cont_in, x_targ_in], outputs=distance)
    
    # Compile the model
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mse'])
    
    return model

In [12]:
# Semi-siamese model without the dense layer; activation_out is passed but the
# builder does not use it
model_3_1 = semi_siamese_network_v2(vocab_size=len(tokenizer.word_index) + 1, 
                                    embedding_matrix=embedding_matrix,
                                    activation_out=None)
model_3_1.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input_context (InputLayer)      [(None, 1)]          0                                            
__________________________________________________________________________________________________
Input_target (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 100)       40000700    Input_context[0][0]              
                                                                 Input_target[0][0]               
__________________________________________________________________________________________________
Output (Lambda)                 (None, 1)            0           embedding_1[0][0]     

In [14]:
# Name of the run (also the results sub-folder)
model_name = 'model_CBOW_1_semiSiamese_test_v3'

# Create the results folder
if not os.path.exists(f'Results/{model_name}'):
    os.makedirs(f'Results/{model_name}')

# Save the tokenizer next to the model
with open(f'Results/{model_name}/tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)

# Training: the model output (norm of the embedding difference) is fitted
# against 0 for every pair, one fit call per loop iteration, overwriting the
# saved model each time
epochs = 500
for epoch in range(epochs):
    # Fit for this iteration
    model_3_1.fit(x={'Input_context': np.array(X_to), 'Input_target': np.array(Y_to)}, 
                  y={'Output': np.zeros((len(Y_to), 1))}, batch_size=32, shuffle=True)
    # And save
    model_3_1.save(f'Results/{model_name}/model.h5')

























































# One sided Siamese Networks

<img src="imgs/CBOW-Diagrams-Siamese.drawio.png" width=600 height=600/>

In [11]:
def siamese_network(vocab_size, embedding_matrix, max_len=10, embedding_dim=100, 
                    embedding_trainable=True, activation_out='sigmoid'):
    '''
    Siamese model: a padded description (context) and a single concept token
    (target) go through the same embedding layer and the same bidirectional
    LSTM; the element-wise absolute difference of the two encodings feeds a
    single-unit dense layer.

      Input:
        vocab_size - number of rows of the embedding layer
        embedding_matrix - initial weights of the embedding layer
        max_len - length of the context input
        embedding_dim - embedding dimension, also used as the LSTM unit count
        embedding_trainable - whether the embedding weights are trainable
        activation_out - activation of the output layer
      Output:
        the model, compiled with Adam (learning rate 0.001) and MSE loss
    '''
    # Input layers: a sequence of max_len ids and a single id
    x_cont_in = tf.keras.Input(shape=(max_len,), name='Input_context')
    x_targ_in = tf.keras.Input(shape=(1,), name='Input_target')
    
    # Embedding layer (shared; note that masking is disabled here)
    embedd = tf.keras.layers.Embedding(vocab_size, embedding_dim, 
                                       trainable=embedding_trainable,
                                       weights=[embedding_matrix],
                                       mask_zero=False)
    # Apply it to both inputs
    x_cont = embedd(x_cont_in)
    x_targ = embedd(x_targ_in)
    
    # Bidirectional LSTM; the same layer instance encodes both the description
    # and the target
    lstm = tf.keras.layers.Bidirectional( 
            tf.keras.layers.LSTM(units=embedding_dim, kernel_initializer='he_normal',
                                dropout=0.2, recurrent_dropout=0.2))    
    x_cont = lstm(x_cont)
    x_targ = lstm(x_targ)
       
    
    # Difference layer: element-wise absolute difference of the two encodings
    l1_norm = lambda x: tf.keras.backend.abs(x[0] - x[1])
    distance = tf.keras.layers.Lambda(function=l1_norm, 
                                      output_shape=lambda x: x[0], 
                                      name='L1_distance')([x_cont, x_targ])
    
    # Feed-forward output layer (the print shows the encoded context shape
    # every time the model is built)
    print(x_cont.shape)
    x_out = tf.keras.layers.Dense(units=1, kernel_initializer='he_normal',
                                  activation=activation_out, name='Output')(distance)
    
    # Model definition
    model = tf.keras.Model(inputs=[x_cont_in, x_targ_in], outputs=x_out)
    
    # Compile the model
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mse'])
    
    return model

In [12]:
# One-sided siamese model with a linear output. max_len is read from
# X_to.shape[1], so X_to must be an array here (the 'normal' db_method builds a
# plain list).
model_4 = siamese_network(vocab_size=len(tokenizer.word_index) + 1, 
                          embedding_matrix=embedding_matrix,
                          max_len=X_to.shape[1],
                          activation_out=None)
model_4.summary()

(None, 200)
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input_context (InputLayer)      [(None, 25)]         0                                            
__________________________________________________________________________________________________
Input_target (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           multiple             40000700    Input_context[0][0]              
                                                                 Input_target[0][0]               
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 200)          160800      embedding[

In [13]:
# Name of the run (also the results sub-folder)
model_name = 'model_CBOW_1_Siamese_test'

# Create the results folder
if not os.path.exists(f'Results/{model_name}'):
    os.makedirs(f'Results/{model_name}')

# Save the tokenizer next to the model
with open(f'Results/{model_name}/tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)

# Training: every (description, concept) pair is fitted against a target of 0
# with batch size 1, one fit call per loop iteration, overwriting the saved
# model each time
epochs = 500
for epoch in range(epochs):
    # Fit for this iteration
    model_4.fit(x={'Input_context': np.array(X_to), 'Input_target': np.array(Y_to)}, 
                y={'Output': np.zeros((len(Y_to), 1))}, batch_size=1, shuffle=True)
    # And save
    model_4.save(f'Results/{model_name}/model.h5')





# Traditional Siamese Networks

In [9]:
def siamese_network_v2(vocab_size, embedding_matrix, max_len=10, embedding_dim=100, 
                    embedding_trainable=True, activation_out='sigmoid'):
    '''
    Siamese model scored with cosine similarity: a padded description (context)
    and a single concept token (target) go through the same masked embedding
    layer and the same bidirectional LSTM; the cosine similarity of the two
    encodings feeds the dense output layer.

      Input:
        vocab_size - number of rows of the embedding layer
        embedding_matrix - initial weights of the embedding layer
        max_len - length of the context input
        embedding_dim - embedding dimension, also used as the LSTM unit count
        embedding_trainable - whether the embedding weights are trainable
        activation_out - activation of the output layer; 'softmax' gets two
                         output units, any other value (including None) one
      Output:
        the model, compiled with Adam (learning rate 0.001), binary
        cross-entropy and the accuracy metric
    '''
    # Input layers: a sequence of max_len ids and a single id
    x_cont_in = tf.keras.Input(shape=(max_len,), name='Input_context')
    x_targ_in = tf.keras.Input(shape=(1,), name='Input_target')
    
    # Embedding layer (shared by both inputs, id 0 is masked)
    embedd = tf.keras.layers.Embedding(vocab_size, embedding_dim, 
                                       trainable=embedding_trainable,
                                       weights=[embedding_matrix],
                                       mask_zero=True)
    # Apply it to both inputs
    x_cont = embedd(x_cont_in)
    x_targ = embedd(x_targ_in)
    
    # Bidirectional LSTM; the same layer instance encodes both the description
    # and the target
    lstm = tf.keras.layers.Bidirectional( 
            tf.keras.layers.LSTM(units=embedding_dim, kernel_initializer='he_normal',
                                dropout=0.2, recurrent_dropout=0.2
                                ))    
    x_cont = lstm(x_cont)
    x_targ = lstm(x_targ)
       
    
    # Similarity layer: cosine similarity of the two encodings (the layer keeps
    # its historical name 'L1_distance' so saved models stay compatible)
    cos_sim = \
        lambda x: tf.keras.backend.batch_dot(tf.keras.backend.l2_normalize(x[0], axis=-1), 
                                             tf.keras.backend.l2_normalize(x[1], axis=-1), 
                                             axes=-1)
    distance = tf.keras.layers.Lambda(function=cos_sim, 
                                      output_shape=lambda x: x[0], 
                                      name='L1_distance')([x_cont, x_targ])
    
    # Feed-forward output layer. Any activation other than 'softmax' uses a
    # single unit; previously anything but 'sigmoid'/'softmax' left n_units
    # undefined and the function crashed.
    n_units = 2 if activation_out == 'softmax' else 1
    x_out = tf.keras.layers.Dense(units=n_units, kernel_initializer='he_normal',
                                  activation=activation_out, name='Output')(distance)
    
    # Model definition
    model = tf.keras.Model(inputs=[x_cont_in, x_targ_in], outputs=x_out)
    
    # Compile the model
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [10]:
# Traditional siamese model with a sigmoid output. max_len is read from
# X_to.shape[1], so X_to must be an array here (the 'normal' db_method builds a
# plain list).
model_5 = siamese_network_v2(vocab_size=len(tokenizer.word_index) + 1, 
                          embedding_matrix=embedding_matrix,
                          max_len=X_to.shape[1],
                          activation_out='sigmoid')
model_5.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input_context (InputLayer)      [(None, 25)]         0                                            
__________________________________________________________________________________________________
Input_target (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           multiple             40000700    Input_context[0][0]              
                                                                 Input_target[0][0]               
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 200)          160800      embedding[0][0]       

In [11]:
# Name of the run (also the results sub-folder)
model_name = 'model_CBOW_1_Siamese_traditional_test_v2(masking)'

# Create the results folder
if not os.path.exists(f'Results/{model_name}'):
    os.makedirs(f'Results/{model_name}')

# Save the tokenizer next to the model
with open(f'Results/{model_name}/tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)

# Training with the match labels Z_to, which only exist when db_method is
# 'raw_negative'; one fit call per loop iteration, overwriting the saved model
# each time
epochs = 200
for epoch in range(epochs):
    # Fit for this iteration
    model_5.fit(x={'Input_context': X_to, 'Input_target': Y_to},
                y={'Output': Z_to}, batch_size=16, shuffle=True)
    # And save
    model_5.save(f'Results/{model_name}/model.h5')







# Módulo de obtención de pesos

In [9]:
# Reload the trained model and the tokenizer that was saved with it.
# NOTE(review): pickle.load executes arbitrary code from the file -- only load
# tokenizer files produced by this notebook.
model_name = 'model_CBOW_1'
model_weights = tf.keras.models.load_model(f'Results/{model_name}/model_test.h5')
with open(f'Results/{model_name}/tokenizer.pkl', 'rb') as file:
    tokenizer = pickle.load(file)

In [10]:
# List the layers of the reloaded model to locate the embedding layer
model_weights.layers

[<tensorflow.python.keras.engine.input_layer.InputLayer at 0x1f3cc785e80>,
 <tensorflow.python.keras.layers.embeddings.Embedding at 0x1f3cc785c70>,
 <tensorflow.python.keras.layers.core.Dense at 0x1f3cc7937c0>]

In [11]:
# Extract the trained embedding weights: layer 1 is the Embedding layer in the
# listing above, and its first weight tensor is the embedding matrix
tr_embedding_matrix = model_weights.layers[1].weights[0].numpy()
display(tr_embedding_matrix.shape)
tr_embedding_matrix

(400007, 100)

array([[ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [-0.85697 , -0.16452 ,  0.090192, ...,  0.018941,  0.057016,
        -0.77519 ],
       ...,
       [ 1.3757  ,  0.62687 , -0.64018 , ..., -0.91214 , -0.11763 ,
        -0.55478 ],
       [-0.053613, -0.37776 , -0.72317 , ...,  0.73281 , -0.086304,
         0.092891],
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ]], dtype=float32)

In [12]:
def query_embeddings(sentence, tokenizer, embedding_matrix, connection, 
                     tokenizer_type='custom', split_bool=False, 
                     distance_metric='euclidean', test=False):
    """Rank the health-effect concepts by embedding proximity to a query.

    The concept names and descriptions come from
    ``preprocess_HE_decriptions(connection, test=test)``. Both the query and
    the names are tokenized, looked up in ``embedding_matrix`` and compared.

    Parameters
    ----------
    sentence : query passed to ``tokenizer.texts_to_sequences``
        (the notebook passes a list of words -- assumed, confirm for other uses).
    tokenizer : object exposing ``texts_to_sequences``.
    embedding_matrix : array indexed by token id; rows are embedding vectors.
    connection : database connection forwarded to ``preprocess_HE_decriptions``.
    tokenizer_type : 'custom' (forwards ``split_bool``) or 'tensorflow'.
    split_bool : forwarded to the custom tokenizer for the query only.
    distance_metric : 'cosine' (mean cosine similarity, higher is better) or
        'euclidean' (mean *squared* Euclidean distance, lower is better).
    test : forwarded to ``preprocess_HE_decriptions``.

    Returns
    -------
    list of ``(name, score, description)`` tuples, best match first.

    Raises
    ------
    ValueError
        If ``tokenizer_type`` or ``distance_metric`` is not supported.
    """
    # Fail early with a clear message instead of an unbound-variable error later
    if tokenizer_type not in ('custom', 'tensorflow'):
        raise ValueError(f"Unsupported tokenizer_type: {tokenizer_type!r} "
                         "(expected 'custom' or 'tensorflow')")
    if distance_metric not in ('cosine', 'euclidean'):
        raise ValueError(f"Unsupported distance_metric: {distance_metric!r} "
                         "(expected 'cosine' or 'euclidean')")

    # Key concepts to search over
    names, descriptions = preprocess_HE_decriptions(connection, test=test)

    # Token ids for the query and for the concept names
    if tokenizer_type == 'custom':
        tokens = tokenizer.texts_to_sequences(sentence, split_bool=split_bool)
        names_toks = tokenizer.texts_to_sequences(names, split_bool=False)
    else:
        tokens = tokenizer.texts_to_sequences(sentence)
        names_toks = tokenizer.texts_to_sequences(names)

    # Associated embeddings
    embedded_tokens = embedding_matrix[tokens]
    embedded_names = embedding_matrix[names_toks]

    if distance_metric == 'cosine':
        name_norms = np.linalg.norm(embedded_names, 2, axis=1)[:, np.newaxis]
        token_norms = np.linalg.norm(embedded_tokens, 2, axis=1)[np.newaxis, :]
        # All-zero embedding rows have zero norm; dividing by it would give NaN
        # and make the ranking meaningless. Their dot product is already 0, so
        # dividing by 1 yields a similarity of 0 for those entries.
        name_norms[name_norms == 0] = 1
        token_norms[token_norms == 0] = 1
        metric = np.dot(embedded_names, embedded_tokens.T)
        metric = metric / name_norms
        metric = metric / token_norms
        # Average over the query tokens -> one score per concept
        metric = metric.mean(axis=1)
    else:
        # Squared Euclidean distance (no square root is taken) from every
        # concept to each query token, averaged over the query tokens
        metric = np.array([
            np.sum((embedded_names - emb_tok) ** 2, axis=1)
            for emb_tok in embedded_tokens
        ]).mean(axis=0)

    # Attach names/descriptions and sort: similarities descending, distances ascending
    concepts = [(names[i], value, descriptions[i]) for i, value in enumerate(metric)]
    concepts.sort(key=lambda x: x[1], reverse=(distance_metric == 'cosine'))

    return concepts
    

# Single-word query ranked against the concept names with cosine similarity
sentence = ['sweating']
a = query_embeddings(
    sentence,
    tokenizer,
    tr_embedding_matrix,
    connection,
    split_bool=False,
    distance_metric='cosine',
    test=test,
)

# Print one ranked concept per line
for ranked_concept in a:
    print(ranked_concept)

[169560]
('anesthetic', 0.5128166, ['drug', 'causes', 'anesthesia', 'reversible', 'loss', 'sensation'])
('androgenic', 0.49520603, ['agent', 'usually', 'steroid', 'hormone', 'stimulates', 'controls', 'development', 'maintenance', 'male', 'characteristics', 'vertebrates', 'binding', 'androgen', 'receptors', 'includes', 'activity', 'accessory', 'male', 'sex', 'organs', 'development', 'male', 'secondary', 'sex', 'characteristics'])
('energizer', 0.4759444, ['agent', 'supplies', 'energy', 'typically', 'kinetic', 'another'])
('sudorific', 0.41351315, ['agent', 'drug', 'induces', 'sweating'])
('anti syncopic', 0.39649156, ['agent', 'relieves', 'symptoms', 'fainting', 'low', 'heart', 'rate', 'low', 'blood', 'pressure', 'low', 'blood', 'flow', 'brain'])
('stimulant', 0.38202187, ['agent', 'drug', 'induces', 'temporary', 'improvements', 'either', 'mental', 'physical', 'functions', 'examples', 'kinds', 'effects', 'may', 'include', 'enhanced', 'alertness', 'wakefulness', 'locomotion', 'among', 'o

# Módulo de reentrenamiento

In [8]:
# Run to resume training from; its artifacts live under Results/<filename>/
filename = 'model_CBOW_1_Siamese_traditional_test_v2(masking)'
re_model = tf.keras.models.load_model(f'Results/{filename}/model.h5')
re_model.summary()
# Load the tokenizer from the same folder as the model. The path is built from
# `filename` (not the notebook-wide `model_name`, which other cells reassign)
# so the model and tokenizer always belong to the same run.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(f'Results/{filename}/tokenizer.pkl', 'rb') as file:
    tokenizer = pickle.load(file)

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input_context (InputLayer)      [(None, 25)]         0                                            
__________________________________________________________________________________________________
Input_target (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           multiple             40000700    Input_context[0][0]              
                                                                 Input_target[0][0]               
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 200)          160800      embedding[0][0]       

In [9]:
# Number of additional training iterations
epochs_extra = 500

# `filename` may be a bare run name (artifacts in Results/<name>/model.h5) or a
# legacy 'Results/<name>.h5' path. Reduce both to the bare name so the branch
# selection below works, and save back to where the model was loaded from.
if filename.endswith('.h5'):
    model_key = os.path.basename(filename)[:-len('.h5')]
    save_path = filename
else:
    model_key = filename
    save_path = f'Results/{filename}/model.h5'

for epoch in range(epochs_extra):
    # One fit() call per iteration; the inputs depend on the architecture
    if model_key == 'model_CBOW_1_semiSiamese_test':
        re_model.fit(x={'Input_context': np.array(X_to), 'Input_target': np.array(Y_to)}, 
                     y={'Output': np.zeros((len(Y_to), 1))}, batch_size=1, shuffle=True)
    elif model_key in ('model_CBOW_1_Siamese_traditional_test', 
                       'model_CBOW_1_Siamese_traditional_test_v2(masking)'):
        # Siamese models take the named context/target inputs and the Z_to labels
        re_model.fit(x={'Input_context': X_to, 'Input_target': Y_to},
                     y={'Output': Z_to}, batch_size=16, shuffle=True)
    else:
        re_model.fit(x=X_to, y=Y_to, batch_size=32, shuffle=True)
    # Overwrite the checkpoint after every iteration
    re_model.save(save_path)













# Módulo de testeo del modelo

In [17]:
# Run whose artifacts are tested below (reassigns the notebook-wide `model_name`)
model_name = 'model_CBOW_1'
# Load the saved Keras model; note the file read here is model_test.h5
model_test = tf.keras.models.load_model(f'Results/{model_name}/model_test.h5')
# Restore the tokenizer stored in the same results folder.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(f'Results/{model_name}/tokenizer.pkl', 'rb') as file:
    tokenizer = pickle.load(file)

In [10]:
def model_testing(model, sentence, tokenizer, connection, 
                  tokenizer_type='custom', split_bool=False, 
                  test=False, maxlen=25):
    """Score every health-effect concept against a query with a trained model.

    The concept names and descriptions come from
    ``preprocess_HE_decriptions(connection, test=test)``. The query is
    tokenized and post-padded to ``maxlen``; the model is then called once per
    concept name with the padded query as 'Input_context' and the name's
    token(s) as 'Input_target'.

    Parameters
    ----------
    model : Keras model accepting 'Input_context' and 'Input_target' inputs.
    sentence : query passed to ``tokenizer.texts_to_sequences``.
    tokenizer : object exposing ``texts_to_sequences``.
    connection : database connection forwarded to ``preprocess_HE_decriptions``.
    tokenizer_type : 'custom' (forwards ``split_bool``) or 'tensorflow'.
    split_bool : forwarded to the custom tokenizer for the query only.
    test : forwarded to ``preprocess_HE_decriptions``.
    maxlen : padded length of the context input. Defaults to 25, the value
        that was previously hard-coded; it should match the model's
        'Input_context' length.

    Returns
    -------
    list of ``(name, score, description)`` tuples sorted by descending score,
    where ``score`` is the model output times 100, rounded to 3 decimals.

    Raises
    ------
    ValueError
        If ``tokenizer_type`` is not supported.
    """
    # Fail early with a clear message instead of an unbound-variable error later
    if tokenizer_type not in ('custom', 'tensorflow'):
        raise ValueError(f"Unsupported tokenizer_type: {tokenizer_type!r} "
                         "(expected 'custom' or 'tensorflow')")

    # Key concepts to search over
    names, descriptions = preprocess_HE_decriptions(connection, test=test)

    # Token ids for the query and for the concept names
    if tokenizer_type == 'custom':
        tokens = tokenizer.texts_to_sequences(sentence, split_bool=split_bool)
        names_toks = tokenizer.texts_to_sequences(names, split_bool=False)
    else:
        tokens = tokenizer.texts_to_sequences(sentence)
        names_toks = tokenizer.texts_to_sequences(names)

    # Pad the query as a batch of one context sequence
    tokens = tf.keras.preprocessing.sequence.pad_sequences([tokens], maxlen=maxlen, padding='post')

    # One prediction per concept name
    results = list()
    for i, name_i in enumerate(names_toks):
        y_pred = model.predict(x={'Input_context': tokens, 'Input_target': np.array([name_i])})
        results.append((names[i], round(y_pred[0,0] * 100, 3), descriptions[i]))

    # Highest score first
    results.sort(key=lambda x: x[1], reverse=True)

    return results


# Score every concept against a single-word query and list them best first
sentence = ['energizer']
a = model_testing(
    model_test,
    sentence,
    tokenizer,
    connection,
    tokenizer_type='custom',
    split_bool=False,
    test=True,
)
# Print one scored concept per line
for scored_concept in a:
    print(scored_concept)

('stimulant', 20.006, ['agent', 'drug', 'induces', 'temporary', 'improvements', 'either', 'mental', 'physical', 'functions', 'examples', 'kinds', 'effects', 'may', 'include', 'enhanced', 'alertness', 'wakefulness', 'locomotion', 'among', 'others'])
('androgenic', 19.959, ['agent', 'usually', 'steroid', 'hormone', 'stimulates', 'controls', 'development', 'maintenance', 'male', 'characteristics', 'vertebrates', 'binding', 'androgen', 'receptors', 'includes', 'activity', 'accessory', 'male', 'sex', 'organs', 'development', 'male', 'secondary', 'sex', 'characteristics'])
('chronotropic', 18.698, ['agent', 'drug', 'change', 'heart', 'rate', 'affecting', 'nerves', 'controlling', 'heart', 'changing', 'rhythm', 'produced', 'sinoatrial', 'node'])
('anti syncopic', 17.934, ['agent', 'relieves', 'symptoms', 'fainting', 'low', 'heart', 'rate', 'low', 'blood', 'pressure', 'low', 'blood', 'flow', 'brain'])
('sudorific', 17.527, ['agent', 'drug', 'induces', 'sweating'])
('anesthetic', 17.38, ['drug',

## Testeo

In [12]:
# Queries to score against the concept list; add new ones here instead of
# copy-pasting the block below
test_sentences = [
    ['hormone', 'stimulation', 'male', 'features'],
    ['change', 'heart', 'rate'],
    ['energy'],
    ['less', 'sensation', 'low'],
]

# `sentence` and `a` are kept as the loop names so the cell leaves the same
# globals behind as before (the last query and its results)
for query_idx, sentence in enumerate(test_sentences):
    print(f'{sentence}\n------------------------------------------------')
    a = model_testing(model_test, sentence, tokenizer, connection, 
                      tokenizer_type='custom', split_bool=False, 
                      test=True)
    for i in a:
        print(i)
    # Blank separator between queries only (none after the last one)
    if query_idx < len(test_sentences) - 1:
        print('\n\n')

['hormone', 'stimulation', 'male', 'features']
------------------------------------------------
('androgenic', 45.911, ['agent', 'usually', 'steroid', 'hormone', 'stimulates', 'controls', 'development', 'maintenance', 'male', 'characteristics', 'vertebrates', 'binding', 'androgen', 'receptors', 'includes', 'activity', 'accessory', 'male', 'sex', 'organs', 'development', 'male', 'secondary', 'sex', 'characteristics'])
('anesthetic', 14.87, ['drug', 'causes', 'anesthesia', 'reversible', 'loss', 'sensation'])
('energizer', 6.808, ['agent', 'supplies', 'energy', 'typically', 'kinetic', 'another'])
('stimulant', 5.842, ['agent', 'drug', 'induces', 'temporary', 'improvements', 'either', 'mental', 'physical', 'functions', 'examples', 'kinds', 'effects', 'may', 'include', 'enhanced', 'alertness', 'wakefulness', 'locomotion', 'among', 'others'])
('sudorific', 4.783, ['agent', 'drug', 'induces', 'sweating'])
('anti syncopic', 3.424, ['agent', 'relieves', 'symptoms', 'fainting', 'low', 'heart', '