In [1]:
import mysql.connector
import os, getpass, re, requests
import gensim
import pandas as pd
import numpy as np
import cvxpy as cp
import tensorflow as tf
import matplotlib.pyplot as plt
from mysql.connector import Error
from itertools import combinations
from ast import literal_eval
from collections import defaultdict
from scipy.spatial import distance
from sklearn.manifold import TSNE
from bs4 import BeautifulSoup
import plotly.graph_objects as go
from nltk.corpus import stopwords
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [2]:
# Connection parameters. The password is requested interactively so that it is
# never stored in the notebook.
connection_params = {
    'host': 'localhost',
    'user': 'cmescobar',
    'database': 'foodb',
    'password': getpass.getpass(prompt='Introduzca la contraseña: ')
}

try:
    connection = mysql.connector.connect(**connection_params)

    if connection.is_connected():
        db_Info = connection.get_server_info()
        print("Connected to MySQL Server version ", db_Info)
        cursor = connection.cursor()

except Error as e:
    print("Error while connecting to MySQL", e)
    # Re-raise instead of continuing: the following cells need `connection`,
    # so swallowing the error would only postpone the failure to a confusing
    # NameError (or silently reuse a stale connection from a previous run).
    raise

Introduzca la contraseña: ········
Connected to MySQL Server version  8.0.26


# Definición del modo de funcionamiento

In [3]:
# Prefix prepended to every description before it is tokenized for T5
task_prefix = 'health_effects: '

# Tokenizer selector (not read by the visible cells -- presumably consumed
# elsewhere in the notebook; verify before changing)
tokenizer_func = 'custom'

# Whether to work on a reduced set of concepts
test = True

# If True, the dataset is built for the plain siamese network;
# otherwise it is built for everything else
raw_XY = True

In [4]:
def preprocess_HE_decriptions(connection, test=False, to_list=True):
    """Load the health-effect names and their normalized descriptions.

    The description of each health effect is taken from ``description`` and
    falls back to ``chebi_definition`` when the former is NULL; rows where
    both are NULL are excluded by the query.

    Each description is stripped, reduced to word and whitespace characters
    (punctuation is dropped), has runs of whitespace collapsed into a single
    space and is lowercased.

    Parameters
    ----------
    connection
        Open database connection handed to ``pd.read_sql``.
    test : bool, default False
        If True, keep only the rows whose ``name`` is listed (one per line)
        in ``Summary/names_to_rev.txt``.
    to_list : bool, default True
        If True, each description is returned as a list of tokens with the
        English stop words removed; otherwise it is returned as one string.

    Returns
    -------
    names : list
        Values of the ``name`` column.
    descriptions : list
        One cleaned description per name, in the same order.
    """
    # Query sent to the database
    sql_query = '''
        SELECT name, chebi_name, IF(description IS NULL, chebi_definition, description) as definitions 
        FROM foodb.health_effects he
        WHERE description IS NOT NULL OR chebi_definition IS NOT NULL
    '''

    # Fetch the dataframe
    dataframe = pd.read_sql(sql_query, con=connection)

    # Restrict to a short list of concepts for quick testing
    if test:
        with open('Summary/names_to_rev.txt', 'r', encoding='utf8') as file:
            concepts = [line.strip() for line in file]

        dataframe = dataframe[dataframe['name'].isin(concepts)]

    # Names of the health effects
    names = list(dataframe['name'])

    # Build the stop-word set once: calling stopwords.words() for every token
    # re-creates the whole list each time and makes the filter very slow.
    stop_words = set(stopwords.words('english')) if to_list else None

    # Cleaned descriptions
    descriptions = list()
    for d in dataframe['definitions']:
        # Remove leading/trailing whitespace (including the line break)
        txt = d.strip()

        # Keep only word and whitespace characters (drops punctuation)
        txt = ''.join(re.findall(r'[\w\s]+', txt))

        # Collapse runs of whitespace into a single space
        txt = re.sub(r'\s+', ' ', txt)

        # Lowercase
        txt = txt.lower()

        # Tokenize and filter out stop words
        if to_list:
            txt = [i for i in txt.split() if i not in stop_words]

        descriptions.append(txt)

    return names, descriptions

In [5]:
# Fetch the concept names and their cleaned descriptions. to_list=False keeps
# every description as a single string, which the `task_prefix + sequence`
# concatenation in the encoding cell requires.
names, descriptions = preprocess_HE_decriptions(connection, test=test, to_list=False)

# Training

In [6]:
# Load the pretrained "t5-base" tokenizer and conditional-generation model
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

In [7]:
# Task-specific hyperparameters: length limits for the inputs and the targets
max_source_length, max_target_length = 512, 128

# Prepend the task prefix to every description, then tokenize the whole batch
# (padded to the longest sequence, truncated at the source limit)
prefixed_descriptions = [task_prefix + text for text in descriptions]
encoding = tokenizer(
    prefixed_descriptions,
    padding='longest',
    max_length=max_source_length,
    truncation=True,
    return_tensors="pt",
)
input_ids = encoding.input_ids
attention_mask = encoding.attention_mask

In [None]:
# Encode the targets (the concept names). Requesting PyTorch tensors directly,
# as the source encoding does, avoids the manual list -> tensor conversion,
# which relied on a `torch` name that this notebook never imports.
target_encoding = tokenizer(names, padding='longest',
                            max_length=max_target_length,
                            truncation=True,
                            return_tensors="pt")

# Work on a copy so `target_encoding` keeps the original token ids
labels = target_encoding.input_ids.clone()

# Replace the padding token ids of the labels by -100
labels[labels == tokenizer.pad_token_id] = -100

# Forward pass
loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss