# Imports e inicialización del servidor

In [1]:
import mysql.connector
import os, getpass, re, requests
import gensim
import pandas as pd
import numpy as np
import cvxpy as cp
import tensorflow as tf
from mysql.connector import Error
from itertools import combinations
from ast import literal_eval
from collections import defaultdict
from scipy.spatial import distance
from bs4 import BeautifulSoup
K = 10

In [2]:
# Definición de parámetros de conexión
connection_params = {
    'host': 'localhost',
    'user': 'cmescobar',
    'database': 'foodb',
    'password': getpass.getpass(prompt='Introduzca la contraseña: ')
}

try:
    connection = mysql.connector.connect(**connection_params)

    if connection.is_connected():
        db_Info = connection.get_server_info()
        print("Connected to MySQL Server version ", db_Info)
        cursor = connection.cursor()

except Error as e:
    print("Error while connecting to MySQL", e)

Introduzca la contraseña: ········
Connected to MySQL Server version  8.0.26


# Abriendo el modelo GloVe entrenado en Wikipedia

# Definición de la función objetivo customizada

In [3]:
def weighted_binary_crossentropy(y_true, y_pred):
    y_true = tf.cast(y_true, tf.float64)
    y_pred = tf.cast(y_pred, tf.float64)
    bcross_ent = - 1 / len(y_true) * (y_true * tf.keras.backend.log(y_pred) + 
                                      (1 - y_true) * tf.keras.backend.log(1 - y_pred))
    weighted_bcross_ent = ((K - 1) * y_true + 1) * bcross_ent
    
    return tf.reduce_sum(weighted_bcross_ent)

In [4]:
# Cargando el embedding en memoria
embeddings_index = dict()
with open('C:/Users/Chris-Brota/Desktop/glove.6b/glove.6B.100d.txt', 'r', encoding='utf8') as file:
    for line in file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print(f'Vector de {len(embeddings_index)} palabras cargadas.')

Vector de 400001 palabras cargadas.


In [4]:
print(embeddings_index.get('uwhasdkjd'))

None


# Cargando los datos

# Obteniendo la información de la tabla de interés

In [5]:
query = \
'''SELECT name, name_scientific, description, wikipedia_id
   FROM foods f;
'''

# Solicitud de la query
df = pd.read_sql(query, connection)

# Limpieza de datos

In [6]:
# Eliminando los alimentos con None en la descripción
df_clean = df.dropna(subset=['description'])
df_clean

Unnamed: 0,name,name_scientific,description,wikipedia_id
0,Angelica,Angelica keiskei,Angelica is a genus of about 60 species of tal...,Angelica
1,Savoy cabbage,Brassica oleracea var. sabauda,Savoy cabbage (Brassica oleracea convar. capit...,Savoy cabbage
2,Silver linden,Tilia argentea,Tilia tomentosa (Silver Lime in the UK and Sil...,Tilia tomentosa
3,Kiwi,Actinidia chinensis,"The kiwifruit, often shortened to kiwi in many...",Kiwifruit
4,Allium,Allium,Allium haematochiton is a species of wild onio...,Allium haematochiton
...,...,...,...,...
986,White bread,,White bread typically refers to breads made fr...,
987,Cape gooseberry,Physalis peruviana,"Physalis peruviana, a plant species of the gen...",
988,Herbal tea,,Herbal teas are the beverages made from the in...,
989,Fish oil,,Fish oil is oil derived from the tissues of oi...,


# Definición de los datos de entrada y salida

In [7]:
X_data = df_clean['description']
Y_data = df_clean['name']

In [8]:
Y_labels = tf.keras.utils.to_categorical([i for i in range(len(Y_data))], 
                                         num_classes=None, dtype='int')

# Limpieza del texto

In [9]:
def depure_text(text):
    #Removing URLs with a regular expression
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    text = url_pattern.sub(r'', text)

    # Remove Emails
    text = re.sub('\S*@\S*\s?', '', text)

    # Remove new line characters
    text = re.sub('\s+', ' ', text)

    # Remove distracting single quotes
    text = re.sub("\'", "", text)
        
    return text


def sent_to_words(descriptions):
    '''Función que preprocesa el texto, y además elimina palabras muy cortas o 
    muy largas.
    
    Parameters
    ----------
    descriptions : list
        Lista de descripciones a revisar.
    '''
    for desc in descriptions:
        yield gensim.utils.simple_preprocess(depure_text(desc), deacc=True)
        

def lowercase_words(words):
    return [word.lower() for word in words]
        
        
def detokenize(text):
    return nltk.tokenize.treebank.TreebankWordDetokenizer().detokenize(text)

In [10]:
# Obteniendo la lista de descripciones
X_data = list(sent_to_words(X_data))
Y_data2 = list(sent_to_words(Y_data))

In [11]:
# Definición del set de todas las palabras que se encuentran
# en las descripciones
bag_of_words = set()

for line in X_data:
    for i in line:
        bag_of_words.add(i)
        
for line in Y_data2:
    for i in line:
        bag_of_words.add(i)

In [12]:
len(bag_of_words)

11332

# Tokenization y Embeddings

In [13]:
# Definición del token
tokenizer = tf.keras.preprocessing.text.Tokenizer()

# Ajustando el tokenizer a los textos
tokenizer.fit_on_texts(list(bag_of_words))

# Definiendo el tamaño del vocabulario
vocab_size = len(tokenizer.word_index) + 1

# Una vez definido el token para cada palabra, se transforman los textos mediante cada token
sequences = tokenizer.texts_to_sequences(X_data)

# Dado que las secuencias no tienen el mismo largo, se paddea para obtener una matriz
X_token = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=200)
print(X_token.shape)
X_token

(950, 200)


array([[    0,     0,     0, ...,  1430,  5841,  2996],
       [    0,     0,     0, ...,  9294,  1430, 10912],
       [    0,     0,     0, ...,  1430,  6884, 11108],
       ...,
       [    0,     0,     0, ...,   973,  8097,  8032],
       [    0,     0,     0, ..., 10570, 11299,  9207],
       [    0,     0,     0, ...,  4452,  6587,  4452]])

# Checkeo de las palabras objetivo

In [14]:
not_found = []
found = []

for food_i in Y_data2:
    for i in food_i:
        if embeddings_index.get(i) is None:
            not_found.append(i)
            # print(f'Word {i} not in vocab')
        else:
            found.append(i)

In [15]:
found

['angelica',
 'savoy',
 'cabbage',
 'silver',
 'linden',
 'kiwi',
 'allium',
 'garden',
 'onion',
 'leek',
 'garlic',
 'chives',
 'lemon',
 'verbena',
 'cashew',
 'nut',
 'pineapple',
 'dill',
 'custard',
 'apple',
 'wild',
 'celery',
 'peanut',
 'burdock',
 'horseradish',
 'tarragon',
 'mugwort',
 'asparagus',
 'oat',
 'star',
 'fruit',
 'brazil',
 'nut',
 'common',
 'beet',
 'borage',
 'chinese',
 'mustard',
 'swede',
 'rape',
 'common',
 'cabbage',
 'cauliflower',
 'brussel',
 'sprouts',
 'kohlrabi',
 'broccoli',
 'chinese',
 'cabbage',
 'turnip',
 'pigeon',
 'pea',
 'tea',
 'capers',
 'pepper',
 'papaya',
 'safflower',
 'caraway',
 'pecan',
 'nut',
 'chestnut',
 'roman',
 'camomile',
 'chickpea',
 'endive',
 'chicory',
 'chinese',
 'cinnamon',
 'ceylon',
 'cinnamon',
 'watermelon',
 'lime',
 'lemon',
 'mandarin',
 'orange',
 'clementine',
 'tangerine',
 'sweet',
 'orange',
 'coffee',
 'arabica',
 'coffee',
 'robusta',
 'coffee',
 'coriander',
 'common',
 'hazelnut',
 'saffron',
 'm

# Una vez cargados los datos, es necesario crear la matriz de Embedding E

In [16]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [17]:
Y_data2

[['angelica'],
 ['savoy', 'cabbage'],
 ['silver', 'linden'],
 ['kiwi'],
 ['allium'],
 ['garden', 'onion'],
 ['leek'],
 ['garlic'],
 ['chives'],
 ['lemon', 'verbena'],
 ['cashew', 'nut'],
 ['pineapple'],
 ['dill'],
 ['custard', 'apple'],
 ['wild', 'celery'],
 ['peanut'],
 ['burdock'],
 ['horseradish'],
 ['tarragon'],
 ['mugwort'],
 ['asparagus'],
 ['oat'],
 ['star', 'fruit'],
 ['brazil', 'nut'],
 ['common', 'beet'],
 ['borage'],
 ['chinese', 'mustard'],
 ['swede'],
 ['rape'],
 ['common', 'cabbage'],
 ['cauliflower'],
 ['brussel', 'sprouts'],
 ['kohlrabi'],
 ['broccoli'],
 ['chinese', 'cabbage'],
 ['turnip'],
 ['pigeon', 'pea'],
 ['tea'],
 ['capers'],
 ['pepper'],
 ['papaya'],
 ['safflower'],
 ['caraway'],
 ['pecan', 'nut'],
 ['chestnut'],
 ['roman', 'camomile'],
 ['chickpea'],
 ['endive'],
 ['chicory'],
 ['chinese', 'cinnamon'],
 ['ceylon', 'cinnamon'],
 ['watermelon'],
 ['lime'],
 ['lemon'],
 ['pummelo'],
 ['mandarin', 'orange', 'clementine', 'tangerine'],
 ['sweet', 'orange'],
 ['coff

In [18]:
embedding_matrix.shape

(11333, 100)

# Testing: Buscando las palabras próximas

In [19]:
test_phrase = 'citric fruit'
test_phrase = test_phrase.split()

In [20]:
test_seq = tokenizer.texts_to_sequences(test_phrase)
test_seq

[[7091], [6255]]

In [21]:
np.squeeze(np.array(test_seq)).shape

(2,)

In [23]:
embedded_words = embedding_matrix[np.squeeze(np.array(test_seq))]
mean_words = embedded_words.mean(axis=0)
display(mean_words.shape)
mean_words

(100,)

array([-2.92229995e-01,  3.38079996e-01, -1.66065000e-01, -8.01560022e-02,
        7.19269991e-01, -7.87404981e-02,  2.35774994e-01,  1.26304999e-01,
        1.02714993e-01, -3.63299992e-01, -6.37120008e-01,  1.84214994e-01,
        2.29149967e-01,  3.99593990e-01,  2.21654985e-01,  6.97304994e-01,
       -2.25730002e-01, -3.96500155e-03,  5.74855000e-01,  3.34066000e-01,
       -7.46230036e-02, -6.02365475e-01,  5.73520005e-01,  1.13944992e-01,
       -1.02430001e-01,  1.04918002e+00, -5.84500000e-01, -6.65800005e-01,
       -6.36994988e-01,  3.63750011e-03,  1.90754980e-01,  4.89419997e-01,
        1.51950002e-01, -9.31050003e-01, -3.14884990e-01,  2.03944996e-01,
        4.87160012e-01,  1.35622003e-01,  4.80830014e-01, -5.46064995e-01,
        3.20112497e-01, -6.58584997e-01, -4.33014996e-01, -7.51014978e-01,
        8.55825007e-01,  6.57368012e-01, -2.91510001e-01, -3.28245007e-01,
       -4.64629993e-01, -4.72310007e-01, -1.75798499e-01, -6.98455006e-01,
       -2.02080004e-01,  

In [24]:
index_key = []
embedding_all = []
for key, value in embeddings_index.items():
    index_key.append(key)
    embedding_all.append(value)

embedding_all = np.array(embedding_all)

In [25]:
cos_sim = np.dot(embedding_all, mean_words) / \
            (np.linalg.norm(embedding_all, ord=2, axis=1) * np.linalg.norm(mean_words, ord=2))

cos_sim = [(num, i) for num, i in enumerate(cos_sim)]

In [26]:
cos_sim.sort(key=lambda x: x[1], reverse=True)

In [30]:
words_out = [(index_key[i[0]], f'{round((i[1] + 1) / 2 * 100, 2)}%') for i in cos_sim[:30]]
words_out

[('fruit', '88.96%'),
 ('citric', '88.81%'),
 ('fruits', '83.92%'),
 ('juice', '83.86%'),
 ('vegetables', '82.81%'),
 ('syrup', '82.77%'),
 ('citrus', '82.58%'),
 ('flavors', '82.05%'),
 ('milk', '81.78%'),
 ('grape', '81.75%'),
 ('vegetable', '81.67%'),
 ('mango', '81.31%'),
 ('corn', '81.14%'),
 ('sugar', '81.05%'),
 ('lysine', '80.87%'),
 ('yogurt', '80.77%'),
 ('acid', '80.33%'),
 ('honey', '80.24%'),
 ('flavor', '80.22%'),
 ('coconut', '79.97%'),
 ('ingredients', '79.66%'),
 ('coffee', '79.66%'),
 ('barley', '79.64%'),
 ('malt', '79.3%'),
 ('beans', '79.28%'),
 ('lemon', '79.12%'),
 ('vinegar', '79.07%'),
 ('edible', '79.05%'),
 ('organic', '78.94%'),
 ('dried', '78.92%')]