# Imports e inicialización del servidor

In [37]:
import mysql.connector
import os, getpass, re, requests
import gensim, nltk, unidecode
import wikipedia
import pandas as pd
import numpy as np
import spacy
import cvxpy as cp
import tensorflow as tf
from tqdm import tqdm
from mysql.connector import Error
from itertools import combinations
from ast import literal_eval
from collections import defaultdict
from scipy.spatial import distance
from bs4 import BeautifulSoup
from urllib.request import urlopen
K = 10

In [38]:
# Definición de parámetros de conexión
connection_params = {
    'host': 'localhost',
    'user': 'cmescobar',
    'database': 'foodb',
    'password': getpass.getpass(prompt='Introduzca la contraseña: ')
}

try:
    connection = mysql.connector.connect(**connection_params)

    if connection.is_connected():
        db_Info = connection.get_server_info()
        print("Connected to MySQL Server version ", db_Info)
        cursor = connection.cursor()

except Error as e:
    print("Error while connecting to MySQL", e)

Introduzca la contraseña: ········
Connected to MySQL Server version  8.0.26


# Obteniendo el dataframe de información

In [39]:
# Obteniendo la información del texto obtenido mediante wikipedia
df = pd.read_pickle('Summary/Food_wikitext.pkl')
df

Unnamed: 0,id,name,name_scientific,description,wikipedia_id,wikipedia_text
0,1,Angelica,Angelica keiskei,Angelica is a genus of about 60 species of tal...,Angelica,About 50 species; see text Angelica is a genu...
1,2,Savoy cabbage,Brassica oleracea var. sabauda,Savoy cabbage (Brassica oleracea convar. capit...,Savoy cabbage,Savoy cabbage (Brassica oleracea var. sabauda ...
2,3,Silver linden,Tilia argentea,Tilia tomentosa (Silver Lime in the UK and Sil...,Tilia tomentosa,"Tilia tomentosa, known as silver linden in th..."
3,4,Kiwi,Actinidia chinensis,"The kiwifruit, often shortened to kiwi in many...",Kiwifruit,Kiwifruit (commonly shortened to kiwi in Nort...
4,5,Allium,Allium,Allium haematochiton is a species of wild onio...,Allium haematochiton,Allium haematochiton is a North American spec...
...,...,...,...,...,...,...
848,934,Asparagus fern,Asparagus setaceus,"Asparagus setaceus, commonly called common asp...",Asparagus_setaceus,"Asparagus setaceus, commonly known as common ..."
849,935,Thornless blackberry,Rubus ulmifolius,"Rubus ulmifolius, commonly called thornless bl...",Rubus_ulmifolius,Rubus ulmifolius is a species of wild blackbe...
850,936,Tropical highland blackberry,Rubus adenotrichos,Rubus adenotrichos is commonly called tropical...,Rubus_adenotrichos,Rubus adenotrichus Schltdl. Rubus adenotricho...
851,937,Andean blackberry,Rubus glaucus,"Rubus glaucus, commonly called Andean blackber...",Rubus_glaucus,"Rubus glaucus, commonly known as mora de Cast..."


# Preprocesando el texto

In [40]:
# Descargando las stopwords y el wordnet para el lemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Chris-
[nltk_data]     Brota\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Chris-
[nltk_data]     Brota\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
def preprocess_text_OLD(text, alphanumeric=True):
    # Tokenizar palabras ignorando la puntuación
    if alphanumeric:
        text = ' '.join(re.findall(r'\w+', text))
    else:
        text = ' '.join(re.findall(r'[A-Za-z]+', text))
    
    # Creando el lemmatizer y pasando a minúsculas
    lemmatizer = spacy.load('en_core_web_trf')
    tokens = lemmatizer(text)
    
    # Eliminando las stopwords
    keywords= [token.lemma_ for token in tokens 
               if token not in nltk.corpus.stopwords.words('english')]
    return keywords


def preprocess_text(text, alphanumeric=True):
    def _pos_tagger(nltk_tag):
        if nltk_tag.startswith('J'):
            return nltk.corpus.wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return nltk.corpus.wordnet.VERB
        elif nltk_tag.startswith('N'):
            return nltk.corpus.wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return nltk.corpus.wordnet.ADV
        else:         
            return None
    
    
    def _lemmatize(tokens):
        # Creando el lemmatizer y pasando a minúsculas
        lemmatizer = nltk.stem.WordNetLemmatizer()
        
        # Tokenizar la oración y encontrar el POS tag para cada token
        pos_tagged = nltk.pos_tag(tokens) 
        
        # Mapeando a los tags de interés y pasando a minúscula
        wordnet_tagged = list(map(lambda x: (x[0].lower(), _pos_tagger(x[1])), pos_tagged))
        
        # Definición de la oración lemmatimizada
        return [word if tag is None else lemmatizer.lemmatize(word, tag)
                for word, tag in wordnet_tagged]
    
    
    # Tokenizar palabras ignorando la puntuación
    if alphanumeric:
        tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    else:
        tokenizer = nltk.tokenize.RegexpTokenizer(r'[A-Za-z]+')
    tokens = tokenizer.tokenize(text)
    
    # Obtener los lemas
    lemmas = _lemmatize(tokens)

    # Eliminando las stopwords
    keywords = [lemma for lemma in lemmas if lemma not in nltk.corpus.stopwords.words('english')]
    
    return keywords


def get_data(dataframe):
    # Definición de la bag of words
    X_data = list()
    Y_data = list()
    
    # Definición de la lista de alimentos con errores
    error_list = list()
    
    # Para cada texto en la tabla
    for index, row in tqdm(dataframe.iterrows(), ncols=70, desc='Text'):
        # Obtener el texto
        text = row['wikipedia_text']
        # Preprocesando
        try:
            text = preprocess_text(text, alphanumeric=False)
            X_data.append(text)
            Y_data.append(row['name'])  
        except:
            print('No funciono:', text, index, row)
            error_list.append((index, row))
            
    return X_data, Y_data, error_list

# Definiendo los datos de entrada y salida

In [8]:
X_data, Y_data, error_list = get_data(df)

Text: 418it [02:08,  6.58it/s]

No funciono: None 418 id                                                               444
name                                                 French plantain
name_scientific                                   Musa X paradisiaca
description        Musa × paradisiaca is the accepted name for th...
wikipedia_id                                      Musa × paradisiaca
wikipedia_text                                                  None
Name: 418, dtype: object


Text: 616it [03:14,  4.77it/s]

No funciono: None 616 id                                                               661
name                                                          Dragée
name_scientific                                                 None
description        A dragée is a bite-sized, colorful form of con...
wikipedia_id                                                  Dragée
wikipedia_text                                                  None
Name: 616, dtype: object


Text: 628it [03:20,  1.95it/s]

No funciono: None 628 id                                                               674
name                                                            Pate
name_scientific                                                 None
description        Pâté is a mixture of ground meat and fat mince...
wikipedia_id                                                    Pâté
wikipedia_text                                                  None
Name: 628, dtype: object


Text: 740it [03:52,  7.33it/s]

No funciono: None 738 id                                                               799
name                                                         Cupuaçu
name_scientific                               Theobroma grandiflorum
description        Cupuaçu (also spelled Cupuassu, Cupuazú, and C...
wikipedia_id                                                 Cupuaçu
wikipedia_text                                                  None
Name: 738, dtype: object


Text: 849it [04:22,  6.32it/s]

No funciono: None 845 id                                                               931
name                                                       Yali pear
name_scientific                               Pyrus × bretschneideri
description        Yali pear, also called Chinese White Pear, is ...
wikipedia_id                                  Pyrus_×_bretschneideri
wikipedia_text                                                  None
Name: 845, dtype: object


Text: 853it [04:22,  3.25it/s]


In [None]:
np.savez('Summary/training_data.npz', X_data=X_data, Y_data=Y_data)

In [23]:
Y_labels = tf.keras.utils.to_categorical([i for i in range(len(Y_data))], 
                                          num_classes=None, dtype='int')

# Definición del bag of words

In [12]:
# Definición del set de todas las palabras que se encuentran
# en las descripciones
bag_of_words = set()

# Definición de la lista de largo de cada texto
len_texts = list()

for line in X_data:
    len_texts.append(len(line))
    for i in line:
        bag_of_words.add(i)

In [13]:
max(len_texts)

6312

# Tokenizando

In [14]:
# Definición del token
tokenizer = tf.keras.preprocessing.text.Tokenizer()

# Ajustando el tokenizer a los textos
tokenizer.fit_on_texts(list(bag_of_words))

# Definiendo el tamaño del vocabulario
vocab_size = len(tokenizer.word_index) + 1

# Una vez definido el token para cada palabra, se transforman los textos mediante cada token
sequences = tokenizer.texts_to_sequences(X_data)

# Dado que las secuencias no tienen el mismo largo, se paddea para obtener una matriz
X_token = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=1000)
print(X_token.shape)
X_token

(848, 1000)


array([[    0,     0,     0, ..., 19230,  8370, 30722],
       [    0,     0,     0, ..., 13180, 25045, 23866],
       [    0,     0,     0, ...,  3811, 10129,  8545],
       ...,
       [    0,     0,     0, ..., 14769, 25223, 28923],
       [    0,     0,     0, ..., 26646, 38145,  4870],
       [43262, 21403, 30931, ...,  2318,  8864, 19417]])

# Creación de los modelos

In [20]:
def model_1(vocab_size, out_dim, embedding_dim=100, embedding_trainable=True):
    # Capa de entrada
    x_in = tf.keras.layers.Input(shape=(None,))
    
    # Capa de embebido
    x = tf.keras.layers.Embedding(vocab_size, embedding_dim, 
                                  trainable=embedding_trainable)(x_in)
    
    # Red RNN + MLP
    x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, dropout=0.5))(x)
    x = tf.keras.layers.Dense(32, activation='relu')(x)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    
    # Capa de salida
    x_out = tf.keras.layers.Dense(out_dim, activation='sigmoid')(x)
    
    # define the model
    model = tf.keras.Model(inputs=x_in, outputs=x_out)
    
    return model
    

def model_2(vocab_size, out_dim, embedding_dim=100, embedding_trainable=False):
    def _get_embedding_matrix():
        pass
    
    
    # Capa de entrada
    x_in = tf.keras.layers.Input(shape=(None,))
    
    # Capa de embebido
    x = tf.keras.layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], 
                                  trainable=embedding_trainable)(x_in)
    
    # Red RNN + MLP
    x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, dropout=0.5))(x)
    x = tf.keras.layers.Dense(32, activation='relu')(x)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    
    # Capa de salida
    x_out = tf.keras.layers.Dense(out_dim, activation='sigmoid')(x)
    
    # define the model
    model = tf.keras.Model(inputs=x_in, outputs=x_out)
    
    return model

In [21]:
# Parámetros
embedding_dim = 100
embedding_trainable = True

model = model_1(vocab_size, out_dim=len(Y_data), embedding_dim=embedding_dim, 
                embedding_trainable=embedding_trainable)

In [25]:
# Compilando
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

#Implementing model checkpoins to save the best metric and do not lose it on training.
# checkpoint = tf.keras.callbacks.ModelCheckpoint("best_model1.hdf5", monitor='val_accuracy', 
#                                                 verbose=1, save_best_only=True, 
#                                                 mode='auto', period=1,save_weights_only=False)

# Implementando un EarlyStopping
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)

# Entrenando
history = model.fit(X_token, Y_labels, epochs=100, callbacks=[early_stopping], batch_size=32)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


# RNN prediction

In [34]:
text_to = ['citric'.split(' ')]
print(text_to)

# Una vez definido el token para cada palabra, se transforman los textos mediante cada token
text_to = tokenizer.texts_to_sequences(text_to)
print(text_to)

# # Dado que las secuencias no tienen el mismo largo, se paddea para obtener una matriz
# text_to = tf.keras.preprocessing.sequence.pad_sequences(text_to, maxlen=200)

[['citric']]
[[37048]]


In [35]:
a = model.predict(text_to)
b = [(num, i) for num, i in enumerate(a[0])]
b.sort(key=lambda x: x[1], reverse=True)

for i in b:
    print(Y_data[i[0]])

Meatball
Eggs
Energy drink
Nutritional drink
Greenland halibut/turbot
Ostrich
Dried milk
Lime
Arctic blackberry
Broad bean
Whitefish
King mackerel
Lemon balm
Curry powder
Avocado
Crisp bread
Blackcurrant
Ascidians
Butterfat
Naranjilla
Black huckleberry
Chinese chestnut
Peppermint
Soy bean
Whisky
Sweet bay
Other candy
Ucuhuba
Carp bream
Epazote
Anise
Trail mix
Black salsify
Roe
Falafel
Spirit
Dock
Chinese water chestnut
Horseradish tree
Wheat bread
Rock ptarmigan
Cumin
Malabar plum
Domestic pig
Rape
Chili
Cracker
Chinese cinnamon
Sherry
Common cabbage
Pummelo
Breadnut tree seed
Chicken
Napa cabbage
Kohlrabi
Jerusalem artichoke
Tostada
Semolina
Other snack food
Lingonberry
Pink salmon
Acorn
Cherry tomato
Atlantic pollock
Colorado pinyon
Wild leek
Vinegar
Purslane
Savoy cabbage
Mugwort
White lupine
Beefalo
Pitanga
Shark
Romaine lettuce
Cinnamon
Columbidae (Dove, Pigeon)
Common persimmon
Coriander
Borage
Mountain hare
Fruits
Sweet marjoram
Corn grits
Bowhead whale
Egg substitute
Peanut
Coc