# Imports e inicialización del servidor

In [1]:
import mysql.connector
import os, getpass, re, requests
import gensim, nltk, unidecode
import wikipedia
import pandas as pd
import numpy as np
import spacy
import cvxpy as cp
import tensorflow as tf
from tqdm import tqdm
from mysql.connector import Error
from itertools import combinations
from ast import literal_eval
from collections import defaultdict
from scipy.spatial import distance
from bs4 import BeautifulSoup
from urllib.request import urlopen
K = 10

In [2]:
# Definición de parámetros de conexión
connection_params = {
    'host': 'localhost',
    'user': 'cmescobar',
    'database': 'foodb',
    'password': getpass.getpass(prompt='Introduzca la contraseña: ')
}

try:
    connection = mysql.connector.connect(**connection_params)

    if connection.is_connected():
        db_Info = connection.get_server_info()
        print("Connected to MySQL Server version ", db_Info)
        cursor = connection.cursor()

except Error as e:
    print("Error while connecting to MySQL", e)

Introduzca la contraseña: ········
Connected to MySQL Server version  8.0.26


# Obteniendo el dataframe de información

In [3]:
# Obteniendo la información del texto obtenido mediante wikipedia
df = pd.read_pickle('Summary/Food_wikitext.pkl')
df

Unnamed: 0,id,name,name_scientific,description,wikipedia_id,wikipedia_text
0,1,Angelica,Angelica keiskei,Angelica is a genus of about 60 species of tal...,Angelica,About 50 species; see text Angelica is a genu...
1,2,Savoy cabbage,Brassica oleracea var. sabauda,Savoy cabbage (Brassica oleracea convar. capit...,Savoy cabbage,Savoy cabbage (Brassica oleracea var. sabauda ...
2,3,Silver linden,Tilia argentea,Tilia tomentosa (Silver Lime in the UK and Sil...,Tilia tomentosa,"Tilia tomentosa, known as silver linden in th..."
3,4,Kiwi,Actinidia chinensis,"The kiwifruit, often shortened to kiwi in many...",Kiwifruit,Kiwifruit (commonly shortened to kiwi in Nort...
4,5,Allium,Allium,Allium haematochiton is a species of wild onio...,Allium haematochiton,Allium haematochiton is a North American spec...
...,...,...,...,...,...,...
848,934,Asparagus fern,Asparagus setaceus,"Asparagus setaceus, commonly called common asp...",Asparagus_setaceus,"Asparagus setaceus, commonly known as common ..."
849,935,Thornless blackberry,Rubus ulmifolius,"Rubus ulmifolius, commonly called thornless bl...",Rubus_ulmifolius,Rubus ulmifolius is a species of wild blackbe...
850,936,Tropical highland blackberry,Rubus adenotrichos,Rubus adenotrichos is commonly called tropical...,Rubus_adenotrichos,Rubus adenotrichus Schltdl. Rubus adenotricho...
851,937,Andean blackberry,Rubus glaucus,"Rubus glaucus, commonly called Andean blackber...",Rubus_glaucus,"Rubus glaucus, commonly known as mora de Cast..."


# Preprocesando el texto

In [4]:
# Descargando las stopwords y el wordnet para el lemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Chris-
[nltk_data]     Brota\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Chris-
[nltk_data]     Brota\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
def preprocess_text_OLD(text, alphanumeric=True):
    # Tokenizar palabras ignorando la puntuación
    if alphanumeric:
        text = ' '.join(re.findall(r'\w+', text))
    else:
        text = ' '.join(re.findall(r'[A-Za-z]+', text))
    
    # Creando el lemmatizer y pasando a minúsculas
    lemmatizer = spacy.load('en_core_web_trf')
    tokens = lemmatizer(text)
    
    # Eliminando las stopwords
    keywords= [token.lemma_ for token in tokens 
               if token not in nltk.corpus.stopwords.words('english')]
    return keywords


def preprocess_text(text, alphanumeric=True):
    def _pos_tagger(nltk_tag):
        if nltk_tag.startswith('J'):
            return nltk.corpus.wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return nltk.corpus.wordnet.VERB
        elif nltk_tag.startswith('N'):
            return nltk.corpus.wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return nltk.corpus.wordnet.ADV
        else:         
            return None
    
    
    def _lemmatize(tokens):
        # Creando el lemmatizer y pasando a minúsculas
        lemmatizer = nltk.stem.WordNetLemmatizer()
        
        # Tokenizar la oración y encontrar el POS tag para cada token
        pos_tagged = nltk.pos_tag(tokens) 
        
        # Mapeando a los tags de interés y pasando a minúscula
        wordnet_tagged = list(map(lambda x: (x[0].lower(), _pos_tagger(x[1])), pos_tagged))
        
        # Definición de la oración lemmatimizada
        return [word if tag is None else lemmatizer.lemmatize(word, tag)
                for word, tag in wordnet_tagged]
    
    
    # Tokenizar palabras ignorando la puntuación
    if alphanumeric:
        tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    else:
        tokenizer = nltk.tokenize.RegexpTokenizer(r'[A-Za-z]+')
    tokens = tokenizer.tokenize(text)
    
    # Obtener los lemas
    lemmas = _lemmatize(tokens)

    # Eliminando las stopwords
    keywords = [lemma for lemma in lemmas if lemma not in nltk.corpus.stopwords.words('english')]
    
    return keywords


def get_data(dataframe):
    # Definición de la bag of words
    X_data = list()
    Y_data = list()
    
    # Definición de la lista de alimentos con errores
    error_list = list()
    
    # Para cada texto en la tabla
    for index, row in tqdm(dataframe.iterrows(), ncols=70, desc='Text'):
        # Obtener el texto
        text = row['wikipedia_text']
        # Preprocesando
        try:
            text = preprocess_text(text, alphanumeric=False)
            X_data.append(text)
            Y_data.append(row['name'])  
        except:
            print('No funciono:', text, index, row)
            error_list.append((index, row))
            
    return X_data, Y_data, error_list

# Definiendo los datos de entrada y salida

In [None]:
X_data, Y_data, error_list = get_data(df)

Text: 64it [00:20,  2.93it/s]

No funciono:  Saffron (pronounced /ˈsæfrən/ or /ˈsæfrɒn/) is a spice derived from the flower of Crocus sativus, commonly known as the "saffron crocus". The vivid crimson stigma and styles, called threads, are collected and dried for use mainly as a seasoning and colouring agent in food. Saffron has long been the world's costliest spice by weight. Although some doubts remain on its origin, it is believed that saffron originated in Iran. However, Greece and Mesopotamia have also been suggested as the possible region of origin of this plant. Harold McGee states that it was domesticated in or near Greece during the Bronze Age. C. sativus is possibly a triploid form of Crocus cartwrightianus, which is also known as "wild saffron". Saffron crocus slowly propagated throughout much of Eurasia and was later brought to parts of North Africa, North America, and Oceania. Saffron's taste and iodoform-like or hay-like fragrance result from the phytochemicals picrocrocin and safranal. It also contain

Text: 256it [01:21,  2.37it/s]Exception ignored in: <function SeekableUnicodeStreamReader.__del__ at 0x000001456D4FF430>
Traceback (most recent call last):
  File "c:\users\chris-brota\appdata\local\programs\python\python38\lib\site-packages\nltk\data.py", line 1160, in __del__
    self.close()
  File "c:\users\chris-brota\appdata\local\programs\python\python38\lib\site-packages\nltk\data.py", line 1195, in close
    self.stream.close()
KeyboardInterrupt: 
Text: 269it [01:25,  4.18it/s]

No funciono:  The red king crab (Paralithodes camtschaticus), also called Kamchatka crab or Alaskan king crab, is a species of king crab native to the far northern Pacific Ocean, including the Bering Sea and Gulf of Alaska, but also introduced to the Barents Sea. It grows to a leg span of 1.8 m (5.9 ft), and is heavily targeted by fisheries. The red king crab is the largest species of king crab. Red king crabs can reach a carapace width up to 28 cm (11 in), a leg span of 1.8 m (5.9 ft), and a weight of 12.7 kg (28 lb). Males grow larger than females. Today, red king crabs infrequently surpass 17 cm (7 in) in carapace width and the average male landed in the Bering Sea weighs 2.9 kg (6.4 lb). It was named after the color it turns when it is cooked rather than the color of a living animal, which tends to be more burgundy. The red king crab is native to the Bering Sea, North Pacific Ocean, around the Kamchatka Peninsula and neighboring Alaskan waters. It was introduced artificially by the

Text: 282it [01:28,  4.04it/s]

In [None]:
np.savez('Summary/training_data.npz', X_data=X_data, Y_data=Y_data)

# Cargar datos

In [5]:
data = np.load('Summary/training_data.npz', allow_pickle=True)
X_data = data['X_data']
Y_data = data['Y_data']

In [12]:
Y_labels = tf.keras.utils.to_categorical([i for i in range(len(Y_data))], 
                                          num_classes=None, dtype='int')

# Definición del bag of words

In [6]:
# Definición del set de todas las palabras que se encuentran
# en las descripciones
bag_of_words = set()

# Definición de la lista de largo de cada texto
len_texts = list()

for line in X_data:
    len_texts.append(len(line))
    for i in line:
        bag_of_words.add(i)

In [7]:
max(len_texts)

6312

# Tokenizando

In [8]:
# Definición del token
tokenizer = tf.keras.preprocessing.text.Tokenizer()

# Ajustando el tokenizer a los textos
tokenizer.fit_on_texts(list(bag_of_words))

# Definiendo el tamaño del vocabulario
vocab_size = len(tokenizer.word_index) + 1

# Una vez definido el token para cada palabra, se transforman los textos mediante cada token
sequences = tokenizer.texts_to_sequences(X_data)

# Dado que las secuencias no tienen el mismo largo, se paddea para obtener una matriz
X_token = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=1000)
print(X_token.shape)
X_token

(853, 1000)


array([[    0,     0,     0, ..., 35412, 15019,  3961],
       [    0,     0,     0, ..., 46872, 37500, 12713],
       [    0,     0,     0, ...,  6768, 19797,  5683],
       ...,
       [    0,     0,     0, ..., 42356, 17040, 17187],
       [    0,     0,     0, ..., 15991, 45824, 18269],
       [19102,  2740, 10184, ..., 21242, 15204, 32287]])

# Creación de los modelos

In [9]:
def model_1(vocab_size, out_dim, embedding_dim=100, embedding_trainable=True):
    # Capa de entrada
    x_in = tf.keras.layers.Input(shape=(None,))
    
    # Capa de embebido
    x = tf.keras.layers.Embedding(vocab_size, embedding_dim, 
                                  trainable=embedding_trainable)(x_in)
    
    # Red RNN + MLP
    x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, dropout=0.5))(x)
    x = tf.keras.layers.Dense(32, activation='relu')(x)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    
    # Capa de salida
    x_out = tf.keras.layers.Dense(out_dim, activation='sigmoid')(x)
    
    # define the model
    model = tf.keras.Model(inputs=x_in, outputs=x_out)
    
    return model
    

def model_2(vocab_size, out_dim, embedding_dim=100, embedding_trainable=False):
    def _get_embedding_matrix():
        pass
    
    
    # Capa de entrada
    x_in = tf.keras.layers.Input(shape=(None,))
    
    # Capa de embebido
    x = tf.keras.layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], 
                                  trainable=embedding_trainable)(x_in)
    
    # Red RNN + MLP
    x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, dropout=0.5))(x)
    x = tf.keras.layers.Dense(32, activation='relu')(x)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    
    # Capa de salida
    x_out = tf.keras.layers.Dense(out_dim, activation='sigmoid')(x)
    
    # define the model
    model = tf.keras.Model(inputs=x_in, outputs=x_out)
    
    return model

In [10]:
# Parámetros
embedding_dim = 100
embedding_trainable = True

model = model_1(vocab_size, out_dim=len(Y_data), embedding_dim=embedding_dim, 
                embedding_trainable=embedding_trainable)

In [14]:
# Compilando
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

#Implementing model checkpoins to save the best metric and do not lose it on training.
# checkpoint = tf.keras.callbacks.ModelCheckpoint("best_model1.hdf5", monitor='val_accuracy', 
#                                                 verbose=1, save_best_only=True, 
#                                                 mode='auto', period=1,save_weights_only=False)

# Implementando un EarlyStopping
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)

# Entrenando
history = model.fit(X_token, Y_labels, epochs=100, callbacks=[early_stopping], batch_size=32)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100


# RNN prediction

In [15]:
text_to = ['citric fruit'.split(' ')]
print(text_to)

# Una vez definido el token para cada palabra, se transforman los textos mediante cada token
text_to = tokenizer.texts_to_sequences(text_to)
print(text_to)

# # Dado que las secuencias no tienen el mismo largo, se paddea para obtener una matriz
# text_to = tf.keras.preprocessing.sequence.pad_sequences(text_to, maxlen=200)

[['citric', 'fruit']]
[[5205, 25999]]


In [20]:
a = model.predict(text_to)
b = [(num, i) for num, i in enumerate(a[0])]
b.sort(key=lambda x: x[1], reverse=True)

for i in b:
    print(Y_data[i[0]], round(i[1] * 100, 2), '%')

Butternut squash 44.53 %
Pili nut 44.25 %
Atlantic menhaden 44.24 %
Sherry 44.23 %
Common hazelnut 44.19 %
Common dab 44.19 %
Safflower 44.17 %
Soft-necked garlic 44.17 %
Rainbow smelt 44.16 %
Pepper (C. chinense) 44.15 %
Black mulberry 44.15 %
Winter squash 44.15 %
Rabbit 44.14 %
Chinese water chestnut 44.14 %
Breadfruit 44.13 %
Swede 44.13 %
Sourdough 44.13 %
Deerberry 44.13 %
Domestic pig 44.13 %
Bivalvia (Clam, Mussel, Oyster) 44.13 %
Cloudberry 44.12 %
Port wine 44.12 %
Chives 44.12 %
Hippoglossus (Common halibut) 44.11 %
Candy bar 44.11 %
Velvet duck 44.1 %
Lettuce 44.1 %
Mandarin orange (Clementine, Tangerine) 44.1 %
Avocado 44.09 %
Turnip 44.08 %
Mulberry 44.08 %
Sake 44.08 %
Pistachio 44.08 %
European rabbit 44.08 %
Celeriac 44.08 %
Vanilla 44.08 %
Jicama 44.08 %
Baked beans 44.08 %
Dripping 44.08 %
Soy sauce 44.08 %
Turkey 44.07 %
Whelk 44.07 %
Date 44.06 %
Orange bell pepper 44.06 %
Evaporated milk 44.06 %
Sweet bay 44.06 %
Miso 44.06 %
Red bell pepper 44.06 %
Pacific sardin