# Imports e inicialización del servidor

In [2]:
import mysql.connector
import os, getpass, re, requests
import gensim
import pandas as pd
import numpy as np
import cvxpy as cp
import tensorflow as tf
import matplotlib.pyplot as plt
from mysql.connector import Error
from itertools import combinations
from ast import literal_eval
from collections import defaultdict
from scipy.spatial import distance
from sklearn.manifold import TSNE
from bs4 import BeautifulSoup
import plotly.graph_objects as go
# NOTE(review): K is not referenced in the visible cells — confirm where it is used
K = 10

In [3]:
def cosine_similarity(A, b):
    """Return the cosine similarity between every row of ``A`` and ``b``.

    Args:
        A: Array whose rows are compared against ``b``; it is reduced
            along ``axis=1``.
        b: Reference vector, multiplied element-wise against the rows
            of ``A``.

    Returns:
        The row-wise dot products divided by the product of each row's
        Euclidean norm and the Euclidean norm of ``b``.
    """
    # Numerator: dot product of each row with the reference vector
    row_dots = (A * b).sum(axis=1)

    # Denominator: per-row norm scaled by the norm of the reference
    row_norms = np.sqrt((A ** 2).sum(axis=1))
    ref_norm = np.sqrt((b ** 2).sum())

    return row_dots / (row_norms * ref_norm)


def find_Nprox_words(terms_to_rev, N_words, metric='euclidean'):
    """Find, for each term, its closest entries in the embedding matrix.

    The function reads three module-level names: ``embeddings_index``
    (term -> vector), ``embedding_matrix`` (one vector per row) and
    ``embedding_words`` (row position -> word).

    NOTE(review): the visible loading cell builds ``embeddings_matrix`` and
    ``idx2word``; confirm that ``embedding_matrix`` and ``embedding_words``
    are defined elsewhere before calling this function.

    Args:
        terms_to_rev: Iterable of terms to look up. Terms missing from
            ``embeddings_index`` are skipped silently.
        N_words: Number of neighbours to keep per term.
        metric: ``'euclidean'`` ranks by ascending squared Euclidean
            distance; ``'cosine'`` ranks by descending cosine similarity
            rescaled as ``(cos + 1) / 2``.

    Returns:
        Dict mapping each found term to a list of
        ``(word, row_index, score)`` tuples, best first. The top-ranked row
        is dropped, presumably because it is the query term itself.

    Raises:
        ValueError: If ``metric`` is not one of the supported values.
    """
    # Fail early instead of hitting an unbound local inside the loop
    if metric not in ('euclidean', 'cosine'):
        raise ValueError('Parámetro "metric" no válido. Intente nuevamente.')

    word_dict_out = dict()

    for term in terms_to_rev:
        embed_term = embeddings_index.get(term)

        if embed_term is None:
            continue

        if metric == 'euclidean':
            # Squared distance: smaller is closer
            dif_metric = np.sum((embedding_matrix - embed_term) ** 2, axis=1)
            order = np.argsort(dif_metric, kind='stable')
        else:
            # Rescaled cosine similarity: larger is closer
            dif_metric = (cosine_similarity(embedding_matrix, embed_term) + 1) / 2
            order = np.argsort(-dif_metric, kind='stable')

        # A stable argsort keeps tied rows in their original order, as the
        # list sort did, without building a Python tuple per vocabulary row
        word_dict_out[term] = [(embedding_words[int(row)], int(row), dif_metric[row])
                               for row in order[1:1 + N_words]]

    return word_dict_out


def find_Nprox_words_joint(terms_to_rev, N_words, metric='euclidean'):
    """Find the entries closest to the mean embedding of several terms.

    The function reads three module-level names: ``embeddings_index``
    (term -> vector), ``embedding_matrix`` (one vector per row) and
    ``embedding_words`` (row position -> word).

    NOTE(review): the visible loading cell builds ``embeddings_matrix`` and
    ``idx2word``; confirm that ``embedding_matrix`` and ``embedding_words``
    are defined elsewhere before calling this function.

    Args:
        terms_to_rev: Sequence of string terms. Terms missing from
            ``embeddings_index`` do not contribute to the mean, but they
            still appear in the output key.
        N_words: Number of neighbours to keep.
        metric: ``'euclidean'`` ranks by ascending squared Euclidean
            distance; ``'cosine'`` ranks by descending cosine similarity
            rescaled as ``(cos + 1) / 2``.

    Returns:
        Dict with a single key, the terms joined by spaces, mapped to a
        list of ``(word, row_index, score)`` tuples, best first. The dict
        is empty when none of the terms has an embedding.

    Raises:
        ValueError: If ``metric`` is not one of the supported values.
    """
    # Fail early instead of hitting an unbound local further down
    if metric not in ('euclidean', 'cosine'):
        raise ValueError('Parámetro "metric" no válido. Intente nuevamente.')

    word_dict_out = dict()

    # Keep only the terms that have an embedding
    found = [embeddings_index.get(term) for term in terms_to_rev]
    found = [vector for vector in found if vector is not None]

    # With nothing to average the mean would be NaN and every score with
    # it, so there is no meaningful neighbour list to return
    if not found:
        return word_dict_out

    embed_total = np.array(found).mean(axis=0)

    if metric == 'euclidean':
        # Squared distance: smaller is closer
        dif_metric = np.sum((embedding_matrix - embed_total) ** 2, axis=1)
        order = np.argsort(dif_metric, kind='stable')
    else:
        # Rescaled cosine similarity: larger is closer
        dif_metric = (cosine_similarity(embedding_matrix, embed_total) + 1) / 2
        order = np.argsort(-dif_metric, kind='stable')

    # NOTE(review): the top-ranked row is dropped as in the per-term search,
    # but for an averaged query that row is not necessarily a query term
    word_dict_out[' '.join(terms_to_rev)] = [
        (embedding_words[int(row)], int(row), dif_metric[row])
        for row in order[1:1 + N_words]
    ]

    return word_dict_out

# Cargando los datos

In [4]:
# Open the saved training archive. allow_pickle=True lets NumPy unpickle
# object arrays, so this file should come from a trusted source.
data = np.load('Summary/training_data.npz', allow_pickle=True)

# Lowercase every label stored under the 'Y_data' key
Y_data = [label.lower() for label in data['Y_data']]

# Bare expression so the notebook shows the list as the cell output
Y_data

['angelica',
 'savoy cabbage',
 'silver linden',
 'kiwi',
 'allium',
 'garden onion',
 'leek',
 'garlic',
 'chives',
 'lemon verbena',
 'cashew nut',
 'pineapple',
 'dill',
 'custard apple',
 'wild celery',
 'peanut',
 'burdock',
 'horseradish',
 'tarragon',
 'mugwort',
 'asparagus',
 'oat',
 'star fruit',
 'brazil nut',
 'common beet',
 'borage',
 'chinese mustard',
 'swede',
 'rape',
 'common cabbage',
 'cauliflower',
 'brussel sprouts',
 'kohlrabi',
 'broccoli',
 'chinese cabbage',
 'turnip',
 'pigeon pea',
 'tea',
 'capers',
 'pepper',
 'papaya',
 'safflower',
 'caraway',
 'pecan nut',
 'chestnut',
 'roman camomile',
 'chickpea',
 'endive',
 'chicory',
 'chinese cinnamon',
 'ceylon cinnamon',
 'watermelon',
 'lime',
 'lemon',
 'pummelo',
 'mandarin orange (clementine, tangerine)',
 'sweet orange',
 'coffee',
 'arabica coffee',
 'robusta coffee',
 'coriander',
 'common hazelnut',
 'saffron',
 'muskmelon',
 'cucumber',
 'cucurbita',
 'cumin',
 'turmeric',
 'quince',
 'lemon grass',
 

# Abriendo el modelo GloVe entrenado en Wikipedia

In [5]:
# Word -> vector lookup, filled while reading the GloVe file
embeddings_index = {}

# Vectors in file order; stacked into a single array after the loop
embeddings_matrix = []

# Row number -> word, used to decode matrix rows back into text
idx2word = {}

# Each line is split on whitespace: the first token is the word and the
# remaining tokens are parsed as float32 coefficients
with open('C:/Users/Chris-Brota/Desktop/glove.6b/glove.6B.100d.txt', 'r', encoding='utf8') as file:
    for num, line in enumerate(file):
        values = line.split()
        word, coefs = values[0], np.asarray(values[1:], dtype='float32')
        idx2word[num] = word
        embeddings_index[word] = coefs
        embeddings_matrix.append(coefs)

# Convert the list of vectors into an array
embeddings_matrix = np.array(embeddings_matrix)

print(f'Vector de {len(embeddings_index)} palabras cargadas.')

Vector de 400001 palabras cargadas.


In [6]:
# Spot check: print the vector stored for one word (prints None if the word is absent)
print(embeddings_index.get('citric'))

[ 0.28123   0.19583   0.06453  -0.13524   0.99726  -0.19329  -0.48548
  0.43083  -0.16015  -0.12496  -0.5319    0.57561  -0.5881    0.085808
 -0.0859    0.75721  -1.1515   -0.084785  0.65576   0.064682  0.067194
 -1.1096    0.92557   0.57395  -0.34474   0.37906  -0.82575  -0.55303
 -0.58417   0.044774  0.76227   0.75636  -0.45388  -0.8378   -0.39259
 -0.3134    0.29243   0.025914  1.0914   -0.13309   0.57329  -0.12037
 -0.90361  -0.91887   1.013     1.276    -0.31055  -0.86031  -0.45764
 -0.35792  -0.34129  -0.64878  -0.21554  -0.32376  -0.75714   0.80563
 -1.143     0.128    -0.34669  -0.40518  -0.031299 -0.63768  -0.35624
 -0.28367   0.73029  -0.47432   0.84953  -1.1061    0.046679 -1.1738
 -0.4851    0.45272   0.16722   0.35547  -0.27191  -0.26114   0.21498
 -0.79119  -0.75023  -0.63608   0.068707 -0.46095  -0.44356   0.52749
 -0.32422   0.77409  -0.81332   0.1544   -0.40902   0.4267   -0.59445
 -0.10446  -1.2722   -0.76292   0.6785   -0.43202  -0.19278  -0.26015
  0.93397   0.54029

# Definición de funciones alimenticias objetivo

In [7]:
def find_Nprox_concepts(concept, embedding_matrix, index2word, N_words, 
                        metric='euclidean'):
    """Return the words ranked closest to ``concept``.

    The concept vector is looked up in the module-level ``embeddings_index``
    and is not derived from the arguments, so ``embedding_matrix`` and
    ``index2word`` are expected to describe the same vocabulary.

    Args:
        concept: Word to search around.
        embedding_matrix: Array with one embedding per row.
        index2word: Mapping from row position to word.
        N_words: Number of neighbours to return.
        metric: ``'euclidean'`` ranks by ascending distance; ``'cosine'``
            ranks by descending cosine similarity.

    Returns:
        List of ``(word, score)`` tuples, best first. The score is the
        Euclidean distance, or the cosine similarity multiplied by 100.
        The top-ranked row is dropped, presumably because it is the concept
        itself. Returns ``None``, after printing a message, when the
        concept is not in ``embeddings_index``.

    Raises:
        ValueError: If ``metric`` is not one of the supported values.
    """
    embedded_concept = embeddings_index.get(concept)

    # An unknown concept ends the search
    if embedded_concept is None: 
        print(f'Concept {concept} not found.')
        return

    if metric == 'euclidean':
        # Rank on the squared distance; the square root is applied only to
        # the reported values, which leaves the ordering unchanged
        dif_metric = np.sum((embedding_matrix - embedded_concept) ** 2, axis=1)
        order = np.argsort(dif_metric, kind='stable')
        scores = np.sqrt(dif_metric)

    elif metric == 'cosine':
        # Larger similarity is closer, so rank on the negated values
        dif_metric = cosine_similarity(embedding_matrix, embedded_concept)
        order = np.argsort(-dif_metric, kind='stable')
        scores = dif_metric * 100

    else:
        # ValueError is a subclass of Exception, so existing handlers still work
        raise ValueError('Parámetro "metric" no válido. Intente nuevamente.')

    # A stable argsort keeps tied rows in their original order, as the
    # list sort did, without building a Python tuple per vocabulary row
    proximate_concepts = [(index2word[int(row)], scores[row])
                          for row in order[1:1 + N_words]]

    return proximate_concepts


def find_concepts_by_radius(concept, embedding_matrix, index2word, radius):
    """Return every word whose embedding lies within ``radius`` of ``concept``.

    The concept vector is looked up in the module-level ``embeddings_index``.

    Args:
        concept: Word to search around.
        embedding_matrix: Array with one embedding per row.
        index2word: Mapping from row position to word.
        radius: Largest Euclidean distance (inclusive) to accept.

    Returns:
        List of ``(word, distance)`` tuples sorted by ascending distance.
        No row is dropped, so a row equal to the concept vector appears
        with distance zero. Returns ``None``, after printing a message,
        when the concept is not in ``embeddings_index``.
    """
    embedded_concept = embeddings_index.get(concept)

    # An unknown concept ends the search
    if embedded_concept is None:
        print(f'Concept {concept} not found.')
        return

    # Euclidean distance from every row to the concept vector
    offsets = embedding_matrix - embedded_concept
    distances = np.sqrt(np.sum(offsets ** 2, axis=1))

    # Keep only the rows inside the radius, closest first
    within = sorted(
        ((row, dist) for row, dist in enumerate(distances) if dist <= radius),
        key=lambda pair: pair[1],
    )

    return [(index2word[row], dist) for row, dist in within]



# Target concepts to look up in the embedding
objective_concepts = ['digestive', 'antioxidant', 'nutritive',  'vitamins', 
                      'nutrients', 'minerals', 'energetic', 'energy']

# For each concept, request 20 neighbours ranked by cosine similarity and show them
for concept in objective_concepts:
    result = find_Nprox_concepts(concept, embedding_matrix=embeddings_matrix, 
                                 index2word=idx2word, N_words=20, 
                                 metric='cosine')
    
    # Alternative query kept for reference: every word within a fixed
    # Euclidean radius of the concept instead of a fixed number of neighbours
#     result = find_concepts_by_radius(concept, embedding_matrix=embeddings_matrix, 
#                                      index2word=idx2word, radius=5)
    
    # display is not imported in the visible cells; presumably the IPython
    # notebook built-in — confirm if this is run outside a notebook
    print(concept)
    display(result)

digestive


[('gastrointestinal', 70.93609571456909),
 ('liver', 68.97932291030884),
 ('circulatory', 68.34980845451355),
 ('digestion', 66.64594411849976),
 ('pancreas', 66.53716564178467),
 ('intestine', 66.34802222251892),
 ('metabolic', 63.087719678878784),
 ('respiratory', 62.453240156173706),
 ('kidneys', 61.7425262928009),
 ('urinary', 61.693453788757324),
 ('lungs', 61.332499980926514),
 ('intestines', 61.10748648643494),
 ('stomach', 60.38355827331543),
 ('tissues', 59.96477007865906),
 ('kidney', 59.81611609458923),
 ('tract', 59.225016832351685),
 ('intestinal', 58.00774097442627),
 ('esophagus', 57.95712471008301),
 ('renal', 57.57710337638855),
 ('fluids', 57.52309560775757)]

antioxidant


[('antioxidants', 79.44929599761963),
 ('polyphenols', 65.94921946525574),
 ('flavonoids', 64.5348846912384),
 ('lycopene', 63.0429744720459),
 ('antimicrobial', 62.74224519729614),
 ('anticancer', 62.35556602478027),
 ('antibacterial', 61.95918917655945),
 ('anti-inflammatory', 61.26128435134888),
 ('vitamin', 60.13367176055908),
 ('carotene', 59.75155234336853),
 ('carotenoids', 59.094130992889404),
 ('antifungal', 58.460527658462524),
 ('vitamins', 58.32422971725464),
 ('antitumor', 55.810028314590454),
 ('antibody', 55.58624863624573),
 ('phytochemicals', 55.55211305618286),
 ('estrogenic', 55.29184341430664),
 ('anti-cancer', 54.624706506729126),
 ('progesterone', 54.48868274688721),
 ('cytotoxic', 54.458147287368774)]

nutritive


[('calorific', 67.07394123077393),
 ('insulative', 61.14565134048462),
 ('precedential', 60.627323389053345),
 ('shareowner', 60.184067487716675),
 ('artifactual', 59.59566831588745),
 ('probative', 59.417080879211426),
 ('realizable', 57.74664282798767),
 ('assessable', 57.42769241333008),
 ('allergenic', 57.09967613220215),
 ('absorptive', 56.397032737731934),
 ('nonmonetary', 55.36136031150818),
 ('shareable', 55.266886949539185),
 ('realisable', 55.1944375038147),
 ('absorbency', 55.14974594116211),
 ('succulence', 54.45681810379028),
 ('cardioprotective', 54.11723852157593),
 ('non-material', 53.72737646102905),
 ('apomorphic', 53.59330177307129),
 ('unpackaged', 53.1688928604126),
 ('anti-oxidant', 53.098076581954956)]

vitamins


[('vitamin', 81.62173628807068),
 ('supplements', 78.74909043312073),
 ('antioxidants', 76.52631998062134),
 ('nutrients', 70.22867798805237),
 ('folic', 68.88197064399719),
 ('folate', 67.64053106307983),
 ('carotene', 65.7368004322052),
 ('calcium', 65.44392704963684),
 ('b12', 64.07235860824585),
 ('phytochemicals', 63.04248571395874),
 ('dietary', 63.03325295448303),
 ('micronutrients', 62.965017557144165),
 ('acids', 62.676650285720825),
 ('carotenoids', 62.43682503700256),
 ('multivitamins', 61.5333616733551),
 ('niacin', 61.498868465423584),
 ('pills', 61.227381229400635),
 ('caffeine', 60.97118854522705),
 ('salts', 60.86549162864685),
 ('soluble', 60.275959968566895)]

nutrients


[('nutrient', 70.51071524620056),
 ('vitamins', 70.22867798805237),
 ('moisture', 68.01445484161377),
 ('contaminants', 65.8448576927185),
 ('soluble', 65.76610803604126),
 ('acids', 65.74196219444275),
 ('toxins', 65.48925638198853),
 ('microorganisms', 65.11854529380798),
 ('calcium', 64.68392014503479),
 ('oxygen', 64.64420557022095),
 ('nitrogen', 63.66875171661377),
 ('salts', 61.881035566329956),
 ('algae', 61.786746978759766),
 ('fluids', 61.55136823654175),
 ('sugars', 61.135512590408325),
 ('sediment', 60.79842448234558),
 ('antioxidants', 60.77163815498352),
 ('impurities', 60.668957233428955),
 ('organisms', 60.61676740646362),
 ('micronutrients', 60.12769937515259)]

minerals


[('mineral', 81.73073530197144),
 ('metals', 78.535395860672),
 ('ores', 64.13602232933044),
 ('copper', 64.11359906196594),
 ('mining', 63.96978497505188),
 ('gemstones', 62.875962257385254),
 ('ore', 61.744850873947144),
 ('silicate', 61.24851703643799),
 ('salts', 60.360872745513916),
 ('petroleum', 59.66758728027344),
 ('hydrocarbons', 59.13420915603638),
 ('hydrocarbon', 58.72114896774292),
 ('crystals', 58.4683895111084),
 ('carbonate', 58.277904987335205),
 ('manganese', 57.56113529205322),
 ('zinc', 57.327091693878174),
 ('aluminium', 56.800299882888794),
 ('natural', 56.5890371799469),
 ('phosphates', 56.503939628601074),
 ('alloys', 56.24755620956421)]

energetic


[('exuberant', 71.13802433013916),
 ('feisty', 68.33209991455078),
 ('personable', 68.29231381416321),
 ('thoughtful', 68.18881630897522),
 ('passionate', 67.67101883888245),
 ('resourceful', 66.75754189491272),
 ('playful', 66.69796705245972),
 ('spirited', 66.65301322937012),
 ('youthful', 66.41921997070312),
 ('easygoing', 65.57125449180603),
 ('forceful', 65.56152701377869),
 ('vigorous', 65.35044312477112),
 ('enthusiastic', 65.10879397392273),
 ('intelligent', 65.0416910648346),
 ('affable', 65.00149965286255),
 ('cheerful', 64.54155445098877),
 ('vivacious', 64.51388001441956),
 ('brash', 64.32241797447205),
 ('imaginative', 64.20926451683044),
 ('ebullient', 64.14363384246826)]

energy


[('resources', 72.94265627861023),
 ('gas', 72.44957685470581),
 ('renewable', 71.00892663002014),
 ('natural', 70.34191489219666),
 ('petroleum', 69.90280747413635),
 ('electricity', 69.32563185691833),
 ('oil', 68.60381364822388),
 ('power', 66.44038558006287),
 ('development', 66.43015742301941),
 ('fuel', 66.3334310054779),
 ('supply', 65.8108651638031),
 ('fuels', 65.60667753219604),
 ('global', 65.51679372787476),
 ('technology', 65.06755352020264),
 ('efficiency', 64.53660726547241),
 ('environment', 63.31504583358765),
 ('environmental', 62.37325668334961),
 ('technologies', 62.18891739845276),
 ('industry', 61.89283728599548),
 ('energies', 61.627936363220215)]