# Análisis preliminar de los resultados

In [1]:
import os, pickle
import seaborn as sns
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
import plotly.graph_objects as go
import plotly.express as px
K = 10

In [21]:
df = pd.read_csv('CONSOLIDADO MVP v4 (sin En)_id_aux.dat', sep='|', decimal=',')
df

Unnamed: 0,id_aux,food_group,food_subgroup,id_food_table,nombre_primario,nombre_secundario,orig_food_id,G_energy,G_protein,"G_fiber, total dietary",...,ANE_lysine,ANE_methionine,ANE_phenylalanine,ANE_threonine,ANE_tryptophan,ANE_valine,OT_betaine,OT_caffeine,OT_choline,OT_theobromine
0,1,fruits,drupes,144,apricot,"apricot, dried",1,277.0076,3390.0,7300.0,...,120.0,19.0,79.0,65.0,14.0,120.0,0.0,0.0,0.0,0.0
1,2,pulses,beans,134,common bean,"beans, white, dried",25,324.0918,18900.0,17800.0,...,1500.0,320.0,1200.0,890.0,250.0,1100.0,0.0,0.0,0.0,0.0
2,3,pulses,beans,134,common bean,"beans, brown, dried",29,313.8145,18900.0,17800.0,...,1400.0,170.0,940.0,790.0,240.0,1900.0,0.0,0.0,0.0,0.0
3,4,soy,soy,85,soy bean,"beans, soy, dried",32,410.6119,35800.0,16600.0,...,2300.0,450.0,1800.0,1400.0,460.0,1700.0,0.0,0.0,0.0,0.0
4,7,fruits,drupes,149,peach,"peach, dried",51,336.0421,4900.0,14300.0,...,180.0,180.0,110.0,170.0,23.0,240.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,197,cereals and cereal products,cereals,205,corn,"cornmeal, white (navajo)",35136,398.0000,10990.0,10400.0,...,319.0,258.0,543.0,345.0,70.0,554.0,0.0,0.0,0.0,0.0
171,198,cereals and cereal products,cereals,205,corn,"cornmeal, yellow (navajo)",35137,384.0000,9850.0,9400.0,...,301.0,230.0,499.0,307.0,50.0,494.0,0.0,0.0,0.0,0.0
172,199,cereals and cereal products,cereals,205,corn,"corn, dried, yellow (northern plains indians)",35183,419.0000,14480.0,20500.0,...,498.0,293.0,642.0,431.0,94.0,655.0,0.0,0.0,0.0,0.0
173,201,herbs and spices,herbs,863,yellow pond-lily,"wocas, dried seeds, oregon, yellow pond lily, ...",35232,361.3767,7900.0,19200.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Análisis de los datos

## Datos por columnas (compuestos)

In [22]:
def metrics_on_columns(dataframe, epsilon=0, start_idx=6):
    # Definición del diccionario a transformar en tabla
    dict_df = dict()
    
    for i in range(start_idx, dataframe.shape[1]):
        # Columna i
        df_i = dataframe.iloc[:, i]

        # Métricas de interés
        mean_i = df_i.mean()
        std_i  = df_i.std()
        zeros_i = int(sum(abs(df_i) <= epsilon))
        zeros_perc = zeros_i / dataframe.shape[0] * 100
        
        # Llenar el diccionario
        dict_df[df.columns[i]] = [mean_i, std_i, zeros_i, zeros_perc]
    
    # Pasarlo a dataframe
    df_out = pd.DataFrame(dict_df)
    
    # Agregar una columna de etiquetas
    df_out.insert(0, 'metrics', ['mean', 'std', 'zeros', 'zeros (%)']) 
    
    return df_out
    

    
df_cols_metrics = metrics_on_columns(df, start_idx=7)
df_cols_metrics.to_csv('Results/Col_analysis.csv', sep=';', index=False, 
                       decimal='.')

## Datos por filas (ingredientes)

In [23]:
def metrics_on_rows(dataframe, epsilon=0, start_idx=6):
    # Definición de la lista de filas
    list_rows = list()
    
    # Definición de los nombres de las primeras columnas
    col_names = [i for i in dataframe][:6]
    
    for i in range(dataframe.shape[0]):
        # Columna i
        df_i = dataframe.iloc[i, start_idx:]
        
        # Métricas de interés
        mean_i = df_i.mean()
        std_i  = df_i.std()
        zeros_i = int(sum(abs(df_i) <= epsilon))
        zeros_perc = zeros_i / dataframe.shape[1] * 100
        
        # Llenar el diccionario
        list_rows.append([dataframe['id_food_table'][i],
                          dataframe['orig_food_id'][i],
                          dataframe['food_group'][i],
                          dataframe['food_subgroup'][i],
                          dataframe['nombre_primario'][i],
                          dataframe['nombre_secundario'][i], 
                          mean_i, std_i, zeros_i, zeros_perc])
    
    # Pasarlo a dataframe
    df_out = pd.DataFrame(list_rows, columns=col_names + ['mean', 'std', 
                                                          'zeros', 
                                                          'zeros (%)'])
    
    return df_out



df_rows_metrics = metrics_on_rows(df, start_idx=7)
df_rows_metrics.to_csv('Results/Row_analysis.csv', sep=';', index=False, 
                       decimal='.')
df_rows_metrics

Unnamed: 0,id_aux,food_group,food_subgroup,id_food_table,nombre_primario,nombre_secundario,mean,std,zeros,zeros (%)
0,144,1,fruits,drupes,apricot,"apricot, dried",1102.647078,5770.411944,12,19.672131
1,134,25,pulses,beans,common bean,"beans, white, dried",2319.237585,7118.081244,10,16.393443
2,134,29,pulses,beans,common bean,"beans, brown, dried",2163.997252,6830.809961,10,16.393443
3,85,32,soy,soy,soy bean,"beans, soy, dried",2251.645483,5772.900150,10,16.393443
4,149,51,fruits,drupes,peach,"peach, dried",2726.519369,11599.274145,13,21.311475
...,...,...,...,...,...,...,...,...,...,...
170,205,35136,cereals and cereal products,cereals,corn,"cornmeal, white (navajo)",2274.934211,10593.250401,11,18.032787
171,205,35137,cereals and cereal products,cereals,corn,"cornmeal, yellow (navajo)",2168.251689,10004.181120,10,16.393443
172,205,35183,cereals and cereal products,cereals,corn,"corn, dried, yellow (northern plains indians)",2970.614306,9878.210738,11,18.032787
173,863,35232,herbs and spices,herbs,yellow pond-lily,"wocas, dried seeds, oregon, yellow pond lily, ...",2073.528693,11164.873929,29,47.540984


# Resultados notables

In [24]:
# Compuestos con mayor porcentaje de ceros
zp = 50
print(f'Compuestos con >= {zp}% cantidad de ceros')
a = df_cols_metrics.loc[3, [True] + list(df_cols_metrics.iloc[3, 1:] >= zp)][1:].to_frame(name='zeros (%)').sort_values(by='zeros (%)', ascending=False)
display(a)
a.to_csv('Results/comp_zeros.csv', sep=';', index=True, decimal='.')

# Ingredientes con mayor porcentaje de ceros
zp = 50
print(f'Ingredientes con >= {zp}% cantidad de ceros')
a = df_rows_metrics.loc[df_rows_metrics['zeros (%)'] >= zp].sort_values(by='zeros (%)', ascending=False)
display(a)
a.to_csv('Results/ingd_zeros.csv', sep=';', index=False, decimal='.')

# Compuestos con mayor varianza
q_comp = 20
print(f'{q_comp} Compuestos con mayor varianza')
a = df_cols_metrics.iloc[0:2, 1:].sort_values(by=1, axis=1, ascending=False).T[:q_comp].rename(columns={0: 'mean', 1: 'std'})
display(a)
a.to_csv('Results/comp_var_more.csv', sep=';', index=True, decimal='.')

# Compuestos con menor varianza
print(f'{q_comp} Compuestos con menor varianza')
a = df_cols_metrics.iloc[0:2, 1:].sort_values(by=1, axis=1, ascending=True).T[:q_comp].rename(columns={0: 'mean', 1: 'std'})
display(a)
a.to_csv('Results/comp_var_less.csv', sep=';', index=True, decimal='.')

Compuestos con >= 50% cantidad de ceros


Unnamed: 0,zeros (%)
OT_caffeine,100.0
OT_theobromine,100.0
V_vit b12,98.857143
V_vit h,93.142857
M_chromium,89.142857
M_iodine,86.857143
OT_betaine,84.571429
G_carbohydrate,82.285714
OT_choline,62.857143
V_vit k,61.142857


Ingredientes con >= 50% cantidad de ceros


Unnamed: 0,id_aux,food_group,food_subgroup,id_food_table,nombre_primario,nombre_secundario,mean,std,zeros,zeros (%)
16,682,389,baking goods,baking goods,leavening agent,baking powder,2324.494689,8765.955401,44,72.131148
135,682,18369,baking goods,baking goods,leavening agent,"leavening agents, baking powder, double-acting...",2110.70575,9912.577935,40,65.57377
19,61,527,herbs and spices,herbs,coriander,"coriander, leaf, dried",1781.185106,6673.31192,39,63.934426
27,13,842,herbs and spices,herbs,dill,"dill weed, dried",1760.550661,6833.570751,39,63.934426
136,682,18370,baking goods,baking goods,leavening agent,"leavening agents, baking powder, double-acting...",2249.654022,10350.196433,39,63.934426
10,205,173,cereals and cereal products,cereals,corn,corn starch,1670.243119,12080.976735,38,62.295082
71,393,9147,fruits,other fruits,jujube,"jujube, dried",1506.651406,10012.513212,37,60.655738
18,86,479,herbs and spices,oilseed crops,sunflower,"sunflower seeds, decorticated, dried",886.999574,3617.981648,37,60.655738
137,682,18371,baking goods,baking goods,leavening agent,"leavening agents, baking powder, low-sodium",2180.11788,8974.7239,35,57.377049
47,13,2017,herbs and spices,herbs,dill,"spices, dill weed, dried",2096.111835,8302.680838,34,55.737705


20 Compuestos con mayor varianza


Unnamed: 0,mean,std
"G_carbohydrate, by difference",45535.542857,29878.584412
G_carbohydrate,7550.857143,19556.028757
G_total lipid (fat),11725.828571,18747.822227
"G_sugars, total",5339.885714,12646.693493
G_protein,15204.742857,12122.321191
"G_fiber, total dietary",12154.857143,11932.114825
G_ash,5542.228571,9196.805927
"G_fatty acids, total polyunsaturated",4634.702857,8883.37651
"G_fatty acids, total monounsaturated",3629.88,6875.535227
"G_fatty acids, total saturated",2718.96,6490.383709


20 Compuestos con menor varianza


Unnamed: 0,mean,std
OT_theobromine,0.0,0.0
OT_caffeine,0.0,0.0
V_vit b12,2e-06,1.7e-05
M_chromium,0.001252,0.006779
M_selenium,0.023345,0.145567
V_vit k,0.05527,0.243982
V_vit b9,0.123641,0.358372
V_vit b2,0.369886,0.625834
V_vit b6,0.570829,0.629708
V_vit b1,0.554063,0.982613


# Análisis con reducción de dimensiones

In [25]:
# Definición del conjunto de etiquetas/grupos de los alimentos
group_labels = sorted(tuple(set([i.split('_')[0] for i in df.columns[7:]])))
group_labels = group_labels + ['all']
group_labels

['AE', 'ANE', 'G', 'M', 'OT', 'V', 'all']

## T-SNE

In [26]:
def get_TSNE_projection(dataframe, filter_type='all', start_idx=6, plot_tsne=False,
                        save_plot=False, save_data=False):
    # Definición de la lista que almacena los vectores
    food_vectors = list()
    
    # Definición del conjunto de etiquetas/grupos de los alimentos
    group_labels = sorted(list(set([i.split('_')[0] for i in df.columns[6:]])) + ['all'])
    
    # Asegurándose de que la etiqueta se encuentra en el dataframe
    if filter_type in group_labels:
        # Agregando los valores a la lista
        if filter_type == 'all':
            idx_to_append = [i for i in range(start_idx, df.shape[1])]
        elif filter_type == 'AE':
            idx_to_append = [i for i, col in enumerate(dataframe.columns)
                             if col.split('_')[0] == 'AE']
        elif filter_type == 'ANE':
            idx_to_append = [i for i, col in enumerate(dataframe.columns)
                             if col.split('_')[0] == 'ANE']
        elif filter_type == 'G':
            idx_to_append = [i for i, col in enumerate(dataframe.columns)
                             if col.split('_')[0] == 'G']
        elif filter_type == 'M':
            idx_to_append = [i for i, col in enumerate(dataframe.columns)
                             if col.split('_')[0] == 'M']
        elif filter_type == 'OT':
            idx_to_append = [i for i, col in enumerate(dataframe.columns)
                             if col.split('_')[0] == 'OT']
        elif filter_type == 'V':
            idx_to_append = [i for i, col in enumerate(dataframe.columns)
                             if col.split('_')[0] == 'V']
    else:
        raise ValueError(f'"{filter_type}" no es una opción permitida para "filter_type"')
    
    
    # Para cada fila
    for i, row in dataframe.iterrows():
        food_vectors.append(list(row[idx_to_append]))
    
    # Transformando a array
    food_vectors = np.array(food_vectors)
    
    # Aplicando el algoritmo T-SNE
    tsne = TSNE(n_components=2, random_state=0, n_iter=10000, perplexity=3)
    T_foods = tsne.fit_transform(food_vectors)
    
    # Definición de las etiquetas
    labels = dataframe['nombre_secundario']
    
    # Opción de graficar
    if plot_tsne:
        fig = go.Figure()
        fig.add_trace(go.Scatter(x=T_foods[:, 0],
                                 y=T_foods[:, 1],
                                 text=labels,
                                 textposition='top center',
                                 mode='markers+text'))
        fig.update_traces(textfont_size=8)
        fig.update_layout(title='Análisis de alimentos objetivos FooDB: '
                                'TSNE sobre Word Embeddings')
        if save_plot:
            fig.write_html(f'Results/TSNE_foodSpace_{filter_type}.html')
        fig.show()
        
    # Opción de guardar valores
    if save_data:
        to_save = (T_foods, labels, 
                   dataframe.iloc[:, [i for i in range(start_idx)] + idx_to_append])
        
        with open(f'Results/TSNE_foodSpace_{filter_type}.pkl', 'wb') as file:
            pickle.dump(to_save, file)
        
    return T_foods, labels

## Clustering

In [27]:
def get_clusters_df(dataframe, filter_type='all', cluster_op=None, start_idx=6, plot_clusters=False, 
                    save_plot=False, save_data=False, cluster_on_tsne=True, extra_label=''):
    def _get_clusters(features):
        # Aplicando un algoritmo de clustering DBSCAN
        dbscan = DBSCAN(eps=cluster_op['eps'], min_samples=cluster_op['min_samples'])
        clustering = dbscan.fit(features)
        clusters = clustering.labels_
        return clusters
    
    
    # Definición de la lista que almacena los vectores
    food_vectors = list()
    
    # Definición del conjunto de etiquetas/grupos de los alimentos
    group_labels = sorted(list(set([i.split('_')[0] for i in dataframe.columns[6:]])) + ['all'])
    
    # Asegurándose de que la etiqueta se encuentra en el dataframe
    if filter_type in group_labels:
        # Agregando los valores a la lista
        if filter_type == 'all':
            idx_to_append = [i for i in range(start_idx, dataframe.shape[1])]
        elif filter_type == 'AE':
            idx_to_append = [i for i, col in enumerate(dataframe.columns)
                             if col.split('_')[0] == 'AE']
        elif filter_type == 'ANE':
            idx_to_append = [i for i, col in enumerate(dataframe.columns)
                             if col.split('_')[0] == 'ANE']
        elif filter_type == 'G':
            idx_to_append = [i for i, col in enumerate(dataframe.columns)
                             if col.split('_')[0] == 'G']
        elif filter_type == 'M':
            idx_to_append = [i for i, col in enumerate(dataframe.columns)
                             if col.split('_')[0] == 'M']
        elif filter_type == 'OT':
            idx_to_append = [i for i, col in enumerate(dataframe.columns)
                             if col.split('_')[0] == 'OT']
        elif filter_type == 'V':
            idx_to_append = [i for i, col in enumerate(dataframe.columns)
                             if col.split('_')[0] == 'V']
    else:
        raise ValueError(f'"{filter_type}" no es una opción permitida para "filter_type"')
    
    
    # Para cada fila
    for i, row in dataframe.iterrows():
        food_vectors.append(list(row[idx_to_append]))
    
    # Transformando a array
    food_vectors = np.array(food_vectors)
    
    # Aplicando el algoritmo T-SNE
    tsne = TSNE(n_components=2, random_state=0, n_iter=10000, perplexity=3)
    T_foods = tsne.fit_transform(food_vectors)
    
    # Definición de las etiquetas
    labels = dataframe['nombre_secundario']
    
    # Opción de cluster
    if cluster_on_tsne:
        # Obteniendo los clusters mediante T-SNE
        clusters = _get_clusters(T_foods)
    else:
        # Obteniendo los clusters mediante valores originales
        clusters = _get_clusters(food_vectors)
    
    # Opción de guardar valores
    if save_data:
        to_save = (T_foods, labels, 
                   dataframe.iloc[:, [i for i in range(start_idx)] + idx_to_append],
                   clusters)
        
        if extra_label != '':
            filename = f'Results/TSNE_foodSpace_{filter_type}_{extra_label}.pkl'
        else:
            filename = f'Results/TSNE_foodSpace_{filter_type}.pkl'
        
        with open(filename, 'wb') as file:
            pickle.dump(to_save, file)
        
    # Creando un dataframe que condense toda la información
    dfi = np.concatenate((T_foods, np.array([labels]).T, np.array([clusters]).T), axis=1)
    dfi = pd.DataFrame(dfi, columns=['x1', 'x2', 'label', 'cluster'])
    
    if plot_clusters:
        fig = px.scatter(dfi, x='x1', y='x2', color='cluster', labels='label',
                         hover_data=['label', 'cluster'], text='label')
        fig.update_traces(textfont_size=7, textposition='top center')
        fig.update_traces(marker={'size': 15, 'opacity': 0.5})
        fig.update_layout(title='Análisis de ingredientes objetivos FooDB: '
                                f'TSNE con filtro {filter_type}.')
        if save_plot:
            if extra_label != '':
                filename = f'Results/TSNE_foodSpace_{filter_type}_{extra_label}.html'
            else:
                filename = f'Results/TSNE_foodSpace_{filter_type}.html'
            fig.write_html(filename)
        fig.show()
    
    return T_foods, labels, clusters

# Obteniendo los labels

In [28]:
# Definición de las listas que registran los resultados para cada caso
T_foods_list = list()
labels_list = list()
clusters_list = list()

# Opciones de cluster
cluster_op = {'eps': 20, 'min_samples': 5}

# Para cada grupo definido previamente
for group in group_labels:
    print(f'Getting T-SNE from group {group}...')
    # Rutina
    T_foods, labels, clusters = \
            get_clusters_df(df, filter_type=group, start_idx=7, 
                            plot_clusters=True, 
                            save_plot=True, save_data=True,
                            cluster_on_tsne=True,
                            cluster_op=cluster_op)
    
    # Se almacena
    T_foods_list.append(T_foods)
    labels_list.append(labels)
    clusters_list.append(clusters)

Getting T-SNE from group AE...


Getting T-SNE from group ANE...


Getting T-SNE from group G...


Getting T-SNE from group M...


Getting T-SNE from group OT...


Getting T-SNE from group V...


Getting T-SNE from group all...


# Obtener tabla filtrando columnas

In [29]:
# Compuestos con mayor porcentaje de ceros
zp = 80
# Definición de la lista de índices a revisar
index_to_rev = list()
index_to_rev_carb = list()

print(f'Compuestos con >= {zp}% cantidad de ceros')
for num, val_i in enumerate(df_cols_metrics.iloc[3,:]):
    try:
        if val_i < zp:
            index_to_rev.append(df_cols_metrics.columns[num])
            index_to_rev_carb.append(df_cols_metrics.columns[num])
        elif df_cols_metrics.columns[num] == 'G_carbohydrate':
            index_to_rev_carb.append('G_carbohydrate')
    except:
        pass
    
# Agregando los nombres de las columnas
index_to_rev = list(df.columns[:6]) + index_to_rev
index_to_rev_carb = list(df.columns[:6]) + index_to_rev_carb

# Acortando el dataframe
df_short = df.loc[:, index_to_rev]
df_short_carb = df.loc[:, index_to_rev_carb]

Compuestos con >= 80% cantidad de ceros


In [30]:
df_short

Unnamed: 0,id_aux,food_group,food_subgroup,id_food_table,nombre_primario,nombre_secundario,G_energy,G_protein,"G_fiber, total dietary","G_carbohydrate, by difference",...,ANE_histidine,ANE_isoleucine,ANE_leucine,ANE_lysine,ANE_methionine,ANE_phenylalanine,ANE_threonine,ANE_tryptophan,ANE_valine,OT_choline
0,1,fruits,drupes,144,apricot,"apricot, dried",277.0076,3390.0,7300.0,0.0,...,44.0,79.0,140.0,120.0,19.0,79.0,65.0,14.0,120.0,0.0
1,2,pulses,beans,134,common bean,"beans, white, dried",324.0918,18900.0,17800.0,0.0,...,580.0,950.0,1700.0,1500.0,320.0,1200.0,890.0,250.0,1100.0,0.0
2,3,pulses,beans,134,common bean,"beans, brown, dried",313.8145,18900.0,17800.0,0.0,...,480.0,880.0,1300.0,1400.0,170.0,940.0,790.0,240.0,1900.0,0.0
3,4,soy,soy,85,soy bean,"beans, soy, dried",410.6119,35800.0,16600.0,0.0,...,910.0,1600.0,2800.0,2300.0,450.0,1800.0,1400.0,460.0,1700.0,0.0
4,7,fruits,drupes,149,peach,"peach, dried",336.0421,4900.0,14300.0,0.0,...,100.0,77.0,170.0,180.0,180.0,110.0,170.0,23.0,240.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,197,cereals and cereal products,cereals,205,corn,"cornmeal, white (navajo)",398.0000,10990.0,10400.0,77140.0,...,292.0,404.0,1375.0,319.0,258.0,543.0,345.0,70.0,554.0,0.0
171,198,cereals and cereal products,cereals,205,corn,"cornmeal, yellow (navajo)",384.0000,9850.0,9400.0,72900.0,...,265.0,370.0,1275.0,301.0,230.0,499.0,307.0,50.0,494.0,0.0
172,199,cereals and cereal products,cereals,205,corn,"corn, dried, yellow (northern plains indians)",419.0000,14480.0,20500.0,66270.0,...,313.0,485.0,1468.0,498.0,293.0,642.0,431.0,94.0,655.0,0.0
173,201,herbs and spices,herbs,863,yellow pond-lily,"wocas, dried seeds, oregon, yellow pond lily, ...",361.3767,7900.0,19200.0,80020.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Definición de las listas que registran los resultados para cada caso
T_foods_list_zeros = list()
labels_list_zeros = list()
clusters_list_zeros = list()

# Opciones de cluster
cluster_op = {'eps': 20, 'min_samples': 5}

# Para cada grupo definido previamente
for group in ['all']: # group_labels:
    print(f'Getting T-SNE from group {group}...')
    # Rutina
    T_foods, labels, clusters = \
            get_clusters_df(df_short, filter_type=group, start_idx=7, 
                            plot_clusters=True, 
                            save_plot=True, save_data=True,
                            cluster_on_tsne=True,
                            cluster_op=cluster_op,
                            extra_label='baseZero')
    
    # Se almacena
    T_foods_list_zeros.append(T_foods)
    labels_list_zeros.append(labels)
    clusters_list_zeros.append(clusters)

Getting T-SNE from group all...


In [109]:
# Definición de las listas que registran los resultados para cada caso
T_foods_list_zeros = list()
labels_list_zeros = list()
clusters_list_zeros = list()

# Opciones de cluster
cluster_op = {'eps': 20, 'min_samples': 2}

# Para cada grupo definido previamente
for group in ['all']: # group_labels:
    print(f'Getting T-SNE from group {group}...')
    # Rutina
    T_foods, labels, clusters = \
            get_clusters_df(df_short_carb, filter_type=group, start_idx=7, 
                            plot_clusters=True, 
                            save_plot=True, save_data=True,
                            cluster_on_tsne=True,
                            cluster_op=cluster_op,
                            extra_label='baseZero_carb')
    
    # Se almacena
    T_foods_list_zeros.append(T_foods)
    labels_list_zeros.append(labels)
    clusters_list_zeros.append(clusters)

Getting T-SNE from group all...


# Aplicando las etiquetas al mapa anterior

In [81]:
T_0 = T_foods_list[-1]
clusters_new = clusters_list_zeros[0]
labels_new = labels_list_zeros[0]

dfi = np.concatenate((T_0, np.array([labels_new]).T, np.array([clusters_new]).T), axis=1)
dfi = pd.DataFrame(dfi, columns=['x1', 'x2', 'label', 'cluster'])

In [82]:
dfi

Unnamed: 0,x1,x2,label,cluster
0,94.02166,93.167953,"date, dried",0
1,-0.485664,-102.217117,"cereals, farina, unenriched, dry",1
2,56.407326,-14.586665,"corn, dried, yellow (northern plains indians)",2
3,-65.224365,52.587021,"spices, mustard seed, ground",3
4,-14.65625,-89.523193,"nuts, ginkgo nuts, dried",1
...,...,...,...,...
201,153.380844,-11.580437,"currants, zante, dried",4
202,41.183113,18.301876,"peppers, pasilla, dried",2
203,-10.779682,-120.215012,"seaweed, agar, dried",1
204,158.502457,-9.364155,"agave, dried (southwest)",4


In [85]:
fig = px.scatter(dfi, x='x1', y='x2', color='cluster', labels='label',
                         hover_data=['label', 'cluster'], text='label')
fig.update_traces(textfont_size=7, textposition='top center')
fig.update_traces(marker={'size': 15, 'opacity': 0.5})
fig.update_layout(title='Análisis de ingredientes objetivos FooDB: '
                        f'TSNE con filtro all - Base zero.')
fig.write_html(f'Results/TSNE_foodSpace_all_baseZero.html')
fig.show()