In [1]:
import os, getpass
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import plotly.graph_objects as go
K = 10

In [70]:
df = pd.read_csv('CONSOLIDADO MVP v2.dat', sep='|')
df

Unnamed: 0,orig_food_id,id_food_table,food_group,food_subgroup,nombre_primario,nombre_secundario,G_energy,G_protein,"G_fiber, total dietary",G_carbohydrate,...,ANE_lysine,ANE_methionine,ANE_phenylalanine,ANE_threonine,ANE_tryptophan,ANE_valine,OT_betaine,OT_caffeine,OT_choline,OT_theobromine
0,44,135,fruits,other fruits,date,"date, dried",329.110899,2000.0,6400.0,71200.0,...,54.0,26.0,58.0,54.0,54.0,64.0,0.0,0.0,0.0,0.0
1,8172,270,cereals and cereal products,cereal products,breakfast cereal,"cereals, farina, unenriched, dry",369.024857,10600.0,1900.0,0.0,...,203.0,165.0,514.0,280.0,136.0,451.0,0.0,0.0,0.0,0.0
2,35183,205,cereals and cereal products,cereals,corn,"corn, dried, yellow (northern plains indians)",419.000000,14480.0,20500.0,0.0,...,498.0,293.0,642.0,431.0,94.0,655.0,0.0,0.0,0.0,0.0
3,2024,420,herbs and spices,spices,white mustard,"spices, mustard seed, ground",508.126195,26080.0,12200.0,0.0,...,1844.0,483.0,1173.0,838.0,256.0,1509.0,1.9,0.0,122.7,0.0
4,12128,372,herbs and spices,other seeds,ginkgo nuts,"nuts, ginkgo nuts, dried",348.000000,10350.0,0.0,0.0,...,494.0,133.0,408.0,640.0,170.0,677.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,9085,204,fruits,berries,common grape,"currants, zante, dried",283.000000,4080.0,6800.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.6,0.0
202,11982,40,vegetables,fruit vegetables,pepper,"peppers, pasilla, dried",345.000000,12350.0,26800.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
203,11663,285,aquatic foods,seaweed,agar,"seaweed, agar, dried",306.405354,6210.0,7700.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63.3,0.0
204,35194,577,vegetables,other vegetables,agave,"agave, dried (southwest)",341.061185,1710.0,15600.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Análisis de los datos

## Datos por columnas (compuestos)

In [85]:
def metrics_on_columns(dataframe, epsilon=0, start_idx=6):
    # Definición del diccionario a transformar en tabla
    dict_df = dict()
    
    for i in range(start_idx, dataframe.shape[1]):
        # Columna i
        df_i = dataframe.iloc[:, i]

        # Métricas de interés
        mean_i = df_i.mean()
        std_i  = df_i.std()
        zeros_i = int(sum(abs(df_i) <= epsilon))
        zeros_perc = zeros_i / df.shape[0] * 100
        
        # Llenar el diccionario
        dict_df[df.columns[i]] = [mean_i, std_i, zeros_i, zeros_perc]
    
    # Pasarlo a dataframe
    df_out = pd.DataFrame(dict_df)
    
    # Agregar una columna de etiquetas
    df_out.insert(0, 'metrics', ['mean', 'std', 'zeros', 'zeros (%)']) 
    
    return df_out
    

    
df_cols_metrics = metrics_on_columns(df)
df_cols_metrics.to_csv('Results/Col_analysis.csv', sep=';', index=False, 
                       decimal='.')

## Datos por filas (ingredientes)

In [87]:
def metrics_on_rows(dataframe, epsilon=0, start_idx=6):
    # Definición de la lista de filas
    list_rows = list()
    
    # Definición de los nombres de las primeras columnas
    col_names = [i for i in df][:6]
    
    for i in range(dataframe.shape[0]):
        # Columna i
        df_i = dataframe.iloc[i, start_idx:]
        
        # Métricas de interés
        mean_i = df_i.mean()
        std_i  = df_i.std()
        zeros_i = int(sum(abs(df_i) <= epsilon))
        zeros_perc = zeros_i / df.shape[0] * 100
        
        # Llenar el diccionario
        list_rows.append([df['id_food_table'][i],
                          df['orig_food_id'][i],
                          df['food_group'][i],
                          df['food_subgroup'][i],
                          df['nombre_primario'][i],
                          df['nombre_secundario'][i], 
                          mean_i, std_i, zeros_i, zeros_perc])
    
    # Pasarlo a dataframe
    df_out = pd.DataFrame(list_rows, columns=col_names + ['mean', 'std', 
                                                          'zeros', 
                                                          'zeros (%)'])
    
    return df_out



df_rows_metrics = metrics_on_rows(df)
df_rows_metrics.to_csv('Results/Row_analysis.csv', sep=';', index=False, 
                       decimal='.')

# Resultados notables

In [106]:
df_rows_metrics

Unnamed: 0,orig_food_id,id_food_table,food_group,food_subgroup,nombre_primario,nombre_secundario,mean,std,zeros,zeros (%)
0,135,44,fruits,other fruits,date,"date, dried",2736.257513,12883.048081,11,5.339806
1,270,8172,cereals and cereal products,cereal products,breakfast cereal,"cereals, farina, unenriched, dry",1906.824229,10659.843740,12,5.825243
2,205,35183,cereals and cereal products,cereals,corn,"corn, dried, yellow (northern plains indians)",2970.614306,9878.210738,11,5.339806
3,420,2024,herbs and spices,spices,white mustard,"spices, mustard seed, ground",3312.886254,7627.531850,7,3.398058
4,372,12128,herbs and spices,other seeds,ginkgo nuts,"nuts, ginkgo nuts, dried",1896.093037,9892.831328,13,6.310680
...,...,...,...,...,...,...,...,...,...,...
201,204,9085,fruits,berries,common grape,"currants, zante, dried",2899.644704,13479.460987,26,12.621359
202,40,11982,vegetables,fruit vegetables,pepper,"peppers, pasilla, dried",2132.018411,8160.927580,32,15.533981
203,285,11663,aquatic foods,seaweed,agar,"seaweed, agar, dried",1949.258892,11041.073293,27,13.106796
204,577,35194,vegetables,other vegetables,agave,"agave, dried (southwest)",2903.956783,13103.199218,30,14.563107


## Compuestos

In [154]:
# Compuestos con mayor porcentaje de ceros
zp = 50
print(f'Compuestos con >= {zp}% cantidad de ceros')
display(df_cols_metrics.loc[3, [True] + list(df_cols_metrics.iloc[3, 1:] >= zp)][1:].to_frame(name='zeros (%)').sort_values(by='zeros (%)', ascending=False))

# Ingredientes con mayor porcentaje de ceros
zp = 17
print(f'Ingredientes con >= {zp}% cantidad de ceros')
display(df_rows_metrics.loc[df_rows_metrics['zeros (%)'] >= zp].sort_values(by='zeros (%)', ascending=False))

# Compuestos con mayor varianza
q_comp = 20
print(f'{q_comp} Compuestos con mayor varianza')
display(df_cols_metrics.iloc[0:2, 1:].sort_values(by=1, axis=1, ascending=False).T[:q_comp].rename(columns={0: 'mean', 1: 'std'}))

# Compuestos con menor varianza
print(f'{q_comp} Compuestos con menor varianza')
display(df_cols_metrics.iloc[0:2, 1:].sort_values(by=1, axis=1, ascending=True).T[:q_comp].rename(columns={0: 'mean', 1: 'std'}))


Compuestos con >= 50% cantidad de ceros


Unnamed: 0,zeros (%)
V_vit b12,99.029126
OT_theobromine,97.087379
OT_caffeine,95.631068
V_vit h,93.68932
M_chromium,89.320388
M_iodine,87.378641
OT_betaine,84.466019
G_carbohydrate,82.524272
OT_choline,59.223301
V_vit k,57.281553


Ingredientes con >= 17% cantidad de ceros


Unnamed: 0,orig_food_id,id_food_table,food_group,food_subgroup,nombre_primario,nombre_secundario,mean,std,zeros,zeros (%)
199,707,19171,cocoa and cocoa products,cocoa products,cocoa powder,"cocoa, dry powder, unsweetened, hersheys europ...",2045.112882,8980.384379,48,23.300971
145,682,389,baking goods,baking goods,leavening agent,baking powder,2324.494689,8765.955401,44,21.359223
176,682,18369,baking goods,baking goods,leavening agent,"leavening agents, baking powder, double-acting...",2110.705749,9912.577935,40,19.417476
153,61,527,herbs and spices,herbs,coriander,"coriander, leaf, dried",1781.185106,6673.31192,39,18.932039
174,682,18370,baking goods,baking goods,leavening agent,"leavening agents, baking powder, double-acting...",2249.654022,10350.196433,39,18.932039
189,13,842,herbs and spices,herbs,dill,"dill weed, dried",1760.550661,6833.570751,39,18.932039
144,205,173,cereals and cereal products,cereals,corn,corn starch,1670.243119,12080.976735,38,18.446602
180,86,479,herbs and spices,oilseed crops,sunflower,"sunflower seeds, decorticated, dried",886.999575,3617.981648,37,17.961165
183,393,9147,fruits,other fruits,jujube,"jujube, dried",1506.651405,10012.513212,37,17.961165


20 Compuestos con mayor varianza


Unnamed: 0,mean,std
"G_carbohydrate, by difference",46779.368932,29794.301093
G_carbohydrate,7603.883495,19597.463362
G_total lipid (fat),11430.776699,17771.630211
"G_sugars, total",7229.174757,15448.388338
"G_fiber, total dietary",12962.621359,12502.054891
G_protein,14421.990291,11603.978562
G_ash,5390.048544,8571.09399
"G_fatty acids, total polyunsaturated",4039.65534,8312.065846
"G_fatty acids, total saturated",3299.902913,6954.72927
"G_fatty acids, total monounsaturated",3315.150485,6432.206426


20 Compuestos con menor varianza


Unnamed: 0,mean,std
V_vit b12,1e-06,1.5e-05
M_chromium,0.002292,0.01457
M_selenium,0.021076,0.134287
V_vit k,0.056334,0.253667
V_vit b9,0.114904,0.333738
V_vit b2,0.349379,0.588316
V_vit b6,0.555286,0.634335
V_vit b1,0.49649,0.918151
M_copper,0.910325,1.087551
M_iodine,0.174958,2.508225


# Análisis con reducción de dimensiones

In [164]:
# Definición del conjunto de etiquetas/grupos de los alimentos
group_labels = tuple(set([i.split('_')[0] for i in df.columns[6:]]))
group_labels

('V', 'M', 'AE', 'G', 'ANE', 'OT')

## T-SNE

In [None]:
def get_TSNE_projection(filter_type='all'):
    # Definición de la lista que almacena los vectores
    food_vectors = list()

    # Definición de las etiquetas
    labels = df['nombre_secundario']

    for i, row in df.iterrows():
        # Agregando los valores a la lista
        food_vectors[i] = list(row[4:])
    
    
    tsne = TSNE(n_components=2, random_state=0, n_iter=10000, perplexity=3)
    T_foods = tsne.fit_transform(food_vectors)
    
    return T_foods

### Sobre todos

In [17]:
# Definición de la matriz que almacena los vectores
food_vectors = np.zeros((df.shape[0], df.shape[1] - 4))

# Definición de las etiquetas
labels = df['nombre_secundario']

for i, row in df.iterrows():
    # Agregando los valores a la lista
    food_vectors[i] = list(row[4:])
    
food_vectors.shape

(208, 54)

In [18]:
tsne = TSNE(n_components=2, random_state=0, n_iter=10000, perplexity=3)
T_foods = tsne.fit_transform(food_vectors)

In [19]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=T_foods[:, 0],
                         y=T_foods[:, 1],
                         text=labels,
                         textposition='top center',
                         mode='markers+text'))
fig.update_traces(textfont_size=8)
fig.update_layout(title='Análisis de alimentos objetivos FooDB: TSNE sobre Word Embeddings')
fig.write_html('Results/TSNE_foodSpace_all.html')
fig.show()

### Sobre solo los grupos de interés

## PCA

In [None]:
PCA = 

# Análisis de componentes