# Análisis de variables más importantes por cluster

In [23]:
import os, pickle
import pandas as pd
import numpy as np
import tensorflow as tf
import plotly.graph_objects as go
import plotly.express as px
from collections import defaultdict
from sklearn.linear_model import LogisticRegression 

# Obtención de los datos

In [46]:
# Load the pickled results file (t-SNE food-space results, judging by the file name).
# NOTE(review): pickle.load can execute arbitrary code while deserialising —
# only open files that come from a trusted source.
with open('Results/TSNE_foodSpace_all_baseZero_carb.pkl', 'rb') as file:
    data = pickle.load(file)

# The pickle is unpacked into exactly four objects; later cells rely on
# `food_vectors` (sliced with .iloc) and `clusters` (cluster labels).
T_foods, labels, food_vectors, clusters = data
# Number of cluster assignments that were loaded.
len(clusters)

206

In [47]:
# Tally how many entries of `clusters` carry each label.
count_dict = defaultdict(int)
for cluster_id in clusters:
    # Missing labels start at 0 thanks to defaultdict(int).
    count_dict[cluster_id] = count_dict[cluster_id] + 1
count_dict

defaultdict(int, {0: 21, 1: 62, 2: 49, 3: 23, 4: 14, 5: 10, 6: 7, 7: 17, 8: 3})

In [48]:
# Keep every row and only the columns from position 6 onwards as model features.
# NOTE(review): the offset 6 is a magic number, and the outputs recorded further
# down list different column sets for `interest_vectors`, which suggests this
# slice was edited between runs — confirm the intended column range.
interest_vectors = food_vectors.iloc[:, 6:]
interest_vectors

Unnamed: 0,G_energy,G_protein,"G_fiber, total dietary",G_carbohydrate,"G_carbohydrate, by difference","G_fatty acids, total monounsaturated","G_fatty acids, total polyunsaturated","G_fatty acids, total saturated","G_sugars, total",G_total lipid (fat),...,ANE_histidine,ANE_isoleucine,ANE_leucine,ANE_lysine,ANE_methionine,ANE_phenylalanine,ANE_threonine,ANE_tryptophan,ANE_valine,OT_choline
0,329.110899,2000.0,6400.0,71200.0,0.0,0.0,0.0,200.0,63900.0,0.0,...,29.0,45.0,86.0,54.0,26.0,58.0,54.0,54.0,64.0,0.0
1,369.024857,10600.0,1900.0,0.0,78000.0,60.0,220.0,80.0,0.0,500.0,...,215.0,409.0,724.0,203.0,165.0,514.0,280.0,136.0,451.0,0.0
2,419.000000,14480.0,20500.0,0.0,66270.0,3750.0,4543.0,1974.0,21320.0,10640.0,...,313.0,485.0,1468.0,498.0,293.0,642.0,431.0,94.0,655.0,0.0
3,508.126195,26080.0,12200.0,0.0,28090.0,22518.0,10088.0,1989.0,6790.0,36240.0,...,878.0,1183.0,2080.0,1844.0,483.0,1173.0,838.0,256.0,1509.0,122.7
4,348.000000,10350.0,0.0,0.0,72450.0,739.0,737.0,381.0,0.0,2000.0,...,244.0,500.0,755.0,494.0,133.0,408.0,640.0,170.0,677.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,283.000000,4080.0,6800.0,0.0,74080.0,47.0,180.0,28.0,67280.0,270.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.6
202,345.000000,12350.0,26800.0,0.0,51130.0,0.0,0.0,0.0,0.0,15850.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
203,306.405354,6210.0,7700.0,0.0,80880.0,27.0,102.0,61.0,2970.0,300.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63.3
204,341.061185,1710.0,15600.0,0.0,81980.0,0.0,0.0,0.0,50700.0,690.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Modelos

In [49]:
def model_1(input_shape, compile_dict):
    """Build and compile a one-unit sigmoid classifier made of a single Dense layer.

    The network is just ``Input -> Dense(1, sigmoid)``, so the Dense kernel
    holds exactly one weight per input feature.

    Parameters
    ----------
    input_shape : sequence of int
        Shape of the data matrix; only ``input_shape[1]`` (number of columns)
        is used.
    compile_dict : dict
        Must contain the keys ``'optimizer'``, ``'loss'`` and ``'metrics'``;
        their values are forwarded to ``model.compile``.

    Returns
    -------
    tf.keras.Model
        The compiled model.
    """
    # Input layer. The shape has to be a tuple: ``(n)`` is only the integer
    # ``n`` (which newer Keras versions refuse), hence the trailing comma.
    x_in = tf.keras.Input(shape=(input_shape[1],))

    # Output layer (its kernel provides the weights); L2-regularised.
    x_out = tf.keras.layers.Dense(
        units=1,
        activation='sigmoid',
        kernel_initializer='he_normal',
        kernel_regularizer=tf.keras.regularizers.L1L2(l1=0, l2=1),
    )(x_in)

    # Assemble and compile the model.
    model = tf.keras.Model(inputs=x_in, outputs=x_out)
    model.compile(optimizer=compile_dict['optimizer'], loss=compile_dict['loss'],
                  metrics=compile_dict['metrics'])

    return model


def model_2(input_shape, compile_dict):
    """Build and compile a multi-class softmax classifier made of a single Dense layer.

    The network is ``Input -> Dense(num_clusters, softmax)``, so the Dense
    kernel has one row per input feature and one column per cluster.

    Parameters
    ----------
    input_shape : sequence of int
        Shape of the data matrix; only ``input_shape[1]`` (number of columns)
        is used.
    compile_dict : dict
        Must contain ``'num_clusters'`` (number of output units) as well as
        ``'optimizer'``, ``'loss'`` and ``'metrics'``, which are forwarded to
        ``model.compile``.

    Returns
    -------
    tf.keras.Model
        The compiled model.
    """
    # Input layer. The shape has to be a tuple: ``(n)`` is only the integer
    # ``n`` (which newer Keras versions refuse), hence the trailing comma.
    x_in = tf.keras.Input(shape=(input_shape[1],))

    # Output layer (its kernel provides the weights); L2-regularised.
    x_out = tf.keras.layers.Dense(
        units=compile_dict['num_clusters'],
        activation='softmax',
        kernel_initializer='he_normal',
        kernel_regularizer=tf.keras.regularizers.L1L2(l1=0, l2=1),
    )(x_in)

    # Assemble and compile the model.
    model = tf.keras.Model(inputs=x_in, outputs=x_out)
    model.compile(optimizer=compile_dict['optimizer'], loss=compile_dict['loss'],
                  metrics=compile_dict['metrics'])

    return model


def model_1_1(input_shape, compile_dict):
    """Build and compile a two-unit softmax classifier made of a single Dense layer.

    The network is ``Input -> Dense(2, softmax)``, so the Dense kernel has one
    row per input feature and two columns (one per output unit).

    Parameters
    ----------
    input_shape : sequence of int
        Shape of the data matrix; only ``input_shape[1]`` (number of columns)
        is used.
    compile_dict : dict
        Must contain the keys ``'optimizer'``, ``'loss'`` and ``'metrics'``;
        their values are forwarded to ``model.compile``.

    Returns
    -------
    tf.keras.Model
        The compiled model.
    """
    # Input layer. The shape has to be a tuple: ``(n)`` is only the integer
    # ``n`` (which newer Keras versions refuse), hence the trailing comma.
    x_in = tf.keras.Input(shape=(input_shape[1],))

    # Output layer (its kernel provides the weights); L2-regularised.
    x_out = tf.keras.layers.Dense(
        units=2,
        activation='softmax',
        kernel_initializer='he_normal',
        kernel_regularizer=tf.keras.regularizers.L1L2(l1=0, l2=1),
    )(x_in)

    # Assemble and compile the model.
    model = tf.keras.Model(inputs=x_in, outputs=x_out)
    model.compile(optimizer=compile_dict['optimizer'], loss=compile_dict['loss'],
                  metrics=compile_dict['metrics'])

    return model

# Función conjunta

In [31]:
def get_important_features(data_matrix, comp_dict, cluster_to_rev, model_num='1',
                           cluster_labels=None):
    """Fit a single-layer classifier on ``data_matrix`` and return its kernel weights.

    Parameters
    ----------
    data_matrix : pandas.DataFrame
        Feature matrix, one row per sample. Its column names label the weights
        returned for ``model_num='1'``.
    comp_dict : dict
        Compile settings forwarded to the selected model builder.
    cluster_to_rev : int
        Cluster whose membership is the positive class for the binary models
        (``'1'`` and ``'1_1'``). It is not used by model ``'2'``.
    model_num : {'1', '2', '1_1'}, default '1'
        Which builder to use: ``model_1``, ``model_2`` or ``model_1_1``.
    cluster_labels : array-like, optional
        One cluster label per row of ``data_matrix``. When omitted, the
        notebook-level ``clusters`` variable is used, as before.

    Returns
    -------
    list of (str, float) or numpy.ndarray
        For ``'1'``: ``(column name, weight)`` pairs sorted by weight in
        descending order. For ``'2'`` and ``'1_1'``: the raw kernel matrix of
        the Dense layer.

    Raises
    ------
    ValueError
        If ``model_num`` is not recognised, or if the number of labels differs
        from the number of rows in ``data_matrix``.
    """
    # Pick the model builder; an unknown key used to leave `model` unbound.
    if model_num == '1':
        model = model_1(data_matrix.shape, comp_dict)
    elif model_num == '2':
        model = model_2(data_matrix.shape, comp_dict)
    elif model_num == '1_1':
        model = model_1_1(data_matrix.shape, comp_dict)
    else:
        raise ValueError(
            f"Unknown model_num {model_num!r}; expected '1', '2' or '1_1'")

    # Fall back to the notebook global only when no labels were supplied.
    if cluster_labels is None:
        cluster_labels = clusters
    # An ndarray guarantees that `==` below compares element-wise
    # (a plain list would compare as a whole and yield a single bool).
    cluster_labels = np.asarray(cluster_labels)

    # Features and targets.
    X = data_matrix.to_numpy()
    if len(cluster_labels) != X.shape[0]:
        raise ValueError(
            f'Got {len(cluster_labels)} cluster labels for {X.shape[0]} rows')
    if model_num == '2':
        # Multi-class target: one integer label per row, as a column vector.
        Y = cluster_labels.reshape(-1, 1)
    else:
        # One-vs-rest target: 1 for the reviewed cluster, 0 otherwise.
        Y = (cluster_labels == cluster_to_rev).astype(int)

    # Stop once the training loss has not improved for 20 epochs.
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=20)

    # Fit the model.
    model.fit(X, Y, batch_size=1, epochs=1000, verbose=1, callbacks=[early_stop])

    # layers[0] is the input layer, layers[1] the Dense layer; weights[0] is its kernel.
    weights = model.layers[1].weights[0].numpy()

    if model_num != '1':
        return weights

    # Flatten the (n_features, 1) kernel; reshape keeps a 1-D array even when
    # there is a single feature, where squeeze would collapse it to 0-d.
    flat_weights = weights.reshape(-1)

    # Label each weight with the matching column of the matrix that was
    # actually passed in, then rank from the largest weight to the smallest.
    weights_data = [(data_matrix.columns[i], data_i)
                    for i, data_i in enumerate(flat_weights)]
    return sorted(weights_data, key=lambda x: x[1], reverse=True)

# Diseño del sistema

### Idea 1

In [33]:
# Idea 1: one binary (cluster vs. rest) sigmoid model per cluster.
# Collects the ranked (feature, weight) lists returned for each cluster.
weights_list = list()

# Compile parameters and model selector
comp_dict = {'optimizer': 'adam',
             'loss': 'binary_crossentropy',
             'metrics': ['accuracy'],
             'model_num': '1'}

# Iterates over 0 .. (number of distinct labels - 1).
# assumes the cluster labels are exactly 0..k-1 — TODO confirm
for cluster in range(len(np.unique(clusters))):
    print(f'Revisando cluster #{cluster}')
    weights_data = get_important_features(interest_vectors, comp_dict, cluster_to_rev=cluster,
                                          model_num=comp_dict['model_num'])
    weights_list.append(weights_data)
    display(weights_data)
    # NOTE(review): this unconditional break stops after the first iteration,
    # so only cluster 0 is fitted — remove it to process every cluster.
    break

Revisando cluster #0
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000


[('V_vit b2', 0.00830599),
 ('V_vit b6', 0.00652485),
 ('V_vit b12', -2.3642147e-06),
 ('V_vit k', -0.0014318905),
 ('V_vit b1', -0.003279257),
 ('V_vit b9', -0.0054591154),
 ('V_vit c', -0.0075351656),
 ('V_vit h', -0.01594795),
 ('V_vit b5', -0.021292591)]

### Idea 2

In [22]:
# Idea 2: a single multi-class softmax model over all clusters.
# Collects the weight matrices returned by each call.
weights_list = list()

# Compile parameters, number of output units and model selector
comp_dict = {'optimizer': 'adam',
             'loss': 'sparse_categorical_crossentropy',
             'metrics': ['accuracy'],
             'num_clusters': len(np.unique(clusters)),
             'model_num': '2'}


# NOTE(review): for model '2' get_important_features does not use
# cluster_to_rev, so each iteration would refit the same multi-class problem.
for cluster in range(len(np.unique(clusters))):
    print(f'Revisando cluster #{cluster}')
    weights_data = get_important_features(interest_vectors, comp_dict, cluster_to_rev=cluster,
                                          model_num=comp_dict['model_num'])
    weights_list.append(weights_data)
    display(weights_data)
    # NOTE(review): unconditional break — only the first iteration runs.
    # The output recorded for this cell is an all-NaN weight matrix; presumably
    # training diverged (e.g. unscaled inputs) — verify before using the weights.
    break

Revisando cluster #0
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000


array([[nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan]], dtype=float32)

### Idea 3

In [386]:
# Idea 3: binary (cluster vs. rest) problem solved with a two-unit softmax model.
# Collects the weight matrices returned by each call.
weights_list = list()

# Compile parameters and model selector.
# 'num_clusters' is set here, but model_1_1 always uses 2 output units.
comp_dict = {'optimizer': 'adam',
             'loss': 'sparse_categorical_crossentropy',
             'metrics': ['accuracy'],
             'num_clusters': len(np.unique(clusters)),
             'model_num': '1_1'}


# Iterates over 0 .. (number of distinct labels - 1).
# assumes the cluster labels are exactly 0..k-1 — TODO confirm
for cluster in range(len(np.unique(clusters))):
    print(f'Revisando cluster #{cluster}')
    weights_data = get_important_features(interest_vectors, comp_dict, cluster_to_rev=cluster,
                                          model_num=comp_dict['model_num'])
    weights_list.append(weights_data)
    display(weights_data)
    # NOTE(review): this unconditional break stops after the first iteration,
    # so only cluster 0 is fitted — remove it to process every cluster.
    break

Revisando cluster #0
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000


array([[ 3.9777798e-03, -3.9714319e-03],
       [ 7.2078684e-08, -1.4199226e-08],
       [-8.8992761e-03,  8.9303618e-03],
       [ 1.8144624e-02, -1.8141564e-02],
       [-6.6987635e-03,  6.7009642e-03],
       [ 2.1231249e-03, -1.9584233e-03],
       [ 3.3590609e-01,  2.8413904e-01],
       [ 9.2697311e-03, -9.3598757e-03],
       [ 1.2146984e-03, -2.9937651e-03]], dtype=float32)

In [387]:
# Pair each feature name with column 0 of its weight row, then rank the pairs
# from the largest weight to the smallest.
a = sorted(
    [(interest_vectors.columns[idx], row[0]) for idx, row in enumerate(weights_data)],
    key=lambda pair: pair[1],
    reverse=True,
)
a

[('V_vit c', 0.3359061),
 ('V_vit b5', 0.018144624),
 ('V_vit h', 0.009269731),
 ('V_vit b1', 0.0039777798),
 ('V_vit b9', 0.002123125),
 ('V_vit k', 0.0012146984),
 ('V_vit b12', 7.2078684e-08),
 ('V_vit b6', -0.0066987635),
 ('V_vit b2', -0.008899276)]

In [352]:
# Show the feature names currently held in `interest_vectors`.
interest_vectors.columns

Index(['V_vit b1', 'V_vit b12', 'V_vit b2', 'V_vit b5', 'V_vit b6', 'V_vit b9',
       'V_vit c', 'V_vit h', 'V_vit k'],
      dtype='object')

# Con sklearn

In [205]:
from sklearn.linear_model import LogisticRegression

In [44]:
# Build the feature matrix X and a binary target Y (1 where the label equals 0, else 0).
# assumes `clusters` is a NumPy array (or similar) so `== 0` compares element-wise — TODO confirm
X = interest_vectors.to_numpy()
Y = np.array(clusters == 0, dtype=int)

In [45]:
# Fit a liblinear logistic regression on X / Y.
clf = LogisticRegression(random_state=0, max_iter=2000, verbose=1, solver='liblinear')
clf.fit(X, Y)

# Predict on the same data the model was fitted on and compare with the labels
# once; the mask is reused for both the per-sample view and the accuracy.
hits = clf.predict(X) == Y

display(clf.coef_)
display(hits)
# Fraction of samples predicted correctly (in-sample accuracy).
display(np.sum(hits) / len(Y))

[LibLinear]

array([[-0.05489158, -0.37704036, -0.95228148, -0.00402614]])

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

1.0

In [43]:
# Pair each feature name with its logistic-regression coefficient, then rank
# the pairs from the largest coefficient to the smallest.
a = sorted(
    [(interest_vectors.columns[idx], coef) for idx, coef in enumerate(clf.coef_[0])],
    key=lambda pair: pair[1],
    reverse=True,
)
a

[('OT_theobromine', -0.0040261378884168374),
 ('OT_betaine', -0.05489158375462725),
 ('OT_caffeine', -0.3770403575726564),
 ('OT_choline', -0.9522814806524978)]

In [19]:
# Show the fitted coefficient matrix of the logistic regression.
clf.coef_

array([[-0.05489158, -0.37704036, -0.95228148, -0.00402614]])