# Análisis de variables más importantes por cluster

In [1]:
import os, pickle
# import seaborn as sns
# import pandas as pd
import numpy as np
import tensorflow as tf
# import matplotlib.pyplot as plt
# from sklearn.manifold import TSNE
# from sklearn.decomposition import PCA
# from sklearn.cluster import DBSCAN
import plotly.graph_objects as go
import plotly.express as px
from collections import defaultdict

# Obtención de los datos

In [406]:
with open('Results/TSNE_foodSpace_OT.pkl', 'rb') as file:
    data = pickle.load(file)

T_foods, labels, food_vectors, clusters = data
len(clusters)

206

In [407]:
count_dict = defaultdict(int)
for i in clusters:
    count_dict[i] += 1
count_dict

defaultdict(int, {0: 124, 1: 20, 2: 43, 3: 8, 4: 7, -1: 4})

In [408]:
interest_vectors = food_vectors.iloc[:, 6:]
interest_vectors

Unnamed: 0,OT_betaine,OT_caffeine,OT_choline,OT_theobromine
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,1.9,0.0,122.7,0.0
4,0.0,0.0,0.0,0.0
...,...,...,...,...
201,0.0,0.0,10.6,0.0
202,0.0,0.0,0.0,0.0
203,0.0,0.0,63.3,0.0
204,0.0,0.0,0.0,0.0


# Función conjunta

In [455]:
def get_important_features(data_matrix, comp_dict, cluster_to_rev, model_num='1'):
    # Obteniendo el modelo
    if model_num == '1':
        model = model_1(data_matrix.shape, comp_dict)
    elif model_num == '2':
        model = model_2(data_matrix.shape, comp_dict)
    elif model_num == '1_1':
        model = model_1_1(data_matrix.shape, comp_dict)
    
    # Definición del Early Stopping
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=20)
    
    # Definición del x e y
    X = data_matrix.to_numpy()
    if model_num in ['1', '1_1']:
        Y = np.array(clusters == cluster_to_rev, dtype=int)
    elif model_num == '2':
        Y = np.array([clusters]).T
    
    # Ajustar el modelo
    model.fit(X, Y, batch_size=1, epochs=1000, verbose=1, callbacks=[early_stop])
    
    # Obtener los pesos
    weights = model.layers[1].weights[0].numpy()
    
    if model_num == '1':
        # Eliminar la dimensión extra
        weights = np.squeeze(weights)

        # Ordenando los pesos
        weights_data = [(interest_vectors.columns[i], data_i) for i, data_i in enumerate(weights)]
        weights_data = sorted(weights_data, key=lambda x: x[1], reverse=True)
    
        return weights_data
    
    elif model_num in ['2']:
        return weights
    
    elif model_num in ['1_1']:
        return weights

# Diseño del sistema

### Idea 1

In [439]:
def model_1(input_shape, compile_dict):
    # Definición de la capa de entrada
    x_in = tf.keras.Input(shape=(input_shape[1]))
    # Definición de la capa de salida (para los pesos)
    x_out = tf.keras.layers.Dense(units=1, activation='sigmoid',
                                  kernel_initializer='he_normal',
                                  kernel_regularizer=tf.keras.regularizers.L1L2(l1=0, l2=1))(x_in)
    
    # Definición del modelo
    model = tf.keras.Model(inputs=x_in, outputs=x_out)
    
    # Compilando el modelo
    model.compile(optimizer=compile_dict['optimizer'], loss=compile_dict['loss'],
                  metrics=compile_dict['metrics'])
    
    return model

In [440]:
# Definición de parámetros
comp_dict = {'optimizer': tf.keras.optimizers.Adam(learning_rate=0.001),
             'loss': 'binary_crossentropy',
             'metrics': ['mse', 'accuracy'],
             'num_clusters': len(np.unique(clusters))}

model = model_1(interest_vectors.shape, comp_dict)
early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=20)
model.summary()

Model: "functional_87"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_45 (InputLayer)        [(None, 4)]               0         
_________________________________________________________________
dense_44 (Dense)             (None, 1)                 5         
Total params: 5
Trainable params: 5
Non-trainable params: 0
_________________________________________________________________


In [441]:
# Definición del x e y
X = interest_vectors.to_numpy()
Y = np.array(clusters == 0, dtype=int)

In [442]:
model.fit(X, Y, batch_size=1, epochs=200, verbose=1, callbacks=[early_stop])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200


Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200


<tensorflow.python.keras.callbacks.History at 0x1e1762f5250>

# Obteniendo los pesos

In [443]:
weights = model.get_weights()[0]
weights = np.squeeze(weights)

In [444]:
# Ordenando los pesos
weights_data = [(interest_vectors.columns[i], data_i) for i, data_i in enumerate(weights)]
weights_data = sorted(weights_data, key=lambda x: x[1], reverse=True)
weights_data

[('OT_theobromine', -0.007650562),
 ('OT_betaine', -0.023614766),
 ('OT_caffeine', -0.091201365),
 ('OT_choline', -0.22879389)]

In [445]:
model.get_weights()

[array([[-0.02361477],
        [-0.09120136],
        [-0.22879389],
        [-0.00765056]], dtype=float32),
 array([2.4168947], dtype=float32)]

In [446]:
weights_list = list()

# Definición de parámetros
comp_dict = {'optimizer': 'adam',
             'loss': 'binary_crossentropy',
             'metrics': ['accuracy'],
             'model_num': '1'}

for cluster in range(len(np.unique(clusters))):
    print(f'Revisando cluster #{cluster}')
    weights_data = get_important_features(interest_vectors, comp_dict, cluster_to_rev=cluster,
                                          model_num=comp_dict['model_num'])
    weights_list.append(weights_data)
    display(weights_data)
    break

Revisando cluster #0
[1 1 1 0 1 1 1 1 0 1 1 0 1 1 0 0 0 0 0 0 1 0 1 1 1 0 0 1 1 1 1 1 1 1 1 0 0
 1 0 1 1 0 0 1 0 0 0 1 1 1 0 0 0 0 1 1 0 1 0 1 1 0 1 1 0 0 0 1 1 0 1 1 0 0
 1 1 1 0 1 0 1 1 1 0 1 1 0 1 1 1 0 1 1 0 1 1 0 1 1 0 1 1 1 1 0 0 1 1 0 1 1
 0 1 1 1 1 0 0 0 1 0 0 1 1 0 1 1 1 0 0 1 1 1 1 1 0 0 0 1 1 1 0 1 1 1 1 0 0
 1 0 0 1 0 1 0 1 1 0 1 1 1 0 0 1 0 0 1 0 0 0 1 0 1 1 1 0 1 1 1 0 1 1 0 1 1
 0 1 1 1 1 1 0 1 1 1 0 0 0 1 1 1 0 1 0 1 1]
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000


[('OT_theobromine', -0.010898626),
 ('OT_betaine', -0.020615062),
 ('OT_caffeine', -0.07968655),
 ('OT_choline', -0.20979962)]

### Idea 2

In [453]:
def model_2(input_shape, compile_dict):
    # Definición de la capa de entrada
    x_in = tf.keras.Input(shape=(input_shape[1]))
    # Definición de la capa de salida (para los pesos)
    x_out = tf.keras.layers.Dense(units=compile_dict['num_clusters'], 
                                  activation='softmax')(x_in)
    
    # Definición del modelo
    model = tf.keras.Model(inputs=x_in, outputs=x_out)
    
    # Compilando el modelo
    model.compile(optimizer=compile_dict['optimizer'], loss=compile_dict['loss'],
                  metrics=compile_dict['metrics'])
    
    return model

In [457]:
weights_list = list()

# Definición de parámetros
comp_dict = {'optimizer': 'adam',
             'loss': 'sparse_categorical_crossentropy',
             'metrics': ['accuracy'],
             'num_clusters': len(np.unique(clusters)),
             'model_num': '2'}


for cluster in range(len(np.unique(clusters))):
    print(f'Revisando cluster #{cluster}')
    weights_data = get_important_features(interest_vectors, comp_dict, cluster_to_rev=cluster,
                                          model_num=comp_dict['model_num'])
    weights_list.append(weights_data)
    display(weights_data)
    break

Revisando cluster #0
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000


[('OT_betaine', nan),
 ('OT_caffeine', nan),
 ('OT_choline', nan),
 ('OT_theobromine', nan)]

In [449]:
weights_data.layers[1].weights[0].numpy().shape

AttributeError: 'numpy.ndarray' object has no attribute 'layers'

### Idea 3

In [385]:
def model_1_1(input_shape, compile_dict):
    # Definición de la capa de entrada
    x_in = tf.keras.Input(shape=(input_shape[1]))
    # Definición de la capa de salida (para los pesos)
    x_out = tf.keras.layers.Dense(units=2, activation='softmax', 
                                  kernel_initializer='he_normal',
                                  kernel_regularizer=tf.keras.regularizers.L1L2(l1=0, l2=1))(x_in)
    
    # Definición del modelo
    model = tf.keras.Model(inputs=x_in, outputs=x_out)
    
    # Compilando el modelo
    model.compile(optimizer=compile_dict['optimizer'], loss=compile_dict['loss'],
                  metrics=compile_dict['metrics'])
    
    return model

In [386]:
weights_list = list()

# Definición de parámetros
comp_dict = {'optimizer': 'adam',
             'loss': 'sparse_categorical_crossentropy',
             'metrics': ['accuracy'],
             'num_clusters': len(np.unique(clusters)),
             'model_num': '1_1'}


for cluster in range(len(np.unique(clusters))):
    print(f'Revisando cluster #{cluster}')
    weights_data = get_important_features(interest_vectors, comp_dict, cluster_to_rev=cluster,
                                          model_num=comp_dict['model_num'])
    weights_list.append(weights_data)
    display(weights_data)
    break

Revisando cluster #0
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000


array([[ 3.9777798e-03, -3.9714319e-03],
       [ 7.2078684e-08, -1.4199226e-08],
       [-8.8992761e-03,  8.9303618e-03],
       [ 1.8144624e-02, -1.8141564e-02],
       [-6.6987635e-03,  6.7009642e-03],
       [ 2.1231249e-03, -1.9584233e-03],
       [ 3.3590609e-01,  2.8413904e-01],
       [ 9.2697311e-03, -9.3598757e-03],
       [ 1.2146984e-03, -2.9937651e-03]], dtype=float32)

In [387]:
a = [(interest_vectors.columns[i], data_i[0]) for i, data_i in enumerate(weights_data)]
a.sort(key=lambda x: x[1], reverse=True)
a

[('V_vit c', 0.3359061),
 ('V_vit b5', 0.018144624),
 ('V_vit h', 0.009269731),
 ('V_vit b1', 0.0039777798),
 ('V_vit b9', 0.002123125),
 ('V_vit k', 0.0012146984),
 ('V_vit b12', 7.2078684e-08),
 ('V_vit b6', -0.0066987635),
 ('V_vit b2', -0.008899276)]

In [352]:
interest_vectors.columns

Index(['V_vit b1', 'V_vit b12', 'V_vit b2', 'V_vit b5', 'V_vit b6', 'V_vit b9',
       'V_vit c', 'V_vit h', 'V_vit k'],
      dtype='object')

# Con sklearn

In [205]:
from sklearn.linear_model import LogisticRegression 

In [421]:
# Definición del x e y
X = interest_vectors.to_numpy()
Y = np.array(clusters == 0, dtype=int)

In [422]:
clf = LogisticRegression(random_state=3, max_iter=2000, verbose=1, solver='liblinear').fit(X, Y)
display(clf.coef_)
display(clf.predict(X) == Y)
display(np.sum(clf.predict(X) == Y) / len(Y))

[LibLinear]

array([[-0.05489158, -0.37704036, -0.95228148, -0.00402614]])

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

1.0

In [423]:
a = [(interest_vectors.columns[i], data_i) for i, data_i in enumerate(clf.coef_[0])]
a.sort(key=lambda x: x[1], reverse=True)
a

[('OT_theobromine', -0.0040261378884168374),
 ('OT_betaine', -0.05489158375462725),
 ('OT_caffeine', -0.3770403575726564),
 ('OT_choline', -0.9522814806524978)]

In [209]:
clf.coef_

array([[-0.05489158, -0.37704036, -0.95228148, -0.00402614]])