<a href="https://colab.research.google.com/github/dudaholandah/sibgrapi2022_wuw/blob/main/sibgrapi_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
!apt-get install poppler-utils
!pip install pdf2image
!pip install unidecode
!pip install umap-learn[plot]
!pip install trimap
!pip install -U kaleido
!pip install scikit-learn-extra

In [2]:
import pandas as pd
import numpy as np
from unidecode import unidecode
from sklearn import preprocessing
from pandas.core.frame import DataFrame
import plotly.express as px
from sklearn.manifold import TSNE
import re
import umap
import trimap
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.manifold import Isomap
from sklearn.manifold import trustworthiness
import plotly.graph_objects as go
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans
import plotly
import matplotlib.pyplot as plt
import plotly.io as pio
import pdf2image
import os
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn_extra.cluster import KMedoids

# Pre-Processing Data

In [4]:
data = pd.read_excel("vegan_dataset.xlsx", sheet_name='Veganos')

In [5]:
data.head()

Unnamed: 0,Classification,Description,Ingredients,Serving Size,Kcal,Carbohydrate,Sugars,Proteins,Total Fats,Saturated Fats,Dietary Fiber,Sodium,B12,Calcium,Zinc
0,MEAT 2,Carne Moída do Futuro,"water, texturized soy protein, pea protein, ch...",80.0,168.0,11.0,0.0,8.2,10.0,9.0,0.7,392.0,0.0,0.0,0.0
1,MEAT 3,Carne Moída do Futuro,"water, texturized soy protein, isolated soy pr...",80.0,168.0,11.0,0.0,8.2,10.0,9.0,0.7,392.0,0.0,0.0,0.0
2,POULTRY 3,Frango do Futuro,"water, texturized soy protein, isolated soy pr...",80.0,168.0,11.0,0.0,8.2,10.0,9.0,0.7,392.0,0.0,0.0,0.0
3,PORK 1,Linguiça do Futuro,"water, texturized soy protein, isolated soy pr...",50.0,139.0,7.7,0.0,5.7,9.5,3.9,0.2,294.0,0.0,0.0,0.0
4,MEAT 1,Hamburguer de Soja Goshen,"texturized soy protein, water, soy oil, modif...",50.0,118.0,2.8,0.0,7.0,9.0,1.3,2.0,344.0,0.0,0.0,0.0


### Separating data

In [6]:
data_ingredients = pd.DataFrame(data['Ingredients'])
data_nutrients = data.drop(columns=['Ingredients', 'Classification', "Description"])
data_clasification = pd.DataFrame(data['Classification'])
data_name = pd.DataFrame(data["Description"])
label_data = data_clasification.join(data_name)

In [7]:
data_nutrients.head()

Unnamed: 0,Serving Size,Kcal,Carbohydrate,Sugars,Proteins,Total Fats,Saturated Fats,Dietary Fiber,Sodium,B12,Calcium,Zinc
0,80.0,168.0,11.0,0.0,8.2,10.0,9.0,0.7,392.0,0.0,0.0,0.0
1,80.0,168.0,11.0,0.0,8.2,10.0,9.0,0.7,392.0,0.0,0.0,0.0
2,80.0,168.0,11.0,0.0,8.2,10.0,9.0,0.7,392.0,0.0,0.0,0.0
3,50.0,139.0,7.7,0.0,5.7,9.5,3.9,0.2,294.0,0.0,0.0,0.0
4,50.0,118.0,2.8,0.0,7.0,9.0,1.3,2.0,344.0,0.0,0.0,0.0


### Normalizing Nutrient Values (MinMax Scaler) 

In [8]:
X = data_nutrients.values
print(X)

[[ 80.  168.   11.  ...   0.    0.    0. ]
 [ 80.  168.   11.  ...   0.    0.    0. ]
 [ 80.  168.   11.  ...   0.    0.    0. ]
 ...
 [ 40.   84.    5.2 ...   0.    0.    0. ]
 [ 10.   25.    3.3 ...   0.    0.    0. ]
 [  7.   28.    1.2 ...   0.    0.    0. ]]


In [9]:
attributes_dummies = data_nutrients.columns
normalize = preprocessing.MinMaxScaler()
xscaled = normalize.fit_transform(X)

nutrients_normalized = pd.DataFrame(xscaled,columns=attributes_dummies)
nutrients_normalized = nutrients_normalized.replace(np.nan,0)

nutrients_normalized.head()

Unnamed: 0,Serving Size,Kcal,Carbohydrate,Sugars,Proteins,Total Fats,Saturated Fats,Dietary Fiber,Sodium,B12,Calcium,Zinc
0,0.212828,0.263815,0.244444,0.0,0.195238,0.204082,0.48913,0.007447,0.414376,0.0,0.0,0.0
1,0.212828,0.263815,0.244444,0.0,0.195238,0.204082,0.48913,0.007447,0.414376,0.0,0.0,0.0
2,0.212828,0.263815,0.244444,0.0,0.195238,0.204082,0.48913,0.007447,0.414376,0.0,0.0,0.0
3,0.125364,0.212121,0.171111,0.0,0.135714,0.193878,0.211957,0.002128,0.310782,0.0,0.0,0.0
4,0.125364,0.174688,0.062222,0.0,0.166667,0.183673,0.070652,0.021277,0.363636,0.0,0.0,0.0


### Normalizing Ingredients (One Hot Encode)

In [10]:
def pre_process(text):
    text = re.sub(r'[.,():%-]+', " ", text)
    text = re.sub(r'[\s]+', " ", text)
    text = unidecode(text.strip().lower())
    return text

In [None]:
vocab = set()

for i in range(data_ingredients.shape[0]):
  aux = data_ingredients.at[i,'Ingredients'].split(",")
  for word in aux:
    word = pre_process(word)
    if len(word) > 1: vocab.add(word.strip())

sorted(vocab)

In [12]:
ingredients_normalized = pd.DataFrame()

for i in range(276):
  aux = data_ingredients.at[i,'Ingredients'].split(",")
  for word in aux:
    word = pre_process(word)
    ingredients_normalized.at[i,word] = 1

ingredients_normalized = ingredients_normalized.replace(np.nan,0)
ingredients_normalized.head()

Unnamed: 0,water,texturized soy protein,pea protein,chickpea flour,vegetal fat,modified starch,onion,meat flavored condiment,salt,sugar,...,tricalcium phosphatxanthan gum,phosphoric acid,aquafaba,potassium sorbate disodium calcium,avocado,potassium chloride,isolated vegetable protein,soy,pea starch,isolated rice protein
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Preparing data

In [13]:
raw_data = pd.DataFrame(ingredients_normalized)
raw_data = pd.concat([raw_data,nutrients_normalized], axis=1)
raw_data = raw_data.replace(np.nan,0)

raw_data.head()

Unnamed: 0,water,texturized soy protein,pea protein,chickpea flour,vegetal fat,modified starch,onion,meat flavored condiment,salt,sugar,...,Carbohydrate,Sugars,Proteins,Total Fats,Saturated Fats,Dietary Fiber,Sodium,B12,Calcium,Zinc
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.244444,0.0,0.195238,0.204082,0.48913,0.007447,0.414376,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.244444,0.0,0.195238,0.204082,0.48913,0.007447,0.414376,0.0,0.0,0.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.244444,0.0,0.195238,0.204082,0.48913,0.007447,0.414376,0.0,0.0,0.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.171111,0.0,0.135714,0.193878,0.211957,0.002128,0.310782,0.0,0.0,0.0
4,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.062222,0.0,0.166667,0.183673,0.070652,0.021277,0.363636,0.0,0.0,0.0


In [14]:
X = raw_data.values
X_ingredients = ingredients_normalized.values
X_nutrients = nutrients_normalized.values
y = label_data.values
all_data = label_data.join(raw_data)

for i in range(276):
  label = label_data['Classification'][i].rstrip()
  label_data.at[i,'Classification'] = label
  all_data.at[i,'Classification'] = label

print(all_data['Classification'].value_counts())

DAIRY 1      80
DAIRY 3      45
MEAT 1       34
MEAT 3       18
POULTRY 1    14
EGG 1        14
POULTRY 3    13
PORK 1       12
DAIRY 2      11
MEAT 2        9
FISH 1        8
PORK 2        6
POULTRY 2     5
FISH 2        5
EGG 2         2
Name: Classification, dtype: int64


# Visualization Techniques

In [15]:
for i in range(276):
  aux = data_ingredients.at[i,'Ingredients']
  data_ingredients.at[i,'Ingredients'] = re.sub(',\s*',"<br>",aux).lstrip()

In [16]:
if not os.path.exists("imgs"):
    os.mkdir("imgs")

## t-SNE

In [17]:
tsne = TSNE(n_components=2,perplexity=5,learning_rate=350,metric='euclidean', init='pca')
X_tsne = tsne.fit_transform(X)

X_tsne[1:4, :]
all_data['x'] = X_tsne[:,0]
all_data['y'] = X_tsne[:,1]
all_data['Ingredients'] = data_ingredients

fig = px.scatter(
    all_data, 
    x='x', 
    y='y', 
    color='Classification', 
    template="simple_white",
    labels={ 
      "Classification": "Classification (Label)"
    },
    color_discrete_sequence= px.colors.qualitative.Plotly + px.colors.qualitative.Bold,
    hover_name="Description",
    hover_data={'x':False,
                'y':False,  
                'Ingredients':True},
    width=700)

fig.update_traces(showlegend=False,
                  marker=dict(size=8,),
                              # line=dict(width=1,
                              #           color='DarkSlateGrey')),
                  selector=dict(mode='markers'))

fig.update_layout(xaxis={'visible': False},
                  yaxis={'visible': False},
                  margin=dict(l=0,r=0,b=0,t=0))


plt.savefig('imgs/TSNE_ALL.eps', format='eps')
fig.show()

<Figure size 432x288 with 0 Axes>

## UMAP

In [18]:
reducer = umap.UMAP(n_components=2,n_neighbors=10, min_dist=0.1,metric='euclidean', random_state=0)
X_umap = reducer.fit_transform(X)

X_umap[1:4, :]
all_data['x'] = X_umap[:,0]
all_data['y'] = X_umap[:,1]

fig = px.scatter(
    all_data, 
    x='x', 
    y='y', 
    color='Classification', 
    template="simple_white",
    labels={ 
      "Classification": "Classification (Label)"
    },
    color_discrete_sequence= px.colors.qualitative.Plotly + px.colors.qualitative.Bold,
    hover_name="Description",
    hover_data={'x':False,
                'y':False,  
                'Ingredients':True}, 
    width=800)

fig.update_traces(marker=dict(size=8),
                  selector=dict(mode='markers'))

fig.update_layout(xaxis={'visible': False},
                  yaxis={'visible': False},
                  legend=dict(font=dict(size=16)),
                  margin=dict(l=0,r=0,b=0,t=0))

plt.savefig('imgs/UMAP_ALL.eps', format='eps')
fig.show()

<Figure size 432x288 with 0 Axes>

## PCA

In [33]:
sklearn_pca = sklearnPCA(n_components=2)
X_pca = sklearn_pca.fit_transform(X)

X_pca[1:4, :]
all_data['x'] = X_pca[:,0]
all_data['y'] = X_pca[:,1]

fig = px.scatter(
    all_data, 
    x='x', 
    y='y', 
    color='Classification', 
    template="simple_white",
    labels={ 
      "Classification": "Classification (Label)"
    },
    color_discrete_sequence= px.colors.qualitative.Plotly + px.colors.qualitative.Bold,
    hover_name="Description",
    hover_data={'x':False,
                'y':False,  
                # 'Kcal ': [f' {x}' for x in data['Kcal']],
                # 'Serving Size ': [f' {y}' for y in data['Serving Size']],
                # 'Carbohydrate ': [f' {z}' for z in data['Carbohydrate']],
                # 'Sugars ': [f' {a}' for a in data['Sugars']],
                # 'Proteins ': [f' {b}' for b in data['Proteins']],
                'Ingredients':True}, 
    width=700)

fig.update_traces(showlegend=False,
                  marker=dict(size=8),
                  selector=dict(mode='markers'))

fig.update_layout(xaxis={'visible': False},
                  yaxis={'visible': False},
                  margin=dict(l=0,r=0,b=0,t=0))

plt.savefig('imgs/PCA_ALL.eps', format='eps')
fig.show()

<Figure size 432x288 with 0 Axes>

## TRIMAP

In [20]:
X_trimap = trimap.TRIMAP(n_dims=2, n_inliers=10).fit_transform(X)

X_trimap[1:4, :]
all_data['x'] = X_trimap[:,0]
all_data['y'] = X_trimap[:,1]
all_data['Ingredients'] = data_ingredients

fig = px.scatter(
    all_data, 
    x='x', 
    y='y', 
    color='Classification', 
    template="simple_white",
    labels={ 
      "Classification": "Classification (Label)"
    },
    color_discrete_sequence= px.colors.qualitative.Plotly + px.colors.qualitative.Bold,
    hover_name="Description",
    hover_data={'x':False,
                'y':False,  
                # 'Kcal ': [f' {x}' for x in data['Kcal']],
                # 'Serving Size ': [f' {y}' for y in data['Serving Size']],
                # 'Carbohydrate ': [f' {z}' for z in data['Carbohydrate']],
                # 'Sugars ': [f' {a}' for a in data['Sugars']],
                # 'Proteins ': [f' {b}' for b in data['Proteins']],
                'Ingredients':True
                }, 
    width=800)

fig.update_traces(marker=dict(size=8),
                  selector=dict(mode='markers'))

fig.update_layout(xaxis={'visible': False},
                  yaxis={'visible': False},
                  legend=dict(font=dict(size=16)),
                  margin=dict(l=0,r=0,b=0,t=0))

plt.savefig('imgs/TRIMAP_ALL.eps', format='eps')
fig.show()

<Figure size 432x288 with 0 Axes>

# Evaluation metrics

## Trustworthiness

In [34]:
kneigh = []
y_tsne = []
y_umap = []
y_pca = []
y_trimap = []

for k in range(1,51):
  kneigh.append(k)

  precision = trustworthiness(X, X_tsne, n_neighbors=k, metric='euclidean')
  y_tsne.append(precision)

  precision = trustworthiness(X, X_umap, n_neighbors=k, metric='euclidean')
  y_umap.append(precision)

  precision = trustworthiness(X, X_pca, n_neighbors=k, metric='euclidean')
  y_pca.append(precision)

  precision = trustworthiness(X, X_trimap, n_neighbors=k, metric='euclidean')
  y_trimap.append(precision)

In [35]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=kneigh, y=y_tsne,
                    mode='lines+markers',
                    name='t-SNE',
                    line=dict(color=px.colors.qualitative.Plotly[0])))

fig.add_trace(go.Scatter(x=kneigh, y=y_umap,
                    mode='lines+markers',
                    name='UMAP',
                    line=dict(color=px.colors.qualitative.Plotly[1])))

fig.add_trace(go.Scatter(x=kneigh, y=y_pca,
                    mode='lines+markers',
                    name='PCA',
                    line=dict(color=px.colors.qualitative.Plotly[2])))

fig.add_trace(go.Scatter(x=kneigh, y=y_trimap,
                    mode='lines+markers',
                    name='TRIMAP',
                    line=dict(color=px.colors.qualitative.Plotly[3])))

fig.update_layout(width=1000, template="simple_white",
                  xaxis_title="K (number of neighbors)",
                  yaxis_title="Trustworthiness",
                  font=dict(size=18),
                  margin=dict(l=0,r=0,b=0,t=0))

fig.show()
fig. write_image('imgs/neighborhoodpres.png')

## Silhouette Coefficient

In [36]:
def coeficiente_silhueta(X_high,X_low):
  silhouette_avg_high = []
  silhouette_avg_low = []

  for k in range(3,20):
    clusterer = KMedoids(n_clusters=k, init='k-medoids++',random_state=10)
    cluster_labels_high = clusterer.fit_predict(X_high)
    cluster_labels_low = clusterer.fit_predict(X_low)
    silhouette_avg_high.append(silhouette_score(X_high, cluster_labels_high))
    silhouette_avg_low.append(silhouette_score(X_low, cluster_labels_low))

  mean_silhouete_high = np.mean(np.array(silhouette_avg_high))
  mean_silhouete_low = np.mean(np.array(silhouette_avg_low))
  std_silhouete_high = np.std(np.array(silhouette_avg_high))
  std_silhouete_low = np.std(np.array(silhouette_avg_low))
    
  return mean_silhouete_high, mean_silhouete_low, std_silhouete_high, std_silhouete_low

t-SNE

In [37]:
mean_high,mean_low,std_high,std_low = coeficiente_silhueta(X,X_tsne)

print("TSNE")
print(f"Média Silhueta Original: {mean_high} Desvio Padrão Original: {std_high}")
print(f"Média Silhueta Visualização: {mean_low:.10f} Desvio Padrão Visualização: {std_low:.10f}\n")

TSNE
Média Silhueta Original: 0.0860648712739905 Desvio Padrão Original: 0.02080142865863332
Média Silhueta Visualização: 0.4358619452 Desvio Padrão Visualização: 0.0584011637



UMAP

In [38]:
mean_high,mean_low,std_high,std_low = coeficiente_silhueta(X,X_umap)

print("UMAP")
print(f"Média Silhueta Original: {mean_high} Desvio Padrão Original: {std_high}")
print(f"Média Silhueta Visualização: {mean_low:.10f} Desvio Padrão Visualização: {std_low:.10f}\n")

UMAP
Média Silhueta Original: 0.0860648712739905 Desvio Padrão Original: 0.02080142865863332
Média Silhueta Visualização: 0.5213357210 Desvio Padrão Visualização: 0.0776655748



PCA

In [39]:
mean_high,mean_low,std_high,std_low = coeficiente_silhueta(X,X_pca)

print("PCA")
print(f"Média Silhueta Original: {mean_high} Desvio Padrão Original: {std_high}")
print(f"Média Silhueta Visualização: {mean_low:.10f} Desvio Padrão Visualização: {std_low:.10f}\n")

PCA
Média Silhueta Original: 0.0860648712739905 Desvio Padrão Original: 0.02080142865863332
Média Silhueta Visualização: 0.4175589453 Desvio Padrão Visualização: 0.0468529850



TRIMAP

In [40]:
mean_high,mean_low,std_high,std_low = coeficiente_silhueta(X,X_trimap)

print("TriMap")
print(f"Média Silhueta Original: {mean_high} Desvio Padrão Original: {std_high}")
print(f"Média Silhueta Visualização: {mean_low:.10f} Desvio Padrão Visualização: {std_low:.10f}\n")

TriMap
Média Silhueta Original: 0.0860648712739905 Desvio Padrão Original: 0.02080142865863332
Média Silhueta Visualização: 0.5328438878 Desvio Padrão Visualização: 0.0486698411

