In [60]:
import pandas as pd 
import numpy as np
from sklearn.manifold import TSNE
from bokeh.io import show, output_notebook, push_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool
output_notebook()

In [62]:
# Load the product table; the steps below read its 'ingredients' column.
df = pd.read_csv('datasets/cosmetic.csv')

# Tokenise each ingredient list: lower-case the text, then split on ', '.
corpus = [text.lower().split(', ') for text in df['ingredients']]

# Lazily flatten the per-product token lists into one stream of tokens.
# This is a one-shot generator: it is exhausted by the next statement.
all_ingre = (token for tokens in corpus for token in tokens)

# dict.fromkeys drops repeated tokens while keeping first-seen order.
unique_ingr = list(dict.fromkeys(all_ingre))

# (ingredient, position) pairs, then the ingredient -> position lookup table.
ingre_tupla = [(name, position) for position, name in enumerate(unique_ingr)]
ingredient_idx = dict(ingre_tupla)

In [64]:
# M = number of rows in df, N = number of entries in ingredient_idx.
M, N = len(df), len(ingredient_idx)
print(M, N)

# M x N matrix of zeros (one row per df row, one column per indexed ingredient).
A = np.zeros(shape=(M, N))

def oh_encoder(tokens):
    """Return the one-hot (multi-hot) vector for a list of ingredient tokens.

    Reads the module-level names ``N`` (vector length) and ``ingredient_idx``
    (token -> position mapping).

    Parameters
    ----------
    tokens : iterable of str
        Ingredient tokens to encode.

    Returns
    -------
    numpy.ndarray
        Array of length ``N`` holding 1 at the position of every token in
        ``tokens`` and 0 elsewhere. Repeated tokens still yield a single 1.

    Raises
    ------
    KeyError
        If a token is not a key of ``ingredient_idx``.
    """
    encoded = np.zeros(N)
    # Resolve every token to its position first, then set them all at once.
    positions = [ingredient_idx[token] for token in tokens]
    encoded[positions] = 1
    return encoded

# Fill A one row at a time: row k receives the encoding of the k-th entry of
# corpus. enumerate supplies the row number, so no hand-maintained counter
# has to be initialised and incremented alongside the loop.
for row_idx, tokens in enumerate(corpus):
    A[row_idx, :] = oh_encoder(tokens)

1270 5931


In [66]:
# Dimension reduction with t-SNE: project the rows of A onto two components.
# random_state is fixed so the estimator is seeded identically on every run.
model = TSNE(
    n_components=2,
    learning_rate=200,
    random_state=42,
)
tsne_features = model.fit_transform(A)

# Transposing gives one row per component; store them as the X / Y columns of df.
df['X'], df['Y'] = tsne_features.T

In [68]:
# Wrap the frame so the glyph and the tooltips can refer to its columns by name.
source = ColumnDataSource(df)

plot = figure(x_axis_label = 'T-SNE 1',
              y_axis_label = 'T-SNE 2',
              width = 500, height = 400)

# One circle marker per row of the source, positioned by the 'X' and 'Y' columns.
# scatter() is used instead of circle(): calling circle() with a screen-space
# `size` is deprecated in recent Bokeh releases, and scatter() with the
# 'circle' marker draws the same glyph.
plot.scatter(
    x = 'X',
    y = 'Y',
    source = source,
    marker = 'circle',
    size = 10, color = '#FF7373', alpha = .8
)

# Hover tooltips; each '@field' is filled from the matching column of the source.
hover = HoverTool(tooltips = [('Item', '@name'),
                              ('Brand', '@brand'),
                              ('Price', '$@price'),
                              ('Rank', '@rank'),
                              ('Ingredients', '@ingredients')])
plot.add_tools(hover)

show(plot)