In [23]:
import pandas as pd
import numpy as np
import re
from sklearn.manifold import TSNE
from bokeh.io import show, output_notebook, push_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool
import random
# Configure Bokeh so that subsequent show() calls render inline in the notebook.
output_notebook()

In [16]:
# Read the cosmetics table from the local datasets folder.
df = pd.read_csv('datasets/cosmetic.csv')

# Remove any run of periods that terminates an ingredient list, so the last
# ingredient does not carry stray punctuation.
df['ingredients'] = df['ingredients'].str.replace(pat=r'\.+$', repl='', regex=True)

# Expose the row index as a regular column.
df['id'] = df.index

# Number of ingredients per product: lower-case the list, split it on a comma
# followed by whitespace, and count the resulting pieces.
df['count'] = [
    len(re.split(r'\s*,\s+', text.lower()))
    for text in df['ingredients']
]

In [29]:
# Tokenise every product: one list of lower-cased ingredient names per row.
corpus = [
    re.split(r'\s*,\s+', text.lower())
    for text in df['ingredients']
]

# Lazily flatten the per-product token lists into one stream of ingredient names.
# This is a generator, so it is exhausted once the vocabulary below is built.
all_ingre = (token for tokens in corpus for token in tokens)

# De-duplicate while keeping first-seen order (dict keys preserve insertion order).
unique_ingr = list(dict.fromkeys(all_ingre))

# (ingredient name, column position) pairs, and the lookup table built from them.
ingre_tupla = list(zip(unique_ingr, range(len(unique_ingr))))
ingredient_idx = dict(ingre_tupla)

# Matrix dimensions: one row per product, one column per distinct ingredient.
M = df.shape[0]
N = len(unique_ingr)

# Product-by-ingredient matrix, initially all zeros.
A = np.zeros(shape=(M, N))

def oh_encoder(tokens, index=None):
    """Encode a list of ingredient names as a binary indicator vector.

    Parameters
    ----------
    tokens : iterable of str
        Ingredient names to mark as present. Repeated names are harmless.
    index : mapping of str -> int, optional
        Lookup from ingredient name to vector position. When omitted, the
        notebook-level ``ingredient_idx`` is used and the vector has length
        ``N``; when given, the vector has length ``len(index)``, so its values
        are expected to be positions in ``range(len(index))``.

    Returns
    -------
    numpy.ndarray
        1-D float array holding 1 at the position of every ingredient in
        ``tokens`` and 0 elsewhere.

    Raises
    ------
    KeyError
        If a token is missing from a dict-like ``index``.
    """
    if index is None:
        # Default path: fall back to the vocabulary built earlier in the notebook.
        index = ingredient_idx
        width = N
    else:
        width = len(index)

    x = np.zeros(width)
    for ingredient in tokens:
        x[index[ingredient]] = 1
    return x

# Fill the product-by-ingredient matrix one row per product. enumerate supplies
# the row number, so there is no separate counter to initialise and increment.
for i, tokens in enumerate(corpus):
    A[i, :] = oh_encoder(tokens)

# Reduce the matrix to two dimensions with t-SNE; random_state is fixed so the
# embedding is repeatable between runs.
model = TSNE(n_components=2, learning_rate=200, random_state=42)
tsne_features = model.fit_transform(A)

# Store the two embedding coordinates alongside the product data for plotting.
df['x'] = tsne_features[:, 0]
df['y'] = tsne_features[:, 1]

In [35]:

# Data for the plot: only the rows whose ingredient count is below 2.
# NOTE(review): this restricts the chart to products with a single parsed
# ingredient -- confirm the filter is intended and not leftover exploration.
source = ColumnDataSource(df[df['count']< 2])

plot = figure(x_axis_label = 'T-SNE 1', 
              y_axis_label = 'T-SNE 2', 
              width = 500, height = 400)

# scatter() with an explicit circle marker draws screen-sized circles.
# figure.circle(size=...) is deprecated in recent Bokeh releases in favour of
# this call, which renders the same glyph.
plot.scatter(
    x = 'x', 
    y = 'y', 
    source = source, 
    marker = 'circle',
    size = 10, color = '#FF7373', alpha = .8
)

# Tooltip shown on hover; each '@field' is read from the matching column of
# `source` (assumes the CSV provides name, brand and price columns -- TODO confirm).
hover = HoverTool(tooltips = [('Item', '@name'),
                              ('Brand', '@brand'),
                              ('Price', '$@price'),
                              ('C', '@count'), ('id', '@id')])
plot.add_tools(hover)

show(plot)