# Exploring the Embeddings in the two Datasets

In [3]:
import pandas as pd
import numpy as np
from scipy.spatial import distance

In [4]:
# Load the gzip-compressed CSV of product embeddings; the file's first column
# becomes the row index.
beauty_df = pd.read_csv(
    '../beauty/product_embeddings_openai.csv.gzip',
    index_col=0,
    compression="gzip",
)
beauty_df

Unnamed: 0,global_product_id,name,ada_embedding
0,1504,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,"[-0.008468648418784142, 0.014345130883157253, ..."
1,564,Xtreme Brite Brightening Gel 1oz.,"[0.019681310281157494, 0.009377948939800262, -..."
2,9963,Prada Candy By Prada Eau De Parfum Spray 1.7 O...,"[-0.00300808809697628, -0.007103437092155218, ..."
3,9839,Versace Bright Crystal Eau de Toilette Spray f...,"[0.0053097945638000965, 0.0017624408937990665,..."
4,4132,Stella McCartney Stella,"[-0.006986561696976423, -0.0015255995094776154..."
...,...,...,...
4206,5210,Unknown item,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4381,12017,Unknown item,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4398,122,Unknown item,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
10771,9757,Unknown item,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [5]:
# Drop the placeholder rows named 'Unknown item'. In the displayed frame these
# rows appear to carry all-zero embeddings (TODO confirm), for which cosine
# distance would be undefined.
beauty_df = beauty_df.loc[beauty_df['name'].ne('Unknown item')]
beauty_df

Unnamed: 0,global_product_id,name,ada_embedding
0,1504,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,"[-0.008468648418784142, 0.014345130883157253, ..."
1,564,Xtreme Brite Brightening Gel 1oz.,"[0.019681310281157494, 0.009377948939800262, -..."
2,9963,Prada Candy By Prada Eau De Parfum Spray 1.7 O...,"[-0.00300808809697628, -0.007103437092155218, ..."
3,9839,Versace Bright Crystal Eau de Toilette Spray f...,"[0.0053097945638000965, 0.0017624408937990665,..."
4,4132,Stella McCartney Stella,"[-0.006986561696976423, -0.0015255995094776154..."
...,...,...,...
12096,1982,"Moroccan Argan Oil - For Hair, Face, Skin, an...","[-0.012483133003115654, -0.012812826782464981,..."
12097,8158,LIME CRIME Velvetines - Wicked,"[-0.004164917394518852, -0.02391231246292591, ..."
12098,9571,Dr Song Rosehip Oil 4oz (4 oz),"[-0.0009733201586641371, -0.016524959355592728..."
12099,7057,VITAMIN C SERUM 20% with Hyaluronic Acid For Y...,"[-0.010926627553999424, -0.0015354464994743466..."


In [6]:
# Parse the stringified embedding lists ("[v0, v1, ...]") into a numeric frame:
# one row per product (index preserved), one column per embedding dimension.
#
# - regex=False: '[' and ']' are meant as literal characters. Being explicit
#   avoids depending on the pandas-version-specific default for `regex` (and
#   the FutureWarning older versions emit for single-character patterns).
# - expand=True: split directly into columns instead of `.apply(pd.Series)`,
#   which constructs one Series object per row and is far slower.
beauty_embeddings = (
    beauty_df['ada_embedding']
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
    .str.split(',', expand=True)
    .astype(float)
)
beauty_embeddings.shape

  beauty_embeddings = beauty_df['ada_embedding'].str.\


(12094, 1536)

In [7]:
# Euclidean (L2) norm of every row, i.e. the length of each embedding vector.
beauty_embeddings_lengths = np.linalg.norm(beauty_embeddings.to_numpy(), axis=1)
beauty_embeddings_lengths.shape

(12094,)

In [8]:
# Peek at the first ten norms to see how close they are to 1.
beauty_embeddings_lengths[0:10]

array([0.99999996, 1.00000002, 1.00000002, 0.99999996, 0.99999996,
       1.00000003, 1.00000002, 0.99999998, 1.00000006, 1.00000005])

In [9]:
# Range of the norms across all rows: (smallest, largest).
(np.min(beauty_embeddings_lengths), np.max(beauty_embeddings_lengths))

(0.9999999251546825, 1.0000000719584459)

In [10]:
# Condensed vector of cosine distances between every pair of rows:
# n * (n - 1) / 2 entries for n rows.
beauty_distances = distance.pdist(beauty_embeddings, metric='cosine')
np.shape(beauty_distances)

(73126371,)

In [11]:
# Summary of the pairwise distances, in order:
# mean, median, standard deviation, minimum, 25% quantile, 75% quantile, maximum.
(np.mean(beauty_distances),
 np.median(beauty_distances),
 np.std(beauty_distances),
 np.min(beauty_distances),
 np.quantile(beauty_distances, q=0.25),
 np.quantile(beauty_distances, q=0.75),
 np.max(beauty_distances))


(0.21119035574997344,
 0.21289751468660767,
 0.02887496309299617,
 0.0,
 0.19301701693760365,
 0.23117767982469256,
 0.3430562021997774)

In [12]:
# Expand the condensed pairwise-distance vector into a full square matrix
# (one row and one column per item).
beauty_distance_matrix = distance.squareform(beauty_distances)
np.shape(beauty_distance_matrix)

(12094, 12094)

In [13]:
# For each row, the column indices that put that row's distances in
# ascending order.
indices = beauty_distance_matrix.argsort(axis=1)
indices.shape

(12094, 12094)

In [14]:
# Reorder every row by its argsort indices so distances ascend left to right.
sorted_beauty_distance_matrix = np.take_along_axis(
    beauty_distance_matrix, indices, axis=1
)

In [15]:
# Per-row standard deviation over columns 1-10 of the sorted matrix, i.e. the
# ten smallest distances after skipping column 0 (presumably each item's zero
# distance to itself — confirm).
np.std(sorted_beauty_distance_matrix[:, 1:11], axis=1)

array([0.01825627, 0.03727636, 0.01349309, ..., 0.00285086, 0.0027459 ,
       0.00512516])

In [16]:
# For columns 1-10 of each sorted row (column 0 skipped; presumably the item's
# zero distance to itself — confirm), reduce per row with mean / std / min /
# max, then average each of those per-row statistics over all rows.
tuple(
    row_stat(sorted_beauty_distance_matrix[:, 1:11], axis=1).mean()
    for row_stat in (np.mean, np.std, np.min, np.max)
)

 

(0.09461123791230651,
 0.013677416431106738,
 0.06501645621030769,
 0.10828623959153279)