In [1]:
import os
import sys

# root path
ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Add the project root to the Python path
if ROOT not in sys.path:
    sys.path.append(ROOT)

import polars as pl
import pandas as pd
import numpy as np

# Products similarity

1. Calculate the cosine similarity between each product and all other products
2. Save to file
3. Retain only the top 100 most similar products for each product
4. Save to file

In [2]:
from config import PRODUCTS_PARQUET_PATH_IMPUTED, PROD_SIM_MATRIX_PATH, PROD_TOP_SIMILAR_PATH
from src.data.loaders import PolarsLoader

loader = PolarsLoader(sampling=True, file_type='parquet', n_sample=1000)

In [3]:
prods = loader.load_data(PRODUCTS_PARQUET_PATH_IMPUTED)
print(prods.shape)
print(prods.head())
print(prods.null_count())

(1000, 6)
shape: (5, 6)
┌──────────┬─────────────────────────────────┬────────────┬──────────┬─────────────┬────────┐
│ discount ┆ embedding                       ┆ partnumber ┆ color_id ┆ cod_section ┆ family │
│ ---      ┆ ---                             ┆ ---        ┆ ---      ┆ ---         ┆ ---    │
│ u8       ┆ list[f32]                       ┆ u16        ┆ u16      ┆ u8          ┆ u16    │
╞══════════╪═════════════════════════════════╪════════════╪══════════╪═════════════╪════════╡
│ 0        ┆ [-0.134014, -0.120043, … -0.08… ┆ 32776      ┆ 85       ┆ 4           ┆ 73     │
│ 0        ┆ [-0.094927, -0.107294, … -0.04… ┆ 41431      ┆ 135      ┆ 4           ┆ 73     │
│ 0        ┆ [-0.129044, -0.077246, … 0.002… ┆ 39419      ┆ 339      ┆ 4           ┆ 73     │
│ 1        ┆ [-0.127833, -0.133868, … -0.09… ┆ 36087      ┆ 135      ┆ 4           ┆ 73     │
│ 1        ┆ [-0.140929, -0.125828, … -0.04… ┆ 34132      ┆ 3        ┆ 4           ┆ 73     │
└──────────┴────────────────────────

Caclulate the cosine similarity between the product vectors.

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

# Extract the embeddings as a NumPy array
embeddings = np.stack(prods['embedding'].to_numpy())

# Compute the cosine similarity matrix
cosine_sim_matrix = cosine_similarity(embeddings)

In [5]:
# Convert to a DataFrame for easier inspection
cosine_sim_df = pd.DataFrame(cosine_sim_matrix, index=prods['partnumber'], columns=prods['partnumber'])
cosine_sim_df.to_parquet(PROD_SIM_MATRIX_PATH)

Select only the top 100 similar products for each product

In [6]:
cosine_sim_matrix

array([[1.0000002 , 0.8517331 , 0.9556012 , ..., 0.4215176 , 0.36861256,
        0.38999045],
       [0.8517331 , 1.0000001 , 0.85641503, ..., 0.37317014, 0.26663315,
        0.31397867],
       [0.9556012 , 0.85641503, 1.0000001 , ..., 0.41421548, 0.3696135 ,
        0.39984655],
       ...,
       [0.4215176 , 0.37317014, 0.41421548, ..., 1.        , 0.558054  ,
        0.7487153 ],
       [0.36861256, 0.26663315, 0.3696135 , ..., 0.558054  , 0.99999964,
        0.79222757],
       [0.38999045, 0.31397867, 0.39984655, ..., 0.7487153 , 0.79222757,
        0.99999994]], shape=(1000, 1000), dtype=float32)

In [7]:
# Create the DataFrame with partnumber and top 100 similar products
top_k = 100  # Number of top similar products to retain
partnumbers = prods['partnumber']

In [30]:
# Initialize a list to store the results
top_similar_products = []

for idx, partnumber in enumerate(partnumbers):
    # Get the similarity scores for the current product
    similarities = cosine_sim_matrix[idx]
    
    # Get indices of the top 100 similar products (excluding itself)
    top_indices = np.argsort(similarities)[::-1][1:top_k + 1]
    
    # Get the corresponding partnumbers for the top similar products
    top_similar = partnumbers[top_indices]
    
    # Append to the results
    top_similar_products.append({'partnumber': partnumber, 'top_100_cos_partnumber': top_similar})

# Convert the list of dictionaries to a DataFrame
top_similar_df = pd.DataFrame(top_similar_products)
top_similar_df

Unnamed: 0,partnumber,top_100_cos_partnumber
0,32776,"[39419, 35047, 42796, 40304, 39420, 34813, 392..."
1,41431,"[33920, 33720, 38988, 41664, 38744, 43452, 378..."
2,39419,"[32776, 34813, 36957, 40304, 36089, 42796, 398..."
3,36087,"[41664, 34132, 40306, 40944, 38988, 42127, 434..."
4,34132,"[37825, 36087, 41664, 35244, 43452, 40944, 407..."
...,...,...
995,41689,"[43035, 38757, 33923, 38290, 34819, 40722, 423..."
996,41690,"[36966, 39210, 42823, 36310, 40494, 41670, 343..."
997,33511,"[41672, 36562, 33033, 40066, 42354, 36745, 369..."
998,33936,"[40078, 35259, 42146, 35471, 35884, 34831, 432..."


In [31]:
# Initialize a dictionary to store the results
top_similar_products = {}

for idx, partnumber in enumerate(partnumbers):
    # Get the similarity scores for the current product
    similarities = cosine_sim_matrix[idx]
    
    # Get indices of the top 100 similar products (excluding itself)
    top_indices = np.argsort(similarities)[::-1][1:top_k + 1]
    
    # Get the corresponding partnumbers for the top similar products
    top_similar = partnumbers[top_indices]
    
    # Add to the dictionary
    top_similar_products[partnumber] = top_similar

top_similar_products

{32776: shape: (100,)
 Series: 'partnumber' [u16]
 [
 	39419
 	35047
 	42796
 	40304
 	39420
 	…
 	35456
 	39859
 	33735
 	42590
 	33504
 ],
 41431: shape: (100,)
 Series: 'partnumber' [u16]
 [
 	33920
 	33720
 	38988
 	41664
 	38744
 	…
 	37630
 	38073
 	42819
 	33927
 	38766
 ],
 39419: shape: (100,)
 Series: 'partnumber' [u16]
 [
 	32776
 	34813
 	36957
 	40304
 	36089
 	…
 	41192
 	33735
 	42590
 	38516
 	42816
 ],
 36087: shape: (100,)
 Series: 'partnumber' [u16]
 [
 	41664
 	34132
 	40306
 	40944
 	38988
 	…
 	33504
 	37641
 	35888
 	33930
 	38511
 ],
 34132: shape: (100,)
 Series: 'partnumber' [u16]
 [
 	37825
 	36087
 	41664
 	35244
 	43452
 	…
 	34603
 	35875
 	38516
 	41192
 	38511
 ],
 40303: shape: (100,)
 Series: 'partnumber' [u16]
 [
 	40944
 	33720
 	33921
 	41431
 	38744
 	…
 	41192
 	41200
 	42816
 	43029
 	40726
 ],
 34133: shape: (100,)
 Series: 'partnumber' [u16]
 [
 	37825
 	33718
 	42573
 	33921
 	33030
 	…
 	33504
 	35875
 	43230
 	38516
 	38517
 ],
 33718: shape

In [32]:
pl.DataFrame(top_similar_products)

TypeError: argument 'name': 'int' object cannot be converted to 'PyString'