# Finding similar products (shoes) using vector-based similarity search in Db2



General flow:
- Setup, including Db2 database connection and creating a table
- Generate fake product data
- Generate vector embeddings for key features using a local ollama service
- Add new [vector-based](https://www.ibm.com/docs/en/db2/12.1.0?topic=list-vector-values) embedding column to table, insert data
- Perform some queries utilizing [vector distance search](https://www.ibm.com/docs/en/db2/12.1.0?topic=functions-vector-distance) for semantic product recommendation (what other products are similar?)
- Cleanup



In [1]:
# Load the required modules
import pandas as pd
import os, csv
import random
from dotenv import load_dotenv
import numpy as np
import ollama
%load_ext sql

from IPython.core.magic import register_cell_magic
from IPython import get_ipython

# Cell magic: run the cell body unless the condition on the magic line holds
@register_cell_magic
def skip_if(line, cell):
    """Execute ``cell`` only when the expression in ``line`` is falsy.

    Used as ``%%skip_if <expression>`` on the first line of a cell, for
    example ``%%skip_if $IMPORT_DATA``.

    Args:
        line: Text following ``%%skip_if``; evaluated as a Python expression.
        cell: Source of the remaining cell, run through the active IPython
            shell when the expression is falsy.

    Returns:
        None in both cases.
    """
    # NOTE: eval() executes arbitrary code; only use this magic with
    # conditions written by the notebook author.
    should_skip = eval(line)
    if not should_skip:
        get_ipython().run_cell(cell)

In [2]:
# Configure the SQL magic
%config SqlMagic.dsn_filename = '.db2conn'
%config SqlMagic.displaylimit = 20
%config SqlMagic.named_parameters="enabled"

# load more settings from .env
load_dotenv(os.getcwd()+"/.env", override=True)

# variables to define if we generate or import all data, export the data, keep the table, which embedding mode to use
IMPORT_DATA=os.getenv('IMPORT_DATA')
EXPORT_DATA=os.getenv('EXPORT_DATA')
KEEP_DATA=os.getenv('KEEP_DATA')
EMBEDDING_MODEL=os.getenv('EMBEDDING_MODEL')

## Connect to Db2 database
Check the file `.db2conn` for the configuration

In [3]:
# Connect using the section named "db2" of the configured DSN file (.db2conn),
# then list the open connections to confirm that the connection is active
%sql --section db2
%sql --connections

current,url,alias
*,db2://db2inst1:***@localhost:50000/testdb,db2


# Setting up a Shoes Table in Db2

In [4]:
# Drop the table if it exists, so that the notebook can be re-run from the top
%sql DROP TABLE IF EXISTS SHOES
# Create the table with the plain product attributes only;
# the EMBEDDING vector column is added later with ALTER TABLE,
# once the dimension of the generated embeddings is known
sql="""
    CREATE TABLE IF NOT EXISTS SHOES (
        SKU VARCHAR(8),
        PRODUCT_NAME VARCHAR(40),
        BRAND VARCHAR(20),
        CLASS VARCHAR(5),
        S_TYPE VARCHAR(7),
        MATERIAL VARCHAR(20),
        COLOR VARCHAR(10),
        WEATHER_RESISTANCE VARCHAR(10),
        ARCH_SUPPORT VARCHAR(4),
        SIZE FLOAT,
        PRICE FLOAT,
        RATING FLOAT,
        STORE_ID BIGINT,
        CITY VARCHAR(40)
    );
    """

# Execute the statement held in the Python variable `sql`
%sql {{sql}}

In [5]:
%%skip_if $IMPORT_DATA
# Value pools for the synthetic shoe catalogue
cities = ["Frankfurt", "Berlin", "Munich", "Hamburg"]

# One entry per product attribute; the generator picks a random value from each
brands = ['Zentrax', 'FootFlex', 'StrideOne', 'Loopic', 'RunXpress', 'ComfRun']
types = ['Running', 'Walking', 'Trail']
classes = ['Men', 'Women']
materials = ['Synthetic', 'Knit', 'Leather']
colors = ['Black', 'White']
arch_supports = ['High', 'Flat']
weather_resistances = ['Waterproof', 'Resistant']
# Whole sizes 6..12 first, followed by the half sizes 6.5..12.5
sizes = list(range(6, 13)) + [whole + 0.5 for whole in range(6, 13)]
# Store numbers 1..20
store_ids = range(1, 21)

# Helper: build a fake product name of the form "<brand> <series> <shoe type>"
def create_product_name(brand, shoe_type):
    """Return a product name made of the brand, a random series label and the shoe type.

    Args:
        brand: Brand name placed first.
        shoe_type: Shoe type placed last.

    Returns:
        str: The three parts separated by single spaces.
    """
    series = random.choice(['Ultra', 'Flex', 'Pro', 'X', 'Max'])
    return "{} {} {}".format(brand, series, shoe_type)

# Helper: build a comma-separated keyword string for a shoe
def generate_keywords(shoe_type, material):
    """Return keywords for a shoe as one ", "-separated string.

    The lower-cased shoe type and material come first, followed by three
    distinct descriptive words drawn at random.

    Args:
        shoe_type: Shoe type; lower-cased for the first keyword.
        material: Material; lower-cased for the second keyword.

    Returns:
        str: Five keywords joined by ", ".
    """
    extras = random.sample(['lightweight', 'durable', 'breathable', 'cushioned', 'supportive', 'flexible'], 3)
    return ', '.join([shoe_type.lower(), material.lower(), *extras])

def generate_shoe_data(n=500, seed=None):
    """Generate ``n`` random shoe records and return them as a DataFrame.

    Every record gets a randomly chosen brand, type, class, material, size,
    color, arch support, weather resistance, store id and city (taken from the
    value lists defined above), a random price and rating, a product name from
    ``create_product_name`` and a SKU of the form ``<first 3 brand letters,
    upper-cased>-<number 1000..9999>`` that is unique within the result.

    Args:
        n: Number of records to generate.
        seed: Optional seed for the global ``random`` generator. With a seed,
            repeated calls produce the same data, which makes notebook runs
            reproducible. ``None`` (the default) leaves the generator state
            untouched.

    Returns:
        pandas.DataFrame: One row per shoe with the columns SKU, PRODUCT_NAME,
        BRAND, CLASS, S_TYPE, MATERIAL, COLOR, WEATHER_RESISTANCE,
        ARCH_SUPPORT, SIZE, PRICE, RATING, STORE_ID and CITY.

    Raises:
        RuntimeError: If every SKU number for a brand prefix is already in
            use, instead of retrying forever.
    """
    sku_low, sku_high = 1000, 9999
    skus_per_prefix = sku_high - sku_low + 1

    if seed is not None:
        random.seed(seed)

    data = []
    used_skus = set()
    used_per_prefix = {}

    for _ in range(n):
        brand = random.choice(brands)
        shoe_type = random.choice(types)
        shoe_class = random.choice(classes)
        material = random.choice(materials)
        size = random.choice(sizes)
        color = random.choice(colors)
        arch = random.choice(arch_supports)
        weather = random.choice(weather_resistances)
        store_id = random.choice(store_ids)
        city = random.choice(cities)

        price = round(random.uniform(29.99, 149.99), 2)
        rating = round(random.uniform(3.0, 5.0), 1)
        product_name = create_product_name(brand, shoe_type)

        # Ensure SKU uniqueness; fail fast when no free number is left for
        # this prefix, because the retry loop below could never terminate
        prefix = brand[:3].upper()
        if used_per_prefix.get(prefix, 0) >= skus_per_prefix:
            raise RuntimeError(
                f"No unused SKU numbers left for prefix '{prefix}' "
                f"({skus_per_prefix} available per prefix)"
            )
        while True:
            sku = f"{prefix}-{random.randint(sku_low, sku_high)}"
            if sku not in used_skus:
                used_skus.add(sku)
                used_per_prefix[prefix] = used_per_prefix.get(prefix, 0) + 1
                break

        data.append({
            'SKU': sku,
            'PRODUCT_NAME': product_name,
            'BRAND': brand,
            'CLASS': shoe_class,
            'S_TYPE': shoe_type,
            'MATERIAL': material,
            'COLOR': color,
            'WEATHER_RESISTANCE': weather,
            'ARCH_SUPPORT': arch,
            'SIZE': size,
            'PRICE': price,
            'RATING': rating,
            'STORE_ID': store_id,
            'CITY': city
        })

    return pd.DataFrame(data)

# Generate 500 shoe records into the DataFrame df_shoes
# (nothing is saved here; the optional CSV export is at the end of the notebook)
df_shoes = generate_shoe_data(500)


In [6]:
%%skip_if $IMPORT_DATA
# A look at the first rows of the generated data
df_shoes.head()

Unnamed: 0,SKU,PRODUCT_NAME,BRAND,CLASS,S_TYPE,MATERIAL,COLOR,WEATHER_RESISTANCE,ARCH_SUPPORT,SIZE,PRICE,RATING,STORE_ID,CITY
0,ZEN-1607,Zentrax X Walking,Zentrax,Women,Walking,Synthetic,White,Resistant,High,8.5,58.51,4.9,3,Frankfurt
1,COM-1053,ComfRun Pro Running,ComfRun,Men,Running,Leather,White,Waterproof,High,6.0,123.86,4.2,20,Munich
2,COM-9382,ComfRun Flex Trail,ComfRun,Women,Trail,Synthetic,White,Waterproof,High,7.5,74.43,3.4,8,Hamburg
3,ZEN-6599,Zentrax Max Running,Zentrax,Women,Running,Knit,Black,Waterproof,Flat,10.0,138.58,4.6,20,Frankfurt
4,LOO-4151,Loopic Pro Trail,Loopic,Men,Trail,Knit,White,Resistant,High,11.0,54.84,3.4,10,Munich


In [7]:
%%skip_if $IMPORT_DATA
# Columns whose values describe the features of a shoe; only these go into the embedding
embedding_cols = ['S_TYPE', 'MATERIAL', 'COLOR', 'WEATHER_RESISTANCE', 'ARCH_SUPPORT']
# Show only the feature columns for the first rows (the same rows as in the previous cell)
df_shoes[embedding_cols].head()

Unnamed: 0,S_TYPE,MATERIAL,COLOR,WEATHER_RESISTANCE,ARCH_SUPPORT
0,Walking,Synthetic,White,Resistant,High
1,Running,Leather,White,Waterproof,High
2,Trail,Synthetic,White,Waterproof,High
3,Running,Knit,Black,Waterproof,Flat
4,Trail,Knit,White,Resistant,High


# Generating embedding vectors for the shoes

In [8]:
%%skip_if $IMPORT_DATA
# Build one text per row from all embedding columns, written as "COLUMN: value" pairs
# that are joined with the separator ' [SEP] '
df_shoes['COMBINED'] = df_shoes.apply(
    lambda shoe: ' [SEP] '.join(f"{feature}: {shoe[feature]}" for feature in embedding_cols),
    axis=1
)

In [9]:
%%skip_if $IMPORT_DATA
# Feature columns plus the new COMBINED column, first rows only
cols_to_show = [*embedding_cols, 'COMBINED']
df_shoes[cols_to_show].head()

Unnamed: 0,S_TYPE,MATERIAL,COLOR,WEATHER_RESISTANCE,ARCH_SUPPORT,COMBINED
0,Walking,Synthetic,White,Resistant,High,S_TYPE: Walking [SEP] MATERIAL: Synthetic [SEP...
1,Running,Leather,White,Waterproof,High,S_TYPE: Running [SEP] MATERIAL: Leather [SEP] ...
2,Trail,Synthetic,White,Waterproof,High,S_TYPE: Trail [SEP] MATERIAL: Synthetic [SEP] ...
3,Running,Knit,Black,Waterproof,Flat,S_TYPE: Running [SEP] MATERIAL: Knit [SEP] COL...
4,Trail,Knit,White,Resistant,High,S_TYPE: Trail [SEP] MATERIAL: Knit [SEP] COLOR...


In [10]:
%%skip_if $IMPORT_DATA
# Full, untruncated combined text of the first row
df_shoes['COMBINED'].iloc[0]

'S_TYPE: Walking [SEP] MATERIAL: Synthetic [SEP] COLOR: White [SEP] WEATHER_RESISTANCE: Resistant [SEP] ARCH_SUPPORT: High'

Generate the embeddings using a local Ollama service.

In [11]:
%%skip_if $IMPORT_DATA
# Collect the combined feature texts of all rows
row_combined = df_shoes['COMBINED'].to_list()
# Send all texts to the Ollama service in a single batched request
response = ollama.embed(input=row_combined, model=EMBEDDING_MODEL)
# The returned embeddings become the new EMBEDDING column (one vector per row is required)
shoe_vectors = response["embeddings"]
df_shoes['EMBEDDING'] = shoe_vectors
# The text column was only needed as model input, so remove it again
df_shoes.drop(columns=['COMBINED'], inplace=True)


Instead of generating embeddings with an AI model, you can also load previously generated data from a CSV file. The following cell only runs if `IMPORT_DATA` is enabled in `.env`.

In [12]:
%%skip_if not $IMPORT_DATA
# Instead of generating new data, load pregenerated data from a CSV file and use it instead.
# A CSV file only holds text, so every embedding is read as a string like "[0.1, -0.2, ...]".
# Convert it back into a list of floats while reading: otherwise len() of an embedding
# counts characters (wrong vector dimension) and the INSERT loop joins single characters.
df_shoes = pd.read_csv(
    'shoes_data_with_vectors.csv',
    converters={
        'EMBEDDING': lambda text: [
            float(value) for value in text.strip().strip('[]').split(',') if value.strip()
        ]
    }
)
df_shoes.head()

In [13]:
# Display the embedding vector of the first row as a sample
df_shoes['EMBEDDING'].iloc[0]

[-0.038015913,
 0.0019587302,
 0.036395896,
 0.0050778836,
 -0.04683683,
 -0.0051095113,
 -0.0097230645,
 0.005062786,
 -0.0033472006,
 0.0205848,
 0.021602223,
 -0.0031587987,
 -0.033878174,
 -0.0049700155,
 -0.030726913,
 -0.015864618,
 -0.035015367,
 0.030917423,
 0.015019753,
 -0.030409731,
 0.02373948,
 0.031666055,
 -0.0025017248,
 -0.034466732,
 -0.037206408,
 -0.03442028,
 -0.06407887,
 0.00862117,
 -0.039192136,
 -0.14472179,
 0.056720182,
 -0.010661362,
 -0.06560199,
 -0.071424,
 -0.044250023,
 -0.08184219,
 0.073111795,
 -0.033596985,
 -0.0058488944,
 0.043869704,
 -0.015359135,
 0.03882127,
 -0.03745344,
 0.04163604,
 0.11181399,
 0.03832804,
 -0.07221442,
 0.04282418,
 0.026467994,
 -0.02221736,
 0.008413324,
 -0.09213223,
 -0.023552967,
 0.04842496,
 -0.0050935806,
 -0.05557232,
 0.0042551095,
 -0.02944423,
 -0.05480462,
 -0.08403032,
 0.06310562,
 0.037268076,
 -0.002025392,
 -0.024604382,
 -0.015852893,
 0.012896472,
 0.01908829,
 0.011775858,
 0.017972855,
 -0.03212321

# Add vector column to SHOES table and then insert the data

In [14]:
# The number of dimensions depends on the embedding model, so read it from the first vector.
# It is needed for the definition of the vector column in Db2 and for the INSERT statement.
vector_dimension = len(df_shoes['EMBEDDING'].iloc[0])
vector_dimension

384

### Adding a `VECTOR` column

Alter the SHOES table and add the vector column.
Note that the dimension of the column must match the dimension of the generated embeddings.

In [15]:
%%sql
-- Add the vector column; its dimension is taken from the Python variable vector_dimension
ALTER TABLE SHOES
ADD COLUMN EMBEDDING VECTOR({{vector_dimension}}, FLOAT32);

In [16]:
# DESCRIBE the table through the ADMIN_CMD procedure to show its schema.
# Note the VECTOR-typed column EMBEDDING
%sql CALL SYSPROC.ADMIN_CMD('describe table shoes')


colname,typeschema,typename,length,scale,nullable
SKU,SYSIBM,VARCHAR,8,0,Y
PRODUCT_NAME,SYSIBM,VARCHAR,40,0,Y
BRAND,SYSIBM,VARCHAR,20,0,Y
CLASS,SYSIBM,VARCHAR,5,0,Y
S_TYPE,SYSIBM,VARCHAR,7,0,Y
MATERIAL,SYSIBM,VARCHAR,20,0,Y
COLOR,SYSIBM,VARCHAR,10,0,Y
WEATHER_RESISTANCE,SYSIBM,VARCHAR,10,0,Y
ARCH_SUPPORT,SYSIBM,VARCHAR,4,0,Y
SIZE,SYSIBM,DOUBLE,8,0,Y


Insert the data into the SHOES table by looping over the data frame and running one INSERT statement per row. This is not efficient, but it is acceptable for this small example.

In [17]:
# Turn regular output off to not have 500 outputs
%config SqlMagic.feedback=0
# INSERT template: the vector dimension is put into the text with str.format, while the
# :name placeholders stay in the statement as named parameters (named_parameters is
# enabled in the configuration cell; presumably they are bound from the same-named
# Python variables assigned in the loop below)
sql="""
insert into shoes values
(:sku, :product, :brand, :rclass, :rtype, :material, :color, :wr, :arch_s,
:rsize, :price, :rating, :storeid, :city, VECTOR(:vector_str ,{vector_dimension}, FLOAT32))
""".format(vector_dimension=vector_dimension)

# One INSERT statement per DataFrame row
for index, row in df_shoes.iterrows():
    # Positional unpacking: requires exactly these 15 columns in this order in df_shoes
    sku, product, brand, rclass, rtype, material, color, wr, arch_s, rsize, price,\
     rating, storeid, city, embedding = row
    # Text form "[v1, v2, ...]" of the embedding, passed to VECTOR(...) in the statement
    vector_str = "[" + ", ".join(map(str, embedding)) + "]"
    %sql {{sql}}
    
# Turn regular output back on
%config SqlMagic.feedback=1

## Work with the inserted data

In [18]:
# Sanity check: the row count should match the number of records in df_shoes
%sql SELECT count(*) as NUM_ROWS FROM SHOES

num_rows
500


In [19]:
# Search for Men shoes of size 12 (a plain relational filter, no vector search yet)
sql = """ 
    SELECT SKU, PRODUCT_NAME, BRAND, S_TYPE, MATERIAL, COLOR, WEATHER_RESISTANCE, ARCH_SUPPORT, PRICE, RATING, CITY
    FROM SHOES 
    WHERE CLASS = 'Men' AND Size = 12 
    """

# Keep the result set in a variable; a later cell turns it into a DataFrame
shoe_search = %sql {{sql}}

# Display the result set
shoe_search

sku,product_name,brand,s_type,material,color,weather_resistance,arch_support,price,rating,city
COM-6705,ComfRun X Walking,ComfRun,Walking,Synthetic,Black,Resistant,Flat,125.33,3.3,Munich
ZEN-2634,Zentrax Max Running,Zentrax,Running,Leather,Black,Waterproof,Flat,105.69,4.1,Berlin
STR-7390,StrideOne X Walking,StrideOne,Walking,Knit,White,Resistant,Flat,36.1,4.1,Frankfurt
LOO-6648,Loopic X Running,Loopic,Running,Synthetic,White,Resistant,Flat,64.73,4.6,Berlin
COM-5245,ComfRun Pro Running,ComfRun,Running,Leather,White,Waterproof,High,52.32,4.8,Frankfurt
RUN-9794,RunXpress Max Running,RunXpress,Running,Knit,White,Waterproof,High,114.34,4.8,Munich
COM-4101,ComfRun Flex Trail,ComfRun,Trail,Synthetic,Black,Waterproof,Flat,37.58,3.7,Berlin
LOO-8242,Loopic X Walking,Loopic,Walking,Leather,White,Resistant,High,34.89,3.2,Hamburg
LOO-5291,Loopic Ultra Walking,Loopic,Walking,Leather,White,Waterproof,Flat,144.06,3.7,Munich
LOO-3311,Loopic X Trail,Loopic,Trail,Synthetic,Black,Waterproof,Flat,34.38,3.6,Hamburg


In [20]:
# Convert the result set of the previous query into a pandas DataFrame
df_shoe_search = shoe_search.DataFrame()
# Column holding the SKUs of all matching shoes
sku_list = df_shoe_search['sku']
# Draw one SKU at random; this is "our" shoe for the similarity search
my_choice_sku = random.choice(sku_list.tolist())
# Display the selected SKU
my_choice_sku

'COM-1458'

In [21]:
# What is the full record for "our" choice?
%sql select * from SHOES where SKU='{{my_choice_sku}}'

sku,product_name,brand,class,s_type,material,color,weather_resistance,arch_support,size,price,rating,store_id,city,embedding
COM-1458,ComfRun Max Trail,ComfRun,Men,Trail,Knit,White,Waterproof,Flat,12.0,70.14,3.9,17,Berlin,"[-0.0141454292,-0.00586366747,0.0164870769,-0.00264918362,0.000109605062,0.00436646538,0.0361024775,-0.00493029924,-0.0177709199,-0.00159126311,0.0138651142,-0.0246437639,-0.0367927663,-0.0108344397,-0.0209909622,-0.050312046,-0.00826925877,0.0066806688,0.00095428928,-0.0447674356,0.0139973154,0.00592687866,0.0192699414,-0.0545085296,-0.0459321514,-0.0577623919,-0.00899489503,0.00203090976,-0.0219426285,-0.152762085,0.0557851829,-0.000892292999,-0.0282742716,-0.0844328254,-0.0517299101,-0.08499562,0.0659450516,-0.0251708291,0.0177642684,0.0189367346,-0.0219738204,0.0654334575,-0.00351896463,0.0257349443,0.110508889,0.0388614275,-0.072857976,0.0212553367,0.0638266727,-0.0336946435,0.0304474682,-0.0773060843,-0.00985398144,0.0451224148,-0.0118129663,-0.0743805915,0.0166553184,-0.009132131,-0.0422183461,-0.0909924433,0.0576638281,0.0471429639,0.0193204079,-0.0324659422,-0.00137328636,0.00739740254,0.0142444335,-0.0151637709,0.00879457034,-0.0222641677,0.0409867764,-0.066985622,0.0181116723,-0.0677116737,0.0418655239,-0.119232796,-0.0140911592,-0.027512705,-0.0210779477,0.013524631,-0.0124934772,0.0133861657,-0.0884114429,-0.0826135501,0.116215229,-0.020525584,-0.0164704416,0.0575200915,-0.00402551377,0.169754326,0.0176718868,-0.0217113681,0.0323977843,0.0329678394,0.0287819784,-0.0533995368,-0.187948629,0.00795170851,-0.0861684754,-0.00772979576,-0.00336500118,-0.0313764699,-0.0105008306,-0.0243411399,0.00676790765,-0.0309551526,0.0429176763,0.0419024304,-0.0283514708,-0.0960578099,-0.030443754,0.100570247,-0.0373390876,-0.000190154926,0.0114562931,0.0197927821,0.066797927,-0.103112519,-0.0281439107,0.00115570636,0.0327162221,0.0270746332,-0.0101894271,-0.0348306373,0.0551960133,0.0221260879,0.0040221042,-0.0310008172,-0.0240874123,0.0279384181,0.0270298291,0.0571610257,-0.12670885,0.0617239326,-0.010137287,0.0186540633,-0.0324820764,0.0309295971,0.0410882868,-0.0277370848
,-0.0230433457,-0.0646685138,0.00292266416,-0.0826998055,0.0445455238,-0.0216704905,-0.110193685,0.0125384042,-0.0586003885,0.041508656,-0.0232033953,-0.0244488362,-0.0111289192,0.011105028,-0.0635450557,-0.0249304399,-0.0312676728,0.0790303871,0.0139450934,0.0177931301,-0.00386137026,-0.00872453582,0.0322508439,0.0378726125,-0.117837936,0.0501407683,0.00800729729,-0.0174198952,-0.0886342302,0.0858948454,0.0477369092,-0.0133303143,-0.0106160427,0.0183481518,-0.0761001632,0.0436225832,-0.0315847173,-0.0143346442,0.0192736797,0.00612569787,-0.0584445782,0.0590033196,0.0514871515,-0.0287013762,-0.0720252916,-0.0187522024,-0.0639593601,-0.0789336562,0.0159462821,-0.020110894,-0.0157806985,0.0542632788,0.0656144917,-0.0185527951,-0.00643482851,-0.0366684869,-0.0315979347,-0.00970100425,-0.0260843504,0.0231394786,0.0728142187,-0.0295354314,0.171216622,0.0358773917,0.103295475,0.025882557,0.0338794477,0.0412391424,-0.024455376,-0.0261264574,0.0123143559,0.00279711117,0.00865853019,-0.0610932,0.00311947893,-0.0371032543,-0.0863638371,-0.0592399947,-0.0437078364,-0.00239466387,0.091993317,-0.0390832722,-0.054120902,0.0529290289,-0.0283773821,-0.0261973739,0.0440223217,0.00862407964,-0.0196073316,0.0152709549,-0.00549597293,0.0497655571,-0.0101752272,0.0137656806,-0.098803021,-0.0291646644,-0.0750835687,-7.98004403E-05,-0.062567845,-0.0426229984,0.017881956,0.119863994,0.0066995169,-0.0155738462,0.110866964,0.0893096551,0.00450007757,0.0360133573,-0.0295438357,0.0118419547,0.0195724145,0.0605312437,0.0581049323,0.0180149619,-0.012985317,0.0188765991,-0.00611191802,-0.00923525821,0.0216051359,-0.00896373484,0.0788857341,-0.0859327093,0.00534965284,0.0227608774,0.000108286324,-0.00982031785,0.005520429,0.059691485,-0.00949857291,0.0151904942,-0.0219442025,0.00205465173,0.00089651352,-0.0150763281,0.0605986789,-0.0271883029,-0.021585945,-0.0131885102,-0.0137109207,0.0411579311,0.0486048386,0.0220278706,-0.0113406926,-0.0346060134,-0.0194913,-0.0221919846,-0.0399656221,-0.0363658
071,-0.055957444,-0.023867676,0.0260585621,-0.0670854449,0.0189762563,0.0415141657,0.0717732608,0.146446884,-0.04074255,-0.0657167211,0.0556389764,-0.0481311902,-0.0216009598,0.0309454035,-0.0133619346,-0.117838547,0.00856127311,0.0220339242,0.0202834215,-0.0202711653,0.0147955325,-0.0411993191,-0.0707990602,-0.0124151725,0.00939169526,-0.065340586,-0.0204936713,0.010069835,0.00752729923,0.00889116991,-0.0393865593,-0.00292150583,0.00109762966,-0.0575226061,-0.0378960557,0.0775322616,0.178303108,0.00906556845,-0.00983136892,0.00491670053,0.0909773856,0.0331587009,-0.0234310236,-0.0328678563,0.0220405255,0.0705408305,0.160908788,-0.0200935379,-0.0867980048,0.118047066,0.0307068508,-0.0121980337,-0.0332247056,-0.0126678925,0.00203634379,-0.0179422349,0.114881232,0.00400391361,0.00249644252,-0.0145833623,-0.031456802,-0.00343462662,-0.00246744091,-0.0957401097,0.0720744431,-0.0747023225,-0.0740533322,0.0229235273,-0.0946532488,-0.0143001899,-0.00960556418,-0.00381797971,-0.0272530373,0.0691485703,-0.0795809925,0.0116103943,0.0672215894,0.0364233665,-0.112493813,-0.0247770697,-0.0541849956,-0.0530692935,-0.00503956154,-0.0463992283,0.01485461,0.0201502498,0.0315038525,0.00611111661,-0.00300465943,-0.0533839762,-0.00914514996,-0.100984558,0.0145411277,-0.00537043717,-0.160756722,0.0261443928]"


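Searching for similar 'Men' shoes of size 12, where similarity is based on the embedded features (type, material, color, weather resistance, arch support). Note that the query below does not filter by store location; add a predicate such as `AND CITY = 'Frankfurt'` to restrict the search to a single city.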

In [22]:
# SQL query using VECTOR_DISTANCE and the EMBEDDING from the selected shoe (my_choice_sku)
sql = f"""
SELECT 
    SKU, 
    PRODUCT_NAME, 
    BRAND, 
    S_TYPE, 
    MATERIAL, 
    COLOR, 
    WEATHER_RESISTANCE, 
    ARCH_SUPPORT, 
    PRICE, 
    RATING,
    VECTOR_DISTANCE(
        (SELECT EMBEDDING FROM SHOES WHERE SKU = '{my_choice_sku}'), 
        EMBEDDING, 
        EUCLIDEAN
    ) AS DISTANCE
FROM 
    SHOES
WHERE 
    SKU <> '{my_choice_sku}'
    AND SIZE = 12
    AND CLASS = 'Men'
ORDER BY 
    DISTANCE ASC
FETCH FIRST 10 ROWS ONLY
""".format(my_choice_sku=my_choice_sku)

top_shoes = %sql {{sql}}
top_shoes

sku,product_name,brand,s_type,material,color,weather_resistance,arch_support,price,rating,distance
COM-4307,ComfRun Flex Trail,ComfRun,Trail,Knit,White,Waterproof,High,111.75,3.3,0.1493332219168855
LOO-3399,Loopic Flex Trail,Loopic,Trail,Knit,Black,Resistant,Flat,103.02,3.3,0.1933822777775447
COM-2181,ComfRun X Trail,ComfRun,Trail,Knit,White,Resistant,High,76.56,4.9,0.2439051990322654
COM-4101,ComfRun Flex Trail,ComfRun,Trail,Synthetic,Black,Waterproof,Flat,37.58,3.7,0.2596531900481073
LOO-3311,Loopic X Trail,Loopic,Trail,Synthetic,Black,Waterproof,Flat,34.38,3.6,0.2596531900481073
STR-7390,StrideOne X Walking,StrideOne,Walking,Knit,White,Resistant,Flat,36.1,4.1,0.3033739940972922
STR-6917,StrideOne Ultra Walking,StrideOne,Walking,Knit,Black,Resistant,Flat,43.68,4.7,0.3108205917494637
FOO-9005,FootFlex Pro Walking,FootFlex,Walking,Knit,Black,Resistant,Flat,149.02,3.0,0.3108205917494637
ZEN-8860,Zentrax Flex Trail,Zentrax,Trail,Leather,White,Resistant,Flat,45.28,5.0,0.3130399433313913
RUN-9794,RunXpress Max Running,RunXpress,Running,Knit,White,Waterproof,High,114.34,4.8,0.3133123647254505


The output above is ordered by increasing distance: the rows at the top share most feature values with the selected shoe, and the variety increases further down the list.

Next, the same query again, but using UNION ALL to show "our" row as first one for better comparison of similarity. We limit the result set to only 5 similar records.

In [23]:
# SQL query using VECTOR_DISTANCE and the EMBEDDING from the selected shoe (my_choice_sku)
sql = f"""
(SELECT 
    SKU, 
    PRODUCT_NAME, 
    BRAND, 
    S_TYPE, 
    MATERIAL, 
    COLOR, 
    WEATHER_RESISTANCE, 
    ARCH_SUPPORT, 
    PRICE, 
    RATING,
    0 AS DISTANCE
FROM
    SHOES
WHERE
    SKU = '{my_choice_sku}')
UNION ALL
(SELECT 
    SKU, 
    PRODUCT_NAME, 
    BRAND, 
    S_TYPE, 
    MATERIAL, 
    COLOR, 
    WEATHER_RESISTANCE, 
    ARCH_SUPPORT, 
    PRICE, 
    RATING,
    VECTOR_DISTANCE(
        (SELECT EMBEDDING FROM SHOES WHERE SKU = '{my_choice_sku}'), 
        EMBEDDING, 
        EUCLIDEAN
    ) AS DISTANCE
FROM 
    SHOES
WHERE 
    SKU <> '{my_choice_sku}'
    AND SIZE = 12
    AND CLASS = 'Men'
ORDER BY 
    DISTANCE ASC
FETCH FIRST 5 ROWS ONLY)
ORDER BY DISTANCE ASC
""".format(my_choice_sku=my_choice_sku)

%sql {{sql}}

sku,product_name,brand,s_type,material,color,weather_resistance,arch_support,price,rating,distance
COM-1458,ComfRun Max Trail,ComfRun,Trail,Knit,White,Waterproof,Flat,70.14,3.9,0.0
COM-4307,ComfRun Flex Trail,ComfRun,Trail,Knit,White,Waterproof,High,111.75,3.3,0.1493332219168855
LOO-3399,Loopic Flex Trail,Loopic,Trail,Knit,Black,Resistant,Flat,103.02,3.3,0.1933822777775447
COM-2181,ComfRun X Trail,ComfRun,Trail,Knit,White,Resistant,High,76.56,4.9,0.2439051990322654
COM-4101,ComfRun Flex Trail,ComfRun,Trail,Synthetic,Black,Waterproof,Flat,37.58,3.7,0.2596531900481073
LOO-3311,Loopic X Trail,Loopic,Trail,Synthetic,Black,Waterproof,Flat,34.38,3.6,0.2596531900481073


Compare the first row (our shoe) to the other similar shoes.

# Cleanup and Tools

In [24]:
%%skip_if $KEEP_DATA
# DROP the created table SHOES; the skip_if line above skips this cell when KEEP_DATA is set
%sql DROP TABLE SHOES

In [25]:
%%skip_if not $EXPORT_DATA
# Export the shoe data to keep it for history and more experiments.
# The file name is the one that the import cell earlier in the notebook reads.

df_shoes.to_csv(
    'shoes_data_with_vectors.csv',
    # do not write the DataFrame index as an extra column
    index=False,
    # put quotes around every non-numeric field
    quoting=csv.QUOTE_NONNUMERIC
)


In [26]:
# Close the database connection with the alias "db2",
# then list the connections again to confirm that none is left open
%sql --close db2
%sql --connections

current,url,alias
