# Finding similar products (shoes) using vector-based similarity search in Db2



General flow:
- Setup, including Db2 database connection and creating a table
- Generate fake product data
- Generate vector embeddings for key features using a local ollama service
- Add new [vector-based](https://www.ibm.com/docs/en/db2/12.1.0?topic=list-vector-values) embedding column to table, insert data
- Perform some queries utilizing [vector distance search](https://www.ibm.com/docs/en/db2/12.1.0?topic=functions-vector-distance) for semantic product recommendation (what other products are similar?)
- Cleanup



In [1]:
import pandas as pd
import os, csv
import random
from dotenv import load_dotenv
import numpy as np
import ollama
# Load the SQL magic extension that provides %sql / %%sql
%load_ext sql

# Connection definitions are read from the '.db2conn' file
%config SqlMagic.dsn_filename = '.db2conn'
# Limit how many result rows are displayed
%config SqlMagic.displaylimit = 20
# Allow :name placeholders in SQL that are filled from Python variables
%config SqlMagic.named_parameters="enabled"
# Load more settings from the .env file in the current working directory;
# override=True lets values from .env replace already-set environment variables
load_dotenv(os.getcwd()+"/.env", override=True)

True

## Setting up Db2 Connection

In [2]:
# Connect using the 'db2' section of the configured DSN file
%sql --section db2
# List the open connections; the active one is marked with '*'
%sql --connections

current,url,alias
*,db2://db2inst1:***@localhost:50000/testdb,db2


# Setting up a Shoes Table in Db2

In [3]:
# Drop the table if it exists, so the notebook can be re-run from a clean state
%sql DROP TABLE IF EXISTS SHOES
# Create the table with the regular product attributes only.
# The VECTOR column for the embeddings is added later via ALTER TABLE,
# once the embedding dimension is known.
# Several VARCHAR lengths are exact fits for the generated values
# (e.g. S_TYPE VARCHAR(7) for 'Running'/'Walking', SKU VARCHAR(8) for 'XXX-1234').
sql="""
    CREATE TABLE IF NOT EXISTS SHOES (
        SKU VARCHAR(8),
        PRODUCT_NAME VARCHAR(40),
        BRAND VARCHAR(20),
        CLASS VARCHAR(5),
        S_TYPE VARCHAR(7),
        MATERIAL VARCHAR(20),
        COLOR VARCHAR(10),
        WEATHER_RESISTANCE VARCHAR(10),
        ARCH_SUPPORT VARCHAR(4),
        SIZE FLOAT,
        PRICE FLOAT,
        RATING FLOAT,
        STORE_ID BIGINT,
        CITY VARCHAR(40)
    );
    """

# {{sql}} expands the Python variable `sql` into the statement to execute
%sql {{sql}}

In [4]:
# Value pools from which the random shoe records are assembled

# Store locations
cities = ["Frankfurt", "Berlin", "Munich", "Hamburg"]

# Product attributes
brands = [
    'Zentrax',
    'FootFlex',
    'StrideOne',
    'Loopic',
    'RunXpress',
    'ComfRun',
]
types = ['Running', 'Walking', 'Trail']
classes = ['Men', 'Women']
materials = ['Synthetic', 'Knit', 'Leather']
colors = ['Black', 'White']
arch_supports = ['High', 'Flat']
weather_resistances = ['Waterproof', 'Resistant']
# Whole sizes 6..12 first, followed by the half sizes 6.5..12.5
sizes = [*range(6, 13), *(whole + 0.5 for whole in range(6, 13))]
# Store numbers 1..20
store_ids = range(1, 21)

# Helper: create a fake product name
def create_product_name(brand, shoe_type):
    """Build a product name of the form "<brand> <series> <shoe_type>".

    The series word is picked at random from a fixed list.
    """
    series = random.choice(['Ultra', 'Flex', 'Pro', 'X', 'Max'])
    return "{} {} {}".format(brand, series, shoe_type)

# Helper: create fake keywords
def generate_keywords(shoe_type, material):
    """Return a comma-separated keyword string for a shoe.

    The lowercased shoe type and material come first, followed by three
    distinct adjectives picked at random from a fixed list.
    """
    adjective_pool = ['lightweight', 'durable', 'breathable', 'cushioned', 'supportive', 'flexible']
    picked = random.sample(adjective_pool, 3)
    return ', '.join([shoe_type.lower(), material.lower(), *picked])

def generate_shoe_data(n=500):
    """Create ``n`` random shoe records and return them as a DataFrame.

    Attribute values are drawn from the module-level value pools. Each
    record receives an SKU ("<first three brand letters>-<4 digits>") that
    is unique within the returned frame.
    """
    records = []
    seen_skus = set()

    for _ in range(n):
        # Categorical attributes (the draw order fixes the random sequence)
        brand = random.choice(brands)
        shoe_type = random.choice(types)
        shoe_class = random.choice(classes)
        material = random.choice(materials)
        size = random.choice(sizes)
        color = random.choice(colors)
        arch = random.choice(arch_supports)
        weather = random.choice(weather_resistances)
        store_id = random.choice(store_ids)
        city = random.choice(cities)

        # Numeric attributes and the display name
        price = round(random.uniform(29.99, 149.99), 2)
        rating = round(random.uniform(3.0, 5.0), 1)
        product_name = create_product_name(brand, shoe_type)

        # Redraw the 4-digit suffix until the SKU has not been handed out yet
        sku_prefix = brand[:3].upper()
        sku = f"{sku_prefix}-{random.randint(1000, 9999)}"
        while sku in seen_skus:
            sku = f"{sku_prefix}-{random.randint(1000, 9999)}"
        seen_skus.add(sku)

        # Key order defines the column order of the resulting DataFrame
        records.append(dict(
            SKU=sku,
            PRODUCT_NAME=product_name,
            BRAND=brand,
            CLASS=shoe_class,
            S_TYPE=shoe_type,
            MATERIAL=material,
            COLOR=color,
            WEATHER_RESISTANCE=weather,
            ARCH_SUPPORT=arch,
            SIZE=size,
            PRICE=price,
            RATING=rating,
            STORE_ID=store_id,
            CITY=city,
        ))

    return pd.DataFrame(records)

# Generate 500 shoe records and keep them in memory as a DataFrame
# (nothing is written to disk here; an optional CSV export is at the end of the notebook)
df_shoes = generate_shoe_data(500)


In [5]:
# A look at the first rows of the generated data
df_shoes.head()

Unnamed: 0,SKU,PRODUCT_NAME,BRAND,CLASS,S_TYPE,MATERIAL,COLOR,WEATHER_RESISTANCE,ARCH_SUPPORT,SIZE,PRICE,RATING,STORE_ID,CITY
0,COM-9702,ComfRun Flex Walking,ComfRun,Women,Walking,Leather,Black,Waterproof,Flat,10.0,65.27,4.8,12,Munich
1,RUN-2605,RunXpress Ultra Walking,RunXpress,Men,Walking,Knit,Black,Waterproof,High,12.0,83.85,4.4,7,Hamburg
2,FOO-2056,FootFlex Ultra Trail,FootFlex,Women,Trail,Knit,White,Waterproof,Flat,10.5,97.38,4.8,14,Munich
3,ZEN-8408,Zentrax X Running,Zentrax,Men,Running,Knit,Black,Resistant,High,6.5,111.8,4.1,6,Frankfurt
4,FOO-1444,FootFlex Ultra Running,FootFlex,Men,Running,Synthetic,White,Resistant,High,6.5,55.78,3.7,18,Munich


In [6]:
# Columns whose values describe a shoe and are used as input for the embedding
embedding_cols = [
    'S_TYPE',
    'MATERIAL',
    'COLOR',
    'WEATHER_RESISTANCE',
    'ARCH_SUPPORT',
]
# Preview only these feature columns, for the same rows as in the previous cell
df_shoes.loc[:, embedding_cols].head()

Unnamed: 0,S_TYPE,MATERIAL,COLOR,WEATHER_RESISTANCE,ARCH_SUPPORT
0,Walking,Leather,Black,Waterproof,Flat
1,Walking,Knit,Black,Waterproof,High
2,Trail,Knit,White,Waterproof,Flat
3,Running,Knit,Black,Resistant,High
4,Running,Synthetic,White,Resistant,High


# Generating embedding vectors for the shoes

In [7]:
# Build one text per row from the embedding columns, keeping the column names.
# The "name: value" pairs are separated by ' [SEP] '
def _combine_features(record):
    """Render the embedding columns of one row as "name: value" pairs joined by ' [SEP] '."""
    labelled_values = []
    for feature in embedding_cols:
        labelled_values.append("{}: {}".format(feature, record[feature]))
    return ' [SEP] '.join(labelled_values)

df_shoes['COMBINED'] = df_shoes.apply(_combine_features, axis=1)

In [8]:
# Preview the feature columns next to the COMBINED text that was built from them
cols_to_show = [*embedding_cols, 'COMBINED']
df_shoes.loc[:, cols_to_show].head()

Unnamed: 0,S_TYPE,MATERIAL,COLOR,WEATHER_RESISTANCE,ARCH_SUPPORT,COMBINED
0,Walking,Leather,Black,Waterproof,Flat,S_TYPE: Walking [SEP] MATERIAL: Leather [SEP] ...
1,Walking,Knit,Black,Waterproof,High,S_TYPE: Walking [SEP] MATERIAL: Knit [SEP] COL...
2,Trail,Knit,White,Waterproof,Flat,S_TYPE: Trail [SEP] MATERIAL: Knit [SEP] COLOR...
3,Running,Knit,Black,Resistant,High,S_TYPE: Running [SEP] MATERIAL: Knit [SEP] COL...
4,Running,Synthetic,White,Resistant,High,S_TYPE: Running [SEP] MATERIAL: Synthetic [SEP...


In [9]:
# Full COMBINED text of the first record (the table view above truncates it)
df_shoes['COMBINED'].iloc[0]

'S_TYPE: Walking [SEP] MATERIAL: Leather [SEP] COLOR: Black [SEP] WEATHER_RESISTANCE: Waterproof [SEP] ARCH_SUPPORT: Flat'

Instead of generating embeddings with an AI model, you can load previously generated data. To do so, uncomment the last two lines in the following cell and run it. Then skip the cell that uses the local Ollama service.

In [10]:
# Instead of generating new data, load pregenerated data from a CSV file and use it instead.
# NOTE(review): list-valued EMBEDDING entries are presumably read back from CSV as
# text rather than as Python lists - verify that the cells below handle that.

# Uncomment to use
#df_shoes=pd.read_csv('shoes_data_with_vectors.csv')
#df_shoes.head()

Generate the embeddings using a local Ollama service.

In [11]:
# Make list from combined columns
row_combined = df_shoes['COMBINED'].tolist()

# The model name is read from the environment (loaded from .env).
# Fail early with a clear message instead of passing None to the Ollama client.
embedding_model = os.getenv('embedding_model')
if not embedding_model:
    raise RuntimeError("Environment variable 'embedding_model' is not set; define it in the .env file")

# Run batch processing for generation of embeddings (one request for all rows)
response = ollama.embed(model=embedding_model, input=row_combined)
shoe_vectors = response["embeddings"]
df_shoes['EMBEDDING'] = shoe_vectors

# Remove the column with the input values.
# Note: re-running this cell requires re-running the cell that builds COMBINED first.
df_shoes = df_shoes.drop(columns=['COMBINED'])


In [12]:
# Show the embedding vector of the first record as a sample
df_shoes['EMBEDDING'].iloc[0]

[-0.03537109,
 -0.0047420138,
 0.028959494,
 -0.013077513,
 -0.029978702,
 -0.013919533,
 0.015907919,
 -0.0071371454,
 0.0023483336,
 0.011121175,
 0.014191492,
 -0.006909531,
 -0.032855045,
 -0.0142255295,
 -0.019326443,
 -0.034315847,
 -0.024172166,
 0.023227382,
 0.014961322,
 -0.048821364,
 0.022883493,
 0.023729617,
 0.020607768,
 -0.039682373,
 -0.031305276,
 -0.049624845,
 -0.0292224,
 0.0038479094,
 -0.032414794,
 -0.16908763,
 0.06459867,
 0.011801292,
 -0.036199443,
 -0.07522699,
 -0.027838513,
 -0.08487413,
 0.053480547,
 -0.033653222,
 0.022869818,
 0.034194387,
 -0.014140504,
 0.04401588,
 -0.015367272,
 0.024804844,
 0.08731736,
 0.0254141,
 -0.081722006,
 0.008996423,
 0.025861159,
 -0.009663253,
 0.017060291,
 -0.0614645,
 -0.030723657,
 0.05830879,
 0.00041869888,
 -0.011059541,
 0.0054649995,
 -0.022807572,
 -0.0759506,
 -0.082663454,
 0.09027697,
 0.03330702,
 0.014867084,
 -0.023527123,
 -0.016960023,
 0.012702277,
 0.021295588,
 0.0049932683,
 0.02457401,
 -0.0077

# Add vector column to SHOES table and then insert the data

In [13]:
# Extract the dimensions, they vary by model
# The dimension is needed to set up the vector column in Db2 and to insert data.
# It is read from the DataFrame rather than from `shoe_vectors`, because that
# variable only exists when the Ollama cell was run (not when pregenerated
# data was loaded from CSV instead).
sample_embedding = df_shoes['EMBEDDING'].iloc[0]
if isinstance(sample_embedding, str):
    # Presumably the CSV case: the vector is text of the form "[v1, v2, ...]"
    sample_embedding = sample_embedding.strip('[]').split(',')
vector_dimension = len(sample_embedding)
vector_dimension

384

### Adding a `VECTOR` column

Alter the SHOES table and add the vector column.
Note that the declared dimension must match the dimension of the generated embeddings.

In [14]:
%%sql
ALTER TABLE SHOES
ADD COLUMN EMBEDDING VECTOR({{vector_dimension}}, FLOAT32);

In [15]:
# DESCRIBE the table to show schema. Note the VECTOR-typed column EMBEDDING
%sql CALL SYSPROC.ADMIN_CMD('describe table shoes')


colname,typeschema,typename,length,scale,nullable
SKU,SYSIBM,VARCHAR,8,0,Y
PRODUCT_NAME,SYSIBM,VARCHAR,40,0,Y
BRAND,SYSIBM,VARCHAR,20,0,Y
CLASS,SYSIBM,VARCHAR,5,0,Y
S_TYPE,SYSIBM,VARCHAR,7,0,Y
MATERIAL,SYSIBM,VARCHAR,20,0,Y
COLOR,SYSIBM,VARCHAR,10,0,Y
WEATHER_RESISTANCE,SYSIBM,VARCHAR,10,0,Y
ARCH_SUPPORT,SYSIBM,VARCHAR,4,0,Y
SIZE,SYSIBM,DOUBLE,8,0,Y


Insert the data into the SHOES table by looping over the data frame and running one INSERT per row. This is not efficient, but it is fine for this example.

In [16]:
# Turn regular output off to not have 500 outputs
%config SqlMagic.feedback=0
sql="""
insert into shoes values
(:sku, :product, :brand, :rclass, :rtype, :material, :color, :wr, :arch_s,
:rsize, :price, :rating, :storeid, :city, VECTOR(:vector_str ,{vector_dimension}, FLOAT32))
""".format(vector_dimension=vector_dimension)

for index, row in df_shoes.iterrows():
    sku, product, brand, rclass, rtype, material, color, wr, arch_s, rsize, price,\
     rating, storeid, city, embedding = row
    vector_str = "[" + ", ".join(map(str, embedding)) + "]"
    %sql {{sql}}
    
# Turn regular output back on
%config SqlMagic.feedback=1

## Work with the inserted data

In [17]:
# The row count should match the number of generated data records
%sql SELECT count(*) as NUM_ROWS FROM SHOES

num_rows
500


In [18]:
# Search for men's shoes of size 12; one of them is picked as reference shoe in the next cell
sql = """ 
    SELECT SKU, PRODUCT_NAME, BRAND, S_TYPE, MATERIAL, COLOR, WEATHER_RESISTANCE, ARCH_SUPPORT, PRICE, RATING, CITY
    FROM SHOES 
    WHERE CLASS = 'Men' AND Size = 12 
    """

# Run the query and keep the result set for further processing
shoe_search = %sql {{sql}}

# Display the matching shoes
shoe_search

sku,product_name,brand,s_type,material,color,weather_resistance,arch_support,price,rating,city
RUN-2605,RunXpress Ultra Walking,RunXpress,Walking,Knit,Black,Waterproof,High,83.85,4.4,Hamburg
COM-8094,ComfRun Ultra Walking,ComfRun,Walking,Leather,Black,Resistant,Flat,88.88,4.2,Frankfurt
ZEN-9942,Zentrax Flex Trail,Zentrax,Trail,Leather,Black,Resistant,Flat,33.55,4.3,Berlin
COM-9889,ComfRun Flex Trail,ComfRun,Trail,Synthetic,White,Resistant,High,67.77,4.4,Hamburg
FOO-1787,FootFlex Flex Running,FootFlex,Running,Synthetic,Black,Waterproof,Flat,69.05,4.3,Hamburg
STR-2142,StrideOne Pro Walking,StrideOne,Walking,Leather,Black,Waterproof,Flat,135.28,4.4,Frankfurt
LOO-8565,Loopic Pro Running,Loopic,Running,Synthetic,Black,Resistant,Flat,142.73,4.2,Munich
FOO-7866,FootFlex Pro Walking,FootFlex,Walking,Synthetic,White,Waterproof,High,110.57,4.4,Munich
FOO-6826,FootFlex X Trail,FootFlex,Trail,Leather,Black,Resistant,Flat,30.27,4.0,Hamburg
FOO-3933,FootFlex Ultra Walking,FootFlex,Walking,Synthetic,Black,Resistant,High,124.4,4.1,Berlin


In [19]:
# Turn the result into a DataFrame
df_shoe_search = shoe_search.DataFrame()
# Extract the SKUs as a plain list: random.choice expects a sequence, whereas a
# pandas Series is indexed by label and has no unambiguous truth value
sku_list = df_shoe_search['sku'].tolist()
# Pick a random SKU as our "choice"
my_choice_sku = random.choice(sku_list)
# Print the selected SKU
my_choice_sku

'LOO-8565'

In [20]:
# What is the full record for "our" choice?
%sql select * from SHOES where SKU='{{my_choice_sku}}'

sku,product_name,brand,class,s_type,material,color,weather_resistance,arch_support,size,price,rating,store_id,city,embedding
LOO-8565,Loopic Pro Running,Loopic,Men,Running,Synthetic,Black,Resistant,Flat,12.0,142.73,4.2,9,Munich,"[-0.00654082512,-0.00487492094,0.0336975791,0.0142087396,-0.042049244,-0.0338306613,0.00839051697,0.00923586357,0.00279061892,0.0245122332,0.0284971315,0.0066618612,-0.0472129099,-0.00213743607,-0.0276402514,-0.00848993659,-0.0374484472,0.0154802641,-0.00326416199,-0.0366191305,0.0404099487,0.0100623248,0.0293632727,-0.0658844933,-0.0417483896,-0.0248604305,-0.0513547994,-0.0215768348,-0.0349731445,-0.123930059,0.0620663315,-0.0166650135,-0.042241931,-0.0813293532,-0.0301133897,-0.0640202388,0.0533766821,-0.0408517048,0.0212880298,0.0322455764,-0.0529585443,0.0613790788,-0.024624031,0.0369028673,0.134013444,0.0357572511,-0.068400614,0.0421089754,0.0273698978,-0.043699652,0.01660873,-0.101546779,-0.017618021,0.0173755325,-0.0139869396,-0.0839106515,0.00590342842,-0.0144379428,-0.0479518026,-0.0653025582,0.0712339655,0.0238983855,0.0504654609,-0.0420859978,-0.0250139348,0.019142516,0.0178377852,0.0145866312,0.00271927891,-0.0250857472,0.0269926246,-0.0931573212,0.0208980981,-0.0375392027,0.0612118244,-0.0892519578,0.0177020393,-0.00580761302,-0.0340581983,0.0293332525,-0.00649267109,0.018840922,-0.0842611194,-0.0389522053,0.0963342041,-0.0364437327,-0.030373957,0.0428243205,0.0160724465,0.10425549,-0.0064587458,0.00201874087,0.0785515383,0.0601953454,0.0298406761,-0.0608865879,-0.187658727,0.0265311077,-0.105447806,0.000144123187,-0.00704774214,-0.0220975429,0.0283157695,-0.0518461615,0.044109121,-0.0456375964,0.0560906753,0.0616816357,-0.00651575578,-0.0789486393,-0.0251816418,0.1108362,-0.0064895004,-0.0132182026,-0.0255921483,0.0260636937,0.0486594923,-0.0836316794,-0.00824677944,-0.017744774,0.0182905924,0.0168502275,-0.0420021676,-0.0621904023,0.0841301531,0.0248865299,-0.00897366367,-0.0140545173,-0.0352776088,-0.0216457229,0.018835878,0.0706339478,-0.0847182721,0.0572616942,-0.00136353937,0.0330146328,-0.0190288872,0.000694470655,0.0532739758,-0.0170372352,-0
.0268046688,-0.0709685311,0.0135947643,-0.0671706721,0.0283450652,-0.0132802222,-0.125496551,-0.0174675956,-0.040870253,-0.00366022345,-0.0178376269,-0.0486543141,-0.0210285168,0.00470552733,0.00352965528,-0.0107445549,-0.0229465086,0.0472291671,0.0122223925,0.00523369806,-0.02838476,0.0134379379,-0.0244548861,0.0346598886,-0.0481471755,0.0279475134,0.0277300235,-0.0267916545,-0.0648563802,0.0622766092,0.0554022305,-0.00247049355,0.0143209845,-0.00379276858,-0.0670264959,0.0326704569,-0.0276672076,-0.0214395635,0.046282582,-0.0151837347,-0.0360860564,0.0414162017,0.0495170839,-0.0403143577,-0.094085224,-0.0301004406,-0.0448806845,-0.0980179831,0.0190522838,-0.0135309435,0.00516149867,0.0192515831,-0.00626205467,-0.0271567851,0.0261892676,-0.0387869701,-0.0344067067,0.01311839,-0.0150910523,-0.0261400379,0.0724189058,-0.0226186942,0.124727756,0.0320801511,0.137662217,0.0475198813,0.0322981253,0.0121206418,-0.0266676564,0.0125091998,0.0106604174,-0.017986875,-0.00520970719,-0.067302078,0.0140237147,-0.016001815,-0.0801673904,-0.03808707,0.000446765742,-0.00467866287,0.0815967545,-0.0545501448,-0.0375231765,0.093516238,-0.0305300374,-0.0603970103,0.0294754766,0.00345267076,-0.0137165505,-0.0134373996,-0.00741208112,0.0404341184,-0.0396112762,0.00312867644,-0.0806514546,-0.0605942868,-0.108213678,-0.0319447331,-0.0619811043,-0.0482835956,0.041679576,0.142376617,0.00799240172,-0.00243004831,0.114378437,0.105305962,-0.00983014144,0.0158239007,-0.0321729034,0.0210854169,0.0297734942,0.0832046792,0.0504426248,0.006894249,0.0159697309,0.028280586,-0.0300801191,-0.0130312024,0.00729241129,-0.0252763834,0.0872035995,-0.0932977721,-0.0066836467,0.0084697986,0.0203624014,0.0276610348,0.0186427105,0.0655106977,-0.0166092217,0.0115265371,-0.0292203501,0.0149419578,-0.0170969162,-0.0413322486,0.0767212287,-0.0371527858,-0.0416653305,-0.00971159339,-0.00889134221,0.020450443,0.0220712312,0.0327185169,-0.00852493197,-0.0272237528,-0.0143144233,-0.0207128394,-0.0137080448,-0.051502969
1,-0.0156093026,-0.0143589377,0.00980586186,-0.078271158,0.003270288,0.000134454502,0.0811569169,0.151491925,-0.0863271356,-0.0801010057,0.0751747936,-0.0184605848,0.0303362198,0.0382832214,-0.0531327911,-0.0476151444,0.000671940797,0.0309402104,0.0145503934,-0.0267413389,-0.0147832148,-0.0487805158,-0.053881444,-0.0169510171,-0.00372266769,-0.0269647501,-0.0380229205,0.0673902631,0.014381825,-0.00985500682,-0.0563721955,0.00712610921,0.016837392,-0.0701336488,-0.0752730966,0.0741819888,0.200192243,-0.0377057306,-0.0086497413,0.0243552011,0.0733235106,0.0794806629,-0.0756726637,-0.0416383371,0.0350296199,0.0466933809,0.168555751,-0.0172112994,-0.0881966129,0.148542762,0.0407047756,0.00712850038,-0.0412657745,-0.028540194,0.0212669522,0.00696279854,0.0870587453,-0.00576743484,-0.032648921,-0.0216224678,-0.0264007915,0.0618447624,-0.00560479285,-0.0726758093,0.0541452505,-0.0552089438,-0.062928848,0.0302161425,-0.0714173317,-7.08817606E-05,-0.0167388357,-0.0207970794,-0.0151461177,0.0523603447,-0.0744980946,0.0336413458,0.0558795184,0.039881371,-0.115530051,-0.0219505299,-0.0814868957,-0.0432099774,-0.0369941145,-0.0311570652,0.0041919779,0.0281069148,0.0128018307,0.00209982321,-0.0126932897,-0.0687220767,-0.0148710869,-0.0935041532,0.0295236912,-0.0180269387,-0.158619046,0.00373361958]"


Searching for 'Men' shoes of size 12 that are similar (type, material, color, weather resistance, arch support) to our choice. Note that the query below does not filter by store location, so shoes from all cities are returned.

In [21]:
# SQL query using VECTOR_DISTANCE and the EMBEDDING from the selected shoe (my_choice_sku)
sql = f"""
SELECT 
    SKU, 
    PRODUCT_NAME, 
    BRAND, 
    S_TYPE, 
    MATERIAL, 
    COLOR, 
    WEATHER_RESISTANCE, 
    ARCH_SUPPORT, 
    PRICE, 
    RATING,
    VECTOR_DISTANCE(
        (SELECT EMBEDDING FROM SHOES WHERE SKU = '{my_choice_sku}'), 
        EMBEDDING, 
        EUCLIDEAN
    ) AS DISTANCE
FROM 
    SHOES
WHERE 
    SKU <> '{my_choice_sku}'
    AND SIZE = 12
    AND CLASS = 'Men'
ORDER BY 
    DISTANCE ASC
FETCH FIRST 10 ROWS ONLY
""".format(my_choice_sku=my_choice_sku)

top_shoes = %sql {{sql}}
top_shoes

sku,product_name,brand,s_type,material,color,weather_resistance,arch_support,price,rating,distance
FOO-1787,FootFlex Flex Running,FootFlex,Running,Synthetic,Black,Waterproof,Flat,69.05,4.3,0.1857526383081627
STR-5633,StrideOne Flex Running,StrideOne,Running,Leather,Black,Waterproof,Flat,149.36,3.5,0.2909059366659274
COM-9889,ComfRun Flex Trail,ComfRun,Trail,Synthetic,White,Resistant,High,67.77,4.4,0.3153913351496938
FOO-6597,FootFlex Pro Running,FootFlex,Running,Knit,White,Waterproof,Flat,37.71,4.7,0.3299511886677651
FOO-3933,FootFlex Ultra Walking,FootFlex,Walking,Synthetic,Black,Resistant,High,124.4,4.1,0.3369643957611685
FOO-8572,FootFlex Max Walking,FootFlex,Walking,Synthetic,Black,Resistant,High,101.67,4.2,0.3369643957611685
STR-8389,StrideOne Pro Running,StrideOne,Running,Leather,Black,Waterproof,High,149.5,3.7,0.3372387876540082
ZEN-6311,Zentrax Ultra Walking,Zentrax,Walking,Synthetic,Black,Waterproof,Flat,79.72,4.8,0.3634866826096917
ZEN-9942,Zentrax Flex Trail,Zentrax,Trail,Leather,Black,Resistant,Flat,33.55,4.3,0.3738017537504511
FOO-6826,FootFlex X Trail,FootFlex,Trail,Leather,Black,Resistant,Flat,30.27,4.0,0.3738017537504511


The output above should show shoes that share most feature values with our choice at the top, with the variety of values increasing further down the list.

Next, the same query again, but using UNION ALL to show "our" row as first one for better comparison of similarity. We limit the result set to only 5 similar records.

In [22]:
# SQL query using VECTOR_DISTANCE and the EMBEDDING from the selected shoe (my_choice_sku)
sql = f"""
(SELECT 
    SKU, 
    PRODUCT_NAME, 
    BRAND, 
    S_TYPE, 
    MATERIAL, 
    COLOR, 
    WEATHER_RESISTANCE, 
    ARCH_SUPPORT, 
    PRICE, 
    RATING,
    0 AS DISTANCE
FROM
    SHOES
WHERE
    SKU = '{my_choice_sku}')
UNION ALL
(SELECT 
    SKU, 
    PRODUCT_NAME, 
    BRAND, 
    S_TYPE, 
    MATERIAL, 
    COLOR, 
    WEATHER_RESISTANCE, 
    ARCH_SUPPORT, 
    PRICE, 
    RATING,
    VECTOR_DISTANCE(
        (SELECT EMBEDDING FROM SHOES WHERE SKU = '{my_choice_sku}'), 
        EMBEDDING, 
        EUCLIDEAN
    ) AS DISTANCE
FROM 
    SHOES
WHERE 
    SKU <> '{my_choice_sku}'
    AND SIZE = 12
    AND CLASS = 'Men'
ORDER BY 
    DISTANCE ASC
FETCH FIRST 5 ROWS ONLY)
ORDER BY DISTANCE ASC
""".format(my_choice_sku=my_choice_sku)

%sql {{sql}}

sku,product_name,brand,s_type,material,color,weather_resistance,arch_support,price,rating,distance
LOO-8565,Loopic Pro Running,Loopic,Running,Synthetic,Black,Resistant,Flat,142.73,4.2,0.0
FOO-1787,FootFlex Flex Running,FootFlex,Running,Synthetic,Black,Waterproof,Flat,69.05,4.3,0.1857526383081627
STR-5633,StrideOne Flex Running,StrideOne,Running,Leather,Black,Waterproof,Flat,149.36,3.5,0.2909059366659274
COM-9889,ComfRun Flex Trail,ComfRun,Trail,Synthetic,White,Resistant,High,67.77,4.4,0.3153913351496938
FOO-6597,FootFlex Pro Running,FootFlex,Running,Knit,White,Waterproof,Flat,37.71,4.7,0.3299511886677651
FOO-3933,FootFlex Ultra Walking,FootFlex,Walking,Synthetic,Black,Resistant,High,124.4,4.1,0.3369643957611685


Compare the first row (our shoe) to the other similar shoes.

# Cleanup and Tools

In [23]:
# We could DROP the created table SHOES if required. But we keep it, so additional queries could be run.
# Uncomment if needed
#%sql DROP TABLE SHOES

In [24]:
# Export the shoe data to keep it for history and more experiments

# Uncomment if needed
# (kept as real comments: a triple-quoted string would be evaluated and echoed as cell output)
#df_shoes.to_csv(
#    'shoes_data_with_vectors.csv',
#    index=False,
#    quoting=csv.QUOTE_NONNUMERIC
#)

" df_shoes.to_csv(\n    'shoes_data_with_vectors.csv',\n    index=False,\n    quoting=csv.QUOTE_NONNUMERIC\n)\n "

In [25]:
# Close the database connection with the alias 'db2'
%sql --close db2
# List the connections again; none should remain
%sql --connections

current,url,alias
