In [17]:
!nproc & nvidia-smi -L

20
GPU 0: NVIDIA GeForce RTX 3060 (UUID: GPU-125eaf29-537f-5e5a-8a00-140ef5b42572)


In [18]:
import os
import time
import json
import random
import itertools
from functools import partial
from collections import defaultdict
from pprint import pprint

import pandas as pd
import numpy as np
import networkx as nx

import timm
import torch
import textdistance

import matplotlib.pyplot as plt
import seaborn as sns
import ipyplot as iplt
from IPython.display import HTML, display
from tqdm.notebook import tqdm
tqdm.pandas()

from typing import Dict, List, Any

import faiss
# Limit FAISS to 16 OpenMP threads.
faiss.omp_set_num_threads(16)

# Local root that is prepended to the relative paths in df.image when checking files on disk.
data_dir = "/data/"

In [19]:
from dotenv import load_dotenv; load_dotenv()
from s3fs import S3FileSystem
# S3 filesystem handle, created after load_dotenv() has run.
# NOTE(review): presumably the credentials come from the .env file — confirm.
s3 = S3FileSystem()
# Load the `sql` IPython extension.
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [20]:
# Load the main catalogue from S3. Judging by the displayed rows it holds one row
# per product image (same `id`, different `imid` / `pose`).
df = pd.read_parquet("s3://aisle3-ml-datasets/product-matching/aisle3/main.parquet")
df

Unnamed: 0,id,variant_id,title,merchant,brand,gender,color,imid,image_url,image,pose,color_pred
0,allsole.10491511,allsole.10491633,Vans Authentic Canvas Trainers,allsole,vans,unisex,Black,26789,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/allso...,side_shot,black
1,allsole.10491511,allsole.10491633,Vans Authentic Canvas Trainers,allsole,vans,unisex,Black,26790,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/allso...,side_shot,black
2,allsole.10491511,allsole.10491633,Vans Authentic Canvas Trainers,allsole,vans,unisex,Black,26791,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/allso...,side_shot,black
3,allsole.10491511,allsole.10491633,Vans Authentic Canvas Trainers,allsole,vans,unisex,Black,26792,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/allso...,upper_shot,black
4,allsole.10491511,allsole.10491633,Vans Authentic Canvas Trainers,allsole,vans,unisex,Black,26793,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/allso...,partial_shot,black
...,...,...,...,...,...,...,...,...,...,...,...,...
215424,ssense.221903M237021,ssense.221903M237021,Coach 1941 Black & Off-White Logo Slide Sandals,ssense,coach,men,Chalk black,95785,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/ssens...,partial_shot,maroon
215425,ssense.221903M237021,ssense.221903M237021,Coach 1941 Black & Off-White Logo Slide Sandals,ssense,coach,men,Chalk black,95786,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/ssens...,pair_shot,white
215426,ssense.221903M237021,ssense.221903M237021,Coach 1941 Black & Off-White Logo Slide Sandals,ssense,coach,men,Chalk black,95787,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/ssens...,side_shot,white
215427,ssense.221903M237021,ssense.221903M237021,Coach 1941 Black & Off-White Logo Slide Sandals,ssense,coach,men,Chalk black,95788,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/ssens...,pair_shot,beige


In [21]:
# Turn the S3 URIs into bucket-relative paths that end in ".jpg".
# - regex=False: "s3://" is removed as a literal string on every pandas version.
# - the endswith() guard makes the cell idempotent: re-running it no longer
#   produces "....jpg.jpg" (paths that already end in ".jpg" are left alone).
image_paths = df["image"].str.replace("s3://", "", regex=False)
already_jpg = image_paths.str.endswith(".jpg", na=False)
df["image"] = image_paths.where(already_jpg, image_paths + ".jpg")
df.image

0         aisle-3-image-final/images/original/allsole/al...
1         aisle-3-image-final/images/original/allsole/al...
2         aisle-3-image-final/images/original/allsole/al...
3         aisle-3-image-final/images/original/allsole/al...
4         aisle-3-image-final/images/original/allsole/al...
                                ...                        
215424    aisle-3-image-final/images/original/ssense/sse...
215425    aisle-3-image-final/images/original/ssense/sse...
215426    aisle-3-image-final/images/original/ssense/sse...
215427    aisle-3-image-final/images/original/ssense/sse...
215428    aisle-3-image-final/images/original/ssense/sse...
Name: image, Length: 215429, dtype: object

In [22]:
# Sanity check: count how many of the image paths exist as files under data_dir.
(data_dir + df.image).apply(os.path.isfile).value_counts()

True    215429
Name: image, dtype: int64

In [23]:
# Labelled evaluation set read from a local parquet file; the displayed frame has
# the same columns as `df` plus a `label` column.
test_df = pd.read_parquet("product_matching_test.parquet")
test_df

Unnamed: 0,id,variant_id,title,merchant,brand,gender,color,imid,image_url,image,pose,color_pred,label
0,footasylum.DH9628200,footasylum.DH9628200,Zoom-Type Crater 'Rad Transparency' Trainers,footasylum,nike,men,Multicolour,87044,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/foota...,side_shot,beige,2002
1,footasylum.DH9628200,footasylum.DH9628200,Zoom-Type Crater 'Rad Transparency' Trainers,footasylum,nike,men,Multicolour,87045,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/foota...,side_shot,white,2002
2,footasylum.DH9628200,footasylum.DH9628200,Zoom-Type Crater 'Rad Transparency' Trainers,footasylum,nike,men,Multicolour,87046,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/foota...,side_shot,beige,2002
3,footasylum.DH9628200,footasylum.DH9628200,Zoom-Type Crater 'Rad Transparency' Trainers,footasylum,nike,men,Multicolour,87047,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/foota...,pair_shot,white,2002
4,footasylum.DH9628200,footasylum.DH9628200,Zoom-Type Crater 'Rad Transparency' Trainers,footasylum,nike,men,Multicolour,87048,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/foota...,sole_shot,brown,2002
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27280,footasylum.DJ4629002,footasylum.DJ4629002,Asuna Crater Slide,footasylum,nike,men,Multicolour,86991,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/foota...,side_shot,black,843
27281,footasylum.DJ4629002,footasylum.DJ4629002,Asuna Crater Slide,footasylum,nike,men,Multicolour,86992,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/foota...,side_shot,black,843
27282,footasylum.DJ4629002,footasylum.DJ4629002,Asuna Crater Slide,footasylum,nike,men,Multicolour,86993,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/foota...,pair_shot,navy,843
27283,footasylum.DJ4629002,footasylum.DJ4629002,Asuna Crater Slide,footasylum,nike,men,Multicolour,86994,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/foota...,sole_shot,black,843


In [24]:
# Number of distinct product ids in the test set.
test_df.id.nunique()

5732

In [25]:
# Number of distinct labels in the test set.
test_df.label.nunique()

2454

In [26]:
# Largest label value.
test_df.label.max()

2453

In [27]:
# Group-size distribution: for each label count its distinct product ids, then
# count how many labels have each size.
test_df.groupby("label").id.unique().apply(len).value_counts()

2     1898
3      361
4      145
5       38
6        8
9        2
10       1
7        1
Name: id, dtype: int64

In [28]:
# Build an undirected graph over product ids: every pair of ids that share a
# label is connected by an edge.
G = nx.Graph()
ids_per_label = test_df.groupby("label").id.unique()
for label_ids in ids_per_label:
    G.add_edges_from(itertools.combinations(label_ids, 2))

print(
    f"Number of edges : {G.number_of_edges()} | Number of nodes : {G.number_of_nodes()}"
)

# Connected components (largest first) and the distribution of their sizes.
clusters = sorted(nx.connected_components(G), key=len, reverse=True)
cluster_sizes = pd.Series(clusters).apply(len)
cluster_sizes.value_counts().sort_index()


Number of edges : 4489 | Number of nodes : 5732


2     1898
3      361
4      145
5       38
6        8
7        1
9        2
10       1
dtype: int64

In [38]:
# Sort the test rows by imid and replace the index with a fresh 0..n-1 range.
test_df = test_df.sort_values("imid").reset_index(drop=True)
test_df

Unnamed: 0,id,variant_id,title,merchant,brand,gender,color,imid,image_url,image,pose,color_pred,label
0,nike.13836524,nike.13836524,Nike Court Vision Low Shoes,nike,nike,women,Black,16,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/nike/...,side_shot,black,1503
1,nike.13836524,nike.13836524,Nike Court Vision Low Shoes,nike,nike,women,Black,17,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/nike/...,side_shot,black,1503
2,nike.13836524,nike.13836524,Nike Court Vision Low Shoes,nike,nike,women,Black,18,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/nike/...,side_shot,black,1503
3,nike.13836524,nike.13836524,Nike Court Vision Low Shoes,nike,nike,women,Black,19,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/nike/...,pair_shot,black,1503
4,nike.13836524,nike.13836524,Nike Court Vision Low Shoes,nike,nike,women,Black,20,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/nike/...,pair_shot,white,1503
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27280,blacks.16164868,blacks.16164868,Salomon Alphacross Blast Trail Running Shoes,blacks,salomon,men,Multicolour,215066,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/black...,side_shot,black,559
27281,blacks.16164868,blacks.16164868,Salomon Alphacross Blast Trail Running Shoes,blacks,salomon,men,Multicolour,215067,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/black...,sole_shot,black,559
27282,blacks.16164868,blacks.16164868,Salomon Alphacross Blast Trail Running Shoes,blacks,salomon,men,Multicolour,215068,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/black...,partial_shot,black,559
27283,blacks.16164868,blacks.16164868,Salomon Alphacross Blast Trail Running Shoes,blacks,salomon,men,Multicolour,215069,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/black...,partial_shot,black,559


In [42]:
def compute_scores(pred, gt):
    """Compute set-based overlap metrics between predictions and ground truth.

    Both inputs are treated as sets: duplicates are removed before counting, so
    precision and recall use the same unit as the intersection and union.

    Args:
        pred: array-like of predicted identifiers.
        gt: array-like of ground-truth identifiers.

    Returns:
        dict with float entries ``"iou"``, ``"prec"``, ``"recall"`` and ``"f1"``.
        All four are 0.0 when there is no overlap, including the case where
        both inputs are empty.
    """
    pred_set = np.unique(pred)
    gt_set = np.unique(gt)

    tp = len(np.intersect1d(pred_set, gt_set))
    n_union = len(np.union1d(pred_set, gt_set))
    # An empty union (both inputs empty) would otherwise divide by zero.
    iou = tp / n_union if n_union else 0.0

    if tp == 0:
        prec, recall, f1 = (0.0, 0.0, 0.0)
    else:
        prec = tp / len(pred_set)
        recall = tp / len(gt_set)
        f1 = (2 * prec * recall) / (prec + recall)
    return {"iou": iou, "prec": prec, "recall": recall, "f1": f1}


def get_nn_imgs(imids, k=50, threshold=None):
    """Search the global ``index`` for the nearest neighbours of the given rows.

    The query vectors are taken from the global ``features`` array, indexed by
    ``imids``.

    Args:
        imids: a single integer (Python or NumPy) or an array-like of integers
            used to index ``features``.
        k: number of neighbours requested per query.
        threshold: if given, only hits with distance strictly below this value
            are kept. ``None`` keeps every hit returned by the index.

    Returns:
        A list with exactly one entry per query, in query order. Each entry is
        a list of ``(index, distance)`` tuples in the order returned by
        ``index.search``; it is empty if no hit survives the filtering.
    """
    # NumPy integers are not instances of `int`, so test for both.
    if isinstance(imids, (int, np.integer)):
        imids = [imids]
    qx = features[imids]
    D, I = index.search(qx, k=k)

    # Drop slots where the index reported -1 (no result), then apply the
    # optional distance cut-off.
    keep = I != -1
    if threshold is not None:
        keep &= D < threshold

    # Iterate over rows rather than over surviving hits, so a query with no
    # surviving hits still contributes an (empty) list and the output stays
    # aligned with the input.
    return [
        list(zip(I[row, row_keep].tolist(), D[row, row_keep].tolist()))
        for row, row_keep in enumerate(keep)
    ]


def perform_blocking(nns, block_on=("gender", "brand"), block_off=("merchant",)):
    """Filter one query's neighbour list by metadata from the global ``df``.

    The first entry of ``nns`` is taken to be the query itself. A neighbour is
    kept only if it has the same value as the query in every ``block_on``
    column and a different value in every ``block_off`` column. The query is
    always returned as the first element, exactly once.

    Rows are looked up with ``df.iloc``, i.e. by position.
    NOTE(review): this assumes the neighbour indices are row positions in
    ``df`` — confirm against how the feature array was built.

    Args:
        nns: list of ``(index, distance)`` tuples for one query.
        block_on: columns in which a neighbour must equal the query.
        block_off: columns in which a neighbour must differ from the query.

    Returns:
        ``nns`` unchanged if it has at most one entry; otherwise
        ``[(query_index, 0.0)]`` followed by the neighbours that passed every
        filter, in their original order.
    """
    if len(nns) <= 1:
        return nns

    q_imid = nns[0][0]
    query = df.iloc[q_imid]

    # Fetch the metadata of all candidates in one positional lookup instead of
    # one `df.iloc[...]` per neighbour per column.
    others = nns[1:]
    candidates = df.iloc[[nn[0] for nn in others]]

    keep = [True] * len(others)
    for col in block_on:
        q_val = query[col]
        keep = [k and q_val == val for k, val in zip(keep, candidates[col].tolist())]
    for col in block_off:
        q_val = query[col]
        keep = [k and q_val != val for k, val in zip(keep, candidates[col].tolist())]

    return [(q_imid, 0.0)] + [nn for nn, k in zip(others, keep) if k]


def imids_to_id(imids):
    """Return the distinct values of the first column of ``df`` for the given rows.

    Rows are selected by position (``df.iloc``); column 0 is ``id`` in the
    frame displayed when ``df`` is loaded.

    Args:
        imids: a single integer (Python or NumPy) or an array-like of integer
            row positions.

    Returns:
        Array of unique first-column values for the selected rows.
    """
    # A scalar position would make `.iloc` return a scalar, which has no
    # `.unique()`; wrap it so a one-row selection is returned instead.
    if isinstance(imids, (int, np.integer)):
        imids = [imids]
    return df.iloc[imids, 0].unique()


def id_to_imids(ids):
    """Look up the ``imid`` values of the given index label(s) in ``df``.

    The lookup is label-based (``df.loc``).
    NOTE(review): this only works if ``df`` is indexed by product id; the
    frame displayed when ``df`` is loaded has a default integer index — confirm
    that the index is set before this helper is called.

    Args:
        ids: a single index label or a list of labels.

    Returns:
        A one-element list if the lookup yields a single integer, otherwise
        whatever ``df.loc[ids, "imid"]`` returns.
    """
    imids = df.loc[ids, "imid"]
    # pandas returns NumPy integers for scalar lookups, and those are not
    # instances of `int`; test for both so the scalar case is really wrapped.
    if isinstance(imids, (int, np.integer)):
        return [imids]
    return imids


def apply_threshold(nns, threshold=0.5, similarity=False):
    """Return the indices of the neighbours that pass a score threshold.

    Args:
        nns: iterable of ``(index, score)`` pairs.
        threshold: cut-off applied to the score (strict comparison).
        similarity: if False (default) the score is a distance and entries
            with ``score < threshold`` are kept; if True the score is a
            similarity and entries with ``score > threshold`` are kept.

    Returns:
        List of the indices of the kept pairs, in input order.
    """
    if similarity:
        return [nn[0] for nn in nns if nn[1] > threshold]
    return [nn[0] for nn in nns if nn[1] < threshold]


In [None]:
# Load the precomputed feature matrix; get_nn_imgs indexes this array directly
# with the imids it is given.
features = np.load("ld_features.npy")
print(features.shape)

(215429, 512)


In [None]:
# Build a flat L2 FAISS index with the feature dimensionality and add every
# feature row; `ntotal` reports how many vectors the index holds.
index = faiss.IndexFlatL2(features.shape[1])
index.add(features)
index.ntotal

215429

In [44]:
# get nns with dists
print("peforming index search")
t = time.perf_counter()
query_imids = test_df.imid.values
nns = get_nn_imgs(query_imids, k=100)
print("time to search:", time.perf_counter() - t)

# The results are matched back to the queries purely by position below, so a
# missing entry would silently attach every later neighbour list to the wrong
# imid (zip stops at the shorter input without complaining).
assert len(nns) == len(query_imids), "get_nn_imgs must return one list per query"

# blocking
print("peforming blocking")
t = time.perf_counter()
nns = [perform_blocking(nn) for nn in nns]
print("time to perform blocking:", time.perf_counter() - t)

# imid -> blocked neighbour list
nns = dict(zip(query_imids, nns))
len(nns)

peforming index search
time to search: 122.79313182999977
peforming blocking
time to perform blocking: 332.5137490909983


27285

In [86]:
# Index the test frame by imid while keeping `imid` as a regular column too.
# NOTE(review): the recorded output of this cell already shows an `nns` column,
# which is only assigned in the next cell — the cell appears to have been
# re-run out of order; verify with Restart & Run All.
test_df = test_df.set_index("imid", drop=False)
test_df

Unnamed: 0_level_0,id,variant_id,title,merchant,brand,gender,color,imid,image_url,image,pose,color_pred,label,nns
imid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
16,nike.13836524,nike.13836524,Nike Court Vision Low Shoes,nike,nike,women,Black,16,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/nike/...,side_shot,black,1503,"[(16, 0.0)]"
17,nike.13836524,nike.13836524,Nike Court Vision Low Shoes,nike,nike,women,Black,17,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/nike/...,side_shot,black,1503,"[(17, 0.0), (107817, 0.3991560935974121), (107..."
18,nike.13836524,nike.13836524,Nike Court Vision Low Shoes,nike,nike,women,Black,18,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/nike/...,side_shot,black,1503,"[(18, 0.0), (109690, 0.4614090919494629), (107..."
19,nike.13836524,nike.13836524,Nike Court Vision Low Shoes,nike,nike,women,Black,19,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/nike/...,pair_shot,black,1503,"[(19, 0.0), (107848, 0.6298296451568604), (109..."
20,nike.13836524,nike.13836524,Nike Court Vision Low Shoes,nike,nike,women,Black,20,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/nike/...,pair_shot,white,1503,"[(20, 0.0), (107817, 0.3739442825317383), (107..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215066,blacks.16164868,blacks.16164868,Salomon Alphacross Blast Trail Running Shoes,blacks,salomon,men,Multicolour,215066,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/black...,side_shot,black,559,"[(215066, 0.0)]"
215067,blacks.16164868,blacks.16164868,Salomon Alphacross Blast Trail Running Shoes,blacks,salomon,men,Multicolour,215067,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/black...,sole_shot,black,559,"[(215067, 0.0)]"
215068,blacks.16164868,blacks.16164868,Salomon Alphacross Blast Trail Running Shoes,blacks,salomon,men,Multicolour,215068,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/black...,partial_shot,black,559,"[(215068, 0.0)]"
215069,blacks.16164868,blacks.16164868,Salomon Alphacross Blast Trail Running Shoes,blacks,salomon,men,Multicolour,215069,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/black...,partial_shot,black,559,"[(215069, 0.0)]"


In [89]:
# Attach each row's neighbour list by looking its imid up in the `nns` dict.
test_df["nns"] = test_df.imid.map(nns)
test_df.nns

imid
16                                              [(16, 0.0)]
17        [(17, 0.0), (107817, 0.3991560935974121), (107...
18        [(18, 0.0), (109690, 0.4614090919494629), (107...
19        [(19, 0.0), (107848, 0.6298296451568604), (109...
20        [(20, 0.0), (107817, 0.3739442825317383), (107...
                                ...                        
215066                                      [(215066, 0.0)]
215067                                      [(215067, 0.0)]
215068                                      [(215068, 0.0)]
215069                                      [(215069, 0.0)]
215070                                      [(215070, 0.0)]
Name: nns, Length: 27285, dtype: object