In [1]:
!nproc & nvidia-smi -L

20
GPU 0: NVIDIA GeForce RTX 3060 (UUID: GPU-125eaf29-537f-5e5a-8a00-140ef5b42572)


In [3]:
import os
import time
import json
import random
import itertools
from functools import partial
from collections import defaultdict
from pprint import pprint

import pandas as pd
import numpy as np
import networkx as nx

import timm
import torch
import textdistance

import matplotlib.pyplot as plt
import seaborn as sns
import ipyplot as iplt
from IPython.display import HTML, display
from tqdm.notebook import tqdm

tqdm.pandas()

from typing import Dict, List, Any

import faiss

faiss.omp_set_num_threads(16)

data_dir = "/data/"


In [4]:
from dotenv import load_dotenv; load_dotenv()
from s3fs import S3FileSystem
s3 = S3FileSystem()
%load_ext sql

In [5]:
df = pd.read_parquet("s3://aisle3-ml-datasets/product-matching/aisle3/main.parquet")
df

Unnamed: 0,id,variant_id,title,merchant,brand,gender,color,imid,image_url,image,pose,color_pred
0,allsole.10491511,allsole.10491633,Vans Authentic Canvas Trainers,allsole,vans,unisex,Black,26789,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/allso...,side_shot,black
1,allsole.10491511,allsole.10491633,Vans Authentic Canvas Trainers,allsole,vans,unisex,Black,26790,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/allso...,side_shot,black
2,allsole.10491511,allsole.10491633,Vans Authentic Canvas Trainers,allsole,vans,unisex,Black,26791,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/allso...,side_shot,black
3,allsole.10491511,allsole.10491633,Vans Authentic Canvas Trainers,allsole,vans,unisex,Black,26792,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/allso...,upper_shot,black
4,allsole.10491511,allsole.10491633,Vans Authentic Canvas Trainers,allsole,vans,unisex,Black,26793,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/allso...,partial_shot,black
...,...,...,...,...,...,...,...,...,...,...,...,...
215424,ssense.221903M237021,ssense.221903M237021,Coach 1941 Black & Off-White Logo Slide Sandals,ssense,coach,men,Chalk black,95785,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/ssens...,partial_shot,maroon
215425,ssense.221903M237021,ssense.221903M237021,Coach 1941 Black & Off-White Logo Slide Sandals,ssense,coach,men,Chalk black,95786,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/ssens...,pair_shot,white
215426,ssense.221903M237021,ssense.221903M237021,Coach 1941 Black & Off-White Logo Slide Sandals,ssense,coach,men,Chalk black,95787,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/ssens...,side_shot,white
215427,ssense.221903M237021,ssense.221903M237021,Coach 1941 Black & Off-White Logo Slide Sandals,ssense,coach,men,Chalk black,95788,https://aisle-3-image-final.s3.eu-west-2.amazo...,s3://aisle-3-image-final/images/original/ssens...,pair_shot,beige


In [6]:
# Turn the S3 URIs into paths that can be appended to `data_dir`: strip the "s3://"
# scheme and add the ".jpg" extension.
# NOTE(review): this cell is not idempotent — executing it a second time appends
# ".jpg" again; reload `df` before re-running it.
df.image = df.image.str.replace("s3://", "") + ".jpg"
df.image

0         aisle-3-image-final/images/original/allsole/al...
1         aisle-3-image-final/images/original/allsole/al...
2         aisle-3-image-final/images/original/allsole/al...
3         aisle-3-image-final/images/original/allsole/al...
4         aisle-3-image-final/images/original/allsole/al...
                                ...                        
215424    aisle-3-image-final/images/original/ssense/sse...
215425    aisle-3-image-final/images/original/ssense/sse...
215426    aisle-3-image-final/images/original/ssense/sse...
215427    aisle-3-image-final/images/original/ssense/sse...
215428    aisle-3-image-final/images/original/ssense/sse...
Name: image, Length: 215429, dtype: object

In [7]:
(data_dir + df.image).apply(os.path.isfile).value_counts()

True    215429
Name: image, dtype: int64

In [8]:
%%sql result << 
SELECT
    `source`,
    `target`,
    `match`
FROM gold_annotations

1190 rows affected.
Returning data to local variable result


In [9]:
annotations = result.DataFrame()

In [10]:
annotations.match.value_counts()

0    797
1    393
Name: match, dtype: int64

In [11]:
print(
    "ids in annotations not in main:",
    len(np.setdiff1d(np.union1d(annotations.source, annotations.target), df.id)),
)


ids in annotations not in main: 0


In [12]:
# Build an undirected graph over every annotated product id: each id that appears as
# `source` or `target` is a node, and each positive annotation (match == 1) is an edge.
G = nx.Graph()
G.add_nodes_from(np.union1d(annotations.source, annotations.target))
G.add_edges_from(annotations.loc[annotations.match == 1, ["source", "target"]].values)
print(
    f"Number of nodes: {G.number_of_nodes()} | Number of edges: {G.number_of_edges()}"
)
# Connected components are the match clusters, largest first. Ids that never take part
# in a positive annotation have no edges and therefore form singleton clusters.
clusters = sorted(nx.connected_components(G), key=len, reverse=True)
# Distribution of cluster sizes (size -> number of clusters).
pd.Series(clusters).apply(len).value_counts().sort_index()


Number of nodes: 655 | Number of edges: 393


1    116
2    198
3     25
4     11
5      3
9      1
dtype: int64

In [14]:
def loc(frame, idx, col):
    """Look up ``frame.loc[idx, col]`` and collapse the result to a single value.

    When the label lookup yields a ``pd.Series`` (for instance because ``idx``
    occurs several times in the index) only its first element is returned; any
    other result is handed back untouched.
    """
    value = frame.loc[idx, col]
    return value.iloc[0] if isinstance(value, pd.Series) else value

In [26]:
# Ground truth per annotated product: the members of its match cluster that share the
# anchor's brand and gender but are sold by a different merchant, with the anchor itself
# placed first.
#
# One attribute row per product id (first occurrence wins). The table is built from the
# `id` *column*, so this cell does not depend on `df` already being indexed by `id`,
# which only happens in a later cell.
product_attrs = df.drop_duplicates("id").set_index("id")[["brand", "gender", "merchant"]]

ground_truth = defaultdict(list)
for cluster in clusters:
    for anchor in cluster:
        # The anchor's attributes are loop-invariant: read them once per anchor.
        anchor_attrs = product_attrs.loc[anchor]
        matches = [
            match
            for match in cluster
            if product_attrs.at[match, "brand"] == anchor_attrs["brand"]
            and product_attrs.at[match, "gender"] == anchor_attrs["gender"]
            # Same-merchant products (including the anchor itself) are excluded here.
            and product_attrs.at[match, "merchant"] != anchor_attrs["merchant"]
        ]
        ground_truth[anchor] = [anchor] + matches
len(ground_truth)

655

In [27]:
df["ground_truth"] = df.id.map(ground_truth)
df

Unnamed: 0_level_0,id,variant_id,title,merchant,brand,gender,color,imid,image_url,image,pose,color_pred,ground_truth
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
allsole.10491511,allsole.10491511,allsole.10491633,Vans Authentic Canvas Trainers,allsole,vans,unisex,Black,0,https://aisle-3-image-final.s3.eu-west-2.amazo...,aisle-3-image-final/images/original/allsole/al...,side_shot,black,[]
allsole.10491511,allsole.10491511,allsole.10491633,Vans Authentic Canvas Trainers,allsole,vans,unisex,Black,1,https://aisle-3-image-final.s3.eu-west-2.amazo...,aisle-3-image-final/images/original/allsole/al...,side_shot,black,[]
allsole.10491511,allsole.10491511,allsole.10491633,Vans Authentic Canvas Trainers,allsole,vans,unisex,Black,2,https://aisle-3-image-final.s3.eu-west-2.amazo...,aisle-3-image-final/images/original/allsole/al...,side_shot,black,[]
allsole.10491511,allsole.10491511,allsole.10491633,Vans Authentic Canvas Trainers,allsole,vans,unisex,Black,3,https://aisle-3-image-final.s3.eu-west-2.amazo...,aisle-3-image-final/images/original/allsole/al...,upper_shot,black,[]
allsole.10491511,allsole.10491511,allsole.10491633,Vans Authentic Canvas Trainers,allsole,vans,unisex,Black,4,https://aisle-3-image-final.s3.eu-west-2.amazo...,aisle-3-image-final/images/original/allsole/al...,partial_shot,black,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
ssense.221903M237021,ssense.221903M237021,ssense.221903M237021,Coach 1941 Black & Off-White Logo Slide Sandals,ssense,coach,men,Chalk black,215424,https://aisle-3-image-final.s3.eu-west-2.amazo...,aisle-3-image-final/images/original/ssense/sse...,partial_shot,maroon,[]
ssense.221903M237021,ssense.221903M237021,ssense.221903M237021,Coach 1941 Black & Off-White Logo Slide Sandals,ssense,coach,men,Chalk black,215425,https://aisle-3-image-final.s3.eu-west-2.amazo...,aisle-3-image-final/images/original/ssense/sse...,pair_shot,white,[]
ssense.221903M237021,ssense.221903M237021,ssense.221903M237021,Coach 1941 Black & Off-White Logo Slide Sandals,ssense,coach,men,Chalk black,215426,https://aisle-3-image-final.s3.eu-west-2.amazo...,aisle-3-image-final/images/original/ssense/sse...,side_shot,white,[]
ssense.221903M237021,ssense.221903M237021,ssense.221903M237021,Coach 1941 Black & Off-White Logo Slide Sandals,ssense,coach,men,Chalk black,215427,https://aisle-3-image-final.s3.eu-west-2.amazo...,aisle-3-image-final/images/original/ssense/sse...,pair_shot,beige,[]


In [28]:
df.set_index("id", drop=False, inplace=True)
df.imid = range(len(df))

In [65]:
def viz(
    frame: pd.DataFrame,
    image_size: int = 200,
    max_images: int = None,
    **kwargs: Any,
) -> None:
    """Render the images of ``frame`` as an ipyplot gallery with metadata captions.

    Args:
        frame: rows to display; must provide the columns ``image_url``, ``id``,
            ``title``, ``merchant``, ``brand``, ``gender`` and ``color``.
        image_size: forwarded to ipyplot as ``img_width``.
        max_images: maximum number of images to show; a falsy value shows every row.
        **kwargs: optional per-row annotations, one sequence per keyword, each with
            the same length as ``frame``. Every value is rendered as
            ``"<keyword> : <value>"`` above the title.

    Raises:
        AssertionError: if the keyword sequences do not have ``len(frame)`` entries.
    """
    frame = frame.reset_index(drop=True)

    def _as_text(values):
        # `str.join` raises TypeError on NaN / non-string cells, so missing values
        # become "" and everything else is converted to str before joining.
        return values.astype(object).where(values.notna(), "").astype(str)

    caption = "<HR>"
    if kwargs:
        extra_df = pd.DataFrame(kwargs)
        assert len(extra_df) == len(frame), "kwargs must have same length as frame"
        for col in extra_df.columns:
            extra_df[col] = f"{col} : " + extra_df[col].astype(str)
        caption = caption + "<HR>" + extra_df.apply(" <br> ".join, axis=1) + "<HR><HR>"

    caption = (
        caption
        + _as_text(frame.title)
        + "<HR>"
        + _as_text(frame[["merchant", "brand", "gender", "color"]]).apply(
            " <br> ".join, axis=1
        )
    )

    iplt.plot_images(
        frame.image_url,
        labels=frame.id,
        custom_texts=caption,
        img_width=image_size,
        show_url=False,
        max_images=max_images if max_images else len(frame),
    )


viz(df.tail(4), dist=np.linspace(0.1, 0.9, 4))


In [29]:
def compute_scores(pred, gt):
    """Score a predicted collection of ids against the ground-truth collection.

    Both inputs are treated as sets, so repeated ids count once. This keeps the
    precision/recall denominators consistent with the (already set-based) count of
    true positives.

    Args:
        pred: iterable of predicted ids.
        gt: iterable of ground-truth ids.

    Returns:
        dict with float entries ``iou``, ``prec``, ``recall`` and ``f1``. All four are
        ``0.0`` when there is no overlap, including the case where both inputs are
        empty.
    """
    pred_ids, gt_ids = set(pred), set(gt)
    tp = len(pred_ids & gt_ids)
    n_union = len(pred_ids | gt_ids)
    # Guard the empty-vs-empty case, which would otherwise divide by zero.
    iou = tp / n_union if n_union else 0.0
    if tp == 0:
        prec, recall, f1 = (0.0, 0.0, 0.0)
    else:
        prec = tp / len(pred_ids)
        recall = tp / len(gt_ids)
        f1 = (2 * prec * recall) / (prec + recall)
    return {"iou": iou, "prec": prec, "recall": recall, "f1": f1}


def get_nn_imgs(imids, k=50, threshold=None):
    """Search the global ``index`` for the nearest neighbours of the given images.

    Args:
        imids: a single row position into the global ``features`` array, or a
            sequence of positions. Python and NumPy integers are both accepted.
        k: number of neighbours requested per query.
        threshold: if given, only neighbours with distance strictly below this value
            are kept. ``None`` applies no distance filter at all.

    Returns:
        A list with exactly one entry per query, in query order. Each entry is a list
        of ``(neighbour_position, distance)`` tuples in the order returned by the
        search; it is empty when no neighbour survives the filter. Keeping one entry
        per query is what allows callers to ``zip`` the result with ``imids``.
    """
    imids = np.atleast_1d(np.asarray(imids, dtype=np.int64))
    D, I = index.search(features[imids], k=k)

    # Slots the search could not fill are marked with -1 and must be discarded.
    keep = I != -1
    if threshold is not None:
        keep &= D < threshold

    return [
        list(zip(row_ids[row_keep].tolist(), row_dists[row_keep].tolist()))
        for row_ids, row_dists, row_keep in zip(I, D, keep)
    ]


def perform_blocking(nns, block_on=("gender", "brand"), block_off=("merchant",)):
    """Filter one query's neighbour list by comparing metadata with the query image.

    NOTE(review): the first entry of ``nns`` is taken to be the query image itself;
    confirm that this holds for the search results passed in.

    Args:
        nns: list of ``(position, distance)`` tuples, where ``position`` is a row
            position into the global ``df``.
        block_on: columns whose value must equal the query's value.
        block_off: columns whose value must differ from the query's value.

    Returns:
        ``nns`` unchanged when it has at most one entry. Otherwise
        ``[(query_position, 0.0)]`` followed by the neighbours that pass every
        filter, in their original order.
    """
    if len(nns) <= 1:
        return nns

    positions = [nn[0] for nn in nns]
    q_imid = positions[0]
    keep = [True] * len(nns)

    # Read each blocking column once for all candidates instead of materialising a
    # full row per candidate per column. `values[0]` is the query's own value because
    # the query is the first candidate.
    for col in block_on:
        values = df[col].iloc[positions].tolist()
        keep = [ok and value == values[0] for ok, value in zip(keep, values)]
    for col in block_off:
        values = df[col].iloc[positions].tolist()
        keep = [ok and value != values[0] for ok, value in zip(keep, values)]

    return [(q_imid, 0.0)] + [nn for nn, ok in zip(nns, keep) if ok]


def imids_to_id(imids):
    """Return the unique product ids of the rows at the given positions of ``df``.

    Args:
        imids: a single row position (Python or NumPy integer) or anything accepted
            by ``Series.iloc`` (list, array, slice, ...).

    Returns:
        Array of unique ``id`` values in order of first appearance.
    """
    if isinstance(imids, (int, np.integer)):
        imids = [imids]
    # Address the column by name rather than by position 0 so that reordering or
    # inserting columns cannot silently return a different column.
    return df["id"].iloc[imids].unique()


def id_to_imids(ids):
    """Return the ``imid`` values of the rows of ``df`` labelled with ``ids``.

    The lookup goes through ``df.loc``, so ``df`` must be indexed by product id.

    Args:
        ids: a single product id or a collection of ids.

    Returns:
        A one-element list when the lookup yields a single integer, otherwise the
        ``pd.Series`` produced by ``df.loc``.
    """
    imids = df.loc[ids, "imid"]
    # A scalar lookup yields a NumPy integer, which is not an instance of `int`;
    # both kinds have to be checked for the scalar branch to be reachable.
    if isinstance(imids, (int, np.integer)):
        return [int(imids)]
    return imids


def apply_threshold(nns, threshold=0.5, similarity=False):
    """Select the neighbour positions whose score passes ``threshold``.

    Args:
        nns: iterable of ``(position, score)`` pairs.
        threshold: cut-off applied to the score.
        similarity: when ``False`` (default) the score is treated as a distance and
            entries strictly below ``threshold`` are kept. When ``True`` the score is
            treated as a similarity and entries strictly above ``threshold`` are kept.

    Returns:
        List of positions, in input order, whose score passes the cut-off.
    """
    if similarity:
        return [nn[0] for nn in nns if nn[1] > threshold]
    return [nn[0] for nn in nns if nn[1] < threshold]


In [31]:
features = np.load("ld_features.npy")
features.shape

(215429, 512)

In [34]:
index = faiss.IndexFlatL2(features.shape[1])
index.add(features)
index.ntotal

215429

# Analyze popular products 

In [37]:
pred_df = pd.read_csv("../../notes/popular.csv")
pred_df

Unnamed: 0,brand,title,gender,master_product,offer_id
0,Nike,Air Max 270 Trainer,men,f45773a1-0dd3-4f16-a5ac-70dcaa45882b,footlocker.314212239504
1,Nike,Air Max 90 LTR Trainer,men,230286c1-0000-4472-930f-8a9a0057e235,footasylum.CZ5594100
2,Nike,Air Max 90 LTR Trainer,men,10a1fd88-1eea-4c6b-88a2-53f6ac1a3add,footlocker.314206402704
3,Nike,Air Max 90 LTR Trainer,men,10a1fd88-1eea-4c6b-88a2-53f6ac1a3add,asos.1666471
4,Nike,Air Max 90 LTR Trainer,men,34303c91-eb38-4a2f-91f8-4d0a785251cb,footasylum.CN8490001
...,...,...,...,...,...
156,New Balance,57/40 Trainer,men,5138bd9d-adf0-4b70-84a1-214adbee1d64,endclothing.3087649156
157,New Balance,57/40 Trainer,men,5138bd9d-adf0-4b70-84a1-214adbee1d64,footasylum.M5740SB1
158,New Balance,57/40 Trainer,men,5138bd9d-adf0-4b70-84a1-214adbee1d64,schuh.3403301150
159,New Balance,57/40 Trainer,men,5138bd9d-adf0-4b70-84a1-214adbee1d64,footlocker.314214787104


In [38]:
pred_df.offer_id.isin(df.index).value_counts()

True    161
Name: offer_id, dtype: int64

In [41]:
viz(df.loc[set(pred_df.offer_id)].drop_duplicates("id"))

In [47]:
# `imid` of every image row that belongs to one of the offers in `pred_df`.
pred_imids = pred_df.groupby("offer_id").offer_id.apply(id_to_imids).values
print("pred_imids", len(pred_imids))

# get nns with dists
print("peforming index search")
t = time.perf_counter()
nns = get_nn_imgs(pred_imids, k=100)
print("time to search:", time.perf_counter() - t)
# The results are zipped with `pred_imids` below; a missing entry would silently shift
# every later neighbour list onto the wrong query image, so fail loudly instead.
assert len(nns) == len(pred_imids), "expected one neighbour list per query image"

# blocking
print("peforming blocking")
t = time.perf_counter()
nns = [perform_blocking(nn) for nn in nns]
print("time to perform blocking:", time.perf_counter() - t)

# Key the blocked neighbour lists by their query image.
nns = dict(zip(pred_imids, nns))
len(nns)


pred_imids 867
peforming index search
time to search: 4.1847757779996755
peforming blocking
time to perform blocking: 8.262133067999457


867

In [51]:
df.pose.value_counts()

pair_shot          59832
side_shot          54196
partial_shot       31670
human_wear_shot    23285
upper_shot         12811
3qrt_shot          12510
sole_shot          11058
heel_shot           8256
non_available       1811
Name: pose, dtype: int64

In [53]:
df.loc[pred_df.offer_id.unique()].pose.value_counts()

pair_shot          368
side_shot          264
partial_shot       102
sole_shot           92
human_wear_shot     20
non_available        7
3qrt_shot            6
heel_shot            4
upper_shot           4
Name: pose, dtype: int64

In [62]:
df.loc[pred_df.offer_id.unique()].reset_index(drop=True).groupby("id").pose.value_counts()

id                        pose           
asos.104728610            partial_shot       3
                          pair_shot          1
asos.110858489            partial_shot       3
                          human_wear_shot    1
asos.1151505              pair_shot          3
                                            ..
sportsdirect.12101202270  side_shot          2
                          sole_shot          1
sportsdirect.16102901270  pair_shot          3
                          side_shot          2
                          sole_shot          1
Name: pose, Length: 467, dtype: int64

In [80]:
df.iloc[nn_imids].pose

id
footasylum.37310826         side_shot
sportsdirect.12138402270    side_shot
sportsdirect.12138402270    side_shot
sportsdirect.12138402270    pair_shot
Name: pose, dtype: object

In [91]:
# Number of query images to inspect.
N_EXAMPLES = 20

# Sample without replacement so the same query image is never rendered twice.
# `random.sample` needs a real sequence, hence the explicit `list(...)`.
for imid in random.sample(list(pred_imids), k=min(N_EXAMPLES, len(pred_imids))):
    nn_images = nns[imid]
    nn_imids, nn_dists = map(list, zip(*nn_images))
    # Look the neighbour rows up once and reuse them for every annotation below.
    nn_rows = df.iloc[nn_imids]
    display(HTML(f"<h2>{df.iloc[imid].id}  |  Ground Truth : [ {', '.join(df.iloc[imid].ground_truth)} ]</h2>"))

    viz(
        nn_rows,
        dist=nn_dists,
        pose=nn_rows.pose.values,
        color=nn_rows.color_pred.values,
    )
    display(HTML("<HR>" * 3))