### kedro catalogs

In [None]:
# Load the Kedro IPython extension into this notebook session.
%load_ext kedro.extras.extensions.ipython

In [None]:
# Reload the Kedro project.
# NOTE(review): presumably this is what defines `catalog` and `context` used by the cells below — confirm for your Kedro version.
%reload_kedro

In [None]:
# Show what the project's data catalog contains.
catalog.list()

### loading data

In [None]:
# Load the 'image_embeddings' dataset through the Kedro catalog.
# NOTE(review): later cells use it as a DataFrame indexed by article id (.loc / .index / .values) — confirm.
embeddings = context.catalog.load('image_embeddings')

In [None]:
# Load the 'articles' dataset; later cells read its article_id, prod_name,
# product_group_name and colour_group_name columns.
articles = context.catalog.load('articles')

In [None]:
# Display the embeddings object.
embeddings

In [None]:
# Preview the first rows of articles.
articles.head()

### show similar articles

Alternatively, `torch.cdist` computes pairwise distances between vectors if you prefer to do these calculations in PyTorch.

In [None]:
from sklearn.neighbors import KDTree
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import os
import pandas as pd

In [None]:
# Show the working directory before changing it.
os.getcwd()

In [None]:
# Switch into the image folder: later cells open '<article_id>.jpg' by bare file
# name and write their outputs via '../../../notebooks/...'.
# The path is relative, so this cell only works from the original working
# directory; re-running it after a successful chdir resolves against the new one.
os.chdir('../data/01_raw/images_128_128')

In [None]:
# Confirm the new working directory.
os.getcwd()

In [None]:
# Build a KD-tree over the raw embedding vectors; find_similar_images queries it
# for nearest neighbours.
tree = KDTree(embeddings.values, leaf_size=5)

In [None]:
def find_similar_images(query_article_id, embeddings, tree, k=5, image_dir='.'):
    """Plot the images of the ``k`` articles nearest to a query article.

    The query article's embedding is looked up in ``embeddings`` and used to
    query ``tree``; the images of the returned neighbours are drawn side by
    side, each titled with its article id.

    Args:
        query_article_id: Index label in ``embeddings`` of the query article.
        embeddings: DataFrame of embedding vectors indexed by article id.
            Its row order must match the data ``tree`` was built from,
            because the neighbour positions returned by the tree are mapped
            back to ids with ``embeddings.iloc``.
        tree: Nearest-neighbour index exposing ``query(X, k=...)`` that
            returns ``(distances, indices)``.
        k: Number of neighbours to show. Defaults to 5, the previously
            hard-coded value.
        image_dir: Directory containing ``<article_id>.jpg`` files. Defaults
            to the current working directory, which is where the images were
            read from before this parameter existed.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    query_vector = embeddings.loc[query_article_id].values.reshape(1, -1)
    _, ind = tree.query(query_vector, k=k)
    # ind has one row per query vector; there is exactly one query here.
    closest_embeddings = embeddings.iloc[ind[0]].index.tolist()
    imgs = [
        Image.open(os.path.join(image_dir, f'{article_id}.jpg')).convert('RGB')
        for article_id in closest_embeddings
    ]
    # squeeze=False keeps axs a 2-D array even when k == 1, so flatten() works.
    _, axs = plt.subplots(1, k, figsize=(12, 12), squeeze=False)
    for img, article_id, ax in zip(imgs, closest_embeddings, axs.flatten()):
        ax.title.set_text(f'{article_id}')
        ax.imshow(img)
    plt.show()

In [None]:
# Example: show the nearest neighbours of one hand-picked article id.
find_similar_images('0680263001', embeddings, tree)

In [None]:
def plot_n_random_similar_images(n, articles):
    """Show similar-image plots for ``n`` randomly sampled articles.

    For each draw, one row is sampled from ``articles``, its product name,
    article id and product group are printed, and ``find_similar_images`` is
    called for that article id.

    Relies on the notebook-level globals ``embeddings`` and ``tree``.

    Args:
        n: Number of random articles to plot.
        articles: DataFrame with ``prod_name``, ``article_id`` and
            ``product_group_name`` columns.

    Returns:
        None.
    """
    for i in range(n):
        # sample(1) returns a one-row DataFrame; take the row itself so the
        # f-string prints plain values rather than pandas Series reprs
        # (index label, "Name:", "dtype:").
        random_article = articles.sample(1).iloc[0]
        prod_name = random_article['prod_name']
        article_id = random_article['article_id']
        product_group_name = random_article['product_group_name']
        print(f'''{i+1} random article:
        {prod_name},
        {article_id},
        {product_group_name}
        ''')
        find_similar_images(article_id, embeddings, tree)
        print('\n\n\n')

In [None]:
# Plot neighbours for five randomly drawn articles.
plot_n_random_similar_images(5, articles)

In [None]:
# Show the nearest neighbours of one random article from every product group.
for product_group in articles.product_group_name.unique():
    print(product_group)
    random_article = articles[articles['product_group_name'] == product_group].sample(1)
    # sample(1) returns a one-row DataFrame; pull out scalars so the message
    # shows the values instead of pandas Series reprs.
    random_article_id = random_article['article_id'].iloc[0]
    random_article_name = random_article['prod_name'].iloc[0]
    print(f'Random article id: {random_article_id}, random_article name: {random_article_name}')
    find_similar_images(random_article_id, embeddings, tree)
    print('\n\n\n')

### Tensorboard clustering

In [None]:
# The output paths below are relative to the current working directory.
os.getcwd()

In [None]:
# Draw 4,000 rows from the embeddings for the TensorBoard export.
embeddings_sample = embeddings.sample(n=4_000)

In [None]:
# Index labels of the sampled rows, in sample order; used below as image file
# stems and to order the metadata rows.
all_images = embeddings_sample.index.to_list()

In [None]:
# Write only the vector values, tab-separated: no index column, no header row.
embeddings_sample.to_csv('../../../notebooks/tensorboard_embeddings/feature_vecs.tsv', sep="\t", index=False, header=False)

In [None]:
# Open every sampled article's image (path relative to the cwd) and convert it to RGB.
images_list = [Image.open(f'{article_id}.jpg').convert('RGB') for article_id in all_images]

In [None]:
# Use the first image's size as the tile size for the whole sprite.
# NOTE(review): assumes every image has this same size — confirm.
image_width, image_height = images_list[0].size

In [None]:
image_width, image_height

In [None]:
# Side length, in tiles, of the smallest square grid that fits all images.
one_square_size = int(np.ceil(np.sqrt(len(images_list))))

In [None]:
# Pixel dimensions of the full sprite image.
master_width = image_width * one_square_size
master_height = image_height * one_square_size

In [None]:
master_width, master_height

In [None]:
# Blank RGBA canvas that the tiles are pasted onto.
spriteimage = Image.new(
    mode='RGBA',
    size=(master_width, master_height),
    color=(0,0,0,0))  # fully transparent

In [None]:
# Tile the images into the sprite grid row by row: `row` and `col` are the
# tile's grid coordinates, converted to pixel offsets below.
for count, image in enumerate(images_list):
    row, col = divmod(count, one_square_size)
    # The vertical offset must step by tile height; the previous code used
    # image_width here, which is only correct for square tiles.
    h_loc = image_height * row
    w_loc = image_width * col
    spriteimage.paste(image, (w_loc, h_loc))

In [None]:
# Drop the alpha channel and write the sprite to sprite.jpg.
# NOTE(review): `transparency=0` is passed after converting to RGB and for a .jpg target; it probably has no effect — confirm and remove if so.
spriteimage.convert("RGB").save('../../../notebooks/tensorboard_embeddings/sprite.jpg', transparency=0)

In [None]:
# Peek at the first few sampled ids.
all_images[:10]

In [None]:
# Keep only the articles whose id is in the sample; copy so the edits below do
# not touch `articles`.
sample_articles = articles[articles.article_id.isin(all_images)].copy()

In [None]:
# NOTE(review): the row count here should equal len(all_images); otherwise the
# metadata rows will not line up one-to-one with feature_vecs.tsv — check.
sample_articles.shape

In [None]:
# Ordered categorical whose category order is all_images, so that sorting
# follows the sample order rather than the id values.
sample_articles['article_id'] = pd.Categorical(sample_articles.article_id, categories=all_images, ordered=True)

In [None]:
# Put the metadata rows in the same order as the sampled embeddings.
sample_articles.sort_values(by='article_id', inplace=True)

In [None]:
sample_articles.head()

In [None]:
# Write the selected metadata columns tab-separated, with a header row and
# without the index.
sample_articles[['article_id', 'product_group_name', 'colour_group_name']].to_csv('../../../notebooks/tensorboard_embeddings/metadata.tsv', sep='\t', index=False)

In [None]:
# !cd to your folder with the files

In [None]:
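# create a config file named projector_config.pbtxt in that folder:
#embeddings {
#  tensor_path: "feature_vecs.tsv"
#  metadata_path: "metadata.tsv"
#  sprite {
#    image_path: "sprite.jpg"
#    single_image_dim: 50
#    single_image_dim: 50
#  }
#}
# NOTE: single_image_dim appears twice on purpose (width, then height of one
# sprite tile); set both to the image_width / image_height computed above.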

In [None]:
# !tensorboard --logdir .

In [None]:
# open the Projector tab in TensorBoard -> select a dimensionality reduction technique