In [34]:
import pandas as pd
import numpy as np
import pickle

In [23]:
# Load the DeepFashion category table: image paths with their category labels and
# the train/val/test split each image belongs to (see the preview below).
data = pd.read_csv('DeepFashion1/deepfashion1_categoryData.csv')

In [24]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,images,category_label,dataset,category,label
0,img/Sheer_Pleated-Front_Blouse/img_00000001.jpg,3,train,Blouse,2
1,img/Sheer_Pleated-Front_Blouse/img_00000002.jpg,3,train,Blouse,2
2,img/Sheer_Pleated-Front_Blouse/img_00000003.jpg,3,val,Blouse,2
3,img/Sheer_Pleated-Front_Blouse/img_00000004.jpg,3,train,Blouse,2
4,img/Sheer_Pleated-Front_Blouse/img_00000005.jpg,3,test,Blouse,2


In [25]:
# Number of images per category; the classes are heavily imbalanced,
# which is why they are resampled further down.
data.category.value_counts()

Dress          72158
Tee            36887
Blouse         24557
Shorts         19666
Tank           15429
Skirt          14773
Cardigan       13311
Sweater        13123
Jacket         10467
Top            10078
Blazer          7495
Romper          7408
Jeans           7076
Jumpsuit        6153
Leggings        5013
Joggers         4416
Hoodie          4048
Sweatpants      3048
Kimono          2294
Coat            2120
Cutoffs         1669
Sweatshorts     1106
Poncho           791
Jersey           748
Henley           716
Parka            676
Jeggings         594
Chinos           527
Culottes         486
Trunks           386
Button-Down      330
Flannel          324
Bomber           309
Anorak           160
Robe             150
Turtleneck       146
Kaftan           126
Peacoat           97
Capris            77
Onesie            70
Caftan            54
Gauchos           49
Jodhpurs          45
Sarong            32
Coverup           17
Halter            17
Name: category, dtype: int64

In [31]:
def resample(df, col_name, min_treshold, random_seed=42):
    """Balance ``df`` by downsampling every sufficiently large class to one common size.

    Classes are the distinct values of ``df[col_name]``. A class is kept only when it
    has strictly more than ``min_treshold`` rows (a class with exactly ``min_treshold``
    rows is dropped). Every kept class is then randomly downsampled, without
    replacement, to the size of the smallest kept class.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    col_name : str
        Name of the column holding the class of each row.
    min_treshold : int
        Classes with this many rows or fewer are removed.
    random_seed : int, default 42
        Seed used for the sampling of every class, so results are reproducible.

    Returns
    -------
    pandas.DataFrame
        The balanced rows, ordered by class, with a fresh 0..n-1 index and the same
        columns as ``df``. Empty (but with the same columns) when no class passes
        the threshold.
    """
    # drop every class that does not have strictly more than min_treshold rows
    resampled_data = df.groupby(col_name).filter(lambda x: len(x) > min_treshold)
    if resampled_data.empty:
        # no class survived the threshold, so there is nothing to sample from
        return resampled_data.reset_index(drop=True)
    # size of the smallest surviving class
    min_count = resampled_data[col_name].value_counts().min()
    # Sample group by group instead of going through groupby.apply: newer pandas
    # versions deprecate (and later stop) passing the grouping column to the applied
    # function, and reset_index(drop=True) would then throw the class column away.
    # Iterating the groups yields them in the same sorted order apply used.
    sampled_groups = [
        group.sample(min_count, random_state=random_seed)
        for _, group in resampled_data.groupby(col_name)
    ]
    return pd.concat(sampled_groups).reset_index(drop=True)

In [32]:
# Keep only the categories with more than 5000 images and downsample each of them
# to the size of the smallest category kept (sampling seed 42).
final_data = resample(data, 'category', 5000, 42)

In [33]:
# Sanity check: after resampling, every remaining category should have the same count.
final_data.category.value_counts()

Skirt       5013
Jumpsuit    5013
Tee         5013
Tank        5013
Cardigan    5013
Jeans       5013
Jacket      5013
Dress       5013
Leggings    5013
Blouse      5013
Shorts      5013
Romper      5013
Sweater     5013
Blazer      5013
Top         5013
Name: category, dtype: int64

## ANNOY and Top-K

ANNOY: https://github.com/spotify/annoy

We do not yet know how the embeddings will be passed through the pipeline, so for now I assume they arrive in a DataFrame like the one below. The important point is that each row needs an image embedding (`embd`) and its corresponding `label`.

(check annoy github link above)

We can retrieve similar images either from an embedding vector (`t.get_nns_by_vector`, as used below) or from the id of an item that is already in the index (`t.get_nns_by_item`).

We can also get the distance between the input image and each of its top-K neighbours by passing `include_distances=True` to either call.

In [44]:
# Load the table of images together with their precomputed embeddings.
# NOTE: unpickling can execute arbitrary code -- only load this file from a trusted source.
# The context manager closes the file handle as soon as loading is done.
with open('subdata.pkl', 'rb') as embeddings_file:
    embeddings_df = pickle.load(embeddings_file)

In [45]:
# Preview the embedding table: the category columns plus an `embd` vector per image.
embeddings_df.head()

Unnamed: 0,images,category_label,dataset,category,label,embd
0,img/Sheer_Pleated-Front_Blouse/img_00000001.jpg,3,train,Blouse,2,"[0.8212891, 1.0950764, 0.19870186, 0.03826549,..."
1,img/Sheer_Pleated-Front_Blouse/img_00000002.jpg,3,train,Blouse,2,"[0.22787644, 1.6668769, 0.32685006, 0.0, 0.073..."
2,img/Sheer_Pleated-Front_Blouse/img_00000003.jpg,3,val,Blouse,2,"[0.27398872, 1.0369267, 0.08694938, 0.07015417..."
3,img/Sheer_Pleated-Front_Blouse/img_00000004.jpg,3,train,Blouse,2,"[0.014149437, 0.92608607, 0.2060591, 0.8770616..."
4,img/Sheer_Pleated-Front_Blouse/img_00000005.jpg,3,test,Blouse,2,"[0.8263322, 1.5453025, 0.20867401, 0.16627665,..."


In [47]:
from annoy import AnnoyIndex

In [None]:
# embedding length
# embedding length, taken from the first row by position so it does not depend on the
# DataFrame's index labels (the annoy item ids used below are row positions as well)
f = len(embeddings_df['embd'].iloc[0])
# create annoy index
# metric can be "angular", "euclidean", "manhattan", "hamming", or "dot"
t = AnnoyIndex(f, metric='euclidean')
# tradeoff between accuracy and speed: more trees give more accurate neighbours
# but take longer to build and make the index larger
n_trees = 100

In [None]:
# populate the index
# populate the index; each vector's item id is its row position in embeddings_df,
# which is what the lookup helpers rely on when they pass the returned ids to .iloc
for i, vector in enumerate(embeddings_df['embd']):
    t.add_item(i, vector)
# build a forest of trees
# NOTE(review): annoy's docs say no more items can be added once build() has been called,
# so re-running this cell presumably requires re-creating the index first -- confirm
# assigning to _ only keeps the return value from being echoed as cell output
_  = t.build(n_trees)

In [64]:
def get_top_K(img_embd, image_label, embd_map=embeddings_df, K=5):
    """Return the fraction of the K nearest indexed images that carry ``image_label``.

    The query is run against the module-level annoy index ``t``; the item ids it
    returns are used as row positions into ``embd_map``.

    Parameters
    ----------
    img_embd : sequence of float
        Embedding of the query image.
    image_label
        Label of the query image, compared against the ``"label"`` column.
    embd_map : pandas.DataFrame
        Table whose row positions match the annoy item ids and which has a
        ``"label"`` column.
    K : int, default 5
        Number of neighbours requested from the index.

    Returns
    -------
    float
        Number of returned neighbours whose label equals ``image_label``, divided by K.
    """
    # assumes the query image is not itself in the index; if it were, K+1 neighbours
    # would have to be requested
    neighbour_ids = t.get_nns_by_vector(img_embd, K)
    neighbour_labels = embd_map.iloc[neighbour_ids]["label"].tolist()
    n_matching = neighbour_labels.count(image_label)
    return n_matching / K

In [65]:
def get_K_similar_images(img_embd, image_label, embd_map=embeddings_df, K=5):
    """Return the rows of ``embd_map`` for the K indexed images nearest to ``img_embd``.

    The neighbours are looked up in the module-level annoy index ``t`` and the
    returned item ids are used as row positions into ``embd_map``.

    Parameters
    ----------
    img_embd : sequence of float
        Embedding of the query image.
    image_label
        Not used by this function; it is part of the signature only.
    embd_map : pandas.DataFrame
        Table whose row positions match the annoy item ids.
    K : int, default 5
        Number of neighbours requested from the index.

    Returns
    -------
    pandas.DataFrame
        The selected rows, in the order the index returned them.
    """
    # assumes the query image is not itself in the index; if it were, K+1 neighbours
    # would have to be requested
    return embd_map.iloc[t.get_nns_by_vector(img_embd, K)]