In [1]:
import pandas as pd
import numpy as np
import pickle
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt

from src.fashion_tools import DeepFashion_Attributes

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
# Load the pickled image-by-attribute matrix together with its row and column labels.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('models/img_attr_sparse.pkl', 'rb') as S:
    # The stored object is converted to a dense array straight away.
    img_attr_values = pickle.load(S).toarray()

with open('models/img_attr_rows.pkl', 'rb') as S:
    img_attr_rows = pickle.load(S)

with open('models/img_attr_columns.pkl', 'rb') as S:
    img_attr_columns = pickle.load(S)

# Rows are labelled by img_attr_rows, columns by img_attr_columns.
img_attr_df = pd.DataFrame(
    data=img_attr_values,
    index=img_attr_rows,
    columns=img_attr_columns,
)

img_attr_df.head(5)

Unnamed: 0,a-line,abstract,abstract chevron,abstract chevron print,abstract diamond,abstract floral,abstract floral print,abstract geo,abstract geo print,abstract paisley,...,zeppelin,zig,zigzag,zip,zip-front,zip-pocket,zip-up,zipped,zipper,zippered
img/Sheer_Pleated-Front_Blouse/img_00000001.jpg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
img/Sheer_Pleated-Front_Blouse/img_00000002.jpg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
img/Sheer_Pleated-Front_Blouse/img_00000003.jpg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
img/Sheer_Pleated-Front_Blouse/img_00000004.jpg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
img/Sheer_Pleated-Front_Blouse/img_00000005.jpg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Build the attribute table for the "Top" selection from the full image/attribute frame.
# NOTE(review): DeepFashion_Attributes is a project helper (src.fashion_tools); presumably it
# restricts img_attr_df to the requested category -- confirm against its implementation.
att_vector = DeepFashion_Attributes("Top", img_attr_df)

# Now that we have our attribute vector for tops, what should we do?

For each top match, how many of the positive attributes match those of the random selection?

What are the chances that this type of matching would occur randomly?

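The expected dot product of two randomly (independently) chosen attribute vectors is the dot product of the overall attribute-frequency vector with itself, $f \cdot f$, where $f$ holds the relative frequency of each attribute across the rows of `att_vector`.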

The distribution of dot products will tell us where the matching lies in relation to the random case. So we need to compute, for each image in "Top", the dot product with its best non-self match. To do this, we should parallelize the computation and send it to a cluster.

In [19]:
# Feature matrices available on disk. Select one by name instead of commenting
# lines in and out, so it is always explicit which matrix the later cells use.
FEATURE_FILES = {
    'all': 'features/current_feature_vector.npy',
    'hsv': 'features/current_feature_vector_hsv.npy',
    'hog': 'features/current_feature_vector_hog.npy',
    'ae': 'features/current_feature_vector_ae.npy',
}
FEATURE_SET = 'all'  # change this and re-run the cells below to switch feature sets

fv = np.load(FEATURE_FILES[FEATURE_SET])
fv.shape

(10078, 786)

In [42]:
from src.fashion_tools import fashion_similarity

# Plain Python list of the row labels of att_vector (a list is needed for .index() lookups).
deepKeys = list(att_vector.index)

def compute_similarity_index(choice, fv, deepKeys):
    """Return the five keys whose similarity score to ``choice`` is smallest.

    The scores come from ``fashion_similarity``. The entry belonging to
    ``choice`` itself is overwritten with the maximum score so that the item
    is pushed to the end of the ascending ranking and is not returned as its
    own match.

    Args:
        choice: key of the query item; must be present in ``deepKeys``.
        fv: feature matrix passed through to ``fashion_similarity``.
        deepKeys: list of item keys.

    Returns:
        list: up to five keys from ``deepKeys``, lowest score first.
    """
    scores = fashion_similarity(choice, fv, deepKeys)
    worst_score = np.max(scores)
    scores[deepKeys.index(choice)] = worst_score
    ranking = np.argsort(np.array(scores))
    nearest_keys = []
    for position in ranking[:5]:
        nearest_keys.append(deepKeys[position])
    return nearest_keys

def fashion_dot_best_match(choice, fv, deepKeys, att_vector):
    """Dot the attribute row of ``choice`` with the rows of its closest matches.

    Args:
        choice: key of the query item.
        fv: feature matrix used to rank the matches.
        deepKeys: list of item keys.
        att_vector: attribute table indexed by key.

    Returns:
        numpy.ndarray: two float32 values -- the dot product with the first
        (best) match, and the sum of the dot products over all returned matches.
    """
    query_attributes = att_vector.loc[choice]
    neighbours = compute_similarity_index(choice, fv, deepKeys)
    # Five slots, one per match; slots without a match stay at zero.
    dots = np.zeros(shape=5, dtype='float32')
    for slot, neighbour_key in enumerate(neighbours):
        dots[slot] = query_attributes.dot(att_vector.loc[neighbour_key])
    best_dot = dots[0]
    total_dot = np.sum(dots)
    return np.array([best_dot, total_dot])

def process_dots_chunk(chunk, fv, deepKeys, att_vector):
    """Apply ``fashion_dot_best_match`` to every key in ``chunk`` and stack the results."""
    rows = []
    for key in chunk:
        rows.append(fashion_dot_best_match(key, fv, deepKeys, att_vector))
    return np.array(rows)

def process_overlap_chunk(chunk, fv, deepKeys, att_vector):
    """Apply ``fashion_top5_overlap`` to every key in ``chunk`` and return the counts as an array."""
    counts = []
    for key in chunk:
        counts.append(fashion_top5_overlap(key, fv, deepKeys, att_vector))
    return np.array(counts)

def fashion_top5_overlap(choice, fv, deepKeys, att_vector):
    """Count the keys shared by the feature-based and attribute-based match lists.

    Args:
        choice: key of the query item; must be present in ``deepKeys``.
        fv: feature matrix used for the feature-based ranking.
        deepKeys: list of item keys.
        att_vector: attribute table; its ``.values`` array takes the place of
            ``fv`` for the attribute-based ranking.

    Returns:
        int: number of distinct keys that appear in both match lists.
    """
    fvMatch = compute_similarity_index(choice, fv, deepKeys)
    attMatch = compute_similarity_index(choice, att_vector.values, deepKeys)
    # Intersect directly instead of computing ``10 - len(union)``: the latter is
    # only correct when both lists hold exactly five distinct keys.
    return len(set(fvMatch) & set(attMatch))
    
# Ad-hoc smoke tests, currently disabled:
#fashion_dot_best_match(deepKeys[55], fv, deepKeys, att_vector)
#process_dots_chunk([deepKeys[K] for K in range(100)], fv, deepKeys, att_vector)

# Overlap measure evaluated for the first key only.
overlap = fashion_top5_overlap(deepKeys[0], fv, deepKeys, att_vector)

In [41]:
# Display the overlap count stored in `overlap`.
overlap

0

In [37]:
# `a` and `b` were not assigned in any visible cell, so this check raised NameError
# on a fresh kernel.
# NOTE(review): presumably they were the two match lists compared inside
# fashion_top5_overlap (feature-based and attribute-based) -- confirm.
a = compute_similarity_index(deepKeys[0], fv, deepKeys)
b = compute_similarity_index(deepKeys[0], att_vector.values, deepKeys)
# Size of the union of both lists.
len(set(a+b))

10

In [39]:
# Number of entries in match list `b`; relies on `b` already existing in the kernel namespace.
len(b)

5

In [43]:
from joblib import Parallel, delayed
# Split the keys into consecutive chunks of at most `chunksize` items, one chunk per
# parallel task. Slicing works for any number of keys (the previous hard-coded
# "7 full chunks + remainder" raised IndexError for fewer than 7 * chunksize keys)
# and never yields an empty trailing chunk, which would produce a result array
# that cannot be concatenated with the two-column results of the other chunks.
chunksize = 1260
dataChunks = [
    deepKeys[start:start + chunksize]
    for start in range(0, len(deepKeys), chunksize)
]

In [44]:
# Confirm the split is complete: the chunk sizes must add up to the number of keys.
# An assertion makes a bad split fail loudly instead of relying on comparing two
# displayed numbers by eye.
items_per_chunk = [len(K) for K in dataChunks]
assert sum(items_per_chunk) == len(deepKeys), "chunk sizes do not add up to len(deepKeys)"
np.sum(np.array(items_per_chunk))

10078

In [110]:
len(deepKeys) #matches the sum of our chunks, a good sign

10078

In [145]:
# Reset both result holders before the run (presumably so that a failed run
# does not leave results from an earlier execution behind).
dot_scores_list = 0
dot_scores_v = 0

# One task per chunk of keys; each task returns a two-column array
# (dot product with the best match, summed dot product over the returned matches).
with Parallel(n_jobs=-1, verbose=45, backend="multiprocessing") as parallel:
    dot_scores_list = parallel(
        delayed(process_dots_chunk)(key_chunk, fv, deepKeys, att_vector)
        for key_chunk in dataChunks
    )

# Stack the per-chunk arrays back into one array, in chunk order.
dot_scores_v = np.concatenate(dot_scores_list, axis=0)
print(dot_scores_v.shape)

[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 28.4min
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed: 28.6min remaining: 85.8min
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed: 28.7min remaining: 47.8min
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed: 28.7min remaining: 28.7min
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed: 28.7min remaining: 17.2min
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed: 28.8min remaining:  9.6min
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed: 28.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed: 28.9min finished


(10078, 2)


In [128]:
# Check which feature matrix is currently bound to `fv` (the column count differs between feature files).
fv.shape

(10078, 256)

In [146]:
# Persist the dot-product results.
# NOTE(review): the output name is hard-coded to the HOG-only variant; confirm that `fv`
# was loaded from the matching feature file before running this cell.
np.save("dot_products_DeepFashion_HOG_Only.npy", dot_scores_v)

In [149]:
# Reload the saved dot-product results, one array per feature set.
Ab = np.load("dot_products_DeepFashion_allFeatures.npy")  # all features
Bb = np.load("dot_products_DeepFashion_HOG_Only.npy")  # HOG only
Cc = np.load("dot_products_DeepFashion_AE_Only.npy")  # AE only (presumably autoencoder features -- confirm)
Dd = np.load("dot_products_DeepFashion_hsvOnly.npy")  # HSV only

In [135]:
# Element-wise equality between the all-features and HOG-only results.
Ab == Bb

array([[ True, False],
       [ True, False],
       [ True, False],
       ...,
       [False, False],
       [ True,  True],
       [False, False]])

In [138]:
# Element-wise equality between the all-features and HSV-only results.
Ab == Dd

array([[ True, False],
       [ True, False],
       [ True, False],
       ...,
       [False, False],
       [ True,  True],
       [False, False]])

In [150]:
# Per column, count the rows on which the HSV-only and HOG-only results agree.
np.sum(Dd == Bb, axis =0)

array([6553, 3565])

In [143]:
# Per column, count the rows on which the HOG-only and AE-only results agree.
np.sum(Bb == Cc, axis =0)

array([7026, 3958])

In [147]:
# Per column, count the rows on which the results currently in memory agree with the HSV-only file.
np.sum( dot_scores_v == Dd, axis=0)

array([6553, 3565])

In [148]:
# Per column, count the rows on which the HSV-only and AE-only results agree.
np.sum(Dd == Cc, axis=0)

array([7026, 3958])

In [46]:
# Overlap between the top-5 lists obtained from the full feature matrix and
# from the attribute vectors alone.

# Reset both result holders before the run.
overlap_scores_list = 0
overlap_scores_v = 0

# One task per chunk of keys; each task returns the overlap counts for its chunk.
with Parallel(n_jobs=-1, verbose=45, backend="multiprocessing") as parallel:
    overlap_scores_list = parallel(
        delayed(process_overlap_chunk)(key_chunk, fv, deepKeys, att_vector)
        for key_chunk in dataChunks
    )

# Join the per-chunk counts into one array, in chunk order.
overlap_scores_v = np.concatenate(overlap_scores_list, axis=0)

KeyboardInterrupt: 

In [None]:
# Report the result shape, then persist the overlap counts.
# NOTE(review): overlap_scores_v is still the placeholder 0 if the parallel cell above
# was interrupted, in which case `.shape` fails -- re-run that cell to completion first.
print(overlap_scores_v.shape)
np.save("top5_overlap.npy", overlap_scores_v)

# Plots

Now that our dot products have been computed, we can start to make plots.

In [9]:
import seaborn as sns

In [5]:
# Per-attribute column sum divided by the number of rows in att_vector.
relative_frequency = att_vector.sum(axis=0) / len(att_vector)

In [14]:
# Load the feature matrix (same file list as the earlier load cell); the commented lines
# are the alternative per-feature-set files.
fv = np.load('features/current_feature_vector.npy')
#fv = np.load('features/current_feature_vector_hsv.npy')
#fv = np.load('features/current_feature_vector_hog.npy')
#fv = np.load('features/current_feature_vector_ae.npy')
fv.shape

(10078, 786)

In [2]:
# Load the stored dot-product results for each feature set from the models directory.
dvAll = np.load("models/DeepFashionGeneric_dot_products/dot_products_DeepFashion_allFeatures.npy")  # all features
dvHog = np.load("models/DeepFashionGeneric_dot_products/dot_products_DeepFashion_HOG_Only.npy")  # HOG only
dvAE = np.load("models/DeepFashionGeneric_dot_products/dot_products_DeepFashion_AE_Only.npy")  # AE only
dvHSV = np.load("models/DeepFashionGeneric_dot_products/dot_products_DeepFashion_hsvOnly.npy")  # HSV only

In [7]:
# Dot product of the relative-frequency vector with itself (the random-pair baseline).
rfDot = relative_frequency @ relative_frequency
print(rfDot)

0.1652755067331335


In [8]:
# Column means of the all-features results.
np.mean(dvAll, axis=0)

array([0.6312761, 2.1312761], dtype=float32)

In [10]:
# Column means of the HOG-only results.
np.mean(dvHog, axis=0)

array([0.589105 , 2.0094264], dtype=float32)

In [11]:
# Column means of the AE-only results.
np.mean(dvAE, axis=0)

array([0.42776343, 1.4086128 ], dtype=float32)

In [12]:
# Column means of the HSV-only results.
np.mean(dvHSV, axis=0)

array([0.44185355, 1.522822  ], dtype=float32)

In [18]:
# Dot product of every attribute row with itself, i.e. the score an item would
# get if it were matched with an identical attribute vector.
dotsum = (att_vector * att_vector).sum(axis=1)
# Average of those self dot products over all rows.
perfect = dotsum.mean()
print(perfect)

3.549513792419131


# top 5 match 

How many of the top five are the same using just attribute vectors?