# Feature Extraction w/ Fine-Tuned VGG16 Model

In [4]:
# Standard data science libraries
import pandas as pd
import numpy as np
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
# Other libraries
import time
import pickle
import re
from os import listdir
from os.path import isfile, join
from scipy import spatial
# keras packages
from keras import models
from keras import layers
from keras import optimizers
from keras.applications.vgg16 import VGG16
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input
from keras.preprocessing.image import ImageDataGenerator, load_img
from keras.datasets import fashion_mnist
from keras.models import Model,load_model

In [5]:
# Load the fine-tuned model from a pickle file.
# NOTE(review): the recorded output of this cell is a FileNotFoundError for
# 'FT_VGG16.pickle'; `model` is assigned again further down with
# load_model('small_last4_updated.h5').
# pickle.load can execute arbitrary code, so only unpickle files you trust.
with open('FT_VGG16.pickle', 'rb') as f:
    model = pickle.load(f)

FileNotFoundError: [Errno 2] No such file or directory: 'FT_VGG16.pickle'

In [7]:
# import h5py
# f = h5py.File('small_last4_updated.h5','r+')
# data_p = f.attrs['training_config']
# data_p = data_p.decode().replace("learning_rate","lr").encode()
# f.attrs['training_config'] = data_p
# f.close()


# model = load_model('small_last4.h5')

In [9]:
# Load the saved Keras model from its HDF5 file; the cells below (summary,
# intermediate-layer extraction) operate on this `model`.
model = load_model('small_last4_updated.h5')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.




In [10]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg16 (Model)                (None, 1, 1, 512)         14714688  
_________________________________________________________________
flatten_1 (Flatten)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              525312    
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 8200      
Total params: 15,248,200
Trainable params: 7,612,936
Non-trainable params: 7,635,264
_________________________________________________________________


In [11]:
# test out function that returns output of an intermediate layer

# Image used for the smoke test below (path is relative to the working directory).
img_path1 = 'omybag__36.png'
# img_path2 = './data_original/totes/top_48.png'
# img_path3 = './data_original/circles/gunas_18.png'
# img_path4 = './data_original/shoulder_bags/av_18.png'

# Build a second model that takes the same input as `model` but whose output is
# the named layer's output, so predict() yields that layer's activations.
layer_name = 'dense_1'
intermediate_layer_model = Model(inputs=model.input,
                                 outputs=model.get_layer(layer_name).output)

def get_img_features(img_path, target_size=(48, 48)):
    '''
    Return the intermediate-layer activations for a single image file.

    Parameters
    ----------
    img_path : str
        Path of the image to load.
    target_size : tuple of int, optional
        (height, width) the image is resized to on load. Defaults to (48, 48),
        the size this notebook has always used; it has to agree with the input
        size `intermediate_layer_model` expects.

    Returns
    -------
    The result of `intermediate_layer_model.predict` on a batch that contains
    only this image.
    '''
    img = image.load_img(img_path, target_size=target_size)
    pixels = image.img_to_array(img)
    # predict() works on batches, so add a leading batch axis of length 1
    batch = np.expand_dims(pixels, axis=0)
    # apply the VGG16 input preprocessing imported from keras.applications.vgg16
    batch = preprocess_input(batch)
    return intermediate_layer_model.predict(batch)

In [12]:
# NOTE(review): only features1 is computed here. Later cells reference
# `features3` and `features4`, whose assignments below are commented out, so
# those cells cannot run from a fresh kernel as written.
features1 = get_img_features(img_path1)
# features2 = get_img_features(img_path2)
# features3 = get_img_features(img_path3)
# features4 = get_img_features(img_path4)

In [13]:
features1.shape

(1, 1024)

#### Test cosine similarity calculation

In [298]:
def get_cosine_sim(feature1, feature2):
    '''
    Return the cosine similarity between two feature vectors.

    Parameters
    ----------
    feature1, feature2 : array-like
        Feature vectors of equal length. Inputs with extra singleton axes,
        such as the (1, N) arrays returned by `get_img_features`, are
        flattened to 1-D first.

    Returns
    -------
    float
        1 minus the cosine distance, so 1.0 means the vectors point the same
        way and 0.0 means they are orthogonal.
    '''
    # The previous body compared the global `features1` with itself, so it
    # ignored both arguments; use the parameters instead.
    vec1 = np.asarray(feature1).ravel()
    vec2 = np.asarray(feature2).ravel()
    # spatial.distance.cosine is a distance; subtract from 1 for similarity
    return 1 - spatial.distance.cosine(vec1, vec2)

In [40]:
# Note that spatial.distance.cosine computes the distance, and not the similarity. 
# So, you must subtract the value from 1 to get the similarity.
cosine_sim = 1 - spatial.distance.cosine(features1, features1)
cosine_sim # same bag

1.0

In [42]:
# NOTE(review): `features3` is only assigned in a commented-out line earlier in
# the notebook, so this cell raises NameError on a fresh kernel.
cosine_sim2 = 1 - spatial.distance.cosine(features1, features3)
cosine_sim2 # tote bag and circle bag

0.36654725670814514

In [43]:
# NOTE(review): `features4` is only assigned in a commented-out line earlier in
# the notebook, so this cell raises NameError on a fresh kernel.
cosine_sim3 = 1 - spatial.distance.cosine(features1, features4)
cosine_sim3

0.3918082118034363

In [44]:
# NOTE(review): same operands as cosine_sim2 (features1, features3) and the
# recorded output is identical; presumably a different pair was intended - confirm.
cosine_sim4 = 1 - spatial.distance.cosine(features1, features3)
cosine_sim4

0.36654725670814514

#### Load the dataframe to store the feature matrices.

Clean up the dataframe as needed (especially the price column).



In [262]:
# Load the pickled dataframe into `df`.
# pickle.load can execute arbitrary code, so only unpickle files you trust.
with open('df_bags.pickle', 'rb') as f:
    df= pickle.load(f)

In [263]:
# Treat missing and 'Sold Out' prices as 0.
# Assign the result back to the column instead of calling inplace=True on the
# `df.price` accessor: that mutates an intermediate Series and is not guaranteed
# to update `df` (it has no effect under pandas copy-on-write).
df['price'] = df['price'].fillna(value=0).replace(to_replace='Sold Out', value=0)

In [264]:
# Make every price a string so the regex-based cleaning can run on it.
df['price'] = df['price'].astype(str)

In [265]:
def _first_price(text):
    '''
    Return the first price-like token in `text` as a float.

    A token is a digit or currency symbol ($ or €) followed by digits/dots,
    e.g. '$21.99$74' -> 21.99. Returns NaN when no token is found or when the
    token is not a valid number (for example a lone '$').
    '''
    # Inside [...] a '|' is a literal pipe, not alternation, so the original
    # class '[0-9|$|€]' also accepted '|' as a first character; it is dropped here.
    tokens = re.findall('[0-9$€][0-9.]*', text)
    if not tokens:
        return np.nan
    try:
        return float(tokens[0].replace('$', '').replace('€', ''))
    except ValueError:
        return np.nan

df['cleaned_price'] = df['price'].apply(_first_price)

In [266]:
df.head()

Unnamed: 0,index,name,brand,price,bag_url,img_filename,label,is_ethical,source,cleaned_price
0,0,Tote,Victoria's Secret,$21.99$74,https://www.thredup.com/product/handbags-victo...,thredup_vic_0.png,Totes,1,thredup,21.99
1,1,Tote,Anne Klein,$17.99$69,https://www.thredup.com/product/handbags-anne-...,thredup_ann_1.png,Totes,1,thredup,17.99
2,2,Tote,Unbranded,$16.99$60,https://www.thredup.com/product/handbags-unbra...,thredup_unb_2.png,Totes,1,thredup,16.99
3,3,Tote,Unbranded,$15.99$60,https://www.thredup.com/product/handbags-unbra...,thredup_unb_3.png,Totes,1,thredup,15.99
4,4,Tote,Victoria's Secret,$25.99$68,https://www.thredup.com/product/handbags-victo...,thredup_vic_4.png,Totes,1,thredup,25.99


In [267]:
df.cleaned_price.head()

0    21.99
1    17.99
2    16.99
3    15.99
4    25.99
Name: cleaned_price, dtype: float64

Fix label column

In [268]:
# Directory holding the image files for each bag label. The last path component
# of each directory is the same string the label functions below use as the
# normalized label (e.g. 'bucket_bags', 'fanny_packs').
backpack_path = './data_original/backpacks'
bucket_path = './data_original/bucket_bags'
circle_path = './data_original/circles'
clutch_path = './data_original/clutches'
fanny_path = './data_original/fanny_packs'
shoulder_path = './data_original/shoulder_bags'
tote_path = './data_original/totes'
weekender_path = './data_original/weekenders'

In [269]:
# Collect the file names found in each label directory.
def _plain_files(directory):
    '''Return the names of the entries in `directory` for which isfile() is true.'''
    return [entry for entry in listdir(directory) if isfile(join(directory, entry))]

backpacks = _plain_files(backpack_path)
buckets = _plain_files(bucket_path)
circles = _plain_files(circle_path)
clutches = _plain_files(clutch_path)
fanny_packs = _plain_files(fanny_path)
shoulder_bags = _plain_files(shoulder_path)
totes = _plain_files(tote_path)
weekenders = _plain_files(weekender_path)

In [270]:
def assign_label(row):
    '''
    Pick the label for a dataframe row from the directory its image file is in.

    The row's 'img_filename' is looked up in the per-label file lists; the first
    list that contains it decides the label. If no list contains it, the row's
    existing 'label' value is returned unchanged.
    '''
    filename = row['img_filename']
    # (file list, label) pairs in the order they are checked; first hit wins
    candidates = (
        (backpacks, "backpacks"),
        (circles, "circles"),
        (clutches, 'clutches'),
        (fanny_packs, 'fanny_packs'),
        (shoulder_bags, 'shoulder_bags'),
        (totes, 'totes'),
        (buckets, 'bucket_bags'),
        (weekenders, 'weekenders'),
    )
    for members, label in candidates:
        if filename in members:
            return label
    return row['label']

In [271]:
# apply function to label column in dataframe
df['label'] = df.apply(assign_label, axis=1)

In [272]:
# df = df.drop('index', axis=1)
df.head()

Unnamed: 0,index,name,brand,price,bag_url,img_filename,label,is_ethical,source,cleaned_price
0,0,Tote,Victoria's Secret,$21.99$74,https://www.thredup.com/product/handbags-victo...,thredup_vic_0.png,totes,1,thredup,21.99
1,1,Tote,Anne Klein,$17.99$69,https://www.thredup.com/product/handbags-anne-...,thredup_ann_1.png,totes,1,thredup,17.99
2,2,Tote,Unbranded,$16.99$60,https://www.thredup.com/product/handbags-unbra...,thredup_unb_2.png,totes,1,thredup,16.99
3,3,Tote,Unbranded,$15.99$60,https://www.thredup.com/product/handbags-unbra...,thredup_unb_3.png,totes,1,thredup,15.99
4,4,Tote,Victoria's Secret,$25.99$68,https://www.thredup.com/product/handbags-victo...,thredup_vic_4.png,totes,1,thredup,25.99


In [273]:
df['label'].value_counts()

totes            1553
backpacks        1520
shoulder_bags    1453
clutches          491
Clutches          415
fanny_packs       385
bucket_bags       277
weekenders        277
Totes             172
circles           138
Shoulder Bags      68
Bucket Bags         4
Weekenders          2
Name: label, dtype: int64

In [274]:
def clean_label_name(row):
    '''
    Map the capitalized label spellings onto their snake_case equivalents.

    Returns the normalized label for the row's 'label' value; any label that is
    not one of the known capitalized spellings is returned as-is.
    '''
    renames = {
        'Clutches': 'clutches',
        'Totes': 'totes',
        'Shoulder Bags': 'shoulder_bags',
        'Bucket Bags': 'bucket_bags',
        'Weekenders': 'weekenders',
    }
    current = row['label']
    return renames.get(current, current)

In [275]:
df['label'] = df.apply(clean_label_name, axis=1)

In [276]:
df.head()

Unnamed: 0,index,name,brand,price,bag_url,img_filename,label,is_ethical,source,cleaned_price
0,0,Tote,Victoria's Secret,$21.99$74,https://www.thredup.com/product/handbags-victo...,thredup_vic_0.png,totes,1,thredup,21.99
1,1,Tote,Anne Klein,$17.99$69,https://www.thredup.com/product/handbags-anne-...,thredup_ann_1.png,totes,1,thredup,17.99
2,2,Tote,Unbranded,$16.99$60,https://www.thredup.com/product/handbags-unbra...,thredup_unb_2.png,totes,1,thredup,16.99
3,3,Tote,Unbranded,$15.99$60,https://www.thredup.com/product/handbags-unbra...,thredup_unb_3.png,totes,1,thredup,15.99
4,4,Tote,Victoria's Secret,$25.99$68,https://www.thredup.com/product/handbags-victo...,thredup_vic_4.png,totes,1,thredup,25.99


In [277]:
df['label'].value_counts()

totes            1725
shoulder_bags    1521
backpacks        1520
clutches          906
fanny_packs       385
bucket_bags       281
weekenders        279
circles           138
Name: label, dtype: int64

#### Apply the feature extraction function to the dataframe

In [278]:
def apply_feature_function(row):
    '''
    Compute the feature matrix for one dataframe row.

    The image path is built as './data_original/<label>/<img_filename>' from the
    row's 'label' and 'img_filename' values and passed to `get_img_features`.

    Returns the features, or the string 'file not found' when the path cannot
    be built or feature extraction fails (best effort; downstream cells filter
    on this sentinel).
    '''
    try:
        img_path = './data_original/'+ row['label']+'/'+ row['img_filename']
        return get_img_features(img_path)
    except Exception:
        # `except Exception` rather than a bare `except:` so KeyboardInterrupt
        # and SystemExit still propagate and a long run can be interrupted.
        return 'file not found'

In [279]:
df['feature_matrix'] = df.apply(apply_feature_function, axis=1)

  ' expressed in bytes should be converted ' +
  ' expressed in bytes should be converted ' +
  ' expressed in bytes should be converted ' +
  ' expressed in bytes should be converted ' +
  ' expressed in bytes should be converted ' +
  ' expressed in bytes should be converted ' +


In [280]:
df.head()

Unnamed: 0,index,name,brand,price,bag_url,img_filename,label,is_ethical,source,cleaned_price,feature_matrix
0,0,Tote,Victoria's Secret,$21.99$74,https://www.thredup.com/product/handbags-victo...,thredup_vic_0.png,totes,1,thredup,21.99,"[[0.0, 34.082092, 120.07053, 34.426167, 60.972..."
1,1,Tote,Anne Klein,$17.99$69,https://www.thredup.com/product/handbags-anne-...,thredup_ann_1.png,totes,1,thredup,17.99,"[[0.0, 59.236874, 93.56827, 3.9990442, 0.0, 25..."
2,2,Tote,Unbranded,$16.99$60,https://www.thredup.com/product/handbags-unbra...,thredup_unb_2.png,totes,1,thredup,16.99,"[[0.0, 62.95456, 4.9559455, 45.43826, 0.0, 143..."
3,3,Tote,Unbranded,$15.99$60,https://www.thredup.com/product/handbags-unbra...,thredup_unb_3.png,totes,1,thredup,15.99,"[[0.0, 32.848614, 31.133387, 28.699326, 4.2855..."
4,4,Tote,Victoria's Secret,$25.99$68,https://www.thredup.com/product/handbags-victo...,thredup_vic_4.png,totes,1,thredup,25.99,"[[0.0, 55.67337, 91.1055, 17.396587, 8.476363,..."


In [281]:
# see how many image files are missing feature matrix
# Check each cell individually: the column mixes ndarrays with the sentinel
# string, and `column == 'file not found'` depends on an array-vs-string
# comparison collapsing to a single boolean, which NumPy no longer does.
int(df['feature_matrix'].apply(lambda m: isinstance(m, str) and m == 'file not found').sum())

661

In [291]:
# Keep only the rows whose feature extraction succeeded.
# Check each cell individually: the column mixes ndarrays with the sentinel
# string, and `column != 'file not found'` depends on an array-vs-string
# comparison collapsing to a single boolean, which NumPy no longer does.
missing_mask = df['feature_matrix'].apply(
    lambda m: isinstance(m, str) and m == 'file not found'
).astype(bool)
df_dropped_missing = df[~missing_mask]

In [292]:
df_dropped_missing.shape

(6094, 11)

In [297]:
# save final dataframe with feature matrix
with open('df_w_features.pickle', 'wb') as f:
    pickle.dump(df_dropped_missing, f)