In [49]:
import glob
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import image
from tqdm.auto import tqdm

In [19]:
# Collect the path of every JPEG directly under ./data (the pattern is not recursive).
# glob returns matches in arbitrary, filesystem-dependent order.
files = glob.glob('./data/*.jpg')
# Report how many images were found.
print(len(files))

25000


In [20]:
# Build the file/label table in one step.
# The label is derived from the file name only ("dog.<id>.jpg" vs "cat.<id>.jpg"),
# so a directory component that happens to contain "dog" cannot mark every image
# as a dog. `y` is True for dogs and False for everything else.
df = pd.DataFrame({'files': files})
df['y'] = df['files'].map(os.path.basename).str.startswith('dog')
print(df.head())

                  files      y
0   ./data/dog.8011.jpg   True
1   ./data/cat.5077.jpg  False
2   ./data/dog.7322.jpg   True
3   ./data/cat.2718.jpg  False
4  ./data/cat.10151.jpg  False


In [50]:
# Load VGG16 with its ImageNet weights; include_top=False leaves out the
# fully-connected classifier so the network ends at the last pooling layer.
vgg = VGG16(weights="imagenet", include_top=False)

# Print the layer names (used to pick feature taps), then the full summary.
print([layer.name for layer in vgg.layers])
vgg.summary()

['input_9', 'block1_conv1', 'block1_conv2', 'block1_pool', 'block2_conv1', 'block2_conv2', 'block2_pool', 'block3_conv1', 'block3_conv2', 'block3_conv3', 'block3_pool', 'block4_conv1', 'block4_conv2', 'block4_conv3', 'block4_pool', 'block5_conv1', 'block5_conv2', 'block5_conv3', 'block5_pool']
Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         [(None, None, None, 3)]   0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (

In [22]:
# Names of the VGG16 pooling layers whose activations are exported as features.
OUTPUT_LAYERS = ['block1_pool', 'block2_pool', 'block3_pool', 'block4_pool', 'block5_pool']

# Multi-output model: shares the VGG16 input and returns one tensor per tapped
# layer, in the same order as OUTPUT_LAYERS.
feature_extractor = Model(
    inputs=vgg.input,
    outputs=[vgg.get_layer(layer_name).output for layer_name in OUTPUT_LAYERS],
)

def load_img(path_to_img, size=224):
    """Load an image file, resized to a ``size`` x ``size`` square, as an array.

    Args:
        path_to_img: Path of the image file to read.
        size: Edge length, in pixels, of the square the image is resized to.

    Returns:
        The array produced by ``image.img_to_array`` for the resized image.
    """
    target = (size, size)
    loaded = image.load_img(path_to_img, target_size=target)
    return image.img_to_array(loaded)

def show_img(img):
    """Display an image array with matplotlib.

    Args:
        img: Array-like image; it is cast to ``uint8`` before being drawn.
    """
    # The notebook's import cell never binds `plt`, so calling this on a fresh
    # kernel raised NameError. Importing here keeps the helper self-contained.
    import matplotlib.pyplot as plt

    plt.imshow(img.astype('uint8'))

def process_data(feature_extractor, path, preprocess=True):
    """Extract one globally max-pooled feature vector per tapped layer for an image.

    Args:
        feature_extractor: Keras model returning one activation map per entry of
            ``OUTPUT_LAYERS`` (in that order).
        path: Path of the image file to process.
        preprocess: When True (default), apply the VGG16 ``preprocess_input``
            transform before inference, as the ImageNet weights expect. Pass
            False to feed raw 0-255 RGB pixels (the previous behaviour).

    Returns:
        dict mapping each name in ``OUTPUT_LAYERS`` to a 1-D array holding the
        per-channel maximum of that layer's activation map.
    """
    img = load_img(path)
    batch = img[np.newaxis, ...]  # add the batch axis the model expects
    if preprocess:
        # `img` is a fresh local array, so in-place preprocessing is harmless.
        batch = tf.keras.applications.vgg16.preprocess_input(batch)

    # verbose=0: this is called once per image, so a per-call progress bar
    # would flood the notebook output.
    preds = feature_extractor.predict(batch, verbose=0)
    if not isinstance(preds, (list, tuple)):
        # A single-output model returns a bare array rather than a list.
        preds = [preds]

    # Max over the two spatial axes, then drop the batch axis.
    pooled_outputs = [np.max(p, axis=(1, 2))[0] for p in preds]
    return dict(zip(OUTPUT_LAYERS, pooled_outputs))

In [24]:
# Reload the file/label table from disk; `y` is True for a dog image, False for a cat.
# NOTE(review): no cell shown here writes data.csv. The "Unnamed: 0" column in the
# output suggests it was exported from the earlier frame with its index included;
# confirm how the file is produced.
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,files,y
0,./data/dog.8011.jpg,True
1,./data/cat.5077.jpg,False
2,./data/dog.7322.jpg,True
3,./data/cat.2718.jpg,False
4,./data/cat.10151.jpg,False


In [25]:
# Smoke test: run the extractor on the first image and report the length of
# the pooled vector produced for each tapped layer.
a = process_data(feature_extractor, df['files'].iloc[0])
for k, pooled in a.items():
    print('Layer: {}'.format(k), 'Shape: {}'.format(pooled.shape))

Layer: block1_pool Shape: (64,)
Layer: block2_pool Shape: (128,)
Layer: block3_pool Shape: (256,)
Layer: block4_pool Shape: (512,)
Layer: block5_pool Shape: (512,)


In [26]:
N = 5000   # number of images to run through the extractor
SEED = 42  # fixed so the same images are drawn on every run

# One growing list of pooled feature vectors per tapped layer.
features = {name: [] for name in OUTPUT_LAYERS}

# Draw a reproducible random subset; clamp the size so a table smaller than N
# is processed in full instead of raising.
sample = df.sample(n=min(N, len(df)), random_state=SEED)
files = sample['files']
for path in tqdm(files, total=len(files)):
    output = process_data(feature_extractor, path)
    for key, vector in output.items():
        features[key].append(vector)

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




In [48]:
# Persist the sampled file/label rows and, for each tapped layer, a CSV whose
# rows are in the same order as the rows of sample.csv.
os.makedirs('features', exist_ok=True)  # to_csv does not create missing directories

sample.to_csv('sample.csv', index=False)
for key, vectors in features.items():
    # Use a dedicated name: rebinding `df` here would replace the dataset table
    # with the last layer's features and break any re-run of earlier cells.
    layer_df = pd.DataFrame(vectors)
    output_name = 'features/{}.csv'.format(key)
    layer_df.to_csv(output_name, index=False)
    print('Saved {}'.format(output_name))

Saved features/block1_pool.csv
Saved features/block2_pool.csv
Saved features/block3_pool.csv
Saved features/block4_pool.csv
Saved features/block5_pool.csv
