In [2]:
import pandas as pd
import numpy as np

In [2]:
def _read_lines(path):
    """Return the lines of the text file at `path`, without line terminators."""
    with open(path, 'r') as handle:
        return handle.read().splitlines()


# Image file names (one per line) for each predefined split.
train_imgs = _read_lines('../Flickr8k_text/Flickr_8k.trainImages.txt')
dev_imgs = _read_lines('../Flickr8k_text/Flickr_8k.devImages.txt')
test_imgs = _read_lines('../Flickr8k_text/Flickr_8k.testImages.txt')

# Raw caption records; each line is later split on a tab into
# an image reference and its caption text.
captions = _read_lines('../Flickr8k_text/Flickr8k.token.txt')

In [3]:
# Fold the dev split into the training split (builds a new list).
# NOTE: re-running this cell without re-running the file-loading cell
# appends dev_imgs a second time.
train_imgs = [*train_imgs, *dev_imgs]

In [4]:
from collections import defaultdict

# Maps image file name -> list of caption strings for that image.
caption_map = defaultdict(list)

for record in captions:
    # A record is "<image reference>\t<caption text>".
    fields = record.split('\t')
    if len(fields) < 2:
        # Blank or tab-less line: there is no caption to record.
        continue
    # The image reference ends in "#<caption index>"; cut at the last '#'
    # instead of assuming the suffix is exactly two characters long.
    img_name = fields[0].rsplit('#', 1)[0]
    img_caption = fields[1].strip()
    caption_map[img_name].append(img_caption)

In [5]:
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input as preprocess_vgg16_input

  return f(*args, **kwds)
Using TensorFlow backend.


In [6]:
def process_image2arr(path, img_dims=(224, 224)):
    """Load an image file and turn it into a one-image batch for VGG16.

    Args:
        path: Location of the image file to load.
        img_dims: Target size handed to the Keras image loader.

    Returns:
        The image as an array with a leading batch axis of length 1,
        after Keras' VGG16 `preprocess_input` has been applied.
    """
    pixels = image.img_to_array(image.load_img(path, target_size=img_dims))
    # Prepend the batch axis the model expects.
    batch = pixels[np.newaxis, ...]
    return preprocess_vgg16_input(batch)

In [7]:
from keras.applications import vgg16
from keras.models import Model


# Full ImageNet-trained VGG16, classifier head included.
base_vgg = vgg16.VGG16(include_top=True, weights='imagenet',
                       input_shape=(224, 224, 3))

# Use the last hidden dense layer ('fc2') as the image embedding.
# It is selected by name rather than via `layers.pop()`: on Keras versions
# where `Model.layers` returns a copy, popping does not remove the layer and
# `layers[-1]` would still be the final classification layer.
vgg_model = Model(base_vgg.input, base_vgg.get_layer('fc2').output)

# The network is used purely as a fixed feature extractor.
vgg_model.trainable = False

In [8]:
# Print the layer-by-layer summary (type, output shape, parameter count)
# of the truncated VGG16 model built above.
vgg_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [13]:
def extract_tl_features_vgg(model, image_file_name, image_dir='../Flickr8k_imgs/'):
    """Run one image through `model` and return its features as a flat vector.

    Args:
        model: Object exposing `predict(batch)`; here the truncated VGG16.
        image_file_name: File name of the image inside `image_dir`.
        image_dir: Directory holding the images. A trailing separator is
            optional because the path is built with `os.path.join`.

    Returns:
        1-D array holding the model output for the single input image.
    """
    import os  # local import keeps the block self-contained

    pr_img = process_image2arr(os.path.join(image_dir, image_file_name))
    tl_features = model.predict(pr_img)
    # The batch holds exactly one image, so flattening everything drops the
    # batch axis and also copes with model outputs that are not 2-D.
    return np.reshape(tl_features, -1)

In [14]:
# Extracted feature vector for every image, keyed by image file name.
img_tl_featureset = {}

# Parallel lists: one (image name, caption) pair per caption, per split.
train_img_names, train_img_captions = [], []
test_img_names, test_img_captions = [], []

In [15]:
def _featurize_and_pair(img_names):
    """Extract features for each image and pair the image with its captions.

    Stores one feature vector per image in `img_tl_featureset` and returns
    two parallel lists: image names and captions, one entry per caption.
    """
    names, texts = [], []
    for img in img_names:
        img_tl_featureset[img] = extract_tl_features_vgg(model=vgg_model, image_file_name=img)
        # `.get` avoids inserting empty entries into the defaultdict for
        # images that have no captions.
        for caption in caption_map.get(img, ()):
            names.append(img)
            texts.append(caption)
    return names, texts


# Rebind fresh lists rather than appending to the ones created in an earlier
# cell, so re-running this cell does not duplicate rows.
train_img_names, train_img_captions = _featurize_and_pair(train_imgs)
test_img_names, test_img_captions = _featurize_and_pair(test_imgs)

train_dataset = pd.DataFrame({'image': train_img_names, 'caption': train_img_captions})
test_dataset = pd.DataFrame({'image': test_img_names, 'caption': test_img_captions})
print('Train Dataset Size:', len(train_dataset), '\tTest Dataset Size:', len(test_dataset))

Train Dataset Size: 35000 	Test Dataset Size: 5000


In [17]:
# Preview the first 10 image/caption rows of the training frame.
train_dataset.head(10)

Unnamed: 0,caption,image
0,A black dog is running after a white dog in th...,2513260012_03d33305cf.jpg
1,Black dog chasing brown dog through snow,2513260012_03d33305cf.jpg
2,Two dogs chase each other across the snowy gro...,2513260012_03d33305cf.jpg
3,Two dogs play together in the snow .,2513260012_03d33305cf.jpg
4,Two dogs running through a low lying body of w...,2513260012_03d33305cf.jpg
5,A little baby plays croquet .,2903617548_d3e38d7f88.jpg
6,A little girl plays croquet next to a truck .,2903617548_d3e38d7f88.jpg
7,The child is playing croquette by the truck .,2903617548_d3e38d7f88.jpg
8,The kid is in front of a car with a put and a ...,2903617548_d3e38d7f88.jpg
9,The little boy is playing with a croquet hamme...,2903617548_d3e38d7f88.jpg


In [18]:
# Fix the column order to (image, caption) before exporting.
column_order = ['image', 'caption']
train_dataset = train_dataset.loc[:, column_order]
test_dataset = test_dataset.loc[:, column_order]

# Write both splits as tab-separated files without the index column.
for frame, out_path in ((train_dataset, 'image_train_dataset.tsv'),
                        (test_dataset, 'image_test_dataset.tsv')):
    frame.to_csv(out_path, sep='\t', index=False)

In [19]:
# `sklearn.externals.joblib` was deprecated and later removed from
# scikit-learn; import the standalone package and fall back to the bundled
# copy only on old environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the image-name -> feature-vector mapping for later reuse.
joblib.dump(img_tl_featureset, 'transfer_learn_img_features.pkl')

['transfer_learn_img_features.pkl']

In [10]:
# Sanity check: feature-vector shape for the first five images.
[(img_name, feats.shape) for img_name, feats in list(img_tl_featureset.items())[:5]]

[('3079787482_0757e9d167.jpg', (4096,)),
 ('3284955091_59317073f0.jpg', (4096,)),
 ('1795151944_d69b82f942.jpg', (4096,)),
 ('3532192208_64b069d05d.jpg', (4096,)),
 ('454709143_9c513f095c.jpg', (4096,))]

In [11]:
# Sanity check: feature values (rounded to 3 decimals) for the first five images.
[(img_name, np.round(feats, 3)) for img_name, feats in list(img_tl_featureset.items())[:5]]

[('3079787482_0757e9d167.jpg',
  array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)),
 ('3284955091_59317073f0.jpg',
  array([0.615, 0.   , 0.653, ..., 0.   , 1.559, 2.614], dtype=float32)),
 ('1795151944_d69b82f942.jpg',
  array([0.   , 0.   , 0.   , ..., 0.   , 0.   , 0.538], dtype=float32)),
 ('3532192208_64b069d05d.jpg',
  array([0.   , 0.   , 0.   , ..., 0.   , 0.   , 2.293], dtype=float32)),
 ('454709143_9c513f095c.jpg',
  array([0.   , 0.   , 0.131, ..., 0.833, 4.263, 0.   ], dtype=float32))]

In [3]:
# Reload the exported training pairs and count the rows.
train_df = pd.read_csv('image_train_dataset.tsv', sep='\t')
total_samples = len(train_df)
total_samples

35000