## Importing libraries, initialising global variables

In [1]:
import imageio
from statistics import median
from random import randint
from glob import glob
import pandas as pd
import numpy as np
from keras.layers.core import Flatten, Dropout
from keras.layers import Input, Dense, Lambda, Layer
from keras import backend as K
from keras import applications
from keras.models import Sequential, Model
from keras.optimizers import RMSprop, Adam
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.applications.resnet import preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.resnet import ResNet152

# Path to folder containing images
DATASET_PATH = './Images'

num_samples = 12000

## Generator function

In [2]:

# data generator for neural network
# forms correct and incorrect pairings of images with text descriptions and labels them as correct (1) or incorrect (0)

def generator(batch_size, df):
    
    batch_img = np.zeros((batch_size, 224, 224, 3))
    batch_txt = np.zeros((batch_size, 512))
    batch_labels = np.zeros((batch_size,1))
    
    video_ids = df['image']
    video_txt = df['txt_enc']
    
    length = len(df) -1
    
    while True:
        for i in range(batch_size//2):
            
            i = i*2
            
            #correct
            sample = randint(0,length)
            file = video_ids.iloc[sample]
            
            correct_txt = video_txt.iloc[sample]
            
            im = load_img(file, target_size=(224, 224))
            im = img_to_array(im)
            im = np.expand_dims(im, axis=0)
            im = preprocess_input(im)
            
            batch_img[i-2] = im
            batch_txt[i-2] = correct_txt
            batch_labels[i-2] = 1
                       
            #incorrect 
            file = video_ids.iloc[randint(0,length)]
                       
            im = load_img(file, target_size=(224, 224))
            im = img_to_array(im)
            im = np.expand_dims(im, axis=0)
            im = preprocess_input(im)

            batch_img[i-1] = im
            batch_txt[i-1] = correct_txt
            batch_labels[i-1] = 0
                        
        yield [batch_txt, batch_img], batch_labels

## Utils

In [3]:
def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))

def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)

def contrastive_loss(y_true, y_pred):
    margin = 1
    return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))

def create_img_encoder(input_dim, resnet):
    x = Sequential()
    x.add(resnet)
    x.add(Dense(500, activation="relu"))
    x.add(Dropout(0.5))
    x.add(Dense(512, activation="relu"))
    return x

def create_txt_encoder(input_dim):
    x = Sequential()
    x.add(Dense(500, input_shape = (512,), activation="relu"))
    x.add(Dropout(0.5))
    x.add(Dense(512, activation="relu"))
    return x

def compute_accuracy(predictions, labels):
    return labels[predictions.ravel() < 0.5].mean()

## Initialise ResNet152

In [4]:
resnet = ResNet152(include_top=True, weights='imagenet')

for layer in resnet.layers:
    layer.trainable = False

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet152_weights_tf_dim_ordering_tf_kernels.h5


## Creating model and loading data

In [5]:
input_txt = Input(shape=(512,))
input_img = Input(shape=(224, 224, 3))

txt_enc = create_txt_encoder(input_txt)
img_enc = create_img_encoder(input_img, resnet)

encoded_txt = txt_enc(input_txt)
encoded_img = img_enc(input_img)

distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([encoded_txt, encoded_img])

model = Model([input_txt, input_img], distance)

adam = Adam(lr=0.00001)
model.compile(loss=contrastive_loss, optimizer=adam)

model.summary()




Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
sequential (Sequential)         (None, 512)          513012      input_2[0][0]                    
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 512)          61176956    input_3[0][0]                    
______________________________________________________________________________________________

In [None]:
# The CSV generated by the word2vec(gensim) model
data = pd.read_csv('./word2vec_gensim.csv', header=None)
data = list(np.array(data))

img_paths = [DATASET_PATH + str(i) + '.jpg' for i in range(12305)]

dataset = pd.DataFrame()
dataset['image'] = pd.Series(img_paths)
dataset['txt_enc'] = pd.Series(data)

df_test = dataset[num_samples:]
dataset = dataset[:num_samples]

df_train = dataset[:int(num_samples*0.8)]
df_val = dataset[int(num_samples*0.8):]


## Training

In [None]:
model.fit_generator(generator(30, df_train), steps_per_epoch= int(int(num_samples*0.8)/30), validation_data= generator(30, df_val), validation_steps=int(int(num_samples*0.2)/30), epochs=200, verbose=1)
model.save_weights('./weights.h5')

## Load saved weights

In [None]:
# Load from where you stored the weights
model.load_weights('./weights.h5')

## Decide size of test set

In [None]:
subset_size = 300
subset = df_test.iloc[:subset_size]

## Metrics

In [None]:
# metrics - img -> text

mr = []
top_1_count = 0
top_5_count = 0
top_10_count = 0

for i in range(subset_size):
    file = subset['image'].iloc[i]
    im = load_img(file, target_size=(224, 224))
    im = img_to_array(im)
    im = np.expand_dims(im, axis=0)
    im = preprocess_input(im)
    
    image_array = np.zeros((subset_size, 224, 224, 3))
    for k in range(subset_size):
        image_array[k] = im
        
    txt_array = np.zeros((subset_size, 512))
    for j in range(subset_size):
        txt = subset['txt_enc'].iloc[j]
        txt_array[j] = txt
    
    predictions = [pred[0] for pred in model.predict([txt_array, image_array])]
    pred_i = predictions[i]
    predictions.sort()
    rank = predictions.index(pred_i)
    if rank < 10:
        top_10_count += 1
    if rank < 5:
        top_5_count += 1
    if rank < 1:
        top_1_count += 1
    mr.append(rank+1)     

print('Median Rank(img->txt):', median(mr)*100/subset_size, '%')
print('R@1(img->txt):', top_1_count*100/subset_size, '%')
print('R@5(img->txt):', top_5_count*100/subset_size, '%')
print('R@10(img->txt):', top_10_count*100/subset_size, '%')

In [None]:
# metrics - txt -> img

mr = []
top_1_count = 0
top_5_count = 0
top_10_count = 0

for i in range(subset_size):
    txt = subset['txt_enc'].iloc[i] 
    txt_array = np.zeros((subset_size, 512))
    for k in range(subset_size):
        txt_array[k] = txt
        
        
    image_array = np.zeros((subset_size, 224, 224, 3))
    for j in range(subset_size):
        file = subset['image'].iloc[j]
        im = load_img(file, target_size=(224, 224))
        im = img_to_array(im)
        im = np.expand_dims(im, axis=0)
        im = preprocess_input(im)
        image_array[k] = im
    
    predictions = [pred[0] for pred in model.predict([txt_array, image_array])]
    pred_i = predictions[i]
    predictions.sort()
    rank = predictions.index(pred_i)
    if rank < 10:
        top_10_count += 1
    if rank < 5:
        top_5_count += 1
    if rank < 1:
        top_1_count += 1
    mr.append(rank+1)     

print('Median Rank(txt->img):', median(mr)*100/subset_size, '%')
print('R@1(txt->img):', top_1_count*100/subset_size, '%')
print('R@5(txt->img):', top_5_count*100/subset_size, '%')
print('R@10(txt->img):', top_10_count*100/subset_size, '%')

Median Rank(txt->img): 0.6666666666666666 %
R@1(txt->img): 33.666666666666664 %
R@5(txt->img): 95.33333333333333 %
R@10(txt->img): 95.33333333333333 %


## Download Weights

In [None]:
# download weights

from IPython.display import FileLink

FileLink(r'./weights.h5')

## Try predicting

In [None]:
# trying out predict

text = np.zeros((2, 512))
image = np.zeros((2, 224, 224, 3))

file = dataset['image'].iloc[21]     
correct_txt = dataset['txt_enc'].iloc[21]

im = load_img(file, target_size=(224, 224))
im = img_to_array(im)
im = np.expand_dims(im, axis=0)
im = preprocess_input(im)

image[0] = im

text[0] = correct_txt

file = dataset['image'].iloc[21]     
correct_txt = dataset['txt_enc'].iloc[90]

im = load_img(file, target_size=(224, 224))
im = img_to_array(im)
im = np.expand_dims(im, axis=0)
im = preprocess_input(im)

image[1] = im

text[1] = correct_txt

model.predict([text, image])