In [1]:
import tensorflow_hub as hub
import tensorflow as tf
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import re
import time
import pickle
import numpy as np


# DATASET SPLITTING

In [2]:
# load doc into memory
def load_doc(filename):
    """Read a whole text file and return its contents as one string.

    Args:
        filename: Path of the text file to read.

    Returns:
        The complete contents of the file as a ``str``.
    """
    # The context manager closes the handle even when read() raises,
    # which a manual open()/close() pair does not guarantee.
    with open(filename, 'r') as file:
        return file.read()

def load_descriptions(doc):
    """Group captions by image identifier.

    Every data line of ``doc`` holds whitespace-separated tokens: the first
    token names the image, the remaining tokens form its caption.

    Args:
        doc: Full text of the descriptions file.

    Returns:
        A dict mapping each image identifier (the first token, cut at its
        first '.') to the list of its captions in file order. Captions are
        rebuilt by joining their tokens with single spaces.
    """
    mapping = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        # Skip lines shorter than two characters, and also whitespace-only
        # lines: those pass the length check but split into no tokens, so
        # reading tokens[0] from them would raise IndexError.
        if len(line) < 2 or not tokens:
            continue
        # Image identifier is the first token without its file extension.
        image_id = tokens[0].split('.')[0]
        image_desc = ' '.join(tokens[1:])
        mapping.setdefault(image_id, []).append(image_desc)
    return mapping

# load a pre-defined list of photo identifiers
def load_set(filename):
    """Read a file of image file names and return their identifiers.

    Args:
        filename: Path of a text file with one entry per line.

    Returns:
        A ``set`` holding, for every non-empty line, the text that precedes
        the first '.' on that line.
    """
    return {
        entry.split('.')[0]
        for entry in load_doc(filename).split('\n')
        if len(entry) >= 1  # empty lines carry no identifier
    }

# load clean descriptions into memory
def _find_mismatched_image_id(lines, start, dataset, own_id, stride):
    """Return an image id from ``dataset`` that differs from ``own_id``.

    The search looks at the line ``2 * stride`` positions before ``start``
    and keeps stepping back ``stride`` lines until it finds a non-blank line
    whose first token is in ``dataset`` and is not ``own_id``.

    Raises:
        IndexError: If the search runs past the available lines.
    """
    k = start - 2 * stride
    while True:
        try:
            # k may be negative: Python then indexes from the end of the
            # list, so the search wraps once to the tail of the file before
            # running out of range at k < -len(lines).
            tokens = lines[k].split()
        except IndexError:
            raise IndexError(
                "no mismatching image found for line %d" % start
            ) from None
        if tokens and tokens[0] in dataset and tokens[0] != own_id:
            return tokens[0]
        k -= stride


def load_clean_descriptions(filename, dataset, positive_fraction=0.8,
                            captions_per_image=5):
    """Build image/caption pairs with a binary similarity label for one split.

    The source data is not prepared for multimodal binary classification, so
    pairs are synthesised: the first captions of the split keep their own
    image (label 0, image and text are similar) and the remaining captions
    are paired with the image of an earlier line (label 1, not similar).

    Args:
        filename: Path of the descriptions file; every data line is
            ``<image_id> <caption tokens...>``.
        dataset: Collection of image identifiers that belong to the split.
        positive_fraction: Share of ``len(dataset)`` used to size the
            matching (label 0) part.
        captions_per_image: Captions assumed per image. It scales the number
            of matching captions and sets the step of the backward search
            for a mismatched image.

    Returns:
        A dict of four equally long lists under the keys ``"id"``,
        ``"label"``, ``"image_id"`` and ``"desc"``. ``"id"`` is a running
        counter over the kept captions and every ``"desc"`` string starts
        with a space.

    Raises:
        ValueError: If ``captions_per_image`` is smaller than 1.
        IndexError: If no mismatching image can be found for a caption.
    """
    if captions_per_image < 1:
        raise ValueError("captions_per_image must be at least 1")

    lines = load_doc(filename).split('\n')

    descriptions = dict()
    descriptions["id"] = list()
    descriptions["label"] = list()
    descriptions["image_id"] = list()
    descriptions["desc"] = list()

    # Captions below this running count keep their own image (label 0).
    positivos = int(positive_fraction * len(dataset))
    positive_captions = positivos * captions_per_image

    i = 0
    for j, line in enumerate(lines):
        tokens = line.split()
        # Blank lines (for example a trailing newline) hold no image id.
        if not tokens:
            continue

        image_id, image_desc = tokens[0], tokens[1:]
        if image_id not in dataset:
            continue

        # Every word is prefixed with a space, so the caption keeps the
        # leading space that downstream consumers have always received.
        desc = "".join(" " + w for w in image_desc)

        descriptions["id"].append(i)
        descriptions["desc"].append(desc)

        if i < positive_captions:
            descriptions["image_id"].append(image_id)
            descriptions["label"].append(0)
        else:
            # Pair the caption with a different image of the same split.
            other_id = _find_mismatched_image_id(
                lines, j, dataset, image_id, captions_per_image
            )
            descriptions["image_id"].append(other_id)
            descriptions["label"].append(1)

        i += 1

    return descriptions

In [3]:
# Captions of the whole corpus, grouped per image id.
doc = load_descriptions(load_doc("./descriptions.txt"))

# Each *_set name ends up bound to the column dict built by
# load_clean_descriptions for the image ids listed in the split file.
# NOTE(review): the split files are read from absolute Windows paths;
# consider a configurable data directory.
train_set = load_clean_descriptions(
    "./descriptions.txt",
    load_set("E://TFM/Flickr8k/Flickr8k_text/Flickr_8k.trainImages.txt"),
)
test_set = load_clean_descriptions(
    "./descriptions.txt",
    load_set("E://TFM/Flickr8k/Flickr8k_text/Flickr_8k.testImages.txt"),
)
val_set = load_clean_descriptions(
    "./descriptions.txt",
    load_set("E://TFM/Flickr8k/Flickr8k_text/Flickr_8k.devImages.txt"),
)

# One DataFrame for the full caption table and one per split.
df = pd.DataFrame(data=doc)
df_train = pd.DataFrame(data=train_set)
df_test = pd.DataFrame(data=test_set)
df_val = pd.DataFrame(data=val_set)

In [4]:
# Preview the first rows of the training split (id, label, image_id, desc).
df_train.head()

Unnamed: 0,id,label,image_id,desc
0,0,0,1000268201_693b08cb0e,child in pink dress is climbing up set of sta...
1,1,0,1000268201_693b08cb0e,girl going into wooden building
2,2,0,1000268201_693b08cb0e,little girl climbing into wooden playhouse
3,3,0,1000268201_693b08cb0e,little girl climbing the stairs to her playhouse
4,4,0,1000268201_693b08cb0e,little girl in pink dress going into wooden c...


In [5]:
# Preview the first rows of the test split (id, label, image_id, desc).
df_test.head()

Unnamed: 0,id,label,image_id,desc
0,0,0,1056338697_4f7d7ce270,blond woman in blue shirt appears to wait for...
1,1,0,1056338697_4f7d7ce270,blond woman is on the street hailing taxi
2,2,0,1056338697_4f7d7ce270,woman is signaling is to traffic as seen from...
3,3,0,1056338697_4f7d7ce270,woman with blonde hair wearing blue tube top ...
4,4,0,1056338697_4f7d7ce270,woman in the blue dress is holding out her ar...


In [6]:
# Preview the first rows of the validation split (id, label, image_id, desc).
df_val.head()

Unnamed: 0,id,label,image_id,desc
0,0,0,1022454332_6af2c1449a,child and woman are at waters edge in big city
1,1,0,1022454332_6af2c1449a,large lake with lone duck swimming in it with...
2,2,0,1022454332_6af2c1449a,little boy at lake watching duck
3,3,0,1022454332_6af2c1449a,young boy waves his hand at the duck in the w...
4,4,0,1022454332_6af2c1449a,people are at the edge of lake facing the wat...


# BERT encoder

In this section I encode the image descriptions and save the result in pickle format, so that afterwards I only need to load the encoded descriptions from the files instead of re-encoding all the data. The commented cells should be executed only if you don't have the encoded pickle files.

In [7]:
from sent2vec.vectorizer import Vectorizer

In [8]:
def bert_vectors(x):
    """Encode sentences with the BERT mode of sent2vec's ``Vectorizer``.

    Args:
        x: Iterable of sentences; it is materialised into a list first.

    Returns:
        The ``vectors`` attribute of the ``Vectorizer`` after its ``bert``
        method has run on the sentences.
    """
    # A new Vectorizer is created for every call.
    encoder = Vectorizer()
    encoder.bert(list(x))
    return encoder.vectors

In [9]:
# Cut each split into consecutive row slices of at most 100 rows so the
# descriptions can be encoded batch by batch.
list_train = [
    df_train[start:start + 100]
    for start in range(0, len(df_train), 100)
]
list_test = [
    df_test[start:start + 100]
    for start in range(0, len(df_test), 100)
]

In [10]:
# Extract BERT embeddings, one batch of descriptions at a time.
# Every batch goes through bert_vectors, which builds a new Vectorizer per
# call, so tqdm is used to report progress on this long-running loop.
bert_train = []
for x in tqdm(list_train, desc="BERT train"):
    bert_train.append(bert_vectors(list(x['desc'])))

bert_test = []
for x in tqdm(list_test, desc="BERT test"):
    bert_test.append(bert_vectors(list(x['desc'])))

In [11]:
# Join the per-batch results along the first axis (np.concatenate's default)
# to get a single array per split.
bert_train_new = np.concatenate(bert_train)
bert_test_new = np.concatenate(bert_test)

In [12]:
# len(list_train)*len(list_train[0])
# len(bert_train[299])

# for i,x in enumerate(list_train[2]['desc']):
#         m = bert_vectors([x])

# print(len(m))

In [13]:
# len(list_train[2]['desc'])
# print(list_train[2].iloc[-1]['desc'])
# m = bert_vectors([str(list_train[2].iloc[-1]['desc'])])

# print(m.shape)

# str(list_train[2].iloc[-1]['desc'])
# s = ['horses pulling sled steered by smiling blond woman','hello']
# bert_vectors(s).shape

In [14]:
# Persist the encoded descriptions so the encoding cells need not be re-run.
# The `with` blocks close each file even if pickle.dump raises.

# save bert_train_new
with open("bert_train_03032019.pickle", "wb") as pickle_out:
    pickle.dump(bert_train_new, pickle_out)

# save bert_test_new
with open("bert_test_03032019.pickle", "wb") as pickle_out:
    pickle.dump(bert_test_new, pickle_out)

In [15]:
# Reload the encoded descriptions from disk. The `with` blocks make sure the
# input files are closed once they have been read.
# NOTE: pickle.load can execute arbitrary code; only load pickle files that
# this notebook produced itself.

# load bert_train_new
with open("bert_train_03032019.pickle", "rb") as pickle_in:
    bert_train_new = pickle.load(pickle_in)

# load bert_test_new
with open("bert_test_03032019.pickle", "rb") as pickle_in:
    bert_test_new = pickle.load(pickle_in)

In [16]:
# Sanity check: shape of the training embeddings.
bert_train_new.shape

(30000, 768)

In [17]:
# Attach the embeddings to the frames as a new "desc_encoded" column.
# list(...) iterates each embeddings object along its first axis, giving one
# entry per row; this assumes the entries are in the same order as the rows
# of the frame.
# NOTE(review): df_val gets no "desc_encoded" column in the cells shown here.
df_train["desc_encoded"] = list(bert_train_new)
df_test["desc_encoded"] = list(bert_test_new)

In [18]:
# Preview the training split with the new "desc_encoded" column.
df_train.head()

Unnamed: 0,id,label,image_id,desc,desc_encoded
0,0,0,1000268201_693b08cb0e,child in pink dress is climbing up set of sta...,"[-0.36486554, -0.2585366, 0.084249124, -0.1953..."
1,1,0,1000268201_693b08cb0e,girl going into wooden building,"[-0.30413604, -0.3162854, 0.20758209, -0.18385..."
2,2,0,1000268201_693b08cb0e,little girl climbing into wooden playhouse,"[-0.4010243, -0.29010335, 0.18450412, -0.17961..."
3,3,0,1000268201_693b08cb0e,little girl climbing the stairs to her playhouse,"[-0.3710625, -0.20777261, 0.188636, -0.2190543..."
4,4,0,1000268201_693b08cb0e,little girl in pink dress going into wooden c...,"[-0.4009231, -0.31878188, 0.16196373, -0.15704..."


In [19]:
# Preview the test split with the new "desc_encoded" column.
df_test.head()

Unnamed: 0,id,label,image_id,desc,desc_encoded
0,0,0,1056338697_4f7d7ce270,blond woman in blue shirt appears to wait for...,"[-0.40801805, -0.47038484, 0.108725145, -0.183..."
1,1,0,1056338697_4f7d7ce270,blond woman is on the street hailing taxi,"[-0.268986, -0.35668743, 0.159542, -0.24152443..."
2,2,0,1056338697_4f7d7ce270,woman is signaling is to traffic as seen from...,"[-0.29578292, -0.2506964, 0.052882105, -0.4025..."
3,3,0,1056338697_4f7d7ce270,woman with blonde hair wearing blue tube top ...,"[-0.041516334, -0.34414628, 0.04007022, -0.266..."
4,4,0,1056338697_4f7d7ce270,woman in the blue dress is holding out her ar...,"[-0.06600636, -0.1293576, 0.08905828, -0.11362..."
