In [1]:
from gensim.models import Word2Vec
import numpy as np
from numpy import array
import os

# W2VEC MODEL

In [2]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

In [3]:
# load a pre-defined list of photo identifiers
def load_set(filename):
	doc = load_doc(filename)
	dataset = list()
	# process line by line
	for line in doc.split('\n'):
		# skip empty lines
		if len(line) < 1:
			continue
		# get the image identifier
		identifier = line.split('.')[0]
		dataset.append(identifier)
	return set(dataset)

# load training dataset (6K)
filename = 'E://TFM/Flickr8k/Flickr8k_text/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))

Dataset: 6000


In [4]:
# load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
	# load document
	doc = load_doc(filename)
	descriptions = dict()
	for line in doc.split('\n'):
		# split line by white space
		tokens = line.split()
		# split id from description
		image_id, image_desc = tokens[0], tokens[1:]
		# skip images not in the set
		if image_id in dataset:
			# create list
			if image_id not in descriptions:
				descriptions[image_id] = list()
			# wrap description in tokens
			desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
			# store
			descriptions[image_id].append(desc)
	return descriptions

# descriptions
train_descriptions = load_clean_descriptions('descriptions.txt', train)

In [5]:
train_descriptions_list = list(train_descriptions.values())

# Creating data for the model training
train_data=[]
for i in train_descriptions_list:
    for j in i:
        train_data.append(j.split())

In [6]:
model = Word2Vec(train_data, size=300, min_count=2,window=5, sg=1,workers=4)

In [7]:
# Vocabulary size
print('Vocabulary size:', len(model.wv.vocab))

Vocabulary size: 4436


In [8]:
# Function returning vector reperesentation of a document
def get_embedding_w2v(doc_tokens):
    embeddings = []
    if len(doc_tokens)<1:
        return np.zeros(300)
    else:
        for tok in doc_tokens:
            if tok in model.wv.vocab:
                embeddings.append(model.wv.word_vec(tok))
            else:
                embeddings.append(np.random.rand(300))
        # mean the vectors of individual words to get the vector of the document
        return np.mean(embeddings, axis=0)

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def preprocess(s):
    text_tokens = word_tokenize(s)
    tokens_without_sw = [word for word in text_tokens if not word in stopwords.words()]
    tokens_without_sw_lw = [word.lower() for word in tokens_without_sw]
    return tokens_without_sw_lw

In [10]:
def cosine_distance_wordembedding_method(s1, s2):
    import scipy
    vector_1 = get_embedding_w2v(preprocess(s1))
    vector_2 = get_embedding_w2v(preprocess(s2))
    cosine = scipy.spatial.distance.cosine(vector_1, vector_2)
    print(round((1-cosine)*100,2),'%')

In [11]:
frase1 = "child in pink dress is climbing up set of stairs in an entry way"
frase2 = "girl going into wooden building"
frase3 = "oh my god"

cosine_distance_wordembedding_method(frase1,frase3)

45.11 %


# PRETRAINED GLOVE MODEL 

In [12]:
def loadGloveModel(File):
    print("Loading Glove Model")
    #f = open(File,'r')
    f = open(File, encoding="utf-8")
    gloveModel = {}
    for line in f:
        splitLines = line.split()
        word = splitLines[0]
        wordEmbedding = np.array([float(value) for value in splitLines[1:]])
        gloveModel[word] = wordEmbedding
    print(len(gloveModel)," words loaded!")
    return gloveModel

In [13]:
# Load Glove vectors
glove_dir = 'E://TFM/Glove/glove.6B.200d.txt'
model = loadGloveModel(glove_dir)

Loading Glove Model
400000  words loaded!


In [14]:
def cosine_distance_wordembedding_method(s1, s2):
    import scipy
    vector_1 = np.mean([model[word] for word in preprocess(s1)],axis=0)
    vector_2 = np.mean([model[word] for word in preprocess(s2)],axis=0)
    cosine = scipy.spatial.distance.cosine(vector_1, vector_2)
    print(round((1-cosine)*100,2),'%')

In [15]:
frase1 = "child in pink dress is climbing up set of stairs in an entry way"
frase2 = "park with childrens playing on it"
frase3 = "empty park with no childrens on it"
frase4 = "A park full of children playing and having fun"

cosine_distance_wordembedding_method(frase2,frase3)
cosine_distance_wordembedding_method(frase2,frase4)

82.09 %
73.92 %
