# Imports

In [1]:
import json
import gensim
from nltk.corpus import stopwords
import numpy as np
# English stop-word list from NLTK, stored as a set so membership tests are cheap
stop_words = set(stopwords.words('english'))
def clean_sentence(sentence):
    """Return `sentence` with the characters . , ? ! ; : - stripped out.

    Every occurrence is deleted (not replaced by a space), so a hyphenated
    word such as "well-known" collapses to "wellknown".
    """
    for unwanted in ('.', ',', '?', '!', ';', ':', '-'):
        sentence = sentence.replace(unwanted, '')
    return sentence

# Read data

In [None]:
# Load the caption annotation files into dictionaries.
# The encoding is given explicitly: JSON is normally UTF-8, while open()
# otherwise falls back to the platform's locale encoding, which can mis-decode
# non-ASCII characters in captions on some systems.

# Train
with open('../../data/annotations/captions_train2014.json', encoding='utf-8') as data_file:
    captions_train = json.load(data_file)

# Validation
with open('../../data/annotations/captions_val2014.json', encoding='utf-8') as data_file:
    captions_val = json.load(data_file)

# Bigrams

In [None]:
# Build the list of tokenised sentences: every train caption first, then every
# validation caption. Each caption is cleaned, split on whitespace and lowercased.
sentences = [
    [token.lower() for token in clean_sentence(annotation['caption']).split()]
    for caption_set in (captions_train, captions_val)
    for annotation in caption_set['annotations']
]

# Train bigram
bigram_model = gensim.models.phrases.Phrases(sentences, min_count=4, threshold=200)

# word2vec

In [2]:
# Load word vectors from a binary (binary=True) word2vec-format file.
# NOTE(review): newer gensim releases reportedly expose this loader as
# KeyedVectors.load_word2vec_format - confirm against the installed version.
w2v_model = gensim.models.Word2Vec.load_word2vec_format('../../data/GoogleNews-vectors-negative300.bin.gz',binary=True)

# Create word2vec dictionary

In [None]:
# Build `word2vec`: image file name -> list of caption vectors, one vector per
# caption, where a caption vector is the sum of the word vectors of its tokens.

# Choose train or validation
captions = captions_train

# Index image id -> file name once instead of rescanning the whole image list
# for every annotation. setdefault keeps the first entry should an id appear
# more than once, which matches a front-to-back scan. An annotation whose
# image_id is absent raises KeyError at the lookup below.
file_name_by_id = {}
for image in captions['images']:
    file_name_by_id.setdefault(image['id'], image['file_name'])

word2vec = {}

for i, annotation in enumerate(captions['annotations']):
    if (i % 10000) == 0:
        print(i)

    # Find image filename
    file_name = file_name_by_id[annotation['image_id']]

    # Hardcoded improvements.
    # str.replace returns a new string, so its result has to be assigned back
    # (it was previously discarded, making the substitution a no-op). The text
    # is lowercased first so that "Hot dog" is caught as well; replacing the
    # substring 'hot dog' also turns 'hot dogs' into 'sausages'.
    caption = clean_sentence(annotation['caption']).lower()
    caption = caption.replace('hot dog', 'sausage')
    caption = caption.split()

    # Create bigrams from caption
    caption = bigram_model[caption]

    # Remove unknown bigrams: a token missing from the word2vec vocabulary is
    # split on '_' back into its parts; known tokens are kept as they are.
    known_tokens = []
    for token in caption:
        if token in w2v_model.vocab:
            known_tokens.append(token)
        else:
            known_tokens.extend(token.split('_'))
    caption = known_tokens

    # Remove stop words in caption
    caption = [word for word in caption if word not in stop_words]

    # Create word2vec representation of caption: sum of the vectors of all
    # in-vocabulary words (stays all-zero if none are found).
    w2v_vector = np.zeros(300)
    for word in caption:
        if word in w2v_model.vocab:
            w2v_vector += w2v_model[word]

    # Add w2v_vector to dictionary
    word2vec.setdefault(file_name, []).append(w2v_vector)

# Caption dictionary

In [None]:
# Build `caption_dic`: image file name -> list of raw caption strings.

# Choose train or validation
captions = captions_val

# Index image id -> file name once instead of rescanning the whole image list
# for every annotation. setdefault keeps the first entry should an id appear
# more than once, which matches a front-to-back scan. An annotation whose
# image_id is absent raises KeyError at the lookup below.
file_name_by_id = {}
for image in captions['images']:
    file_name_by_id.setdefault(image['id'], image['file_name'])

caption_dic = {}

for i, annotation in enumerate(captions['annotations']):
    if (i % 10000) == 0:
        print(i)

    # Find image filename
    file_name = file_name_by_id[annotation['image_id']]

    # Add caption
    caption_dic.setdefault(file_name, []).append(annotation['caption'])
    

# Explanatory words

In [None]:
# Load the saved dictionary (iterated below as file name -> list of vectors).
# The .npy file holds a pickled Python object, which recent NumPy versions
# refuse to read unless allow_pickle=True is passed.
# NOTE: unpickling can execute arbitrary code - only load files you created.
w2v_dic = np.load('../../data/word2vec_train.npy', allow_pickle=True)
w2v_dic = w2v_dic[()]  # unwrap the 0-d object array to get the stored object


# Iterate over all w2v-vectors and find most similar word.
# explanatory_words maps each nearest word to the running sum of the
# similarity scores with which it was returned.
explanatory_words = {}
for count, file_name in enumerate(w2v_dic):
    if count % 1000 == 0:
        print(count)
    for w2v in w2v_dic[file_name]:
        # topn=1 asks for the single closest word; each result is a
        # (word, similarity) pair.
        for word, similarity in w2v_model.most_similar([w2v], topn=1):
            explanatory_words[word] = explanatory_words.get(word, 0) + similarity
    

# Explanatory words to w2v

In [12]:
# Read the explanatory_words mapping (word -> accumulated weight) from disk.
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
import pickle
with open('../../data/explanatory_words.pkl','rb') as data_file:
    explanatory_words = pickle.load(data_file)

# Keep only words whose weight exceeds the cut-off and that contain no '_'
# (i.e. single words rather than joined phrases).
cut_off = 200
words_cut_off = [
    candidate
    for candidate, weight in explanatory_words.items()
    if weight > cut_off and '_' not in candidate
]

# Create dictionary for word to w2v
word_dic = {kept_word: w2v_model[kept_word] for kept_word in words_cut_off}



In [16]:
# Write word_dic to disk with np.save, through a file opened in binary mode.
# NOTE(review): word_dic is a dict rather than an array, so reading it back
# presumably needs np.load(..., allow_pickle=True) on recent NumPy - confirm.
with open('../../data/explanatory_dic.npy','wb') as data_file:
    np.save(data_file,word_dic)

In [19]:
# Show the full word -> vector dictionary as the cell output
word_dic

{'airplane': array([ 0.16894531, -0.20019531, -0.16113281,  0.11669922, -0.16015625,
        -0.35742188, -0.17480469, -0.17773438,  0.453125  ,  0.01855469,
         0.25195312, -0.31640625,  0.24121094, -0.2890625 , -0.11523438,
         0.37109375,  0.01293945,  0.16796875,  0.0625    ,  0.07714844,
        -0.04492188, -0.00546265, -0.03588867, -0.22949219,  0.0055542 ,
         0.02612305, -0.12988281,  0.07568359,  0.13769531,  0.03808594,
        -0.27929688, -0.28710938, -0.25976562,  0.0480957 ,  0.02209473,
         0.0859375 , -0.00921631,  0.12597656,  0.05957031,  0.2890625 ,
         0.07421875,  0.06225586,  0.28515625,  0.07275391, -0.02124023,
        -0.27929688,  0.10302734,  0.03759766,  0.33203125, -0.125     ,
        -0.265625  ,  0.1484375 ,  0.11962891, -0.04370117, -0.125     ,
        -0.203125  ,  0.18554688, -0.07470703,  0.25      ,  0.08398438,
         0.26953125, -0.31640625, -0.26171875,  0.03759766,  0.10693359,
         0.02600098, -0.06689453,  0.06