In [28]:
import numpy as np
from os import listdir
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pickle
from sys import getsizeof

In [21]:
# Load the pickled inputs used by the rest of the notebook.
# SECURITY: pickle.load can execute arbitrary code while unpickling; only run
# this cell on files that come from a trusted source.
# The file handles are deliberately NOT named `input`: binding that name at
# notebook scope would shadow the builtin `input()` for every later cell.
with open("../Data/inpainting/worddict.pkl", 'rb') as worddict_file:
    worddict = pickle.load(worddict_file, encoding='latin1')

# NOTE(review): this path starts with "./Data" while the other two start with
# "../Data/inpainting" -- confirm it is intentional.
with open("./Data/train_images_fn.pkl", 'rb') as train_fn_file:
    train_fn = pickle.load(train_fn_file)

with open("../Data/inpainting/dict_key_imgID_value_caps_train_and_valid.pkl", 'rb') as captions_file:
    captions = pickle.load(captions_file, encoding='latin1')

# A bare expression in the middle of a cell is not displayed; this lookup only
# acts as a check that the key is present (it raises KeyError otherwise).
captions['COCO_val2014_000000162113']
# sys.getsizeof reports the size of the container object only, not of the
# objects it references.
print(getsizeof(captions))

In [62]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# Construct corpus from images captions
n = 1000

# Two disjoint slices of train_fn are used: [0, n) as the "train" corpus and
# [n, 2n) as the "test" corpus. Fail fast instead of silently building
# shorter corpora when there are not enough file names.
assert len(train_fn) >= 2 * n, "train_fn must contain at least 2 * n file names"

# One document per image: the image id is the file name up to the first ".",
# and the document is all captions of that image joined by single spaces.
tr_corpus = [" ".join(captions[fn.split(".")[0]]) for fn in train_fn[:n]]
te_corpus = [" ".join(captions[fn.split(".")[0]]) for fn in train_fn[n:2 * n]]

# Construct counts vectors for images corpus.
# The vocabulary is learned from the train corpus only.
count_vectorizer = CountVectorizer(min_df=1)
X_tr = count_vectorizer.fit_transform(tr_corpus)
X_te_counts = count_vectorizer.transform(te_corpus)  # sparse counts, reused below
X_te = X_te_counts.toarray()

# Transform counters by tfidf transformation.
# The IDF weights are fitted on the train counts and then applied to the test
# counts, so no statistics are learned from the test split.
transformer = TfidfTransformer(smooth_idf=True)
transformer.fit(X_tr)
tfidf = transformer.transform(X_te_counts)

# Construct tfidf vectors for image captions.
# A float min_df is a proportion of documents: 10e-4 == 0.001.
vectorizer = TfidfVectorizer(max_df=0.5, min_df=10e-4, smooth_idf=True)
X = vectorizer.fit_transform(tr_corpus + te_corpus)
X1 = vectorizer.transform(te_corpus).toarray()

In [66]:
# Reduce the resulting TF-IDF matrix with a truncated SVD transformation
from sklearn.decomposition import TruncatedSVD

# Fit a truncated SVD on the TF-IDF matrix X (fit returns the estimator
# itself) and print the summed explained-variance ratio of its components.
svd = TruncatedSVD(
    n_components=1000,
    n_iter=7,
    random_state=42,
).fit(X)
explained_variance_total = svd.explained_variance_ratio_.sum()
print(explained_variance_total)

0.965037716745


In [68]:
def make_seq_embeddings(train_fn, val_fn, captions_dict, tfidf_args, svd_args):
    """Build TF-IDF + truncated-SVD caption embeddings for train and val images.

    For every file name the image id is the part before the first ".". The
    captions stored under that id in ``captions_dict`` are joined with spaces
    into one document per image. A TF-IDF vectorizer is fitted on the train
    and val documents together, a truncated SVD is fitted on the train TF-IDF
    matrix only, and both sets of documents are projected with it.

    Parameters
    ----------
    train_fn, val_fn : sequence of str
        Image file names for the train and val splits.
    captions_dict : mapping
        Maps an image id to an iterable of caption strings.
    tfidf_args : dict or None
        Keyword overrides for ``TfidfVectorizer``. ``None`` or an empty dict
        keeps the defaults ``max_df=0.5, min_df=10e-5, smooth_idf=True``.
    svd_args : dict or None
        Keyword overrides for ``TruncatedSVD``. ``None`` or an empty dict
        keeps the defaults ``n_components=2048, n_iter=7, random_state=42``.

    Returns
    -------
    (tr_embeddings_dict, val_embeddings_dict) : tuple of dict
        Each dict maps an image id to its embedding row. If two file names
        share an image id, the later one wins.

    Raises
    ------
    KeyError
        If an image id is missing from ``captions_dict``.
    """
    # Image ids are computed once and reused for the corpus and the dict keys.
    train_ids = [fn.split(".")[0] for fn in train_fn]
    val_ids = [fn.split(".")[0] for fn in val_fn]

    # One document per image; only the function's own arguments are used,
    # never notebook-level globals.
    tr_corpus = [" ".join(captions_dict[image_id]) for image_id in train_ids]
    val_corpus = [" ".join(captions_dict[image_id]) for image_id in val_ids]

    # Construct tfidf vectors for train and val captions
    tfidf_params = {"max_df": 0.5, "min_df": 10e-5, "smooth_idf": True}
    tfidf_params.update(tfidf_args or {})
    vectorizer = TfidfVectorizer(**tfidf_params)
    vectorizer.fit(tr_corpus + val_corpus)
    tr_tfidf = vectorizer.transform(tr_corpus)
    val_tfidf = vectorizer.transform(val_corpus)

    # Fit SVD transformation on train captions and
    # predict vectors for train and val captions
    svd_params = {"n_components": 2048, "n_iter": 7, "random_state": 42}
    svd_params.update(svd_args or {})
    svd = TruncatedSVD(**svd_params)
    svd.fit(tr_tfidf)
    tr_embeddings = svd.transform(tr_tfidf)
    val_embeddings = svd.transform(val_tfidf)

    # Construct dicts with embeddings (row k belongs to file name k)
    tr_embeddings_dict = dict(zip(train_ids, tr_embeddings))
    val_embeddings_dict = dict(zip(val_ids, val_embeddings))

    return tr_embeddings_dict, val_embeddings_dict