## FastText Embedding for Analysis

In [1]:
import json
import os
import pickle

import numpy as np
import torch

import poisevae
from poisevae.utils import sent_emb
from poisevae.datasets import CUB

from cca import pca_transform

In [2]:
# Location of the local CUB dataset copy, resolved under the current user's home directory.
# Every data file read or written by this notebook is addressed relative to DATA_PATH.
HOME_PATH = os.path.expanduser('~')
DATA_PATH = os.path.join(HOME_PATH, 'Datasets/CUB/')

In [2]:
# The following code is adapted from https://github.com/iffsid/mmvae, the repository for the work
# Y. Shi, N. Siddharth, B. Paige and PHS. Torr.
# Variational Mixture-of-Experts Autoencoders for Multi-Modal Deep Generative Models.
# In Proceedings of the 33rd International Conference on Neural Information Processing Systems,
# pages 15718–15729, 2019.

In [3]:
from collections import Counter, OrderedDict
class OrderedCounter(Counter, OrderedDict):
    """A ``Counter`` whose keys keep the order in which they were first seen."""

    def __repr__(self):
        # Show the contents as an OrderedDict so the insertion order is visible.
        ordered_view = OrderedDict(self)
        return '%s(%r)' % (type(self).__name__, ordered_view)

    def __reduce__(self):
        # Pickle support: rebuild the counter from an ordered snapshot of itself.
        ordered_view = OrderedDict(self)
        return type(self), (ordered_view,)

In [6]:
from gensim.models import FastText
from nltk.tokenize import sent_tokenize, word_tokenize

# Read the whole caption file, then split it into sentences once the file is closed.
with open(os.path.join(DATA_PATH, 'cub/text_trainvalclasses.txt'), 'r') as file:
    text = file.read()
sentences = sent_tokenize(text)

# One token list per sentence; this is the corpus used to train the embedding.
texts = [word_tokenize(sentence) for sentence in sentences]

# Count token occurrences in first-seen order; the counts are turned into word weights later.
occ_register = OrderedCounter()
for words in texts:
    occ_register.update(words)

In [7]:
# FastText embedding
# Train 300-dimensional FastText vectors on the tokenised captions in `texts`
# (context window 5, min_count=3, 10 epochs). The vocabulary is built first, then the
# model is trained on the same corpus. The value returned by train() is the last
# expression of the cell, so the notebook echoes it as the cell output.
model = FastText(vector_size=300, window=5, min_count=3)
model.build_vocab(corpus_iterable=texts)
model.train(corpus_iterable=texts, total_examples=len(texts), epochs=10)

(5586509, 15113120)

In [8]:
# Load the pre-built vocabulary (a JSON document). Later cells read its 'i2w' and 'w2i'
# entries. The file is opened in binary mode, which json.load accepts.
with open(os.path.join(DATA_PATH, 'cub/oc:3_msl:32/cub.vocab'), 'rb') as file:
    vocab = json.load(file)

In [9]:
# Output embedding
# Build the (vocabulary size x 300) embedding matrix and pickle it next to the vocabulary.
i2w = vocab['i2w']
base = np.ones((300,), dtype=np.float32)

# The first three vocabulary slots get constant rows (all -1, all 0, all +1)
# instead of learned FastText vectors.
placeholder_rows = [base * (slot - 1) for slot in range(3)]

# Every remaining entry, in the order stored in i2w, is looked up in the trained model.
word_rows = [model.wv[token] for token in list(i2w.values())[3:]]

emb = np.array(placeholder_rows + word_rows)
emb_path = os.path.join(DATA_PATH, 'cub/oc:3_msl:32/cub.emb')
with open(emb_path, 'wb') as file:
    pickle.dump(emb, file)

In [10]:
# Output weights
# Per-word weight a / (a + relative frequency): frequent words get small weights.
a = 1e-3
w2i = vocab['w2i']
weights = np.zeros(len(w2i))
total_occ = sum(occ_register.values())

# Occurrences of words missing from the vocabulary are pooled into one bucket.
exc_occ = 0
for token, count in occ_register.items():
    slot = w2i.get(token)
    if slot is None:
        exc_occ += count
        continue
    weights[slot] = a / (a + count / total_occ)

# Index 0 receives the weight computed from the pooled out-of-vocabulary mass.
weights[0] = a / (a + exc_occ / total_occ)

weights_path = os.path.join(DATA_PATH, 'cub/oc:3_msl:32/cub.weights')
with open(weights_path, 'wb') as file:
    pickle.dump(weights, file)

In [3]:
def tx(data):
    """Convert ``data`` into a ``torch.Tensor``.

    Written as a named function rather than a lambda bound to a name (PEP 8):
    it has a real ``__name__`` for tracebacks and, unlike a lambda, can be pickled.

    Args:
        data: Anything ``torch.Tensor(...)`` accepts, e.g. a nested list or a NumPy array.

    Returns:
        A tensor holding the values of ``data`` in the default tensor dtype.
    """
    return torch.Tensor(data)

In [4]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [5]:
# Instantiate the train and test splits of the project's CUB dataset wrapper, passing the
# same data directory twice, the selected device and the tensor transform `tx`.
# NOTE(review): only the test split is built with return_idx=True, so its items presumably
# carry an extra index field — confirm against poisevae.datasets.CUB.
CUB_train = CUB(DATA_PATH, DATA_PATH, 'train', device, tx, return_idx=False)
CUB_test = CUB(DATA_PATH, DATA_PATH, 'test', device, tx, return_idx=True)

In [6]:
# Read the vocabulary size and the maximum sequence length from the text part of the
# training set; the bare tuple on the last line echoes both values as the cell output.
vocab_size, txt_len = CUB_train.CUBtxt.vocab_size, CUB_train.CUBtxt.max_sequence_length
vocab_size, txt_len

(1590, 32)

In [7]:
# Wrap both splits in DataLoaders with 128 samples per batch and echo the batch counts.
# NOTE(review): both loaders shuffle, so every full pass over one of them visits the
# samples in a different order; anything that needs row-aligned data from several
# modalities must take them from the same pass.
batch_size = 128
train_loader = torch.utils.data.DataLoader(CUB_train, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(CUB_test, batch_size=batch_size, shuffle=True)
len(train_loader), len(test_loader)

(692, 230)

In [8]:
# Echo the text dataset's `eos_idx` attribute (presumably the end-of-sentence token index).
CUB_train.CUBtxt.eos_idx

2

In [9]:
# Reload the embedding matrix and the per-word weights from the same paths the earlier
# cells pickled them to, so this part of the notebook can run without retraining.
# NOTE: pickle.load can execute arbitrary code; only load files generated locally.
with open(os.path.join(DATA_PATH, 'cub/oc:3_msl:32/cub.emb'), 'rb') as file:
    emb = pickle.load(file)
with open(os.path.join(DATA_PATH, 'cub/oc:3_msl:32/cub.weights'), 'rb') as file:
    weights = pickle.load(file)

In [23]:
# Concatenate element 1 of every training batch into one int32 NumPy array on the host.
# NOTE(review): element 1 is presumably the caption token ids — confirm against the dataset.
full_txt_data = torch.cat([d[1] for d in train_loader]).detach().cpu().numpy().astype(np.int32)

In [24]:
# Preallocate the sentence-embedding buffer: one row per sample in full_txt_data and one
# column per embedding dimension. The bare `output.shape` echoes the shape as cell output.
output = np.zeros((full_txt_data.shape[0], emb.shape[1]))
output.shape

(88548, 300)

In [25]:
# Collect images AND captions from the same pass over the loader.
# train_loader was created with shuffle=True, so a second, separate pass visits the samples
# in a different order than the pass that built full_txt_data earlier; row k of the image
# matrix would then belong to a different sample than row k of the text matrix.
# full_txt_data is therefore rebuilt here (same expression as before), which keeps the
# sentence embeddings computed from it row-aligned with the image data.
batches = list(train_loader)
# Use the `device` chosen earlier instead of a hard-coded 'cuda' so CPU-only hosts work too.
full_img_data = torch.cat([batch[0] for batch in batches]).to(device)
full_txt_data = torch.cat([batch[1] for batch in batches]).detach().cpu().numpy().astype(np.int32)
del batches  # drop the per-batch references once the concatenated copies exist

In [26]:
# Compute one sentence embedding per caption from the word embeddings and word weights.
# NOTE(review): the return value is discarded and `output` is read afterwards, so sent_emb
# presumably fills `output` in place — confirm in poisevae.utils.
sent_emb(full_txt_data, emb, weights, output)

In [27]:
# Move the sentence embeddings into a tensor on the `device` selected earlier.
# A hard-coded 'cuda' raises on CPU-only hosts even though `device` already handles them.
output = torch.tensor(output).to(device)

In [28]:
# First principal direction of the sentence embeddings, stored as the rank-1 projector v v^T.
centered = output - output.mean(axis=0)  # remove the per-dimension mean before the SVD
_, _, V = torch.svd(centered)
v = V[:, :1]  # first right singular vector, kept as a column
PC = v @ v.t()

In [29]:
# Persist the sentence-embedding projector (written to the current working directory).
torch.save(PC, 'sentence_emb_PC2.pt')

In [30]:
# Apply the project-local pca_transform to the sentence embeddings using the projector.
# NOTE(review): whether it keeps or removes the component along the first principal
# direction is not visible here — see cca.pca_transform.
sent_pca = pca_transform(output, PC)

In [31]:
# Persist the transformed sentence embeddings of the training data (current working directory).
torch.save(sent_pca, 'true_data_sent_embedding2.pt')

In [32]:
# Same construction for the image data: the first principal direction as the projector v v^T.
# This overwrites the sentence projector held in `PC`.
centered = full_img_data - full_img_data.mean(axis=0)  # remove the per-dimension mean
_, _, V = torch.svd(centered)
v = V[:, :1]  # first right singular vector, kept as a column
PC = v @ v.t()

In [33]:
# Persist the image projector (written to the current working directory).
torch.save(PC, 'image_PC2.pt')

In [34]:
# Apply the project-local pca_transform to the image data using the image projector.
# NOTE(review): the exact transformation is defined in cca.pca_transform, not visible here.
img_pca = pca_transform(full_img_data, PC)

In [35]:
# Persist the transformed image data of the training set (current working directory).
torch.save(img_pca, 'true_data_img_feature2.pt')