## FastText Embedding for Analysis

In [None]:
import json
import os
import pickle

In [None]:
# The following code is adapted from https://github.com/iffsid/mmvae, the repository for the work
# Y. Shi, N. Siddharth, B. Paige and PHS. Torr.
# Variational Mixture-of-Experts Autoencoders for Multi-Modal Deep Generative Models.
# In Proceedings of the 33rd International Conference on Neural Information Processing Systems,
# pages 15718–15729, 2019.

In [None]:
from collections import Counter, OrderedDict
class OrderedCounter(Counter, OrderedDict):
    """A ``Counter`` whose keys are kept in first-seen order.

    Combines ``Counter`` (counting) with ``OrderedDict`` (ordering) through
    multiple inheritance; only the textual representation and the pickling
    hook are customised.
    """

    def __reduce__(self):
        # Pickle/copy support: rebuild the instance from an OrderedDict
        # snapshot of its current contents.
        snapshot = OrderedDict(self)
        return type(self), (snapshot,)

    def __repr__(self):
        # Rendered as "<ClassName>(<OrderedDict repr>)".
        return '{}({!r})'.format(type(self).__name__, OrderedDict(self))

In [None]:
from gensim.models import FastText
from nltk.tokenize import sent_tokenize, word_tokenize

# Read the raw caption text and split it into sentences.
# The encoding is passed explicitly so that decoding does not depend on the
# platform's locale default (assumes the file is UTF-8/ASCII -- TODO confirm).
with open(os.path.join(DATA_PATH, 'cub/text_trainvalclasses.txt'), 'r', encoding='utf-8') as file:
    text = file.read()
sentences = sent_tokenize(text)

occ_register = OrderedCounter()  # Per-token occurrence counts, used to compute the weights
texts = []  # Tokenised sentences, used as the embedding training corpus
for line in sentences:
    words = word_tokenize(line)
    texts.append(words)
    occ_register.update(words)

In [None]:
# Train a FastText model on the tokenised sentences collected in `texts`.
# Hyper-parameters are gathered in one place so they are easy to spot.
fasttext_params = dict(vector_size=300, window=3, min_count=3)
model = FastText(**fasttext_params)

# The vocabulary has to be built before training can start.
model.build_vocab(corpus_iterable=texts)
model.train(
    corpus_iterable=texts,
    total_examples=len(texts),
    epochs=10,
)

In [None]:
# Load the vocabulary file; the cells below read its 'i2w' and 'w2i' entries.
vocab_path = os.path.join(DATA_PATH, 'cub/oc:3_msl:32/cub.vocab')
with open(vocab_path, 'rb') as vocab_file:
    vocab = json.load(vocab_file)

In [None]:
# Build the embedding matrix (one 300-d row per vocabulary entry) and pickle it.
i2w = vocab['i2w']
base = np.ones((300,), dtype=np.float32)

# The first three rows are constant vectors filled with -1, 0 and +1.
# NOTE(review): presumably these indices are special tokens without a learned
# embedding -- confirm against the vocabulary file.
special_rows = [base * fill for fill in (-1, 0, 1)]

# Every remaining entry gets its FastText vector, in vocabulary order.
word_rows = [model.wv[token] for token in list(i2w.values())[3:]]

emb = np.array(special_rows + word_rows)

emb_path = os.path.join(DATA_PATH, 'cub/oc:3_msl:32/cub.emb')
with open(emb_path, 'wb') as emb_file:
    pickle.dump(emb, emb_file)

In [None]:
# Compute one weight per vocabulary index and pickle the result.
# A word with relative frequency p gets the weight a / (a + p), so frequent
# words are weighted down.
a = 1e-3
w2i = vocab['w2i']
weights = np.zeros(len(w2i))
total_occ = sum(occ_register.values())

exc_occ = 0  # occurrences of words that are not part of the vocabulary
for token, count in occ_register.items():
    index = w2i.get(token)
    if index is None:
        exc_occ += count
        continue
    weights[index] = a / (a + count / total_occ)

# All excluded words share index 0.
# NOTE(review): presumably index 0 is the out-of-vocabulary token -- confirm.
weights[0] = a / (a + exc_occ / total_occ)

weights_path = os.path.join(DATA_PATH, 'cub/oc:3_msl:32/cub.weights')
with open(weights_path, 'wb') as weights_file:
    pickle.dump(weights, weights_file)

In [None]:
# Reload the pickled embedding matrix and weight vector from disk.
# NOTE: pickle.load executes arbitrary code from the file; only use it on
# trusted files such as the ones this notebook writes itself.
with open(os.path.join(DATA_PATH, 'cub/oc:3_msl:32/cub.emb'), 'rb') as file:
    emb = pickle.load(file)
with open(os.path.join(DATA_PATH, 'cub/oc:3_msl:32/cub.weights'), 'rb') as file:
    # Bind to `weights`: assigning to `emb` here would overwrite the
    # embeddings loaded just above.
    weights = pickle.load(file)

In [None]:
# Wrap the text training set in a DataLoader that yields batches of 2000 items.
train_loader = torch.utils.data.DataLoader(
    dataset=txt_train,
    batch_size=2000,
)

In [None]:
# Sanity check: fetch only the first batch and print its [0][0] element.
# Nothing is printed when the loader yields no batches.
data = next(iter(train_loader), None)
if data is not None:
    print(data[0][0])