In [1]:
from CUBDatasets import CUBImageFt, CUBSentences
import torch
import numpy as np

In [2]:
# If the cells below raise a LookupError about punkt, run this:
# import nltk
# nltk.download('punkt')

In [3]:
RAWDATA_PATH = 'CUB'

In [4]:
def tx(data):
    """Convert ``data`` into a ``torch.Tensor``.

    Passed as the ``transform`` argument of the sentence datasets below.
    Defined with ``def`` rather than a named lambda so that it has a proper
    name in tracebacks and can carry this docstring.
    """
    return torch.Tensor(data)

In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [6]:
img_train = CUBImageFt(RAWDATA_PATH, split='train', device=device)
img_test = CUBImageFt(RAWDATA_PATH, split='test', device=device)

In [7]:
len(img_train), len(img_test)

(8855, 2933)

In [8]:
maxSentLen = 32
txt_train = CUBSentences(RAWDATA_PATH, split='train', transform=tx, max_sequence_length=maxSentLen)
txt_test = CUBSentences(RAWDATA_PATH, split='test', transform=tx, max_sequence_length=maxSentLen)
# Each entry of the dataset is a 2-tuple (padded sentence embedding, actual length)

In [9]:
# Ten sentences for each image
len(txt_train), len(txt_test)

(88550, 29330)

## Test

In [10]:
img_training_generator = torch.utils.data.DataLoader(img_train, batch_size=200, shuffle=True)
len(img_training_generator)

45

In [11]:
# for i, img in enumerate(img_training_generator):
#     print(i, img.shape)

In [12]:
img_testing_generator = torch.utils.data.DataLoader(img_test, batch_size=200, shuffle=True)
len(img_testing_generator)

15

In [13]:
# for i, img in enumerate(img_testing_generator):
#     print(i, img.shape)

In [14]:
txt_training_generator = torch.utils.data.DataLoader(txt_train, batch_size=2000, shuffle=True)
len(txt_training_generator)

45

In [15]:
# for i, txt in enumerate(txt_training_generator):
#     print(i, len(txt), txt[0].shape, txt[1].shape)

In [16]:
txt_testing_generator = torch.utils.data.DataLoader(txt_test, batch_size=2000, shuffle=True)
len(txt_testing_generator)

15

In [17]:
# for i, txt in enumerate(txt_testing_generator):
#     print(i, len(txt), txt[0].shape, txt[1].shape)

In [18]:
# EOS = 2
# for i, l in enumerate(txt[1]):
#     try:
#         print(torch.where(txt[0][i] == EOS)[0][0] + 1 == l)
#     except:
#         print(txt[0][i])

## Joint Dataloader

In [19]:
class CUB(torch.utils.data.Dataset):
    """Joint dataset that pairs every sentence with its image feature.

    The dataset is indexed by sentence: item ``idx`` is the ``idx``-th entry
    of the sentence dataset together with the image entry at
    ``idx // SENTENCES_PER_IMAGE``.
    """

    # Number of consecutive sentence entries that belong to one image entry
    # (the sentence datasets above hold ten sentences for each image).
    SENTENCES_PER_IMAGE = 10

    def __init__(self, img_data_dir, txt_data_dir, split, device, transform=None, **kwargs):
        """Build the underlying sentence and image-feature datasets.

        Args:
            img_data_dir: directory handed to ``CUBImageFt``.
            txt_data_dir: directory handed to ``CUBSentences``.
            split: 'train' or 'test'.
            device: forwarded to ``CUBImageFt``.
            transform: forwarded to ``CUBSentences``.
            **kwargs: extra keyword arguments forwarded to ``CUBSentences``.
        """
        super().__init__()
        self.CUBtxt = CUBSentences(txt_data_dir, split=split, transform=transform, **kwargs)
        self.CUBimg = CUBImageFt(img_data_dir, split=split, device=device)

    def __len__(self):
        """Return the number of sentences (one item per sentence)."""
        return len(self.CUBtxt)

    def __getitem__(self, idx):
        """Return ``(img, txt)`` for the sentence at position ``idx``."""
        txt = self.CUBtxt[idx]
        # Map the sentence index onto the index of the image it belongs to.
        img = self.CUBimg[idx // self.SENTENCES_PER_IMAGE]
        return img, txt

In [20]:
from joint_dataset import CUB

In [21]:
CUB_train = CUB(RAWDATA_PATH, RAWDATA_PATH, 'train', device, tx)

In [22]:
training_generator = torch.utils.data.DataLoader(CUB_train, batch_size=2000, shuffle=False)
len(training_generator)

45

In [None]:
for i, (img, txt) in enumerate(training_generator):
    print(i, torch.unique(img, dim=0).shape, txt[0].shape)

In [None]:
CUB_test = CUB(RAWDATA_PATH, RAWDATA_PATH, 'test', device, tx)

In [None]:
testing_generator = torch.utils.data.DataLoader(CUB_test, batch_size=2000, shuffle=False)
len(testing_generator)

In [None]:
for i, (img, txt) in enumerate(testing_generator):
    print(i, torch.unique(img, dim=0).shape, txt[0].shape)

## FastText Embedding for Analysis

In [None]:
import json
import os
import pickle

In [None]:
# The following code is adapted from https://github.com/iffsid/mmvae, the repository for the work
# Y. Shi, N. Siddharth, B. Paige and PHS. Torr.
# Variational Mixture-of-Experts Autoencoders for Multi-Modal Deep Generative Models.
# In Proceedings of the 33rd International Conference on Neural Information Processing Systems,
# Page 15718–15729, 2019

In [None]:
from collections import Counter, OrderedDict
class OrderedCounter(Counter, OrderedDict):
    """A ``Counter`` that keeps its keys in first-seen order."""

    def __repr__(self):
        # Render the contents through an OrderedDict so the order is explicit.
        name = self.__class__.__name__
        contents = OrderedDict(self)
        return '%s(%r)' % (name, contents)

    def __reduce__(self):
        # Reconstruct from an OrderedDict snapshot of the current counts.
        snapshot = OrderedDict(self)
        return self.__class__, (snapshot,)

In [None]:
from gensim.models import FastText
from nltk.tokenize import sent_tokenize, word_tokenize

# Read the raw caption file; only the read itself needs the file to be open.
with open(os.path.join(RAWDATA_PATH, 'cub/text_trainvalclasses.txt'), 'r') as file:
    text = file.read()
sentences = sent_tokenize(text)

occ_register = OrderedCounter()  # Token occurrence counts, used to compute the weights
texts = []  # Tokenized sentences, used as the embedding training corpus
for line in sentences:
    words = word_tokenize(line)
    texts.append(words)
    occ_register.update(words)

In [None]:
# FastText embedding: fit a model with 300-dimensional vectors on the
# tokenized sentences in `texts` (vocabulary pass first, then 10 epochs).
# NOTE(review): min_count=3 presumably matches the "oc:3" vocabulary
# directory read in the following cells — confirm against how cub.vocab was built.
model = FastText(vector_size=300, window=3, min_count=3)
model.build_vocab(corpus_iterable=texts)
model.train(corpus_iterable=texts, total_examples=len(texts), epochs=10)

In [None]:
with open(os.path.join(RAWDATA_PATH, 'cub/oc:3_msl:32/cub.vocab'), 'rb') as file:
    vocab = json.load(file)

In [None]:
# Output embedding
# Rows 0-2 are constant vectors filled with -1, 0 and 1 respectively; each
# remaining row is the FastText vector of the matching entry of `i2w`.
i2w = vocab['i2w']
base = np.ones((300,), dtype=np.float32)
special_rows = [base * (i - 1) for i in range(3)]
word_rows = [model.wv[word] for word in list(i2w.values())[3:]]
emb = np.array(special_rows + word_rows)

emb_path = os.path.join(RAWDATA_PATH, 'cub/oc:3_msl:32/cub.emb')
with open(emb_path, 'wb') as file:
    pickle.dump(emb, file)

In [None]:
# Output weights
# Each vocabulary word gets the weight a / (a + relative frequency).
# Occurrences of words missing from `w2i` are pooled and weighted at index 0.
a = 1e-3
w2i = vocab['w2i']
weights = np.zeros(len(w2i))
total_occ = sum(occ_register.values())
exc_occ = 0
for w, occ in occ_register.items():
    if w not in w2i:
        # Not in the vocabulary: add to the pooled count.
        exc_occ += occ
        continue
    weights[w2i[w]] = a / (a + occ / total_occ)
weights[0] = a / (a + exc_occ / total_occ)

weights_path = os.path.join(RAWDATA_PATH, 'cub/oc:3_msl:32/cub.weights')
with open(weights_path, 'wb') as file:
    pickle.dump(weights, file)

In [None]:
# Reload both saved artifacts to check that they round-trip.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open(os.path.join(RAWDATA_PATH, 'cub/oc:3_msl:32/cub.emb'), 'rb') as file:
    emb = pickle.load(file)
# The weights go into `weights`; assigning them to `emb` would overwrite the embedding.
with open(os.path.join(RAWDATA_PATH, 'cub/oc:3_msl:32/cub.weights'), 'rb') as file:
    weights = pickle.load(file)

In [None]:
train_loader = torch.utils.data.DataLoader(txt_train, batch_size=2000)

In [None]:
for data in train_loader:
    print(data[0][0])
    break