In [1]:
import torch
import os

import pandas as pd
from PIL import Image

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

In [2]:
class Dictionary(object):
    """Bidirectional vocabulary mapping words to integer ids and back.

    The ``'<pad>'`` token is registered on construction, so it always has
    id 0.

    Attributes:
        idx2word: list where position ``i`` holds the word with id ``i``.
        word2idx: dict mapping each known word to its id.
    """

    def __init__(self):
        self.idx2word = []
        self.word2idx = {}
        self.add_word('<pad>')

    def add_word(self, word):
        """Register ``word`` if it is unseen and return its id.

        Args:
            word: the token to look up or insert.

        Returns:
            The integer id of ``word``. Ids are assigned in first-seen order.
        """
        # Test membership against the dict (O(1)) rather than the list (O(n));
        # both containers always hold exactly the same set of words.
        if word not in self.word2idx:
            self.word2idx[word] = len(self.idx2word)
            self.idx2word.append(word)
        return self.word2idx[word]

    def __len__(self):
        """Return the number of distinct words, ``'<pad>'`` included."""
        return len(self.idx2word)

class Corpus(Dataset):
    """Caption/image dataset that serves whole batches per index.

    ``caption_path`` must contain ``cleaned_captions.csv`` (column 0: image
    file name, column 1: caption text) and a ``Flickr8k_Dataset`` directory
    holding the image files.

    Note that ``__getitem__(idx)`` returns the ``idx``-th *batch* of ``bsz``
    samples, whereas ``__len__`` returns the number of individual samples.
    Use ``num_batches`` as the upper bound when looping over full batches.

    Args:
        caption_path: directory containing the CSV and the image folder.
        bsz: number of samples per batch.
        transform: callable turning a PIL image into a tensor; defaults to
            ``transforms.ToTensor()``.
    """

    def __init__(self, caption_path, bsz = 20, transform = None):
        self.caption_path = caption_path
        # Honour the caller's transform; fall back to plain ToTensor().
        self.transform = transform if transform is not None else transforms.ToTensor()

        self.dictionary = Dictionary()
        self.captions = self.tokenize()
        self.keys = list(self.captions)
        self.bsz = bsz
        self.num_batches = self.__len__() // self.bsz

    def tokenize(self):
        """Read the caption CSV, grow the vocabulary and encode every caption.

        Each caption is lower-cased, split on whitespace and terminated with
        ``'<eos>'``. ``'<pad>'`` is already registered (id 0) by
        ``Dictionary.__init__``.

        Returns:
            dict mapping image file name -> list of token ids. If a file name
            occurs on several rows, the last row's caption is kept.
        """
        captions_csv = pd.read_csv(os.path.join(self.caption_path, 'cleaned_captions.csv'))

        path2cap = {}
        # Single column-wise pass: add_word() returns the id whether or not the
        # word is new, and ids are assigned in first-seen order, so the result
        # matches a separate build-vocabulary-then-encode sweep.
        for path, line in zip(captions_csv.iloc[:, 0], captions_csv.iloc[:, 1]):
            words = line.lower().split() + ['<eos>']
            path2cap[path] = [self.dictionary.add_word(word) for word in words]

        return path2cap

    def __len__(self):
        """Return the number of captioned images (samples, not batches)."""
        return len(self.captions)

    def __getitem__(self, idx):
        """Return the ``idx``-th batch.

        Args:
            idx: zero-based batch index.

        Returns:
            ``(captions, img_tensor)`` where ``captions`` is the batch's
            zero-padded token ids flattened to 1-D (float tensor, as produced
            by ``torch.Tensor``; cast to long before using as class targets)
            and ``img_tensor`` stacks one transformed image per sample
            (``[n, 3, 300, 300]`` with the default transform).

        Raises:
            IndexError: if ``idx`` selects no samples.
        """
        img_paths = self.keys[idx * self.bsz : (idx + 1) * self.bsz]
        if not img_paths:
            # Fail with the conventional sequence error instead of an opaque
            # failure inside pad_sequence on an empty list.
            raise IndexError('batch index {} out of range'.format(idx))

        batch_captions = [torch.Tensor(self.captions[path]) for path in img_paths]

        # Pad with 0, which is the id of '<pad>'.
        padded_captions = torch.nn.utils.rnn.pad_sequence(batch_captions, batch_first = True)
        captions = padded_captions.view(-1)

        img_dir = os.path.join(self.caption_path, 'Flickr8k_Dataset')
        images = []
        for path in img_paths:
            # The context manager closes the file; convert('RGB') guarantees
            # three channels even for grayscale or RGBA files.
            with Image.open(os.path.join(img_dir, path)) as raw:
                img = raw.convert('RGB').resize((300, 300))
            images.append(self.transform(img))

        # One stack at the end instead of repeated torch.cat (quadratic copying).
        img_tensor = torch.stack(images, 0)

        return captions, img_tensor


In [3]:
# Build the dataset (captions are tokenized on construction) from ./data.
corpus = Corpus(caption_path = 'data')
# Vocabulary size: number of distinct tokens, '<pad>' included.
ntokens = len(corpus.dictionary)

In [4]:
class RNN(nn.Module):
    """Small CNN front-end feeding an embedding + LSTM caption decoder.

    The CNN reduces each ``[3, 300, 300]`` image to 64 values, which are
    truncated to integers and used as a 64-step sequence of token ids for the
    embedding/LSTM stack.

    NOTE(review): the float -> integer cast is not differentiable, so the CNN
    receives no gradient from the caption loss, and a feature value
    >= ``ntokens`` would be an out-of-range embedding index.

    Args:
        ntokens: vocabulary size (embedding rows and output classes).
        ninp: embedding dimension.
        nhid: LSTM hidden size.
        nlayers: number of stacked LSTM layers.
        dropout: dropout probability for the dropout layer and the LSTM.
    """

    def __init__(self, ntokens, ninp, nhid, nlayers, dropout = 0.5):
        super(RNN, self).__init__()

        self.ntokens = ntokens
        self.nhid = nhid
        self.nlayers = nlayers

        self.drop = nn.Dropout(dropout)

        # CNN SECTION (sizes assume 300x300 inputs)
        self.conv1 = nn.Conv2d(3, 16, 3) # Size [n, 16, 298, 298]
        self.pool = nn.MaxPool2d(2) # Size [n, 16, 149, 149]
        self.conv2 = nn.Conv2d(16, 16, 5) # Size [n, 16, 72, 72] (after pool)
        self.fc1 = nn.Linear(16 * 72 * 72, 64) # Size [n, 64]

        # RNN SECTION
        self.encode = nn.Embedding(ntokens, ninp)
        self.lstm = nn.LSTM(ninp, nhid, nlayers, dropout = dropout)
        self.decode = nn.Linear(nhid, ntokens)

    def forward(self, input, h):
        """Run the image batch through the CNN and the LSTM decoder.

        Args:
            input: image tensor of shape ``[bsz, 3, 300, 300]``.
            h: LSTM state ``(h_0, c_0)``, e.g. from ``init_hidden(bsz)``.

        Returns:
            ``(log_probs, hidden)`` where ``log_probs`` has shape
            ``[64 * bsz, ntokens]`` (sequence-major: row ``t * bsz + b`` is
            step ``t`` of sample ``b``) and ``hidden`` is the final LSTM state.
        """
        # CNN
        img = self.pool(F.relu(self.conv1(input)))
        img = self.pool(F.relu(self.conv2(img)))
        img = img.view(img.size(0), -1)
        # .long() keeps the tensor on its current device, unlike
        # .type(torch.LongTensor) which always yields a CPU tensor.
        img_info = F.relu(self.fc1(img)).long()

        # Reshaping for RNN: the LSTM expects [seq_len, bsz]. This has to be a
        # transpose: view(64, -1) on a [bsz, 64] tensor only reinterprets the
        # memory and would interleave different samples whenever bsz > 1.
        img_info = img_info.t().contiguous()

        # RNN
        emb = self.drop(self.encode(img_info))
        out, hidden = self.lstm(emb, h)
        out = self.drop(out)

        decoded = self.decode(out)
        decoded = decoded.view(-1, self.ntokens)

        return F.log_softmax(decoded, dim = 1), hidden

    def init_hidden(self, bsz):
        """Return a zeroed ``(h_0, c_0)`` LSTM state for a batch of ``bsz``.

        The tensors are created with the dtype and device of the model's
        parameters, so the state matches the model wherever it lives.
        """
        weight = next(self.parameters())
        return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                weight.new_zeros(self.nlayers, bsz, self.nhid))


In [5]:
# To start from an untrained model instead of the saved checkpoint loaded below, uncomment:
#model = RNN(ntokens, 200, 200, 2)

In [6]:
# SECURITY: torch.load unpickles the file, which can execute arbitrary code.
# Only load checkpoints from a trusted source.
# The checkpoint holds a whole RNN module, so the RNN class must already be
# defined when this cell runs.
# NOTE(review): on PyTorch releases where torch.load defaults to
# weights_only=True, a full-module checkpoint like this one needs
# weights_only=False passed explicitly - confirm against the installed version.
with open('model.pt', 'rb') as f:
    # map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only
    # machine; the rest of the notebook runs on the CPU.
    model = torch.load(f, map_location = 'cpu')

In [7]:
# Switch the loaded model to evaluation mode. eval() returns the module itself,
# so as the cell's last expression its layer summary is displayed.
model.eval()

RNN(
  (drop): Dropout(p=0.5, inplace=False)
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(16, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=82944, out_features=64, bias=True)
  (encode): Embedding(4528, 200)
  (lstm): LSTM(200, 200, num_layers=2, dropout=0.5)
  (decode): Linear(in_features=200, out_features=4528, bias=True)
)

In [9]:
def get_img(path, img_dir = 'data/Flickr8k_Dataset', size = (300, 300)):
    """Load one image as a single-sample batch tensor.

    Args:
        path: image file name, relative to ``img_dir``.
        img_dir: directory holding the image files.
        size: ``(width, height)`` the image is resized to.

    Returns:
        Float tensor of shape ``[1, 3, height, width]`` produced by
        ``transforms.ToTensor()``.
    """
    transform = transforms.ToTensor()

    # The context manager closes the file; convert('RGB') guarantees three
    # channels even for grayscale or RGBA files.
    with Image.open(os.path.join(img_dir, path)) as raw:
        img = raw.convert('RGB').resize(size)

    # Add the leading batch dimension.
    return transform(img).unsqueeze(0)

In [69]:
def generate(img_name = '23445819_3a458716c1.jpg', n_words = 10, temperature = 1.0):
    """Sample words from the model's output for one image and print them.

    Relies on the notebook-level ``model``, ``corpus`` and ``get_img``.
    Prints the output shapes, a list of ``n_words`` sampled words (one per
    output step, taken from the first steps) and one additional,
    independently sampled word for the first step.

    Args:
        img_name: image file name passed to ``get_img``.
        n_words: how many leading output steps to turn into words.
        temperature: divisor applied to the log-probabilities before
            exponentiation; must be positive. 1.0 leaves them unchanged.

    Raises:
        ValueError: if ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    model.eval()
    hidden = model.init_hidden(1)

    img = get_img(img_name)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        out, _ = model(img, hidden)
    print(out.shape)

    word_weights = out.squeeze().div(temperature).exp().cpu()
    print(word_weights.shape)

    # One draw per output step (row); keep the first n_words steps.
    word_idx = torch.multinomial(word_weights, 1)[0:n_words]
    # Separate, independent draw; only the first step's word is used.
    word_idx_og = torch.multinomial(word_weights, 1)[0]

    print(word_idx.shape)
    sen = [corpus.dictionary.idx2word[i.item()] for i in word_idx]
    print(sen)
    # .item() yields a plain int instead of indexing the list with a tensor.
    print(corpus.dictionary.idx2word[word_idx_og.item()])

In [70]:
# Run one sampling pass. The words are drawn with torch.multinomial and no seed
# is set in this notebook, so the printed words differ between runs.
generate()

torch.Size([64, 4528])
torch.Size([64, 4528])
torch.Size([10, 1])
['lavish', 'in', 'a', 'fields', 'with', '<eos>', 'building', 'are', 'young', ',']
water


In [47]:
# Scratch check: a 10x1 tensor has the same shape as the sampled word indices.
x = torch.ones((10, 1))
print(x.size())

torch.Size([10, 1])


In [48]:
# Iterating a 2-D tensor walks its first dimension: each step yields one
# 1-element row, which is why generate() calls .item() on every entry.
for i in x.unbind(0):
    print(i)

tensor([1.])
tensor([1.])
tensor([1.])
tensor([1.])
tensor([1.])
tensor([1.])
tensor([1.])
tensor([1.])
tensor([1.])
tensor([1.])
