# Image caption generator

## Part 1 - seq2seq autoencoder

In [61]:
from fastai.text import *
import pandas as pd
from pathlib import Path

In [63]:
# Root data directory for this notebook; created if it does not exist yet.
PATH = Path('data/')
PATH.mkdir(exist_ok=True)

In [64]:
# Sub-directory holding the artefacts of the seq2seq part (pickled tokens, word vectors).
# NOTE(review): 'seq2deq_model' looks like a typo for 'seq2seq_model'; the name is left
# untouched because files may already exist under it.
seq_PATH = PATH/'seq2deq_model'
seq_PATH.mkdir(exist_ok=True)

In [7]:
# Load the validation and training tables. The files have no header row (header=None),
# so pandas labels the columns 0 and 1. In the validation preview column 0 is the caption
# and column 1 the image URL; the training file is assumed to share that layout.
captions_and_links_valid = pd.read_csv("Validation_GCC-1.1.0-Validation.tsv", sep="\t",header=None)
captions_and_links_train = pd.read_csv("Train_GCC-training.tsv", sep="\t",header=None)

In [8]:
# Row counts of the validation and training tables.
len(captions_and_links_valid), len(captions_and_links_train)

(15840, 3318333)

In [9]:
# Preview the first rows of the validation table.
captions_and_links_valid.head()

Unnamed: 0,0,1
0,author : a life in photography -- in pictures,https://i.pinimg.com/736x/66/01/6c/66016c3ba27...
1,an angler fishes river on a snowy day .,http://www.standard.net/image/2015/02/04/800x_...
2,photograph of the sign being repaired by brave...,http://indianapolis-photos.funcityfinder.com/f...
3,the player staring intently at a computer scre...,http://www.abc.net.au/news/image/9066492-3x2-7...
4,globes : the green 3d person carrying in hands...,https://www.featurepics.com/StockImage/2009031...


In [30]:
# Keep only column 0 (the caption text) as a NumPy array; the URL column is not used here.
captions_valid = captions_and_links_valid[0].values
captions_train = captions_and_links_train[0].values

In [32]:
# Number of training and validation captions.
len(captions_train), len(captions_valid)

(3318333, 15840)

### Tokenization

In [33]:
# Tokenizer with default settings (presumably fastai.text's Tokenizer, star-imported above).
tokenizer = Tokenizer()

In [34]:
# Tokenize every validation caption; the result holds one token sequence per caption.
tokens_captions_valid = tokenizer.process_all(captions_valid)

In [36]:
# Tokenize every training caption; the result holds one token sequence per caption.
tokens_captions_train = tokenizer.process_all(captions_train)

**Let's make sure everything went right:**

In [40]:
# Spot-check one arbitrary caption: the raw text followed by its tokens re-joined with spaces.
print(captions_train[777], ' '.join(tokens_captions_train[777]))

people click into their skis . people click into their skis .


### Length of captions
#### Average length of captions

In [45]:
# Mean number of tokens per validation caption.
np.mean(list(map(len, tokens_captions_valid)))

10.415467171717172

In [46]:
# Mean number of tokens per training caption.
np.mean(list(map(len, tokens_captions_train)))

10.322056888202601

#### Discard captions that are too long

In [50]:
# Keep only validation captions shorter than 30 tokens.
# The captions are filtered in plain Python first and the array is built with dtype=object:
# the token lists have different lengths, and recent NumPy versions raise ValueError when
# asked to infer an array from such ragged input without an explicit object dtype.
tokens_captions_valid = np.array([c for c in tokens_captions_valid if len(c) < 30], dtype=object)

In [52]:
# Longest remaining validation caption, in tokens.
np.max(list(map(len, tokens_captions_valid)))

29

In [53]:
# Keep only training captions shorter than 30 tokens.
# The captions are filtered in plain Python first and the array is built with dtype=object:
# the token lists have different lengths, and recent NumPy versions raise ValueError when
# asked to infer an array from such ragged input without an explicit object dtype.
tokens_captions_train = np.array([c for c in tokens_captions_train if len(c) < 30], dtype=object)

In [55]:
# Longest remaining training caption, in tokens.
np.max(list(map(len, tokens_captions_train)))

29

In [65]:
# Cache the filtered validation tokens on disk so they can be reloaded later.
# The `with` block guarantees the file is flushed and closed once the dump finishes.
with (seq_PATH/'tok_cap_v.pkl').open('wb') as f:
    pickle.dump(tokens_captions_valid, f)

In [66]:
# Cache the filtered training tokens on disk so they can be reloaded later.
# The `with` block guarantees the file is flushed and closed once the dump finishes.
with (seq_PATH/'tok_cap_t.pkl').open('wb') as f:
    pickle.dump(tokens_captions_train, f)

In [68]:
# Reload the cached validation tokens; the `with` block closes the file handle afterwards.
# NOTE: pickle.load can execute arbitrary code, so only load caches written by this notebook.
with (seq_PATH/'tok_cap_v.pkl').open('rb') as f:
    tokens_captions_valid = pickle.load(f)

In [67]:
# Reload the cached training tokens; the `with` block closes the file handle afterwards.
# NOTE: pickle.load can execute arbitrary code, so only load caches written by this notebook.
with (seq_PATH/'tok_cap_t.pkl').open('rb') as f:
    tokens_captions_train = pickle.load(f)

In [70]:
# Number of training and validation captions left after the length filter.
print(len(tokens_captions_train), len(tokens_captions_valid))

3302516 15764


### Numericalization

In [71]:
# Build the vocabulary from the training captions only.
# NOTE(review): max_vocab / min_freq presumably cap the vocabulary at 50,000 tokens and drop
# tokens seen fewer than 2 times — confirm against the fastai Vocab.create documentation.
vocab = Vocab.create(tokens_captions_train, max_vocab=50000, min_freq=2)

**Let's quickly test this:**

In [96]:
# Arbitrary validation caption used for the round-trip check in the following cells.
idx = 12575

In [97]:
# Pick the test caption and show its tokens as one space-separated string.
test_caption = tokens_captions_valid[idx]
print(' '.join(test_caption))

actor attending the world premiere of crime fiction film


In [98]:
# Map the tokens of the test caption to their integer ids in the vocabulary.
test_caption_num = vocab.numericalize(test_caption)
print(test_caption_num)

[31, 1841, 8, 108, 43, 11, 2274, 1018, 76]


In [99]:
# Map the ids back to text; this should reproduce the caption printed above.
print(vocab.textify(test_caption_num))

actor attending the world premiere of crime fiction film


### Word vectors

In [None]:
#! pip install git+https://github.com/facebookresearch/fastText.git

In [108]:
import fastText as ft

In [109]:
# Load the fastText model stored as 'wiki.en.bin' inside seq_PATH
# (presumably the pre-trained English Wikipedia vectors — verify the file's origin).
en_vecs = ft.load_model(str(seq_PATH/'wiki.en.bin'))

In [110]:
# Map every word known to the fastText model to its word vector.
vec_dict = {w : en_vecs.get_word_vector(w) for w in en_vecs.get_words()}

In [113]:
# Cache the word -> vector mapping on disk so it can be reloaded later.
# The `with` block guarantees the file is flushed and closed once the dump finishes.
with open(seq_PATH/'vec_dict.pkl', 'wb') as f:
    pickle.dump(vec_dict, f)

In [114]:
# Reload the cached word -> vector mapping; the `with` block closes the file handle afterwards.
# NOTE: pickle.load can execute arbitrary code, so only load caches written by this notebook.
with open(seq_PATH/'vec_dict.pkl', 'rb') as f:
    vec_dict = pickle.load(f)

**Let's take a look at the most frequent words from fastText:**

In [115]:
# Fetch the fastText vocabulary together with per-word frequencies; the result is unpacked
# into (word, frequency) pairs in the next cell.
# NOTE(review): the name ft_words is reassigned to a sorted word list further down, so
# re-running these cells out of order will break — consider distinct names.
ft_words = en_vecs.get_words(include_freq=True)

In [117]:
# Pair each word with its frequency: word -> frequency.
ft_word_dict = dict(zip(*ft_words))

In [118]:
# Order the words by ascending frequency (the most frequent words end up last).
ft_words = sorted(ft_word_dict, key=ft_word_dict.get)

10 most frequent words:

In [123]:
# The list is sorted by ascending frequency, so the last 10 entries are the most frequent words.
ft_words[-10:]

[')', "'", 'and', 'in', '-', 'of', '</s>', 'the', '.', ',']

**Mean and standard deviation of the word vectors:**

In [124]:
# Stack all word vectors into one array along a new first axis
# (one row per word, assuming every vector is 1-D with the same length).
vecs = np.stack(list(vec_dict.values()))

In [125]:
# Mean and standard deviation taken over every entry of the stacked vectors.
vecs.mean(), vecs.std()

(0.0075652334, 0.29283327)

### Dataset

In [129]:
def A(*a):
    """Convert the positional arguments to NumPy arrays.

    A single argument yields one ``np.ndarray``. Any other number of
    arguments (including none) yields a list with one array per argument.
    """
    if len(a) == 1:
        return np.array(a[0])
    return list(map(np.array, a))

In [130]:
class Caption2CaptionDataset(Dataset):
    """Dataset that returns each sample twice, as a pair of NumPy arrays.

    Presumably the two copies serve as source and target of the caption
    autoencoder.
    """

    def __init__(self, x):
        # x: indexable, sized collection of samples
        self.x = x

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        # A() converts each argument into its own array, so the two halves are separate objects.
        sample = self.x[idx]
        return A(sample, sample)