In [1]:
import pandas as pd

# The captions file is pipe-delimited, hence the explicit ``sep='|'``.
df = pd.read_csv('results.csv', sep='|')

# Preview the first rows to confirm the columns parsed as expected.
# NOTE(review): the preview shows an 'Unnamed: 0' column -- presumably an index
# that was written into the CSV; confirm whether ``index_col=0`` is wanted.
df.head()

Unnamed: 0,image_name,comment_number,comment
0,10002456.jpg,0,Several men in hard hats are operating a gian...
1,10002456.jpg,1,Workers look down from up above on a piece of...
2,10002456.jpg,2,Two men working on a machine wearing hard hats .
3,10002456.jpg,3,Four men on top of a tall structure .
4,10002456.jpg,4,Three men on a large rig .


In [2]:
# All captions as a plain Python list. The column name really is ' comment',
# with a leading space, and the caption strings carry a leading space as well.
df[' comment'].tolist()

[' Several men in hard hats are operating a giant pulley system .',
 ' Workers look down from up above on a piece of equipment .',
 ' Two men working on a machine wearing hard hats .',
 ' Four men on top of a tall structure .',
 ' Three men on a large rig .']

In [3]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from typing import Iterable, List


# Tokenizer applied to every caption. As the sample call below shows, the
# "basic_english" tokenizer lowercases text and splits punctuation into
# separate tokens.
token_transform = get_tokenizer("basic_english")

In [4]:
# Sanity-check the tokenizer on a sample sentence and display the result.
tokens = token_transform("You can now install TorchText using pip!")
tokens

['you', 'can', 'now', 'install', 'torchtext', 'using', 'pip', '!']

In [5]:
# helper function to yield list of tokens
def yield_tokens(list_of_captions: Iterable):
    """Lazily tokenize captions, producing one token list per caption.

    Each element of ``list_of_captions`` is passed, in order, through the
    module-level ``token_transform`` tokenizer.
    """
    yield from map(token_transform, list_of_captions)


# Special vocabulary entries, listed in the order of the indices they must occupy:
#   <unk> - stands in for any token that is missing from the vocabulary
#   <pad> - fills out sequences shorter than the longest one in a batch
#   <bos> - marks the beginning of a sentence
#   <eos> - marks the end of a sentence
special_symbols = ["<unk>", "<pad>", "<bos>", "<eos>"]
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = range(4)

# Build torchtext's Vocab from the tokenized captions. With ``special_first=True``
# the special symbols are inserted ahead of the caption tokens, so they receive
# indices 0-3 in the order given above.
vocab_transform = build_vocab_from_iterator(
    yield_tokens(df[' comment'].tolist()),
    specials=special_symbols,
    special_first=True,
    min_freq=1,
)

# Tokens absent from the vocabulary resolve to ``UNK_IDX``; without a default
# index such lookups raise ``RuntimeError``.
vocab_transform.set_default_index(UNK_IDX)

In [6]:
# Show the index -> token list, then look up a word that is not in the
# vocabulary: it falls back to the default index set above (UNK_IDX, i.e. 0).
vocab_transform.get_itos(), vocab_transform["deependu"]

(['<unk>',
  '<pad>',
  '<bos>',
  '<eos>',
  '.',
  'a',
  'men',
  'on',
  'hard',
  'hats',
  'of',
  'above',
  'are',
  'down',
  'equipment',
  'four',
  'from',
  'giant',
  'in',
  'large',
  'look',
  'machine',
  'operating',
  'piece',
  'pulley',
  'rig',
  'several',
  'structure',
  'system',
  'tall',
  'three',
  'top',
  'two',
  'up',
  'wearing',
  'workers',
  'working'],
 0)

In [7]:
import torch

def generate_square_subsequent_mask(sz, device="cpu"):
    """Build a square causal (look-ahead) mask for a sequence of length ``sz``.

    Entry ``[i, j]`` is ``0.0`` when ``j <= i`` and ``-inf`` when ``j > i``,
    i.e. zeros on and below the main diagonal and ``-inf`` strictly above it.

    Args:
        sz: Sequence length; the mask has shape ``(sz, sz)``.
        device: Device on which the mask is allocated. Defaults to ``"cpu"``,
            the value that used to be hard-coded, so existing callers are
            unaffected.

    Returns:
        A ``torch.float32`` tensor of shape ``(sz, sz)``.
    """
    # Start from an all ``-inf`` matrix and keep only the strictly upper
    # triangle; ``triu`` writes zeros everywhere else (on and below the diagonal).
    return torch.triu(
        torch.full((sz, sz), float("-inf"), dtype=torch.float32, device=device),
        diagonal=1,
    )

In [8]:
def create_mask(src, tgt):
    """Build the decoder-side masks for a batch of target sequences.

    Args:
        src: Accepted for call-site compatibility but not used. No source mask
            or source padding mask is built because the encoder is a vision
            transformer.
        tgt: Target token-index tensor. Dimension 0 is treated as the sequence
            length; presumably laid out as (tgt_seq_len, batch) -- TODO confirm.

    Returns:
        A ``(tgt_mask, tgt_padding_mask)`` tuple:

        * ``tgt_mask`` -- the ``(tgt_seq_len, tgt_seq_len)`` float mask from
          ``generate_square_subsequent_mask``, placed on ``tgt``'s device.
        * ``tgt_padding_mask`` -- boolean tensor that is ``True`` where
          ``tgt == PAD_IDX``, with dimensions 0 and 1 of ``tgt`` swapped.
    """
    tgt_seq_len = tgt.shape[0]

    # The helper allocates on the CPU by default; move the result next to
    # ``tgt`` so both returned masks live on the same device. This is a no-op
    # when ``tgt`` is already on the CPU.
    tgt_mask = generate_square_subsequent_mask(tgt_seq_len).to(tgt.device)

    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return tgt_mask, tgt_padding_mask

In [15]:
# Two random 10x5 stand-in tensors for trying out ``create_mask``.
a, b = torch.randn(10, 5), torch.randn(10, 5)

# Overwrite part of row 2 with 1 (the value of PAD_IDX) so the padding mask
# has some True entries to show.
b[2, 1:] = 1

a.shape, a.size(0)

(torch.Size([10, 5]), 10)

In [16]:
# Display both masks for the demo tensors: a 10x10 float mask with -inf above
# the diagonal, and a boolean padding mask that is True where ``b`` equals PAD_IDX.
create_mask(a, b)

(tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf],
         [0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf],
         [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., -inf],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 tensor([[False, False, False, False, False, False, False, False, False, False],
         [False, False,  True, False, False, False, False, False, False, False],
         [False, False,  True, False, False, False, False, False, False, False],
         [False, False,  True, False, False, False, False, False, False, False],
         [False, False,  True, False, False, False, False, False, False, F