In [1]:
import numpy as np
import pandas as pd
import chainer

### pytorch packages
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

In [2]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

### PTB Data
#### Question 1: Should the text be broken down to samples split on "eos" token?
YES
#### Question 2: Should the text be prepended with a sos token?
YES

In [3]:
# unpacking the data from chainer: get_ptb_words() yields the train / validation / test
# splits, each as one flat array of word indices (see the shapes printed in the next cell)
train, val, test = chainer.datasets.get_ptb_words()

In [4]:
# the data is already separated into a numpy array
# (each split is a single 1-D stream of word ids; sentences are not separated yet)
print(f"train data: {type(train)} {train.shape} {train}")
print(f"val data: {type(val)} {val.shape} {val}")
print(f"test data: {type(test)} {test.shape} {test}")

train data: <class 'numpy.ndarray'> (929589,) [ 0  1  2 ... 39 26 24]
val data: <class 'numpy.ndarray'> (73760,) [2211  396 1129 ...  108   27   24]
test data: <class 'numpy.ndarray'> (82430,) [142  78  54 ...  87 214  24]


In [5]:
# vocab2idx is a dictionary mapping words (key) to idx (value)
# (copied so that adding the special tokens never touches the object returned by chainer)
vocab2idx = dict(chainer.datasets.get_ptb_words_vocabulary())

# The special tokens take the next free indices right after the real vocabulary,
# so they cannot collide with an existing word id whatever the vocabulary size is.
#NOTE: with the 10,000-word PTB vocabulary this gives PAD = 10000, <sos> = 10001, <m> = 10002
vocab2idx.update(zip(('PAD', '<sos>', '<m>'), range(len(vocab2idx), len(vocab2idx) + 3)))

#creating a reverse dict to turn an index back into word for sanity check
idx2vocab = {v:k for k, v in vocab2idx.items()}
# every token must own a distinct index, otherwise the reverse lookup silently loses words
assert len(idx2vocab) == len(vocab2idx), "two tokens share the same index"
print(f"Number of vocabulary: {len(vocab2idx)}")

Number of vocabulary: 10003


In [6]:
def split_sentence(data, sos_idx=None, eos_idx=None):
    """
    This function splits the text data into individual sentences split on the <eos> token
    and prepends the <sos> token in the front.

    Parameters
    ----------
    data : iterable of int
        Flat stream of token indices.
    sos_idx : int, optional
        Index prepended to every sentence. Defaults to vocab2idx['<sos>'].
    eos_idx : int, optional
        Index that terminates a sentence. Defaults to vocab2idx['<eos>'].

    Returns
    -------
    list of list
        One list per sentence, laid out as [<sos>, token, ..., <eos>].
        Trailing tokens that are not terminated by an <eos> are dropped.
    """
    # fall back to the notebook-level vocabulary only when the caller gave no explicit ids
    if sos_idx is None:
        sos_idx = vocab2idx['<sos>']
    if eos_idx is None:
        eos_idx = vocab2idx['<eos>']

    samples, sentence = [], [sos_idx]
    for idx in data:
        sentence.append(idx)  # every token, the <eos> included, belongs to the current sentence
        if idx == eos_idx:
            samples.append(sentence)
            sentence = [sos_idx]  # start the next sentence
    return samples

In [7]:
#splitting each sequence as an individual sample (one list of token ids per sentence)
train_samples, val_samples, test_samples = (
    split_sentence(token_stream) for token_stream in (train, val, test)
)

In [8]:
#train_samples sequence: map the ids of one training sentence back to words as a sanity check
sentence = [idx2vocab[idx] for idx in train_samples[5]]
print(' '.join(sentence))

<sos> the asbestos fiber <unk> is unusually <unk> once it enters the <unk> with even brief exposures to it causing symptoms that show up decades later researchers said <eos>


In [9]:
#val_samples sequence
# (decode one validation sentence back to words to check the <sos> ... <eos> framing)
sentence = [idx2vocab[idx] for idx in val_samples[5]]
print(' '.join(sentence))

<sos> eventually viewers may grow <unk> with the technology and <unk> the cost <eos>


In [10]:
#test_samples sequence
# (decode one test sentence back to words to check the <sos> ... <eos> framing)
sentence = [idx2vocab[idx] for idx in test_samples[5]]
print(' '.join(sentence))

<sos> heavy selling of standard & poor 's 500-stock index futures in chicago <unk> beat stocks downward <eos>


In [11]:
def mask_tokens(sentence, mask_prob, sub_prob, mask_idx=None, protected=None):
    """
    Return a copy of `sentence` in which some tokens are replaced by the mask token.

    Each token is masked independently with probability `mask_prob`, but never more than
    int(len(sentence) * sub_prob) tokens in total; once that budget is spent the rest of
    the sentence is copied unchanged.

    Parameters
    ----------
    sentence : list of int
        Token indices of one sentence.
    mask_prob : float
        Per-token probability of being masked.
    sub_prob : float
        Maximum fraction of the sentence that may be masked.
    mask_idx : int, optional
        Index written in place of a masked token. Defaults to vocab2idx['<m>'].
    protected : collection of int, optional
        Indices that are never masked. Defaults to the <sos> and <eos> indices of vocab2idx.

    Returns
    -------
    list of int
        New list with the same length as `sentence`.
    """
    if mask_idx is None:
        mask_idx = vocab2idx['<m>']
    if protected is None:
        protected = (vocab2idx['<sos>'], vocab2idx['<eos>'])

    budget = int(len(sentence) * sub_prob)
    masked = []
    for pos, token in enumerate(sentence):
        # Check the budget BEFORE masking: a zero budget (short sentence) must mask nothing,
        # instead of going negative and disabling the cap altogether.
        if budget <= 0:
            masked.extend(sentence[pos:])
            break
        if np.random.uniform() < mask_prob and token not in protected:
            masked.append(mask_idx)
            budget -= 1
        else:
            masked.append(token)
    return masked

In [12]:
# sanity check that the masking worked
# (tokens are replaced one-for-one, so the masked sentence must keep the original length)
len(mask_tokens(train_samples[0], .2, .2)), len(train_samples[0])

(26, 26)

In [13]:
class PTBDataset(Dataset):
    """
    Penn Tree Bank sentences paired with a masked copy of each sentence.

    The masked copies are created once, at construction time, by calling mask_tokens
    on every sentence with the given mask_prob and sub_prob.
    """
    def __init__(self, X, mask_prob, sub_prob):
        self.X = X
        masked_sentences = []
        for tokens in X:
            masked_sentences.append(mask_tokens(tokens, mask_prob, sub_prob))
        self.masked_X = masked_sentences

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the (original, masked) sentence at position idx as two LongTensors."""
        original, masked = self.X[idx], self.masked_X[idx]
        return torch.LongTensor(original), torch.LongTensor(masked)

In [14]:
# building the three datasets: per-token mask probability .2, at most int(.2 * length) masks per sentence
train_ds, val_ds, test_ds = [
    PTBDataset(samples, .2, .2) for samples in (train_samples, val_samples, test_samples)
]

In [15]:
def collate(batch):
    """
    Pad a batch of (original, masked) tensor pairs coming from the Dataset to a common length.

    Returns a 3-tuple: the padded masked batch, the padded original batch and the list of
    unpadded sequence lengths. Both padded tensors are batch-first and use 10000 as the
    padding value.
    """
    originals, masked = zip(*batch)
    lengths = list(map(len, masked))
    pad_kwargs = dict(batch_first=True, padding_value=10000)
    padded_masked = pad_sequence(masked, **pad_kwargs)
    padded_originals = pad_sequence(originals, **pad_kwargs)
    return padded_masked, padded_originals, lengths

In [16]:
batch_size = 3
# collate pads every batch to its longest sentence and also returns the unpadded lengths;
# only the training loader shuffles, validation and test keep the dataset order
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate)
valid_dl = DataLoader(val_ds, batch_size=batch_size, collate_fn=collate)
test_dl = DataLoader(test_ds, batch_size=batch_size, collate_fn=collate)

### Generator
#### How does a bidirectional encoder fit into all of this?
#### How to pass bidirectional encoder vectors into a unidirectional decoder?
* The bidirectional encoder produces two final hidden states (one per direction), each of size hidden_dimension (encoder). I have to concatenate the two into a single vector of size 2*hidden_dimension. The cell state isn't really needed per Yannet but could still be used. The decoder hidden dimension therefore needs to match the concatenated encoder state, i.e. 2*hidden_dimension of the encoder (e.g. encoder 10 -> decoder 20). The concatenation should be done in the Seq2Seq.
   
#### Figure out how to do the masking for each sentence. How many tokens do we mask? Do they have to be in sequential order?
* This was done by setting a probability p of masking each token. The maximum number of masked tokens is also capped at a proportion of the sentence length. As a result, the masked tokens are not necessarily contiguous.
#### How to setup the Seq2Seq to generate text?
* Perhaps it may be easier to go with Yannet's setup than benvrett? Need to discuss with Shirkar.

#### Getting this error
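* AttributeError: 'int' object has no attribute 'backward'. The weird thing is that it trains for a few epochs. This happens when `loss` is still the initial Python int `0` at `loss.backward()`, i.e. when no decoding step added a tensor loss for that batch.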

In [17]:
class GenEncoder(nn.Module):
    """
    Bidirectional LSTM encoder: embeds the token ids, applies dropout (p=0.5), packs the
    padded batch and returns the final hidden and cell states of the LSTM.
    """
    def __init__(self, emb_dim, hidden_dim, vocab_size):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)
        self.lstm = nn.LSTM(input_size=emb_dim, hidden_size=hidden_dim,
                            batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(p=.5)

    def forward(self, x, lengths):
        """
        x: batch-first tensor of padded token ids; lengths: unpadded length of each sequence.
        Returns (hidden, cell), the final LSTM states.
        """
        embedded = self.emb(x)
        embedded = self.dropout(embedded)
        # enforce_sorted=False: the batch does not have to be ordered by length
        packed = pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)
        # NOTE: no (h_0, c_0) is passed, so both default to zero
        _, final_state = self.lstm(packed)
        hidden, cell = final_state
        return hidden, cell

In [18]:
class GenDecoder(nn.Module):
    """
    Unidirectional LSTM decoder: embeds the input ids, applies dropout (p=0.5), runs the LSTM
    from the given initial states and projects the last layer's final hidden state onto the
    vocabulary.
    """
    def __init__(self, emb_dim, hidden_dim, vocab_size):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)
        self.lstm = nn.LSTM(input_size=emb_dim, hidden_size=hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x, h0, c0):
        """
        x: batch-first tensor of token ids; h0, c0: initial hidden and cell states.
        Returns (logits, hidden, cell) where logits = linear(hidden[-1]).
        """
        embedded = self.dropout(self.emb(x))
        # start the LSTM from the supplied initial hidden state and cell state
        _, (hidden, cell) = self.lstm(embedded, (h0, c0))
        logits = self.linear(hidden[-1])
        return logits, hidden, cell

In [19]:
# pull one batch: collate returns (padded masked ids, padded original ids, unpadded lengths)
mask_x, x, lengths = next(iter(train_dl))
mask_x, x, lengths

(tensor([[10001,    48, 10002,  4135, 10002, 10002,  1485,    64,  3779,    26,
            795,    32,  2525,    54,  2887,    64,    26,    24, 10000, 10000,
          10000, 10000, 10000, 10000, 10000],
         [10001,  4766, 10002, 10002,    27,    64, 10002,    27,    64,  2073,
           2474,  2361,   710,   114,  1275,   853,    24, 10000, 10000, 10000,
          10000, 10000, 10000, 10000, 10000],
         [10001,  3526,    64,   307,  3993,    32,   938,  3440,    42,  1805,
             48,   245,   392,   587,  1153,   566,  1932,  5625,    64,    32,
          10002,    34,    35,  3039,    24]]),
 tensor([[10001,    48,  1040,  4135,   432,   142,  1485,    64,  3779,    26,
            795,    32,  2525,    54,  2887,    64,    26,    24, 10000, 10000,
          10000, 10000, 10000, 10000, 10000],
         [10001,  4766,   886,    27,    27,    64,    27,    27,    64,  2073,
           2474,  2361,   710,   114,  1275,   853,    24, 10000, 10000, 10000,
          1000

In [20]:
# smoke test of the encoder: embedding dim 5, hidden dim 10, vocabulary incl. the special tokens
encoder = GenEncoder(5, 10, len(vocab2idx))
contexts, cells = encoder(mask_x, lengths)
contexts.size(), cells.size()  # the context and cell tensors are shape: (n_layers*n_dir, batch_size, hidden_dim)

(torch.Size([2, 3, 10]), torch.Size([2, 3, 10]))

In [21]:
# merge the two directions of each sample into one vector:
# (2, batch, hidden) -> transpose -> (batch, 2, hidden) -> flatten from dim 1 -> (batch, 2*hidden)
flatten_contexts, flatten_cells = torch.flatten(contexts.transpose(1,0), 1), torch.flatten(cells.transpose(1,0), 1)
flatten_contexts.size(), flatten_cells.size()  # after concatenating

(torch.Size([3, 20]), torch.Size([3, 20]))

In [22]:
# the decoder LSTM is unidirectional with one layer, so the leading dimension is 1
flatten_contexts.unsqueeze(0).size()  #unsqueeze(0) to make shape: (n_layers*n_dir, batch_size, hidden_dim)

torch.Size([1, 3, 20])

In [32]:
# token at position 2 of every sequence in the masked batch (10002 is the <m> id)
mask_x[:, 2]

tensor([10002, 10002,    64])

In [24]:
# decoder hidden size (20) = 2 * encoder hidden size (10), matching the concatenated encoder states.
# Here the whole sequence x is fed at once; `out` still only covers the final step because
# the decoder projects hidden[-1].
decoder = GenDecoder(20, 20, len(vocab2idx))
out, hidden, cell = decoder(x, flatten_contexts.unsqueeze(0), flatten_cells.unsqueeze(0))

In [49]:
# (batch_size, longest sequence in the batch)
# NOTE(review): execution counts around here are out of order, so the size shown below may
# belong to a different batch than the one printed earlier
x.size()

torch.Size([3, 32])

In [29]:
# hidden / cell: (1, batch_size, decoder hidden dim); out: one row of vocabulary logits per sample
hidden.size(), cell.size(), out.size()

(torch.Size([1, 3, 20]), torch.Size([1, 3, 20]), torch.Size([3, 10003]))

In [50]:
# (batch_size, vocab_size)
out.size()

torch.Size([3, 10003])

In [33]:
# greedy choice: the highest-scoring vocabulary id for each sample
out.argmax(dim=1)

tensor([3250, 4169, 3450])

In [59]:
# one column of the batch reshaped to (batch_size, 1) -- the shape of a single-step decoder input
mask_x[:, 6].unsqueeze(1).size()

torch.Size([3, 1])

In [64]:
# first column (the <sos> ids) reshaped to (batch_size, 1)
# NOTE(review): the output displayed below is a 2-tuple, which this single expression cannot
# produce -- it is stale output from an earlier version of the cell
x[:,0].unsqueeze(1).size()

(torch.Size([3, 32]), torch.Size([3, 1]))

In [46]:
# a LongTensor built directly from a plain Python list
torch.LongTensor([1,2,3])

tensor([1, 2, 3])

In [38]:
def train_batch(encoder, decoder, enc_optimizer, dec_optimizer, mask_x, x, lengths,
                pad_idx=10000, mask_idx=10002):
    """
    Run one optimisation step of the encoder/decoder pair on a single batch.

    The masked batch is encoded, the two directions of the encoder state are concatenated and
    used as the decoder's initial state, and the decoder then predicts the original sequence
    one position at a time.

    Parameters
    ----------
    encoder, decoder : nn.Module
        encoder(mask_x, lengths) -> (hidden, cell); decoder(input, hidden, cell) -> (logits, hidden, cell).
    enc_optimizer, dec_optimizer : torch optimizers for the two modules.
    mask_x : LongTensor (batch_size, seq_len)
        Padded batch with masked tokens.
    x : LongTensor (batch_size, seq_len)
        Padded batch with the original tokens (the targets).
    lengths : list of int
        Unpadded length of every sequence.
    pad_idx : int, default 10000
        Padding index; targets equal to it do not contribute to the loss.
    mask_idx : int, default 10002
        Index of the mask token in mask_x.

    Returns
    -------
    float
        Sum over decoding steps of the cross-entropy of each step (each step is averaged over
        its non-padding targets). 0.0 when the batch has no position to predict.
    """
    # zero grad for both optimizers
    enc_optimizer.zero_grad()
    dec_optimizer.zero_grad()
    # Stays None until a decoding step contributes. Starting from the int 0 made
    # loss.backward() fail with "'int' object has no attribute 'backward'" when no step ran.
    loss = None

    # **ENCODER**
    # passing the masked tokens into the encoder to retrieve context vectors and final cells
    contexts, cells = encoder(mask_x, lengths)  # context, cell shape: (n_layers*n_dir, batch_size, hidden_dim)

    # concatenate the bidirectional hidden & cell states into one context & cell tensor,
    # then unsqueeze(0) back to shape: (n_layers*n_dir, batch_size, hidden_dim)
    hidden = torch.flatten(contexts.transpose(1, 0), 1).unsqueeze(0)
    cell = torch.flatten(cells.transpose(1, 0), 1).unsqueeze(0)

    # **DECODER**
    batch_target_length = mask_x.size(1)  # max sequence length within this batch
    decoder_input = mask_x[:, 0].unsqueeze(1)  # first column, kept as (batch_size, 1)

    for idx in range(1, batch_target_length):
        output, hidden, cell = decoder(decoder_input, hidden, cell)
        x_idx = x[:, idx]

        # padding targets are ignored so they do not count towards the loss
        step_loss = F.cross_entropy(output, x_idx, ignore_index=pad_idx)
        loss = step_loss if loss is None else loss + step_loss

        # next input: the model's own prediction where the token was masked, the true token elsewhere
        decoder_input = torch.where(mask_x[:, idx].eq(mask_idx), output.argmax(dim=1), x_idx).unsqueeze(1)

    if loss is None:
        # nothing to predict (sequence length < 2): no gradient, no parameter update
        return 0.0

    # one backward pass feeds both optimizers
    loss.backward()
    enc_optimizer.step()
    dec_optimizer.step()
    return loss.item()

In [39]:
def train_model(encoder, decoder, enc_optimizer, dec_optimizer, train_dl, epochs=10):
    """
    Train the encoder/decoder pair for `epochs` passes over `train_dl`.

    Every batch is moved to the device the encoder's parameters live on (CPU if the encoder
    has no parameters) and handed to train_batch. After each epoch the sample-weighted
    average of the batch losses is printed.

    Parameters
    ----------
    encoder, decoder : nn.Module
    enc_optimizer, dec_optimizer : torch optimizers for the two modules.
    train_dl : iterable yielding (mask_x, x, lengths) batches.
    epochs : int, default 10
    """
    # follow the model's device instead of assuming CUDA, so the loop also runs on CPU
    first_param = next(encoder.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    for epoch in range(epochs):
        total, total_loss = 0, 0
        encoder.train()
        decoder.train()
        for mask_x, x, lengths in train_dl:
            n_samples = mask_x.size(0)
            loss = train_batch(encoder, decoder, enc_optimizer, dec_optimizer,
                               mask_x.to(device), x.to(device), lengths)
            total_loss += loss*n_samples
            total += n_samples
        # an empty loader would otherwise divide by zero
        avg_loss = total_loss/total if total else float('nan')
        print(f"Epoch {epoch+1}  Training Loss: {avg_loss:.3f}")

### Model Training

In [40]:
# decoder hidden size (20) = 2 * encoder hidden size (10): train_batch concatenates both
# encoder directions before handing the state to the decoder
# NOTE: .cuda() moves the modules to the GPU, so this cell requires CUDA
encoder = GenEncoder(5, 10, len(vocab2idx)).cuda()
decoder = GenDecoder(20, 20, len(vocab2idx)).cuda()
# one Adam optimizer per module, both with learning rate .001
enc_optimizer = optim.Adam(encoder.parameters(), lr=.001)
dec_optimizer = optim.Adam(decoder.parameters(), lr=.001)

In [41]:
# small test
# NOTE(review): this rebinds batch_size, train_ds and train_dl from the earlier cells --
# from here on only the first 10 training sentences are used
batch_size=2
train_ds = PTBDataset(train_samples[:10], .2, .2)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate)

In [42]:
# 10 epochs over the train_dl built in the small-test cell
train_model(encoder, decoder, enc_optimizer, dec_optimizer, train_dl, 10)

Epoch 1  Training Loss: 235.880
Epoch 2  Training Loss: 264.250
Epoch 3  Training Loss: 251.400
Epoch 4  Training Loss: 263.756
Epoch 5  Training Loss: 248.807
Epoch 6  Training Loss: 250.126
Epoch 7  Training Loss: 256.592
Epoch 8  Training Loss: 256.050
Epoch 9  Training Loss: 246.233
Epoch 10  Training Loss: 247.188
