In [1]:
# Expose only GPU 0 to this process. This cell runs before torch is imported
# in the next cell, so the setting is in place when CUDA is first initialised.
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
import numpy as np
import pandas as pd
import chainer

### pytorch packages
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
import time

In [3]:
%%capture
# NOTE(review): `tqdm_notebook` is the older alias for the notebook progress bar;
# newer tqdm releases presumably prefer `tqdm.notebook.tqdm` / `tqdm.auto` --
# confirm against the installed version before changing.
from tqdm import tqdm_notebook as tqdm
# Turn on tqdm's pandas integration. Calling it on a fresh tqdm() instance also
# creates a progress bar; the %%capture magic on this cell hides that output.
tqdm().pandas()

In [4]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

### PTB Data
#### Question 1: Should the text be broken down to samples split on "eos" token?
YES
#### Question 2: Should the text be prepended with a sos token?
YES

In [5]:
# Load the Penn Treebank word-id sequences through chainer; the call returns
# the train / validation / test splits, unpacked here in that order.
train, val, test = chainer.datasets.get_ptb_words()

In [6]:
# Each split is already tokenised: one flat 1-D numpy array of word indices
# (sentences are not separated yet). Print type, shape and a preview of each.
print(f"train data: {type(train)} {train.shape} {train}")
print(f"val data: {type(val)} {val.shape} {val}")
print(f"test data: {type(test)} {test.shape} {test}")

train data: <class 'numpy.ndarray'> (929589,) [ 0  1  2 ... 39 26 24]
val data: <class 'numpy.ndarray'> (73760,) [2211  396 1129 ...  108   27   24]
test data: <class 'numpy.ndarray'> (82430,) [142  78  54 ...  87 214  24]


In [7]:
# vocab2idx maps each PTB word (key) to its integer index (value); the mapping is
# copied into a fresh dict before the special tokens are added.
vocab2idx = dict(chainer.datasets.get_ptb_words_vocabulary())

# Special tokens take the next free indices after the real vocabulary, so they can
# never collide with a word id or with each other.
vocab2idx['PAD'] = len(vocab2idx)
vocab2idx['<sos>'] = len(vocab2idx)
vocab2idx['<m>'] = len(vocab2idx)

# collate() and batch() further down hard-code PAD = 10000 and <m> = 10002;
# fail here, loudly, if the vocabulary size ever stops matching those constants.
assert (vocab2idx['PAD'], vocab2idx['<sos>'], vocab2idx['<m>']) == (10000, 10001, 10002), \
    "special-token ids no longer match the constants hard-coded in collate()/batch()"

# reverse mapping (index -> word), used to turn samples back into text for sanity checks
idx2vocab = {v:k for k, v in vocab2idx.items()}
print(f"Number of vocabulary: {len(vocab2idx)}")

Number of vocabulary: 10003


In [8]:
def split_sentence(data, sos_idx=None, eos_idx=None):
    """
    Split a flat sequence of token ids into sentences.

    A sentence ends at (and includes) every <eos> token; each returned sentence
    is prefixed with the <sos> token.

    Parameters
    ----------
    data : iterable of token ids (e.g. the 1-D PTB arrays).
    sos_idx : id to prepend to every sentence; defaults to vocab2idx['<sos>'].
    eos_idx : id that terminates a sentence; defaults to vocab2idx['<eos>'].

    Returns
    -------
    list of lists: one list of token ids per sentence, shaped
    [sos_idx, tok, ..., eos_idx].

    Note: tokens that follow the last <eos> in `data` (an unterminated trailing
    sentence) are not returned.
    """
    # The ids are looked up rather than hard-coded so the function follows
    # whatever vocabulary is loaded.
    if sos_idx is None:
        sos_idx = vocab2idx['<sos>']
    if eos_idx is None:
        eos_idx = vocab2idx['<eos>']

    samples, sentence = [], [sos_idx]
    for idx in data:
        sentence.append(idx)  # every token, including <eos>, belongs to the current sentence
        if idx == eos_idx:
            samples.append(sentence)
            sentence = [sos_idx]  # start the next sentence
    return samples

In [9]:
# turn each flat corpus into a list of per-sentence samples (train, val, test in that order)
train_samples, val_samples, test_samples = map(split_sentence, (train, val, test))

In [10]:
# decode training sample 5 back into words to eyeball the <sos> ... <eos> framing
sentence = list(map(idx2vocab.__getitem__, train_samples[5]))
print(' '.join(sentence))

<sos> the asbestos fiber <unk> is unusually <unk> once it enters the <unk> with even brief exposures to it causing symptoms that show up decades later researchers said <eos>


In [11]:
# same check on validation sample 5
sentence = list(map(idx2vocab.__getitem__, val_samples[5]))
print(' '.join(sentence))

<sos> eventually viewers may grow <unk> with the technology and <unk> the cost <eos>


In [12]:
# same check on test sample 5
sentence = list(map(idx2vocab.__getitem__, test_samples[5]))
print(' '.join(sentence))

<sos> heavy selling of standard & poor 's 500-stock index futures in chicago <unk> beat stocks downward <eos>


In [13]:
def mask_tokens(sentence, mask_prob, sub_prob, mask_idx=None, protected=None):
    """
    Return a copy of `sentence` in which some tokens are replaced by the mask token.

    Tokens are visited left to right; each non-protected token is masked with
    probability `mask_prob` (drawn from numpy's global RNG) until the budget of
    int(len(sentence) * sub_prob) masks is used up. The rest of the sentence is
    then copied unchanged. The input sentence is not modified.

    Parameters
    ----------
    sentence : sequence of token ids.
    mask_prob : per-token probability of being masked.
    sub_prob : maximum fraction of the sentence that may be masked.
    mask_idx : id written in place of a masked token; defaults to vocab2idx['<m>'].
    protected : ids that are never masked; defaults to the <sos> and <eos> ids.

    Returns
    -------
    list of token ids with the same length as `sentence`.
    """
    if mask_idx is None:
        mask_idx = vocab2idx['<m>']
    if protected is None:
        protected = (vocab2idx['<sos>'], vocab2idx['<eos>'])

    budget = int(len(sentence) * sub_prob)
    mask_sent = []
    for idx, token in enumerate(sentence):
        # Check the budget *before* masking. Checking `== 0` after the decrement
        # let a starting budget of 0 drop to -1 and never stop masking.
        if budget <= 0:
            return mask_sent + list(sentence[idx:])
        if np.random.uniform() < mask_prob and token not in protected:
            mask_sent.append(mask_idx)
            budget -= 1
        else:
            mask_sent.append(token)
    return mask_sent

In [14]:
# sanity check: masking replaces tokens in place, so the masked copy must be
# exactly as long as the original sentence
len(mask_tokens(train_samples[0], .2, .2)), len(train_samples[0])

(26, 26)

In [15]:
class PTBDataset(Dataset):
    """
    Penn Tree Bank sentences paired with a randomly masked copy of each one.

    The masked copies are produced once, when the dataset is constructed, by
    calling mask_tokens(sentence, mask_prob, sub_prob) on every sentence in X.
    Indexing returns the pair (original, masked) as LongTensors.
    """
    def __init__(self, X, mask_prob, sub_prob):
        self.X = X
        # masks are drawn here, not per __getitem__ call
        self.masked_X = list(map(lambda sent: mask_tokens(sent, mask_prob, sub_prob), X))

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        original, masked = self.X[idx], self.masked_X[idx]
        return torch.LongTensor(original), torch.LongTensor(masked)

In [16]:
# one dataset per split (train, val, test in that order), each with mask_prob=.2 and sub_prob=.2
train_ds, val_ds, test_ds = (
    PTBDataset(samples, .2, .2) for samples in (train_samples, val_samples, test_samples)
)

In [17]:
def collate(batch, padding_value=10000):
    """
    Collate a list of (original, masked) tensor pairs into padded batch tensors.

    Parameters
    ----------
    batch : list of (X, mask_X) pairs of 1-D LongTensors, as produced by the Dataset.
    padding_value : id used to right-pad shorter sequences. Defaults to 10000,
        the value this notebook uses for the PAD token.

    Returns
    -------
    (mask_x_pad, x_pad, x_len): the padded masked batch, the padded original
    batch (both batch-first), and the list of unpadded sequence lengths.
    Note the order: the masked tensor comes first.
    """
    X, mask_X = zip(*batch)
    x_len = [len(seq) for seq in mask_X]
    mask_x_pad = pad_sequence(list(mask_X), batch_first=True, padding_value=padding_value)
    x_pad = pad_sequence(list(X), batch_first=True, padding_value=padding_value)
    return mask_x_pad, x_pad, x_len

### Generator
#### How does the bidirectional encoder fit into all of this?
#### How to pass bidirectional encoder vectors into a unidirectional decoder?
* The bidirectional encoder returns two final hidden states (one per direction), each of size hidden_dim (encoder). These two vectors have to be concatenated into a single vector of size 2*hidden_dim (encoder). The cell state isn't really needed per Yannet but could still be used. The decoder hidden dimension therefore needs to match the concatenated encoder state, i.e. 2*hidden_dim (encoder). The concatenation should be done in the Seq2Seq.
   
#### Figure out how to do the masking for each sentence. How many tokens do we mask? Do they have to be in sequential order?
* This was done by masking each token independently with probability p. The maximum number of masked tokens is also capped at a proportion of the sentence length. As a result, the masked tokens are not necessarily in sequential (contiguous) order.
#### How to setup the Seq2Seq to generate text?
* Perhaps it may be easier to go with Yannet's setup than benvrett? Need to discuss with Shirkar.

#### Getting this error
* AttributeError: 'int' object has no attribute 'backward'. The weird thing is that it trains for a few epochs.

In [18]:
class GenEncoder(nn.Module):
    """
    Bidirectional LSTM encoder.

    Embeds the token ids, applies dropout (p=0.5), packs the padded batch using
    the true sequence lengths and runs it through a bidirectional LSTM. Only the
    final hidden and cell states are returned; the per-step outputs are discarded.
    """
    def __init__(self, emb_dim, hidden_dim, vocab_size):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)
        self.lstm = nn.LSTM(input_size=emb_dim, hidden_size=hidden_dim,
                            batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x, lengths):
        embedded = self.dropout(self.emb(x))
        # enforce_sorted=False: the batch does not have to be ordered by length
        packed = pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)
        # no initial state is passed, so the LSTM starts from zeros
        _, final_states = self.lstm(packed)
        hidden, cell = final_states
        return hidden, cell

In [19]:
class GenDecoder(nn.Module):
    """
    Unidirectional LSTM decoder.

    Embeds the input ids, applies dropout (p=0.5), runs the LSTM from the given
    initial state (h0, c0) and projects the last layer's final hidden state to
    vocabulary logits. Returns (logits, hidden, cell) so the caller can feed the
    state back in on the next step.
    """
    def __init__(self, emb_dim, hidden_dim, vocab_size):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)
        self.lstm = nn.LSTM(input_size=emb_dim, hidden_size=hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x, h0, c0):
        embedded = self.dropout(self.emb(x))
        initial_state = (h0, c0)
        _, (hidden, cell) = self.lstm(embedded, initial_state)
        logits = self.linear(hidden[-1])  # hidden[-1]: final hidden state of the last layer
        return logits, hidden, cell

In [21]:
# Tiny throwaway batch (3 sentences) used by the next cells to check tensor shapes.
batch_size = 3
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate)
# collate returns (padded masked batch, padded original batch, sequence lengths)
mask_x, x, lengths = next(iter(train_dl))
# uncomment to inspect the raw tensors:
# mask_x, x, lengths

In [22]:
# Shape check with a small encoder (emb_dim=5, hidden_dim=10) on the masked batch.
encoder = GenEncoder(5, 10, len(vocab2idx))
contexts, cells = encoder(mask_x, lengths)
contexts.size(), cells.size()  # the context and cell tensors are shape: (n_layers*n_dir, batch_size, hidden_dim)

(torch.Size([2, 3, 10]), torch.Size([2, 3, 10]))

In [23]:
# Move the batch axis first, then flatten the (direction, hidden) axes together:
# (2, batch, hidden) -> (batch, 2, hidden) -> (batch, 2*hidden), i.e. each sample's
# forward and backward states end up concatenated in one vector.
flatten_contexts, flatten_cells = torch.flatten(contexts.transpose(1,0), 1), torch.flatten(cells.transpose(1,0), 1)
flatten_contexts.size(), flatten_cells.size()  # after concatenating

(torch.Size([3, 20]), torch.Size([3, 20]))

In [24]:
# Decoder hidden_dim (20) equals 2 * encoder hidden_dim (10), so the concatenated
# encoder states can be used directly as the decoder's initial state.
decoder = GenDecoder(20, 20, len(vocab2idx))
#unsqueeze(0) to make shape: (n_layers*n_dir, batch_size, hidden_dim)
# Here the whole padded `x` is fed in a single call, purely as a shape check.
out, hidden, cell = decoder(x, flatten_contexts.unsqueeze(0), flatten_cells.unsqueeze(0))

In [21]:
def batch(encoder, decoder, enc_optimizer, dec_optimizer, mask_x, x, lengths, train=True,
          pad_idx=10000, mask_idx=10002):
    """
    Run one mini-batch through the encoder/decoder and return its loss.

    The masked batch is encoded; the encoder's final forward/backward states are
    concatenated and used as the decoder's initial state (so the decoder hidden
    size must be 2 * the encoder hidden size). The decoder is then stepped one
    token at a time. At step `idx` its input is the ground-truth token, except
    at masked positions, where it is the decoder's own previous prediction.

    Parameters
    ----------
    encoder, decoder : modules called as encoder(mask_x, lengths) -> (hidden, cell)
        and decoder(input, hidden, cell) -> (logits, hidden, cell).
    enc_optimizer, dec_optimizer : optimizers for the two modules; only used when `train`.
    mask_x : (batch, seq) LongTensor, padded masked sentences.
    x : (batch, seq) LongTensor, padded original sentences (the targets).
    lengths : unpadded length of every sentence in the batch.
    train : True -> train mode + backward/step; False -> eval mode, no gradients.
    pad_idx : target id ignored by the loss (default 10000, this notebook's PAD id).
    mask_idx : id marking masked positions in `mask_x` (default 10002, the <m> id).

    Returns
    -------
    float: sum over decoding steps of the batch-mean cross-entropy. It is 0.0
    when the batch has no decoding steps (sequence length 1).
    """
    if train:
        encoder.train()
        decoder.train()
        enc_optimizer.zero_grad()
        dec_optimizer.zero_grad()
    else:
        encoder.eval()
        decoder.eval()

    # A 0-dim tensor rather than the int 0: with no decoding steps an int would
    # reach .backward()/.item() and raise
    # "AttributeError: 'int' object has no attribute 'backward'".
    loss = torch.zeros((), device=mask_x.device)

    # no autograd graph is built during validation
    with torch.set_grad_enabled(train):
        # **ENCODER**
        # contexts, cells shape: (n_layers*n_dir, batch_size, hidden_dim)
        contexts, cells = encoder(mask_x, lengths)

        # concatenate the directional states per sample, then add back the leading
        # (n_layers*n_dir = 1) axis the unidirectional decoder expects
        hidden = torch.flatten(contexts.transpose(1, 0), 1).unsqueeze(0)
        cell = torch.flatten(cells.transpose(1, 0), 1).unsqueeze(0)

        # **DECODER**
        batch_target_length = mask_x.size(1)  # max sequence length in this batch
        decoder_input = mask_x[:, 0].unsqueeze(1)  # first token of every sentence, shape (batch, 1)

        for idx in range(1, batch_target_length):
            output, hidden, cell = decoder(decoder_input, hidden, cell)
            target = x[:, idx]

            # padded positions do not count towards the loss
            loss = loss + F.cross_entropy(output, target, ignore_index=pad_idx)

            # next input: the model's own prediction where the token was masked,
            # the ground-truth token everywhere else
            decoder_input = torch.where(
                mask_x[:, idx].eq(mask_idx), output.argmax(dim=1), target
            ).unsqueeze(1)

    # a single loss drives both optimizers; skip the update if nothing was decoded
    if train and loss.requires_grad:
        loss.backward()
        enc_optimizer.step()
        dec_optimizer.step()
    return loss.item()

In [22]:
def train_model(encoder, decoder, enc_optimizer, dec_optimizer, train_dl, val_dl, epochs=10):
    """
    Train the encoder/decoder pair for `epochs` epochs and report progress.

    Every epoch runs one pass over `train_dl` (with parameter updates) followed
    by one pass over `val_dl` (no updates), then prints the sample-weighted
    average loss of each pass and the wall-clock time of the epoch. Batches are
    moved to the module-level `device` before being handed to batch().
    """
    for epoch in range(epochs):
        start = time.time()
        encoder.train()
        decoder.train()

        # training pass: accumulate loss weighted by the number of samples per batch
        total_loss, total = 0, 0
        for mask_x, x, lengths in train_dl:
            n_samples = mask_x.size(0)
            loss = batch(encoder, decoder, enc_optimizer, dec_optimizer,
                         mask_x.to(device), x.to(device), lengths)
            total_loss += loss*n_samples
            total += n_samples

        # validation pass: same bookkeeping, with train=False
        val_loss, total_v = 0, 0
        for mask_x, x, lengths in val_dl:
            n_samples = mask_x.size(0)
            v_loss = batch(encoder, decoder, enc_optimizer, dec_optimizer,
                           mask_x.to(device), x.to(device), lengths, False)
            val_loss += v_loss*n_samples
            total_v += n_samples

        print(f"Epoch {epoch+1}  Training Loss: {total_loss/total:.3f} Val Loss: {val_loss/total_v:.3f} Time: {time.time()-start:.3f}")

### Model Training

In [23]:
# First configuration: encoder (emb_dim=32, hidden_dim=64), decoder (emb_dim=128, hidden_dim=128).
# The decoder hidden size is 2 * the encoder hidden size because the decoder is
# initialised with the concatenated forward/backward encoder states.
encoder = GenEncoder(32, 64, len(vocab2idx)).to(device)
decoder = GenDecoder(128, 128, len(vocab2idx)).to(device)
# one Adam optimizer per module, same learning rate
enc_optimizer = optim.Adam(encoder.parameters(), lr=3e-4)
dec_optimizer = optim.Adam(decoder.parameters(), lr=3e-4)

In [24]:
# Real training loaders. The datasets are rebuilt here, so the random masks are
# drawn again (PTBDataset masks every sentence once, at construction time).
batch_size = 256
train_ds = PTBDataset(train_samples, .2, .2)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate)
val_ds = PTBDataset(val_samples, .2, .2)
# validation batches are not shuffled
val_dl = DataLoader(val_ds, batch_size=batch_size, collate_fn=collate)

In [25]:
# Run 1 -- encoder (emb_dim, hidden_dim) = (32, 64), decoder (emb_dim, hidden_dim) = (128, 128), 10 epochs
train_model(encoder, decoder, enc_optimizer, dec_optimizer, train_dl, val_dl, 10)

Epoch 1  Training Loss: 465.348 Val Loss: 367.445 Time: 22.478
Epoch 2  Training Loss: 368.996 Val Loss: 359.982 Time: 21.576
Epoch 3  Training Loss: 366.848 Val Loss: 352.303 Time: 21.960
Epoch 4  Training Loss: 359.195 Val Loss: 347.145 Time: 21.867
Epoch 5  Training Loss: 351.423 Val Loss: 343.290 Time: 21.786
Epoch 6  Training Loss: 349.426 Val Loss: 339.886 Time: 21.827
Epoch 7  Training Loss: 345.624 Val Loss: 337.359 Time: 21.937
Epoch 8  Training Loss: 346.331 Val Loss: 335.091 Time: 22.017
Epoch 9  Training Loss: 342.321 Val Loss: 332.801 Time: 21.914
Epoch 10  Training Loss: 337.794 Val Loss: 330.642 Time: 21.795


In [27]:
# Run 2 -- encoder (emb_dim, hidden_dim) = (64, 128), decoder (emb_dim, hidden_dim) = (128, 256), 50 epochs.
# Fresh models and optimizers; the train/val loaders from the previous run are reused.
encoder = GenEncoder(64, 128, len(vocab2idx)).to(device)
decoder = GenDecoder(128, 256, len(vocab2idx)).to(device)
enc_optimizer = optim.Adam(encoder.parameters(), lr=3e-4)
dec_optimizer = optim.Adam(decoder.parameters(), lr=3e-4)
train_model(encoder, decoder, enc_optimizer, dec_optimizer, train_dl, val_dl, 50)

Epoch 1  Training Loss: 422.180 Val Loss: 362.060 Time: 27.409
Epoch 2  Training Loss: 365.786 Val Loss: 350.452 Time: 27.456
Epoch 3  Training Loss: 355.386 Val Loss: 343.195 Time: 27.535
Epoch 4  Training Loss: 350.261 Val Loss: 339.206 Time: 27.637
Epoch 5  Training Loss: 343.994 Val Loss: 334.989 Time: 27.447
Epoch 6  Training Loss: 339.982 Val Loss: 331.438 Time: 27.545
Epoch 7  Training Loss: 335.620 Val Loss: 328.770 Time: 27.457
Epoch 8  Training Loss: 332.161 Val Loss: 325.541 Time: 27.622
Epoch 9  Training Loss: 329.105 Val Loss: 323.082 Time: 27.601
Epoch 10  Training Loss: 324.382 Val Loss: 320.997 Time: 27.472
Epoch 11  Training Loss: 320.734 Val Loss: 319.923 Time: 27.484
Epoch 12  Training Loss: 318.205 Val Loss: 316.890 Time: 27.500
Epoch 13  Training Loss: 315.016 Val Loss: 315.244 Time: 27.480
Epoch 14  Training Loss: 311.844 Val Loss: 313.329 Time: 27.481
Epoch 15  Training Loss: 308.919 Val Loss: 312.056 Time: 27.411
Epoch 16  Training Loss: 306.655 Val Loss: 310.17