# Baseline LSTM Model

In [1]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.autograd import Variable
from torch.nn import functional as F
from glob import glob
import numpy as np
from tqdm.auto import tqdm

## Data Loader for labeled episodes with embedded sentences

Need to have embeddings saved from Model_Pretraining

Initiating the class takes some time to load all the embeddings into memory. 



In [3]:
## Code for LSTM Data loader
def ls_data_loader(path):
    # get path to all _emb files
    emb_files = glob(path + '/*_emb.bin')
    lab_files = glob(path + '/*_lab.bin')
    data, labels = [],[]
    for e,l in zip(emb_files, lab_files):
        data.append(np.loadtxt(e))
        labels.append(np.loadtxt(l))
    return data, labels

class Dataset_seq_ep(torch.utils.data.Dataset):
    def __init__(self, train_path):
        # self.sent_id = sent_id
        self.train_path = train_path
        self.data, self.labels = ls_data_loader(train_path)

    def __getitem__(self, index):
        # return sequence of sentences and labels
        seq = torch.Tensor(self.data[index])
        labels = torch.Tensor(self.labels[index])
        return seq, labels

    def __len__(self):
        return(len(self.data))


def collate_fn(batch):
	'''  
	custom collate_fn as the size of every episode is different and merging sequences (including padding) 
	is not supported in default. 
	'''

	(xx, yy) = zip(*batch)
	x_lens = [len(x) for x in xx]
	y_lens = [len(y) for y in yy]

	xx_pad = pad_sequence(xx, batch_first=True, padding_value=-1)
	yy_pad = pad_sequence(yy, batch_first=True, padding_value=-1)

	return xx_pad, yy_pad, x_lens, y_lens

train_dataset = Dataset_seq_ep('labeled_subs')

train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=1,
    shuffle=True,
    collate_fn=collate_fn)
    


Check dataloader and see that batches are padded to seq with max_length per batch and the length of each sequence is returned

In [None]:
for i in range(1):
    loop = tqdm(train_dataloader)
    for batch in loop:
        data, labels, in_len, lab_len = batch
        print('Data shape', data.shape)
        print('labels shape', labels.shape)

### Build LSTM model

adapting from 

Blog post:
Taming LSTMs: Variable-sized mini-batches and why PyTorch is good for your health:
https://medium.com/@_willfalcon/taming-lstms-variable-sized-mini-batches-and-why-pytorch-is-good-for-your-health-61d35642972e



In [70]:

# this is still a work in progress

class PerpLSTM(nn.Module):
    def __init__(self, nb_lstm_layers, nb_lstm_units=100, fc_hidden_units=100, embedding_dim=3, batch_size=1):
        super(PerpLSTM, self).__init__()
        self.vocab = {'<PAD>': -1} # not sure we need this
        self.tags = {'<PAD>':-1, 'N': 0, 'Y': 1}

        self.nb_lstm_layers = nb_lstm_layers
        self.nb_lstm_units = nb_lstm_units
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.fc_hidden_units = fc_hidden_units

        # don't count the padding tag for the classifier output
        self.nb_tags = len(self.tags)-1

        # when the model is bidirectional we double the output dimension
        # self.lstm

        # build actual NN
        self.__build_model()

    def __build_model(self):
        # build embedding layer first
        # nb_vocab_words = len(self.vocab)

        # # whenever the embedding sees the padding index it'll make the whole vector zeros
        # padding_idx = self.vocab['<PAD>']
        # self.word_embedding = nn.Embedding(
        #     num_embeddings=nb_vocab_words,
        #     embedding_dim=self.embedding_dim,
        #     padding_idx=padding_idx
        # )

        self.relu = nn.ReLU()

        # design LSTM
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.nb_lstm_units,
            num_layers=self.nb_lstm_layers,
            batch_first=True,
        )

        # output layer which projects back to tag space
        self.fc1 = nn.Linear(self.nb_lstm_units, self.fc_hidden_units)
        self.fc2 = nn.Linear(self.fc_hidden_units, 1) # change out to 1 for sigmoid activation
        # self.relu2 = nn.ReLU()

    def init_hidden(self):
        # the weights are of the form (nb_layers, batch_size, nb_lstm_units)
        hidden_a = torch.randn(self.nb_lstm_layers, self.batch_size, self.nb_lstm_units)
        hidden_b = torch.randn(self.nb_lstm_layers, self.batch_size, self.nb_lstm_units)

        # if self.on_gpu:
        #     hidden_a = hidden_a.cuda()
        #     hidden_b = hidden_b.cuda()

        hidden_a = Variable(hidden_a)
        hidden_b = Variable(hidden_b)

        return (hidden_a, hidden_b)

    def forward(self, X, X_lengths):
        # reset the LSTM hidden state. Must be done before you run a new batch. Otherwise the LSTM will treat
        # a new batch as a continuation of a sequence
        self.hidden = self.init_hidden()

        batch_size, seq_len, _ = X.size()

        # run through ReLu
        X = self.relu(X)

        # run through LSTM
        X, self.hidden = self.lstm(X, self.hidden)

        # undo the packing operation
        # X, _ = torch.nn.utils.rnn.pad_packed_sequence(X, batch_first=True)

        # ---------------------
        # 3. Project to tag space
        # Dim transformation: (batch_size, seq_len, nb_lstm_units) -> (batch_size * seq_len, nb_lstm_units)

        # this one is a bit tricky as well. First we need to reshape the data so it goes into the linear layer
        X = X.contiguous()
        X = X.view(-1, X.shape[2])

        # run through actual linear layer
        X = self.fc1(X)
        X = self.relu(X)
        X = torch.sigmoid(self.fc2(X))

        # ---------------------
        # 4. Create softmax activations bc we're doing classification
        # Dim transformation: (batch_size * seq_len, nb_lstm_units) -> (batch_size, seq_len, nb_tags)
        # X = F.log_softmax(X, dim=1)

        # I like to reshape for mental sanity so we're back to (batch_size, seq_len, nb_tags)
        # X = X.view(batch_size, seq_len, self.nb_tags)

        Y_hat = X
        return Y_hat

    def loss(self, Y_hat, Y, X_lengths):
        # TRICK 3 ********************************
        # before we calculate the negative log likelihood, we need to mask out the activations
        # this means we don't want to take into account padded items in the output vector
        # simplest way to think about this is to flatten ALL sequences into a REALLY long sequence
        # and calculate the loss on that.

        # flatten all the labels
        Y = Y.view(-1)
        print('Y',Y)
        abs_y = torch.abs(Y)

        # flatten all predictions
        Y_hat = Y_hat.view(-1, self.nb_tags)
        print('Y hat',Y_hat)
        # create a mask by filtering out all tokens that ARE NOT the padding token
        tag_pad_token = self.tags['<PAD>']
        mask = (Y > tag_pad_token).float()

        print('mask', mask)

        # count how many tokens we have
        # nb_tokens = int(torch.sum(mask).data[0])
        nb_tokens = 2
        print(Y_hat.shape)
        
        # pick the values for the label and zero out the rest with the mask

        # this part is broken

        Y_hat = Y_hat[range(Y_hat.shape[0]), abs_y] * mask
        

        # compute cross entropy loss which ignores all <PAD> tokens
        ce_loss = -torch.sum(Y_hat) / nb_tokens

        return ce_loss

In [76]:

perp_model = PerpLSTM(
    nb_lstm_layers=10, 
    nb_lstm_units=10, 
    embedding_dim=30522, batch_size=1)


## Testing forward

In [77]:
for i in range(1):
    loop = tqdm(train_dataloader)
    for batch in loop:
        data, labels, in_len, lab_len = batch
        print('Data shape', data.shape)
        print('labels shape', labels.shape)
        output = perp_model.forward(data, in_len)
        print(output.shape)
        break

  0%|          | 0/39 [00:00<?, ?it/s]

Data shape torch.Size([1, 573, 30522])
labels shape torch.Size([1, 573])


  0%|          | 0/39 [00:01<?, ?it/s]

torch.Size([573, 1])





## Testing backward

In [79]:

optimizer = torch.optim.Adam(perp_model.parameters(), lr=0.001) 
# Using Binary Cross Entropy Loss function since we are using batch size = 1
criterion = nn.BCELoss()

for epoch in range(10):
    loop = tqdm(train_dataloader)
    epoch_total = 0
    for batch in loop:
        data, labels, in_len, lab_len = batch

        outputs = perp_model.forward(data, in_len) #forward pass
        optimizer.zero_grad() #calculate the gradient, manually setting to 0
 
        # obtain the loss function
        # loss = perp_model.loss(outputs, labels, lab_len)
        # m = nn.Sigmoid()
        # sig_out = m(outputs)
        # print(sig_out.shape)

        # loss = criterion(outputs, F.one_hot(labels.view(-1).type(torch.int64)).type(torch.float32))
        loss = criterion(outputs.view(1,-1), labels.type(torch.float32))
        loss.backward() #calculates the loss of the loss function
        
        optimizer.step() #improve from loss, i.e backprop
        loop.set_postfix(loss=loss.item())
        epoch_total += loss.item()
    print("Epoch: %d, loss: %1.5f" % (epoch, epoch_total/len(loop)))


100%|██████████| 39/39 [02:23<00:00,  3.68s/it, loss=0.368]


Epoch: 0, loss: 0.41181


100%|██████████| 39/39 [01:30<00:00,  2.33s/it, loss=0.526]


Epoch: 1, loss: 0.41219


100%|██████████| 39/39 [02:00<00:00,  3.10s/it, loss=0.398]


Epoch: 2, loss: 0.41157


100%|██████████| 39/39 [01:51<00:00,  2.85s/it, loss=0.703]


Epoch: 3, loss: 0.41155


100%|██████████| 39/39 [01:34<00:00,  2.43s/it, loss=0.533]


Epoch: 4, loss: 0.41122


100%|██████████| 39/39 [01:33<00:00,  2.39s/it, loss=0.335]


Epoch: 5, loss: 0.41115


100%|██████████| 39/39 [01:33<00:00,  2.39s/it, loss=0.709]


Epoch: 6, loss: 0.41087


100%|██████████| 39/39 [01:17<00:00,  2.00s/it, loss=0.39] 


Epoch: 7, loss: 0.41121


100%|██████████| 39/39 [01:17<00:00,  2.00s/it, loss=0.332]


Epoch: 8, loss: 0.41141


100%|██████████| 39/39 [01:20<00:00,  2.06s/it, loss=0.479]

Epoch: 9, loss: 0.41121





In [82]:
perp_model(data, 1)

tensor([[0.1389],
        [0.1468],
        [0.1474],
        [0.1473],
        [0.1467],
        [0.1463],
        [0.1461],
        [0.1459],
        [0.1457],
        [0.1456],
        [0.1454],
        [0.1454],
        [0.1453],
        [0.1453],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0.1452],
        [0

In [None]:
del batch, data, labels
