In [1]:
import os
import sys
import torch

# Project root: the `tutorial/` folder inside the directory named by $WORKSPACE.
PROJ_DIR = os.path.join(os.environ['WORKSPACE'], 'tutorial/')

# Put the project root on the import path exactly once, so re-running this
# cell does not keep appending duplicates.
if sys.path.count(PROJ_DIR) == 0:
    sys.path.append(PROJ_DIR)

# Load the data to get the vocabulary

In [2]:
import pickle
from src.dataset import IMDBDatset
from src.utilities import flatten

# NOTE(review): pickle.load can execute arbitrary code; only load a
# data.pickle that was produced by this project.
with open('data.pickle', 'rb') as fp:
    corpus = pickle.load(fp)

# The vocabulary is the set of items obtained by flattening the train and dev
# token collections (presumably one token list per document -- confirm in
# src.utilities.flatten).
train_and_dev_tokens = corpus['train'].tokens + corpus['dev'].tokens
vocab = set(flatten(train_and_dev_tokens))
print("Vocabulary:", len(vocab))

Vocabulary: 45572


# Word Embedding Layer

In [3]:
# Download and extract the GloVe 6B vectors into ../glove.6B unless the archive
# is already present. The archive is a ZIP file, so it is extracted with
# `unzip` (tar -xzf cannot read it); `mkdir -p` tolerates an existing folder.
!cd .. && [ ! -f glove.6B.zip ] \
&& wget http://nlp.stanford.edu/data/glove.6B.zip \
&& mkdir -p glove.6B \
&& unzip -q -o glove.6B.zip -d glove.6B || echo "GloVe vectors (most likely) already downloaded"

GloVe vectors (most likely) already downloaded


In [4]:
import numpy as np
import torch.nn as nn

class WordEmbedder(nn.Module):
    """Embedding layer initialised from the GloVe vectors of the words in a vocabulary.

    Index 0 is reserved for ``<PAD>`` (all-zero vector) and index 1 for ``<UNK>``
    (mean of all loaded vectors); the words found in both ``vocab`` and the GloVe
    file follow in file order. The table is created with ``freeze=False`` and
    without a ``padding_idx``, so every row, including ``<PAD>``, is trainable.

    Args:
        vocab: container of words (only ``in`` is used) whose vectors are kept.
        glove_file: path to an existing ``.txt`` file with one
            ``word v1 v2 ... vN`` entry per line.

    Raises:
        AssertionError: if ``glove_file`` does not exist or is not a ``.txt`` file.
        ValueError: if no word of ``vocab`` is present in ``glove_file``.
    """

    def __init__(self, vocab, glove_file):
        super(WordEmbedder, self).__init__()
        assert os.path.exists(glove_file) and glove_file.endswith('.txt'), glove_file

        self.emb_dim = None

        self.PAD_TOKEN = '<PAD>'
        self.UNK_TOKEN = '<UNK>'

        index_to_word = [self.PAD_TOKEN, self.UNK_TOKEN]
        index_to_vect = [None, None]  # placeholders, filled once emb_dim is known

        # GloVe files contain non-ASCII tokens; do not rely on the platform default encoding.
        with open(glove_file, 'r', encoding='utf-8') as fp:
            for line in fp:
                fields = line.split()

                # Skip blank lines and words that are not part of the vocabulary.
                if not fields or fields[0] not in vocab:
                    continue

                w = fields[0]
                v = np.array([float(value) for value in fields[1:]])

                if self.emb_dim is None:
                    self.emb_dim = v.shape[0]

                index_to_word.append(w)
                index_to_vect.append(v)

        if self.emb_dim is None:
            raise ValueError("No vocabulary word was found in {}".format(glove_file))

        index_to_vect[0] = np.zeros(self.emb_dim)
        index_to_vect[1] = np.mean(index_to_vect[2:], axis=0)

        pretrained = torch.from_numpy(np.array(index_to_vect)).float()
        self.embeddings = nn.Embedding.from_pretrained(pretrained, freeze=False)

        self.index_to_word = {i: w for i, w in enumerate(index_to_word)}
        self.word_to_index = {w: i for i, w in self.index_to_word.items()}

    def forward(self, samples):
        """Encode, pad and embed a batch of token sequences.

        Args:
            samples: non-empty sequence of token sequences (each must support ``len``).

        Returns:
            dict with
              * ``'output'``: embeddings, shape ``(batch, maxlen, emb_dim)``;
              * ``'mask'``: long tensor ``(batch, maxlen)``, 1 on real tokens, 0 on padding;
              * ``'encoded'``: long tensor ``(batch, maxlen)`` of vocabulary indices.
            All tensors live on the device of the embedding weights.

        Raises:
            ValueError: if ``samples`` is empty.
        """
        if len(samples) == 0:
            raise ValueError("samples must contain at least one token sequence")

        pad_ix = self.word_to_index[self.PAD_TOKEN]
        unk_ix = self.word_to_index[self.UNK_TOKEN]

        # Follow the module's own device rather than "CUDA if available": the
        # index tensors must sit next to the embedding weights, wherever those are.
        device = self.embeddings.weight.device

        lengths = [len(tokens) for tokens in samples]
        maxlen = max(lengths)

        # Out-of-vocabulary tokens map to <UNK>; short rows are right-padded with <PAD>.
        encoded = [
            [self.word_to_index.get(token, unk_ix) for token in tokens] + [pad_ix] * (maxlen - len(tokens))
            for tokens in samples
        ]
        encoded = torch.tensor(encoded, dtype=torch.long, device=device)

        # mask[i, t] == 1 exactly when t < len(samples[i])
        positions = torch.arange(maxlen, dtype=torch.long, device=device)
        lengths = torch.tensor(lengths, dtype=torch.long, device=device)
        masks = (positions.unsqueeze(0) < lengths.unsqueeze(1)).long()

        result = {
            'output': self.embeddings(encoded),
            'mask': masks,
            'encoded': encoded
        }

        return result

In [5]:
# Build the embedding layer from the 100-dimensional GloVe file under the project root.
glove_path = os.path.join(PROJ_DIR, 'glove.6B/glove.6B.100d.txt')
embedder = WordEmbedder(vocab, glove_path)
embedder

WordEmbedder(
  (embeddings): Embedding(21695, 100)
)

In [6]:
# Run the embedder on the two consecutive training samples at positions 1717 and 1718.
tokens, labels = corpus['train'][1717:1719]
embedder_result = embedder(tokens)

# One print call, newline-separated, produces the same text as five separate prints.
print(
    "Encoded:\n{}".format(embedder_result['encoded']),
    "Embedding Shape:\n{}\n".format(embedder_result['output'].shape),
    "Mask:\n{}".format(embedder_result['mask']),
    "Labels:\n{}\n".format(labels),
    "Tokens:\n{}".format(tokens),
    sep="\n"
)

Encoded:
tensor([[    1,   120,   698,  2756,    38,    14,     2,     1,  1500,     4,
             1,  1076,    11,     2,   312,    16,     6,   876,   109,  7369,
             3, 12054,     8,    45,   255,   123,     3,     6,   159,   603,
          1464,     3,  4852,     5,   101,     3,    65,     8,    49,   312,
             3,  8543,    14,   120,    49,  1560,    46,     1,     1, 11046,
         13783,   268,  8244, 11046, 13783,   268,  8244,     1,     1,    38,
           935,     1,    95,  1254],
        [    1,     1,     1,   254,     9,   326,     1,     7,     9,   935,
            97,  5165,   754,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
     

# LSTM Layer

In [7]:
class LSTMLayer(nn.Module):
    """LSTM feature extractor that returns the last *valid* hidden state per sequence.

    Sequences are packed before being fed to the LSTM, so padded positions are
    never processed. This matters for the backward direction of a bidirectional
    LSTM, which would otherwise start reading from the padding.

    Args:
        input_dim: size of each input vector.
        hidden_dim: size of the returned representation. With
            ``bidirectional=True`` each direction gets ``hidden_dim // 2`` units,
            so ``hidden_dim`` must be even.
        bidirectional: whether to run the LSTM in both directions.
        num_layers: number of stacked LSTM layers.
        drop_prob: dropout between stacked layers; only applied when
            ``num_layers > 1``.

    Raises:
        ValueError: if ``bidirectional`` is set and ``hidden_dim`` is odd.
    """

    def __init__(self, input_dim, hidden_dim, bidirectional=False, num_layers=1, drop_prob=0.3):
        super(LSTMLayer, self).__init__()

        # An odd size cannot be split evenly between the two directions and would
        # only surface later as a failed shape assertion in forward().
        if bidirectional and hidden_dim % 2 != 0:
            raise ValueError("hidden_dim must be even when bidirectional=True, got {}".format(hidden_dim))

        self.bidirectional = bidirectional
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim

        self.lstm = nn.LSTM(input_dim, hidden_dim // 2 if bidirectional else hidden_dim,
                            num_layers=num_layers,
                            bidirectional=bidirectional,
                            dropout=drop_prob if num_layers > 1 else 0,
                            batch_first=True)

    def forward(self, vectors, mask):
        """Run the LSTM and gather the last hidden state of every sequence.

        Args:
            vectors: input tensor of shape ``(batch, seq_len, input_dim)``.
            mask: tensor of shape ``(batch, seq_len)`` with 1 on real tokens and
                0 on (right) padding.

        Returns:
            dict with
              * ``'output'``: ``(batch, hidden_dim)`` last hidden state (forward
                state at the last real token, concatenated with the backward
                state at position 0 when bidirectional);
              * ``'outputs'``: all hidden states, ``(batch, seq_len, hidden_dim)``
                or, when bidirectional, ``(batch, seq_len, 2, hidden_dim // 2)``.
                Padded positions are zero.
        """
        batch_size = vectors.size(0)
        max_length = vectors.size(1)

        # Packing rejects zero-length rows, so an empty sequence is treated as
        # length 1 (its single padding vector is processed).
        lengths = mask.sum(-1).long().clamp(min=1)

        # pack_padded_sequence expects the lengths on the CPU.
        packed = nn.utils.rnn.pack_padded_sequence(vectors, lengths.cpu(),
                                                   batch_first=True,
                                                   enforce_sorted=False)
        packed_out, _ = self.lstm(packed)

        # total_length restores the original padded width of the batch.
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out,
                                                       batch_first=True,
                                                       total_length=max_length)
        # (batch, seq_len, num_directions * hidden_size)

        assert lstm_out.size(0) == batch_size
        assert lstm_out.size(1) == max_length
        assert lstm_out.size(2) == self.hidden_dim

        batch_ix = torch.arange(batch_size, device=lstm_out.device)
        last_ix = (lengths - 1).to(lstm_out.device)

        if self.bidirectional:
            # Separate the directions of the LSTM
            lstm_out = lstm_out.reshape(batch_size, max_length, 2, self.hidden_dim // 2)

            # Pick up the last hidden state per direction
            fw_last_hn = lstm_out[batch_ix, last_ix, 0]               # (batch, hidden // 2)
            bw_last_hn = lstm_out[batch_ix, 0, 1]                     # (batch, hidden // 2)

            last_hn = torch.cat([fw_last_hn, bw_last_hn], dim=1)      # (batch, hidden // 2) -> (batch, hidden)
        else:
            last_hn = lstm_out[batch_ix, last_ix]                     # (batch, hidden)

        return {'output': last_hn, 'outputs': lstm_out}

In [8]:
# Unidirectional, single-layer LSTM over the embedder's vectors, 64 hidden units.
lstm_layer = LSTMLayer(input_dim=embedder.emb_dim, hidden_dim=64)
lstm_layer

LSTMLayer(
  (lstm): LSTM(100, 64, batch_first=True)
)

In [9]:
# Feed the embedded batch through the LSTM and show the per-sequence summary vectors.
lstm_result = lstm_layer(embedder_result['output'], embedder_result['mask'])
last_states = lstm_result['output']
print(last_states, last_states.shape, sep="\n")

tensor([[-7.6868e-02,  8.1113e-02, -5.3292e-02,  6.6632e-03, -2.4285e-01,
         -1.3585e-01, -1.8223e-01,  5.1247e-02, -4.3143e-01, -7.2476e-02,
         -1.5521e-01, -1.3031e-01,  1.5079e-01,  5.0297e-02,  1.7093e-01,
          2.0144e-01, -5.0039e-02,  7.8581e-02, -1.8033e-01, -7.7575e-02,
          1.5991e-01,  1.7170e-01, -2.7888e-01,  1.2124e-01,  6.5989e-02,
          6.6409e-02,  6.9906e-02, -1.9961e-01,  2.2071e-01,  1.0483e-01,
          5.9449e-02, -6.1370e-02, -1.4324e-01, -2.2716e-02,  1.9821e-01,
         -4.2869e-02, -7.4581e-02,  1.7959e-04,  2.1500e-01,  5.6030e-02,
         -2.0585e-01,  1.1483e-01, -1.9692e-01,  9.8408e-02,  6.4696e-02,
         -1.4615e-02,  2.8201e-02,  7.4419e-02,  2.9322e-01, -9.9421e-02,
         -1.1486e-01, -2.3422e-02, -5.7116e-02,  2.3961e-01, -1.7119e-01,
          1.9581e-01, -7.9659e-02,  8.1953e-02,  1.2482e-01,  1.1906e-01,
          4.4853e-02,  1.3501e-01,  8.4314e-02,  1.6695e-01],
        [-1.7681e-01,  3.6196e-02,  2.0196e-03,  9

# LSTM Classifier (putting it all together)

In [10]:
class LSTMClassifier(nn.Module):
    """Binary classifier: embedder -> sequence extractor -> linear layer (one logit).

    Args:
        embedder: module called with the token batch; must return a dict with
            ``'output'`` and ``'mask'`` entries.
        extractor: module called with ``(embedded['output'], embedded['mask'])``;
            must expose ``hidden_dim`` and return a dict with an ``'output'`` entry
            whose last dimension is ``hidden_dim``.
    """

    def __init__(self, embedder, extractor):
        super(LSTMClassifier, self).__init__()
        self.embedder = embedder
        self.extractor = extractor
        self.classifier = nn.Linear(extractor.hidden_dim, 1)
        self.xentropy = nn.BCEWithLogitsLoss()

    def forward(self, tokens, targets=None):
        """Score a batch and, when targets are given, compute the BCE-with-logits loss.

        Args:
            tokens: batch of token sequences, passed to the embedder as is.
            targets: optional binary labels, one per sample. May be a tensor of
                any numeric dtype or device, or any array-like accepted by
                ``torch.as_tensor``; shapes ``(batch,)`` and ``(batch, 1)`` are
                both accepted.

        Returns:
            dict with
              * ``'output'``: the logits, shape ``(batch, 1)`` without targets and
                ``(batch,)`` with targets;
              * ``'loss'``: scalar loss tensor, or ``None`` without targets.
        """
        embedded = self.embedder(tokens)
        extracted = self.extractor(embedded['output'], embedded['mask'])

        logits = self.classifier(extracted['output'])
        loss = None

        if targets is not None:
            logits = logits.view(-1)
            # Bring the labels to the logits' device and dtype and flatten them,
            # so the loss never fails on a device or shape mismatch.
            targets = torch.as_tensor(targets, dtype=logits.dtype, device=logits.device).reshape(-1)
            loss = self.xentropy(logits, targets)

        return {'output': logits, 'loss': loss}

In [11]:
# Assemble the full model from the embedding layer and the LSTM extractor built above.
lstm_model = LSTMClassifier(embedder=embedder, extractor=lstm_layer)
lstm_model

LSTMClassifier(
  (embedder): WordEmbedder(
    (embeddings): Embedding(21695, 100)
  )
  (extractor): LSTMLayer(
    (lstm): LSTM(100, 64, batch_first=True)
  )
  (classifier): Linear(in_features=64, out_features=1, bias=True)
  (xentropy): BCEWithLogitsLoss()
)

In [12]:
from src.utilities import count_params
    
# Parameter counts per component and for the whole model, one line each.
print(
    "Embd params:  {:,}".format(count_params(lstm_model.embedder)),
    "LSTM params:  {:,}".format(count_params(lstm_model.extractor)),
    "Clfr params:  {:,}".format(count_params(lstm_model.classifier)),
    "Total params: {:,}".format(count_params(lstm_model)),
    sep="\n"
)

Embd params:  2,169,500
LSTM params:  42,496
Clfr params:  65
Total params: 2,212,061


In [13]:
# Smoke test: two whitespace-tokenised sentences with labels 0 and 1.
dummy_tokens = [sentence.split() for sentence in ('this is bad', 'this is not bad !')]
dummy_labels = torch.tensor([0, 1], dtype=torch.long)

# Passing labels makes the model return the loss alongside the logits.
result = lstm_model(dummy_tokens, targets=dummy_labels)
result

{'loss': tensor(0.6909, grad_fn=<BinaryCrossEntropyWithLogitsBackward>),
 'output': tensor([0.0624, 0.0736], grad_fn=<ViewBackward>)}

In [14]:
from src.utilities import process_logits

# Convert the logits of the dummy batch with the project's process_logits helper.
preds, probs = process_logits(result['output'])

print("Preds: {}".format(preds), "Probs: {}".format(probs), sep="\n")

Preds: [1.0, 1.0]
Probs: [0.5155900716781616, 0.5183917880058289]
