In [1]:
import os
import sys
import torch

PROJ_DIR = os.path.join(os.environ['WORKSPACE'], 'tutorial/')

if PROJ_DIR not in sys.path:
    sys.path.append(PROJ_DIR)

# The IMDB Dataset

In [2]:
!cd .. \
&& [ ! -f aclImdb_v1.tar.gz ] \
&& wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz \
&& tar -xzf  aclImdb_v1.tar.gz || echo "Data (most likely) already downloaded"

Data (most likely) already downloaded


In [3]:
import random
import copy
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset

random.seed(2)

MAXLEN = 64

def read_files(datadir, sentiment, maxlen):
    sent_dir = os.path.join(datadir, sentiment)
    
    tokens = [word_tokenize(open(os.path.join(sent_dir, sent_file)).read())[:maxlen]
              for sent_file in os.listdir(sent_dir)[:1500]
              if sent_file.endswith('.txt')]
    labels = [sentiment] * len(tokens)
    
    return tokens, labels
    
    
def shuffle(tokens, labels):
    z = list(zip(tokens, labels))
    random.shuffle(z)
    return zip(*z)
    
    
class IMDBDatset(Dataset):
    def __init__(self, datadir, maxlen):
        assert os.path.exists(datadir), datadir
        
        self.tokens = []
        self.labels = []

        pos_tokens, pos_labels = read_files(datadir, 'pos', maxlen)
        neg_tokens, neg_labels = read_files(datadir, 'neg', maxlen)
        
        self.tokens.extend(pos_tokens + neg_tokens)
        self.labels.extend(pos_labels + neg_labels)
        
        self.tokens, self.labels = shuffle(self.tokens, self.labels)

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, item):
        return self.tokens[item], self.labels[item]


train_dataset = IMDBDatset(os.path.join(PROJ_DIR, 'aclImdb/train'), MAXLEN)
test_dataset = IMDBDatset(os.path.join(PROJ_DIR, 'aclImdb/test'), MAXLEN)

dev_dataset = copy.deepcopy(train_dataset)
dev_dataset.tokens = dev_dataset.tokens[-1000:]
dev_dataset.labels = dev_dataset.labels[-1000:]

train_dataset.tokens = train_dataset.tokens[:2000]
train_dataset.labels = train_dataset.labels[:2000]

In [4]:
len(train_dataset), len(dev_dataset), len(test_dataset)

(2000, 1000, 3000)

In [5]:
for i in range(len(train_dataset)):
    tokens, label = train_dataset[i]
    if len(tokens) <= 32:
        print(i, label)
        print(tokens)

1430 pos
['My', 'favorite', 'movie', '.', 'What', 'a', 'great', 'story', 'this', 'really', 'was', '.', 'I', "'d", 'just', 'like', 'to', 'be', 'able', 'to', 'buy', 'a', 'copy', 'of', 'it', 'but', 'this', 'does', 'not', 'seem', 'possible', '.']


In [6]:
tokens, labels = train_dataset[1430-1:1430+1]
for t in tokens:
    print(t, '\n')

['This', 'is', 'the', 'least', 'scary', 'film', 'i', 'have', 'ever', 'seen', '.', 'How', 'the', 'blob', 'manages', 'to', 'eat', 'anyone', 'is', 'the', 'biggest', 'mystery', 'of', 'the', 'film', '.', 'The', 'blob', 'moves', 'so', 'slowly', 'that', 'an', 'o.a.p', 'in', 'a', 'zimmerframe', 'could', 'escape', 'it', '.', 'The', 'blob', 'has', 'a', 'large', 'slice', 'of', 'luck', 'coming', 'across', 'a', 'typical', 'horror', 'film', 'woman', 'who', 'instead', 'of', 'running', 'away', 'stands', 'still', 'for'] 

['My', 'favorite', 'movie', '.', 'What', 'a', 'great', 'story', 'this', 'really', 'was', '.', 'I', "'d", 'just', 'like', 'to', 'be', 'able', 'to', 'buy', 'a', 'copy', 'of', 'it', 'but', 'this', 'does', 'not', 'seem', 'possible', '.'] 



# Word Embeddings

In [7]:
!cd .. && [ ! -f glove.6B.zip ] \
&& wget http://nlp.stanford.edu/data/glove.6B.zip \
&& mkdir glove.6B \
&& tar -xzf glove.6B.zip -C glove.6B || echo "GloVe vectors (most likely) already downloaded"

GloVe vectors (most likely) already downloaded


In [8]:
def flatten(posts):
    return [t for tokens in posts for t in tokens]

vocab = set(flatten(train_dataset.tokens + dev_dataset.tokens))
len(vocab)

18757

In [14]:
import numpy as np
import torch.nn as nn

class WordEmbedder(nn.Module):
    def __init__(self, vocab, glove_file):
        super(WordEmbedder, self).__init__()
        assert os.path.exists(glove_file) and glove_file.endswith('.txt'), glove_file
        
        self.emb_dim = None
        
        self.PAD_TOKEN = '<PAD>'
        self.UNK_TOKEN = '<UNK>'
        
        index_to_word = [self.PAD_TOKEN, self.UNK_TOKEN]
        index_to_vect = [None, None]
        
        with open(glove_file, 'r') as fp:
            for line in fp:
                line = line.split()
                
                if line[0] not in vocab:
                    continue
                
                w = line[0]
                v = np.array([float(value) for value in line[1:]])
                
                if self.emb_dim is None:
                    self.emb_dim = v.shape[0]
            
                index_to_word.append(w)
                index_to_vect.append(v)
                
        index_to_vect[0] = np.zeros(self.emb_dim)
        index_to_vect[1] = np.mean(index_to_vect[2:], axis=0)
    
        self.embeddings = torch.from_numpy(np.array(index_to_vect)).float()
        self.embeddings = nn.Embedding.from_pretrained(self.embeddings, freeze=False)
        
        self.index_to_word = {i: w for i, w in enumerate(index_to_word)}
        self.word_to_index = {w: i for i, w in self.index_to_word.items()}
    
    def forward(self, samples):
        pad_ix = self.word_to_index[self.PAD_TOKEN]
        unk_ix = self.word_to_index[self.UNK_TOKEN]
        
        maxlen = max([len(s) for s in samples])
        
        encoded = [[self.word_to_index.get(token, unk_ix) for token in tokens] for tokens in samples]
        masks = torch.zeros(len(samples), maxlen).long()
        
        # Padding and masking
        for i in range(len(encoded)):
            masks[i, :len(encoded[i])] = 1
            encoded[i] += [pad_ix] * max(0, (maxlen - len(encoded[i])))
        
        encoded = torch.tensor(encoded).long()
        
        if torch.cuda.is_available():
            encoded = encoded.cuda()
            masks = mask.cuda()
        
        result = {
            'output': self.embeddings(encoded),
            'mask': masks,
            'encoded': encoded
        }
        
        return result

In [15]:
embedder = WordEmbedder(vocab, os.path.join(PROJ_DIR, 'glove.6B/glove.6B.100d.txt'))

tokens, labels = train_dataset[1430-1:1430+1]
embedder_result = embedder(tokens)

print("Encoded:\n{}".format(embedder_result['encoded']))
print("Mask:\n{}".format(embedder_result['mask']))

Encoded:
tensor([[   1,   15,    2,  317, 5226,  298,   42,   34,  587,  486,    4,    1,
            2, 9139, 3938,    6, 2646, 1293,   15,    2,  771, 3523,    5,    2,
          298,    4,    1, 9139, 1951,   98, 2789,   13,   30,    1,    8,    9,
            1,   94, 2272,   21,    4,    1, 9139,   32,    9,  395, 5317,    5,
         3616,  688,  477,    9, 2671, 3808,  298,  692,   39,  682,    5,  702,
          389, 2017,  144,   11],
        [   1, 2104,  873,    4,    1,    9,  331,  472,   38,  529,   16,    4,
            1,  987,  117,  114,    6,   31,  593,    6,  857,    9, 3109,    5,
           21,   35,   38,  246,   37, 1556,  498,    4,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]])
Mask:
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1

# LSTM model

In [16]:
class LSTMLayer(nn.Module):
    def __init__(self,
                 input_dim,
                 hidden_dim,
                 bidirectional=False,
                 num_layers=1,
                 drop_prob=0.3):
        super(LSTMLayer, self).__init__()

        self.input_dim = input_dim
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim

        self.lstm = nn.LSTM(self.input_dim,
                            self.hidden_dim // 2 if self.bidirectional else self.hidden_dim,
                            num_layers=self.num_layers,
                            bidirectional=self.bidirectional,
                            dropout=drop_prob if self.num_layers > 1 else 0,
                            batch_first=True)

    def forward(self, vectors, mask):
        batch_size = vectors.size(0)
        max_length = vectors.size(1)
        lengths = mask.sum(-1)

        lstm_out, _ = self.lstm(vectors)  # (batch, seq_len, num_directions * hidden_size)

        assert len(lstm_out.size()) == 3
        assert lstm_out.size(0) == batch_size, "The batch size should be the first dimension of the LSMT output"
        assert lstm_out.size(1) == max_length, "The sequence length should be the second dimension"
        assert lstm_out.size(2) == self.hidden_dim

        if self.bidirectional:
            # Separate the directions of the LSTM
            lstm_out = lstm_out.view(batch_size, max_length, 2, self.hidden_dim // 2)

            # Pick up the last hidden state per direction
            fw_last_hn = lstm_out[range(batch_size), lengths - 1, 0]  # (batch, hidden // 2)
            bw_last_hn = lstm_out[range(batch_size), 0, 1]            # (batch, hidden // 2)

            last_hn = torch.cat([fw_last_hn, bw_last_hn], dim=1)      # (batch, hidden // 2) -> (batch, hidden)
        else:
            last_hn = lstm_out[range(batch_size), lengths - 1]        # (batch, hidden)

        result = {
            'last_output': last_hn,
            'outputs': lstm_out
        }
            
        return result    

In [17]:
lstm_layer = LSTMLayer(embedder.emb_dim, 64)
lstm_result = lstm_layer(embedder_result['output'], embedder_result['mask'])

In [20]:
lstm_result['last_output']

tensor([[ 0.3110, -0.1096, -0.0936, -0.0133,  0.0684,  0.1158,  0.0070, -0.0261,
          0.3608, -0.0616, -0.0180, -0.1673,  0.0874,  0.0647, -0.1276,  0.2160,
          0.1235,  0.0969,  0.0941,  0.0459,  0.0888,  0.1548,  0.2680,  0.0922,
         -0.2873, -0.0229, -0.0592, -0.1303, -0.0644,  0.0026, -0.1048,  0.1234,
         -0.0633,  0.2180,  0.0135,  0.0428, -0.1438, -0.0828,  0.1079,  0.1771,
          0.3920, -0.3413, -0.0553,  0.2152, -0.2983, -0.3738, -0.1413,  0.0825,
         -0.0946, -0.0011,  0.0431,  0.2841, -0.0211, -0.1301,  0.0652, -0.1148,
          0.0383,  0.0912,  0.1139, -0.0533, -0.0022,  0.1915, -0.1320,  0.1496],
        [ 0.2932, -0.0846, -0.1677,  0.0756, -0.0269,  0.1424, -0.0061,  0.0374,
          0.3842, -0.0945, -0.0394, -0.1346,  0.0454, -0.0832, -0.1512,  0.2612,
          0.0763, -0.0879,  0.0513,  0.0515, -0.1689,  0.1486,  0.1977,  0.1775,
         -0.3069, -0.0157,  0.1164, -0.1034, -0.0488, -0.0016, -0.0595,  0.0881,
          0.0191,  0.0950, 