In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
import re
prog = re.compile('[A-Za-z0-9]+')

path = 'aclImdb/train/unsup/'

def load_imdb_data(path, seq_len=40, gen=False):
    """
    Loads IMDB 50k unsupervised reviews
    
    path: str, path to the unsupervised reviews data
    seq_len: minimum length of sequence
    gen: if True all the reviews will be length of seq_len
    """
    reviews = []
    
    for i in tqdm(range(50000)):
        with open(path + f'{i}_0.txt', 'r') as f:
            rev = f.read()
        
        rev = rev.replace(' br ', ' ')
        if len(prog.findall(rev)) >= seq_len:
            if gen:
                reviews.append(['<sos>'] + prog.findall(rev)[:seq_len])
                if len(prog.findall(rev)[:seq_len]) == 39:
                    print(len(rev.split()))
            else:
                reviews.append(['<sos>'] + prog.findall(rev))
    return reviews

In [3]:
reviews = load_imdb_data(path, gen=False)

100%|██████████| 50000/50000 [00:09<00:00, 5266.27it/s]


In [4]:
reviews_40 = load_imdb_data(path, gen=True)

100%|██████████| 50000/50000 [00:11<00:00, 4260.07it/s]


In [5]:
def vocab_idxs(data):
    """
    Returns vocab, word2id and id2word, where
    vocab: set of all words in data
    word2id: dictionary that maps words on idxs
    id2word: inverse dictionary to word2id
    
    data: 
    type: list
    format: list of lists of words
    """
    vocab = set()
    for sentence in tqdm(data):
        for s in sentence:
            vocab.add(s)
    word2id = {k:v for v, k in enumerate(vocab, 1)}
    word2id['<m>'] = 0
    id2word = {v:k for k, v in word2id.items()}
    return vocab, word2id, id2word

In [6]:
vocab, word2id, id2word = vocab_idxs(reviews)

100%|██████████| 49668/49668 [00:01<00:00, 25725.00it/s]


In [33]:
def sents2matrix(data, word2id, seq_len=41):
    """
    Returns a matrix of integers
    where each row represents a sentence
    
    data:
    type: list
    format: list of lists of words of the seq_len length
    example: [['hello', 'world'], ['nice', 'day']]
    ----------------------------------------------------
    
    word2id: dict that maps word on idxs
    ----------------------------------------------------
    
    seq_len: len of lists contained in data
    """
    
    matrix = np.zeros((len(data), seq_len))
    for i in tqdm(range(len(data))):
        matrix[i] = np.array([int(word2id[word]) for word in data[i]])
    return np.array(matrix)

In [34]:
matrix = sents2matrix(reviews_40, word2id)

100%|██████████| 49668/49668 [00:00<00:00, 55766.00it/s]


In [43]:
torch.Tensor(matrix)

tensor([[ 82807., 123392., 120229.,  ..., 128217.,  65581.,  18679.],
        [ 82807.,  71281., 123869.,  ...,  15339.,  49196.,  73518.],
        [ 82807.,  94551.,  55759.,  ...,  90901.,  73820., 113699.],
        ...,
        [ 82807., 123392.,   7545.,  ...,  84054.,  42275.,  57089.],
        [ 82807.,  82950.,  71197.,  ..., 112087.,  35728.,  42114.],
        [ 82807.,  82084.,  37982.,  ...,  18679., 107110.,  69555.]])

In [59]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# create Tensor datasets
train_data = TensorDataset(torch.LongTensor(matrix))

# dataloaders
batch_size = 64

# make sure the SHUFFLE your training data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

In [65]:
sample_x = next(iter(train_loader))[0]

In [66]:
import torch.nn as nn

In [67]:
vocab_len = len(vocab) + 1
emb = nn.Embedding(vocab_len, embedding_dim=5)

In [68]:
sample_x

tensor([[ 82807,  89939, 112087,  ..., 114825,  49196,  91807],
        [ 82807,  34319,  24766,  ...,  37694,  34859,  49702],
        [ 82807,  53540,  65552,  ...,  18679,  94425,  84054],
        ...,
        [ 82807, 123392,  36531,  ..., 118881,  14797,  65247],
        [ 82807,  82084,  10544,  ..., 120161,  16404, 126683],
        [ 82807, 123392,  31758,  ...,  84054,  44568,  73518]])

In [71]:
e = emb(sample_x)

In [72]:
lstm = nn.LSTM(input_size=5, hidden_size=15)

In [75]:
o, h = lstm(e)

In [78]:
o.view(-1, 15)

tensor([[-0.0578, -0.0562,  0.2077,  ...,  0.0768,  0.2422, -0.2429],
        [ 0.1235, -0.0814,  0.0661,  ..., -0.0843,  0.0201, -0.0285],
        [ 0.1894, -0.0769,  0.0841,  ..., -0.0043,  0.0642, -0.0264],
        ...,
        [ 0.1291, -0.1846,  0.1610,  ..., -0.0525,  0.0530, -0.0905],
        [ 0.2648, -0.2240,  0.1628,  ..., -0.1315,  0.0554, -0.1148],
        [ 0.0044, -0.1702,  0.1952,  ...,  0.1190,  0.2143, -0.3363]],
       grad_fn=<ViewBackward>)

In [104]:
class PretrainedLSTM(nn.Module):
    
    def __init__(self, hidden_dim, output_size, embedding_dim, vocab_size, n_layers=2, train_on_gpu=False):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.n_layers = n_layers
        self.train_on_gpu = train_on_gpu
        
        # Embeddings
        self.embeddings = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
        
        # LSTM
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, )
        
        # fully-connected layes
        self.fc = nn.Sequential(nn.Linear(hidden_dim, 256),
                                nn.ReLU(),
                                nn.Linear(256, 128),
                                nn.ReLU(),
                                nn.Linear(128, vocab_size))
      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''
                
        x = self.embeddings(x)
        
        output, hidden = self.lstm(x)
        output = output.view(-1, self.hidden_dim)
        
        output = self.fc(output)
        
        return output, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (self.train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

In [107]:
hidden_dim = 256
output_size = len(vocab)
embedding_dim = 128
vocab_size = len(vocab)

model = PretrainedLSTM(hidden_dim, output_size, embedding_dim, vocab_size)

In [108]:
h = model.init_hidden(64)

In [110]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch import optim
from IPython.display import clear_output

In [114]:
def train_epoch(model, optimizer, train_loader):
    criterion = nn.CrossEntropyLoss()
    loss_log = []
    model.train()
    for sequence in train_loader:
        optimizer.zero_grad()
        X = sequence[0][:, :-1]
        y = sequence[0][:, 1:]
        hidden = model.init_hidden(X.size(0))
        output, hidden = model(X, hidden)
        criterion = nn.CrossEntropyLoss()
        loss = criterion(output, y.contiguous().view(-1))
        loss.backward()
        optimizer.step()
        loss_log.append(loss.item())
    return loss_log   

def test(model, test_loader):
    criterion = nn.CrossEntropyLoss()
    loss_log = []
    model.eval()
    for batch in test_batches:  
        X = sequence[0][:, :-1]
        y = sequence[0][:, 1:]
        hidden = model.init_hidden(X.size(0))
        output, hidden = model(X, hidden)
        loss = criterion(output, y.contiguous().view(-1))
        loss_log.append(loss.item())
    return loss_log

def plot_history(train_history, title='loss'):
    plt.figure()
    plt.title('{}'.format(title))
    plt.plot(train_history, label='train', zorder=1)    
    plt.xlabel('train steps')
    plt.legend(loc='best')
    plt.grid()
    plt.show()
    
def train(model, opt, n_epochs, train_loader):
    train_log = []
    
    total_steps = 0
    for epoch in range(n_epochs):
        train_loss = train_epoch(model, opt, train_loader)
        train_log.extend(train_loss)
        total_steps += len(train_loader)
        
        train_log.extend(train_loss)
        
        clear_output()
        plot_history(train_log)

In [116]:
opt = torch.optim.Adam(model.parameters(), lr=1e-4)
train(model, opt, 20, train_loader)

torch.Size([64, 40, 256])
torch.Size([64, 40, 256])


KeyboardInterrupt: 