In [4]:
import torchtext, random, torch

import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn import init

import numpy as np
from tqdm import tqdm_notebook

# GPU flag read by the model and trainer code in the cells below.
use_cuda = torch.cuda.is_available()
# Device index handed to the torchtext iterators: 0 when CUDA is available, -1 otherwise.
device = 0 if use_cuda else -1

TEXT = torchtext.data.Field()
# NOTE(review): the test split reads "valid.txt", the same file as the
# validation split - confirm whether a separate test file was intended.
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path="../data",
    train="train.txt",
    validation="valid.txt",
    test="valid.txt",
    text_field=TEXT,
)
# Build the vocabulary from the full training split (no size cap).
TEXT.build_vocab(train)
TEXT.vocab.load_vectors('glove.840B.300d')
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, val, test),
    batch_size=10,
    device=device,
    bptt_len=32,
    repeat=False,
)

In [5]:
class LSTMLanguageModel(nn.Module):
    """LSTM language model on top of pretrained word embeddings.

    Token ids are embedded, passed through an LSTM, and every time step is
    projected onto the vocabulary; ``forward`` returns log-probabilities.
    """

    def __init__(self, hidden_dim = 100, TEXT = TEXT, batch_size = 10):
        """Build the embedding, LSTM and output layers.

        Args:
            hidden_dim: size of the LSTM hidden state.
            TEXT: field whose ``vocab.vectors`` matrix supplies the vocabulary
                size, the embedding width and the initial embedding weights.
            batch_size: batch size used by ``init_hidden`` when it builds the
                initial recurrent state.
        """
        super(LSTMLanguageModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size

        vocab_size, embedding_dim = TEXT.vocab.vectors.shape
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Start from the pretrained vectors loaded into the vocabulary.
        self.embeddings.weight.data.copy_(TEXT.vocab.vectors)

        # NOTE(review): nn.LSTM applies `dropout` only between stacked layers,
        # so with the default single layer it has no effect. Kept so the
        # constructed module is unchanged.
        self.lstm = nn.LSTM(input_size = embedding_dim, hidden_size = self.hidden_dim, dropout = 0.50)
        self.linear = nn.Linear(in_features = self.hidden_dim, out_features = vocab_size)
        self.drop = nn.Dropout(p = 0.50)

        self.init_lstm_params_uniformly(bound = 0.04)

    def init_lstm_params_uniformly(self, bound):
        """Re-initialise every LSTM weight matrix from U(-bound, bound).

        Only parameters whose name contains 'weight' are touched; biases keep
        their default initialisation.
        """
        for layer_params in self.lstm._all_weights:
            for param_name in layer_params:
                if 'weight' in param_name:
                    init.uniform(getattr(self.lstm, param_name), -bound, bound)

    def init_hidden(self):
        """Return a zero-filled ``(h_0, c_0)`` tuple for a fresh sequence.

        Each tensor has shape
        ``(num_directions * num_layers, self.batch_size, self.hidden_dim)``.
        The state is placed on the GPU exactly when the model's own parameters
        are there, so it always matches the weights it will be combined with.
        """
        num_directions = 2 if self.lstm.bidirectional else 1
        state_shape = (num_directions * self.lstm.num_layers, self.batch_size, self.hidden_dim)
        # Follow the model's weights rather than a notebook-level flag.
        on_gpu = next(self.parameters()).is_cuda

        state = []
        for _ in range(2):  # one tensor for h_0, one for c_0
            zeros = Variable(torch.zeros(*state_shape))
            state.append(zeros.cuda() if on_gpu else zeros)
        return tuple(state)

    def detach_hidden(self, hidden):
        """Detach every tensor in ``hidden`` so backprop stops at the batch boundary."""
        return tuple([h.detach() for h in hidden])

    def forward(self, x, hidden, train = True):
        """Predict next-token log-probabilities for every position of ``x``.

        Args:
            x: token ids; the LSTM output is indexed with ``size(2)``, so the
                input must produce a 3-D output, i.e. ``(seq_len, batch)`` ids.
            hidden: ``(h, c)`` state to start from.
            train: when true, the dropout module is applied to the embeddings
                and to the LSTM outputs; when false it is skipped entirely.

        Returns:
            A pair ``(log_probs, hidden)``: ``log_probs`` has one row per
            (time step, batch element) and one column per vocabulary entry;
            ``hidden`` is the detached final state, ready to initialise the
            next call.
        """
        embedded = self.embeddings(x)
        if train:
            embedded = self.drop(embedded)

        lstm_output, new_hidden = self.lstm(embedded, hidden)
        # Flatten the leading dimensions so the linear layer sees one row per token.
        flattened = lstm_output.view(-1, lstm_output.size(2))
        if train:
            flattened = self.drop(flattened)

        decoded = self.linear(flattened)
        log_probs = F.log_softmax(decoded, dim = 1)

        return log_probs, self.detach_hidden(new_hidden)
    
class Trainer:
    """Trains, validates and runs predictions with an LSTM language model."""

    def __init__(self, train_iter, val_iter):
        """Store the iterators used for training and for validation."""
        self.train_iter = train_iter
        self.val_iter = val_iter

    def string_to_batch(self, string):
        """Convert a whitespace-separated string into a 1-D tensor of vocabulary ids.

        The result is moved to the GPU when the notebook-level ``use_cuda``
        flag is set.
        """
        ids = [self.word_to_id(word) for word in string.split()]
        id_tensor = Variable(torch.LongTensor(ids))
        return id_tensor.cuda() if use_cuda else id_tensor

    def word_to_id(self, word, TEXT = TEXT):
        """Look up ``word`` in the vocabulary's string-to-index table."""
        return TEXT.vocab.stoi[word]

    def batch_to_input(self, batch):
        """Split the n-grams of ``batch`` into context ids ``x`` and target ids ``y``.

        ``x`` holds all but the last id of each n-gram and ``y`` the last id.
        Both are moved to the GPU when ``use_cuda`` is set.
        """
        ngrams = self.collect_batch_ngrams(batch)
        x = Variable(torch.LongTensor([ngram[:-1] for ngram in ngrams]))
        y = Variable(torch.LongTensor([ngram[-1] for ngram in ngrams]))
        if use_cuda:
            return x.cuda(), y.cuda()
        return x, y

    def collect_batch_ngrams(self, batch, n = 3):
        """Flatten ``batch.text`` and return every run of ``n`` consecutive ids as a tuple."""
        data = batch.text.view(-1).data.tolist()
        return [tuple(data[idx:idx + n]) for idx in range(0, len(data) - n + 1)]

    def train_model(self, model, num_epochs):
        """Train ``model`` with Adam for ``num_epochs`` epochs, then write predictions.

        After each epoch the mean training loss, training perplexity and
        validation perplexity are printed. When training finishes,
        ``write_kaggle`` is called with its default input file.
        """
        parameters = filter(lambda p: p.requires_grad, model.parameters())
        optimizer = torch.optim.Adam(params = parameters, lr=1e-3)
        criterion = nn.NLLLoss()

        for epoch in tqdm_notebook(range(num_epochs)):

            epoch_loss = []
            hidden = model.init_hidden()
            model.train()

            # Use the iterator this trainer was constructed with, not a
            # notebook-level global of the same name.
            for batch in tqdm_notebook(self.train_iter):
                x, y = batch.text, batch.target.view(-1)
                if use_cuda: x, y = x.cuda(), y.cuda()

                optimizer.zero_grad()

                # The returned hidden state is detached and seeds the next batch.
                y_pred, hidden = model.forward(x, hidden, train = True)

                loss = criterion(y_pred, y)
                loss.backward()

                # Only the LSTM's gradients are clipped.
                torch.nn.utils.clip_grad_norm(model.lstm.parameters(), 1)

                optimizer.step()

                # NOTE(review): `loss.data[0]` is the pre-0.4 PyTorch way to
                # read a scalar loss; newer releases need `loss.item()`.
                epoch_loss.append(loss.data[0])

            model.eval()
            mean_loss = np.mean(epoch_loss)
            train_ppl = np.exp(mean_loss)
            val_ppl = self.validate(model)

            print('Epoch {0} | Loss: {1} | Train PPL: {2} | Val PPL: {3}'.format(epoch+1, mean_loss, train_ppl,  val_ppl))

        print('Model trained.')
        self.write_kaggle(model)
        print('Output saved.')

    def validate(self, model):
        """Return the perplexity of ``model`` on the validation iterator.

        Perplexity is ``exp`` of the mean per-batch NLL loss. The recurrent
        state is carried from one batch to the next, as in training, instead
        of scoring every batch from a zero state.
        """
        criterion = nn.NLLLoss()
        hidden = model.init_hidden()
        aggregate_loss = []
        for batch in self.val_iter:
            y_p, hidden = model.forward(batch.text, hidden, train = False)
            y_t = batch.target.view(-1)

            loss = criterion(y_p, y_t)
            aggregate_loss.append(loss.data[0])
        val_ppl = np.exp(np.mean(aggregate_loss))
        return val_ppl

    def predict_sentence(self, string, model, TEXT = TEXT):
        """Return the 20 most likely next words after ``string``, space-separated.

        The last four characters of ``string`` are dropped before tokenising
        (presumably a trailing " ___" blank marker - confirm against the
        input file). Words are ordered from most to least likely.
        """
        string = string[:-4]

        # The model sizes its initial state from `batch_size`; switch to a
        # single sequence for the prediction and restore the old value after,
        # so later training or inspection is not left with batch_size == 1.
        previous_batch_size = model.batch_size
        model.batch_size = 1
        try:
            hidden = model.init_hidden()
            # forward() reads lstm_output.size(2), so the ids must be shaped
            # (seq_len, batch=1) rather than the 1-D tensor string_to_batch returns.
            x = self.string_to_batch(string).view(-1, 1)
            log_probs, _ = model.forward(x, hidden, train = False)
        finally:
            model.batch_size = previous_batch_size

        # Last row = distribution after the final input word.
        argsort_ids = np.argsort(log_probs[-1].data.tolist())
        out_ids = argsort_ids[-20:][::-1]
        out_words = ' '.join([TEXT.vocab.itos[out_id] for out_id in out_ids])
        return out_words

    def write_kaggle(self, model, input_file = 'input.txt'):
        """Predict for every line of ``input_file`` and write 'lstm_initialized_output.txt'.

        The output starts with the header ``id,word`` followed by one
        ``<index>,<predicted words>`` row per input line.
        """
        # Context manager so the input file is closed deterministically.
        with open(input_file, 'r') as f:
            inputs = f.read().splitlines()
        outputs = [self.predict_sentence(sentence, model) for sentence in inputs]
        with open('lstm_initialized_output.txt', 'w') as f:
            f.write('id,word')
            for idx, line in enumerate(outputs):
                f.write('\n')
                f.write(str(idx) + ',')
                f.write(line)

In [6]:
# Build the 1024-unit model, move it to the GPU when one is available,
# and train it for 10 epochs.
model = LSTMLanguageModel(hidden_dim=1024)
if use_cuda:
    model = model.cuda()
trainer = Trainer(train_iter, val_iter)
trainer.train_model(model, num_epochs=10)




KeyboardInterrupt: 

In [8]:
# Debug cell: replay the training-path forward pass step by step on one batch
# so the intermediate tensors can be inspected.
batch = next(iter(train_iter))
x = batch.text
hidden = model.init_hidden()

embedded = model.embeddings(x)
embedded = model.drop(embedded)

lstm_output, hdn = model.lstm(embedded, hidden)
reshaped = lstm_output.view(-1, lstm_output.size(2))
# At notebook level `train` is the training dataset, not a boolean flag, so
# the dropout module is applied unconditionally here (as it is for `embedded`).
dropped = model.drop(reshaped)

decoded = model.linear(dropped)

In [19]:
# Inspect the LSTM outputs after flattening to 2-D (rows = all leading positions, columns = hidden units).
reshaped

Variable containing:
-1.3221e-02 -2.7291e-02  1.4944e-01  ...   5.5913e-02  9.4392e-02  7.7707e-02
 6.4081e-02 -1.0069e-01 -8.2317e-02  ...   1.4140e-01  9.4296e-03  1.4276e-01
 5.8650e-02  6.7410e-02 -1.2782e-01  ...   1.0673e-01  3.2066e-02  7.1340e-02
                ...                   ⋱                   ...                
 8.4197e-01 -1.3047e-01 -8.0163e-01  ...   9.4175e-01 -8.4742e-01  8.5667e-01
 9.0017e-01 -1.0958e-01 -8.2342e-01  ...   9.4636e-01 -8.0056e-01  8.3597e-01
 8.6568e-01 -1.4711e-01 -7.2957e-01  ...   9.4486e-01 -8.0473e-01  8.1715e-01
[torch.FloatTensor of size 320x1024]