In [44]:
%matplotlib inline

import os
import random
import time

import numpy as np
from matplotlib import pyplot as plt
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
   
from tests import test_prediction, test_generation
import pdb

In [45]:
# load all that we need

dataset = np.load('../dataset/wiki.train.npy', allow_pickle=True)
devset = np.load('../dataset/wiki.valid.npy', allow_pickle=True)
fixtures_pred = np.load('../fixtures/prediction.npz')  # dev
fixtures_gen = np.load('../fixtures/generation.npy')  # dev
fixtures_pred_test = np.load('../fixtures/prediction_test.npz')  # test
fixtures_gen_test = np.load('../fixtures/generation_test.npy')  # test
vocab = np.load('../dataset/vocab.npy')

In [None]:
# import pandas as pd
# readable_vocab = pd.read_csv('../dataset/vocab.npy', encoding = "utf-8")
# readable_vocab.sample(10)

In [46]:
len(dataset[0][:7])

7

In [47]:
class WikiTextDataset(Dataset):
    
    def __init__(self, data):
        self.data = [torch.from_numpy(data) for data in dataset]
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, key):
        return self.data[key]

In [134]:
# data loader

class LanguageModelDataLoader(DataLoader):
    """
        TODO: Define data loader logic here
    """
    def __init__(self, dataset, batch_size:int, seq_len:int, shuffle:bool=True, batch_first=True):
#         super(LanguageModelDataLoader, self).__init__(dataset)
        data = torch.cat([torch.from_numpy(record) for record in dataset])
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.start = 0
        self.end = self.start + self.batch_size
        self.seq_len = seq_len
        self.batch_first = batch_first
#         indices = list(range(len(dataset)))
        if shuffle:
            random.shuffle(data)
            
        # flatten the dataset and reshape it by the sequence length
        labels = torch.cat([data[1:], data[0:1]])
        valid_tensors = (len(data) - len(data) % seq_len )
        shape = (int(valid_tensors / seq_len), seq_len)
        reshaped_input = torch.reshape(data[:valid_tensors], shape)
        reshaped_labels = torch.reshape(labels[:valid_tensors], shape)
        new_indices = len(reshaped_labels)
        new_indices = list(range(new_indices))
        random.shuffle(new_indices)
        data = [reshaped_input[i] for i in new_indices]
        labels = [reshaped_labels[i] for i in new_indices]
        
        # build batches
        
        num_batches = len(data) / self.batch_size
        batch_indices = np.arange(start=0, stop=len(data), step=self.batch_size)
        self.inputs = []
        self.labels = []
        start = 0
        for end in batch_indices[1:]:
            self.inputs.append(
                torch.cat(data[start:end]).reshape(
                    self.batch_size,self.seq_len
                )
            )
            self.labels.append(
                torch.cat(labels[start:end]).reshape(
                    self.batch_size,self.seq_len)
            )
            start += self.batch_size


    def __iter__(self):
        # concatenate your articles and build into batches
        i = -1
        while i < len(self.inputs) - 1:
            if self.batch_first:
                 yield (self.inputs[i].T, self.labels[i].T)
            else:
                 yield (self.inputs[i], self.labels[i])
            
    
        
        

In [135]:
self = LanguageModelDataLoader(dataset=dataset[:30],seq_len=5, batch_size=10)

In [123]:
# # torch.cat(self.inputs[0]).reshape((int(len(self.inputs[0])/5),5))
# torch.cat(self.inputs[0]).reshape(self.batch_size,self.seq_len)

In [138]:
for i in self:
    print(i[0].shape)
    break

torch.Size([5, 10])


In [78]:
class Foo:
    def __init__(self):
        self.data = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    def __iter__(self):
        print("Whatever we do here executes once")
        i = -1
        while i < len(self.data) - 1:
            i += 1
            yield self.data[i]

In [None]:
wikiTextDataset = WikiTextDataset(data=dataset)

In [None]:
# model

class LanguageModel(nn.Module):
    """
        TODO: Define your model here
    """
    def __init__(self, vocab_size):
        super(LanguageModel, self).__init__()
        self.vocab_size = vocab_size
        
        raise NotImplemented


    def forward(self, x):
        # Feel free to add extra arguments to forward (like an argument to pass in the hiddens)
        raise NotImplemented

    


In [None]:
# model trainer

class LanguageModelTrainer:
    def __init__(self, model, loader, max_epochs=1, run_id='exp'):
        """
            Use this class to train your model
        """
        # feel free to add any other parameters here
        self.model = model
        self.loader = loader
        self.train_losses = []
        self.val_losses = []
        self.predictions = []
        self.predictions_test = []
        self.generated_logits = []
        self.generated = []
        self.generated_logits_test = []
        self.generated_test = []
        self.epochs = 0
        self.max_epochs = max_epochs
        self.run_id = run_id
        
        # TODO: Define your optimizer and criterion here
        self.optimizer = None
        self.criterion = None

    def train(self):
        self.model.train() # set to training mode
        epoch_loss = 0
        num_batches = 0
        for batch_num, (inputs, targets) in enumerate(self.loader):
            epoch_loss += self.train_batch(inputs, targets)
        epoch_loss = epoch_loss / (batch_num + 1)
        self.epochs += 1
        print('[TRAIN]  Epoch [%d/%d]   Loss: %.4f'
                      % (self.epochs + 1, self.max_epochs, epoch_loss))
        self.train_losses.append(epoch_loss)

    def train_batch(self, inputs, targets):
        """ 
            TODO: Define code for training a single batch of inputs
        
        """
        raise NotImplemented

    
    def test(self):
        # don't change these
        self.model.eval() # set to eval mode
        predictions = TestLanguageModel.prediction(fixtures_pred['inp'], self.model) # get predictions
        self.predictions.append(predictions)
        generated_logits = TestLanguageModel.generation(fixtures_gen, 10, self.model) # generated predictions for 10 words
        generated_logits_test = TestLanguageModel.generation(fixtures_gen_test, 10, self.model)
        nll = test_prediction(predictions, fixtures_pred['out'])
        generated = test_generation(fixtures_gen, generated_logits, vocab)
        generated_test = test_generation(fixtures_gen_test, generated_logits_test, vocab)
        self.val_losses.append(nll)
        
        self.generated.append(generated)
        self.generated_test.append(generated_test)
        self.generated_logits.append(generated_logits)
        self.generated_logits_test.append(generated_logits_test)
        
        # generate predictions for test data
        predictions_test = TestLanguageModel.prediction(fixtures_pred_test['inp'], self.model) # get predictions
        self.predictions_test.append(predictions_test)
            
        print('[VAL]  Epoch [%d/%d]   Loss: %.4f'
                      % (self.epochs + 1, self.max_epochs, nll))
        return nll

    def save(self):
        # don't change these
        model_path = os.path.join('experiments', self.run_id, 'model-{}.pkl'.format(self.epochs))
        torch.save({'state_dict': self.model.state_dict()},
            model_path)
        np.save(os.path.join('experiments', self.run_id, 'predictions-{}.npy'.format(self.epochs)), self.predictions[-1])
        np.save(os.path.join('experiments', self.run_id, 'predictions-test-{}.npy'.format(self.epochs)), self.predictions_test[-1])
        np.save(os.path.join('experiments', self.run_id, 'generated_logits-{}.npy'.format(self.epochs)), self.generated_logits[-1])
        np.save(os.path.join('experiments', self.run_id, 'generated_logits-test-{}.npy'.format(self.epochs)), self.generated_logits_test[-1])
        with open(os.path.join('experiments', self.run_id, 'generated-{}.txt'.format(self.epochs)), 'w') as fw:
            fw.write(self.generated[-1])
        with open(os.path.join('experiments', self.run_id, 'generated-{}-test.txt'.format(self.epochs)), 'w') as fw:
            fw.write(self.generated_test[-1])


In [None]:
class TestLanguageModel:
    def prediction(inp, model):
        """
            TODO: write prediction code here
            
            :param inp:
            :return: a np.ndarray of logits
        """
        raise NotImplemented

        
    def generation(inp, forward, model):
        """
            TODO: write generation code here

            Generate a sequence of words given a starting sequence.
            :param inp: Initial sequence of words (batch size, length)
            :param forward: number of additional words to generate
            :return: generated words (batch size, forward)
        """        
        raise NotImplemented
        

In [None]:
# TODO: define other hyperparameters here

NUM_EPOCHS = None
BATCH_SIZE = None


In [None]:
run_id = str(int(time.time()))
if not os.path.exists('./experiments'):
    os.mkdir('./experiments')
os.mkdir('./experiments/%s' % run_id)
print("Saving models, predictions, and generated words to ./experiments/%s" % run_id)

In [None]:
model = LanguageModel(len(vocab))
loader = LanguageModelDataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)
trainer = LanguageModelTrainer(model=model, loader=loader, max_epochs=NUM_EPOCHS, run_id=run_id)

In [None]:
best_nll = 1e30 
for epoch in range(NUM_EPOCHS):
    trainer.train()
    nll = trainer.test()
    if nll < best_nll:
        best_nll = nll
        print("Saving model, predictions and generated output for epoch "+str(epoch)+" with NLL: "+ str(best_nll))
        trainer.save()
    

In [None]:
# Don't change these
# plot training curves
plt.figure()
plt.plot(range(1, trainer.epochs + 1), trainer.train_losses, label='Training losses')
plt.plot(range(1, trainer.epochs + 1), trainer.val_losses, label='Validation losses')
plt.xlabel('Epochs')
plt.ylabel('NLL')
plt.legend()
plt.show()

In [None]:
# see generated output
print (trainer.generated[-1]) # get last generated output