In [1]:
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import re

from collections import Counter
from torch.utils.data import DataLoader
from collections import Counter
from datetime import datetime

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [23]:
class Dataset(torch.utils.data.Dataset):
    def __init__(
        self,
        seq_length,
        train_path,
        mode='word',
        train=True,
        train_split=0.8
    ):
        self.seq_length = seq_length
        self.mode = mode
        self.train_split = train_split
        self.train_path = train_path
        self.all_data, self.train_data, self.eval_data = self._read_data()
        
        self.unique_data = self._find_unique()

        self.idx_data = {idx: data for idx, data in enumerate(self.unique_data)}
        self.data_idx = {data: idx for idx, data in enumerate(self.unique_data)}
        
        self.data = self.train_data if train else self.eval_data

        self.indexed_data = np.array([self.data_idx[i] for i in self.data])
        
        self.indexed_data = torch.from_numpy(self.indexed_data)

    def _read_data(self):
        text = open(self.train_path, 'rb').read().decode(encoding='utf-8')
        data = pd.Series(list(re.sub("[" + '\r\ufeff' + "]", '', text))) if self.mode =='char' \
                    else pd.Series(re.findall(r"[\w']+|[.,!?;]", text))
        return data, data[:int(len(data) * self.train_split)], \
                            data[int(len(data) * (1 - self.train_split)):]

    def _find_unique(self):
        data_count = Counter(self.all_data)
        return sorted(data_count, key=data_count.get, reverse=True)

    def __len__(self):
        return (len(self.indexed_data) - self.seq_length)

    def __getitem__(self, idx):
        return (
            self.indexed_data[idx:idx+self.seq_length],
            self.indexed_data[idx+1:idx+self.seq_length+1],
        )

In [None]:
train_dataset = Dataset(20, 'language_modelling/data/penn/train.txt', 
                  'language_modelling/data/penn/valid.txt', mode='word', train=True)
trainloader = DataLoader(train_dataset, batch_size=None)

In [None]:
eval_dataset = Dataset(20, 'language_modelling/data/penn/train.txt', 
                  'language_modelling/data/penn/valid.txt', mode='word', train=False)
evalloader = DataLoader(eval_dataset, batch_size=None)

In [24]:
train_dataset = Dataset(20, 'language_modelling/data/alice_wonderland.txt', mode='word', train=True)
trainloader = DataLoader(train_dataset, batch_size=20)
eval_dataset = Dataset(20, 'language_modelling/data/alice_wonderland.txt', mode='word', train=False)
evalloader = DataLoader(eval_dataset, batch_size=20)

In [None]:
class RNNModel(nn.Module):

    def __init__(self, ntoken, embed_size=128, hidden_size=128,
                 hidden_layers=3, seq_length=20, dropout=0.2):
        super(RNNModel, self).__init__()
        
        self.hidden_layers = hidden_layers
        self.ntoken = ntoken
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.seq_length = seq_length
        self.dropout = dropout
        
        self.embed = nn.Embedding(
            num_embeddings=self.ntoken,
            embedding_dim=self.embed_size,
        )
        self.rnn = nn.RNN(
            input_size=self.embed_size,
            hidden_size=self.hidden_size,
            num_layers=self.hidden_layers,
            dropout=self.dropout
        )
        self.linear = nn.Linear(self.hidden_size, ntoken)

    def forward(self, x, state_h):
        embed = self.embed(x)
        output, state = self.rnn(embed, state_h)
        logits = self.linear(output)
        return logits, (state)
        
        return out
    
    def initialize(self, seq_length=None):
        if seq_length is None:
            seq_length = self.seq_length
        return torch.zeros(self.hidden_layers, seq_length, self.hidden_size)

In [4]:
class LSTMModel(nn.Module):

    def __init__(self, ntoken, embed_size=128, hidden_size=128,
                 hidden_layers=3, seq_length=20, dropout=0.2):
        super(LSTMModel, self).__init__()
        
        self.hidden_layers = hidden_layers
        self.ntoken = ntoken
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.seq_length = seq_length
        self.dropout = dropout
        
        self.embed = nn.Embedding(
            num_embeddings=self.ntoken,
            embedding_dim=self.embed_size,
        )
        self.rnn = nn.LSTM(
            input_size=self.embed_size,
            hidden_size=self.hidden_size,
            num_layers=self.hidden_layers,
            dropout=dropout
        )
        self.linear = nn.Linear(self.hidden_size, ntoken)

    def forward(self, x, state):
        embed = self.embed(x)
        output, state = self.rnn(embed, state)
        logits = self.linear(output)
        return logits, state
        
        return out
    
    def initialize(self, seq_length=None):
        if seq_length is None:
            seq_length = self.seq_length
        return (torch.zeros(self.hidden_layers, seq_length, self.hidden_size),
                torch.zeros(self.hidden_layers, seq_length, self.hidden_size))

In [5]:
def train_one_epoch(model, criterion, optimizer, dataloader,
                    log=True, log_interval=200):
    model.train()
    state = model.initialize()
    total_loss = 0
    for batch, (x, y) in enumerate(dataloader):
        optimizer.zero_grad()
        y_pred, state = model(x, state)
        loss = criterion(y_pred.transpose(-2, -1), y)
        total_loss += loss.item()

        
        state = tuple([s.detach() for s in state]) if type(state) is tuple else state.detach()

        loss.backward()
        optimizer.step()
        
        if batch % log_interval == 0 and log:
            print({'Batch {} / {}, loss: {}'.format(batch, len(dataloader),
                                                    total_loss / (batch + 1)) })
    return total_loss / (len(dataloader))

In [6]:
def evaluate(model, criterion, dataloader, log=True):
    model.eval()
    state = model.initialize()
    total_loss = 0
    for batch, (x, y) in enumerate(dataloader):
        with torch.no_grad():
            y_pred, state = model(x, state)
        loss = criterion(y_pred.transpose(-2, -1), y)
        total_loss += loss.item()
    if log:
        print({'Evaluation loss': total_loss / (len(dataloader)) })
    return total_loss / (len(dataloader))

In [7]:
def train(trainloader, evalloader, model, optimizer, 
          criterion, nepoch, log_interval=1, eval_during_train=True, 
          eval_interval=1, save_interval=1, model_name='model'):
    for ep in range(nepoch):
        train_loss = train_one_epoch(model, criterion, 
                                     optimizer, trainloader)
        if ep % log_interval == 0:
            print({'Epoch': ep, 'loss': train_loss})
        if eval_during_train and ep % eval_interval == 0:
            eval_loss = evaluate(model, criterion, evalloader)
        if ep % save_interval == 0:
            torch.save(model.state_dict(), 'language_modelling/models/'
                       + model_name + '.p')
    

In [8]:
def generate(model, data, data_idx_dict, 
             idx_data_dict, len_hist=50, len_gen=50):
    model.eval()
    state = model.initialize(len_hist)
    for i in range(len_gen):
        x = torch.tensor([[data_idx_dict[d] for d in data[-len_hist-1:-1]]])
        with torch.no_grad():
            y_pred, state = model(x, state)
        last_logits = y_pred[0][-1]
        prob = nn.functional.softmax(last_logits, dim=0).detach().numpy()
        idx = np.random.choice(len(last_logits), p=prob)
        data.append(idx_data_dict[idx])
    return ' '.join(data[-(len_hist+len_gen):])

In [None]:
nepoch = 1000
lr = 0.001

ntoken = len(train_dataset.unique_data)

model = LSTMModel(ntoken)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
train(trainloader, evalloader, model, optimizer, criterion, nepoch)

{'Batch 0 / 1415, loss: 8.175182342529297'}
{'Batch 200 / 1415, loss: 6.495981456035405'}
{'Batch 400 / 1415, loss: 6.292173188226181'}
{'Batch 600 / 1415, loss: 6.191937619556802'}
{'Batch 800 / 1415, loss: 6.1231154722816195'}
{'Batch 1000 / 1415, loss: 6.065866774731464'}
{'Batch 1200 / 1415, loss: 5.9941749658116095'}
{'Batch 1400 / 1415, loss: 5.9446944896022735'}
{'Epoch': 0, 'loss': 5.940430346478843}
{'Evaluation loss': 5.722703741969995}
{'Batch 0 / 1415, loss: 6.667312145233154'}
{'Batch 200 / 1415, loss: 5.525613713620314'}
{'Batch 400 / 1415, loss: 5.390993892403315'}
{'Batch 600 / 1415, loss: 5.331671307765307'}
{'Batch 800 / 1415, loss: 5.271353118875053'}
{'Batch 1000 / 1415, loss: 5.2132663974514255'}
{'Batch 1200 / 1415, loss: 5.163024153141654'}
{'Batch 1400 / 1415, loss: 5.126555526367176'}
{'Epoch': 1, 'loss': 5.123882277020296}
{'Evaluation loss': 5.352085865681247}
{'Batch 0 / 1415, loss: 6.13288688659668'}
{'Batch 200 / 1415, loss: 5.057281788308822'}
{'Batch 400

In [None]:
print(train_dataset.indexed_data.shape)

In [14]:
evaluate(model, criterion, evalloader)

{'Evaluation loss': 1.5122261361696234}


1.5122261361696234

In [21]:
generate(model, list(train_dataset.data.values[:300]), train_dataset.data_idx, train_dataset.idx_data, len_hist=100, len_gen=200)

"h t   b e a r   h i s   m e m o r y : \n     B u t   t h o u ,   c o n t r a c t e d   t o   t h i n e   o w n   b r i g h t   e y e s , \n     F e e d ' s t   t h y   l i g h t ' s   f l a m e   w i t f   o e f   \n o   u v p e r n \n e     A   n d a o y k , g   e l i t d h   e o a r t s h     v t e a n l s e \n d     m   e a n r t u ' a n d d   ' e g t r i e a   t v h e e r , n \n       S   o n f   u n n h e i r l s y     n n e o i r d ,     y c o e u d t   h w e a n r t e   d d i i n n   e d n d o \n ,   - R F u e r v   e e n v d e e r t , -   - a t n   e f v v e o r t     I n   f I r l ' e d \n ."