In [1]:
from fastai.text.learner import *
from fastai.basic_data import DataBunch
import torch
from torch import nn
import numpy as np
import pickle
from collections import Counter
import spacy,html
import re
import math
%matplotlib inline

In [2]:
# data I/O
data = open('input_bible.txt', 'r').read() # should be simple plain text file
words = Counter(data.split())
words = [k for k in words.keys()]
data_size, vocab_size = len(data), len(words)
word_to_ix  = { ch:i for i,ch in enumerate(words) }
ix_to_word = { i:ch for i,ch in enumerate(words) }

In [3]:
#tokenize the dataset
#export
#special tokens
UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ = "xxunk xxpad xxbos xxeos xxrep xxwrep xxup xxmaj".split()

def sub_n(t):
    "Replaces the \n by space"
    re_br = re.compile(r'\n', re.IGNORECASE)
    return re_br.sub(" ", t)

def spec_add_spaces(t):
    "Add spaces around / # , . ; :"
    return re.sub(r'([/#,.;:])', r' \1 ', t)

def rm_useless_spaces(t):
    "Remove multiple spaces"
    return re.sub(' {2,}', ' ', t)

def replace_rep(t):
    "Replace repetitions at the character level: cccc -> TK_REP 4 c"
    def _replace_rep(m:str):
        c,cc = m.groups()
        return f' {TK_REP} {len(cc)+1} {c} '
    re_rep = re.compile(r'(\S)(\1{3,})')
    return re_rep.sub(_replace_rep, t)

def replace_section_number(t):
    "Replace section numbers by NEWSECTION"
    return re.sub(r'(\d+:\d+)', 'NEWSECTION', t)

def sep_special(t):
    return re.sub(r'[\.,]', '  ', t)
    
def fixup_text(x):
    "Various messy things we've seen in documents"
    re1 = re.compile(r'  +')
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>',UNK).replace(' @.@ ','.').replace(
        ' @-@ ','-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))
    
default_pre_rules = [fixup_text,  replace_section_number, spec_add_spaces, replace_rep
                     , sub_n]
default_spec_tok = [UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ]


In [4]:
#process the data
def read_and_tokenize():
    data = open('input_bible.txt', 'r').read()
    data = data.lower()
    for rule in default_pre_rules:
        data = rule(data)
    data = rm_useless_spaces(data)
    return data

In [5]:
df = read_and_tokenize().split()
vocab = Counter(df)
vocab = [k for k in vocab.keys()]
word_to_ix  = {wr:i for i,wr in enumerate(vocab)}
ix_to_word = {i:wr for i,wr in enumerate(vocab)} 

In [6]:
#create a data loader
from torch.utils.data import Dataset, DataLoader
#create a custom data dataset / dataloader
class bible_dataset(Dataset):

    def __init__(self, seq_len):
        self.df = read_and_tokenize().split()
        self.vocab = Counter(self.df)
        self.vocab = [k for k in self.vocab.keys()]
        self.word_to_ix  = {wr:i for i,wr in enumerate(self.vocab)}
        self.ix_to_word = {i:wr for i,wr in enumerate(self.vocab)}        
        self.seq_len = seq_len
        
    def __len__(self):
        return len(self.vocab)

    def __getitem__(self, idx):
        return (np.array([self.word_to_ix[wrd] for wrd in self.df[idx:idx+self.seq_len]]),
                np.array([self.word_to_ix[wrd] for wrd in self.df[idx+1:idx+self.seq_len+1]]))

In [7]:
# dict_size = len(bible_dataset)
# seq_len = maxlen - 1
# batch_size = len(text)
#one hot encode over the whole vocab

def one_hot_encode(sequence, dict_size, seq_len, batch_size):
    # Creating a multi-dimensional array of zeros with the desired output shape
    features = np.zeros((batch_size, seq_len, dict_size), dtype=np.float32)
    
    # Replacing the 0 at the relevant character index with a 1 to represent that character
    for i in range(batch_size):
        for u in range(seq_len):
            features[i, u, sequence[i][u]] = 1
    return features

In [85]:
#define a vanilla rnn 

class lstm_rnn_model(nn.Module):
    def __init__(self, input_size, emb_size, 
                 hidden_dim, layers):
        super(lstm_rnn_model, self).__init__()
        
        self.size = input_size
        self.hidden_dim = hidden_dim
        self.layers = layers
        self.drop_out = nn.Dropout(0.5)
        
        self.encoder = nn.Embedding(emb_size, input_size)
        self.LSTM = nn.RNN(input_size, hidden_size=hidden_dim, num_layers = layers,
                      batch_first=True)
        #a fully connected layer (decoder)
        self.decoder = nn.Linear(hidden_dim, emb_size)
    
    #initialise the hidden states before training
    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.layers, batch_size, 
                            self.hidden_dim)
        return hidden
    
    #forward pass
    def forward(self, x):
#         print(x)
        x_emb = self.drop_out(x)
        batch_size = x_emb.shape[0]
        hidden = self.init_hidden(batch_size)
#         print(hidden.shape)
        out, hidden = self.LSTM(x_emb, hidden)
#         print(hidden.shape)
        out = self.drop_out(out)
        out = out.view(-1, self.hidden_dim)
#         print(out.shape)
#         print(out)
        out = self.decoder(out)
#         print(out.shape)
        return out, hidden


# class lstm_rnn(nn.Module):
#     """
#     Container module with an encoder, a recurrent module, and a decoder.
#     PyTorch provides the facility to write custom models
#     """

#     def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
#         super(lstm_rnn, self).__init__()
#         self.drop = nn.Dropout(dropout)
#         self.encoder = nn.Embedding(ntoken, ninp)

#         self.LSTM = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
#         self.decoder = nn.Linear(nhid, ntoken)

#         self.init_weights()

#         self.nhid = nhid
#         self.nlayers = nlayers

#     def init_weights(self):
#         initrange = 0.1
#         self.encoder.weight.data.uniform_(-initrange, initrange)
#         self.decoder.bias.data.zero_()
#         self.decoder.weight.data.uniform_(-initrange, initrange)

#     def forward(self, input, hidden):
#         emb = self.drop(self.encoder(input))
#         output, hidden = self.LSTM(emb, hidden)
#         output = self.drop(output)
#         decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
#         return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

#     def init_hidden(self, bsz):
#         weight = next(self.parameters())

#         return (weight.new_zeros(self.nlayers, bsz, self.nhid),
#                 weight.new_zeros(self.nlayers, bsz, self.nhid))

In [86]:

from torch.optim.lr_scheduler import _LRScheduler
class CyclicLR(_LRScheduler):
    def __init__(self, optimizer, schedule, last_epoch=-1):
        assert callable(schedule)
        self.schedule = schedule
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        return [self.schedule(self.last_epoch, lr) for lr in self.base_lrs]
    
def cosine(t_max, eta_min=0):  
    def scheduler(epoch, base_lr):
        t = epoch % t_max
        return eta_min + (base_lr - eta_min)*(1 + math.cos(math.pi*t/t_max))/2
    
    return scheduler


In [87]:
seq_len = 10

bible_df = DataLoader(bible_dataset(seq_len), 1)
# bible_df

lstm_rnn = lstm_rnn_model(input_size=vocab_size,
                    emb_size=vocab_size,
                    hidden_dim=100,
                    layers=1)

# device = torch.device("cuda:0")
# lstm_rnn.cuda()

n_epochs = 1
lr=1e-2
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm_rnn.parameters(), lr=lr)

iterations_per_epoch = len(bible_df)
scheduler = CyclicLR(optimizer, cosine(t_max=iterations_per_epoch * 2, eta_min=lr/10))

loss_values = []
#training loop
for epoch in range(1,2):
    loss= 0.
    for local_batch in enumerate(bible_df, 0):
        input_seq = local_batch[1][0]
        target_seq = local_batch[1][1]
        optimizer.zero_grad()
        output, hidden = lstm_rnn(torch.from_numpy(one_hot_encode(input_seq, vocab_size, seq_len, 1)))
#         a = lstm_rnn.state_dict()['rnn.weight_ih_l0'][0,input_seq.unsqueeze(-1)].squeeze(-1).clone()
        loss = loss_func(input=output, target=target_seq.transpose(0,1).long().squeeze(1))
        loss.backward()
        optimizer.step()
        scheduler.step()
#         b = lstm_rnn.state_dict()['rnn.weight_ih_l0'][0,input_seq.unsqueeze(-1)].squeeze(-1).clone()
        #check if it's trained. if it hasn't then stop the training
#         assert torch.equal(b.data, a.data)==False
        loss_values.append(loss.data.item())                        
        if (len(loss_values) % 100 == 0) | (len(loss_values) == 1):
            print(loss.data.item())
            loss_values.append(loss.data.item())

10.33159351348877
6.998532295227051
9.577683448791504
5.406111240386963
7.468782901763916
10.63502025604248
7.044700622558594
9.3641357421875
8.263531684875488
4.66226053237915
4.886398792266846
7.212177276611328
7.6011810302734375
7.0822319984436035
6.845032691955566
9.466495513916016
6.56728982925415
9.399690628051758
7.765040397644043
9.912711143493652
8.16933536529541
7.688700199127197
7.3140549659729
6.381771087646484
7.071802616119385
7.208020210266113
5.879783630371094
7.392008304595947
6.101510047912598
6.453305244445801


KeyboardInterrupt: 

In [79]:
output[0]

tensor([-0.3565, -0.0479,  0.4708,  0.2986, -0.1556, -0.2139,  0.6235, -0.0968,
        -0.2618, -0.3934], grad_fn=<SelectBackward>)

In [83]:
loss_func(input=output, target=target_seq.transpose(0,1).long().squeeze(1))

RuntimeError: Assertion `cur_target >= 0 && cur_target < n_classes' failed.  at C:\Users\builder\AppData\Local\Temp\pip-req-build-9msmi1s9\aten\src\THNN/generic/ClassNLLCriterion.c:97

In [None]:
#define a vanilla rnn 

class vanilla_rnn(nn.Module):
    def __init__(self, input_size, output_size, 
                 hidden_dim, layers):
        super(vanilla_rnn, self).__init__()
        
        self.size = input_size
        self.hidden_dim = hidden_dim
        self.layers = layers
        self.dropout = nn.Dropout(0.5)
        self.rnn = nn.RNN(input_size, hidden_size=hidden_dim, num_layers = layers,
                      batch_first=True)
        #a fully connectd layer
        self.fc = nn.Linear(hidden_dim, output_size)
    
    #initialise the hidden states before training
    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.layers, batch_size, 
                            self.hidden_dim)
        return hidden
    
    #forward pass
    def forward(self, x):
#         print(x)
        batch_size = x.shape[0]
        hidden = self.init_hidden(batch_size)
        out, hidden = self.rnn(self.dropout(x), hidden)
        out = out.view(-1, self.hidden_dim)
        out = self.fc(out)
        return out, hidden
        

In [36]:
seq_len = 10

bible_df = DataLoader(bible_dataset(seq_len), 1)
# bible_df

v_rnn = vanilla_rnn(input_size=vocab_size,
                    output_size=50,
                    hidden_dim=12,
                    layers=1)
n_epochs = 1
lr=1e-2
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(v_rnn.parameters(), lr=lr)

iterations_per_epoch = len(bible_df)
scheduler = CyclicLR(optimizer, cosine(t_max=iterations_per_epoch * 2, eta_min=lr/10))

loss_values = []
#training loop
for epoch in range(1,2):
    loss= 0.
    for local_batch in enumerate(bible_df, 0):
        input_seq = local_batch[1][0]
        target_seq = local_batch[1][1]
        optimizer.zero_grad()
        output, hidden = v_rnn(torch.from_numpy(one_hot_encode(input_seq, vocab_size, 10, 1)))
        loss = loss_func(input=output, target=target_seq.transpose(0,1).long().squeeze(1))
        loss.backward()
        optimizer.step()
        scheduler.step()
        loss_values.append(loss.data.item())                        
        if (len(loss_values) % 100 == 0) | (len(loss_values) == 1):
            print(loss.data.item())
            loss_values.append(loss.data.item())

3.9069831371307373
3.679192304611206


RuntimeError: Assertion `cur_target >= 0 && cur_target < n_classes' failed.  at C:\Users\builder\AppData\Local\Temp\pip-req-build-9msmi1s9\aten\src\THNN/generic/ClassNLLCriterion.c:97