In [None]:
from local.imports import *
from local.test import *
from local.core import *
from local.layers import *
from local.data.all import *
from local.notebook.showdoc import show_doc
from local.optimizer import *
from local.learner import *
from local.metrics import *
from local.text.data import *
from local.text.models.core import *
from local.text.models.awdlstm import *
from local.callback.rnn import *
from local.callback.all import *

# Integration test on Wikitext-2

> Training a Language Model on WT2

In [None]:
# Resolve the local location of the WIKITEXT_TINY data; `path` is joined with
# 'train.txt' and 'valid.txt' further down.
# NOTE(review): untar_data presumably downloads and extracts the archive on first use — confirm.
path = untar_data(URLs.WIKITEXT_TINY)

The dataset comes with all the articles concatenated. We split them to be able to shuffle at the beginning of each epoch.

In [None]:
def istitle(line):
    "Whether `line` is a top-level heading of the form ' = Title = ' (no '=' allowed inside the title)."
    # `$` also matches just before a trailing newline, so lines from `readlines()` work as-is.
    return re.search(r'^ = [^=]* = $', line) is not None

def read_file(filename):
    "Read `filename` (utf8) and return its articles, each one as the list obtained by splitting its text on ' '."
    articles = L()
    with open(filename, encoding='utf8') as f:
        lines = f.readlines()
    start = 0  # index of the first line of the article currently being collected
    # An article ends on line `idx` when the next line is ' \n' and the one after it is a title.
    # Only indices with two following lines can be a boundary, hence `len(lines) - 2`.
    for idx in range(len(lines) - 2):
        if lines[idx+1] != ' \n' or not istitle(lines[idx+2]):
            continue
        articles.append(''.join(lines[start:idx+1]).split(' '))
        start = idx + 1
    # Everything after the last boundary (possibly the whole file) forms the final article.
    articles.append(''.join(lines[start:]).split(' '))
    return articles

Then we put our list of tokenized texts together in an `LM_Dataset`. It will return tuples of sequences of length `seq_len`, with the second sequence being the first one shifted by one to the right.

In [None]:
# Batch size and sequence length, shared by both splits.
bs = 104
sl = 72

# Only the training set is built with shuffle=True; the validation set uses LM_Dataset's default.
train = LM_Dataset(
    read_file(path/'train.txt'),
    bs=bs, seq_len=sl, shuffle=True,
)
valid = LM_Dataset(
    read_file(path/'valid.txt'),
    bs=bs, seq_len=sl,
)

In [None]:
# Sanity check: print the first item of the training dataset.
print(train[0])

We can then wrap our `LM_Dataset`s in a `TfmdList` to apply the `Numericalize` transform. We can't use a `TfmdDS` because our elements are already tuples and `TfmdDS` is there to create such tuples from individual items. Since we already have tuples, we specify `as_item=False`.

In [None]:
# Tally every element found by iterating the items of `train.ds`, then build the vocab from the counts.
# NOTE(review): `train.ds` is presumably the list of tokenized articles given to LM_Dataset — confirm.
count = Counter(tok for text in train.ds for tok in text)
vocab = make_vocab(count)

# Each split gets its own Numericalize transform, both built from the training vocab.
train_ds = TfmdList(
    train,
    tfms=Numericalize(vocab),
    as_item=False,
    wrap_l=False,
)
valid_ds = TfmdList(
    valid,
    tfms=Numericalize(vocab),
    as_item=False,
    wrap_l=False,
)

Last but not least, we need to use a special sampler that will make sure we ask for the correct sequences to form a batch: in the first batch we don't want the sequences 0,1,2,3... (they are contiguous in the source obtained by concatenating all texts) but the sequences 0,`num_batches`,`2*num_batches`,...

In [None]:
# One loader per split; each uses an LM_Sampler built from the matching LM_Dataset
# (not from the TfmdList) and applies the Cuda() transform.
# NOTE(review): Cuda() presumably moves batches to the GPU, and num_workers=0 presumably
# keeps loading in the main process — confirm against TfmdDL.
train_dl = TfmdDL(train_ds, bs=bs, sampler=LM_Sampler(train), tfms=Cuda(), num_workers=0)
valid_dl = TfmdDL(valid_ds, bs=bs, sampler=LM_Sampler(valid), tfms=Cuda(), num_workers=0)

In [None]:
# Bundle the training and validation loaders, then display a sample batch.
dbch = DataBunch(train_dl, valid_dl)
dbch.show_batch()

In [None]:
# Work on a copy so the shared `awd_lstm_lm_config` defaults are left untouched.
config = awd_lstm_lm_config.copy()
# NOTE(review): the `*_p` keys are presumably dropout probabilities — confirm against awd_lstm_lm_config.
config.update({
    'input_p': 0.6,
    'output_p': 0.4,
    'weight_p': 0.5,
    'embed_p': 0.1,
    'hidden_p': 0.2,
})
# The second argument is the vocabulary size.
model = get_language_model(AWD_LSTM, len(vocab), config=config)

In [None]:
# Optimizer factory: Adam with wd=0.1 and eps=1e-7 pre-bound.
opt_func = partial(Adam, wd=0.1, eps=1e-7)
# Callback factories: MixedPrecision with clip=0.1 and RNNTrainer with alpha=3, beta=2.
# NOTE(review): alpha/beta are presumably the activation-regularization weights — confirm in RNNTrainer.
cb_funcs = [partial(MixedPrecision, clip=0.1), partial(RNNTrainer, alpha=3, beta=2)]

In [None]:
# Assemble the Learner from the model, data, optimizer factory and callback factories built above;
# accuracy and Perplexity() are passed as metrics.
learn = Learner(model, dbch, loss_func=CrossEntropyLossFlat(), opt_func=opt_func, cb_funcs=cb_funcs, metrics=[accuracy, Perplexity()])

In [None]:
# Run the one-cycle fit under IPython's %prun profiler.
# NOTE(review): the positional arguments are presumably (number of epochs, max learning rate) — confirm.
%prun learn.fit_one_cycle(1, 5e-3, moms=(0.8,0.7,0.8), div=10)