In [None]:
from local.torch_basics import *
from local.test import *
from local.core import *
from local.layers import *
from local.data.all import *
from local.notebook.showdoc import show_doc
from local.optimizer import *
from local.learner import *
from local.metrics import *
from local.text.core import *
from local.text.data import *
from local.text.models.core import *
from local.text.models.awdlstm import *
from local.callback.rnn import *
from local.callback.all import *

# Integration test on Wikitext-2

> Training a Language Model on WT2

## Data

In [None]:
# Fetch the dataset identified by `URLs.WIKITEXT_TINY` (presumably downloaded and
# extracted by `untar_data` -- confirm). `path` is the folder the cells below read
# `train.txt`, `valid.txt` and `test.txt` from.
path = untar_data(URLs.WIKITEXT_TINY)

The dataset comes with all the articles concatenated. We split them to be able to shuffle at the beginning of each epoch.

In [None]:
def istitle(line):
    "Return `True` if `line` has the form ` = Title = ` where the title part contains no `=`."
    # The pattern is anchored at both ends, so a single `search` is enough.
    return re.search(r'^ = [^=]* = $', line) is not None

def read_file(filename):
    "Read `filename` and return an `L` of articles, each one a list of the space-separated tokens of its text."
    with open(filename, encoding='utf8') as f:
        lines = f.readlines()
    n_lines = len(lines)
    articles, pending = L(), []
    for idx, line in enumerate(lines):
        # Map the raw `<unk>` marker to the library's `UNK` token.
        pending.append(line.replace('<unk>', UNK))
        # An article ends when the next line is the blank ' \n' line and the one after it is a title.
        at_boundary = idx < n_lines-2 and lines[idx+1] == ' \n' and istitle(lines[idx+2])
        if at_boundary:
            articles.append(''.join(pending).split(' '))
            pending = []
    # Whatever is left after the last boundary forms the final article.
    articles.append(''.join(pending).split(' '))
    return articles

Then we put our list of tokenized texts together in an `LMDataLoader`. It will return tuples of sequences of length `seq_len`, with the second sequence being the first one shifted by one to the right.

In [None]:
# Load every split as a list of articles, in train / valid / test order.
trn_txt, val_txt, tst_txt = [read_file(path/fname) for fname in ('train.txt', 'valid.txt', 'test.txt')]

In [None]:
# Stack the three splits into a single array, validation first: rows
# [0, len(val_txt)) are the validation articles, the rest are train then test.
all_texts = np.concatenate([val_txt, trn_txt, tst_txt])
# One row per article; the `texts` column holds that article's list of tokens.
df = pd.DataFrame({'texts':all_texts})
df.head()

Unnamed: 0,texts
0,"[, \n, =, Homarus, gammarus, =, \n, \n, Homarus, gammarus, ,, known, as, the, European, lobster, or, common, lobster, ,, is, a, species, of, xxunk, lobster, from, the, eastern, Atlantic, Ocean, ,, Mediterranean, Sea, and, parts, of, the, Black, Sea, ., It, is, closely, related, to, the, American, lobster, ,, H., americanus, ., It, may, grow, to, a, length, of, 60, cm, (, 24, in, ), and, a, mass, of, 6, kilograms, (, 13, lb, ), ,, and, bears, a, conspicuous, pair, of, claws, ., In, life, ,, the, lobsters, are, blue, ,, only, becoming, "", lobster, red, "", on, ...]"
1,"[, \n, =, Frank, xxunk, =, \n, \n, Air, Vice, Marshal, Frank, xxunk, ,, CB, ,, CBE, (, 15, July, 1914, –, 23, December, 1976, ), was, a, senior, commander, in, the, Royal, Australian, Air, Force, (, RAAF, ), ., Born, and, educated, in, Tasmania, ,, he, joined, the, RAAF, as, an, air, cadet, in, January, 1934, ., He, specialised, in, flying, instruction, and, navigation, before, the, outbreak, of, World, War, II, ., In, April, 1941, ,, he, became, commanding, officer, of, No., 2, Squadron, ,, which, operated, Lockheed, xxunk, ., The, squadron, was, deployed, to, Dutch, Timor, in, December, ..."
2,"[, \n, =, M, @-@, 82, (, Michigan, highway, ), =, \n, \n, M, @-@, 82, is, a, state, trunkline, in, the, Lower, Peninsula, in, the, US, state, of, Michigan, that, travels, between, xxunk, and, Howard, City, ., The, section, between, xxunk, and, Howard, City, travels, through, xxunk, and, along, the, southern, edge, of, xxunk, National, Forest, ., The, current, version, of, M, @-@, 82, is, actually, the, second, in, the, state, ;, the, first, usage, appeared, in, the, Upper, Peninsula, by, 1919, ., The, Lower, Peninsula, routing, has, been, in, use, since, the, 1920s, ., Various, extensions,..."
3,"[, \n, =, xxunk, xxunk, =, \n, \n, xxunk, xxunk, (, xxunk, xxunk, ,, xxunk, xxunk, ), is, a, fictional, character, in, the, xxunk, manga, and, anime, series, created, by, xxunk, xxunk, ., In, the, anime, and, manga, ,, xxunk, is, a, ninja, affiliated, with, the, village, of, xxunk, ., He, is, a, member, of, Team, 10, ,, a, group, of, ninja, consisting, of, himself, ,, xxunk, xxunk, ,, xxunk, xxunk, ,, and, team, leader, xxunk, xxunk, ., xxunk, is, portrayed, as, a, lazy, character, ,, unwilling, to, apply, his, prodigious, intelligence, ;, xxunk, has, noted, that, he, likes, xxunk, ...]"
4,"[, \n, =, Meridian, ,, Mississippi, =, \n, \n, Meridian, is, the, sixth, largest, city, in, the, state, of, Mississippi, ,, in, the, United, States, ., It, is, the, county, seat, of, Lauderdale, County, and, the, principal, city, of, the, Meridian, ,, Mississippi, xxunk, Statistical, Area, ., Along, major, highways, ,, the, city, is, 93, mi, (, 150, km, ), east, of, Jackson, ,, Mississippi, ;, 154, mi, (, xxunk, km, ), west, of, Birmingham, ,, Alabama, ;, 202, mi, (, 325, km, ), northeast, of, New, Orleans, ,, Louisiana, ;, and, 231, mi, (, 372, km, ), southeast, of, ...]"


In [None]:
#df_tok,count = tokenize_df(df, ['texts'])

In [None]:
# Token frequencies over every article in `df`.
count = Counter(tok for article in df["texts"].values for tok in article)
# Vocabulary derived from those frequencies by `make_vocab`.
vocab = make_vocab(count)

In [None]:
# `df` is ordered validation-first, so the first `len(val_txt)` row indices are the
# validation set and every remaining index (train + test articles) is used for training.
# Order of `splits` is [training indices, validation indices].
splits = [list(range(len(val_txt), len(df))), list(range(len(val_txt)))]
# Numericalize with the existing `vocab` -- the same object whose length is passed to
# `get_language_model` -- rather than building a second vocabulary from `count`.
tfm = Numericalize(vocab)

In [None]:
# Wrap the tokenized articles in a `DataSource` with `tfm` as its only transform,
# partitioned by `splits`; its `train` and `valid` subsets feed the data loaders.
dsrc = DataSource(df["texts"].values, [tfm], filts=splits)

In [None]:
# bs: training batch size; sl: value passed as `seq_len` to both loaders.
bs,sl = 104,72
# Training loader: shuffled, with `Cuda()` as an after-batch transform
# (presumably moves each batch to the GPU -- confirm).
train_dl = LMDataLoader(dsrc.train, bs=bs,   seq_len=sl, after_batch=[Cuda()], shuffle=True, num_workers=8)
# Validation loader: twice the training batch size and no `shuffle` argument.
valid_dl = LMDataLoader(dsrc.valid, bs=2*bs, seq_len=sl, after_batch=[Cuda()], num_workers=8)

In [None]:
# Bundle the training and validation loaders, then display a sample batch as a sanity check.
dbch = DataBunch(train_dl, valid_dl)
dbch.show_batch()

Unnamed: 0,text
0,"\n = Dangerously in Love Tour = \n \n The Dangerously in Love Tour was the debut concert tour by American recording artist Beyoncé . Although the tour was intended to showcase songs from her debut solo album , Dangerously in Love , ( 2003 ) the set list also contained a special segment dedicated to Beyoncé 's girl group Destiny 's Child and featured songs from her 2003 film The"
1,"Dictionary states that by 1891 the term Mrs Beeton had become used as a generic name for a domestic authority . She is also considered a strong influence in the building or shaping of a middle @-@ class identity of the Victorian era . \n \n = = Biography = = \n \n \n = = = Early life , 1836 – 54 = = = \n \n Isabella Mayson was born"
2,"Jenkins was called up to the squad to replace xxunk . \n On 7 June , Aaron Jarvis was called up to the squad as injury cover for Paul James . Jarvis later became a permanent replacement after James failed to recover form his injury . \n On 11 June , xxunk Davies joined the squad as an injury replacement for Lloyd Williams . \n Following the first test , xxunk Giles"
3,". \n The Royal Oldham Hospital , at Oldham 's northern boundary with Royton , is a large NHS hospital xxunk by xxunk Acute xxunk NHS Trust . It was opened under its existing name on 1 December 1989 . xxunk known as Oldham District and General , and occupying the site of the town 's former workhouse ( named Oldham Union xxunk in 1851 ) , the hospital is notable for"
4,". On September 25 , this tropical storm made landfall near Long Beach , California , and dissipated inland . \n The tropical storm caught Southern xxunk unprepared . It brought heavy rain and flooding to the area , which killed 45 people . At sea , 48 were killed . The storm caused heavy property damage amounting to $ 2 million ( 1939 USD ) in total , mostly to crops"
5,"= = = Academic reviews = = = \n \n The Archaeology of Ritual and Magic was reviewed by John Hutchings for the Folklore journal , the published arm of The Folklore Society . He highlighted how the work would be of benefit to folklorists , by putting various charms then in museum exhibits – such as dead cats , buried shoes and witch bottles – into the wider context of ritual"
6,"Corpus Christi Bays , did not grow as rapidly as Corpus Christi , following its 1891 founding . During the 2000 census , it had 14 @,@ xxunk residents . \n For transportation on the bay , steamboats were commonplace between Corpus Christi and Ingleside during the 1930s . Native Americans used a route made up of a series of shallow xxunk beds , dubbed Reef Road . The passage , which"
7,"games . \n Rob xxunk a xxunk and columnist for ESPN.com , contradicted anyone who asserted Hamels ' skills had deteriorated , instead commenting , \n "" Last October , everybody was ready to xxunk Hamels some sort of superhero . This was largely because he went 4 – 0 during the Phillies ' championship run , but also because he went 14 – 10 with a 3 @.@ 09 ERA during"
8,"divisional headquarters starting receiving messages from the front ; just after 17 : 00 Lieutenant @-@ Colonel George xxunk on Walker 's Ridge advised he was holding his position and "" if reinforced could advance "" . At 17 : 37 Maclagen reported they were being "" heavily attacked "" , at 18 : 15 the 3rd Battalion signalled , "" 3rd Brigade being driven back "" . At 19 : 15"
9,"six hours thereafter , Abby had maximum sustained winds of only 45 mph ( 75 km / h ) . Abby re @-@ intensified at a relatively quick pace , as it was near hurricane status again when it passed just to the north of Honduras on July 14 . \n Late on July 14 , Abby had re @-@ intensified into a hurricane . A few hours later , Abby passed"


## Model

In [None]:
# Values that replace the corresponding entries of the default language-model config.
config_overrides = {'input_p': 0.6, 'output_p': 0.4, 'weight_p': 0.5, 'embed_p': 0.1, 'hidden_p': 0.2}
# Work on a copy so the library-level `awd_lstm_lm_config` is left untouched.
config = awd_lstm_lm_config.copy()
config.update(config_overrides)
# Build the AWD-LSTM language model, passing the vocabulary length and the tuned config.
model = get_language_model(AWD_LSTM, len(vocab), config=config)

In [None]:
# Callback factories handed to the learner: mixed precision (with `clip=0.1`)
# and the RNN trainer (with `alpha=2`, `beta=1`).
mixed_precision_cb = partial(MixedPrecision, clip=0.1)
rnn_trainer_cb = partial(RNNTrainer, alpha=2, beta=1)
cb_funcs = [mixed_precision_cb, rnn_trainer_cb]
# Optimizer factory: Adam with `wd=0.1` and `eps=1e-7` pre-bound.
opt_func = partial(Adam, wd=0.1, eps=1e-7)

In [None]:
# Assemble the learner from the data bunch, the model, a `CrossEntropyLossFlat` loss,
# the optimizer and callback factories, with accuracy and perplexity as metrics.
learn = Learner(dbch, model, loss_func=CrossEntropyLossFlat(), opt_func=opt_func, cb_funcs=cb_funcs, metrics=[accuracy, Perplexity()])

In [None]:
# Quick smoke run: a single one-cycle epoch with 5e-3 as the learning-rate argument
# (presumably the peak rate of the cycle -- confirm).
learn.fit_one_cycle(1, 5e-3, moms=(0.8,0.7,0.8), div=10)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,6.197361,5.156197,0.229459,173.503296,00:29


Full training

In [None]:
# Full run: 90 one-cycle epochs with the same hyper-parameters as the 1-epoch run.
# NOTE(review): the recorded output starts at a higher loss than the 1-epoch run ended
# with, so it was presumably produced on a freshly built learner -- confirm, and
# re-create `model`/`learn` before this cell if a from-scratch run is intended.
learn.fit_one_cycle(90, 5e-3, moms=(0.8,0.7,0.8), div=10)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,7.259736,6.236506,0.163128,511.069946,00:27
1,6.469081,5.837082,0.192298,342.777771,00:27
2,6.158576,5.588233,0.212249,267.262939,00:27
3,5.924496,5.389403,0.223954,219.072632,00:27
4,5.726844,5.222465,0.233149,185.390533,00:28
5,5.551997,5.089438,0.241372,162.298691,00:28
6,5.417057,4.954817,0.248025,141.856613,00:28
7,5.295718,4.852958,0.254183,128.118774,00:28
8,5.189019,4.785351,0.258528,119.743416,00:28
9,5.102731,4.730751,0.261571,113.38063,00:28
