In [None]:
from local.torch_basics import *
from local.test import *
from local.core import *
from local.layers import *
from local.data.all import *
from local.notebook.showdoc import show_doc
from local.optimizer import *
from local.learner import *
from local.metrics import *
from local.text.core import *
from local.text.data import *
from local.text.models.core import *
from local.text.models.awdlstm import *
from local.callback.rnn import *
from local.callback.all import *

# Integration test on Wikitext-2

> Training a Language Model on WT2

## Data

In [None]:
# Fetch the WikiText "tiny" dataset; `path` is the folder the cells below read
# `train.txt`, `valid.txt` and `test.txt` from.
path = untar_data(URLs.WIKITEXT_TINY)

The dataset comes with all the articles concatenated. We split them to be able to shuffle at the beginning of each epoch.

In [None]:
def istitle(line):
    "Return `True` when `line` is a top-level article title of the form ` = Title = `."
    # `[^=]*` forbids any inner `=`, so sub-section headings such as
    # ` = = Background = = ` do not count as titles.
    return re.search(r'^ = [^=]* = $', line) is not None

def read_file(filename):
    "Read the UTF-8 text file `filename` and return an `L` with one string per article."
    with open(filename, encoding='utf8') as f:
        lines = f.readlines()
    articles, pending = L(), []
    n_lines = len(lines)
    for idx, line in enumerate(lines):
        pending.append(line)
        # The current article ends here when the next line is the ' \n' separator
        # and the line after that is a title opening the following article.
        if idx < n_lines - 2 and lines[idx+1] == ' \n' and istitle(lines[idx+2]):
            articles.append(''.join(pending))
            pending = []
    # Whatever is left after the last detected boundary forms the final article.
    articles.append(''.join(pending))
    return articles

Then we put our list of tokenized texts together in an `LMDataLoader`. It will return tuples of sequences of length `seq_len`, with the second sequence being the first one shifted by one to the right.

In [None]:
# Split each of the three raw text files into its list of articles.
trn_txt, val_txt, tst_txt = [read_file(path/fname) for fname in ('train.txt', 'valid.txt', 'test.txt')]

In [None]:
# Validation articles go first, so they occupy rows 0..len(val_txt)-1 of the
# combined array; train and test articles follow in that order.
all_texts = np.concatenate([val_txt, trn_txt, tst_txt])
# One row per article, in a single 'texts' column.
df = pd.DataFrame({'texts':all_texts})
df.head()

Unnamed: 0,texts
0,"\n = Homarus gammarus = \n \n Homarus gammarus , known as the European lobster or common lobster , is a species of <unk> lobster from the eastern Atlantic Ocean , Mediterranean Sea and parts of the Black Sea . It is closely related to the American lobster , H. americanus . It may grow to a length of 60 cm ( 24 in ) and a mass of 6 kilograms ( 13 lb ) , and bears a conspicuous pair of claws . In life , the lobsters are blue , only becoming "" lobster red "" on cooking . Mating occurs in the summer , producing eggs which are carried by the females for up to a year before hatching into <unk> l..."
1,"\n = Frank <unk> = \n \n Air Vice Marshal Frank <unk> , CB , CBE ( 15 July 1914 – 23 December 1976 ) was a senior commander in the Royal Australian Air Force ( RAAF ) . Born and educated in Tasmania , he joined the RAAF as an air cadet in January 1934 . He specialised in flying instruction and navigation before the outbreak of World War II . In April 1941 , he became commanding officer of No. 2 Squadron , which operated Lockheed <unk> . The squadron was deployed to Dutch Timor in December , and saw action against Japanese forces in the South West Pacific . After returning to Australia in ..."
2,\n = M @-@ 82 ( Michigan highway ) = \n \n M @-@ 82 is a state trunkline in the Lower Peninsula in the US state of Michigan that travels between <unk> and Howard City . The section between <unk> and Howard City travels through <unk> and along the southern edge of <unk> National Forest . The current version of M @-@ 82 is actually the second in the state ; the first usage appeared in the Upper Peninsula by 1919 . The Lower Peninsula routing has been in use since the 1920s . Various extensions and <unk> have shifted the terminus as far west as New Era or <unk> in the past . The current rout...
3,"\n = <unk> <unk> = \n \n <unk> <unk> ( <unk> <unk> , <unk> <unk> ) is a fictional character in the <unk> manga and anime series created by <unk> <unk> . In the anime and manga , <unk> is a ninja affiliated with the village of <unk> . He is a member of Team 10 , a group of ninja consisting of himself , <unk> <unk> , <unk> <unk> , and team leader <unk> <unk> . <unk> is portrayed as a lazy character , unwilling to apply his prodigious intelligence ; <unk> has noted that he likes <unk> due to his <unk> nature . Outside of the <unk> anime and manga , <unk> has appeared in four of the feature f..."
4,"\n = Meridian , Mississippi = \n \n Meridian is the sixth largest city in the state of Mississippi , in the United States . It is the county seat of Lauderdale County and the principal city of the Meridian , Mississippi <unk> Statistical Area . Along major highways , the city is 93 mi ( 150 km ) east of Jackson , Mississippi ; 154 mi ( <unk> km ) west of Birmingham , Alabama ; 202 mi ( 325 km ) northeast of New Orleans , Louisiana ; and 231 mi ( 372 km ) southeast of Memphis , Tennessee . \n Established in 1860 , at the intersection of the Mobile and Ohio Railroad and Southern Railway of ..."


In [None]:
# Tokenize the 'texts' column. Returns the tokenized frame plus `count`, which
# is passed to `make_vocab` further down (presumably token frequencies — confirm).
df_tok,count = tokenize_df(df, ['texts'])

In [None]:
# Preview the first rows of the tokenized frame.
df_tok.head()

Unnamed: 0,text,text_lengths
0,"[xxbos, =, xxmaj, homarus, gammarus, =, \n▁\n▁, xxmaj, homarus, gammarus, ,, known, as, the, xxmaj, european, lobster, or, common, lobster, ,, is, a, species, of, xxunk, lobster, from, the, eastern, xxmaj, atlantic, xxmaj, ocean, ,, xxmaj, mediterranean, xxmaj, sea, and, parts, of, the, xxmaj, black, xxmaj, sea, ., xxmaj, it, is, closely, related, to, the, xxmaj, american, lobster, ,, xxup, h., americanus, ., xxmaj, it, may, grow, to, a, length, of, 60, cm, (, 24, in, ), and, a, mass, of, 6, kilograms, (, 13, lb, ), ,, and, bears, a, conspicuous, pair, of, claws, ., xxmaj, in, life, ,, ...]",1866
1,"[xxbos, =, xxmaj, frank, xxunk, =, \n▁\n▁, xxmaj, air, xxmaj, vice, xxmaj, marshal, xxmaj, frank, xxunk, ,, xxup, cb, ,, xxup, cbe, (, 15, xxmaj, july, 1914, –, 23, xxmaj, december, 1976, ), was, a, senior, commander, in, the, xxmaj, royal, xxmaj, australian, xxmaj, air, xxmaj, force, (, xxup, raaf, ), ., xxmaj, born, and, educated, in, xxmaj, tasmania, ,, he, joined, the, xxup, raaf, as, an, air, cadet, in, xxmaj, january, 1934, ., xxmaj, he, specialised, in, flying, instruction, and, navigation, before, the, outbreak, of, xxmaj, world, xxmaj, war, xxup, ii, ., xxmaj, in, xxmaj, april, 19...",3339
2,"[xxbos, =, xxup, m-82, (, xxmaj, michigan, highway, ), =, \n▁\n▁, xxup, m-82, is, a, state, trunkline, in, the, xxmaj, lower, xxmaj, peninsula, in, the, xxup, us, state, of, xxmaj, michigan, that, travels, between, xxunk, and, xxmaj, howard, xxmaj, city, ., xxmaj, the, section, between, xxunk, and, xxmaj, howard, xxmaj, city, travels, through, xxunk, and, along, the, southern, edge, of, xxunk, xxmaj, national, xxmaj, forest, ., xxmaj, the, current, version, of, xxup, m-82, is, actually, the, second, in, the, state, ;, the, first, usage, appeared, in, the, xxmaj, upper, xxmaj, peninsula, by...",1024
3,"[xxbos, =, xxunk, xxunk, =, \n▁\n▁, xxunk, xxunk, (, xxunk, xxunk, ,, xxunk, xxunk, ), is, a, fictional, character, in, the, xxunk, manga, and, anime, series, created, by, xxunk, xxunk, ., xxmaj, in, the, anime, and, manga, ,, xxunk, is, a, ninja, affiliated, with, the, village, of, xxunk, ., xxmaj, he, is, a, member, of, xxmaj, team, 10, ,, a, group, of, ninja, consisting, of, himself, ,, xxunk, xxunk, ,, xxunk, xxunk, ,, and, team, leader, xxunk, xxunk, ., xxunk, is, portrayed, as, a, lazy, character, ,, unwilling, to, apply, his, prodigious, intelligence, ;, xxunk, has, noted, that, he,...",1780
4,"[xxbos, =, xxmaj, meridian, ,, xxmaj, mississippi, =, \n▁\n▁, xxmaj, meridian, is, the, sixth, largest, city, in, the, state, of, xxmaj, mississippi, ,, in, the, xxmaj, united, xxmaj, states, ., xxmaj, it, is, the, county, seat, of, xxmaj, lauderdale, xxmaj, county, and, the, principal, city, of, the, xxmaj, meridian, ,, xxmaj, mississippi, xxunk, xxmaj, statistical, xxmaj, area, ., xxmaj, along, major, highways, ,, the, city, is, 93, mi, (, 150, km, ), east, of, xxmaj, jackson, ,, xxmaj, mississippi, ;, 154, mi, (, xxunk, km, ), west, of, xxmaj, birmingham, ,, xxmaj, alabama, ;, 202, mi, ...",13035


In [None]:
# Build the vocabulary from `count`; `len(vocab)` is what sizes the language model below.
vocab = make_vocab(count)

In [None]:
# `all_texts` was concatenated as [valid, train, test], so the first len(val_txt)
# rows form the validation split (second list) and every remaining row, i.e. the
# train and test articles, forms the training split (first list).
splits = [list(range(len(val_txt), len(df_tok))), list(range(len(val_txt)))]
# Reuse the `vocab` built in the previous cell instead of calling
# `make_vocab(count)` a second time: this avoids the duplicate work and
# guarantees the token ids come from the same vocabulary whose length sizes the model.
tfm = Numericalize(vocab)

In [None]:
# Wrap the token lists with the numericalization transform. `filts=splits`
# presumably defines the subsets exposed as `dsrc.train` / `dsrc.valid` below — confirm.
dsrc = DataSource(df_tok["text"].values, [tfm], filts=splits)

In [None]:
# Batch size and sequence length; the validation loader uses twice the batch size.
bs, sl = 104, 72
# Both loaders receive the per-text token counts for their own split via `lens`.
train_dl = LMDataLoader(
    dsrc.train,
    lens=df_tok["text_lengths"].values[splits[0]],
    bs=bs,
    seq_len=sl,
    after_batch=[Cuda()],
    shuffle=True,
    num_workers=8,
)
valid_dl = LMDataLoader(
    dsrc.valid,
    lens=df_tok["text_lengths"].values[splits[1]],
    bs=2*bs,
    seq_len=sl,
    after_batch=[Cuda()],
    num_workers=8,
)

In [None]:
# Bundle the training and validation loaders, then display a sample batch as a sanity check.
dbch = DataBunch(train_dl, valid_dl)
dbch.show_batch()

Unnamed: 0,text
0,"xxbos = xxmaj clocks ( song ) = \n▁\n▁ "" xxmaj clocks "" is a song by xxmaj british alternative rock band xxmaj coldplay . xxmaj it was written and composed , as a collaboration between all the members of the band , for their second album , a xxmaj rush of xxmaj blood to the xxmaj head . xxmaj built around a piano riff , the song features cryptic lyrics of"
1,"xxmaj ho - xxunk "" or "" xxmaj winnebago "" ) of xxmaj wisconsin and xxmaj illinois and the xxmaj cheyenne , amongst others . xxmaj to the xxunk and xxunk of xxmaj arizona , the xxunk of the cougar was a xxunk of death . xxmaj the xxunk and xxmaj ojibwe believe that the cougar lived in the underworld and was wicked , whereas it was a sacred animal among the"
2,"xxmaj jane xxmaj austen , who lived in xxmaj bath with her parents and sister from 1801 to 1805 . xxmaj her two novels set in xxmaj bath , xxunk xxmaj abbey and xxunk , were published in 1818 and both mention the xxmaj assembly xxmaj rooms : \n▁ xxmaj mrs xxmaj allen was so long in dressing , that they did not enter the ball - room till late . xxmaj"
3,"small garrison in xxmaj gacko , consisting of only 20 gendarmes and 30 xxmaj ustaše , were holding out but expecting more attacks by the rebels . xxmaj in the morning , the attack by elements of the 10th xxmaj battalion stalled until the battalion commander , xxmaj lieutenant xxmaj colonel xxunk xxunk , personally took command of the operation , clearing the way for the xxmaj italians . xxmaj the promised"
4,", with the destroyers in the middle . xxmaj on arrival at xxmaj st xxmaj nazaire the xxunk mls were to head for the xxmaj old xxmaj mole to disembark their xxmaj commandos , while the starboard lane would make for the old entrance to the basin to disembark theirs . xxmaj not having the range to reach xxmaj st xxmaj nazaire unaided , the xxup mtb and xxup mgb were taken"
5,"at number 46 on the xxmaj billboard xxmaj hot r & b / hip - hop xxmaj songs chart . \n▁\n▁ = = xxmaj background = = \n▁\n▁ "" xxmaj forbidden xxmaj fruit "" was the last song recorded and produced by xxup j. xxmaj cole for his second album , xxmaj born xxmaj sinner ( 2013 ) . xxmaj it was recorded as a replacement for another song which xxmaj cole"
6,"xxmaj george xxmaj washington , xxmaj president xxmaj grant , and xxmaj susquehanna . xxmaj the convoy was escorted by xxmaj huntington when it departed from xxmaj new xxmaj york on 18 xxmaj february , and arrived at saint - nazaire on 4 xxmaj march . xxmaj ten animals on board xxmaj el xxmaj sol died or were destroyed during the crossing . \n▁ xxmaj the next recorded activity of xxmaj el"
7,"a set of xxunk and diamond jewels ( a tiara , brooch and xxunk ) that were originally in xxmaj lady xxmaj rosebery 's ownership . \n▁ xxmaj today , xxmaj lady xxmaj rosebery is a mere footnote in the long history of her husband 's family , rather as xxunk xxmaj vanderbilt is regarded in the spencer - churchill family . xxmaj her husband , once one of the "" most"
8,"xxmaj black xxmaj flag ) will play the role of xxmaj constantine in the xxup tv series pilot . xxmaj on xxmaj may 8 , xxup nbc announced it had officially picked up xxmaj constantine for the xxmaj fall 2014 season . xxmaj the show ran for 13 episodes , and on xxmaj may 8 , 2015 , xxup nbc cancelled xxmaj constantine after the end of its first season . xxmaj"
9,"an enrollment of more than 2 @,@ 500 students . xxmaj the xxunk ( 340 @,@ xxrep 3 0 m2 ) campus includes schools of medicine , nursing , allied health professions , and a graduate school of xxunk sciences , as well as three institutes for advanced studies & medical humanities , a major medical library , seven hospitals , a network of clinics that provide a full range of primary"


## Model

In [None]:
# Work on a copy so the library-level default config is left untouched,
# then override the `*_p` settings for this run.
config = awd_lstm_lm_config.copy()
config.update(input_p=0.6, output_p=0.4, weight_p=0.5, embed_p=0.1, hidden_p=0.2)
# The model is sized by the vocabulary length.
model = get_language_model(AWD_LSTM, len(vocab), config=config)

In [None]:
# Optimizer factory: Adam with wd=0.1 and eps=1e-7 pre-bound.
opt_func = partial(Adam, wd=0.1, eps=1e-7)
# Callback factories handed to the Learner: MixedPrecision with clip=0.1 and
# RNNTrainer with alpha=2, beta=1.
cb_funcs = [partial(MixedPrecision, clip=0.1), partial(RNNTrainer, alpha=2, beta=1)]

In [None]:
# Assemble model, data, loss, optimizer factory and callbacks; accuracy and
# perplexity are reported alongside the losses.
learn = Learner(model, dbch, loss_func=CrossEntropyLossFlat(), opt_func=opt_func, cb_funcs=cb_funcs, metrics=[accuracy, Perplexity()])

In [None]:
# Run the integration test: a single epoch (the output below has one row, epoch 0).
# NOTE(review): 5e-3 is presumably the maximum learning rate of the cycle — confirm.
learn.fit_one_cycle(1, 5e-3, moms=(0.8,0.7,0.8), div=10)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,6.197361,5.156197,0.229459,173.503296,00:29
