In [None]:
from local.torch_basics import *
from local.test import *
from local.core import *
from local.layers import *
from local.data.all import *
from local.notebook.showdoc import show_doc
from local.optimizer import *
from local.learner import *
from local.metrics import *
from local.text.core import *
from local.text.data import *
from local.text.models.core import *
from local.text.models.awdlstm import *
from local.callback.rnn import *
from local.callback.all import *

# Integration test on Wikitext-2

> Training a Language Model on WT2

## Data

In [None]:
# Fetch the dataset referenced by URLs.WIKITEXT_TINY (presumably downloaded and extracted on first use).
# `path` is the folder the train/valid/test text files are read from below.
path = untar_data(URLs.WIKITEXT_TINY)

The dataset comes with all the articles concatenated. We split them to be able to shuffle at the beginning of each epoch.

In [None]:
def istitle(line):
    "Return `True` when `line` is a top-level article title of the form ` = Some title = `."
    # `[^=]*` forbids any '=' inside the title, so ` = = Section = = ` headings are rejected.
    title_match = re.search(r'^ = [^=]* = $', line)
    return title_match is not None

def read_file(filename):
    "Read `filename` (UTF-8) and return an `L` of article strings, one item per top-level title."
    articles = L()
    with open(filename, encoding='utf8') as f:
        lines = f.readlines()
    pending = []
    # The look-ahead below reads lines[idx+1] and lines[idx+2], so it must stop two lines early.
    lookahead_limit = len(lines) - 2
    for idx, line in enumerate(lines):
        pending.append(line)
        # An article ends when the next line is the ' \n' separator and the one after is a title.
        # The separator line itself becomes the first line of the following article.
        if idx < lookahead_limit and lines[idx+1] == ' \n' and istitle(lines[idx+2]):
            articles.append(''.join(pending))
            pending = []
    # Whatever is left (possibly an empty string) is the last article.
    articles.append(''.join(pending))
    return articles

Then we put our list of tokenized texts together in an `LMDataLoader`. It will return tuples of sequences of length `seq_len`, with the second sequence being the first one shifted by one token to the right.

In [None]:
# Split each of the three raw text files into a list of articles.
trn_txt = read_file(path/'train.txt')
val_txt = read_file(path/'valid.txt')
tst_txt = read_file(path/'test.txt')

In [None]:
# Gather every article in a single-column DataFrame, validation articles first:
# the split indices computed later rely on the validation rows occupying positions 0..len(val_txt)-1.
all_texts = np.concatenate([val_txt, trn_txt, tst_txt])
df = pd.DataFrame({'texts':all_texts})
df.head()

Unnamed: 0,texts
0,"\n = Homarus gammarus = \n \n Homarus gammarus , known as the European lobster or common lobster , is a species of <unk> lobster from the eastern Atlantic Ocean , Mediterranean Sea and parts of the Black Sea . It is closely related to the American lobster , H. americanus . It may grow to a length of 60 cm ( 24 in ) and a mass of 6 kilograms ( 13 lb ) , and bears a conspicuous pair of claws . In life , the lobsters are blue , only becoming "" lobster red "" on cooking . Mating occurs in the summer , producing eggs which are carried by the females for up to a year before hatching into <unk> l..."
1,"\n = Frank <unk> = \n \n Air Vice Marshal Frank <unk> , CB , CBE ( 15 July 1914 – 23 December 1976 ) was a senior commander in the Royal Australian Air Force ( RAAF ) . Born and educated in Tasmania , he joined the RAAF as an air cadet in January 1934 . He specialised in flying instruction and navigation before the outbreak of World War II . In April 1941 , he became commanding officer of No. 2 Squadron , which operated Lockheed <unk> . The squadron was deployed to Dutch Timor in December , and saw action against Japanese forces in the South West Pacific . After returning to Australia in ..."
2,\n = M @-@ 82 ( Michigan highway ) = \n \n M @-@ 82 is a state trunkline in the Lower Peninsula in the US state of Michigan that travels between <unk> and Howard City . The section between <unk> and Howard City travels through <unk> and along the southern edge of <unk> National Forest . The current version of M @-@ 82 is actually the second in the state ; the first usage appeared in the Upper Peninsula by 1919 . The Lower Peninsula routing has been in use since the 1920s . Various extensions and <unk> have shifted the terminus as far west as New Era or <unk> in the past . The current rout...
3,"\n = <unk> <unk> = \n \n <unk> <unk> ( <unk> <unk> , <unk> <unk> ) is a fictional character in the <unk> manga and anime series created by <unk> <unk> . In the anime and manga , <unk> is a ninja affiliated with the village of <unk> . He is a member of Team 10 , a group of ninja consisting of himself , <unk> <unk> , <unk> <unk> , and team leader <unk> <unk> . <unk> is portrayed as a lazy character , unwilling to apply his prodigious intelligence ; <unk> has noted that he likes <unk> due to his <unk> nature . Outside of the <unk> anime and manga , <unk> has appeared in four of the feature f..."
4,"\n = Meridian , Mississippi = \n \n Meridian is the sixth largest city in the state of Mississippi , in the United States . It is the county seat of Lauderdale County and the principal city of the Meridian , Mississippi <unk> Statistical Area . Along major highways , the city is 93 mi ( 150 km ) east of Jackson , Mississippi ; 154 mi ( <unk> km ) west of Birmingham , Alabama ; 202 mi ( 325 km ) northeast of New Orleans , Louisiana ; and 231 mi ( 372 km ) southeast of Memphis , Tennessee . \n Established in 1860 , at the intersection of the Mobile and Ohio Railroad and Southern Railway of ..."


In [None]:
# Tokenize the 'texts' column. `count` is passed to make_vocab below to build the vocabulary
# (presumably a token-frequency counter — confirm against tokenize_df).
df_tok,count = tokenize_df(df, ['texts'])

In [None]:
# Preview the first rows of the tokenized frame.
df_tok.head()

Unnamed: 0,text,text_lengths
0,"[xxbos, =, xxmaj, homarus, gammarus, =, \n▁\n▁, xxmaj, homarus, gammarus, ,, known, as, the, xxmaj, european, lobster, or, common, lobster, ,, is, a, species, of, xxunk, lobster, from, the, eastern, xxmaj, atlantic, xxmaj, ocean, ,, xxmaj, mediterranean, xxmaj, sea, and, parts, of, the, xxmaj, black, xxmaj, sea, ., xxmaj, it, is, closely, related, to, the, xxmaj, american, lobster, ,, xxup, h., americanus, ., xxmaj, it, may, grow, to, a, length, of, 60, cm, (, 24, in, ), and, a, mass, of, 6, kilograms, (, 13, lb, ), ,, and, bears, a, conspicuous, pair, of, claws, ., xxmaj, in, life, ,, ...]",1866
1,"[xxbos, =, xxmaj, frank, xxunk, =, \n▁\n▁, xxmaj, air, xxmaj, vice, xxmaj, marshal, xxmaj, frank, xxunk, ,, xxup, cb, ,, xxup, cbe, (, 15, xxmaj, july, 1914, –, 23, xxmaj, december, 1976, ), was, a, senior, commander, in, the, xxmaj, royal, xxmaj, australian, xxmaj, air, xxmaj, force, (, xxup, raaf, ), ., xxmaj, born, and, educated, in, xxmaj, tasmania, ,, he, joined, the, xxup, raaf, as, an, air, cadet, in, xxmaj, january, 1934, ., xxmaj, he, specialised, in, flying, instruction, and, navigation, before, the, outbreak, of, xxmaj, world, xxmaj, war, xxup, ii, ., xxmaj, in, xxmaj, april, 19...",3339
2,"[xxbos, =, xxup, m-82, (, xxmaj, michigan, highway, ), =, \n▁\n▁, xxup, m-82, is, a, state, trunkline, in, the, xxmaj, lower, xxmaj, peninsula, in, the, xxup, us, state, of, xxmaj, michigan, that, travels, between, xxunk, and, xxmaj, howard, xxmaj, city, ., xxmaj, the, section, between, xxunk, and, xxmaj, howard, xxmaj, city, travels, through, xxunk, and, along, the, southern, edge, of, xxunk, xxmaj, national, xxmaj, forest, ., xxmaj, the, current, version, of, xxup, m-82, is, actually, the, second, in, the, state, ;, the, first, usage, appeared, in, the, xxmaj, upper, xxmaj, peninsula, by...",1024
3,"[xxbos, =, xxunk, xxunk, =, \n▁\n▁, xxunk, xxunk, (, xxunk, xxunk, ,, xxunk, xxunk, ), is, a, fictional, character, in, the, xxunk, manga, and, anime, series, created, by, xxunk, xxunk, ., xxmaj, in, the, anime, and, manga, ,, xxunk, is, a, ninja, affiliated, with, the, village, of, xxunk, ., xxmaj, he, is, a, member, of, xxmaj, team, 10, ,, a, group, of, ninja, consisting, of, himself, ,, xxunk, xxunk, ,, xxunk, xxunk, ,, and, team, leader, xxunk, xxunk, ., xxunk, is, portrayed, as, a, lazy, character, ,, unwilling, to, apply, his, prodigious, intelligence, ;, xxunk, has, noted, that, he,...",1780
4,"[xxbos, =, xxmaj, meridian, ,, xxmaj, mississippi, =, \n▁\n▁, xxmaj, meridian, is, the, sixth, largest, city, in, the, state, of, xxmaj, mississippi, ,, in, the, xxmaj, united, xxmaj, states, ., xxmaj, it, is, the, county, seat, of, xxmaj, lauderdale, xxmaj, county, and, the, principal, city, of, the, xxmaj, meridian, ,, xxmaj, mississippi, xxunk, xxmaj, statistical, xxmaj, area, ., xxmaj, along, major, highways, ,, the, city, is, 93, mi, (, 150, km, ), east, of, xxmaj, jackson, ,, xxmaj, mississippi, ;, 154, mi, (, xxunk, km, ), west, of, xxmaj, birmingham, ,, xxmaj, alabama, ;, 202, mi, ...",13035


In [None]:
# Build the vocabulary from `count`; its length is passed to get_language_model when the model is created.
vocab = make_vocab(count)

In [None]:
# all_texts was assembled as [valid, train, test], so the first len(val_txt) rows form the
# validation split and every remaining row (train + test articles) forms the training split.
# Order inside `splits` is [train indices, valid indices].
splits = [list(range(len(val_txt), len(df_tok))), list(range(len(val_txt)))]
# Reuse the `vocab` built in the previous cell instead of calling make_vocab(count) a second time:
# this avoids duplicate work and guarantees the numericalizer and the model (sized with len(vocab))
# share exactly the same vocabulary.
tfm = Numericalize(vocab)

In [None]:
# Wrap the token lists of the "text" column in a DataSource with `tfm` as its transform,
# partitioned by `splits`; the resulting subsets are read below as dsrc.train and dsrc.valid.
dsrc = DataSource(df_tok["text"].values, [tfm], filts=splits)

In [None]:
# bs is the training batch size and sl the sequence length (passed as bs= and seq_len= below).
bs,sl = 104,72
# Only the training loader shuffles; the validation loader uses a doubled batch size.
# Both apply Cuda() after batching — presumably moving each batch to the GPU (confirm).
train_dl = LMDataLoader(dsrc.train, bs=bs,   seq_len=sl, after_batch=[Cuda()], shuffle=True)
valid_dl = LMDataLoader(dsrc.valid, bs=2*bs, seq_len=sl, after_batch=[Cuda()])

In [None]:
# Bundle the training and validation loaders into a DataBunch and display a sample batch.
dbch = DataBunch(train_dl, valid_dl)
dbch.show_batch()

Unnamed: 0,text
0,"xxbos = lisa ( xxmaj japanese musician , born 1987 ) = \n▁\n▁ xxmaj risa xxmaj oribe ( xxunk xxunk , xxmaj oribe xxmaj risa , born xxmaj june 24 , 1987 ) , better known by her stage name lisa ( an acronym of xxmaj love is xxmaj same xxmaj all ) , is a xxmaj japanese pop singer - songwriter from xxunk , xxunk , signed to xxunk under xxmaj"
1,"( xxunk xxunk ) , and the zebra shark ( xxunk xxunk ) . \n▁\n▁ = = xxmaj distribution and habitat = = \n▁\n▁ xxmaj the tawny nurse shark is widely distributed in the indo - pacific region . xxmaj in the xxmaj indian xxmaj ocean , it is found from kwazulu - natal , xxmaj south xxmaj africa northward to the xxmaj red xxmaj sea , xxmaj persian xxmaj gulf and"
2,". xxmaj that year the xxmaj republicans were in the majority , and the election for governor was close enough to require a xxunk . xxmaj the legislature xxunk the ballots in a partisan manner ( for example , retaining ballots containing xxunk versions of xxmaj republican xxmaj james xxmaj sullivan 's name and discarding similar ballots marked for xxunk xxmaj caleb xxmaj strong ) . xxunk and other xxunk raised a"
3,"xxmaj texas at 22 : 00 xxup utc ( 5 : 00 p.m. xxup cdt ) . xxunk xxunk , a forecaster at the xxmaj weather xxmaj bureau ( now known as the xxmaj national xxmaj weather xxmaj service ) in xxmaj galveston , described the storm as a "" perfectly miniature hurricane . "" xxmaj despite moving onshore , xxmaj abby 's core continued to organize and it developed a closed"
4,"questions this analysis . xxmaj the historian xxmaj david xxmaj carpenter describes xxmaj montfort 's 1265 parliament as "" a landmark "" in the development of parliament as an institution during the medieval period . \n▁\n▁ = = = xxmaj modern recognition = = = \n▁\n▁ xxmaj the xxmaj parliament of the xxmaj united xxmaj kingdom presented a loyal address to xxmaj queen xxmaj elizabeth xxup ii in 1965 to mark the"
5,"xxmaj in 58 , xxunk became romantically involved with xxunk xxunk , the wife of his friend and future emperor xxunk . xxmaj reportedly because a marriage to xxunk and a divorce from xxmaj octavia did not seem politically feasible with xxunk alive , xxunk ordered the murder of his mother in 59 . a number of modern historians find this an unlikely motive as xxunk did not marry xxunk until 62"
6,"a few years earlier . xxmaj in 1977 , xxmaj braathens xxup safe made xxup nok 10 million in profit on the xxmaj trondheim route and xxup nok 4 million on the routes from xxmaj oslo to xxmaj kristiansand and xxmaj stavanger . xxmaj at the same time , the airline lost xxup nok 6 million on the routes to from xxmaj oslo to ålesund , xxmaj molde and xxmaj kristiansund ,"
7,"1853 . xxmaj the image of religious renewal is combined with the image of the xxmaj london air - raids and the constant fighting and destruction within the world . xxmaj this compound image is used to discuss the connection of holy places with the xxmaj holy xxmaj spirit , xxunk , communion with the dead , and the repetition of history . xxmaj the theme is also internal to xxmaj eliot"
8,"fed xxunk and information . xxmaj this marketing and xxunk of opera was reinforced by xxmaj meyerbeer 's xxmaj paris publisher xxmaj maurice xxmaj schlesinger who had established his fortune on the back of xxmaj robert , and even persuaded xxunk de xxunk to write a novella ( xxunk ) to promote xxmaj les xxmaj huguenots . xxmaj schlesinger 's publication of xxmaj franz xxmaj liszt 's xxunk de xxmaj robert le"
9,"by xxmaj métis since the middle 17th century . xxmaj the xxmaj red xxmaj river xxmaj colony , established to supply the xxmaj british fur trade , was fraught with problems from the beginning but became important in the xxmaj minnesota area 's early fur trade as well as supplying many early settlers to the region . \n▁\n▁ = = xxmaj pioneers and exploration = = \n▁\n▁ xxmaj at the beginning of"


## Model

In [None]:
# Work on a copy so that awd_lstm_lm_config itself is not mutated by the update below.
config = awd_lstm_lm_config.copy()
# Override the five `*_p` entries (presumably dropout probabilities — TODO confirm against AWD_LSTM).
config.update({'input_p': 0.6, 'output_p': 0.4, 'weight_p': 0.5, 'embed_p': 0.1, 'hidden_p': 0.2})
# Build the AWD_LSTM language model, passing len(vocab) and the customised config.
model = get_language_model(AWD_LSTM, len(vocab), config=config)

In [None]:
# Optimizer factory: Adam with wd=0.1 and eps=1e-7 pre-bound.
opt_func = partial(Adam, wd=0.1, eps=1e-7)
# Callback factories: MixedPrecision with clip=0.1 and RNNTrainer with alpha=2, beta=1.
# NOTE(review): alpha/beta look like RNN regularisation weights — confirm against RNNTrainer.
cb_funcs = [partial(MixedPrecision, clip=0.1), partial(RNNTrainer, alpha=2, beta=1)]

In [None]:
# Assemble the Learner from the model, data, flattened cross-entropy loss, optimizer and callback
# factories defined above; accuracy and perplexity are reported as metrics.
learn = Learner(model, dbch, loss_func=CrossEntropyLossFlat(), opt_func=opt_func, cb_funcs=cb_funcs, metrics=[accuracy, Perplexity()])

In [None]:
# Train with the one-cycle schedule.
# NOTE(review): the positional args are presumably (n_epochs=1, peak learning rate=5e-3), `moms` the
# momentum schedule and `div` the ratio between peak and starting learning rate — confirm against fit_one_cycle.
learn.fit_one_cycle(1, 5e-3, moms=(0.8,0.7,0.8), div=10)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,6.099594,5.067733,0.237785,158.813858,00:34
