In [None]:
from local.torch_basics import *
from local.test import *
from local.core import *
from local.layers import *
from local.data.all import *
from local.notebook.showdoc import show_doc
from local.optimizer import *
from local.learner import *
from local.metrics import *
from local.text.core import *
from local.text.data import *
from local.text.models.core import *
from local.text.models.awdlstm import *
from local.callback.rnn import *
from local.callback.all import *

# Integration test on Wikitext-2

> Training a Language Model on WT2

## Data

In [None]:
# Location of the WIKITEXT_TINY dataset (presumably downloaded and extracted by `untar_data`
# when not already present); the cells below read train.txt, valid.txt and test.txt from it.
path = untar_data(URLs.WIKITEXT_TINY)

The dataset comes with all the articles concatenated. We split them to be able to shuffle at the beginning of each epoch.

In [None]:
def istitle(line):
    "Return `True` when `line` is a top-level heading of the form ` = Title = ` (no `=` inside the title)."
    title_match = re.search(r'^ = [^=]* = $', line)
    return title_match is not None

def read_file(filename):
    "Read `filename` and return an `L` of articles, each one a list of its space-separated tokens."
    with open(filename, encoding='utf8') as f:
        lines = f.readlines()
    articles, pending = L(), []
    last_lookahead = len(lines) - 2
    for idx, text in enumerate(lines):
        # The raw `<unk>` marker is swapped for the library's `UNK` token.
        pending.append(text.replace('<unk>', UNK))
        # An article ends when the next line is a lone-space line and the one after it is a title.
        at_boundary = idx < last_lookahead and lines[idx+1] == ' \n' and istitle(lines[idx+2])
        if at_boundary:
            articles.append(''.join(pending).split(' '))
            pending = []
    # Whatever is left after the last boundary forms the final article.
    articles.append(''.join(pending).split(' '))
    return articles

Then we put our list of tokenized texts together in an `LMDataLoader`. It will return tuples of sequences of length `seq_len`, with the second sequence being the first one shifted by one to the right.

In [None]:
# Split each of the three corpus files into per-article token lists (see `read_file`).
trn_txt = read_file(path/'train.txt')
val_txt = read_file(path/'valid.txt')
tst_txt = read_file(path/'test.txt')

In [None]:
# Order matters: the split indices built further down assume the validation articles
# occupy the first `len(val_txt)` rows, followed by the train and then the test articles.
# Plain list concatenation is used rather than `np.concatenate` because the articles have
# different lengths, and recent NumPy versions raise when asked to build an array from
# ragged sequences. Each DataFrame row still holds one article as a list of tokens.
all_texts = list(val_txt) + list(trn_txt) + list(tst_txt)
df = pd.DataFrame({'texts': all_texts})
df.head()

Unnamed: 0,texts
0,"[, \n, =, Homarus, gammarus, =, \n, \n, Homarus, gammarus, ,, known, as, the, European, lobster, or, common, lobster, ,, is, a, species, of, xxunk, lobster, from, the, eastern, Atlantic, Ocean, ,, Mediterranean, Sea, and, parts, of, the, Black, Sea, ., It, is, closely, related, to, the, American, lobster, ,, H., americanus, ., It, may, grow, to, a, length, of, 60, cm, (, 24, in, ), and, a, mass, of, 6, kilograms, (, 13, lb, ), ,, and, bears, a, conspicuous, pair, of, claws, ., In, life, ,, the, lobsters, are, blue, ,, only, becoming, "", lobster, red, "", on, ...]"
1,"[, \n, =, Frank, xxunk, =, \n, \n, Air, Vice, Marshal, Frank, xxunk, ,, CB, ,, CBE, (, 15, July, 1914, –, 23, December, 1976, ), was, a, senior, commander, in, the, Royal, Australian, Air, Force, (, RAAF, ), ., Born, and, educated, in, Tasmania, ,, he, joined, the, RAAF, as, an, air, cadet, in, January, 1934, ., He, specialised, in, flying, instruction, and, navigation, before, the, outbreak, of, World, War, II, ., In, April, 1941, ,, he, became, commanding, officer, of, No., 2, Squadron, ,, which, operated, Lockheed, xxunk, ., The, squadron, was, deployed, to, Dutch, Timor, in, December, ..."
2,"[, \n, =, M, @-@, 82, (, Michigan, highway, ), =, \n, \n, M, @-@, 82, is, a, state, trunkline, in, the, Lower, Peninsula, in, the, US, state, of, Michigan, that, travels, between, xxunk, and, Howard, City, ., The, section, between, xxunk, and, Howard, City, travels, through, xxunk, and, along, the, southern, edge, of, xxunk, National, Forest, ., The, current, version, of, M, @-@, 82, is, actually, the, second, in, the, state, ;, the, first, usage, appeared, in, the, Upper, Peninsula, by, 1919, ., The, Lower, Peninsula, routing, has, been, in, use, since, the, 1920s, ., Various, extensions,..."
3,"[, \n, =, xxunk, xxunk, =, \n, \n, xxunk, xxunk, (, xxunk, xxunk, ,, xxunk, xxunk, ), is, a, fictional, character, in, the, xxunk, manga, and, anime, series, created, by, xxunk, xxunk, ., In, the, anime, and, manga, ,, xxunk, is, a, ninja, affiliated, with, the, village, of, xxunk, ., He, is, a, member, of, Team, 10, ,, a, group, of, ninja, consisting, of, himself, ,, xxunk, xxunk, ,, xxunk, xxunk, ,, and, team, leader, xxunk, xxunk, ., xxunk, is, portrayed, as, a, lazy, character, ,, unwilling, to, apply, his, prodigious, intelligence, ;, xxunk, has, noted, that, he, likes, xxunk, ...]"
4,"[, \n, =, Meridian, ,, Mississippi, =, \n, \n, Meridian, is, the, sixth, largest, city, in, the, state, of, Mississippi, ,, in, the, United, States, ., It, is, the, county, seat, of, Lauderdale, County, and, the, principal, city, of, the, Meridian, ,, Mississippi, xxunk, Statistical, Area, ., Along, major, highways, ,, the, city, is, 93, mi, (, 150, km, ), east, of, Jackson, ,, Mississippi, ;, 154, mi, (, xxunk, km, ), west, of, Birmingham, ,, Alabama, ;, 202, mi, (, 325, km, ), northeast, of, New, Orleans, ,, Louisiana, ;, and, 231, mi, (, 372, km, ), southeast, of, ...]"


In [None]:
# Tokenization step left disabled, presumably because `read_file` already splits every
# article on spaces; the token counts are computed directly from `df["texts"]` in the next cell.
#df_tok,count = tokenize_df(df, ['texts'])

In [None]:
# Token frequencies over every row of `df` (validation, train and test articles alike).
count = Counter(tok for text in df["texts"].values for tok in text)
vocab = make_vocab(count)

In [None]:
# First list: every row after the validation articles; second list: the validation rows
# (rows are ordered valid, train, test when `all_texts` is built).
# NOTE(review): the first range therefore also contains the test articles - confirm this is intended.
splits = [list(range(len(val_txt), len(df))), list(range(len(val_txt)))]
# Reuse the `vocab` already built from `count` instead of rebuilding it, so the numericalizer
# and the model (sized with `len(vocab)`) are guaranteed to share one vocabulary.
tfm = Numericalize(vocab)

In [None]:
# Build a `DataSource` over the token lists, with `tfm` as its only transform and `splits` as its filters.
dsrc = DataSource(df["texts"].values, [tfm], filts=splits)

In [None]:
bs = 104  # training batch size; the validation loader uses twice this
sl = 72   # sequence length handed to both loaders
# Only the training loader shuffles; both move batches through `Cuda()` and use 8 workers.
train_dl = LMDataLoader(
    dsrc.train,
    bs=bs,
    seq_len=sl,
    after_batch=[Cuda()],
    shuffle=True,
    num_workers=8,
)
valid_dl = LMDataLoader(
    dsrc.valid,
    bs=2*bs,
    seq_len=sl,
    after_batch=[Cuda()],
    num_workers=8,
)

In [None]:
# Bundle the two loaders into a `DataBunch` and display a sample batch as a sanity check.
dbch = DataBunch(train_dl, valid_dl)
dbch.show_batch()

Unnamed: 0,text
0,"\n = xxunk = \n \n xxunk , known in Japan as I Love Donald Duck : Georgia xxunk no xxunk ( Japanese : xxunk xxunk , Hepburn : xxunk xxunk xxunk xxunk xxunk xxunk no xxunk ) , is a 1991 xxunk video game developed and published by Sega for the Sega Genesis . The game was released in Europe in 1991 , in North America on December 19 ,"
1,"gaining the high @-@ kingship , Toirdelbach 's decision to march @-@ on Dublin reveals that the acquisition of this coastal kingdom had also become an essential part of the process . \n Toirdelbach 's subsequent capture of Donnchad in Dublin suggests that the latter was not only the leading Uí Chennselaig xxunk , but was also in the process of using the town as the capital of Leinster . Although the"
2,", Baptists and xxunk settled in the Welsh xxunk of Pennsylvania . While some Welsh colonists like Roger Williams , left to found Rhode Island , Anne Hutchinson founded a seed settlement in New York . Rhode Island was not initially counted as part of New England , having been excluded from the New England Confederation , but later joined the Dominion of New England . Thus , the definition of the"
3,"II , Count of Tripoli , granted the order property in the county . According to historian Jonathan Riley @-@ Smith , the Hospitallers effectively established a "" palatinate "" within Tripoli . The property included castles with which the Hospitallers were expected to defend Tripoli . Along with Krak des Chevaliers , the Hospitallers were given four other castles along the borders of the state which allowed the order to dominate"
4,"and behavioral approaches such as distraction techniques . \n \n = = xxunk = = \n \n The prognosis is worse in those with larger burns , those who are older , and those who are females . The presence of a smoke inhalation injury , other significant injuries such as long bone fractures , and serious co @-@ xxunk ( e.g. heart disease , diabetes , psychiatric illness , and xxunk"
5,"are xxunk , elastic , durable , and ( for military and espionage uses ) will not xxunk suspicion if found . \n xxunk military xxunk began during World War II , and includes covering the xxunk of rifle barrels to prevent xxunk , the waterproofing of firing assemblies in underwater xxunk , and storage of corrosive materials and xxunk by paramilitary agencies . \n Condoms have also been used to smuggle"
6,"music that landed a Top 10 spot on the Billboard 's Hot R & B / Hip @-@ Hop Singles Sales Chart during its first week of release . The university 's two @-@ story trading room simulator , located in the College of Business , provides hands @-@ on financial education using 25 dual @-@ monitor computers and can accommodate 50 people at one time . A second lab provides full"
7,"2004 , both schools moved to the Atlantic Coast Conference - which now has a tie @-@ in for what is now called the Chick @-@ xxunk A Bowl . \n \n \n = Sinclair Sovereign = \n \n The Sinclair Sovereign was a high @-@ end calculator introduced by Clive Sinclair 's company Sinclair xxunk in 1976 . It was an attempt to escape from the unprofitable low end of"
8,"characteristics . xxunk Jonathan Bignell suggests that the use of portraits conveys Anderson 's xxunk to "" visual revelation of machines and physical action "" . \n According to Daniel O 'Brien , writer of SF : UK : How British Science Fiction Changed the World , the Thunderbirds title sequence encapsulates the reasons for the series ' enduring popularity . Dyneley 's countdown is particularly well remembered and has been widely"
9,"was later described by Seattle writer and journalist xxunk Berger as \n one of Seattle 's great ' 60s landmarks , a gathering place for xxunk students , radicals , poets , nut jobs , chess masters , teens , intellectuals , workers , musicians , artists , xxunk , and xxunk ... I remember the xxunk , the open @-@ xxunk music , cigarette smoke , impromptu poetry readings , the"


## Model

In [None]:
# Work on a copy so the library-level `awd_lstm_lm_config` is left untouched, then override
# the `*_p` entries (presumably the dropout probabilities - TODO confirm).
config = awd_lstm_lm_config.copy()
config.update(input_p=0.6, output_p=0.4, weight_p=0.5, embed_p=0.1, hidden_p=0.2)
# The model is sized with the length of the vocabulary built earlier.
model = get_language_model(AWD_LSTM, len(vocab), config=config)

In [None]:
# Optimizer factory: `Adam` with `wd=0.1` and `eps=1e-7` pre-bound.
opt_func = partial(Adam, wd=0.1, eps=1e-7)
# Callback factories passed to the Learner as `cb_funcs`.
cb_funcs = [
    partial(MixedPrecision, clip=0.1),
    partial(RNNTrainer, alpha=2, beta=1),
]

In [None]:
# Assemble the learner from the data, model, optimizer factory and callbacks defined above,
# using `CrossEntropyLossFlat` as the loss and reporting accuracy and perplexity.
learn = Learner(
    dbch,
    model,
    loss_func=CrossEntropyLossFlat(),
    opt_func=opt_func,
    cb_funcs=cb_funcs,
    metrics=[accuracy, Perplexity()],
)

In [None]:
# Single-epoch run that serves as the integration test; the second positional argument is
# presumably the peak learning rate - TODO confirm against `fit_one_cycle`.
learn.fit_one_cycle(1, 5e-3, moms=(0.8,0.7,0.8), div=10)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,6.7266,5.68236,0.206063,293.641663,00:27


Full training

In [None]:
# Same call as the one-epoch run above but for 90 epochs; left commented out, presumably to keep
# the notebook quick to execute. Uncomment to train to completion.
#learn.fit_one_cycle(90, 5e-3, moms=(0.8,0.7,0.8), div=10)