In [1]:
# Reload edited modules automatically before each cell is executed (autoreload mode 2).
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

from fastai.learner import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle
import spacy

  from numpy.core.umath_tests import inner1d


In [2]:
# Root directory of the data used for language-model training.
PATH='data2/'

# Sub-directories (relative to PATH) holding the training and validation text files.
TRN_PATH = 'train/all/'
VAL_PATH = 'test/all/'
# Full paths, built by prefixing each sub-directory with PATH.
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

# List the contents of the data directory as a quick sanity check.
%ls {PATH}

[0m[01;31maclImdb.tgz[0m  [01;34mmodels[0m/  [01;34mtest[0m/  [01;31mtest.zip[0m  [01;34mtmp[0m/  [01;34mtrain[0m/  [01;31mtrain.zip[0m


In [3]:
# Batch size and the bptt value later handed to the language-model data loader.
bs = 64
bptt = 70

In [4]:
# Capture the shell listing of the training directory as a list of file names.
trn_files = !ls {TRN}
# Peek at the first ten entries.
trn_files[:10]

['0_0.txt',
 '0_3.txt',
 '0_9.txt',
 '10000_0.txt',
 '10000_4.txt',
 '10000_8.txt',
 '1000_0.txt',
 '10001_0.txt',
 '10001_10.txt',
 '10001_4.txt']

In [5]:
# Read one training review through the shell; the captured output is indexable by line.
review = !cat {TRN}{trn_files[6]}
# Show the first line of that review.
review[0]

"I have to say when a name like Zombiegeddon and an atom bomb on the front cover I was expecting a flat out chop-socky fung-ku, but what I got instead was a comedy. So, it wasn't quite was I was expecting, but I really liked it anyway! The best scene ever was the main cop dude pulling those kids over and pulling a Bad Lieutenant on them!! I was laughing my ass off. I mean, the cops were just so bad! And when I say bad, I mean The Shield Vic Macky bad. But unlike that show I was laughing when they shot people and smoked dope.<br /><br />Felissa Rose...man, oh man. What can you say about that hottie. She was great and put those other actresses to shame. She should work more often!!!!! I also really liked the fight scene outside of the building. That was done really well. Lots of fighting and people getting their heads banged up. FUN! Last, but not least Joe Estevez and William Smith were great as the...well, I wasn't sure what they were, but they seemed to be having fun and throwing out 

In [6]:
# words in training dataset: concatenate every .txt file under TRN and count words with `wc -w`
!find {TRN} -name '*.txt' | xargs cat | wc -w

17486692


In [7]:
# words in test dataset: concatenate every .txt file under VAL and count words with `wc -w`
!find {VAL} -name '*.txt' | xargs cat | wc -w

5686719


In [8]:
# Load spaCy's English pipeline; used below to inspect how a review gets tokenized.
# NOTE(review): the 'en' shortcut name presumably requires spaCy 2.x — confirm before upgrading spaCy.
spacy_tok = spacy.load('en')

In [9]:
# Check what kind of object spacy.load returned.
type(spacy_tok)

spacy.lang.en.English

In [10]:
# Tokenize the sample review with spaCy and re-join the tokens with single spaces.
# Iterating over the parsed document yields tokens (not sentences), hence `tok`.
# `.text` replaces the deprecated `.string` attribute; after strip() the result is the same.
' '.join([tok.text.strip() for tok in spacy_tok(review[0])])

"I have to say when a name like Zombiegeddon and an atom bomb on the front cover I was expecting a flat out chop - socky fung - ku , but what I got instead was a comedy . So , it was n't quite was I was expecting , but I really liked it anyway ! The best scene ever was the main cop dude pulling those kids over and pulling a Bad Lieutenant on them ! ! I was laughing my ass off . I mean , the cops were just so bad ! And when I say bad , I mean The Shield Vic Macky bad . But unlike that show I was laughing when they shot people and smoked dope.<br /><br />Felissa Rose ... man , oh man . What can you say about that hottie . She was great and put those other actresses to shame . She should work more often ! ! ! ! ! I also really liked the fight scene outside of the building . That was done really well . Lots of fighting and people getting their heads banged up . FUN ! Last , but not least Joe Estevez and William Smith were great as the ... well , I was n't sure what they were , but they see

In [11]:
# use torchtext to preprocess data

# Field describing how raw text is processed: lower-cased, with tokenization set to "spacy".
TEXT = data.Field(lower=True, tokenize="spacy")

In [12]:
# the parameters (repeats the values assigned to bs and bptt earlier in the notebook)
bs, bptt = 64, 70

In [13]:
# Sub-directory for each split; note that validation and test both point at VAL_PATH.
FILES = {'train': TRN_PATH, 'validation': VAL_PATH, 'test': VAL_PATH}

In [14]:
# Build the language-model data object from the text files under PATH, using the
# TEXT field and the split sub-directories in FILES.
# min_freq=10 is forwarded to the builder — presumably the minimum number of occurrences
# a token needs to enter the vocabulary (TODO confirm against the fastai/torchtext docs).
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=10)

In [15]:
# The TEXT attribute gets the instance var vocab, the vocabulary for the words from the training text
# Persist the field so the same word-to-id mapping can be reloaded later.
# The with-block guarantees the file is flushed and closed once the dump finishes.
with open(f'{PATH}models/TEXT.pkl', 'wb') as text_file:
    pickle.dump(TEXT, text_file)

In [16]:
# batches; # unique tokens in the vocab; # items in the training dataset (the whole corpus is one item); # tokens in the training set

In [17]:
# (training batches, vocabulary size, items in trn_ds, tokens in the first trn_ds item)
len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

(4583, 37392, 1, 20540756)

In [18]:
# 'itos': 'int-to-string' — indexable by token id; show the first 12 entries
TEXT.vocab.itos[:12]

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is', 'in', 'it']

In [19]:
# 'stoi': 'string to int' — look up the id assigned to a token
TEXT.vocab.stoi['and']

5

## Train

In [20]:
# hyper-parameters based on the paper
# em_sz: size of each embedding vector; nh: hidden activations per layer; nl: number of layers
em_sz, nh, nl = 200, 500, 3
# the two beta coefficients handed to the Adam optimizer
beta1, beta2 = 0.7, 0.99

In [21]:
# Optimizer factory: Adam with betas fixed to (beta1, beta2); remaining arguments are supplied by the caller.
opt_fn = partial(optim.Adam, betas=(beta1, beta2))

In [22]:
# Build the language-model learner and configure its regularization.
# NOTE(review): the rates are said to come from a "notebook cited above", but no
# citation is visible in this notebook — add the reference.
# (1) dropout hyper-parameters are passed straight to get_model
learner = md.get_model(opt_fn, em_sz, nh, nl,
               dropouti=0.05, dropout=0.05, wdrop=0.1, dropoute=0.02, dropouth=0.05)
# (2) regularization function
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
# (3) gradient clipping
learner.clip=0.3

In [23]:
# Fit the language model with a single scalar learning rate (3e-3).
# NOTE(review): this is not discriminative layer training — that would need one rate per
# layer group, as is done later with the `lrs` array for the classifier.
# With cycle_len=1 and cycle_mult=2 the 4 cycles presumably last 1, 2, 4 and 8 epochs
# (TODO confirm against the fastai 0.7 `Learner.fit` documentation).
learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=2)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   
    0      4.963137   4.83045   



[array([4.83045])]

In [38]:
# Save the language model's encoder under the name 'adam_enc'; the classifier loads it later.
learner.save_encoder('adam_enc')

In [39]:
# Load the encoder saved under 'adam_enc' back into the learner.
learner.load_encoder('adam_enc')

In [42]:
# Write the TEXT field to {PATH}models/TEXT.pkl (the same file an earlier cell writes).
# The with-block guarantees the file is flushed and closed once the dump finishes.
with open(f'{PATH}models/TEXT.pkl', 'wb') as text_file:
    pickle.dump(TEXT, text_file)

## Test

In [27]:
# CHANGE THE TEXT
# Prepare a short prompt for text generation with the trained language model.
m=learner.model
ss=""". So, it wasn't quite was I was expecting, but I really liked it anyway! The best"""
# Run the prompt through the TEXT field's preprocessing (wrapped in a list: a batch of one).
s = [TEXT.preprocess(ss)]
# Convert the preprocessed tokens into the numeric form the model takes as input.
t=TEXT.numericalize(s)
# Display the preprocessed prompt as a single string.
' '.join(s[0])

". so , it was n't quite was i was expecting , but i really liked it anyway ! the best"

In [28]:
# Run the prompt through the model once to obtain the first set of predictions.
# Set batch size to 1 temporarily to output words
m[0].bs=1
# Turn off dropout
m.eval()
# Reset hidden state
m.reset()
# Get predictions from model (only the first element of the model's output is kept)
predictions,*_ = m(t)
# Put the batch size back to original size
m[0].bs=bs

In [29]:
# perform prediction: generate 50 tokens that continue the prompt
print(ss,"\n")
for i in range(50):
    # Indices of the two highest-scoring next tokens for the last position.
    n=predictions[-1].topk(2)[1]
    # Skip id 0 ('<unk>' in TEXT.vocab.itos) by falling back to the runner-up.
    n = n[1] if n.data[0]==0 else n[0]
    print(TEXT.vocab.itos[n.data[0]], end=' ')
    # Feed the chosen token back into the model and keep the new output in
    # `predictions`, so the next iteration continues from it. Storing it under a
    # different name would leave `predictions` stale and repeat the same token.
    predictions,*_ = m(n[0].unsqueeze(0))
print('...')

. So, it wasn't quite was I was expecting, but I really liked it anyway! The best 

part of the characters were very good . the acting was very good , and the acting was very good . the acting was very good , and the acting was very good . the acting was very good , but the acting was very good . the acting was ...


## Sentiment analysis: fine-tune the pre-trained model on task-specific data

In [30]:
class ImdbDataset(torchtext.data.Dataset):
    """torchtext dataset of labelled reviews stored as ``<path>/<label>/*.txt``.

    Every ``.txt`` file under the ``pos`` and ``neg`` sub-directories of
    ``path`` becomes one example whose label is the directory name.
    """

    def __init__(self, path, text_field, label_field, **kwargs):
        """Load every review file found under ``path``.

        Args:
            path: directory containing the ``pos`` and ``neg`` sub-directories.
            text_field: field applied to the review text.
            label_field: field applied to the label.
            **kwargs: forwarded unchanged to ``torchtext.data.Dataset``.

        Raises:
            AssertionError: if a label directory contains no ``.txt`` files.
        """
        fields = [('text', text_field), ('label', label_field)]
        examples = []
        for label in ['pos', 'neg']:
            label_dir = os.path.join(path, label)
            # Sorted so the example order does not depend on the file system.
            fnames = sorted(glob(os.path.join(label_dir, '*.txt')))
            assert fnames, f"can't find any '*.txt' files under {label_dir}"
            for fname in fnames:
                # Read the whole file with an explicit encoding: readline() would
                # silently drop everything after the first line, and the platform
                # default encoding is not guaranteed to be UTF-8.
                with open(fname, 'r', encoding='utf-8') as f:
                    text = f.read()
                examples.append(data.Example.fromlist([text, label], fields))
        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        """Return the length of the example's text (used as the sort key)."""
        return len(ex.text)

    @classmethod
    def splits(cls, text_field, label_field, root='.data',
               train='train', test='test', **kwargs):
        """Create the train and test datasets located under ``root``.

        Args:
            text_field: field applied to the review text.
            label_field: field applied to the label.
            root: directory passed to the parent ``splits`` as its first argument.
            train: name of the training sub-directory.
            test: name of the test sub-directory.
            **kwargs: forwarded unchanged to the parent ``splits``.

        Returns:
            Whatever the parent ``splits`` returns; no validation split is
            requested (``validation=None``).
        """
        return super().splits(
            root, text_field=text_field, label_field=label_field,
            train=train, validation=None, test=test, **kwargs)

In [31]:
# use the same word map to IDs
# use the previous path
# NOTE: unpickling can execute arbitrary code — only load files written by this notebook.
# The with-block closes the file handle as soon as loading finishes.
with open(f'{PATH}models/TEXT.pkl', 'rb') as text_file:
    TEXT = pickle.load(text_file)

In [46]:
# Root of the labelled sentiment data — a different directory from PATH ('data2/').
PATH_sentiment = 'data/'
# Label field; sequential=False presumably marks each label as a single value rather
# than a token sequence (TODO confirm against the torchtext Field docs).
IMDB_LABEL = data.Field(sequential=False)
# Build the datasets from the 'train' and 'valid' sub-directories, reusing the reloaded TEXT field.
splits = ImdbDataset.splits(TEXT, IMDB_LABEL, PATH_sentiment, train='train', test='valid')

In [47]:
# Inspect what ImdbDataset.splits returned.
print(splits)

(<__main__.ImdbDataset object at 0x7f48f3782e10>, <__main__.ImdbDataset object at 0x7f48f3782ef0>)


In [48]:
# Wrap the dataset splits in a TextData object rooted at PATH_sentiment, with batch size bs.
md2 = TextData.from_splits(PATH_sentiment, splits, bs)

In [51]:
# The encoder was saved under {PATH}models, whereas this learner is built on
# PATH_sentiment and presumably looks in {PATH_sentiment}models. The files used to be
# copied by hand; copy them here so the notebook runs from a fresh kernel.
import shutil
enc_src_dir, enc_dst_dir = f'{PATH}models', f'{PATH_sentiment}models'
os.makedirs(enc_dst_dir, exist_ok=True)
# The wildcard covers whatever extension save_encoder appended to 'adam_enc'.
for enc_file in glob(os.path.join(enc_src_dir, 'adam_enc*')):
    shutil.copy2(enc_file, enc_dst_dir)

# Build the classifier learner with the same embedding size, hidden size and layer
# count as the language model, so the saved encoder can be loaded into it.
m3 = md2.get_model(opt_fn, 1500, bptt, emb_sz=em_sz, n_hid=nh, n_layers=nl, 
           dropout=0.1, dropouti=0.4, wdrop=0.5, dropoute=0.05, dropouth=0.3)
m3.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
# Start from the encoder pre-trained by the language model.
m3.load_encoder('adam_enc')

In [52]:
# Gradient clipping value for the classifier learner.
m3.clip=25.
# Five learning rates in increasing order — presumably one per layer group of the
# classifier, earliest group first (TODO confirm against the fastai docs).
lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])

In [53]:
# Stage 1: freeze_to(-1) — presumably leaves only the last layer group trainable
# (TODO confirm) — then fit once at half the learning rates.
m3.freeze_to(-1)
m3.fit(lrs/2, 1, metrics=[accuracy])
# Stage 2: unfreeze and fit again at the full learning rates.
m3.unfreeze()
m3.fit(lrs, 1, metrics=[accuracy], cycle_len=1)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   accuracy   
    0      0.554098   0.590021   0.691904  



HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   accuracy   
    0      0.484516   0.527257   0.747325  



[array([0.52726]), 0.7473248047421132]

In [None]:
# second argument reduced from 7 to 3 — with cycle_len set it is presumably the number
# of cycles (each cycle_len=2 epochs long) rather than the number of epochs; TODO confirm.
# cycle_save_name='imdb2' names the per-cycle checkpoints that load_cycle reads back.
m3.fit(lrs, 3, metrics=[accuracy], cycle_len=2, cycle_save_name='imdb2')

In [None]:
# Restore a checkpoint written by the fit that uses cycle_save_name='imdb2'.
# That fit runs only 3 cycles, presumably numbered 0-2, so the old index 4 (left over
# from the 7-cycle run) no longer exists; load the last cycle instead.
m3.load_cycle('imdb2', 2)

In [None]:
# Final accuracy: the output of predict_with_targs() is unpacked straight into accuracy_np.
accuracy_np(*m3.predict_with_targs())