In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.learner import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle
import spacy
import sys

  from numpy.core.umath_tests import inner1d


In [2]:
# pipe std_out to file to prevent python websocket error
# Keep a handle on the current stdout so a later cell can restore it.
old_stdout = sys.stdout
# From here on, anything written to sys.stdout goes to this file instead.
# NOTE(review): the file object is never closed in this cell; its contents may
# stay buffered until it is closed or flushed.
sys.stdout = open('LanguageModel_output.txt', 'w')

In [3]:
# Root directory of the language-model corpus. The trailing slash matters:
# every path below is built by plain string concatenation.
PATH = 'data2/'

# Sub-directories (relative to PATH) with the training and the held-out text.
TRN_PATH, VAL_PATH = 'train/all/', 'test/all/'

# Full directory paths, used by the word-count sanity checks.
TRN = PATH + TRN_PATH
VAL = PATH + VAL_PATH

In [4]:
# Sanity check on the number of words used to train the language model
# words in training dataset
!find {TRN} -name '*.txt' | xargs cat | wc -w

In [5]:
# words in test dataset
!find {VAL} -name '*.txt' | xargs cat | wc -w

## The language model, the hyper-parameter choices and the subsequent transfer-learning technique are modelled on the paper (https://arxiv.org/abs/1801.06146) and the fastai implementation (https://github.com/fastai/fastai/blob/master/courses/dl1/lesson4-imdb.ipynb).

## The implementation trains the language model on IMDB data as well. I use the same IMDB data to create the model. However, for the transfer learning, another IMDB dataset from Kaggle is used.

In [6]:
class LanguageModel:
    """Build, train and persist a fastai RNN language model.

    The hyper-parameters follow the paper cited in this notebook
    (https://arxiv.org/abs/1801.06146). The pickled tokenizer is stored
    relative to the ``PATH`` given to the constructor, not to any
    notebook-level variable.
    """

    # name under which the encoder is saved and re-loaded through the learner
    ENCODER_NAME = 'adam_enc'

    def __init__(self, PATH, TRN_PATH, VAL_PATH):
        """Create the torchtext field, the fastai data object and the learner.

        Args:
            PATH: root data directory. It must end with a path separator
                because file names are appended to it directly.
            TRN_PATH: training-text directory, handed to fastai as ``train``.
            VAL_PATH: held-out text directory, used for both ``validation``
                and ``test``.
        """
        self.path = PATH
        self.text = data.Field(lower=True, tokenize="spacy")
        self.files = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
        # hyper-parameters based on the paper
        self.bs = 64 # batch-size
        self.bptt = 70 # bptt, the number of input words for a given batch
        self.em_sz = 200  # size of each embedding vector
        self.nh = 500     # number of hidden activations per layer
        self.nl = 3       # number of layers
        self.beta1 = 0.7  # adam optimizer parameter
        self.beta2 = 0.99 # adam optimizer parameter
        self.opt_fn = partial(optim.Adam, betas=(self.beta1, self.beta2))
        # create an instance of Fastai language model
        self.md = self.genLanguageModel()
        print(type(self.md))
        # create the learner
        self.learner = self.md.get_model(self.opt_fn, self.em_sz, self.nh, self.nl,
               dropouti=0.05, dropout=0.05, wdrop=0.1, dropoute=0.02, dropouth=0.05)

    def _text_pickle_path(self):
        """Return the location of the pickled torchtext field under this model's data root."""
        return f'{self.path}models/TEXT.pkl'

    def genLanguageModel(self):
        """Return the fastai NLP language-model data object built from the text files."""
        # the min_freq term requires that a word appear at least 10 times to be considered as input to RNN
        return LanguageModelData.from_text_files(self.path, self.text, **self.files, bs=self.bs, bptt=self.bptt, min_freq=10)

    def printStats(self):
        """Return a 4-tuple of dataset statistics (nothing is printed here).

        The tuple is ``(len(md.trn_dl), md.nt, len(md.trn_ds),
        len(md.trn_ds[0].text))``. The author describes these as: batches,
        unique tokens in the vocab, tokens in the training set, sentences.
        """
        return len(self.md.trn_dl), self.md.nt, len(self.md.trn_ds), len(self.md.trn_ds[0].text)

    def getSerializedModel(self):
        """Load and return the torchtext field pickled by ``trainModel``.

        NOTE: unpickling can execute arbitrary code; only load files that
        this notebook produced itself.
        """
        # read from this instance's data root and close the handle afterwards
        with open(self._text_pickle_path(), 'rb') as text_file:
            return pickle.load(text_file)

    def getEncoder(self):
        """Load the saved encoder into the learner; call after training has been done."""
        return self.learner.load_encoder(self.ENCODER_NAME)

    def trainModel(self):
        """Fit the language model, then persist the encoder and the tokenizer.

        Returns:
            ``(learner, text)``: the fitted learner and the torchtext field
            that was used to tokenize and numericalize the corpus.
        """
        self.learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
        self.learner.clip=0.3
        # TODO: count number of epochs
        # fit the model: learning rate 3e-3, one cycle, weight decay 1e-6
        self.learner.fit(3e-3, 1, wds=1e-6, cycle_len=1, cycle_mult=2)
        # save the encoder (used in the transfer learning)
        self.learner.save_encoder(self.ENCODER_NAME)
        self.learner.load_encoder(self.ENCODER_NAME)
        # save the tokenizer to disk under this instance's data root; the
        # context manager guarantees the file is flushed and closed
        with open(self._text_pickle_path(), 'wb') as text_file:
            pickle.dump(self.text, text_file)

        return self.learner, self.text

In [7]:
# Build the data object and the learner from the notebook-level PATH, TRN_PATH and VAL_PATH.
initModel = LanguageModel(PATH, TRN_PATH, VAL_PATH)

In [8]:
# printStats() only returns a tuple, so print it explicitly.
# NOTE(review): sys.stdout was redirected earlier, so this ends up in the log file.
print(initModel.printStats())

In [None]:
# Train the language model; returns the fitted learner and the torchtext field
# (kept here as `tokenizer`) that the model was built with.
learner, tokenizer = initModel.trainModel()

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

In [None]:
# redirect output to another file for Transfer Learning
previous_log = sys.stdout
sys.stdout = open('TransferLearning_output.txt', 'w')
# Close (and thereby flush) the language-model log once the new target is in
# place. The name check ensures the notebook's own stream is never closed.
if getattr(previous_log, 'name', None) == 'LanguageModel_output.txt':
    previous_log.close()

In [None]:
# # (batches, unique tokens in the vocab, tokens in the training set, sentences)
# len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

In [None]:
# # ensure that TEXT (the tokenizer) performs the word-to-int and vice-versa mapping correctly
# # 'itos': 'int-to-string'
# TEXT.vocab.itos[:11]

In [None]:
# # 'stoi': 'string to int'
# TEXT.vocab.stoi['the']

## Train

In [None]:
# # hyper-parameters based on the paper
# em_sz = 200  # size of each embedding vector
# nh = 500     # number of hidden activations per layer
# nl = 3       # number of layers
# beta1 = 0.7 # adam  optimization parameters
# beta2 = 0.99

In [None]:
# opt_fn = partial(optim.Adam, betas=(beta1, beta2))

In [None]:
# # exact rates found from notebook cited above
# # (1) implement hyperparms for dropout optimization (2) regularization function (3) Gradient clipping
# learner = md.get_model(opt_fn, em_sz, nh, nl,
#                dropouti=0.05, dropout=0.05, wdrop=0.1, dropoute=0.02, dropouth=0.05)
# learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
# learner.clip=0.3

In [None]:
# # uses the 1 cycle policy to fit the model (https://sgugger.github.io/the-1cycle-policy.html)
# learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=2)

In [None]:
# learner.save_encoder('adam_enc')

In [None]:
# learner.load_encoder('adam_enc')

In [None]:
# pickle.dump(TEXT, open(f'{PATH}models/TEXT.pkl','wb'))

## Test Language Model

In [None]:
# CHANGE THE TEXT
# Prepare a seed sentence for the trained language model to continue.
# the model held by the trained learner
m=learner.model
# seed text
ss=""". So, it wasn't quite was I was expecting, but I really liked it anyway! The best"""
# preprocess the seed with the training field, wrapped in a list (a batch of one)
s = [tokenizer.preprocess(ss)]
# map the tokens to vocabulary indices
t=tokenizer.numericalize(s)
# display the preprocessed tokens as a single string
' '.join(s[0])

In [None]:
# Run the seed text through the model once to obtain the initial predictions.
# Set batch size to 1 temporarily to output words
# NOTE(review): m[0] is presumably the RNN encoder — confirm
m[0].bs=1
# Turn off dropout
m.eval()
# Reset hidden state
m.reset()
# Get predictions from model
# only the first element of the model's output is kept; the rest is discarded
predictions,*_ = m(t)
# Put the batch size back to original size
m[0].bs=initModel.bs

In [None]:
# perform prediction: greedily generate 50 tokens that continue the seed text
print(ss,"\n")
for _ in range(50):
    # indices of the two highest-scoring tokens for the last position
    n=predictions[-1].topk(2)[1]
    # if the best candidate is index 0 (presumably the unknown-word token —
    # confirm), fall back to the runner-up
    n = n[1] if n.data[0]==0 else n[0]
    print(tokenizer.vocab.itos[n.data[0]], end=' ')
    # Feed the chosen token back in and keep the new output in `predictions`.
    # Storing it under another name would leave the loop reading the same
    # scores and printing the same token on every iteration.
    predictions,*_ = m(n[0].unsqueeze(0))
print('...')

## Sentiment analysis: train pre-trained model on task specific data

In [None]:
# defines a mechanism to iterate over the task-specific dataset
# this allows PyTorch to create a model for the data
class ImdbDataset(torchtext.data.Dataset):
    """Labelled text dataset read from ``<path>/pos/*.txt`` and ``<path>/neg/*.txt``.

    Every file contributes one example; its label is the name of the
    directory it was found in (``'pos'`` or ``'neg'``).
    """

    def __init__(self, path, text_field, label_field, **kwargs):
        """Read all examples under ``path``.

        Args:
            path: directory containing the ``pos`` and ``neg`` sub-directories.
            text_field: field applied to the review text.
            label_field: field applied to the label.
            **kwargs: forwarded unchanged to the parent dataset constructor.

        Raises:
            AssertionError: if either label directory contains no ``*.txt`` file.
        """
        fields = [('text', text_field), ('label', label_field)]
        examples = []
        for label in ['pos', 'neg']:
            label_dir = os.path.join(path, label)
            # glob returns names in arbitrary order; sort them so the example
            # order is reproducible between runs and machines
            fnames = sorted(glob(os.path.join(label_dir, '*.txt')))
            assert fnames, f"can't find any '*.txt' files under {label_dir}"
            for fname in fnames:
                # explicit encoding, so decoding does not depend on the
                # platform's locale; only the first line of each file is used
                with open(fname, 'r', encoding='utf-8') as f:
                    text = f.readline()
                examples.append(data.Example.fromlist([text, label], fields))
        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        """Sort examples by the length of their text."""
        return len(ex.text)

    @classmethod
    def splits(cls, text_field, label_field, root='.data',
               train='train', test='test', **kwargs):
        """Build the train and test datasets through the parent ``splits``.

        No validation split is requested. ``root`` is passed as the first
        positional argument of the parent method.
        """
        return super().splits(
            root, text_field=text_field, label_field=label_field,
            train=train, validation=None, test=test, **kwargs)

In [None]:
# use the same word map to IDs
# use the previous path
# NOTE: unpickling can execute arbitrary code; only load a TEXT.pkl that this
# notebook wrote itself.
with open(f'{PATH}models/TEXT.pkl','rb') as text_file:
    TEXT = pickle.load(text_file)

In [None]:
# root directory of the labelled sentiment data (a different directory from PATH)
PATH_sentiment = 'data/'
# label field declared as non-sequential
IMDB_LABEL = data.Field(sequential=False)
# build the datasets from the 'train' and 'valid' sub-directories, reusing the
# language model's TEXT field for the review text
splits = ImdbDataset.splits(TEXT, IMDB_LABEL, PATH_sentiment, train='train', test='valid')

In [None]:
# quick look at what ImdbDataset.splits returned
print(splits)

In [None]:
# The batch size lives on the LanguageModel instance; there is no
# notebook-level `bs`, so a bare `bs` raises NameError on a fresh kernel.
md2 = TextData.from_splits(PATH_sentiment, splits, initModel.bs)

In [None]:
# MANUALLY LOADED THE 'models' dir from data2 to data. Resolve this issue
# perform shell command here
# Optimizer and architecture hyper-parameters are read from the LanguageModel
# instance so the classifier matches the saved encoder. The bare names
# (opt_fn, bptt, em_sz, nh, nl) are not defined at notebook level.
m3 = md2.get_model(initModel.opt_fn, 1500, initModel.bptt,
           emb_sz=initModel.em_sz, n_hid=initModel.nh, n_layers=initModel.nl,
           dropout=0.1, dropouti=0.4, wdrop=0.5, dropoute=0.05, dropouth=0.3)
m3.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
# start from the encoder saved by the language-model training
m3.load_encoder('adam_enc')

In [None]:
# discriminative learning here
# gradient clipping value set on the learner
m3.clip=25.
# NOTE(review): presumably one learning rate per layer group, smallest first — confirm
lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])

In [None]:
# Step 1: freeze up to the last layer group (presumably leaving only it
# trainable — confirm) and fit once at half the learning rates.
m3.freeze_to(-1)
m3.fit(lrs/2, 1, metrics=[accuracy])
# Step 2: unfreeze and fit again at the full learning rates.
m3.unfreeze()
m3.fit(lrs, 1, metrics=[accuracy], cycle_len=1)

In [None]:
# epochs: from 7 to 3
# NOTE(review): cycle_save_name presumably makes the learner save weights under
# the 'imdb2' name at the end of each cycle — confirm
m3.fit(lrs, 3, metrics=[accuracy], cycle_len=2, cycle_save_name='imdb2')

In [None]:
# The preceding fit runs only 3 cycles (reduced from 7), which fastai is
# expected to number 0-2. Cycle 4 is therefore not written by that fit, so
# load the last one. Keep this index in step with the cycle count.
m3.load_cycle('imdb2', 2)

In [None]:
# accuracy of the model's predictions against the targets returned with them
# NOTE(review): presumably computed on the validation split — confirm
accuracy_np(*m3.predict_with_targs())

In [None]:
# close piping std_out to file
log_file = sys.stdout
sys.stdout = old_stdout
# Close the redirect target so buffered output is flushed to disk. Skip this
# when stdout already was the notebook's own stream.
if log_file is not old_stdout:
    log_file.close()