# Building a Language Model dataset using PyTorch Text

In [0]:
import spacy
import torchtext as tt
from collections import Counter
import functools
import operator

In [0]:
# Load the English pipeline by its full package name: the 'en' shortcut link
# is deprecated in spaCy 2.x and no longer supported in spaCy 3.
# Requires the model package: `python -m spacy download en_core_web_sm`.
spacy_en = spacy.load('en_core_web_sm')

In [0]:
# Fixed number of tokens per sequence; passed to the Field as `fix_length`.
seq_len = 15  #@param

## Text processing

Create a tokenizer function using Spacy.
> By default, PyTorch Text (torchtext) tokenizes by splitting on whitespace.

In [0]:
def spacy_tokenizer(text):
  """Split `text` into a list of token strings using the spaCy tokenizer."""
  pieces = []
  for token in spacy_en.tokenizer(text):
    pieces.append(token.text)
  return pieces

Create a text processing pipeline using PyTorch Text

In [0]:
# Text-processing pipeline used by every cell that follows.
TEXT = tt.data.Field(
  fix_length  = seq_len,          # sequences are padded to this many tokens
  batch_first = True,             # numericalized tensors are (batch, sequence)
  init_token  = '<bos>',          # marker inserted before each sequence
  eos_token   = '<eos>',          # marker inserted after each sequence
  lower       = True,             # lowercase the text during preprocessing
  tokenize    = spacy_tokenizer,  # custom tokenizer defined above
)

Text Preprocessing:
* Lowercasing text: the `lower` flag of `TEXT` is set to `True`
* Tokenizing text: a custom tokenization function was provided

In [0]:
# Preprocess each raw sentence with the field (tokenize + lowercase),
# producing one list of tokens per sentence.
minibatch = [
  TEXT.preprocess(sentence)
  for sentence in ['The Brown Fox Jumped Over The Lazy Dog']
]

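Pad the text with the `<pad>` token so that every sequence matches the fixed length (`fix_length = seq_len`). As the output shows, padding also adds the `<bos>` and `<eos>` markers.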

In [24]:
# Pad the tokenized batch with the field's settings.
# NOTE: `minibatch` is rebound to its own padded version, so this cell is
# meant to be run once, right after the preprocessing cell.
minibatch = TEXT.pad(minibatch)
print(minibatch)

[['<bos>', 'the', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']]


Manually build a vocab

In [25]:
# Flatten the batch into a single token list and count occurrences.
# A flat comprehension is linear in the number of tokens and, unlike
# functools.reduce(operator.concat, ...), does not raise on an empty batch.
tokens = [token for sentence in minibatch for token in sentence]
counter = Counter(tokens)
counter

Counter({'<bos>': 1,
         '<eos>': 1,
         '<pad>': 5,
         'brown': 1,
         'dog': 1,
         'fox': 1,
         'jumped': 1,
         'lazy': 1,
         'over': 1,
         'the': 2})

In [0]:
# Build a vocabulary directly from the token counts using the field's vocab
# class. No `specials` are passed, so only tokens present in `counter` get an
# index; the printed `itos` in the next cell has no '<unk>' entry.
vocab = TEXT.vocab_cls(counter)

In [27]:
# itos: list mapping index -> token string.
print(vocab.itos)

['<pad>', 'the', '<bos>', '<eos>', 'brown', 'dog', 'fox', 'jumped', 'lazy', 'over']


In [28]:
# stoi: mapping token string -> index (the inverse of itos).
print(vocab.stoi)

defaultdict(<function _default_unk_index at 0x7ff038fb6a60>, {'<pad>': 0, 'the': 1, '<bos>': 2, '<eos>': 3, 'brown': 4, 'dog': 5, 'fox': 6, 'jumped': 7, 'lazy': 8, 'over': 9})


In [0]:
# Attach the hand-built vocab to the field so it can be used to numericalize.
TEXT.vocab = vocab

In [30]:
# Convert the padded token lists into a tensor of vocabulary indices.
TEXT.numericalize(minibatch)

tensor([[2, 1, 4, 6, 7, 9, 1, 8, 5, 3, 0, 0, 0, 0, 0]])

Automatically construct a vocab

In [0]:
# Let the field build its own vocab from the batch. This replaces the
# hand-built TEXT.vocab; the printed result includes an '<unk>' entry.
TEXT.build_vocab(minibatch)

In [32]:
# Index -> token list of the automatically built vocab.
print(TEXT.vocab.itos)

['<unk>', '<pad>', '<bos>', '<eos>', 'the', 'brown', 'dog', 'fox', 'jumped', 'lazy', 'over']


In [33]:
# Token -> index mapping of the automatically built vocab.
print(TEXT.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7ff038fb6a60>, {'<unk>': 0, '<pad>': 1, '<bos>': 2, '<eos>': 3, 'the': 4, 'brown': 5, 'dog': 6, 'fox': 7, 'jumped': 8, 'lazy': 9, 'over': 10})


In [34]:
# Same batch, numericalized with the automatically built vocab; the indices
# differ from the manual vocab because '<unk>' now occupies index 0.
TEXT.numericalize(minibatch)

tensor([[ 2,  4,  5,  7,  8, 10,  4,  9,  6,  3,  1,  1,  1,  1,  1]])

## Data Loader

Build a dataset from training and validation CSV files, using the previously built text processing pipeline.

In [0]:
# Directory that contains train.csv and valid.csv. `PATH` was not defined
# anywhere in the notebook, so this cell raised NameError on a fresh kernel.
# The value below is a placeholder: point it at your data directory.
PATH = 'data'  #@param

# Each CSV row is read into a single `text` column processed by TEXT.
# NOTE(review): if the CSV files start with a header row, pass
# skip_header=True so it is not treated as data; confirm against the files.
train_ds, valid_ds = tt.data.TabularDataset.splits(
    path=PATH,
    train='train.csv',
    validation='valid.csv',
    format='csv',
    fields=[('text', TEXT)]
)

### Data Loader for Language Modeling

This dataset can be used to build an iterator that produces data for multiple NLP tasks. For instance, it can be used to build language-modeling samples with [torchtext.data.BPTTIterator](https://torchtext.readthedocs.io/en/latest/data.html#bpttiterator).

In [0]:
def dataset2example(dataset, field):
    """Concatenate every example of `dataset` into one long token stream.

    Each example's `text` token list is wrapped in a begin and an end marker,
    and all of them are joined, in order, into a single example.

    Args:
        dataset: dataset whose `examples` each expose a `text` sequence of
            tokens.
        field: field associated with the `text` attribute of the result. Its
            `init_token` / `eos_token` are used as markers when set.

    Returns:
        A `tt.data.Dataset` holding exactly one example whose `text` is the
        concatenated token list.
    """
    # Prefer the markers configured on the field so the stream stays in sync
    # with the field's vocabulary; fall back to the previously hard-coded
    # literals when the field defines none.
    bos = getattr(field, 'init_token', None) or '<bos>'
    eos = getattr(field, 'eos_token', None) or '<eos>'

    stream = []
    for example in dataset.examples:
        stream.append(bos)
        stream.extend(example.text)
        stream.append(eos)

    merged = tt.data.Example()
    merged.text = stream
    return tt.data.Dataset([merged], fields={'text': field})

In [0]:
# Collapse each split into a single-example dataset (one continuous stream).
train_example, valid_example = (
    dataset2example(split, TEXT) for split in (train_ds, valid_ds)
)

In [0]:
# Build the vocabulary from the real training stream. Until now TEXT.vocab
# only covered the toy sentence, so corpus tokens outside it had no index of
# their own. The stream already contains the begin/end markers, so they are
# always part of the vocabulary.
TEXT.build_vocab(train_example)

# `batch_size` was not defined anywhere in the notebook (NameError on a fresh
# kernel). The value below is a placeholder; tune it for your hardware.
batch_size = 32  #@param
# Number of tokens per back-propagation-through-time window.
bptt_len = 30  #@param

train_iter, valid_iter = tt.data.BPTTIterator.splits(
    (train_example, valid_example),
    batch_size=batch_size,
    bptt_len=bptt_len,
)

The resulting `train_iter` and `valid_iter` are iterators over batches of samples that can be used in a training loop.