In [None]:
#export
from local.imports import *
from local.test import *
from local.core import *
from local.data.transform import *
from local.data.core import *
from local.data.external import *
from local.data.pipeline import *
from local.text.core import *
from local.notebook.showdoc import show_doc

In [None]:
#default_exp text.data
#default_cls_lvl 3

# Text data

> Functions and transforms to help gather text data in a `DataSource`

## Numericalizing

In [None]:
#export
class TokenizedText(L):
    "Subclass of `L` holding tokens, shown as a single string joined by a separator"
    def show(o, ctx=None, sep=None, **kwargs):
        "Join the tokens with `sep` (`defaults.text_token_sep` when `sep` is falsy) and pass the result to `show_title`"
        #`kwargs` is accepted and ignored
        joiner = sep if sep else defaults.text_token_sep
        joined = joiner.join(o)
        return show_title(joined, ctx=ctx)

In [None]:
#export
def make_vocab(count, min_freq=3, max_vocab=60000, special_toks=None):
    """Create a vocab from `Counter` `count`, keeping tokens seen at least `min_freq` times.

    Only the `max_vocab` most common tokens of `count` are considered. `special_toks`
    (default: `defaults.text_spec_tok`) are always placed first, in their given order, even
    when absent from `count`. The list is then cut to `max_vocab` entries and, if it ends up
    shorter than `max_vocab`, padded with 'xxfake' entries up to the next multiple of 8, so
    the padded list can exceed `max_vocab` by a few entries.
    """
    if special_toks is None: special_toks = defaults.text_spec_tok
    vocab = [o for o,c in count.most_common(max_vocab) if c >= min_freq]
    for o in reversed(list(special_toks)): #Make sure all special tokens are in the vocab, at the front
        if o in vocab: vocab.remove(o)
        vocab.insert(0, o)
    vocab = vocab[:max_vocab]
    n_pad = -len(vocab) % 8
    if len(vocab) < max_vocab and n_pad:
        #Make sure vocab size is a multiple of 8 for fast mixed precision training
        vocab += ['xxfake'] * n_pad
    return vocab

In [None]:
#Token frequencies: a=4, b=2, c=2, d=1
count = Counter('aaaabbccd')
#Default `min_freq` of 3 only keeps 'a' on top of the special tokens
test_eq(set(make_vocab(count)), {*defaults.text_spec_tok, 'a', 'xxfake'})
test_eq(len(make_vocab(count))%8, 0)
#With `min_freq=1` every token is kept
test_eq(set(make_vocab(count, min_freq=1)), {*defaults.text_spec_tok, 'a', 'b', 'c', 'd', 'xxfake'})
#`max_vocab` cuts the least frequent token
test_eq(set(make_vocab(count, min_freq=1, max_vocab=12)), {*defaults.text_spec_tok, 'a', 'b', 'c'})

In [None]:
# export
class Numericalize(MultiCategorize):
    "Reversible transform of tokenized texts to numericalized ids"
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, sep=None):
        "Store `vocab` through the parent class; `sep` (default `defaults.text_token_sep`) is the token separator"
        super().__init__(vocab=vocab)
        self.sep = sep or defaults.text_token_sep
        self.min_freq,self.max_vocab = min_freq,max_vocab

    def setup(self, dsrc):
        "Build `vocab` and `otoi` from the texts of `dsrc` (its `train` attribute when it has one) if no vocab was given"
        if dsrc is None: return
        if self.vocab is None:
            dsrc = getattr(dsrc,'train',dsrc)
            count = Counter(p for o in dsrc for p in o.split(self.sep))
            self.vocab = make_vocab(count, min_freq=self.min_freq, max_vocab=self.max_vocab)
            #'xxfake' entries are left out of the mapping, so `encodes` never returns their ids
            self.otoi = {v:k for k,v in enumerate(self.vocab) if v != 'xxfake'}

    def encodes(self, o):
        "Split `o` on `sep` and map each token to its id; tokens missing from the vocab get id 0"
        #Tokens dropped by `min_freq`/`max_vocab` used to raise a KeyError here.
        #NOTE(review): id 0 is the first special token put in front by `make_vocab`,
        #presumably the unknown token - confirm against `defaults.text_spec_tok`
        return [self.otoi.get(o_, 0) for o_ in o.split(self.sep)]

    def decodes(self, o)->TokenizedText:
        "Map the ids in `o` back to their tokens and join them with `sep`"
        return self.sep.join([self.vocab[o_] for o_ in o])

In [None]:
#With `min_freq=1` every token of the corpus ends up in the vocab
num = Numericalize(sep=' ', min_freq=1)
num.setup(L('This is an example of text', 'this is another text'))
test_eq(set(num.vocab), {*defaults.text_spec_tok, 'This', 'is', 'an', 'example', 'of', 'text', 'this', 'another', 'xxfake'})
test_eq(len(num.vocab)%8, 0)

#With `min_freq=2` only the tokens present in both texts are kept
num = Numericalize(sep=' ', min_freq=2)
num.setup(L('This is an example of text', 'this is another text'))
test_eq(set(num.vocab), {*defaults.text_spec_tok, 'is', 'text', 'xxfake'})
test_eq(len(num.vocab)%8, 0)

## LMPreloader -

In [None]:
class LM_PreLoader():
    "An intermediate between a dataset with texts and a DataLoader"
    def __init__(self, ds, lengths=None, bs=64, seq_len=70, shuffle=False):
        """Read the texts `ds[i][0]` as one stream cut in `bs` rows of `n_batch` tokens.

        `lengths` are the lengths of the texts (computed from `ds` when `None`), `seq_len` is the
        length of each returned sample and `shuffle` randomizes the order in which texts are read.
        """
        self.ds,self.bs,self.seq_len,self.shuffle = ds,bs,seq_len,shuffle
        self.lengths = [len(o[0]) for o in ds] if lengths is None else lengths
        self.n_batch = sum(self.lengths) // bs
        self.batchify()

    def __len__(self):
        "Number of samples: `bs` rows times the number of full `seq_len` windows per row (never negative)"
        #One token per row is kept aside for the last target; max() guards corpora with fewer tokens than `bs`
        return max(0, (self.n_batch-1) // self.seq_len) * self.bs

    def __getitem__(self, i):
        "Return the `(input, target)` tensors of sample `i`, the target being the input shifted by one token"
        #Raising IndexError lets plain iteration stop at `len(self)`
        if not 0 <= i < len(self): raise IndexError(f"index {i} is out of range for {len(self)} samples")
        #Sample `i` sits in row `i % bs`, at window `i // bs`; `k` is its position in the token stream
        k = (i % self.bs) * self.n_batch + (i // self.bs) * self.seq_len
        #Position, in reading order, of the first text that ends after token `k`
        pos = (self.cumlen > k).nonzero().min().item()
        offset = k if pos==0 else k-int(self.cumlen[pos-1])
        #`idxs` maps the reading order back to dataset indices; copy so the dataset is never modified
        text = list(self.ds[int(self.idxs[pos])][0][offset:])
        while len(text) <= self.seq_len:
            pos += 1
            text += self.ds[int(self.idxs[pos])][0]
        return tensor(text[:self.seq_len]),tensor(text[1:self.seq_len+1])

    def batchify(self):
        "Set the reading order `idxs` (random when `shuffle`) and the matching cumulative lengths `cumlen`"
        n = len(self.ds)
        self.idxs = torch.randperm(n) if self.shuffle else torch.arange(n)
        lens = tensor(self.lengths)
        if self.shuffle: lens = lens[self.idxs]
        self.cumlen = lens.cumsum(0)

In [None]:
lengths = [10,7,19,23,5,42]
#Each text is simply the list 0..n-1, which makes the expected samples easy to write down
ds = LM_PreLoader([(list(range(n)), 0) for n in lengths], lengths=lengths, bs=5, seq_len=4)
x,y = ds[0]
#The target is the input shifted by one token
test_eq(x[1:], y[:-1])
test_eq(x+1, y)
#Going on the seq dimension reads the text in order
test_eq(torch.cat([ds[i][0] for i in range(0, 25, 5)]),
        tensor([*range(10), *range(7), *range(3)]))
#3 is skipped for the next sample in the batch since it's the last target
test_eq(torch.cat([ds[i][0] for i in range(1, 26, 5)]),
        tensor([*range(4,19), *range(5)]))

## Integration example