In [None]:
#export
from local.torch_basics import *
from local.test import *
from local.core import *
from local.data.all import *
from local.text.core import *
from local.notebook.showdoc import show_doc

In [None]:
#default_exp text.data
#default_cls_lvl 3

# Text data

> Functions and transforms to help gather text data in a `DataSource`

## Numericalizing

In [None]:
#export
def make_vocab(count, min_freq=3, max_vocab=60000):
    "Create a vocab of at most `max_vocab` tokens (before padding) from `Counter` `count`, keeping items seen at least `min_freq` times"
    vocab = [o for o,c in count.most_common(max_vocab) if c >= min_freq]
    for o in reversed(defaults.text_spec_tok): #Make sure all special tokens are in the vocab
        if o in vocab: vocab.remove(o)
        vocab.insert(0, o)
    vocab = vocab[:max_vocab]
    # Pad with 'xxfake' up to the next multiple of 8. `-len % 8` is 0 when the length is
    # already aligned, so an aligned vocab no longer receives 8 needless extra entries.
    return vocab + ['xxfake'] * (-len(vocab) % 8)

In [None]:
# A `Counter` over the characters of the string: a=4, b=2, c=2, d=1
count = Counter('aaaabbccd')

# Default `min_freq=3` only keeps 'a' on top of the special tokens
vocab_default = make_vocab(count)
test_eq({tok for tok in vocab_default if not tok.startswith('xxfake')},
        set(defaults.text_spec_tok + ['a']))
test_eq(len(vocab_default)%8, 0)

# `min_freq=1` keeps every token
vocab_all = make_vocab(count, min_freq=1)
test_eq({tok for tok in vocab_all if not tok.startswith('xxfake')},
        set(defaults.text_spec_tok + ['a', 'b', 'c', 'd']))

# `max_vocab=12` truncates the least frequent token ('d') away
vocab_small = make_vocab(count, max_vocab=12, min_freq=1)
test_eq({tok for tok in vocab_small if not tok.startswith('xxfake')},
        set(defaults.text_spec_tok + ['a', 'b', 'c']))

In [None]:
#export
class TensorText(TensorBase):
    "Tensor of numericalized text whose batches are shown as rows of a `DataFrame`"
    def show_multi(self, b, ctxs=None, max_n=10, **kwargs):
        "Show batch `b` through `default_show_multi`, creating one empty `DataFrame` row per sample when `ctxs` is not given"
        if ctxs is None:
            # At most `max_n` rows, and no more than the first dimension of this tensor
            n_rows = min(self.shape[0], max_n)
            frame = pd.DataFrame(index=range(n_rows))
            ctxs = [frame.iloc[row] for row in range(n_rows)]
        return default_show_multi(b, max_n=max_n, ctxs=ctxs, **kwargs)

    def display(self, ctxs):
        "Gather the filled `ctxs` into one `DataFrame` and hand it to `display_df`"
        display_df(pd.DataFrame(ctxs))

In [None]:
# export
class Numericalize(Transform):
    "Reversible transform of tokenized texts to numericalized ids"
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, sep=' '):
        self.vocab = vocab
        self.min_freq = min_freq
        self.max_vocab = max_vocab
        self.sep = sep
        # Token -> id lookup; tokens missing from the vocab fall back to id 0 (the `defaultdict(int)` default)
        if vocab is None: self.o2i = None
        else: self.o2i = defaultdict(int, {tok:idx for idx,tok in enumerate(vocab)})

    def setup(self, dsrc):
        "Build the vocab from the tokens of `dsrc` unless a vocab was already provided"
        if dsrc is None or self.vocab is not None: return
        freqs = Counter(tok for toks in dsrc for tok in toks)
        self.vocab = make_vocab(freqs, min_freq=self.min_freq, max_vocab=self.max_vocab)
        # The 'xxfake' padding entries of the vocab are left out of the lookup
        self.o2i = defaultdict(int, {tok:idx for idx,tok in enumerate(self.vocab) if tok != 'xxfake'})

    def encodes(self, o):
        ids = [self.o2i[tok] for tok in o]
        return TensorText(tensor(ids))

    def decodes(self, o):
        # Look every id up once, then drop the padding tokens before joining
        toks = [self.vocab[idx] for idx in o]
        return Str(self.sep.join([tok for tok in toks if tok != PAD]))

In [None]:
start = 'This is an example of text'
num = Numericalize(min_freq=1, sep=' ')
# With `min_freq=1` every token of the two sentences ends up in the vocab
num.setup(L(start.split(), 'this is another text'.split()))
vocab_toks = {tok for tok in num.vocab if not tok.startswith('xxfake')}
test_eq(vocab_toks,
        set(defaults.text_spec_tok + 'This is an example of text this another'.split()))
test_eq(len(num.vocab)%8, 0)
t = num(start.split())

In [None]:
# 'is' and 'text' appear twice in the corpus, so they get the first ids after the special tokens
# NOTE(review): the exact ids assume the special tokens occupy ids 0-8 - confirm against `defaults.text_spec_tok`
test_eq(t, tensor([11, 9, 12, 13, 14, 10]))
# Decoding the ids must give back the original sentence
test_eq(num.decode(t), start)

In [None]:
num = Numericalize(min_freq=2, sep=' ')
corpus = L('This is an example of text'.split(), 'this is another text'.split())
num.setup(corpus)
# Only the tokens seen at least twice ('is' and 'text') survive `min_freq=2`
kept_toks = {tok for tok in num.vocab if not tok.startswith('xxfake')}
test_eq(kept_toks, set(defaults.text_spec_tok + ['is', 'text']))
test_eq(len(num.vocab)%8, 0)
# Every other token is mapped to id 0 and decoded as `UNK`
t = num(start.split())
test_eq(t, tensor([0, 9, 0, 0, 0, 10]))
test_eq(num.decode(t), f'{UNK} is {UNK} {UNK} {UNK} text')

## LMDataLoader -

In [None]:
#export
#TODO: add backward
@delegates()
class LMDataLoader(TfmdDL):
    "DataLoader for language modelling: yields `(input, target)` sequences of `seq_len` tokens where the target is the input shifted by one token"
    def __init__(self, dataset, lens=None, cache=2, bs=64, seq_len=72, num_workers=0, **kwargs):
        super().__init__(dataset=dataset, bs=bs, num_workers=num_workers, **kwargs)
        # Keep only the first element of tuple items
        self.items = ReindexCollection([(o[0] if isinstance(o, tuple) else o) for o in dataset], cache=cache)
        self.seq_len = seq_len
        # Item lengths can be passed in; otherwise they are computed from the items
        if lens is None: lens = [len(o) for o in self.items]
        # NOTE(review): sharing `idxs` presumably keeps `lens` in the same order as `items` after a shuffle - confirm in `ReindexCollection`
        self.lens = ReindexCollection(lens, idxs=self.items.idxs)
        # The "-1" is to allow for final label
        # `m`: number of usable tokens, rounded down to a multiple of `bs*seq_len`
        self.m = round_multiple(sum(lens)-1, bs*seq_len, round_down=True)
        # `n`: number of sequences of `seq_len` tokens
        self.n = self.m//(self.seq_len)
        # `spb`: number of consecutive sequences assigned to each row of the batch
        self.spb = self.n//bs
        self.chunkify()

    # (Re)build the `Chunks` view over the items in their current order
    def chunkify(self): self.chunks = Chunks(self.items, self.lens)
    # Shuffling reorders the items and rebuilds the chunks; the sequence indices `idxs` are returned unchanged
    def shuffle_fn(self,idxs):
        self.items.shuffle()
        self.chunkify()
        return idxs

    def create_item(self, seq):
        "Return the `(input, target)` pair for sequence number `seq`; raises `IndexError` once `seq` reaches `self.n`"
        if seq>=self.n: raise IndexError
        # Row `seq%bs` of the batch reads its own block of `spb` sequences; `seq//bs` is the position inside that block
        st = ((seq%self.bs)*self.spb + (seq//self.bs)) * self.seq_len
        # One extra token is read so that the target can be the input shifted by one
        txt = self.chunks[st : st+self.seq_len+1]
        return txt[:-1],txt[1:]
    
    @classmethod
    def dbunchify(cls, dsrc, lens=None, bs=16, val_bs=None, shuffle_train=True, after_batch=None, **kwargs):
        "Create a `DataBunch` with one loader per subset of `dsrc`"
        n = len(dsrc.splits)-1
        # First loader uses `bs`; the `n` others use `val_bs`, or `2*bs` when `val_bs` is not given
        bss = [bs] + [2*bs]*n if val_bs is None else [bs] + [val_bs]*n
        # Only the first loader may be shuffled
        shuffles = [shuffle_train] + [False]*n
        if after_batch is None: after_batch = Cuda()
        # When given, `lens` is indexed by each split to get the lengths of that subset
        lens = [None]*dsrc.n_subsets if lens is None else [L(lens, use_list=None)[f] for f in dsrc.splits]
        # `drop_last` follows the shuffle flag of each loader
        return DataBunch(*[cls(dsrc.subset(i), lens=l, bs=b, shuffle=s, drop_last=s, after_batch=after_batch, **kwargs)
                           for i,(b,s,l) in enumerate(zip(bss, shuffles, lens))])

In [None]:
# Toy corpus: six "documents" holding the consecutive integers 0..24, so batch contents are easy to read
bs,sl = 4,3
ints = L([0,1,2,3,4],[5,6,7,8,9,10],[11,12,13,14,15,16,17,18],[19,20],[21,22,23],[24]).map(tensor)

In [None]:
dl = LMDataLoader(ints, bs=bs, seq_len=sl)
# Each batch is an (input, target) pair; every target row is its input row shifted by one token
batch0 = [tensor([[0, 1, 2], [6, 7, 8], [12, 13, 14], [18, 19, 20]]),
          tensor([[1, 2, 3], [7, 8, 9], [13, 14, 15], [19, 20, 21]])]
# The second batch continues each row where the first batch stopped
batch1 = [tensor([[3, 4, 5], [ 9, 10, 11], [15, 16, 17], [21, 22, 23]]),
          tensor([[4, 5, 6], [10, 11, 12], [16, 17, 18], [22, 23, 24]])]
test_eq(list(dl), [batch0, batch1])

In [None]:
#hide
#Check lens work
# Passing precomputed lengths must give exactly the same batches as letting the loader compute them
dl = LMDataLoader(ints, lens=ints.map(len), bs=bs, seq_len=sl)
batch0 = [tensor([[0, 1, 2], [6, 7, 8], [12, 13, 14], [18, 19, 20]]),
          tensor([[1, 2, 3], [7, 8, 9], [13, 14, 15], [19, 20, 21]])]
batch1 = [tensor([[3, 4, 5], [ 9, 10, 11], [15, 16, 17], [21, 22, 23]]),
          tensor([[4, 5, 6], [10, 11, 12], [16, 17, 18], [22, 23, 24]])]
test_eq(list(dl), [batch0, batch1])

In [None]:
dl = LMDataLoader(ints, bs=bs, seq_len=sl, shuffle=True)
# Whatever the shuffling, each target must be its input shifted one position to the left
for x,y in dl: test_eq(x[:,1:], y[:,:-1])
# Iterate a second time and keep the two batches to compare them with each other
((x0,y0), (x1,y1)) = tuple(dl)
#Second batch begins where first batch ended
test_eq(y0[:,-1], x1[:,0]) 

## Integration example

In [None]:
# Get the IMDB sample and read its `texts.csv` into a DataFrame
# NOTE(review): `untar_data` presumably downloads/extracts on first use - confirm
path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/'texts.csv')
# Peek at the first two rows
df.head(2)

Unnamed: 0,label,text,is_valid
0,negative,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",False
1,positive,"This is a extremely well-made film. The acting, script and camera-work are all first-rate. The music is good, too, though it is mostly early in the film, when things are still relatively cheery. There are no really superstars in the cast, though several faces will be familiar. The entire cast does an excellent job with the script.<br /><br />But it is hard to watch, because there is no good end to a situation like the one presented. It is now fashionable to blame the British for setting Hindus and Muslims against each other, and then cruelly separating them into two countries. There is som...",False


In [None]:
# Tokenize the 'text' column; `count` (the second return value) is later passed to `make_vocab`
df_tok,count = tokenize_df(df, 'text')
df_tok.head(2)

Unnamed: 0,label,is_valid,text,text_lengths
0,negative,False,"[xxbos, xxmaj, un, -, bleeping, -, believable, !, xxmaj, meg, xxmaj, ryan, does, n't, even, look, her, usual, pert, lovable, self, in, this, ,, which, normally, makes, me, forgive, her, shallow, ticky, acting, schtick, ., xxmaj, hard, to, believe, she, was, the, producer, on, this, dog, ., xxmaj, plus, xxmaj, kevin, xxmaj, kline, :, what, kind, of, suicide, trip, has, his, career, been, on, ?, xxmaj, whoosh, …, xxmaj, banzai, xxrep, 3, !, xxmaj, finally, this, was, directed, by, the, guy, who, did, xxmaj, big, xxmaj, chill, ?, xxmaj, must, be, a, replay, of, xxmaj, jonestown, -, hollywood,...",108
1,positive,False,"[xxbos, xxmaj, this, is, a, extremely, well, -, made, film, ., xxmaj, the, acting, ,, script, and, camera, -, work, are, all, first, -, rate, ., xxmaj, the, music, is, good, ,, too, ,, though, it, is, mostly, early, in, the, film, ,, when, things, are, still, relatively, cheery, ., xxmaj, there, are, no, really, superstars, in, the, cast, ,, though, several, faces, will, be, familiar, ., xxmaj, the, entire, cast, does, an, excellent, job, with, the, script, ., \n\n, xxmaj, but, it, is, hard, to, watch, ,, because, there, is, no, good, end, to, a, situation, like, the, one, ...]",462


In [None]:
# `np.int` was only an alias of the builtin `int`; it is deprecated since NumPy 1.20 and removed in 1.24
texts,lens = df_tok['text'],df_tok['text_lengths'].values.astype(int)

In [None]:
# Random split of the tokenized texts
splits = RandomSplitter()(texts)
# Numericalize with a vocab built from the token counts returned by `tokenize_df`
tfm = Numericalize(make_vocab(count))
# NOTE(review): `dl_type=LMDataLoader` presumably makes `dsrc.databunch` build language-model loaders - confirm in `DataSource`
dsrc = DataSource(texts, [tfm], splits=splits, dl_type=LMDataLoader)

In [None]:
# Show the first training item decoded back to text
show_at(dsrc.train, 0)

xxbos xxmaj this is one of those movies that 's difficult to review without giving away the plot . xxmaj suffice to say there are weird things and unexpected twists going on , beyond the initial superficial " tom xxmaj cruise screws around with multiple women " plot . 

 xxmaj the quality cast xxunk this movie above the xxunk , and all the cast are well suited to their parts : xxmaj cruise as the xxunk xxunk xxunk who has it all - and then loses it all , xxmaj xxunk as the attractive but slightly deranged xxunk lover , xxmaj xxunk as the exotic new girl on the scene and xxmaj russell as the xxunk xxunk . xxmaj the story involves elements of romance , morality , murder - mystery , suspense and sci - fi and is generally an entertaining trip . 

 i should add that the photography is also xxunk excellent and the xxunk of various visual xxunk is beautiful once you realize what 's going on . 

 xxmaj if you enjoy well - acted movies with twists and suspense , and are prepared to accept a sli

In [None]:
# Build the loaders with batch size 16 and sequences of 72 tokens
dbunch = dsrc.databunch(bs=16, seq_len=72)

In [None]:
# Display up to six decoded samples of a batch
dbunch.show_batch(max_n=6)

Unnamed: 0,text
0,xxbos xxmaj was n't sure what to expect from this movie considering its amazing collection of stars and directors but in the end it did n't disappoint . \n\n xxmaj for me one of the highlights was the final episode with the xxmaj american xxunk speaking with a dreadful xxmaj french accent ( which made me feel better about mine ) which was actually quite touching and a great way to xxunk
1,"xxmaj de xxmaj niro to pick up the slack . xxmaj de xxmaj niro , who gives his body over to xxmaj scorsese like a xxunk xxmaj christ , gives the film its only great scene . xxmaj in an improvised romantic sequence with young a xxmaj xxunk xxmaj lewis , he sticks his thumb into her mouth and kisses her . a kind of symbolic rape , the girl runs away"
2,"movie is no excuse . xxmaj read the xxunk on the back of the case , it reveals the whole story . i do not recommend that you watch this movie unless you have 80 minutes to waste on something that will leave you xxunk that you watched it . i feel really bad for those xxmaj xxunk and the irony of their name . xxmaj all xxunk xxmaj anthony xxmaj xxunk"
3,"is xxmaj janos xxmaj rukh , genius science who reads ancient xxunk of light to xxunk events in the great xxunk past particularly the crash of a xxunk xxunk meteor in xxmaj africa . xxmaj xxunk him is the ever - elegant xxmaj lugosi ( as a rare hero ) , who studies "" astro - chemistry . "" xxmaj xxunk xxmaj drake is the lovely , xxunk young wife ;"
4,"frightening , but i xxunk ) . xxmaj no one will believe him at first , but they will . xxmaj oh yes , they will . \n\n xxup ok , killer slugs are right above psychotic xxunk and right below xxmaj xxunk xxmaj winters as xxmaj xxunk 's baby in the creepiness factor . xxmaj so the xxunk of it all is quite apparent from the get go . xxmaj the"
5,"things that would have made this film work better . xxmaj the movie is also pretty long for a comedy . xxmaj okay , xxunk minutes is n't exactly long but it feels so much longer because there 's very little humor in the first hour . i think comedies should be kept short or else they have to find a lot of material to cover the entire running time . xxmaj"


In [None]:
# The input of a batch must keep the `TensorText` type
x,y = dbunch.one_batch()
test_eq(type(x), TensorText)

In [None]:
# The lengths stored by the validation loader must match the actual length of the validation items
test_eq(len(dbunch.valid_ds[0][0]), dbunch.valid_dl.lens[0])

## Classification

In [None]:
#export
def pad_collate(samples, pad_idx=1, pad_first=False, backwards=False):
    """Function that collects samples and adds padding. Flips token order if needed.

    samples:   sequence of `(tokens, label)` pairs; the `tokens` may have different lengths.
    pad_idx:   id used to fill the positions not covered by a sample's tokens.
    pad_first: put the padding before the tokens instead of after them.
    backwards: reverse the tokens of every row; the padding stays on the side `pad_first` asks for.

    Returns `(x, y)`: `x` is a long tensor of shape `(len(samples), max_len)`, `y` a tensor of the labels.
    """
    max_len = max([len(s[0]) for s in samples])
    res = torch.zeros(len(samples), max_len).long() + pad_idx
    # Rows are flipped at the end, so the padding has to start on the opposite side
    if backwards: pad_first = not pad_first
    for i,s in enumerate(samples):
        n = len(s[0])
        # Explicit bounds: `slice(-n, ...)` selects the whole row when n == 0,
        # which made an empty sample crash with `pad_first`/`backwards`
        sl = slice(max_len-n, max_len) if pad_first else slice(0, n)
        res[i,sl] = LongTensor(s[0])
    if backwards: res = res.flip(1)
    return res, tensor(np.array([s[1] for s in samples]))

In [None]:
samples = [([1,2,3],1), ([4,5], 2), ([6], 3)]
labels = tensor([1,2,3])
# Default: padding is appended after the tokens
test_eq(pad_collate(samples, pad_idx=0),
        (tensor([[1,2,3], [4,5,0], [6,0,0]]), labels))
# `pad_first`: padding comes before the tokens
test_eq(pad_collate(samples, pad_idx=0, pad_first=True),
        (tensor([[1,2,3], [0,4,5], [0,0,6]]), labels))
# `backwards`: tokens are reversed, padding stays at the end
test_eq(pad_collate(samples, pad_idx=0, backwards=True),
        (tensor([[3,2,1], [5,4,0], [6,0,0]]), labels))

In [None]:
#export
def _default_sort(x):
    "Default sort key of `SortedDL`: the length of the first element of sample `x`"
    return len(x[0])

@delegates(TfmdDL)
class SortedDL(TfmdDL):
    "DataLoader that groups samples of similar size: fully sorted (largest first) without shuffle, sorted inside random chunks with shuffle"
    def __init__(self, dataset, sort_func=None, res=None, **kwargs):
        super().__init__(dataset, **kwargs)
        self.sort_func = _default_sort if sort_func is None else sort_func
        # `res` holds one sort key per item; pass it in to avoid going through the whole dataset here
        self.res = [self.sort_func(self.do_item(i)) for i in range_of(self.dataset)] if res is None else res
        # Index of the item with the largest key
        self.idx_max = np.argmax(self.res)

    def get_idxs(self):
        idxs = super().get_idxs()
        # NOTE(review): when shuffling, the order presumably already comes from `shuffle_fn` - confirm in `TfmdDL`
        if self.shuffle: return idxs
        return sorted(idxs, key=lambda i: self.res[i], reverse=True)

    def shuffle_fn(self,idxs):
        "Return a random order where batches hold items of similar size and the largest item is in the first batch"
        idxs = np.random.permutation(len(self.dataset))
        # Position of the largest item inside the permutation. `np.extract` returned its value
        # (i.e. `self.idx_max` itself), so the swap below did not move it to the front.
        pos_max = np.where(idxs==self.idx_max)[0][0]
        idxs[0],idxs[pos_max] = idxs[pos_max],idxs[0]
        # Sort by decreasing size inside chunks of 50 batches
        sz = self.bs*50
        chunks = [idxs[i:i+sz] for i in range(0, len(idxs), sz)]
        chunks = [sorted(s, key=lambda i: self.res[i], reverse=True) for s in chunks]
        sort_idx = np.concatenate(chunks)

        # Shuffle whole batches, keeping the first (largest) and the last (possibly incomplete) in place
        sz = self.bs
        batches = [sort_idx[i:i+sz] for i in range(0, len(sort_idx), sz)]
        # builtin `int` replaces the `np.int` alias removed in NumPy 1.24
        sort_idx = np.concatenate(np.random.permutation(batches[1:-1])) if len(batches) > 2 else np.array([],dtype=int)
        sort_idx = np.concatenate((batches[0], sort_idx) if len(batches)==1 else (batches[0], sort_idx, batches[-1]))
        return iter(sort_idx)

In [None]:
ds = [([1,2],1), ([3,4,5,6],2), ([7],3), ([8,9,10],4)]
dl = SortedDL(ds, bs=2, create_batch=partial(pad_collate, pad_idx=0))
# Without shuffle the samples come out longest first, padded per batch
expected = [(tensor([[ 3,  4,  5,  6], [ 8,  9, 10,  0]]), tensor([2, 4])),
            (tensor([[1, 2], [7, 0]]), tensor([1, 3]))]
test_eq(list(dl), expected)

In [None]:
ds = [(list(range(random.randint(1,10))),i) for i in range(101)]
dl = SortedDL(ds, bs=2, create_batch=partial(pad_collate, pad_idx=-1), shuffle=True, num_workers=0)
batches = list(dl)
# The sequence length is the second dimension of the padded input; `len()` would give the batch size
max_len = batches[0][0].shape[1]
for b in batches:
    # No batch may be longer than the first one
    assert b[0].shape[1] <= max_len
    # The first row of a batch is the longest one, so it never ends with padding
    test_ne(b[0][0,-1], -1)

In [None]:
# Random split over the row indices of the tokenized DataFrame
splits = RandomSplitter()(range_of(df_tok))
# Two transform pipelines: numericalized "text" as input, categorized "label" as target
dsrc = DataSource(df_tok, splits=splits, tfms=[
    [attrgetter("text"), Numericalize(make_vocab(count))],
    [attrgetter("label"), Categorize()]])
# Batches of similar-length texts, padded by `pad_collate`
dl = SortedDL(dsrc.train, create_batch=pad_collate, shuffle=True)

In [None]:
# Display one decoded sample of a batch
dl.show_batch(max_n=1)

Unnamed: 0,text,category
0,"xxbos xxmaj raising xxmaj victor xxmaj vargas : a xxmaj review \n\n xxmaj you know , xxmaj raising xxmaj victor xxmaj vargas is like sticking your hands into a big , xxunk bowl of xxunk . xxmaj it 's warm and gooey , but you 're not sure if it feels right . xxmaj try as i might , no matter how warm and gooey xxmaj raising xxmaj victor xxmaj vargas became i was always aware that something did n't quite feel right . xxmaj victor xxmaj vargas suffers from a certain xxunk on the director 's part . xxmaj apparently , the director thought that the ethnic backdrop of a xxmaj latino family on the ...",negative


## Export -

In [None]:
#hide
# NOTE(review): presumably converts the notebooks to library modules (the output lists "Converted ..." per notebook) - confirm in `notebook2script`
from local.notebook.export import notebook2script
notebook2script(all_fs=True)

Converted 00_test.ipynb.
Converted 01_core.ipynb.
Converted 01a_torch_core.ipynb.
Converted 02_script.ipynb.
Converted 03_dataloader.ipynb.
Converted 04_transform.ipynb.
Converted 05_data_core.ipynb.
Converted 06_data_transforms.ipynb.
Converted 07_vision_core.ipynb.
Converted 08_pets_tutorial.ipynb.
Converted 09_vision_augment.ipynb.
Converted 10_data_block.ipynb.
Converted 11_layers.ipynb.
Converted 11a_vision_models_xresnet.ipynb.
Converted 12_optimizer.ipynb.
Converted 13_learner.ipynb.
Converted 14_callback_schedule.ipynb.
Converted 14a_callback_data.ipynb.
Converted 15_callback_hook.ipynb.
Converted 16_callback_progress.ipynb.
Converted 17_callback_tracker.ipynb.
Converted 18_callback_fp16.ipynb.
Converted 19_callback_mixup.ipynb.
Converted 20_metrics.ipynb.
Converted 21_tutorial_imagenette.ipynb.
Converted 22_vision_learner.ipynb.
Converted 23_tutorial_transfer_learning.ipynb.
Converted 30_text_core.ipynb.
Converted 31_text_data.ipynb.
Converted 32_text_models_awdlstm.ipynb.
Con