In [None]:
#export
from local.imports import *
from local.test import *
from local.core import *
from local.data.transform import *
from local.data.core import *
from local.data.source import *
from local.data.external import *
from local.data.pipeline import *
from local.data.load import *
from local.text.core import *
from local.notebook.showdoc import show_doc

In [None]:
#default_exp text.data
#default_cls_lvl 3

# Text data

> Functions and transforms to help gather text data in a `DataSource`

## Numericalizing

In [None]:
#export
def make_vocab(count, min_freq=3, max_vocab=60000):
    "Create a vocab of at most `max_vocab` items from `Counter` `count`, keeping items seen at least `min_freq` times"
    # Special tokens always come first, in their canonical order; filter them out of the
    # counted tokens so they are never duplicated.
    specials = list(defaults.text_spec_tok)
    special_set = set(specials)
    frequent = [o for o,c in count.most_common(max_vocab) if c >= min_freq and o not in special_set]
    vocab = (specials + frequent)[:max_vocab]
    # Pad with placeholder tokens up to the next multiple of 8. `-len % 8` is 0 when the
    # vocab is already aligned, so no useless block of 8 extra tokens gets appended.
    return vocab + ['xxfake'] * (-len(vocab) % 8)

In [None]:
# 'a' occurs 4 times, 'b' and 'c' twice each, 'd' once
count = Counter(['a', 'a', 'a', 'a', 'b', 'b', 'c', 'c', 'd'])
# Default `min_freq=3`: only 'a' is kept next to the special tokens and the padding token
test_eq(set(make_vocab(count)), set(defaults.text_spec_tok + 'a xxfake'.split()))
test_eq(len(make_vocab(count))%8, 0)
# `min_freq=1` keeps every token
test_eq(set(make_vocab(count, min_freq=1)), set(defaults.text_spec_tok + 'a b c d xxfake'.split()))
# A small `max_vocab` drops the rarest token, 'd'
test_eq(set(make_vocab(count,max_vocab=12, min_freq=1)), set(defaults.text_spec_tok + 'a b c xxfake'.split()))

In [None]:
#export core
def display_df(df):
    "Display `df` as an HTML table in a notebook, falling back to `print` when that is not possible"
    try:
        # Imported lazily so the function still works where IPython is not installed
        from IPython.display import display, HTML
        display(HTML(df.to_html()))
    # Best-effort fallback (no IPython, or HTML rendering failed). `Exception` rather than a
    # bare `except` so `KeyboardInterrupt`/`SystemExit` still propagate.
    except Exception: print(df)

In [None]:
#export
class TensorText(TensorBase):
    "Tensor of token ids that knows how to show itself as rows of a `DataFrame`"
    def get_ctxs(self, max_samples=10, **kwargs):
        "Return one empty `DataFrame` row per sample to show, capped at `max_samples`"
        n_rows = min(self.shape[0], max_samples)
        frame = pd.DataFrame(index=range(n_rows))
        return [frame.iloc[row] for row in range(n_rows)]

    def display(self, ctxs):
        "Gather the filled rows `ctxs` back into a `DataFrame` and display it"
        display_df(pd.DataFrame(ctxs))

In [None]:
# export
class Numericalize(Transform):
    "Reversible transform between tokenized texts and their numericalized ids"
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, sep=' '):
        self.vocab, self.min_freq = vocab, min_freq
        self.max_vocab, self.sep = max_vocab, sep
        # Token -> id lookup; `defaultdict(int)` sends unknown tokens to id 0
        if vocab is None: self.o2i = None
        else: self.o2i = defaultdict(int, {tok:idx for idx,tok in enumerate(vocab)})

    def setup(self, dsrc):
        "Build `vocab` and `o2i` from `dsrc` (its `train` subset when it has one) unless a vocab was given"
        if dsrc is None or self.vocab is not None: return
        dsrc = getattr(dsrc,'train',dsrc)
        count = Counter(tok for toks in dsrc for tok in toks)
        self.vocab = make_vocab(count, min_freq=self.min_freq, max_vocab=self.max_vocab)
        # The padding placeholders stay in `vocab` but never get an id of their own
        self.o2i = defaultdict(int, {tok:idx for idx,tok in enumerate(self.vocab) if tok != 'xxfake'})

    def encodes(self, o)->TensorText:
        ids = [self.o2i[tok] for tok in o]
        return tensor(ids)

    def decodes(self, o)->Str:
        toks = (self.vocab[idx] for idx in o)
        # Padding tokens are dropped from the decoded text
        return self.sep.join([tok for tok in toks if tok != PAD])

In [None]:
num = Numericalize(min_freq=1, sep=' ')
# `setup` builds the vocab from the tokens of both texts; with `min_freq=1` every token is kept
num.setup(L('This is an example of text'.split(), 'this is another text'.split()))
test_eq(set(num.vocab), set(defaults.text_spec_tok + 'This is an example of text this another xxfake'.split()))
test_eq(len(num.vocab)%8, 0)
start = 'This is an example of text'
t = num(start.split())

In [None]:
# NOTE(review): the expected ids assume the special tokens fill indices 0-8 — confirm against `defaults.text_spec_tok`
test_eq(t, tensor([11, 9, 12, 13, 14, 10]))
# Decoding gives back the original text
test_eq(num.decode(t), start)

In [None]:
# With `min_freq=2` only 'is' and 'text' (each seen twice) make it into the vocab
num = Numericalize(min_freq=2, sep=' ')
num.setup(L('This is an example of text'.split(), 'this is another text'.split()))
test_eq(set(num.vocab), set(defaults.text_spec_tok + 'is text xxfake'.split()))
test_eq(len(num.vocab)%8, 0)
t = num(start.split())
# Tokens missing from the vocab are encoded as 0 and decoded as `UNK`
test_eq(t, tensor([0, 9, 0, 0, 0, 10]))
test_eq(num.decode(t), f'{UNK} is {UNK} {UNK} {UNK} text')

## LM_Dataset -

In [None]:
#export
def apply_coords(f, *dims):
    "Build the coordinate grid of shape `dims` and call `f` on the coordinates of every cell"
    axes = [range(d) for d in dims]
    # Stacking puts the coordinates of each cell along axis 0, which is what `f` receives
    coords = np.stack(np.meshgrid(*axes, indexing='ij'))
    return np.apply_along_axis(f, 0, coords)

In [None]:
# Every cell ends up holding the string form of its own coordinates
apply_coords(str,2,3,4)

array([[['[0 0 0]', '[0 0 1]', '[0 0 2]', '[0 0 3]'],
        ['[0 1 0]', '[0 1 1]', '[0 1 2]', '[0 1 3]'],
        ['[0 2 0]', '[0 2 1]', '[0 2 2]', '[0 2 3]']],

       [['[1 0 0]', '[1 0 1]', '[1 0 2]', '[1 0 3]'],
        ['[1 1 0]', '[1 1 1]', '[1 1 2]', '[1 1 3]'],
        ['[1 2 0]', '[1 2 1]', '[1 2 2]', '[1 2 3]']]], dtype='<U7')

In [None]:
#export
class LM_Dataset:
    "Dataset of `(input, target)` sequences of `seq_len` tokens read from the concatenation of `ds`"
    def __init__(self, ds, lens=None, bs=64, seq_len=72, shuffle=False, cache=2):
        self.bs, self.seq_len, self.shuffle = bs, seq_len, shuffle
        self.ds = ReindexCollection(ds, cache=cache)
        if lens is None: lens = [len(o) for o in ds]
        # `lens` is indexed through the same `idxs` as `ds`
        self.lens = ReindexCollection(lens, idxs=self.ds.idxs)
        # One token is held back so that the last sequence still has a final label
        n_tokens = sum(lens) - 1
        self.n = round_multiple(n_tokens, bs*seq_len, round_down=True)
        # Number of sequences each of the `bs` rows goes through
        self.spb = self.n // (seq_len*bs)
        self.reset()

    def __len__(self):
        return self.n // self.seq_len

    def reset(self):
        "Reshuffle the texts if needed and rebuild the token stream"
        if self.shuffle: self.ds.shuffle()
        self.chunks = Chunks(self.ds, self.lens)

    def __getitem__(self, seq):
        if seq >= len(self): raise IndexError
        # `seq` walks the batches row by row: its row owns a contiguous run of `spb` sequences
        row, batch = seq % self.bs, seq // self.bs
        start = (row*self.spb + batch) * self.seq_len
        toks = self.chunks[start : start+self.seq_len+1]
        return tensor(toks[:-1]), tensor(toks[1:])

In [None]:
#export
class LMDataLoader(DataLoader):
    "Language-model loader: serves `(input, target)` sequences of `seq_len` tokens, the target being the input shifted by one token"
    def __init__(self, items, lens=None, bs=64, seq_len=72, shuffle=False, cache=2, num_workers=0, pin_memory=False, timeout=0):
        # The base class always receives `shuffle=False`; the requested flag is stored below and applied to `items` in `reset`
        super().__init__(items=items, bs=bs, seq_len=seq_len, shuffle=False, num_workers=num_workers, pin_memory=pin_memory, timeout=timeout)
        self.items = ReindexCollection(items, cache=cache)
        self.seq_len,self.shuffle = seq_len,shuffle
        if lens is None: lens = [len(o) for o in items]
        # `lens` is indexed through the same `idxs` as `items`
        # NOTE(review): presumably so that shuffling `items` keeps the lengths aligned — confirm in `ReindexCollection`
        self.lens = ReindexCollection(lens, idxs=self.items.idxs)
        # The "-1" is to allow for final label
        self.m = round_multiple(sum(lens)-1, bs*seq_len, round_down=True)
        # `m` tokens are served in total: `spb` sequences for each of the `bs` rows, `n` sequences overall
        self.spb = self.m//(self.seq_len * self.bs)
        self.n = self.m//(self.seq_len)
        
    def reset(self):
        "Reshuffle the texts if needed and rebuild the token stream"
        # NOTE(review): `chunks` only exists after `reset` has run; presumably `DataLoader` calls it before iterating — confirm
        if self.shuffle: self.items.shuffle()
        self.chunks = Chunks(self.items, self.lens)
    
    def item(self, seq):
        "Return the `seq`-th `(input, target)` pair"
        if seq>=self.n: raise IndexError
        # `seq%bs` is the row inside the batch and `seq//bs` the batch number; each row owns a contiguous run of `spb` sequences
        # NOTE(review): `self.bs` is not assigned in this class; presumably set by `DataLoader.__init__` — confirm
        st = ((seq%self.bs)*self.spb + (seq//self.bs)) * self.seq_len
        # One extra token is read so the target can be the input shifted by one
        txt = self.chunks[st : st+self.seq_len+1]
        return tensor(txt[:-1]),tensor(txt[1:])

In [None]:
# Small batch size and sequence length, and the ints 0-24 split across six "documents" of uneven length
bs,sl = 4,3
ints = [[0,1,2,3,4],[5,6,7,8,9,10],[11,12,13,14,15,16,17,18],[19,20],[21,22,23],[24]]

In [None]:
# No shuffling, so the batches can be checked against the concatenated stream
dl = LMDataLoader(ints, bs=bs, seq_len=sl)

In [None]:
# Each batch is an (input, target) pair with targets shifted by one token;
# row i of the second batch continues where row i of the first batch stopped
test_eq(list(dl),
    [[tensor([[0, 1, 2], [6, 7, 8], [12, 13, 14], [18, 19, 20]]),
      tensor([[1, 2, 3], [7, 8, 9], [13, 14, 15], [19, 20, 21]])],
     [tensor([[3, 4, 5], [ 9, 10, 11], [15, 16, 17], [21, 22, 23]]),
      tensor([[4, 5, 6], [10, 11, 12], [16, 17, 18], [22, 23, 24]])]])

## Integration example

In [None]:
# Load the IMDB sample reviews into a DataFrame and peek at the first two rows
path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/'texts.csv')
df.head(2)

Unnamed: 0,label,text,is_valid
0,negative,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",False
1,positive,"This is a extremely well-made film. The acting, script and camera-work are all first-rate. The music is good, too, though it is mostly early in the film, when things are still relatively cheery. There are no really superstars in the cast, though several faces will be familiar. The entire cast does an excellent job with the script.<br /><br />But it is hard to watch, because there is no good end to a situation like the one presented. It is now fashionable to blame the British for setting Hindus and Muslims against each other, and then cruelly separating them into two countries. There is som...",False


In [None]:
# Tokenize the `text` column; `count` is passed to `make_vocab` in the cells below
df_tok,count = tokenize_df(df, 'text')
df_tok.head(2)

Unnamed: 0,label,is_valid,text,text_lengths
0,negative,False,"[xxbos, xxmaj, un, -, bleeping, -, believable, !, xxmaj, meg, xxmaj, ryan, does, n't, even, look, her, usual, pert, lovable, self, in, this, ,, which, normally, makes, me, forgive, her, shallow, ticky, acting, schtick, ., xxmaj, hard, to, believe, she, was, the, producer, on, this, dog, ., xxmaj, plus, xxmaj, kevin, xxmaj, kline, :, what, kind, of, suicide, trip, has, his, career, been, on, ?, xxmaj, whoosh, …, xxmaj, banzai, xxrep, 3, !, xxmaj, finally, this, was, directed, by, the, guy, who, did, xxmaj, big, xxmaj, chill, ?, xxmaj, must, be, a, replay, of, xxmaj, jonestown, -, hollywood,...",108
1,positive,False,"[xxbos, xxmaj, this, is, a, extremely, well, -, made, film, ., xxmaj, the, acting, ,, script, and, camera, -, work, are, all, first, -, rate, ., xxmaj, the, music, is, good, ,, too, ,, though, it, is, mostly, early, in, the, film, ,, when, things, are, still, relatively, cheery, ., xxmaj, there, are, no, really, superstars, in, the, cast, ,, though, several, faces, will, be, familiar, ., xxmaj, the, entire, cast, does, an, excellent, job, with, the, script, ., \n\n, xxmaj, but, it, is, hard, to, watch, ,, because, there, is, no, good, end, to, a, situation, like, the, one, ...]",462


In [None]:
# Builtin `int` instead of `np.int`: that alias was deprecated in NumPy 1.20 and removed in 1.24
texts,lengths = df_tok['text'],df_tok['text_lengths'].values.astype(int)

In [None]:
# NOTE: this cell is not exported and shadows the `Numericalize` defined earlier in the notebook;
# the only difference is that `encodes` returns a plain list of ids instead of a `TensorText`.
class Numericalize(Transform):
    "Reversible transform between tokenized texts and their numericalized ids"
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, sep=' '):
        self.vocab, self.min_freq = vocab, min_freq
        self.max_vocab, self.sep = max_vocab, sep
        # Token -> id lookup; `defaultdict(int)` sends unknown tokens to id 0
        if vocab is None: self.o2i = None
        else: self.o2i = defaultdict(int, {tok:idx for idx,tok in enumerate(vocab)})

    def setup(self, dsrc):
        "Build `vocab` and `o2i` from `dsrc` (its `train` subset when it has one) unless a vocab was given"
        if dsrc is None or self.vocab is not None: return
        dsrc = getattr(dsrc,'train',dsrc)
        count = Counter(tok for toks in dsrc for tok in toks)
        self.vocab = make_vocab(count, min_freq=self.min_freq, max_vocab=self.max_vocab)
        # The padding placeholders stay in `vocab` but never get an id of their own
        self.o2i = defaultdict(int, {tok:idx for idx,tok in enumerate(self.vocab) if tok != 'xxfake'})

    def encodes(self, o):
        ids = [self.o2i[tok] for tok in o]
        return ids

    def decodes(self, o)->Str:
        toks = (self.vocab[idx] for idx in o)
        # Padding tokens are dropped from the decoded text
        return self.sep.join([tok for tok in toks if tok != PAD])
In [None]:
# Random train/valid split, then numericalize with a vocab built from the token counts
# NOTE(review): no seed is passed to `RandomSplitter`, so the split presumably differs between runs — confirm
splits = RandomSplitter()(texts)
dsrc = DataSource(texts, [Numericalize(make_vocab(count))], filts=splits)

In [None]:
# Show the first training item decoded back to text
dsrc.train.show_at(0)

xxbos xxmaj people like me will tear this movie apart . xxmaj it 's just not realistic . xxmaj the xxmaj plot is s xxrep 7 o predictable . xxmaj you can xxunk everything that happens xxunk xxmaj of course , they find the treasure and become xxunk rich , and trick the bad guy . xxmaj we 've seen it a million times before . xxmaj the writers of this movie must think that the majority of the movie going public is stupid . xxmaj they must be right because xxmaj the majority of people actually liked this film . i mean xxunk xxunk in a matter of seconds . xxmaj the secret treasure room hidden under the xxmaj manhattan xxunk ? xxmaj you 'd think with all the work that 's gone on in xxmaj new xxmaj york underground xxmaj that room would have been discovered before . and all that was constructed during the civil war ? xxup please xxmaj and the love story between xxmaj ben and xxmaj abigail ? ? how cute , and i thought the romance in xxmaj clive xxmaj xxunk novels was weak . xxmaj they just fall

1

In [None]:
# NOTE(review): `tdl` is rebound in a later cell before it is ever used — confirm this cell is still needed
tdl = TfmdDL(LMDataLoader(dsrc.train, bs=16, shuffle=True), tfms=None)

In [None]:
# Feed the loader the first element of every training item
# NOTE(review): presumably items are 1-tuples holding the numericalized ids — confirm against `DataSource`
dl = LMDataLoader([t[0] for t in dsrc.train], bs=16, shuffle=True)

In [None]:
# Grab one batch of inputs and targets
x,y = next(iter(dl))

In [None]:
# Both are (bs, seq_len): bs=16 as requested above, seq_len left at its default of 72
x.shape,y.shape

(torch.Size([16, 72]), torch.Size([16, 72]))

In [None]:
# Batches come out as plain tensors
type(x)

torch.Tensor

In [None]:
# Same pipeline built from `LM_Dataset`: one dataset per split, numericalized lazily by a `TfmdList`
# NOTE(review): on a top-to-bottom run `bs` is still 4 here (set in the `LMDataLoader` test cell);
# it is only rebound to 16 in a later cell — confirm the datasets are meant to be built with bs=4
splits = RandomSplitter()(texts)
train,valid = [LM_Dataset(texts[s].reset_index(drop=True), lens=lengths[s], bs=bs) for s in splits]
tds = TfmdList(train, tfms=Numericalize(make_vocab(count)), as_item=False, wrap_l=False)

In [None]:
# Show the first item: the input sequence, then its target (the same text shifted by one token)
tds.show(tds[0])

xxbos i think the xxmaj croc xxmaj hunter is a pretty cool guy ! i know i would n't have the xxunk to go even 5 feet away from a croc . 

 xxmaj but , everything in this movie is bad . xxmaj xxunk jokes , people getting xxunk , and the skit about the xxmaj president all make the movie one of the worst of all time . 

 xxmaj
i think the xxmaj croc xxmaj hunter is a pretty cool guy ! i know i would n't have the xxunk to go even 5 feet away from a croc . 

 xxmaj but , everything in this movie is bad . xxmaj xxunk jokes , people getting xxunk , and the skit about the xxmaj president all make the movie one of the worst of all time . 

 xxmaj it


1

In [None]:
bs = 16
# NOTE(review): `LM_Sampler` is not defined in this notebook; presumably it comes from one of the star imports — confirm
samp = LM_Sampler(train)
tdl = TfmdDL(tds, bs=bs, sampler=samp, num_workers=0)

In [None]:
# Decode the first input sequence of a batch back to text
x,y = tdl.one_batch()
tds.decode((x[0],))

("xxbos i think the xxmaj croc xxmaj hunter is a pretty cool guy ! i know i would n't have the xxunk to go even 5 feet away from a croc . \n\n xxmaj but , everything in this movie is bad . xxmaj xxunk jokes , people getting xxunk , and the skit about the xxmaj president all make the movie one of the worst of all time . \n\n xxmaj",)

In [None]:
# Inputs and targets side by side for the first two sequences of the batch
tdl.show_batch(max_samples=2)

Unnamed: 0,text,text_
0,"xxbos i think the xxmaj croc xxmaj hunter is a pretty cool guy ! i know i would n't have the xxunk to go even 5 feet away from a croc . \n\n xxmaj but , everything in this movie is bad . xxmaj xxunk jokes , people getting xxunk , and the skit about the xxmaj president all make the movie one of the worst of all time . \n\n xxmaj","i think the xxmaj croc xxmaj hunter is a pretty cool guy ! i know i would n't have the xxunk to go even 5 feet away from a croc . \n\n xxmaj but , everything in this movie is bad . xxmaj xxunk jokes , people getting xxunk , and the skit about the xxmaj president all make the movie one of the worst of all time . \n\n xxmaj it"
1,"life in quest of xxunk and freedom ; and this time she will never come back . \n\n xxmaj gossip columns xxunk all xxmaj andré 's worst xxunk , as he learns of his wife 's xxunk through reports xxunk her name with xxmaj xxunk . xxmaj when the young xxunk xxunk at last to the xxunk xxunk of the screening - room , it is with drawn gun -- to be","in quest of xxunk and freedom ; and this time she will never come back . \n\n xxmaj gossip columns xxunk all xxmaj andré 's worst xxunk , as he learns of his wife 's xxunk through reports xxunk her name with xxmaj xxunk . xxmaj when the young xxunk xxunk at last to the xxunk xxunk of the screening - room , it is with drawn gun -- to be xxunk"


## Classification

In [None]:
#export
def pad_collate(samples, pad_idx=1, pad_first=True, backwards=False):
    "Collate `samples` of `(token_ids, label)` into a batch padded with `pad_idx`. Flips token order if `backwards`"
    lens = [len(s[0]) for s in samples]
    max_len = max(lens)
    # Start from a batch made only of padding, then copy each text into place
    res = torch.full((len(samples), max_len), pad_idx, dtype=torch.long)
    # The flip below mirrors every row, so pad on the other side to end up where requested
    if backwards: pad_first = not pad_first
    for i,(s,n) in enumerate(zip(samples, lens)):
        # Explicit start offset: a negative-index slice (`-n:`) would span the whole row when `n == 0`
        sl = slice(max_len-n, max_len) if pad_first else slice(0, n)
        res[i,sl] = LongTensor(s[0])
    if backwards: res = res.flip(1)
    return res, tensor(np.array([s[1] for s in samples]))

In [None]:
# Two pipelines per row: the numericalized `text` and the categorized `label`;
# `pad_collate` then pads every batch to the length of its longest text
splits = RandomSplitter()(range_of(df_tok))
dsrc = DataSource(df_tok.itertuples(), filts=splits, type_tfms=[
    [attrgetter("text"), Numericalize(make_vocab(count))],
    [attrgetter("label"), Categorize()]])
dl = TfmdDL(dsrc, collate_fn=TfmdCollate(collate_fn=pad_collate))

In [None]:
# Decoded texts next to their categories
dl.show_batch(max_samples=4)

Unnamed: 0,text,category
0,"xxbos xxmaj un - xxunk - believable ! xxmaj meg xxmaj ryan does n't even look her usual xxunk lovable self in this , which normally makes me forgive her shallow xxunk acting xxunk . xxmaj hard to believe she was the producer on this dog . xxmaj plus xxmaj kevin xxmaj kline : what kind of suicide trip has his career been on ? xxmaj xxunk … xxmaj xxunk xxrep 3 ! xxmaj finally this was directed by the guy who did xxmaj big xxmaj xxunk ? xxmaj must be a replay of xxmaj jonestown - hollywood style . w xxrep 3 o xxrep 3 f !",negative
1,"xxbos xxmaj this is a extremely well - made film . xxmaj the acting , script and camera - work are all first - rate . xxmaj the music is good , too , though it is mostly early in the film , when things are still relatively xxunk . xxmaj there are no really xxunk in the cast , though several faces will be familiar . xxmaj the entire cast does an excellent job with the script . \n\n xxmaj but it is hard to watch , because there is no good end to a situation like the one presented . xxmaj it is now xxunk to blame the xxmaj british for setting xxmaj hindus and xxmaj muslims against each other ...",positive
2,"xxbos xxmaj every once in a long while a movie will come along that will be so awful that i feel compelled to warn people . xxmaj if i labor all my days and i can save but one soul from watching this movie , how great will be my joy . \n\n xxmaj where to begin my discussion of pain . xxmaj for xxunk , there was a musical montage every five minutes . xxmaj there was no character development . xxmaj every character was a stereotype . xxmaj we had xxunk guy , fat guy who eats donuts , goofy foreign guy , etc . xxmaj the script felt as if it were being written as the movie was being shot . xxm...",negative
3,"xxbos xxmaj name just says it all . i watched this movie with my dad when it came out and having served in xxmaj xxunk he had great admiration for the man . xxmaj the disappointing thing about this film is that it only concentrate on a short period of the man 's life - interestingly enough the man 's entire life would have made such an epic bio - xxunk that it is staggering to imagine the cost for production . \n\n xxmaj some posters xxunk to the flawed xxunk about the man , which are cheap shots . xxmaj the theme of the movie "" duty , xxmaj honor , xxmaj country "" are not just mere words ...",positive


## Export -

In [None]:
#hide
# Convert the notebooks to library modules (see the "Converted ..." log below)
from local.notebook.export import notebook2script
notebook2script(all_fs=True)

Converted 00_test.ipynb.
Converted 01_core.ipynb.
Converted 01a_dataloader.ipynb.
Converted 01a_script.ipynb.
Converted 02_transforms.ipynb.
Converted 03_pipeline.ipynb.
Converted 04_data_external.ipynb.
Converted 05_data_core.ipynb.
Converted 06_data_source.ipynb.
Converted 07_vision_core.ipynb.
Converted 08_pets_tutorial.ipynb.
Converted 09_vision_augment.ipynb.
Converted 09a_rect_augment.ipynb.
Converted 10_data_block.ipynb.
Converted 11_layers.ipynb.
Converted 12_optimizer.ipynb.
Converted 13_learner.ipynb.
Converted 14_callback_schedule.ipynb.
Converted 15_callback_hook.ipynb.
Converted 16_callback_progress.ipynb.
Converted 17_callback_tracker.ipynb.
Converted 18_callback_fp16.ipynb.
Converted 19_callback_mixup.ipynb.
Converted 20_metrics.ipynb.
Converted 21_tutorial_imagenette.ipynb.
Converted 30_text_core.ipynb.
Converted 31_text_data.ipynb.
Converted 32_text_models_awdlstm.ipynb.
Converted 33_test_models_core.ipynb.
Converted 34_callback_rnn.ipynb.
Converted 35_tutorial_wikitex