In [None]:
#export
from local.torch_basics import *
from local.test import *
from local.core import *
from local.data.transform import *
from local.data.core import *
from local.data.source import *
from local.data.external import *
from local.data.pipeline import *
from local.data.load import *
from local.text.core import *
from local.notebook.showdoc import show_doc

In [None]:
#default_exp text.data
#default_cls_lvl 3

# Text data

> Functions and transforms to help gather text data in a `DataSource`

## Numericalizing

In [None]:
#export
def make_vocab(count, min_freq=3, max_vocab=60000):
    "Build a vocab from `Counter` `count`: special tokens first, then the most common items seen at least `min_freq` times"
    kept = [tok for tok,freq in count.most_common(max_vocab) if freq >= min_freq]
    # Move (or add) every special token to the front; walking them in reverse keeps their original order
    for special in reversed(defaults.text_spec_tok):
        if special in kept: kept.remove(special)
        kept.insert(0, special)
    kept = kept[:max_vocab]
    # Pad with 'xxfake' up to the next multiple of 8 (a full 8 are added when the length already is one)
    n_fake = 8 - len(kept)%8
    return kept + ['xxfake']*n_fake

In [None]:
count = Counter(['a', 'a', 'a', 'a', 'b', 'b', 'c', 'c', 'd'])
# With the default `min_freq=3` only 'a' (seen 4 times) is kept next to the special tokens
test_eq(set(make_vocab(count)), set(defaults.text_spec_tok + 'a xxfake'.split()))
# The vocab is padded with 'xxfake' so that its length is a multiple of 8
test_eq(len(make_vocab(count))%8, 0)
# `min_freq=1` keeps every token
test_eq(set(make_vocab(count, min_freq=1)), set(defaults.text_spec_tok + 'a b c d xxfake'.split()))
# `max_vocab=12` cuts the vocab before it is padded: 'd', the rarest token, is dropped
test_eq(set(make_vocab(count,max_vocab=12, min_freq=1)), set(defaults.text_spec_tok + 'a b c xxfake'.split()))

In [None]:
#export
class TensorText(TensorBase):
    "A `TensorBase` for numericalized text, shown as rows of a `DataFrame`"
    def get_ctxs(self, max_n=10, **kwargs):
        "Return one row of an empty `DataFrame` per sample, for at most `max_n` samples"
        n_rows = min(self.shape[0], max_n)
        frame = pd.DataFrame(index=range(n_rows))
        return [frame.iloc[row] for row in range(n_rows)]

    def display(self, ctxs):
        "Assemble `ctxs` into a `DataFrame` and show it with `display_df`"
        display_df(pd.DataFrame(ctxs))

In [None]:
# export
class Numericalize(Transform):
    "Reversible transform of tokenized texts to numericalized ids"
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, sep=' '):
        "Use `vocab` if given; otherwise `setup` builds one with `min_freq` and `max_vocab`. `sep` joins decoded tokens"
        self.vocab,self.min_freq,self.max_vocab,self.sep = vocab,min_freq,max_vocab,sep
        # Build the token->id mapping exactly as `setup` does: tokens missing from it get id 0
        # and the 'xxfake' padding entries of the vocab are left out.
        self.o2i = None if vocab is None else defaultdict(int, {v:k for k,v in enumerate(vocab) if v != 'xxfake'})

    def setup(self, dsrc):
        "Build `vocab` and `o2i` from the tokens in `dsrc` unless a vocab was passed at init"
        if dsrc is None: return
        if self.vocab is None:
            # Count tokens on the `train` attribute when `dsrc` has one, on `dsrc` itself otherwise
            dsrc = getattr(dsrc,'train',dsrc)
            count = Counter(p for o in dsrc for p in o)
            self.vocab = make_vocab(count, min_freq=self.min_freq, max_vocab=self.max_vocab)
            self.o2i = defaultdict(int, {v:k for k,v in enumerate(self.vocab) if v != 'xxfake'})

    def encodes(self, o):
        "Map each token of `o` to its id (0 for tokens that are not in `o2i`)"
        return TensorText(tensor([self.o2i[o_] for o_ in o]))

    def decodes(self, o):
        "Map ids back to tokens, drop `PAD` and join the rest with `sep`"
        toks = (self.vocab[o_] for o_ in o)
        return Str(self.sep.join([t for t in toks if t != PAD]))

In [None]:
num = Numericalize(min_freq=1, sep=' ')
# No vocab was passed, so `setup` builds one from the two tokenized texts
num.setup(L('This is an example of text'.split(), 'this is another text'.split()))
# With `min_freq=1` every token makes it into the vocab
test_eq(set(num.vocab), set(defaults.text_spec_tok + 'This is an example of text this another xxfake'.split()))
test_eq(len(num.vocab)%8, 0)
start = 'This is an example of text'
# Calling the transform on a list of tokens numericalizes it
t = num(start.split())

In [None]:
# Expected ids of the tokens of `start` in `num.vocab`
test_eq(t, tensor([11, 9, 12, 13, 14, 10]))
# Decoding gives back the original text
test_eq(num.decode(t), start)

In [None]:
num = Numericalize(min_freq=2, sep=' ')
num.setup(L('This is an example of text'.split(), 'this is another text'.split()))
# Only 'is' and 'text' appear twice, so they are the only regular tokens kept
test_eq(set(num.vocab), set(defaults.text_spec_tok + 'is text xxfake'.split()))
test_eq(len(num.vocab)%8, 0)
t = num(start.split())
# Tokens that are not in the vocab get id 0...
test_eq(t, tensor([0, 9, 0, 0, 0, 10]))
# ...which decodes to `UNK`
test_eq(num.decode(t), f'{UNK} is {UNK} {UNK} {UNK} text')

## LMDataLoader -

In [None]:
#export
@delegates()
class LMDataLoader(TfmdDL):
    "A `TfmdDL` for language modeling: each item is `seq_len` tokens and the same window shifted by one token"
    def __init__(self, dataset, lens=None, cache=2, bs=64, seq_len=72, num_workers=0, **kwargs):
        "Index the texts of `dataset` (with their `lens`) and compute how many sequences of `seq_len` they yield"
        super().__init__(dataset=dataset, bs=bs, num_workers=num_workers, **kwargs)
        # For tuple items only the first element is used
        texts = [(o[0] if isinstance(o, tuple) else o) for o in dataset]
        self.items = ReindexCollection(texts, cache=cache)
        self.seq_len = seq_len
        if lens is None: lens = [len(txt) for txt in self.items]
        # `lens` is built on the same `idxs` object as `items`
        self.lens = ReindexCollection(lens, idxs=self.items.idxs)
        # One token is held back (the "-1") to allow for the final label
        n_toks = sum(lens) - 1
        # `m`: tokens used, rounded down to a multiple of `bs*seq_len`
        self.m = round_multiple(n_toks, bs*seq_len, round_down=True)
        # `n`: number of sequences of `seq_len` tokens; `spb`: sequences per batch row
        self.n = self.m // self.seq_len
        self.spb = self.n // bs

    def shuffle_fn(self,idxs):
        "Return `idxs` unchanged: shuffling is done on `items` in `before_iter`"
        return idxs

    def before_iter(self):
        "Shuffle `items` when `shuffle` is set, then rebuild `chunks` from `items` and `lens`"
        super().before_iter()
        if self.shuffle: self.items.shuffle()
        self.chunks = Chunks(self.items, self.lens)

    def create_item(self, seq):
        "Return the pair (tokens, tokens shifted by one) for sequence number `seq`"
        if seq >= self.n: raise IndexError
        # Sequence `seq` sits on batch row `seq % bs`; every row covers `spb` consecutive sequences
        row,step = seq % self.bs, seq // self.bs
        start = (row*self.spb + step) * self.seq_len
        # Take one extra token so that the shifted target has `seq_len` tokens too
        window = self.chunks[start : start+self.seq_len+1]
        return window[:-1],window[1:]

    @classmethod
    def dbunchify(cls, dsrc, bs=16, val_bs=None, shuffle_train=True, **kwargs):
        "Create a `DataBunch` with one loader per subset of `dsrc`; only the first may be shuffled"
        n_others = len(dsrc.filts) - 1
        # The subsets after the first use `val_bs`, or twice `bs` when it is not given
        other_bs = 2*bs if val_bs is None else val_bs
        batch_sizes = [bs] + [other_bs]*n_others
        shuffle_flags = [shuffle_train] + [False]*n_others
        loaders = []
        for idx,(b,shuf) in enumerate(zip(batch_sizes, shuffle_flags)):
            # `drop_last` follows the shuffle flag
            loaders.append(cls(dsrc.subset(idx), bs=b, shuffle=shuf, drop_last=shuf, **kwargs))
        return DataBunch(*loaders)

In [None]:
bs,sl = 4,3
# Six toy "texts" of different lengths that together hold the consecutive integers 0..24
ints = L([0,1,2,3,4],[5,6,7,8,9,10],[11,12,13,14,15,16,17,18],[19,20],[21,22,23],[24]).mapped(tensor)

In [None]:
# Without shuffling, each batch row reads its own contiguous part of the concatenated texts
# from one batch to the next, and the targets are the inputs shifted by one token
dl = LMDataLoader(ints, bs=bs, seq_len=sl)
test_eq(list(dl),
    [[tensor([[0, 1, 2], [6, 7, 8], [12, 13, 14], [18, 19, 20]]),
      tensor([[1, 2, 3], [7, 8, 9], [13, 14, 15], [19, 20, 21]])],
     [tensor([[3, 4, 5], [ 9, 10, 11], [15, 16, 17], [21, 22, 23]]),
      tensor([[4, 5, 6], [10, 11, 12], [16, 17, 18], [22, 23, 24]])]])

In [None]:
# TODO check shuffled but contiguous

In [None]:
# With `shuffle=True` the order of the texts is shuffled before each iteration;
# print the batches to inspect them (x: inputs, y: targets)
dl = LMDataLoader(ints, bs=bs, seq_len=sl, shuffle=True)
for x,y in list(dl):
    print(x, 'x')
    print(y, 'y')

tensor([[ 0,  1,  2],
        [21, 22, 23],
        [12, 13, 14],
        [18,  5,  6]]) x
tensor([[ 1,  2,  3],
        [22, 23, 19],
        [13, 14, 15],
        [ 5,  6,  7]]) y
tensor([[ 3,  4, 24],
        [19, 20, 11],
        [15, 16, 17],
        [ 7,  8,  9]]) x
tensor([[ 4, 24, 21],
        [20, 11, 12],
        [16, 17, 18],
        [ 8,  9, 10]]) y


## Integration example

In [None]:
# Get the IMDB sample dataset and read its `texts.csv` into a DataFrame
path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/'texts.csv')
df.head(2)

Unnamed: 0,label,text,is_valid
0,negative,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",False
1,positive,"This is a extremely well-made film. The acting, script and camera-work are all first-rate. The music is good, too, though it is mostly early in the film, when things are still relatively cheery. There are no really superstars in the cast, though several faces will be familiar. The entire cast does an excellent job with the script.<br /><br />But it is hard to watch, because there is no good end to a situation like the one presented. It is now fashionable to blame the British for setting Hindus and Muslims against each other, and then cruelly separating them into two countries. There is som...",False


In [None]:
# Tokenize the 'text' column; `count` is what `make_vocab` receives further down
df_tok,count = tokenize_df(df, 'text')
df_tok.head(2)

Unnamed: 0,label,is_valid,text,text_lengths
0,negative,False,"[xxbos, xxmaj, un, -, bleeping, -, believable, !, xxmaj, meg, xxmaj, ryan, does, n't, even, look, her, usual, pert, lovable, self, in, this, ,, which, normally, makes, me, forgive, her, shallow, ticky, acting, schtick, ., xxmaj, hard, to, believe, she, was, the, producer, on, this, dog, ., xxmaj, plus, xxmaj, kevin, xxmaj, kline, :, what, kind, of, suicide, trip, has, his, career, been, on, ?, xxmaj, whoosh, …, xxmaj, banzai, xxrep, 3, !, xxmaj, finally, this, was, directed, by, the, guy, who, did, xxmaj, big, xxmaj, chill, ?, xxmaj, must, be, a, replay, of, xxmaj, jonestown, -, hollywood,...",108
1,positive,False,"[xxbos, xxmaj, this, is, a, extremely, well, -, made, film, ., xxmaj, the, acting, ,, script, and, camera, -, work, are, all, first, -, rate, ., xxmaj, the, music, is, good, ,, too, ,, though, it, is, mostly, early, in, the, film, ,, when, things, are, still, relatively, cheery, ., xxmaj, there, are, no, really, superstars, in, the, cast, ,, though, several, faces, will, be, familiar, ., xxmaj, the, entire, cast, does, an, excellent, job, with, the, script, ., \n\n, xxmaj, but, it, is, hard, to, watch, ,, because, there, is, no, good, end, to, a, situation, like, the, one, ...]",462


In [None]:
# `np.int` was only an alias of the builtin `int` and has been removed from recent NumPy releases
texts,lengths = df_tok['text'],df_tok['text_lengths'].values.astype(int)

In [None]:
# Split the texts at random and numericalize them with a vocab built from the token counts
splits = RandomSplitter()(texts)
tfm = Numericalize(make_vocab(count))
#dsrc = DataSource(texts, [[tfm], [tfm]], filts=splits)
dsrc = DataSource(texts, [tfm], filts=splits)

In [None]:
dsrc.train.show_at(0)

xxbos i managed to grab a viewing of this with the aid of xxup mst3k , and oh boy , even with the xxunk this movie was excruciatingly bad . xxmaj imagine someone whose xxunk with a camera could be out done by a monkey . 

 xxmaj the highlights ( what little there were ) came from the special effects , which were " ok " . xxmaj the acting for the most part was also " ok " ; though nothing special , it was of a higher quality than other b - movies i have seen in the past . 

 xxmaj the rest of this movie is xxunk bad , xxmaj the camera work often looks like they 've just put the camera man on roller xxunk and pushed him along . xxmaj the story ( if it can be called that ) is so full of holes it 's almost funny , xxmaj it never really explains why the hell he survived in the first place , or needs human flesh in order to survive . xxmaj the script is poorly written and the dialogue xxunk on just plane stupid . xxmaj the climax to movie ( if there is one ) is absolutely laughable . 

 xxma

1

In [None]:
# One `LMDataLoader` per subset of `dsrc`, gathered in a `DataBunch`
dbunch = LMDataLoader.dbunchify(dsrc, bs=16, seq_len=72)

In [None]:
dbunch.show_batch(max_n=6)

Unnamed: 0,text
0,"xxbos xxmaj this documentary explores a story covered in xxmaj pilger 's latest book "" freedom xxmaj next xxmaj time "" , which was published in 2006 . xxmaj it reveals the shocking xxunk of the natives of xxmaj diego xxmaj garcia , one of the xxmaj xxunk xxmaj xxunk in the xxmaj indian xxmaj ocean . \n\n xxmaj the islanders are technically xxmaj british citizens , as xxmaj diego xxmaj garcia"
1,) and the xxmaj jonestown xxunk fall from scene xxunk to bickering xxunk . xxmaj as the bands become more disjointed the friendships are stretched tension tight and at several points snap into xxunk and even on stage fights . xxmaj all of this is half funny and half tragic and believe it or not is xxunk watch able . xxmaj like i said at the beginning you can watch the xxmaj
2,"xxmaj next .. xxbos i really wanted to like this movie - the location shots were mostly filmed in xxmaj xxunk and the trailer had some wonderful photography . xxmaj but , even for a filmed cartoon , it was a really badly - made movie . xxmaj the continuity and pacing were both simply awful . xxmaj the best bits in the movie are under the ending credits , so it"
3,""" , and there was no xxunk . xxmaj there was also no "" grace "" ( wife of the original doctor ) and xxmaj hugh xxmaj crain 's wives died in totally different ways . xxmaj these changes , changed the story xxup way too much . i do n't know whether the producers of this movie should be glad xxmaj shirley xxmaj jackson no longer walks this earth or whether"
4,first xxunk by some old codger after her cherry was xxunk off ? xxmaj nope . xxmaj she lies there like a cold xxunk of meat on a xxunk block . xxmaj of course she is n't supposed to enjoy it . xxmaj and that is what i mean about this movie . xxmaj why could n't they have given her something to enjoy ? xxmaj why does all the sex have
5,"there are many many better ones . xxbos xxup star xxup rating : xxrep 5 * xxmaj saturday xxmaj night xxrep 4 * xxmaj friday xxmaj night xxrep 3 * xxmaj friday xxmaj morning * * xxmaj sunday xxmaj night * xxmaj xxunk xxmaj morning \n\n xxmaj ray ( ray xxmaj winstone ) has a criminal past , has had problems with alcohol and is now xxunk a drug habit that is"


In [None]:
# The inputs of a batch keep the `TensorText` type
x,y = dbunch.one_batch()
test_eq(type(x), TensorText)

## Classification

In [None]:
#export
def pad_collate(samples, pad_idx=1, pad_first=False, backwards=False):
    "Collate `samples` of `(tokens, label)` into a batch padded with `pad_idx`; `backwards` flips the token order"
    lens = [len(s[0]) for s in samples]
    max_len = max(lens)
    res = torch.full((len(samples), max_len), pad_idx, dtype=torch.long)
    # The batch is flipped at the end when `backwards`, so the padding goes on the opposite side here
    if backwards: pad_first = not pad_first
    for i,(s,n) in enumerate(zip(samples, lens)):
        # Explicit bounds on purpose: a `-n` start would select the whole row for an empty sample
        sl = slice(max_len-n, max_len) if pad_first else slice(0, n)
        res[i,sl] = LongTensor(s[0])
    if backwards: res = res.flip(1)
    return res, tensor(np.array([s[1] for s in samples]))

In [None]:
# Random split over the rows of `df_tok`
splits = RandomSplitter()(range_of(df_tok))
# Two pipelines per row: the numericalized text and the categorized label
dsrc = DataSource(df_tok.itertuples(), filts=splits, tfms=[
    [attrgetter("text"), Numericalize(make_vocab(count))],
    [attrgetter("label"), Categorize()]])
# Batches are assembled with `pad_collate`, which pads the texts to a common length
dl = TfmdDL(dsrc.train, create_batch=pad_collate)

In [None]:
dl.show_batch(max_n=4)

Unnamed: 0,text,category
0,"xxbos a grade - z horror filmmaker xxmaj xxunk xxmaj xxunk was one of the most xxunk directors operating within the field of the low - budget gory xxunk movies are full of inept gore , laughable acting , boring sub - plots and woeful xxunk mysterious black xxunk figure is xxunk xxunk xxunk staying at the family xxunk this film is almost xxunk do n't actually see the murders except with shadows and a few blood xxunk pace is xxunk and the plot is rather xxunk acting is merely competent , but the lack of gore and xxunk left me xxunk generous 4 out of xxunk xxunk : do not mistake xxmaj xxunk 's film with xxmaj andy xxmaj xxunk 's equally weak "" legacy of xxmaj blood "" .",negative
1,"xxbos xxmaj surprisingly enough does movie does have some redeeming quality in it when it moves toward its end . xxmaj for the other part this movie is being a really bad and lame one , with a small budget , insultingly bad written script and everything that goes with it . \n\n xxmaj it 's silly that with all the money going around in the xxmaj christian circles they never can seem to get xxunk xxunk to make a decent movie with . xxmaj i 'm not a religious , so i could n't care less really but film - making does some like a good tool to reach a new audience for xxunk and getting people more interested and curious in reading the bible for instance . xxmaj in that regard these movies always seem like a wasted opportunity . \n\n xxmaj the low budget does really hurt the movie and brings it down . xxmaj it makes the movie laughable to watch with its effects and it just gives the overall movie a campy xxmaj b - movie like feeling . \n\n xxmaj but what 's hurting this movie more is its writing . xxmaj the stuff that just happens in this movie is just insulting to the intelligence and then xxmaj i 'm not even complaining or talking about the religious aspects of the whole story . xxmaj the way the movie xxunk is just so improbable and the people within this movie do such highly unlikely things that it 's being insulting to its viewers . \n\n i also hated how the movie was being like a soap opera at times . xxmaj seemed to me that they simply had a hard time turning this into a full length movie and they added in some characters and dramatic developments just to fill things up . i just could n't cared less really at times . \n\n xxmaj still it needs to be said that the movie gets more solid and steady toward its end , when its story gets more focused on its essence . 
xxmaj still it remains predictable all but it xxunk this movie from being a complete wreck to watch and as far as these type of movies are concerned , there are far xxunk one to watch out there , though i do n't think this movie will win over any new souls . \n\n 4 / 10",negative
2,"xxbos xxmaj in keeping with xxmaj disney 's well - known practice of stealing .. i mean .. buying out known xxunk and xxunk them , this live - action version of the xxunk cartoon classic has got to be one of the worst re - makes in a year of bad re - makes . i grew up on the original cartoon xxup tv series . xxmaj any episode of the original cartoon series will give you more laughs than this entire movie . xxmaj not present is xxmaj penny 's cool computer book . xxmaj also not present is the gag with the self - xxunk orders that always ends up xxunk on the xxmaj chief . xxmaj new are a smooth talking xxmaj xxunk convertible ( the original cartoon had a xxunk vehicle that could turn into a van or a car ) and an element of a typical , unrealistic xxmaj hollywood romance . xxmaj do n't fill the xxunk to pay for xxmaj disney executives and even their _ xxunk - do n't see this movie .",negative
3,"xxbos xxmaj the first hour or so of the movie was mostly boring to say the least . xxmaj however it improved afterwards as the xxmaj valentine xxmaj party xxunk . xxmaj apart from the twist as to the identity of the killer in the very end , the hot bath murder scene was one of the few relatively memorable aspects of this movie . xxmaj the scene at the garden with xxmaj kate was well shot and so was the very last scene ( the ' twist ' ) . xxmaj in those scenes , there was some genuine suspense and thrills and the hot bath murder scene had a nasty ( the way slashers should be ) edge to it . xxmaj the earlier murders are xxunk devoid of gore .",positive


## Export -

In [None]:
#hide
from local.notebook.export import notebook2script
notebook2script(all_fs=True)

Converted 00_test.ipynb.
Converted 01_core.ipynb.
Converted 01a_torch_core.ipynb.
Converted 01b_script.ipynb.
Converted 01c_dataloader.ipynb.
Converted 02_data_transforms.ipynb.
Converted 03_data_pipeline.ipynb.
Converted 05_data_core.ipynb.
Converted 06_data_source.ipynb.
Converted 07_vision_core.ipynb.
Converted 08_pets_tutorial.ipynb.
Converted 09_vision_augment.ipynb.
Converted 11_layers.ipynb.
Converted 11a_vision_models_xresnet.ipynb.
Converted 12_optimizer.ipynb.
Converted 13_learner.ipynb.
Converted 14_callback_schedule.ipynb.
Converted 15_callback_hook.ipynb.
Converted 16_callback_progress.ipynb.
Converted 17_callback_tracker.ipynb.
Converted 18_callback_fp16.ipynb.
Converted 19_callback_mixup.ipynb.
Converted 20_metrics.ipynb.
Converted 21_tutorial_imagenette.ipynb.
Converted 22_vision_learner.ipynb.
Converted 23_tutorial_transfer_learning.ipynb.
Converted 30_text_core.ipynb.
Converted 31_text_data.ipynb.
Converted 32_text_models_awdlstm.ipynb.
Converted 33_text_models_core.i