In [None]:
#export
from local.torch_basics import *
from local.test import *
from local.core import *
from local.data.transform import *
from local.data.core import *
from local.data.source import *
from local.data.external import *
from local.data.pipeline import *
from local.data.load import *
from local.text.core import *
from local.notebook.showdoc import show_doc

In [None]:
#default_exp text.data
#default_cls_lvl 3

# Text data

> Functions and transforms to help gather text data in a `DataSource`

## Numericalizing

In [None]:
#export
def make_vocab(count, min_freq=3, max_vocab=60000, special_toks=None):
    "Create a vocab of at most `max_vocab` tokens from `Counter` `count`, keeping items present at least `min_freq` times"
    # `count`:        object with a `most_common(n)` method yielding `(token, frequency)` pairs
    # `min_freq`:     a token is kept when its frequency is >= `min_freq`
    # `max_vocab`:    maximum number of tokens (special tokens included) kept before padding
    # `special_toks`: tokens forced to the front of the vocab, in order; `None` means `defaults.text_spec_tok`
    # Returns a list of tokens whose length is a multiple of 8.
    if special_toks is None: special_toks = defaults.text_spec_tok
    # `dict.fromkeys` drops repeated special tokens while keeping the order of first appearance
    specials = list(dict.fromkeys(special_toks))
    is_special = set(specials)
    frequent = [o for o,c in count.most_common(max_vocab) if c >= min_freq]
    #Make sure all special tokens are in the vocab, exactly once and ahead of every other token
    vocab = specials + [o for o in frequent if o not in is_special]
    vocab = vocab[:max_vocab]
    # Pad with 'xxfake' up to the next multiple of 8. Between 1 and 8 fake tokens are always added:
    # a vocab whose length is already a multiple of 8 still receives 8 of them.
    n_fake = 8 - len(vocab)%8
    return vocab + ['xxfake']*n_fake

In [None]:
#Token frequencies: a=4, b=2, c=2, d=1
count = Counter(['a', 'a', 'a', 'a', 'b', 'b', 'c', 'c', 'd'])
spec_toks = set(defaults.text_spec_tok)
#Default `min_freq` of 3: only 'a' is kept, next to the special tokens and the padding token
default_vocab = make_vocab(count)
test_eq(set(default_vocab), spec_toks | {'a', 'xxfake'})
test_eq(len(default_vocab)%8, 0)
#`min_freq=1` keeps every token
test_eq(set(make_vocab(count, min_freq=1)), spec_toks | {'a', 'b', 'c', 'd', 'xxfake'})
#`max_vocab=12` truncates the vocab: 'd' no longer fits
test_eq(set(make_vocab(count, max_vocab=12, min_freq=1)), spec_toks | {'a', 'b', 'c', 'xxfake'})

In [None]:
#export
class TensorText(TensorBase):
    "Tensor type returned by `Numericalize`, shown as rows of a `DataFrame`"
    def get_ctxs(self, max_n=10, **kwargs):
        "Return one empty `DataFrame` row per sample, for at most `max_n` samples"
        n_rows = min(self.shape[0], max_n)
        ctx_df = pd.DataFrame(index=range(n_rows))
        return [ctx_df.iloc[row] for row in range(n_rows)]

    def display(self, ctxs):
        "Gather `ctxs` into a `DataFrame` and hand it to `display_df`"
        display_df(pd.DataFrame(ctxs))

In [None]:
# export
class Numericalize(Transform):
    "Reversible transform of tokenized texts to numericalized ids"
    # `vocab`:     list of tokens; the id of a token is its position in this list. Built in `setup` when `None`.
    # `min_freq`,
    # `max_vocab`: forwarded to `make_vocab` when the vocab is built in `setup`
    # `sep`:       separator used to join the tokens back together in `decodes`
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, sep=' '):
        self.vocab,self.min_freq,self.max_vocab,self.sep = vocab,min_freq,max_vocab,sep
        # Same rule as in `setup`: the 'xxfake' padding entries of the vocab are not real tokens,
        # so they get no entry in `o2i` (a text token 'xxfake' is then encoded as 0 like any unknown token)
        self.o2i = None if vocab is None else defaultdict(int, {v:k for k,v in enumerate(vocab) if v != 'xxfake'})

    def setup(self, dsrc):
        "Build `vocab` and `o2i` from the tokens in `dsrc` (its `train` attribute when it has one) if no vocab was given"
        if dsrc is None: return
        if self.vocab is None:
            dsrc = getattr(dsrc,'train',dsrc)
            # Each item of `dsrc` is iterated as a sequence of tokens
            count = Counter(p for o in dsrc for p in o)
            self.vocab = make_vocab(count, min_freq=self.min_freq, max_vocab=self.max_vocab)
            self.o2i = defaultdict(int, {v:k for k,v in enumerate(self.vocab) if v != 'xxfake'})

    def encodes(self, o):
        "Map each token of `o` to its id; tokens missing from the vocab become 0"
        # `get` returns the same default as indexing the `defaultdict` would, but without
        # inserting every unknown token into `o2i` as a side effect of the lookup
        return TensorText(tensor([self.o2i.get(o_, 0) for o_ in o]))

    def decodes(self, o):
        "Map ids back to tokens, drop the `PAD` tokens and join the rest with `sep`"
        toks = (self.vocab[o_] for o_ in o)
        return Str(self.sep.join([t for t in toks if t != PAD]))

In [None]:
#With `min_freq=1` every token of the two sentences ends up in the vocab
num = Numericalize(min_freq=1, sep=' ')
num.setup(L('This is an example of text'.split(), 'this is another text'.split()))
test_eq(set(num.vocab), set(defaults.text_spec_tok + 'This is an example of text this another xxfake'.split()))
test_eq(len(num.vocab)%8, 0)
#`start` and `t` are reused by the following cells
start = 'This is an example of text'
t = num(start.split())

In [None]:
#Ids are positions in `num.vocab`; decoding them gives back the original sentence
test_eq(t, tensor([11, 9, 12, 13, 14, 10]))
test_eq(num.decode(t), start)

In [None]:
#With `min_freq=2` only 'is' and 'text' (both seen twice) stay in the vocab
num = Numericalize(min_freq=2, sep=' ')
num.setup(L('This is an example of text'.split(), 'this is another text'.split()))
test_eq(set(num.vocab), set(defaults.text_spec_tok + 'is text xxfake'.split()))
test_eq(len(num.vocab)%8, 0)
#Every other token is encoded as 0, which decodes to `UNK`
t = num(start.split())
test_eq(t, tensor([0, 9, 0, 0, 0, 10]))
test_eq(num.decode(t), f'{UNK} is {UNK} {UNK} {UNK} text')

## LMDataLoader -

In [None]:
#export
@delegates()
class LMDataLoader(TfmdDL):
    "Data loader for language modelling: each item is a pair `(x, y)` of `seq_len` tokens where `y` is `x` shifted by one token"
    def __init__(self, dataset, lens=None, cache=2, bs=64, seq_len=72, num_workers=0, **kwargs):
        super().__init__(dataset=dataset, bs=bs, num_workers=num_workers, **kwargs)
        # When an item of `dataset` is a tuple, only its first element is kept
        texts = [(o[0] if isinstance(o, tuple) else o) for o in dataset]
        self.items = ReindexCollection(texts, cache=cache)
        self.seq_len = seq_len
        if lens is None: lens = [len(o) for o in self.items]
        # `lens` is indexed through the same `idxs` as `items`
        self.lens = ReindexCollection(lens, idxs=self.items.idxs)
        # The "-1" is to allow for final label
        n_tokens = sum(lens) - 1
        # Number of tokens used; presumably `n_tokens` rounded down to a multiple of `bs*seq_len` (see `round_multiple`)
        self.m = round_multiple(n_tokens, bs*seq_len, round_down=True)
        # Number of sequences of `seq_len` tokens, and how many of them go to each row of the batch
        self.n = self.m // self.seq_len
        self.spb = self.n // bs
        self.chunkify()

    def chunkify(self):
        "(Re)build `self.chunks` from the current `items` and `lens`"
        self.chunks = Chunks(self.items, self.lens)

    def shuffle_fn(self, idxs):
        "Shuffle the items and rebuild the chunks; `idxs` itself is returned unchanged"
        self.items.shuffle()
        self.chunkify()
        return idxs

    def create_item(self, seq):
        "Return the input and target tokens of sequence number `seq`"
        if seq >= self.n: raise IndexError
        # Sequence `seq` lands in row `row` of batch number `step`. Row `row` reads the `spb`
        # consecutive sequences starting at `row*spb`, so a row continues from one batch to the next.
        step, row = divmod(seq, self.bs)
        start = (row*self.spb + step) * self.seq_len
        # One extra token is read so that the target can be the input shifted by one
        txt = self.chunks[start : start+self.seq_len+1]
        return txt[:-1], txt[1:]

    @classmethod
    def dbunchify(cls, dsrc, bs=16, val_bs=None, shuffle_train=True, **kwargs):
        "Create a `DataBunch` with one loader per subset of `dsrc`"
        n_other = len(dsrc.filts) - 1
        # The first subset uses `bs`; the others use `val_bs`, or twice `bs` when `val_bs` is not given
        other_bs = 2*bs if val_bs is None else val_bs
        bss = [bs] + [other_bs]*n_other
        # Only the first subset may be shuffled; `drop_last` follows the same flag
        shuffles = [shuffle_train] + [False]*n_other
        dls = []
        for i,(b,s) in enumerate(zip(bss, shuffles)):
            dls.append(cls(dsrc.subset(i), bs=b, shuffle=s, drop_last=s, **kwargs))
        return DataBunch(*dls)

In [None]:
#Six "documents" of lengths 5, 6, 8, 2, 3 and 1 holding the consecutive integers 0..24
bs,sl = 4,3
ints = L([0,1,2,3,4],[5,6,7,8,9,10],[11,12,13,14,15,16,17,18],[19,20],[21,22,23],[24]).mapped(tensor)

In [None]:
#Without shuffling, each row of a batch continues in the same row of the next batch,
#and the targets are the inputs shifted by one token
dl = LMDataLoader(ints, bs=bs, seq_len=sl)
test_eq(list(dl),
    [[tensor([[0, 1, 2], [6, 7, 8], [12, 13, 14], [18, 19, 20]]),
      tensor([[1, 2, 3], [7, 8, 9], [13, 14, 15], [19, 20, 21]])],
     [tensor([[3, 4, 5], [ 9, 10, 11], [15, 16, 17], [21, 22, 23]]),
      tensor([[4, 5, 6], [10, 11, 12], [16, 17, 18], [22, 23, 24]])]])

In [None]:
dl = LMDataLoader(ints, bs=bs, seq_len=sl, shuffle=True)
#With shuffling the exact values are not fixed, but targets are still the inputs shifted by one token
for x,y in dl: test_eq(x[:,1:], y[:,:-1])
((x0,y0), (x1,y1)) = tuple(dl)
#Second batch begins where first batch ended
test_eq(y0[:,-1], x1[:,0]) 

## Integration example

In [None]:
#Read the `texts.csv` file of the IMDB sample dataset and preview its first two rows
path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/'texts.csv')
df.head(2)

Unnamed: 0,label,text,is_valid
0,negative,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",False
1,positive,"This is a extremely well-made film. The acting, script and camera-work are all first-rate. The music is good, too, though it is mostly early in the film, when things are still relatively cheery. There are no really superstars in the cast, though several faces will be familiar. The entire cast does an excellent job with the script.<br /><br />But it is hard to watch, because there is no good end to a situation like the one presented. It is now fashionable to blame the British for setting Hindus and Muslims against each other, and then cruelly separating them into two countries. There is som...",False


In [None]:
#`tokenize_df` returns the tokenized frame together with `count`, which is passed to `make_vocab` further down
df_tok,count = tokenize_df(df, 'text')
df_tok.head(2)

Unnamed: 0,label,is_valid,text,text_lengths
0,negative,False,"[xxbos, xxmaj, un, -, bleeping, -, believable, !, xxmaj, meg, xxmaj, ryan, does, n't, even, look, her, usual, pert, lovable, self, in, this, ,, which, normally, makes, me, forgive, her, shallow, ticky, acting, schtick, ., xxmaj, hard, to, believe, she, was, the, producer, on, this, dog, ., xxmaj, plus, xxmaj, kevin, xxmaj, kline, :, what, kind, of, suicide, trip, has, his, career, been, on, ?, xxmaj, whoosh, …, xxmaj, banzai, xxrep, 3, !, xxmaj, finally, this, was, directed, by, the, guy, who, did, xxmaj, big, xxmaj, chill, ?, xxmaj, must, be, a, replay, of, xxmaj, jonestown, -, hollywood,...",108
1,positive,False,"[xxbos, xxmaj, this, is, a, extremely, well, -, made, film, ., xxmaj, the, acting, ,, script, and, camera, -, work, are, all, first, -, rate, ., xxmaj, the, music, is, good, ,, too, ,, though, it, is, mostly, early, in, the, film, ,, when, things, are, still, relatively, cheery, ., xxmaj, there, are, no, really, superstars, in, the, cast, ,, though, several, faces, will, be, familiar, ., xxmaj, the, entire, cast, does, an, excellent, job, with, the, script, ., \n\n, xxmaj, but, it, is, hard, to, watch, ,, because, there, is, no, good, end, to, a, situation, like, the, one, ...]",462


In [None]:
#`np.int` was only an alias of the builtin `int`; it was deprecated in NumPy 1.20 and removed in 1.24
texts,lengths = df_tok['text'],df_tok['text_lengths'].values.astype(int)

In [None]:
splits = RandomSplitter()(texts)
#The vocab is built from the token counts returned by `tokenize_df`
tfm = Numericalize(make_vocab(count))
#Alternative kept for reference: pass `tfm` as two separate transform lists
#dsrc = DataSource(texts, [[tfm], [tfm]], filts=splits)
dsrc = DataSource(texts, [tfm], filts=splits)

In [None]:
#Show item 0 of the training subset (the output below is the text decoded from its ids)
dsrc.train.show_at(0)

xxbos xxmaj like most people i was xxunk when i heard the concept of this film , especially the " film makers were then attacked " aspect that the case seems to emphasize , what with the picture on the cover of the film makers being chased by an angry mob . 

 xxmaj then , to watch the film and discover , oh , what they mean by " the film makers were attacked " was some kids threw rocks at a sign and a number of people xxunk xxunk and said " someone should beat those two kids up . " xxmaj the picture on the cover , " the chase " as it were ? xxmaj total xxunk . xxmaj which i guess ties in with the theme of the film , lying and xxunk to satisfy vain , stupid children with more money and time then sense . 

 i have no idea what great truth the viewer is supposed to take away from this film . xxmaj it 's like xxmaj michael xxmaj moore 's " roger & xxmaj me " , but if " roger & xxmaj me " was xxmaj moore mocking the people of xxmaj xxunk . xxmaj it 's completely xxunk and totally inane . x

1

In [None]:
#`bs=16` applies to the first subset; since `val_bs` is not given, `dbunchify` uses twice that for the others
dbunch = LMDataLoader.dbunchify(dsrc, bs=16, seq_len=72)

In [None]:
#Display at most 6 rows of a batch
dbunch.show_batch(max_n=6)

Unnamed: 0,text
0,"xxbos … said a couple xxunk the movie theater just as i was entering to watch this . xxmaj hmm , not a good sign , but who knows ? xxmaj different xxunk for different folks , after all . xxmaj well , nope . xxmaj they were being kind . xxmaj godard has released work that is passionate ( contempt ) , entertaining ( band of xxmaj outsiders ) , sometimes"
1,"carol , for instance , did not notice him constantly and strangely xxunk his xxunk the screenplay should have been improved and provided more suspense as these theatrical moves telegraphed the plot far too early to the audience . xxbos xxmaj this sure is one comedy xxmaj i 'm not likely to forget for a while . \n\n xxmaj would n't normally bother to comment on this movie : it 's so"
2,… \n\n xxmaj they xxup stuck xxup it xxup to xxup the xxup man ! xxbos i just watched xxmaj xxunk xxmaj fisher on xxup bravo . xxmaj what an awesome movie and incredible young man . xxmaj this movie is a must see for anyone who is dealing with how to overcome childhood abuse and xxunk as an adult . xxmaj denzel xxmaj washington puts in an outstanding performance as well
3,". xxmaj this film inspired me to xxunk in xxmaj sam xxmaj mraovich 's school of xxmaj screen writing , xxmaj acting , xxmaj directing , xxmaj xxunk , xxmaj casting , xxmaj producing , xxmaj production xxmaj design and xxmaj real xxmaj estate . i just want to say , "" thank you , xxmaj mr . xxmaj mraovich . xxmaj thank you for bringing this creation into the world ."
4,"or refer to my ethnic background as "" pollack . "" xxmaj and i certainly do n't like like it when others do . xxmaj can i watch or listen to a xxunk conversation in which this term is used ? xxmaj you xxunk ! xxmaj but again this did n't seem to be the case here . xxmaj it just seemed so out of place . xxmaj xxunk , xxunk ."
5,"the best of the xxmaj johnsons ' films . xxmaj the shots of the xxmaj xxunk are interesting and have some historical value as evidence of what this environment looked like in 1930 . xxmaj the shots of the xxmaj xxunk and other natives are also interesting , although these suffer from the xxmaj johnsons ' xxunk to stage events in a manner that makes the natives look ' wild ' and"


In [None]:
#The input of a batch keeps the `TensorText` type
x,y = dbunch.one_batch()
test_eq(type(x), TensorText)

## Classification

In [None]:
#export
def pad_collate(samples, pad_idx=1, pad_first=False, backwards=False):
    "Function that collects `samples` and adds padding. Flips token order if `backwards`"
    # `samples`:   sequence of items whose element 0 is a sequence of token ids and element 1 a label
    # `pad_idx`:   id written in the padded positions
    # `pad_first`: put the padding before the tokens instead of after them
    # `backwards`: reverse every row; the padding then ends up on the side requested by `pad_first`
    # Returns `(res, labels)`: `res` is a long tensor of shape (number of samples, longest sample),
    # `labels` is built by `tensor` from the array of the samples' elements 1.
    # Raises `ValueError` (from `max`) when `samples` is empty.
    lens = [len(s[0]) for s in samples]
    max_len = max(lens)
    res = torch.full((len(samples), max_len), pad_idx, dtype=torch.long)
    # The rows are flipped at the end, so the padding has to be laid out on the opposite side first
    if backwards: pad_first = not pad_first
    for i,(s,n) in enumerate(zip(samples, lens)):
        # An empty sample is left as a row of padding. (A slice starting at `-n` would select the
        # whole row when `n` is 0, which is why explicit bounds are used below.)
        if n == 0: continue
        start = max_len-n if pad_first else 0
        res[i, start:start+n] = LongTensor(s[0])
    if backwards: res = res.flip(1)
    return res, tensor(np.array([s[1] for s in samples]))

In [None]:
splits = RandomSplitter()(range_of(df_tok))
#Two transform lists per row: the `text` attribute is numericalized, the `label` attribute is categorized
dsrc = DataSource(df_tok.itertuples(), filts=splits, tfms=[
    [attrgetter("text"), Numericalize(make_vocab(count))],
    [attrgetter("label"), Categorize()]])
#`pad_collate` builds the batches, padding the texts to the length of the longest one
dl = TfmdDL(dsrc.train, create_batch=pad_collate)

In [None]:
#Display at most 4 rows of a batch
dl.show_batch(max_n=4)

Unnamed: 0,text,category
0,"xxbos xxmaj about the baby : xxmaj why was n't big brother assuming he 'd be hungry for a bottle or some xxunk or a xxunk change ? xxmaj he should have been screaming non - stop after that many hours without care . xxmaj definitely stupid to take the baby from a safe place when he did n't need to . \n\n xxmaj and why was the road miraculously clear whenever anyone wanted to drive somewhere ? xxmaj did n't any xxunk trees fall on the roads and block them ? \n\n i ca n't imagine the cops at the xxunk not immediately following after any young person who would crash it , especially when they said it was dangerous to go there . \n\n xxmaj that being said , it was nice to have a movie children could safely watch , for a change .",negative
1,"xxbos xxmaj while i count myself as a fan of the xxmaj xxunk 5 television series , the original movie that introduced the series was a weak start . xxmaj although many of the elements that would later mature and become much more compelling in the series are there , the pace of xxmaj the xxmaj gathering is slow , the makeup somewhat inadequate , and the plot confusing . xxmaj worse , the characterization in the premiere episode is poor . xxmaj although the ratings xxunk shows that many fans are willing to overlook these problems , i remember xxmaj the xxmaj gathering almost turned me off off what soon grew into a spectacular series .",negative
2,"xxbos xxmaj first off , anyone looking for meaningful "" outcome xxunk "" cinema that packs some sort of social message with meaningful performances and soul searching dialog spoken by dedicated , xxunk , heartfelt xxunk , please leave now . xxmaj you are wasting your time and life is short , go see the new xxmaj xxunk xxmaj jolie movie , have a good cry , go out & buy a xxunk car or throw away your conflict xxunk if that will make you feel better , and leave us alone . \n\n xxmaj do n't let the door hit you on the way out either . xxup the xxup incredible xxup melting xxup man is a grade b minus xxunk horror epic shot in the xxunk of xxmaj oklahoma by a young , xxup tv friendly cast & crew , and concerns itself with an astronaut who is exposed to bizarre radiation effects , wakes up in a hospital , and finds that his body is xxunk on him as he sits there feeling like a xxunk . xxmaj the melting man is played by one xxmaj alex xxmaj rebar , who is recognizable for about the first four minutes of the film . xxmaj but once he starts xxunk ' with xxmaj rick xxmaj baker 's extraordinary special effects makeup he more resembles something you might find in a tin of xxunk before you xxunk off all the xxunk , xxunk xxunk of xxunk . \n\n xxmaj the film has zero xxunk and does not xxunk about with plot points : xxmaj there are a couple of scenes involving scientist types riding around on an absurd industrial xxunk machine who xxunk xxunk a few obligatory lines about the effects of radiation but the movie does not care , really . xxmaj it 's a freak show and a xxunk one at that with a decidedly sick sense of humor for those who can stomach it -- xxmaj one great laugh comes when the melting man stumbles upon a young girl in the forest and is so at a loss for what to do that one of his eyes pops out . xxmaj hilarious . 
\n\n xxmaj the "" hero "" of the film is played by xxmaj burr debenning , a fascinating character actor from the golden 1970s & 80s television scene who was sort of an early model for the xxmaj kevin xxmaj xxunk xxunk ; slightly twisted , xxunk , and one step ahead of most everyone in the room even if he looks confused . xxmaj he appeared just after this movie was made in a bizarre made for xxup tv anthology horror piece called xxup house xxup of xxup the xxup dead ( or xxup the xxup alien xxup zone ) that is xxunk as one of the finest movies ever made in xxmaj oklahoma , which is where i suspect this film was made as well . xxmaj the xxunk , cold looking rural xxunk landscapes are certainly the same , and the xxunk that one unfortunate fly fisher chooses for his afternoon of sport appears to be the same one that xxmaj cameron xxmaj mitchell fought off flying alien xxunk in xxup without xxup warning … which also had a sick sense of humor , a xxup tv friendly cast , and some pretty outrageous gore . i definitely sense at least an aesthetic connection between the three movies , as well as xxup the xxup silence xxup of xxup the xxup xxunk which is of no surprise considering that director xxmaj jonathan xxmaj demme is a part of xxup melting man 's cast . \n\n xxmaj essentially , as others have pointed out , this is a 1950s b movie plot xxunk for later 1970s era special effects & the inevitable boobs . xxmaj the movie it probably xxunk most of it 's ideas from is xxup phantom xxup from xxup space with xxmaj peter xxmaj graves as an astronaut who also returns to xxmaj earth after being exposed to xxunk radiation effects that set him off on a killing xxunk . xxmaj one of the things that i actually xxunk about the film is that absolutely no regard is given for the melting man 's xxunk : xxmaj he simply goes on a rampage and the movie 's drama comes from wondering if he 's going to fall to pieces before certain characters fall victim to his madness . 
xxmaj the budget for the film is also xxunk low and every dime spent on it is up there on the screen , xxmaj rick xxmaj baker 's disgusting effects getting the lion 's share of whatever was spent on this . \n\n xxmaj sick , disgusting fun best enjoyed with a crowd of friends and plenty of beer . xxmaj why ca n't people have made more movies like these ? \n\n 8 / 10",positive
3,"xxbos xxmaj this is one of the best xxmaj hong xxmaj kong ( action ) films around and it has a tense and exciting storyline as well as great fight scenes . xxmaj this xxmaj xxunk film has it all , xxmaj romance , xxmaj drama , xxmaj excitement and a great hero as well . xxmaj it is the only martial arts film that got me interested in the plot rather than just waiting for the fights . xxmaj xxunk xxunk xxmaj this is a must see ( see also xxmaj eastern xxmaj xxunk , xxmaj xxunk xxmaj express ( xxunk xxmaj xxunk is xxmaj xxunk ! ) , xxmaj xxunk xxmaj forever and xxmaj enter the xxmaj fat xxmaj dragon .",positive


## Export -

In [None]:
#hide
#Run the notebook export for all notebooks (the log below lists each converted notebook)
from local.notebook.export import notebook2script
notebook2script(all_fs=True)

Converted 00_test.ipynb.
Converted 01_core.ipynb.
Converted 01a_torch_core.ipynb.
Converted 01b_script.ipynb.
Converted 01c_dataloader.ipynb.
Converted 02_data_transforms.ipynb.
Converted 03_data_pipeline.ipynb.
Converted 05_data_core.ipynb.
Converted 06_data_source.ipynb.
Converted 07_vision_core.ipynb.
Converted 08_pets_tutorial.ipynb.
Converted 09_vision_augment.ipynb.
Converted 11_layers.ipynb.
Converted 11a_vision_models_xresnet.ipynb.
Converted 12_optimizer.ipynb.
Converted 13_learner.ipynb.
Converted 14_callback_schedule.ipynb.
Converted 15_callback_hook.ipynb.
Converted 16_callback_progress.ipynb.
Converted 17_callback_tracker.ipynb.
Converted 18_callback_fp16.ipynb.
Converted 19_callback_mixup.ipynb.
Converted 20_metrics.ipynb.
Converted 21_tutorial_imagenette.ipynb.
Converted 22_vision_learner.ipynb.
Converted 23_tutorial_transfer_learning.ipynb.
Converted 30_text_core.ipynb.
Converted 31_text_data.ipynb.
Converted 32_text_models_awdlstm.ipynb.
Converted 33_text_models_core.i