In [None]:
#export
from local.imports import *
from local.test import *
from local.core import *
from local.data.transform import *
from local.data.core import *
from local.data.source import *
from local.data.external import *
from local.data.pipeline import *
from local.text.core import *
from local.notebook.showdoc import show_doc

In [None]:
#default_exp text.data
#default_cls_lvl 3

# Text data

> Functions and transforms to help gather text data in a `DataSource`

## Numericalizing

In [None]:
#export
def make_vocab(count, min_freq=3, max_vocab=60000):
    """Create a vocab from `Counter` `count`, keeping items seen at least `min_freq` times.

    The special tokens in `defaults.text_spec_tok` are always placed first, the list is cut
    to `max_vocab` entries, and 'xxfake' entries are then appended so that the returned
    length is a multiple of 8 (so the result can be slightly longer than `max_vocab`).
    This cell is exported because `Numericalize.setup` calls `make_vocab`.
    """
    vocab = [o for o,c in count.most_common(max_vocab) if c >= min_freq]
    for o in reversed(defaults.text_spec_tok): #Make sure all special tokens are in the vocab
        if o in vocab: vocab.remove(o)
        vocab.insert(0, o)
    vocab = vocab[:max_vocab]
    # `-len % 8` is 0 when the length is already aligned, so no block of 8 useless entries is added
    return vocab + ['xxfake'] * (-len(vocab) % 8)

In [None]:
# Token frequencies: a=4, b=2, c=2, d=1
count = Counter(['a', 'a', 'a', 'a', 'b', 'b', 'c', 'c', 'd'])
# With the default min_freq=3 only 'a' is kept, next to the special tokens and the 'xxfake' padding
test_eq(set(make_vocab(count)), set(defaults.text_spec_tok + 'a xxfake'.split()))
# The returned length is a multiple of 8
test_eq(len(make_vocab(count))%8, 0)
# min_freq=1 keeps every token
test_eq(set(make_vocab(count, min_freq=1)), set(defaults.text_spec_tok + 'a b c d xxfake'.split()))
# max_vocab is applied after the special tokens are inserted, so the rarest token 'd' is cut
test_eq(set(make_vocab(count,max_vocab=12, min_freq=1)), set(defaults.text_spec_tok + 'a b c xxfake'.split()))

In [None]:
# export
def _vocab_o2i(vocab):
    "Token -> index lookup for `vocab`; 'xxfake' padding entries are skipped and unknown tokens default to 0"
    return defaultdict(int, {v:k for k,v in enumerate(vocab) if v != 'xxfake'})

class Numericalize(ItemTransform):
    "Reversible transform of tokenized texts to numericalized ids"
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, sep=' '):
        self.vocab,self.min_freq,self.max_vocab,self.sep = vocab,min_freq,max_vocab,sep
        # Same lookup whether the vocab is supplied here or built later in `setup`
        self.o2i = None if vocab is None else _vocab_o2i(vocab)

    def setup(self, dsrc):
        "Build `vocab` and `o2i` from the token counts of `dsrc` (its `train` attribute if it has one) unless a vocab was given"
        if dsrc is None: return
        if self.vocab is None:
            dsrc = getattr(dsrc,'train',dsrc)
            count = Counter(p for o in dsrc for p in o.split(self.sep))
            self.vocab = make_vocab(count, min_freq=self.min_freq, max_vocab=self.max_vocab)
            self.o2i = _vocab_o2i(self.vocab)

    # `.get` rather than indexing: indexing a defaultdict stores every unknown token it is asked for,
    # so `o2i` would keep growing with each out-of-vocab token that gets encoded
    def encodes(self, o):      return [self.o2i.get(o_, 0) for o_ in o.split(self.sep)]
    # Ids whose token is `PAD` are dropped when rebuilding the text
    def decodes(self, o)->Str: return self.sep.join([self.vocab[o_] for o_ in o if self.vocab[o_] != PAD])

In [None]:
# With min_freq=1 every token seen in the two texts enters the vocab
num = Numericalize(min_freq=1, sep=' ')
num.setup(L('This is an example of text', 'this is another text'))
test_eq(set(num.vocab), set(defaults.text_spec_tok + 'This is an example of text this another xxfake'.split()))
test_eq(len(num.vocab)%8, 0)
start = 'This is an example of text'
t = num(start)
# 'is' and 'text' occur twice, so they get the lowest ids after the special tokens (9 and 10)
test_eq(t, [11, 9, 12, 13, 14, 10])
# Decoding restores the original string
test_eq(num.decode(t), start)

In [None]:
# With min_freq=2 only 'is' and 'text' (each seen twice) make it into the vocab
num = Numericalize(min_freq=2, sep=' ')
num.setup(L('This is an example of text', 'this is another text'))
test_eq(set(num.vocab), set(defaults.text_spec_tok + 'is text xxfake'.split()))
test_eq(len(num.vocab)%8, 0)
# `start` is the sentence defined in the previous cell
t = num(start)
# Every token missing from the vocab maps to id 0, which decodes to UNK
test_eq(t, [0, 9, 0, 0, 0, 10])
test_eq(num.decode(t), f'{UNK} is {UNK} {UNK} {UNK} text')

## LM_Dataset -

In [None]:
#export
from IPython.display import display, HTML

class TensorText(TensorBase):
    "`TensorBase` subclass that builds per-sample display contexts and renders them as an HTML table"
    def get_ctxs(self, max_samples=10, **kwargs):
        "Return one row holding an `index` cell per sample, for at most `max_samples` samples"
        shown = min(self.shape[0], max_samples)
        frame = pd.DataFrame({'index': range(shown)})
        return [frame.iloc[row] for row in range(shown)]

    def display(self, ctxs):
        "Show `ctxs` as an HTML table, leaving out the DataFrame index"
        table = pd.DataFrame(ctxs)
        # `display` here is the IPython function imported at module level, not this method
        display(HTML(table.to_html(index=False)))

In [None]:
#export
def apply_coords(f, *dims):
    "Build the coordinate grid of shape `dims` and call `f` on the coordinate vector of every cell"
    axes = [range(d) for d in dims]
    # 'ij' indexing keeps the grid axes in the same order as `dims`
    grids = np.meshgrid(*axes, indexing='ij')
    # Stacked on axis 0, so each 1-D slice along that axis is one cell's coordinates
    coords = np.stack(grids)
    return np.apply_along_axis(f, 0, coords)

In [None]:
# Each cell of the 2x3x4 result holds the string form of its own coordinate vector
apply_coords(str,2,3,4)

array([[['[0 0 0]', '[0 0 1]', '[0 0 2]', '[0 0 3]'],
        ['[0 1 0]', '[0 1 1]', '[0 1 2]', '[0 1 3]'],
        ['[0 2 0]', '[0 2 1]', '[0 2 2]', '[0 2 3]']],

       [['[1 0 0]', '[1 0 1]', '[1 0 2]', '[1 0 3]'],
        ['[1 1 0]', '[1 1 1]', '[1 1 2]', '[1 1 3]'],
        ['[1 2 0]', '[1 2 1]', '[1 2 2]', '[1 2 3]']]], dtype='<U7')

In [None]:
class LM_Sampler(Sampler):
    "Sampler that visits the sequences of `ds` so that each of the `bs` batch rows walks its own contiguous run of sequences"
    def __init__(self, ds):
        self.ds = ds
        self.bs = ds.bs
        # Number of consecutive sequences covered by each batch row
        self.spb = len(ds)//ds.bs

    def __len__(self): return len(self.ds)

    def _seq_idx(self, i):
        "Sequence index for sampling position `i`: row `i%bs` of the batch, at step `i//bs` of that row's run"
        step,row = divmod(i, self.bs)
        return row*self.spb + step

    def __iter__(self):
        positions = L.range(self.ds)
        return (self._seq_idx(i) for i in positions)

In [None]:
#export
class LM_Dataset(BaseDS):
    "Dataset that treats the documents of `ds` as one token stream and serves `seq_len`-token (input, target) pairs"
    def __init__(self, ds, lens=None, bs=64, seq_len=72, shuffle=False, cache=2, as_tensor=True):
        super().__init__(ReindexCollection(ds, cache=cache))
        self.bs = bs
        self.seq_len = seq_len
        self.shuffle = shuffle
        self.as_tensor = as_tensor
        # Document lengths are measured on the first element of each item unless given
        if lens is None: lens = [len(doc[0]) for doc in ds]
        # Shares `idxs` with `self.ds` so lengths are looked up in the same order as the documents
        self.lens = ReindexCollection(lens, idxs=self.ds.idxs)
        # The "-1" is to allow for final label
        usable = sum(lens)-1
        self.n = round_multiple(usable, bs*seq_len, round_down=True)
        self.reset()

    def __len__(self): return self.n//(self.seq_len)

    def reset(self):
        "Optionally reshuffle the documents, then recompute the cumulative document lengths"
        if self.shuffle: self.ds.shuffle()
        self.cum_lens = np.cumsum(self.lens)

    def __getitem__(self, seq):
        "Return the `seq`-th pair: `seq_len` tokens and the same window shifted forward by one token"
        first = seq*self.seq_len
        def _token(pos):
            # Document holding stream position `pos`
            doc = np.searchsorted(self.cum_lens, pos+1)
            # `pos - cum_lens[doc]` is negative, so it indexes the document from its end
            return self.ds[doc][0][pos-self.cum_lens[doc]]
        rows = [[_token(first + shift + step) for step in range(self.seq_len)] for shift in range(2)]
        if self.as_tensor: return tuple(TensorText(tensor(row)) for row in rows)
        return tuple(rows)

In [None]:
# Tiny batch size and sequence length keep the expected batches below readable
bs,sl = 4,3
# Six documents holding the consecutive ids 0..24; each is wrapped in a 1-tuple because LM_Dataset reads `item[0]`
ints = [(o,) for o in [[0,1,2,3,4],[5,6,7,8,9,10],[11,12,13,14,15,16,17,18],[19,20],[21,22,23],[24]]]

In [None]:
# 25 tokens, minus 1 for the final label, rounded down to a multiple of bs*seq_len=12 leaves 24 tokens, i.e. 8 sequences
t = LM_Dataset(ints, bs=bs, seq_len=sl)
dl = DataLoader(t, batch_size=bs, sampler=LM_Sampler(t), collate_fn=noop)
# Each target is its input shifted by one token; row r of the second batch continues row r of the first
test_eq(list(dl),
    [[[tensor([0, 1, 2]), tensor([1, 2, 3])],
      [tensor([6, 7, 8]), tensor([7, 8, 9])],
      [tensor([12, 13, 14]), tensor([13, 14, 15])],
      [tensor([18, 19, 20]), tensor([19, 20, 21])]],
     [[tensor([3, 4, 5]), tensor([4, 5, 6])],
      [tensor([9, 10, 11]), tensor([10, 11, 12])],
      [tensor([15, 16, 17]), tensor([16, 17, 18])],
      [tensor([21, 22, 23]), tensor([22, 23, 24])]]])

## Integration example

In [None]:
# Fetch the IMDB sample (presumably downloaded and extracted by `untar_data`) and read its texts.csv
path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/'texts.csv')
df.head(2)

Unnamed: 0,label,text,is_valid
0,negative,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",False
1,positive,"This is a extremely well-made film. The acting, script and camera-work are all first-rate. The music is good, too, though it is mostly early in the film, when things are still relatively cheery. There are no really superstars in the cast, though several faces will be familiar. The entire cast does an excellent job with the script.<br /><br />But it is hard to watch, because there is no good end to a situation like the one presented. It is now fashionable to blame the British for setting Hindus and Muslims against each other, and then cruelly separating them into two countries. There is som...",False


In [None]:
# Tokenize the 'text' column; `count` is the token counter passed to `make_vocab` in the cells below
df_tok,count = tokenize_df(df, 'text')
df_tok.head(2)

Unnamed: 0,label,is_valid,text,text_lengths
0,negative,False,"xxbos xxmaj un - bleeping - believable ! xxmaj meg xxmaj ryan does n't even look her usual pert lovable self in this , which normally makes me forgive her shallow ticky acting schtick . xxmaj hard to believe she was the producer on this dog . xxmaj plus xxmaj kevin xxmaj kline : what kind of suicide trip has his career been on ? xxmaj whoosh … xxmaj banzai xxrep 3 ! xxmaj finally this was directed by the guy who did xxmaj big xxmaj chill ? xxmaj must be a replay of xxmaj jonestown - hollywood style . w xxrep 3 o xxrep 3 f !",108
1,positive,False,"xxbos xxmaj this is a extremely well - made film . xxmaj the acting , script and camera - work are all first - rate . xxmaj the music is good , too , though it is mostly early in the film , when things are still relatively cheery . xxmaj there are no really superstars in the cast , though several faces will be familiar . xxmaj the entire cast does an excellent job with the script . \n\n xxmaj but it is hard to watch , because there is no good end to a situation like the one presented . xxmaj it is now fashionable to blame the xxmaj british for setting xxmaj hindus and xxmaj muslims against...",462


In [None]:
# Builtin `int` instead of `np.int`: the alias was deprecated in NumPy 1.20 and removed in 1.24
texts,lengths = df_tok['text'].values,df_tok['text_lengths'].values.astype(int)

In [None]:
textlst = L(texts, use_list=True)
# NOTE(review): RandomSplitter() is called without a seed here, so the split presumably changes between runs — confirm
splits = RandomSplitter()(textlst)
# The vocab comes from the counter returned by `tokenize_df`, so Numericalize does not build its own in setup
dsrc = DataSource(textlst, type_tfms=[Numericalize(make_vocab(count))], filts=splits)

In [None]:
# Decode the first item back to text; tokens that are not in the vocab come back as xxunk (see output)
dsrc.decode_at(0)

("xxbos xxmaj un - xxunk - believable ! xxmaj meg xxmaj ryan does n't even look her usual xxunk lovable self in this , which normally makes me forgive her shallow xxunk acting xxunk . xxmaj hard to believe she was the producer on this dog . xxmaj plus xxmaj kevin xxmaj kline : what kind of suicide trip has his career been on ? xxmaj xxunk … xxmaj xxunk xxrep 3 ! xxmaj finally this was directed by the guy who did xxmaj big xxmaj xxunk ? xxmaj must be a replay of xxmaj jonestown - hollywood style . w xxrep 3 o xxrep 3 f !",)

In [None]:
bs = 16
# Lengths are restricted to splits[0] so they line up with `dsrc.train` (presumably the first split — confirm)
ds = LM_Dataset(dsrc.train, lens=lengths[splits[0]], bs=bs)
# The sampler orders the sequences so each batch row reads its own contiguous run of sequences
samp = LM_Sampler(ds)
dl = TfmdDL(ds, bs=bs, sampler=samp, num_workers=2)

In [None]:
# One batch: x has shape (bs, seq_len) = (16, 72), 72 being LM_Dataset's default seq_len
x,y = dl.one_batch()
x.shape

torch.Size([16, 72])

In [None]:
# Decode the first row of the batch back to text; the row is wrapped in a 1-tuple, like the dataset items
ds.decode((x[0],))

('xxbos xxmaj this is one of my all - time favorite films , and while it may move too slowly for some , it \'s well worth seeing . a corporate lawyer ( richard xxmaj chamberlain ) is dragged into a case involving " city " xxmaj aborigines , and this is no ordinary case . xxup ok , a man has died but it was n\'t exactly a normal killing .',)

In [None]:
# Show decoded rows of a batch as a table (see output: each row is 72 tokens taken from the concatenated reviews)
dl.show_batch()

index,text
0,"xxbos xxmaj this is one of my all - time favorite films , and while it may move too slowly for some , it 's well worth seeing . a corporate lawyer ( richard xxmaj chamberlain ) is dragged into a case involving "" city "" xxmaj aborigines , and this is no ordinary case . xxup ok , a man has died but it was n't exactly a normal killing ."
1,"think everyone xxunk my points in here . xxmaj in this so called documentary is very little truth , and my advice to everyone is : xxmaj inform yourself before watching this movie . xxmaj after that , you will only laugh at all pathetic accusations . \n\n xxmaj watch real , neutral documentaries about death of xxmaj yugoslavia . xxbos xxmaj nick xxmaj cage is xxmaj xxunk xxmaj raines , a"
2,perhaps my expectations would have been met had this been in the comedy section . xxmaj i 'm all for low - budget xxmaj indie horror but this one takes the crap - cake . xxmaj give xxmaj five xxmaj across the xxmaj eyes ( or xxup fate ; get it ? ) a pass . \n\n xxunk gives xxup fate 1 xxunk out of five / xxbos xxmaj xxunk xxup b.
3,"xxunk , in a way that is timeless . xxmaj the entire drama xxunk solely on the shoulders of xxmaj mr . xxmaj bronson and xxmaj ms . xxmaj xxunk , who do not disappoint . ( may they both rest in peace . ) \n\n a true classic . xxbos xxmaj not one of xxmaj keaton 's best efforts , this was perhaps a xxunk attempt to revenge himself on the"
4,"much going in to this , but still came away disappointed . xxmaj this was my least favorite xxmaj xxunk production i have seen . i thought it was supposed to be a comedy , but i only xxunk at 3 or 4 jokes . xxmaj is it really a funny gag to see a fat guy eating donuts and falling down over and over ? xxmaj what was up with the"
5,but in xxmaj lost you could predict xxup nothing . xxmaj every thing was so surprisingly stunning and it really was a mystery not because it has so many secrets but because there was nothing like it before everything was so great . i literally became xxunk to it . xxup lost is a classic work of art . xxmaj it gives you something to look forward to every week . xxmaj
6,"xxunk offering , xxmaj flik accidentally xxunk it into a stream just before the grasshoppers arrive to get it ! xxmaj the xxunk leader , xxmaj hopper , decides to give them a second chance to gather food and have it ready by the end of the season , but they will have to double their offering ! xxmaj flik suggests to the colony 's royal xxunk that he goes and finds"
7,"films should be recommended . xxmaj for me , the bottom line is always , was i entertained ? xxmaj did i have a good time with this movie ? xxmaj and here the answer to both was "" yes . "" xxmaj the first in the series is also the most raw . xxmaj it opens with some kind of accident at a nuclear facility and people melt down or something"
8,"and lonely and had a lot of love to give the right man . xxmaj she would probably end up making an awesome mom too . \n\n i could see in the future , a house full of xxmaj loretta and xxmaj xxunk 's loud , screaming happy kids and xxmaj rose and xxmaj xxunk enjoying every minute of it . xxbos xxmaj there is no relation at all between xxmaj fortier"
9,"' hanzo ' series . xxmaj if they had made 20 sequels more , i would have happily watched them all ! xxmaj the entire xxmaj hanzo series is brilliant , and while this third part is a bit inferior compared to its predecessors , it is definitely a must - see for all lovers of cult - cinema ! xxmaj oh how i wish they had made more sequels ! xxbos"


## Classification

In [None]:
def pad_collate(samples, pad_idx=1, pad_first=True, backwards=False):
    """Collate `samples` of (token ids, label) into a padded batch, flipping token order if `backwards`.

    Every row is filled with `pad_idx` up to the longest sample. Padding goes before the
    tokens when `pad_first` is true, after them otherwise. With `backwards` the rows are
    reversed at the end, so the padding side is swapped beforehand to keep it where `pad_first` asks.
    Returns a `TensorText` of shape (len(samples), max_len) and a tensor of the labels.
    """
    lens = [len(s[0]) for s in samples]
    max_len = max(lens)
    res = torch.zeros(len(samples), max_len).long() + pad_idx
    if backwards: pad_first = not pad_first
    for i,(s,n) in enumerate(zip(samples, lens)):
        # Explicit start offset: a `-n` slice start would select the whole row when n == 0
        start = max_len-n if pad_first else 0
        res[i,start:start+n] = LongTensor(s[0])
    if backwards: res = res.flip(1)
    return TensorText(res), tensor(np.array([s[1] for s in samples]))

In [None]:
# NOTE(review): unseeded RandomSplitter, so the split presumably differs between runs — confirm
splits = RandomSplitter()(range_of(df_tok))
# Two transform pipelines per row: numericalized text and categorized label
dsrc = DataSource(df_tok.itertuples(), filts=splits, type_tfms=[
    [attrgetter("text"), Numericalize(make_vocab(count))],
    [attrgetter("label"), Categorize()]])
# pad_collate pads the variable-length texts so they can be stacked into one batch tensor
dl = TfmdDL(dsrc, collate_fn=TfmdCollate(collate_fn=pad_collate))

In [None]:
# Show at most four decoded (text, category) samples from a batch
dl.show_batch(max_samples=4)

index,text,category
0,"xxbos xxmaj un - xxunk - believable ! xxmaj meg xxmaj ryan does n't even look her usual xxunk lovable self in this , which normally makes me forgive her shallow xxunk acting xxunk . xxmaj hard to believe she was the producer on this dog . xxmaj plus xxmaj kevin xxmaj kline : what kind of suicide trip has his career been on ? xxmaj xxunk … xxmaj xxunk xxrep 3 ! xxmaj finally this was directed by the guy who did xxmaj big xxmaj xxunk ? xxmaj must be a replay of xxmaj jonestown - hollywood style . w xxrep 3 o xxrep 3 f !",negative
1,"xxbos xxmaj this is a extremely well - made film . xxmaj the acting , script and camera - work are all first - rate . xxmaj the music is good , too , though it is mostly early in the film , when things are still relatively xxunk . xxmaj there are no really xxunk in the cast , though several faces will be familiar . xxmaj the entire cast does an excellent job with the script . \n\n xxmaj but it is hard to watch , because there is no good end to a situation like the one presented . xxmaj it is now xxunk to blame the xxmaj british for setting xxmaj hindus and xxmaj muslims against each other ...",positive
2,"xxbos xxmaj every once in a long while a movie will come along that will be so awful that i feel compelled to warn people . xxmaj if i labor all my days and i can save but one soul from watching this movie , how great will be my joy . \n\n xxmaj where to begin my discussion of pain . xxmaj for xxunk , there was a musical montage every five minutes . xxmaj there was no character development . xxmaj every character was a stereotype . xxmaj we had xxunk guy , fat guy who eats donuts , goofy foreign guy , etc . xxmaj the script felt as if it were being written as the movie was being shot . xxm...",negative
3,"xxbos xxmaj name just says it all . i watched this movie with my dad when it came out and having served in xxmaj xxunk he had great admiration for the man . xxmaj the disappointing thing about this film is that it only concentrate on a short period of the man 's life - interestingly enough the man 's entire life would have made such an epic bio - xxunk that it is staggering to imagine the cost for production . \n\n xxmaj some posters xxunk to the flawed xxunk about the man , which are cheap shots . xxmaj the theme of the movie "" duty , xxmaj honor , xxmaj country "" are not just mere words ...",positive
