In [None]:
#export
from local.imports import *
from local.test import *
from local.core import *
from local.data.transform import *
from local.data.core import *
from local.data.source import *
from local.data.external import *
from local.data.pipeline import *
from local.text.core import *
from local.notebook.showdoc import show_doc

In [None]:
#default_exp text.data
#default_cls_lvl 3

# Text data

> Functions and transforms to help gather text data in a `DataSource`

## Numericalizing

In [None]:
def make_vocab(count, min_freq=3, max_vocab=60000):
    """Create a vocab of at most `max_vocab` tokens from `Counter` `count`, keeping items seen at least `min_freq` times.

    Every token of `defaults.text_spec_tok` is placed at the front (in that order), whether or not it was counted.
    The result is then padded with 'xxfake' entries up to the next multiple of 8.
    """
    vocab = [o for o,c in count.most_common(max_vocab) if c >= min_freq]
    for o in reversed(defaults.text_spec_tok): #Make sure all special tokens are in the vocab
        if o in vocab: vocab.remove(o)
        vocab.insert(0, o)
    vocab = vocab[:max_vocab]
    # `-len % 8` is 0 when the length is already a multiple of 8; `8 - len % 8` would append 8 useless entries
    return vocab + ['xxfake'] * (-len(vocab) % 8)

In [None]:
# Toy counts: 'a' x4, 'b' x2, 'c' x2, 'd' x1
count = Counter(['a', 'a', 'a', 'a', 'b', 'b', 'c', 'c', 'd'])
# With the default min_freq=3 only 'a' is kept, next to the special tokens and the 'xxfake' padding
test_eq(set(make_vocab(count)), set(defaults.text_spec_tok + 'a xxfake'.split()))
# The vocab length is always a multiple of 8
test_eq(len(make_vocab(count))%8, 0)
# min_freq=1 keeps every counted token
test_eq(set(make_vocab(count, min_freq=1)), set(defaults.text_spec_tok + 'a b c d xxfake'.split()))
# max_vocab=12 truncates the vocab before it is padded, so the rarest token 'd' is dropped
test_eq(set(make_vocab(count,max_vocab=12, min_freq=1)), set(defaults.text_spec_tok + 'a b c xxfake'.split()))

In [None]:
# export
class Numericalize(ItemTransform):
    "Reversible transform of tokenized texts to numericalized ids"
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, sep=' '):
        self.vocab,self.min_freq,self.max_vocab,self.sep = vocab,min_freq,max_vocab,sep
        # Leave the 'xxfake' padding entries out of the lookup, exactly as `setup` does, so a vocab passed in
        # (e.g. one built by `make_vocab`) behaves the same as one built here. Unknown tokens map to id 0.
        self.o2i = None if vocab is None else defaultdict(int, {v:k for k,v in enumerate(vocab) if v != 'xxfake'})

    def setup(self, dsrc):
        "Build `vocab` and `o2i` from the tokens of `dsrc` (or of `dsrc.train` when it exists) unless a vocab was given"
        if dsrc is None: return
        if self.vocab is None:
            dsrc = getattr(dsrc,'train',dsrc)
            # Each item is a string of tokens joined by `sep`
            count = Counter(p for o in dsrc for p in o.split(self.sep))
            self.vocab = make_vocab(count, min_freq=self.min_freq, max_vocab=self.max_vocab)
            self.o2i = defaultdict(int, {v:k for k,v in enumerate(self.vocab) if v != 'xxfake'})

    # Split on `sep` and look every token up; tokens missing from `o2i` get the default id 0
    def encodes(self, o):      return [self.o2i[o_] for o_ in o.split(self.sep)]
    # Map ids back to tokens, dropping `PAD`, and join them with `sep`
    def decodes(self, o)->Str: return self.sep.join([t for t in (self.vocab[o_] for o_ in o) if t != PAD])

In [None]:
# Build the vocab from two short texts, keeping every token (min_freq=1)
num = Numericalize(min_freq=1, sep=' ')
num.setup(L('This is an example of text', 'this is another text'))
test_eq(set(num.vocab), set(defaults.text_spec_tok + 'This is an example of text this another xxfake'.split()))
test_eq(len(num.vocab)%8, 0)
start = 'This is an example of text'
# Encode, then check both the ids and that decoding gives back the original string
t = num(start)
test_eq(t, [11, 9, 12, 13, 14, 10])
test_eq(num.decode(t), start)

In [None]:
# With min_freq=2 only 'is' and 'text' (each seen twice) make it into the vocab
num = Numericalize(min_freq=2, sep=' ')
num.setup(L('This is an example of text', 'this is another text'))
test_eq(set(num.vocab), set(defaults.text_spec_tok + 'is text xxfake'.split()))
test_eq(len(num.vocab)%8, 0)
# `start` comes from the previous cell; tokens missing from the vocab are encoded as id 0
t = num(start)
test_eq(t, [0, 9, 0, 0, 0, 10])
# Id 0 decodes to UNK, so the round trip is lossy for out-of-vocab tokens
test_eq(num.decode(t), f'{UNK} is {UNK} {UNK} {UNK} text')

## LM_Dataset -

In [None]:
#export
from IPython.display import display, HTML

class TensorText(TensorBase):
    "`TensorBase` subclass that supplies table-row contexts and an HTML table display"
    def get_ctxs(self, max_samples=10, **kwargs):
        "Return one row (holding its `index`) per sample, for at most `max_samples` samples"
        n = min(self.shape[0], max_samples)
        frame = pd.DataFrame({'index': range(n)})
        return [frame.iloc[row] for row in range(n)]

    def display(self, ctxs):
        "Show `ctxs` as an HTML table, without the DataFrame index column"
        table = pd.DataFrame(ctxs)
        display(HTML(table.to_html(index=False)))

In [None]:
#export
def apply_coords(f, *dims):
    "Build a grid of shape `dims` and fill each cell with `f` applied to that cell's coordinate vector"
    axes = [range(d) for d in dims]
    # 'ij' indexing keeps the grid axes in the same order as `dims`
    grids = np.meshgrid(*axes, indexing='ij')
    # Axis 0 of `coords` holds the coordinate vector of every cell
    coords = np.stack(grids)
    return np.apply_along_axis(f, 0, coords)

In [None]:
# Demo: every cell of the 2x3x4 grid holds the string form of its own coordinate vector
apply_coords(str,2,3,4)

array([[['[0 0 0]', '[0 0 1]', '[0 0 2]', '[0 0 3]'],
        ['[0 1 0]', '[0 1 1]', '[0 1 2]', '[0 1 3]'],
        ['[0 2 0]', '[0 2 1]', '[0 2 2]', '[0 2 3]']],

       [['[1 0 0]', '[1 0 1]', '[1 0 2]', '[1 0 3]'],
        ['[1 1 0]', '[1 1 1]', '[1 1 2]', '[1 1 3]'],
        ['[1 2 0]', '[1 2 1]', '[1 2 2]', '[1 2 3]']]], dtype='<U7')

In [None]:
class LM_Sampler(Sampler):
    """Order the sequences of `ds` so that row `r` of consecutive batches reads consecutive sequences.

    The dataset is treated as `bs` contiguous chunks of `spb` sequences each; the i-th index
    yielded is taken from chunk `i % bs` at offset `i // bs`.
    """
    def __init__(self, ds): self.ds,self.bs,self.spb = ds,ds.bs,len(ds)//ds.bs
    # Exactly one index is yielded per sequence; data loaders need this to report their own length
    def __len__(self): return len(self.ds)
    def __iter__(self): return ((i%self.bs)*self.spb + (i//self.bs) for i in range(len(self.ds)))

In [None]:
#export
class LM_Dataset(BaseDS):
    "Dataset whose item `seq` is an (input, target) pair of `seq_len` consecutive tokens read across the documents of `ds`"
    def __init__(self, ds, lens=None, bs=64, seq_len=72, shuffle=False, cache=2):
        super().__init__(ReindexCollection(ds, cache=cache))
        self.bs,self.seq_len,self.shuffle = bs,seq_len,shuffle
        # Token count of each document, measured on the first element of each item unless supplied by the caller
        if lens is None: lens = [len(o[0]) for o in ds]
        # Lengths share the `idxs` of the wrapped documents
        # NOTE(review): presumably so that shuffling the documents keeps the lengths aligned - confirm
        self.lens = ReindexCollection(lens, idxs=self.ds.idxs)
        # The "-1" is to allow for final label
        self.n = round_multiple(sum(lens)-1, bs*seq_len, round_down=True)
        self.reset()
        
    # Number of `seq_len`-token sequences in the dataset
    def __len__(self): return self.n//(self.seq_len)
    def reset(self):
        "Shuffle the documents if `shuffle` is set, then recompute the cumulative document lengths"
        if self.shuffle: self.ds.shuffle()
        self.cum_lens = np.cumsum(self.lens)
    
    def __getitem__(self, seq):
        def _f(o):
            # `o` is (row, col): row 0 is the input and row 1 the target, so the target is shifted by one token
            tokidx = seq*self.seq_len + o[0] + o[1]
            # First document whose cumulative length is greater than `tokidx`
            docidx = np.searchsorted(self.cum_lens, tokidx+1)
            # `tokidx-self.cum_lens[docidx]` is negative, i.e. the position counted from the end of that document
            return self.ds[docidx][0][tokidx-self.cum_lens[docidx]]
        res = apply_coords(_f, 2, self.seq_len)
        return tuple(TensorText(tensor(o)) for o in res)

In [None]:
# Map each lowercase letter to its position in the alphabet
rev = {v:k for k,v in enumerate(string.ascii_lowercase)}
# One 1-tuple per word, holding the ids of that word's letters
ints = [([rev[o_] for o_ in o],) for o in 'abcde fghijk lmnopqrs tu vwx y'.split()]

In [None]:
# Small batch size and sequence length so the resulting batches are easy to read
bs,sl=(4,3)
t = LM_Dataset(ints, bs=bs, seq_len=sl)
s = LM_Sampler(t)

In [None]:
# Each batch is an (input, target) pair; the target is the input shifted by one token
dl = TfmdDL(t, bs=bs, sampler=s)
list(dl)

[(tensor([[ 0,  1,  2],
          [ 6,  7,  8],
          [12, 13, 14],
          [18, 19, 20]]), tensor([[ 1,  2,  3],
          [ 7,  8,  9],
          [13, 14, 15],
          [19, 20, 21]])), (tensor([[ 3,  4,  5],
          [ 9, 10, 11],
          [15, 16, 17],
          [21, 22, 23]]), tensor([[ 4,  5,  6],
          [10, 11, 12],
          [16, 17, 18],
          [22, 23, 24]]))]

In [None]:
# Grab a single (input, target) batch
x,y = dl.one_batch()

In [None]:
bs,sl = 4,3
# Same words as the `ints` fixture, kept as raw strings instead of letter ids
txts = [(o,) for o in 'abcde fghijk lmnopqrs tu vwx y'.split()]

In [None]:
# Display the raw-string fixture
txts

[('abcde',), ('fghijk',), ('lmnopqrs',), ('tu',), ('vwx',), ('y',)]

In [None]:
#t = LM_Dataset(txts, bs=bs, seq_len=sl)
#test_eq(list(t),
#        [[[['a','b','c'], ['g','h','i'], ['m','n','o'], ['s', 't', 'u']],
#          [['b','c','d'], ['h','i','j'], ['n','o','p'], ['t','u','v']]],
#         [[['d','e','f'], ['j','k','l'], ['p','q','r'], ['v','w','x']],
#          [['e','f','g'], ['k','l','m'], ['q','r','s'], ['w','x','y']]]])

In [None]:
#t = LM_Dataset(txts[:-1], bs=bs, seq_len=sl)
#test_eq(list(t),
#        [[[['a','b','c'], ['d','e','f'], ['g','h','i'], ['j','k','l']],
#          [['b','c','d'], ['e','f','g'], ['h','i','j'], ['k','l','m']]]])

## Integration example

In [None]:
# Locate the IMDB sample through `untar_data` and load its reviews into a DataFrame
path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/'texts.csv')
df.head(2)

Unnamed: 0,label,text,is_valid
0,negative,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",False
1,positive,"This is a extremely well-made film. The acting, script and camera-work are all first-rate. The music is good, too, though it is mostly early in the film, when things are still relatively cheery. There are no really superstars in the cast, though several faces will be familiar. The entire cast does an excellent job with the script.<br /><br />But it is hard to watch, because there is no good end to a situation like the one presented. It is now fashionable to blame the British for setting Hindus and Muslims against each other, and then cruelly separating them into two countries. There is som...",False


In [None]:
# Tokenize the 'text' column; `count` is passed to `make_vocab` further down to build the vocab
df_tok,count = tokenize_df(df, 'text')
df_tok.head(2)

Unnamed: 0,label,is_valid,text,text_lengths
0,negative,False,"xxbos xxmaj un - bleeping - believable ! xxmaj meg xxmaj ryan does n't even look her usual pert lovable self in this , which normally makes me forgive her shallow ticky acting schtick . xxmaj hard to believe she was the producer on this dog . xxmaj plus xxmaj kevin xxmaj kline : what kind of suicide trip has his career been on ? xxmaj whoosh … xxmaj banzai xxrep 3 ! xxmaj finally this was directed by the guy who did xxmaj big xxmaj chill ? xxmaj must be a replay of xxmaj jonestown - hollywood style . w xxrep 3 o xxrep 3 f !",108
1,positive,False,"xxbos xxmaj this is a extremely well - made film . xxmaj the acting , script and camera - work are all first - rate . xxmaj the music is good , too , though it is mostly early in the film , when things are still relatively cheery . xxmaj there are no really superstars in the cast , though several faces will be familiar . xxmaj the entire cast does an excellent job with the script . \n\n xxmaj but it is hard to watch , because there is no good end to a situation like the one presented . xxmaj it is now fashionable to blame the xxmaj british for setting xxmaj hindus and xxmaj muslims against...",462


In [None]:
# Use the builtin `int`: `np.int` was only a deprecated alias for it and was removed in NumPy 1.24
texts,lengths = df_tok['text'].values,df_tok['text_lengths'].values.astype(int)

In [None]:
# Random split over the texts
splits = RandomSplitter()(L(t for t in texts))
# Numericalize each text with a vocab built from the tokenizer's counts
dsrc = DataSource(L(t for t in texts), type_tfms=[Numericalize(make_vocab(count))], filts=splits)

In [None]:
# Decode the first item back to text; tokens that are not in the vocab come back as xxunk
dsrc.decode_at(0)

("xxbos xxmaj un - xxunk - believable ! xxmaj meg xxmaj ryan does n't even look her usual xxunk lovable self in this , which normally makes me forgive her shallow xxunk acting xxunk . xxmaj hard to believe she was the producer on this dog . xxmaj plus xxmaj kevin xxmaj kline : what kind of suicide trip has his career been on ? xxmaj xxunk … xxmaj xxunk xxrep 3 ! xxmaj finally this was directed by the guy who did xxmaj big xxmaj xxunk ? xxmaj must be a replay of xxmaj jonestown - hollywood style . w xxrep 3 o xxrep 3 f !",)

In [None]:
bs = 16
# Pass the precomputed lengths for the first split (presumably the training indices - confirm)
# so that `LM_Dataset` does not have to measure every item itself
ds = LM_Dataset(dsrc.train, lens=lengths[splits[0]], bs=bs)
samp = LM_Sampler(ds)
dl = TfmdDL(ds, bs=bs, sampler=samp, num_workers=2)

In [None]:
# The input batch has shape (bs, seq_len), here with the default seq_len of 72
x,y = dl.one_batch()
x.shape

torch.Size([16, 72])

In [None]:
# Decode the first row of the input batch back to text
ds.decode((x[0],))

("xxbos this movie was definitely the worst movie i 've seen in my entire life , and i 've seen some pretty bad movies . i did n't like the way this movie was filmed . all of the actors are unknown , and it looks as though a bunch of friends got together and decided to film their own movie . but it 's absolutely horrible . i 've never seen",)

In [None]:
# Render the batch as a table of decoded texts
dl.show_batch()

index,text
0,"xxbos this movie was definitely the worst movie i 've seen in my entire life , and i 've seen some pretty bad movies . i did n't like the way this movie was filmed . all of the actors are unknown , and it looks as though a bunch of friends got together and decided to film their own movie . but it 's absolutely horrible . i 've never seen"
1,. \n\n▁ xxmaj it 's hard to figure out what attracted xxmaj xxunk xxmaj heston and xxmaj james xxmaj xxunk to their respective roles . xxmaj heston plays a retired xxunk who goes after an xxunk bunch of convicts led by a violent xxunk ( xxunk ) . xxmaj the hunt becomes even more personal when xxmaj heston 's daughter ( barbara xxmaj xxunk ) is kidnapped by the convicts and subjected
2,was brought up as xxunk and have considered myself as an atheist throughout my adult life . xxmaj in fact when it comes to religion i consider myself a xxmaj marxist and religion is a cynical weapon used to manipulate people . a xxup love xxup divided shows what happens when self xxunk moral xxunk take it upon themselves to tell other people what to think and believe . xxmaj may i
3,"beautiful once you realize what 's going on . \n\n xxmaj if you enjoy well - acted movies with twists and suspense , and are prepared to accept a slightly fantastic xxmaj philip xxunk xxmaj dick style resolution , then this is a must - see . \n\n 9 / 10 xxbos "" deliverance "" is one of the best exploitation films to come out of that wonderful 1970 's decade from"
4,". xxmaj and if you believe an interview with xxmaj hines on the xxmaj xxunk website , this film had an 8 figure budget ! i can only assume that xxunk facial hair does not come cheap in the xxup us . xxmaj maybe the problem is that xxmaj hines & co tried to make a film of the book , rather than turn the book into a film ( if that"
5,"xxmaj sure there is ! xxmaj count xxmaj xxunk comes to mind . xxmaj with this xxunk bunch we must add a mysterious black cat who i have xxunk named xxmaj xxunk xxmaj xxunk . ( remember xxmaj speed xxmaj xxunk ? ) xxmaj as you have already guessed , they were murdered in this xxunk of doom and now desire to kill everyone who enters the premises . xxmaj you see"
6,"this movie that much more relevant . xxmaj it clearly marks a moment in time for our collective xxunk . xxbos xxmaj creative use of modern and mystical elements : 1956 xxmaj xxunk convertible to xxunk evil xxunk xxmaj xxunk xxmaj turner ( john xxmaj waters ' "" serial xxmaj mom "" ) and the 2 twisted sisters ; xxmaj queen xxmaj xxunk as the xxunk xxunk ; xxmaj david xxmaj warner"
7,"xxmaj doctor xxmaj who has shown potential ever since from episode one from the new series in 2005 , first being so harmless to scary , from fun to serious , from light to darkness . i hope many old fans will one day soon say "" the old xxmaj doctor xxmaj who has returned "" . \n\n 10 out of 10 xxbos xxmaj kubrick again puts on display his stunning ability"
8,"probably end up buying it myself . xxmaj but even though it 's not as good as "" spirited xxmaj away , "" it 's still pretty good . xxbos xxmaj what a sad sight these xxup tv xxunk make , running out the clock on their careers stumbling about a little xxunk xxunk of a ship - boat might be more appropriate . xxmaj the whole production feels cheap and xxunk"
9,"lead actor xxmaj ryan xxmaj xxunk does a pretty good job of xxunk xxmaj kyle xxmaj reese … there 's a massacre in a police xxunk … the bad guy is xxunk with red eyes … and it even contains dialogue along the lines of "" you said it yourself , he wo n't ever stop . xxmaj never . "" xxmaj the dire script comes from a first - time screenwriter"


## Classification

In [None]:
def pad_collate(samples, pad_idx=1, pad_first=True, backwards=False):
    """Collate `samples` into one padded batch, flipping the token order if `backwards`.

    Each sample is indexed as `(tokens, label)`. Rows shorter than the longest one are filled with
    `pad_idx`, before the tokens when `pad_first` and after them otherwise. Empty token sequences
    produce a row made only of padding.
    Returns `(TensorText of shape (len(samples), max_len), tensor of the labels)`.
    """
    lens = [len(s[0]) for s in samples]
    max_len = max(lens)
    res = torch.full((len(samples), max_len), pad_idx, dtype=torch.long)
    # The batch is flipped at the end, so pad on the opposite side now to leave the padding where it was asked for
    if backwards: pad_first = not pad_first
    for i,(s,n) in enumerate(zip(samples, lens)):
        # Explicit start offset: `-n:` would select the whole row when n == 0
        if pad_first: res[i,max_len-n:] = LongTensor(s[0])
        else:         res[i,:n] = LongTensor(s[0])
    if backwards: res = res.flip(1)
    return TensorText(res), tensor(np.array([s[1] for s in samples]))

In [None]:
splits = RandomSplitter()(range(len(df)))
# Item getters: look up the tokenized text and the label of row `i`
_get_txt = lambda i: df_tok["text"][i]
_get_lbl = lambda i: df_tok["label"][i]
# Two transform lists per item: row index -> text -> ids, and row index -> label -> category
dsrc = DataSource(range(len(df)), type_tfms=[[_get_txt, Numericalize(make_vocab(count))], [_get_lbl, Categorize()]], filts=splits)
# Batches are assembled by `pad_collate`, which pads every text to the longest one in the batch
dl = TfmdDL(dsrc, collate_fn=TfmdCollate(collate_fn=pad_collate))

In [None]:
# Show the decoded texts next to their labels
dl.show_batch()

index,text,category
0,"xxbos xxmaj un - xxunk - believable ! xxmaj meg xxmaj ryan does n't even look her usual xxunk lovable self in this , which normally makes me forgive her shallow xxunk acting xxunk . xxmaj hard to believe she was the producer on this dog . xxmaj plus xxmaj kevin xxmaj kline : what kind of suicide trip has his career been on ? xxmaj xxunk … xxmaj xxunk xxrep 3 ! xxmaj finally this was directed by the guy who did xxmaj big xxmaj xxunk ? xxmaj must be a replay of xxmaj jonestown - hollywood style . w xxrep 3 o xxrep 3 f !",negative
1,"xxbos xxmaj this is a extremely well - made film . xxmaj the acting , script and camera - work are all first - rate . xxmaj the music is good , too , though it is mostly early in the film , when things are still relatively xxunk . xxmaj there are no really xxunk in the cast , though several faces will be familiar . xxmaj the entire cast does an excellent job with the script . \n\n xxmaj but it is hard to watch , because there is no good end to a situation like the one presented . xxmaj it is now xxunk to blame the xxmaj british for setting xxmaj hindus and xxmaj muslims against each other ...",positive
2,"xxbos xxmaj every once in a long while a movie will come along that will be so awful that i feel compelled to warn people . xxmaj if i labor all my days and i can save but one soul from watching this movie , how great will be my joy . \n\n xxmaj where to begin my discussion of pain . xxmaj for xxunk , there was a musical montage every five minutes . xxmaj there was no character development . xxmaj every character was a stereotype . xxmaj we had xxunk guy , fat guy who eats donuts , goofy foreign guy , etc . xxmaj the script felt as if it were being written as the movie was being shot . xxm...",negative
3,"xxbos xxmaj name just says it all . i watched this movie with my dad when it came out and having served in xxmaj xxunk he had great admiration for the man . xxmaj the disappointing thing about this film is that it only concentrate on a short period of the man 's life - interestingly enough the man 's entire life would have made such an epic bio - xxunk that it is staggering to imagine the cost for production . \n\n xxmaj some posters xxunk to the flawed xxunk about the man , which are cheap shots . xxmaj the theme of the movie "" duty , xxmaj honor , xxmaj country "" are not just mere words ...",positive
4,"xxbos xxmaj this movie succeeds at being one of the most unique movies you 've seen . xxmaj however this comes from the fact that you ca n't make heads or xxunk of this mess . xxmaj it almost seems as a series of challenges set up to determine whether or not you are willing to walk out of the movie and give up the money you just paid . xxmaj if you do n't want to feel xxunk you 'll sit through this horrible film and develop a real sense of pity for the actors involved , they 've all seen better days , but then you realize they actually got paid quite a bit of money to do this and you 'll l...",negative
5,"xxbos xxmaj from the start , you know how this movie will end . xxmaj it 's so full of clichés your typical xxup xxunk member will not even like this movie . i give it 2 out of 10 , only because of the acting of xxmaj william xxmaj xxunk . i ca n't believe people voted 6 + for this movie . xxmaj it 's so biased towards a ' certain point of view ' ( once a thief … ) . xxmaj people are n't born bad . xxmaj neither are they born good . xxmaj they are born with a clean xxunk . xxmaj it 's society , parents and education what makes them who they are . xxmaj and if they take the wrong turn , som...",negative
6,"xxbos xxmaj there were a lot of truly great horror movies produced in the seventies - but this film certainly is n't one of them ! xxmaj it 's a shame xxmaj the xxmaj child is n't better as it works from a decent idea that takes in a couple of sometimes successful horror themes . xxmaj we have the idea of a xxunk child , which worked so well in classic films such as xxmaj the xxmaj bad xxmaj xxunk and then we have the central zombie theme , which of course has been the xxunk of many a successful horror movie . xxmaj the plot is basically this : young girl blames a load of people for the de...",negative
7,"xxbos i was xxunk enough to meet xxmaj george xxmaj pal ( and still have my xxup xxunk : xxunk poster xxunk by him ) at a convention shortly after the release , and asked him why he chose to do the film "" camp "" . xxmaj before he could answer , two studio flacks xxunk and xxunk me on how the studio "" knew best "" and how "" no one will take such a film seriously "" . i had been reading the xxmaj bantam xxunk for a couple of years thanks to a friend ( xxunk xxunk of the 1970s will recall xxmaj xxunk and his band ? i was in a couple of years of that with him ) , and had higher hopes than what w...",negative
8,"xxbos xxmaj this film is the freshman effort of xxmaj xxunk xxmaj xxunk and her new production company . xxmaj while it suffers from a few problems , as every low budget production does , it is a good start for xxmaj ms . xxmaj xxunk and her company . \n\n xxmaj the story is not terribly new having been done in films like xxmaj the xxmaj burning and every xxmaj friday the 13th since part 2 . xxmaj but , the performances are heartfelt . xxmaj so many big budget movies just have the actors going through the motions , its always nice to see actors really trying to xxunk their craft . \n\n xxm...",negative
9,"xxbos xxmaj greg xxmaj davis and xxmaj xxunk xxmaj xxunk take some xxunk statements by a xxunk , add some commentary by a bunch of xxunk - right xxunk , xxunk the most extreme positions of the most xxunk xxmaj xxunk on the planet to everyone who calls themselves a xxmaj xxunk , and presents this as the xxunk of xxmaj islam . xxmaj maybe their next film will involve xxunk xxmaj fred xxmaj phelps and the xxunk of the xxmaj xxunk xxmaj baptist xxmaj church , adding commentary by some xxunk atheist "" xxunk , and call their film "" what the xxmaj world xxmaj needs to xxmaj know xxmaj about xxmaj...",negative
