In [None]:
#export
from local.imports import *
from local.test import *
from local.core import *
from local.data.transform import *
from local.data.core import *
from local.data.source import *
from local.data.external import *
from local.data.pipeline import *
from local.text.core import *
from local.notebook.showdoc import show_doc

In [None]:
#default_exp text.data
#default_cls_lvl 3

# Text data

> Functions and transforms to help gather text data in a `DataSource`

## Numericalizing

In [None]:
#export
def make_vocab(count, min_freq=3, max_vocab=60000):
    "Create a vocab of `max_vocab` size from `Counter` `count` with items present at least `min_freq` times, then pad it with 'xxfake'"
    # Candidates: the `max_vocab` most common items that reach the frequency threshold
    vocab = [o for o,c in count.most_common(max_vocab) if c >= min_freq]
    for o in reversed(defaults.text_spec_tok): #Make sure all special tokens are in the vocab
        if o in vocab: vocab.remove(o)
        vocab.insert(0, o)
    # Special tokens now sit at the front, so truncation removes entries from the tail
    vocab = vocab[:max_vocab]
    # Pad so the final length is a multiple of 8. Between 1 and 8 'xxfake' entries are always
    # appended: a length that is already a multiple of 8 still receives a full 8.
    return vocab + ['xxfake'] * (8 - len(vocab)%8)

In [None]:
# Toy counts: 'a' x4, 'b' x2, 'c' x2, 'd' x1
count = Counter({'a': 4, 'b': 2, 'c': 2, 'd': 1})
# Each case: keyword overrides for `make_vocab`, and the tokens expected next to the special ones
for kwargs,toks in [({},                              'a xxfake'),
                    ({'min_freq': 1},                 'a b c d xxfake'),
                    ({'max_vocab': 12, 'min_freq': 1}, 'a b c xxfake')]:
    test_eq(set(make_vocab(count, **kwargs)), set(defaults.text_spec_tok + toks.split()))
# The vocab length is always a multiple of 8
test_eq(len(make_vocab(count))%8, 0)

In [None]:
# export
class Numericalize(ItemTransform):
    "Reversible transform of tokenized texts to numericalized ids"
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, sep=' '):
        self.vocab = vocab
        self.min_freq = min_freq
        self.max_vocab = max_vocab
        self.sep = sep
        # With a ready-made vocab the token->id lookup is built right away; tokens missing
        # from it resolve to 0 through the `defaultdict(int)` default
        if vocab is None: self.o2i = None
        else: self.o2i = defaultdict(int, {tok:idx for idx,tok in enumerate(vocab)})

    def setup(self, dsrc):
        # Nothing to build without data or when a vocab already exists
        if dsrc is None or self.vocab is not None: return
        # Count tokens on `dsrc.train` when that attribute exists, else on `dsrc` itself
        dsrc = getattr(dsrc,'train',dsrc)
        count = Counter()
        for text in dsrc: count.update(text.split(self.sep))
        self.vocab = make_vocab(count, min_freq=self.min_freq, max_vocab=self.max_vocab)
        # 'xxfake' entries stay in `vocab` but are left out of the lookup
        self.o2i = defaultdict(int, {tok:idx for idx,tok in enumerate(self.vocab) if tok != 'xxfake'})

    def encodes(self, o):
        # Split on `sep` and map every token to its id
        return [self.o2i[tok] for tok in o.split(self.sep)]

    def decodes(self, o)->Str:
        # Look the ids up in `vocab`, drop `PAD` tokens, then join with `sep`
        toks = [self.vocab[idx] for idx in o]
        return self.sep.join([tok for tok in toks if tok != PAD])

In [None]:
# Fit a vocab on two short texts; with `min_freq=1` every token is kept
start = 'This is an example of text'
num = Numericalize(min_freq=1, sep=' ')
num.setup(L(start, 'this is another text'))
expected_toks = 'This is an example of text this another xxfake'.split()
test_eq(set(num.vocab), set(defaults.text_spec_tok + expected_toks))
test_eq(len(num.vocab)%8, 0)
# Encoding then decoding gives back the original string
t = num(start)
test_eq(t, [11, 9, 12, 13, 14, 10])
test_eq(num.decode(t), start)

In [None]:
# With `min_freq=2` only 'is' and 'text' (each seen twice) make it into the vocab
num = Numericalize(min_freq=2, sep=' ')
num.setup(L('This is an example of text', 'this is another text'))
test_eq(set(num.vocab), set(defaults.text_spec_tok + 'is text xxfake'.split()))
test_eq(len(num.vocab)%8, 0)
# `start` comes from the previous cell; tokens outside the vocab encode to 0 and decode as `UNK`
t = num(start)
test_eq(t, [0, 9, 0, 0, 0, 10])
test_eq(num.decode(t), f'{UNK} is {UNK} {UNK} {UNK} text')

## LM_Dataset -

In [None]:
#export
from IPython.display import display, HTML

class TensorText(TensorBase):
    "`TensorBase` subclass that builds one display row per sample and renders the rows as an HTML table"
    def get_ctxs(self, max_samples=10, **kwargs):
        # One row per sample along the first dimension, capped at `max_samples`
        count = min(self.shape[0], max_samples)
        frame = pd.DataFrame({'index': range(count)})
        return [frame.iloc[pos] for pos in range(count)]

    def display(self, ctxs):
        # Gather the rows into a single frame and show it without the DataFrame index
        table = pd.DataFrame(ctxs)
        display(HTML(table.to_html(index=False)))

In [None]:
#export
def apply_coords(f, *dims):
    "Call `f` on the coordinate vector of every cell of a grid of shape `dims` and collect the results in an array"
    axes = [range(d) for d in dims]
    # 'ij' indexing keeps the grids in the same axis order as `dims`
    grids = np.meshgrid(*axes, indexing='ij')
    # After stacking, each 1-d slice along axis 0 holds the coordinates of one cell
    coords = np.stack(grids)
    return np.apply_along_axis(f, 0, coords)

In [None]:
# Example: each cell of the 2x3x4 result holds the string form of its own coordinate vector
apply_coords(str,2,3,4)

array([[['[0 0 0]', '[0 0 1]', '[0 0 2]', '[0 0 3]'],
        ['[0 1 0]', '[0 1 1]', '[0 1 2]', '[0 1 3]'],
        ['[0 2 0]', '[0 2 1]', '[0 2 2]', '[0 2 3]']],

       [['[1 0 0]', '[1 0 1]', '[1 0 2]', '[1 0 3]'],
        ['[1 1 0]', '[1 1 1]', '[1 1 2]', '[1 1 3]'],
        ['[1 2 0]', '[1 2 1]', '[1 2 2]', '[1 2 3]']]], dtype='<U7')

In [None]:
class LM_Sampler(Sampler):
    "Sampler yielding `(i%bs)*spb + i//bs` for each position `i` of `ds`, with `spb = len(ds)//bs`"
    def __init__(self, ds):
        self.ds = ds
        self.bs = ds.bs
        self.spb = len(ds)//ds.bs

    def __len__(self): return len(self.ds)

    def _seq_idx(self, i):
        # Position `i` falls in row `i%bs` of step `i//bs`; row `r` reads indices `r*spb, r*spb+1, ...`
        step,row = divmod(i, self.bs)
        return row*self.spb + step

    def __iter__(self): return (self._seq_idx(i) for i in L.range(self.ds))

In [None]:
#export
class LM_Dataset(BaseDS):
    "Dataset whose item `seq` is a pair of `seq_len`-token sequences read across the documents of `ds`, the second shifted by one token"
    def __init__(self, ds, lens=None, bs=64, seq_len=72, shuffle=False, cache=2, as_tensor=True):
        super().__init__(ReindexCollection(ds, cache=cache))
        self.bs = bs
        self.seq_len = seq_len
        self.shuffle = shuffle
        self.as_tensor = as_tensor
        # Document lengths default to the length of each item's first element
        if lens is None: lens = [len(doc[0]) for doc in ds]
        # Built on the same `idxs` as `self.ds` (presumably so both are reordered together - TODO confirm)
        self.lens = ReindexCollection(lens, idxs=self.ds.idxs)
        # The "-1" is to allow for final label
        n_toks = sum(lens) - 1
        self.n = round_multiple(n_toks, bs*seq_len, round_down=True)
        self.reset()

    def __len__(self):
        return self.n//(self.seq_len)

    def reset(self):
        # Optionally reshuffle the documents, then recompute the running token totals
        if self.shuffle: self.ds.shuffle()
        self.cum_lens = np.cumsum(self.lens)

    def __getitem__(self, seq):
        first_tok = seq*self.seq_len
        def _token_at(coord):
            # `coord` is (row, position): row 0 is the input, row 1 the same window moved one token on
            pos = first_tok + coord[0] + coord[1]
            # First document whose cumulative length exceeds `pos`
            doc = np.searchsorted(self.cum_lens, pos+1)
            # `pos - cum_lens[doc]` is negative, i.e. an offset counted from the end of that document
            return self.ds[doc][0][pos - self.cum_lens[doc]]
        grid = apply_coords(_token_at, 2, self.seq_len)
        if self.as_tensor: return tuple(TensorText(tensor(row)) for row in grid)
        return tuple(grid)

In [None]:
# Batch size and sequence length used by the toy language-model test
bs,sl = 4,3
# Six toy documents holding the integers 0..24 in order, each wrapped in a 1-tuple
ints = [(o,) for o in [[0,1,2,3,4],[5,6,7,8,9,10],[11,12,13,14,15,16,17,18],[19,20],[21,22,23],[24]]]

In [None]:
# Build the dataset from the toy `ints` corpus defined above
t = LM_Dataset(ints, bs=bs, seq_len=sl)
dl = DataLoader(t, batch_size=bs, sampler=LM_Sampler(t), collate_fn=noop)
# Two batches of four (input, target) pairs; each target is its input shifted by one token,
# and each row of the second batch continues where the same row of the first one stopped
test_eq(list(dl),
    [[[tensor([0, 1, 2]), tensor([1, 2, 3])],
      [tensor([6, 7, 8]), tensor([7, 8, 9])],
      [tensor([12, 13, 14]), tensor([13, 14, 15])],
      [tensor([18, 19, 20]), tensor([19, 20, 21])]],
     [[tensor([3, 4, 5]), tensor([4, 5, 6])],
      [tensor([9, 10, 11]), tensor([10, 11, 12])],
      [tensor([15, 16, 17]), tensor([16, 17, 18])],
      [tensor([21, 22, 23]), tensor([22, 23, 24])]]])

## Integration example

In [None]:
# Get the IMDB sample location through `untar_data` and read its `texts.csv` into a DataFrame
path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/'texts.csv')
df.head(2)

Unnamed: 0,label,text,is_valid
0,negative,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",False
1,positive,"This is a extremely well-made film. The acting, script and camera-work are all first-rate. The music is good, too, though it is mostly early in the film, when things are still relatively cheery. There are no really superstars in the cast, though several faces will be familiar. The entire cast does an excellent job with the script.<br /><br />But it is hard to watch, because there is no good end to a situation like the one presented. It is now fashionable to blame the British for setting Hindus and Muslims against each other, and then cruelly separating them into two countries. There is som...",False


In [None]:
# Tokenize the 'text' column with a single worker; `count` is later passed to `make_vocab`
df_tok,count = tokenize_df(df, 'text', n_workers=1)
df_tok.head(2)

Unnamed: 0,label,is_valid,text,text_lengths
0,negative,False,"xxbos xxmaj un - bleeping - believable ! xxmaj meg xxmaj ryan does n't even look her usual pert lovable self in this , which normally makes me forgive her shallow ticky acting schtick . xxmaj hard to believe she was the producer on this dog . xxmaj plus xxmaj kevin xxmaj kline : what kind of suicide trip has his career been on ? xxmaj whoosh … xxmaj banzai xxrep 3 ! xxmaj finally this was directed by the guy who did xxmaj big xxmaj chill ? xxmaj must be a replay of xxmaj jonestown - hollywood style . w xxrep 3 o xxrep 3 f !",108
1,positive,False,"xxbos xxmaj this is a extremely well - made film . xxmaj the acting , script and camera - work are all first - rate . xxmaj the music is good , too , though it is mostly early in the film , when things are still relatively cheery . xxmaj there are no really superstars in the cast , though several faces will be familiar . xxmaj the entire cast does an excellent job with the script . \n\n xxmaj but it is hard to watch , because there is no good end to a situation like the one presented . xxmaj it is now fashionable to blame the xxmaj british for setting xxmaj hindus and xxmaj muslims against...",462


In [None]:
# Tokenized texts and their lengths as arrays; the builtin `int` is the dtype the removed
# `np.int` alias pointed to
texts,lengths = df_tok['text'].values,df_tok['text_lengths'].values.astype(int)

In [None]:
# Split the texts with `RandomSplitter`, then numericalize them with a vocab built from `count`
# NOTE(review): no seed is passed, so the split presumably differs between runs - confirm
splits = RandomSplitter()(L(t for t in texts))
dsrc = DataSource(L(t for t in texts), type_tfms=[Numericalize(make_vocab(count))], filts=splits)

In [None]:
# Decode the first item back to text; tokens outside the vocab appear as xxunk in the output
dsrc.decode_at(0)

("xxbos xxmaj un - xxunk - believable ! xxmaj meg xxmaj ryan does n't even look her usual xxunk lovable self in this , which normally makes me forgive her shallow xxunk acting xxunk . xxmaj hard to believe she was the producer on this dog . xxmaj plus xxmaj kevin xxmaj kline : what kind of suicide trip has his career been on ? xxmaj xxunk … xxmaj xxunk xxrep 3 ! xxmaj finally this was directed by the guy who did xxmaj big xxmaj xxunk ? xxmaj must be a replay of xxmaj jonestown - hollywood style . w xxrep 3 o xxrep 3 f !",)

In [None]:
# Language-model pipeline over the training subset
bs = 16
# `lens` is restricted to `splits[0]` (presumably the indices behind `dsrc.train` - confirm)
ds = LM_Dataset(dsrc.train, lens=lengths[splits[0]], bs=bs)
samp = LM_Sampler(ds)
dl = TfmdDL(ds, bs=bs, sampler=samp, num_workers=2)

In [None]:
# Grab one batch; `x` is (bs, seq_len) = (16, 72) with the default `seq_len`
x,y = dl.one_batch()
x.shape

torch.Size([16, 72])

In [None]:
# Decode the first input sequence of the batch back to text
ds.decode((x[0],))

('xxbos xxmaj it seems evident from this adaptation that he did not . xxmaj not only did he leave the plot behind , he made up his own ! xxmaj the things that he chose to leave in were so ridiculously unbelievable that i was happy he chose to leave out some of the most important parts of the novel . xxmaj the plot was xxunk , inconsistent and xxunk to say',)

In [None]:
# Display the decoded sequences of one batch as a table
dl.show_batch()

index,text
0,"xxbos xxmaj it seems evident from this adaptation that he did not . xxmaj not only did he leave the plot behind , he made up his own ! xxmaj the things that he chose to leave in were so ridiculously unbelievable that i was happy he chose to leave out some of the most important parts of the novel . xxmaj the plot was xxunk , inconsistent and xxunk to say"
1,", and i was therefore more than eager to find the sequels , and full of anticipation when i finally stumbled over them recently . xxmaj while this third "" hanzo "" film is just not quite as brilliant as its predecessors it is definitely another great piece of cult - cinema that no lover of xxmaj japanese exploitation cinema can afford to miss . "" who 's xxmaj got xxmaj the"
2,"of xxmaj xxunk xxmaj xxunk xxmaj carter 's chimp , the xxunk of the humans ' xxunk , the ease of their escape , their extraordinary skills of xxunk ( this is an astronaut and a group of human xxunk suddenly riding full xxunk ) , the massive and immediate human xxunk all are too unbelievable . xxmaj mark xxmaj xxunk never once projects any sense of real fear , danger or"
3,". xxmaj caan was one of the 1970s ' best actors , and his xxunk xxunk with xxmaj xxunk , xxmaj duvall , xxmaj hopkins , and both xxmaj xxunk give "" killer xxmaj elite "" real xxunk . \n\n xxmaj but you do n't watch "" killer xxmaj elite "" thinking about that . xxmaj you watch it thinking of the film that got away . xxbos xxmaj this film has"
4,"actress of this era , for that matter ) give another woman a swift punch in the xxunk ? ( twice ! ) \n\n xxmaj after xxmaj harlow 's xxmaj ruby is sent to a xxunk after getting mixed up with xxmaj gable 's xxmaj edward xxmaj hall ( he of that cheesy yet endearing crooked smile ) , her xxunk becomes all the more complicated when she discovers that she is"
5,"though , i do n't rate it . \n\n xxmaj if xxmaj i 'd been watching it believing the opening text to be true ( "" i found this tape … "") , i might have been a bit disturbed by it , thinking it was real . xxmaj even without the benefit of knowing it not to be real though , i think xxmaj i 'd have worked out that it"
6,when i saw it as a kid . xxmaj and i own the video . xxmaj the film could have been so much more if it had been done properly . xxmaj oh well … xxbos xxmaj this is one of the worst films ever . i like cheesy movies but this is simply awful . xxmaj where are the images in the film that are on the box ? i think
7,"'s to an even higher level ! \n\n i actually bought this movie just because of that character , and still have it somewhere ! \n\n xxmaj gulfax may look like sh!t , but he made this movie xxrep 3 ! xxmaj the only reason xxmaj i 've never seen the sequel , or even sought it out , was because of his absence ! xxmaj perhaps should there be a final"
8,", engaging in child neglect , stupid , uneducated , racist , ugly , eating poor food , and dim - xxunk -- xxunk , only by turning to xxmaj indian culture can the local priest be "" redeemed "" at the end of the film . \n\n xxmaj by contrast , the xxmaj indian family are beautiful , clever , educated , can speak many xxunk , are caring and loving"
9,"xxmaj even the photography stinks , in and out xxunk with the camera switching this way and that trying to make it look like the vampires move to fast for the camera to keep up and then the camera turns all to bright in the scene of xxmaj savage chasing the son of xxmaj xxunk around till he xxunk himself . xxmaj avoid this one ! ! xxbos a group of model"


## Classification

In [None]:
def pad_collate(samples, pad_idx=1, pad_first=True, backwards=False):
    "Collate `samples` into a batch padded with `pad_idx`. Flips token order if `backwards`"
    # Each sample is read as `s[0]` (sequence of token ids) and `s[1]` (label)
    max_len = max([len(s[0]) for s in samples])
    # Start from a batch filled with `pad_idx`, then copy every sample's ids into its row
    res = torch.zeros(len(samples), max_len).long() + pad_idx
    # The final flip mirrors each row, so write on the opposite side to end up with the
    # padding where `pad_first` asked for it
    if backwards: pad_first = not pad_first
    for i,s in enumerate(samples):
        n = len(s[0])
        if n == 0: continue # nothing to copy, and `res[i,-0:]` would select the whole row
        if pad_first: res[i,-n:] = LongTensor(s[0])
        else:         res[i,:n]  = LongTensor(s[0])
    if backwards: res = res.flip(1)
    # Returns the padded ids wrapped in `TensorText` and the labels as a tensor
    return TensorText(res), tensor(np.array([s[1] for s in samples]))

In [None]:
# Classification pipeline: items are row indices into `df_tok`
splits = RandomSplitter()(range(len(df)))
_get_txt = lambda i: df_tok["text"][i]
_get_lbl = lambda i: df_tok["label"][i]
# First pipeline fetches the text and numericalizes it; second fetches the label and applies `Categorize`
dsrc = DataSource(range(len(df)), type_tfms=[[_get_txt, Numericalize(make_vocab(count))], [_get_lbl, Categorize()]], filts=splits)
# Batches are assembled by `pad_collate`, wrapped in `TfmdCollate`
dl = TfmdDL(dsrc, collate_fn=TfmdCollate(collate_fn=pad_collate))

In [None]:
# Show a batch from the padded classification DataLoader; the output recorded below is a
# NameError traceback raised inside `pad_collate`
dl.show_batch()

NameError: Traceback (most recent call last):
  File "/home/jhoward/anaconda3/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 177, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/jhoward/anaconda3/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
    return self.collate_fn(data)
  File "/home/jhoward/git/fastai_dev/dev/local/data/core.py", line 191, in __call__
    self.collate_fn = collate_fn
  File "<ipython-input-122-2e49ed148c91>", line 10, in pad_collate
    return TensorTextBase(res), tensor(np.array([s[1] for s in samples]))
NameError: name 'TensorTextBase' is not defined
