In [None]:
#export
from local.imports import *
from local.test import *
from local.core import *
from local.data.transform import *
from local.data.core import *
from local.data.source import *
from local.data.external import *
from local.data.pipeline import *
from local.text.core import *
from local.notebook.showdoc import show_doc

In [None]:
#default_exp text.data
#default_cls_lvl 3

# Text data

> Functions and transforms to help gather text data in a `DataSource`

## Numericalizing

In [None]:
#export
def make_vocab(count, min_freq=3, max_vocab=60000):
    "Create a vocab of `max_vocab` size from `Counter` `count` with items present at least `min_freq` times"
    # Keep the `max_vocab` most common tokens that reach the threshold (`>=`, so `min_freq` itself qualifies)
    vocab = [o for o,c in count.most_common(max_vocab) if c >= min_freq]
    for o in reversed(defaults.text_spec_tok): #Make sure all special tokens are in the vocab
        # Walking the list in reverse and inserting at 0 leaves the special tokens first, in their original order
        if o in vocab: vocab.remove(o)
        vocab.insert(0, o)
    # Prepending the special tokens may have pushed the list past `max_vocab`: drop the least frequent tokens
    vocab = vocab[:max_vocab]
    if len(vocab) < max_vocab and len(vocab)%8 != 0:
        #Make sure vocab size is a multiple of 8 for fast mixed precision training
        vocab += ['xxfake' for _ in range(0, 8-len(vocab)%8)]
    return vocab

In [None]:
# Frequencies: a=4, b=2, c=2, d=1
count = Counter(['a', 'a', 'a', 'a', 'b', 'b', 'c', 'c', 'd'])
# With the default `min_freq=3` only 'a' survives, next to the special tokens and the 'xxfake' padding
test_eq(set(make_vocab(count)), set(defaults.text_spec_tok + 'a xxfake'.split()))
# The padded vocab length is a multiple of 8
test_eq(len(make_vocab(count))%8, 0)
# Lowering the threshold to 1 keeps every token
test_eq(set(make_vocab(count, min_freq=1)), set(defaults.text_spec_tok + 'a b c d xxfake'.split()))
# `max_vocab` truncates after the special tokens were prepended: 'd' is dropped and no padding is added
test_eq(set(make_vocab(count,max_vocab=12, min_freq=1)), set(defaults.text_spec_tok + 'a b c'.split()))

In [None]:
# export
class Numericalize(ItemTransform):
    "Reversible transform of tokenized texts to numericalized ids"
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, sep=None):
        # `sep` is the string separating tokens inside one item; fall back to the library default when not given
        self.sep = sep or defaults.text_token_sep
        self.vocab,self.min_freq,self.max_vocab = vocab,min_freq,max_vocab
        # `defaultdict(int)` sends any unknown token to id 0.
        # 'xxfake' padding entries are skipped, as in `setup`, so a supplied vocab and a computed one map tokens the same way.
        self.o2i = None if vocab is None else defaultdict(int, {v:k for k,v in enumerate(vocab) if v != 'xxfake'})

    def setup(self, dsrc):
        "Build `vocab` and `o2i` from `dsrc` (its `train` attribute when it has one) unless a vocab was passed at init"
        if dsrc is None: return
        if self.vocab is None:
            dsrc = getattr(dsrc,'train',dsrc)
            # Count every token of every item, splitting items on `sep`
            count = Counter(p for o in dsrc for p in o.split(self.sep))
            self.vocab = make_vocab(count, min_freq=self.min_freq, max_vocab=self.max_vocab)
            self.o2i = defaultdict(int, {v:k for k,v in enumerate(self.vocab) if v != 'xxfake'})

    # Token string -> list of ids (unknown tokens become 0)
    def encodes(self, o):      return [self.o2i[o_] for o_ in o.split(self.sep)]
    # List of ids -> token string joined with `sep`
    def decodes(self, o)->Str: return self.sep.join([self.vocab[o_] for o_ in o])

In [None]:
# With `min_freq=1` every token of the two sample texts enters the vocab
num = Numericalize(min_freq=1, sep=' ')
num.setup(L('This is an example of text', 'this is another text'))
test_eq(set(num.vocab), set(defaults.text_spec_tok + 'This is an example of text this another xxfake'.split()))
test_eq(len(num.vocab)%8, 0)
# `start` is reused by the next test cell
start = 'This is an example of text'
t = num(start)
test_eq(t, [11, 9, 12, 13, 14, 10])
# Decoding the ids gives back the original string
test_eq(num.decode(t), start)

In [None]:
# With `min_freq=2` only 'is' and 'text' (each seen twice) are kept
num = Numericalize(min_freq=2, sep=' ')
num.setup(L('This is an example of text', 'this is another text'))
test_eq(set(num.vocab), set(defaults.text_spec_tok + 'is text xxfake'.split()))
test_eq(len(num.vocab)%8, 0)
# Relies on `start` defined in the previous test cell
t = num(start)
# Out-of-vocab tokens are encoded as 0, which decodes to `UNK`
test_eq(t, [0, 9, 0, 0, 0, 10])
test_eq(num.decode(t), f'{UNK} is {UNK} {UNK} {UNK} text')

## LMPreloader -

In [None]:
class LMCollate():
    "Collate function building rows of `seq_len`+1 tokens from `samples`, resuming row `i` at `self.offset[i]`"
    # `offset` starts as `None` and must be assigned before the first call
    # (`LMSampler.batchify` sets it; the test cell below assigns it by hand). It is updated in place on every call.
    def __init__(self, bs=64, seq_len=72): self.bs,self.seq_len,self.offset = bs,seq_len,None
    def __call__(self, samples):
        "Return `(inputs, targets)` where `targets` is `inputs` shifted by one token"
        #Samples has more than bs elements if more than one text is needed to make one of the batch
        # i: index of the row being built; res: finished rows; s_len/s_txt: length and pieces of the current row
        i,res,s_len,s_txt = 0,[],0,[]
        for s in samples:
            s = tensor(s).long()
            # Tokens still missing from the current row (not counting the extra target token)
            l = self.seq_len-s_len
            # Take one token more than needed so the shifted target exists
            s_txt.append(s[0][self.offset[i]:self.offset[i]+l+1])
            s_len += len(s_txt[-1])
            # Advance inside this text, or restart at 0 when the text is exhausted (the next sample continues the row)
            self.offset[i] = self.offset[i]+l if self.offset[i]+l < len(s[0]) else 0
            if s_len >= self.seq_len+1:
                # Row complete: store it and move on to the next row
                i += 1
                res.append(torch.cat(s_txt))
                s_len,s_txt = 0,[]
        res = torch.stack(res, dim=0)
        return res[:,:-1],res[:,1:]

In [None]:
# Five fake texts of lengths 21, 32, 10, 16 and 26; each item is a 1-tuple holding its token ids
items = [(range(21),), (range(32),), (range(10),), (range(16),), (range(26),)]
# NOTE(review): `cumlen` is not used again in this cell
cumlen = tensor([len(t) for t in items]).cumsum(0)
tst = LMCollate(bs=5, seq_len=10)
# Starting offsets set by hand (the same values `LMSampler.batchify` computes in the test further down)
tst.offset = [0,0,21,0,5]
res = tst([items[0], items[1], items[1], items[3], items[4]])

# First batch: every row fits inside a single text
for i in [0,1,3]: 
    test_eq(res[0][i], tensor(range(10)))
    test_eq(res[1][i], tensor(range(1,11)))
test_eq(res[0][2], tensor(range(21,31)))
test_eq(res[1][2], tensor(range(22,32)))
test_eq(res[0][4], tensor(range(5,15)))
test_eq(res[1][4], tensor(range(6,16)))
# Offsets advanced by `seq_len` for each row
test_eq(tst.offset, [10, 10, 31, 10, 15])

# Second batch: rows 2 and 3 run out of text and continue in the next sample
res = tst([items[0], items[1], items[1], items[2], items[3], items[4], items[4]])
for i in [0,1]: 
    test_eq(res[0][i], tensor(range(10,20)))
    test_eq(res[1][i], tensor(range(11,21)))
test_eq(res[0][2], torch.cat([tensor([31]), tensor(range(9))]))
test_eq(res[1][2], tensor(range(10)))
test_eq(res[0][3], torch.cat([tensor(range(10,16)), tensor(range(4))]))
test_eq(res[1][3], torch.cat([tensor(range(11,16)), tensor(range(5))]))
test_eq(res[0][4], tensor(range(15,25)))
test_eq(res[1][4], tensor(range(16,26)))
test_eq(tst.offset, [20, 20, 9, 4, 25])

In [None]:
class LMSampler(BatchSampler):
    "Batch sampler yielding, for each batch, the dataset indices the collate function `cf` needs to fill its rows"
    def __init__(self, ds, cf, lengths=None, bs=64, seq_len=72, shuffle=False):
        self.ds,self.cf,self.bs,self.seq_len,self.shuffle = ds,cf,bs,seq_len,shuffle
        # Length of each item, taken from the first element of every item when not supplied
        self.lengths = [len(o[0]) for o in ds] if lengths is None else lengths
        # Total token count divided by `bs`: the spacing between the starting positions of consecutive rows
        self.n_batch = sum(self.lengths) // bs

    def __iter__(self):
        # Recompute the item order, the per-row starting items and `cf.offset` at the start of every pass
        self.batchify()
        # NOTE(review): this range yields ceil((n_batch-1)/seq_len) batches while `__len__` reports the floor;
        # the two differ when `seq_len` does not divide `n_batch-1` — confirm which is intended
        for i in range(0, self.n_batch-1, self.seq_len):
            # Position, for each row, of the last token this batch needs
            idx = tensor(range(self.bs)) * self.n_batch + i + self.seq_len
            # Index of the item containing that position: number of items whose cumulative length is <= position
            end_idx = len(self.cumlen) - (self.cumlen[:,None] > idx[None]).sum(0)
            # Each row needs every item from its current start item through its end item, inclusive
            s_idx = [list(range(i1,i2+1)) for (i1,i2) in zip(self.start_idx, end_idx)]
            yield [self.idxs[i] for s in s_idx for i in s]
            # The next batch resumes in the item where this one stopped
            self.start_idx = end_idx

    def batchify(self):
        "Set the item order (`idxs`), cumulative lengths, per-row start items and the offsets stored on `cf`"
        self.idxs = torch.randperm(len(self.ds)) if self.shuffle else tensor(range(len(self.ds)))
        self.cumlen = (tensor(self.lengths)[self.idxs] if self.shuffle else tensor(self.lengths)).cumsum(0)
        # Starting position of each of the `bs` rows
        idx = tensor(range(self.bs)) * self.n_batch
        self.start_idx = len(self.cumlen) - (self.cumlen[:,None] > idx[None]).sum(0)
        # Offset of each row start inside its first item: position minus the cumulative length before that item
        self.cf.offset = idx - torch.cat([tensor([0]), self.cumlen])[self.start_idx]

    def __len__(self): return (self.n_batch-1) // self.seq_len

In [None]:
# Same five fake texts as in the `LMCollate` test (lengths 21, 32, 10, 16, 26: 105 tokens, so n_batch=21 for bs=5)
items = [(range(21),), (range(32),), (range(10),), (range(16),), (range(26),)]
cf = LMCollate(bs=5, seq_len=10)
s = LMSampler(items, cf, bs=5, seq_len=10)
s.batchify()
# `batchify` stores the per-row starting offsets on the collate function
test_eq(cf.offset, tensor([0,0,21,0,5]))
itr = iter(s)

# First batch: one item per row
b1 = next(itr)
test_eq(b1, [0, 1, 1, 3, 4])

# Second batch: some rows span two items, so more than `bs` indices are yielded
b2 = next(itr)
test_eq(b2, [0, 1, 1, 2, 3, 4, 4])

# The sampler is exhausted after two batches
test_fail(lambda: next(itr))

In [None]:
#TODO: make better
# Smoke test with `shuffle=True`: only checks that exactly two batches are produced (no seed is set, contents vary)
cf = LMCollate(bs=5, seq_len=10)
s = LMSampler(items, cf, bs=5, seq_len=10, shuffle=True)
itr = iter(s)
b1 = next(itr)
b2 = next(itr)
test_fail(lambda: next(itr))

In [None]:
#Other approach
class LM_PreLoader(GetAttr):
    "An intermediate between a dataset with texts and a DataLoader"
    _xtra = ['show', 'decode', 'show_at', 'decode_at', 'decode_batch']
    def __init__(self, ds, lengths=None, bs=64, seq_len=70, shuffle=False):
        self.ds,self.bs,self.seq_len,self.shuffle = ds,bs,seq_len,shuffle
        # Length of each item, taken from the first element of every item when not supplied
        self.lengths = [len(o[0]) for o in ds] if lengths is None else lengths
        # Total token count divided by `bs`: the spacing between the starting positions of consecutive rows
        self.n_batch = sum(self.lengths) // bs
        self.batchify()
        # Names listed in `_xtra` are looked up on the wrapped dataset
        self.default = self.ds

    def __len__(self): return ((self.n_batch-1) // self.seq_len) * self.bs

    def __getitem__(self, i):
        "Return `(x, y)`: `seq_len` tokens for sample `i` and the same window shifted by one token"
        # Sample `i` belongs to row `i % bs` at step `i // bs`; `k` is where its window starts
        k = (i % self.bs) * self.n_batch + (i // self.bs) * self.seq_len
        # First item whose cumulative length exceeds `k`, i.e. the item containing position `k`
        item_idx = (self.cumlen > k).nonzero().min().item()
        offset = k if item_idx==0 else k-self.cumlen[item_idx-1]
        text = self.ds[self.idxs[item_idx]][0][offset:]
        # Keep appending the following items until there are `seq_len`+1 tokens (one extra for the target)
        while len(text) <= self.seq_len:
            item_idx += 1
            text += self.ds[self.idxs[item_idx]][0]
        return tensor(text[:self.seq_len]),tensor(text[1:self.seq_len+1])

    def batchify(self):
        "Set the item order (`idxs`) and the cumulative lengths of the items in that order"
        # Size the permutation from `self.ds`, never from a module-level `ds` that may be undefined or a different object
        self.idxs = torch.randperm(len(self.ds)) if self.shuffle else tensor(range(len(self.ds)))
        self.cumlen = (tensor(self.lengths)[self.idxs] if self.shuffle else tensor(self.lengths)).cumsum(0)

In [None]:
# Six fake texts whose tokens are 0..l-1; the second tuple element (0) is ignored by `LM_PreLoader`
lengths = [10,7,19,23,5,42]
ds = LM_PreLoader([(list(range(l)), 0) for l in lengths], lengths=lengths, bs=5, seq_len=4)
x,y = ds[0]
# The target is the input shifted by one token
test_eq(x[1:], y[:-1])
test_eq(x+1, y)
#Going on the seq dimension reads the text in order
test_eq(torch.cat([ds[5*i][0] for i in range(5)]), 
        tensor(list(range(10))+list(range(7))+list(range(3))))
#3 is skipped for the next sample in the batch since it's the last target
test_eq(torch.cat([ds[5*i+1][0] for i in range(5)]),
        tensor(list(range(4,19))+list(range(5))))

## Integration example

In [None]:
# Fetch the IMDB sample through `untar_data` and load its `texts.csv` into a DataFrame
path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/'texts.csv')

In [None]:
# Preview the raw data: columns `label`, `text` and `is_valid`
df.head()

Unnamed: 0,label,text,is_valid
0,negative,Un-bleeping-believable! Meg Ryan doesn't even ...,False
1,positive,This is a extremely well-made film. The acting...,False
2,negative,Every once in a long while a movie will come a...,False
3,positive,Name just says it all. I watched this movie wi...,False
4,negative,This movie succeeds at being one of the most u...,False


In [None]:
# Tokenize the `text` column. `count` is fed to `make_vocab` below, so it is presumably a token `Counter` — TODO confirm
df_tok,count = tokenize_df(df, 'text')

In [None]:
# Preview the tokenized frame: `text` now holds the joined tokens and a `text_lengths` column was added
df_tok.head()

Unnamed: 0,label,is_valid,text,text_lengths
0,negative,False,xxbos▁xxmaj▁un▁-▁bleeping▁-▁believable▁!▁xxmaj...,103.0
1,positive,False,xxbos▁xxmaj▁this▁is▁a▁extremely▁well▁-▁made▁fi...,462.0
2,negative,False,xxbos▁xxmaj▁every▁once▁in▁a▁long▁while▁a▁movie...,220.0
3,positive,False,xxbos▁xxmaj▁name▁just▁says▁it▁all▁.▁i▁watched▁...,184.0
4,negative,False,xxbos▁xxmaj▁this▁movie▁succeeds▁at▁being▁one▁o...,398.0


In [None]:
# Pull the tokenized texts and their lengths (cast to int) out as arrays
texts,lengths = df_tok['text'].values,df_tok['text_lengths'].map(int).values

In [None]:
# Split the texts with `RandomSplitter`, then build a `DataSource` that numericalizes them with a vocab made from `count`
# NOTE(review): no random seed is set, so the split (and the outputs recorded below) may differ between runs
splits = RandomSplitter()(L(t for t in texts))
dsrc = DataSource(L(t for t in texts), type_tfms=[Numericalize(make_vocab(count))], filts=splits)

In [None]:
# Decode the first item back to text; tokens missing from the vocab come back as `xxunk`
dsrc.decode_at(0)

("xxbos▁xxmaj▁un▁-▁xxunk▁-▁believable▁!▁xxmaj▁meg▁xxmaj▁ryan▁does▁n't▁even▁look▁her▁usual▁xxunk▁lovable▁self▁in▁this▁,▁which▁normally▁makes▁me▁forgive▁her▁shallow▁xxunk▁acting▁xxunk▁.▁xxmaj▁hard▁to▁believe▁she▁was▁the▁producer▁on▁this▁dog▁.▁xxmaj▁plus▁xxmaj▁kevin▁xxmaj▁kline▁:▁what▁kind▁of▁suicide▁trip▁has▁his▁career▁been▁on▁?▁xxmaj▁xxunk▁...▁xxmaj▁xxunk▁!▁!▁!▁xxmaj▁finally▁this▁was▁directed▁by▁the▁guy▁who▁did▁xxmaj▁big▁xxmaj▁xxunk▁?▁xxmaj▁must▁be▁a▁replay▁of▁xxmaj▁jonestown▁-▁hollywood▁style▁.▁xxmaj▁xxunk▁!",)

In [None]:
# First approach: `LMSampler` picks the items for each batch and `LMCollate` cuts them into sequences
bs = 16
cf = LMCollate(bs=bs)
# `lengths[splits[0]]` selects the lengths of the first split, matching `dsrc.train`
samp = LMSampler(dsrc.train, cf, lengths=lengths[splits[0]], bs=bs)
tdl = TfmdDL(dsrc.train, bs=bs, num_workers=0, collate_fn=cf, batch_sampler=samp)

In [None]:
# Grab one batch of inputs and shifted targets from the sampler/collate loader
x,y = tdl.one_batch()

In [None]:
# Decode the batch back to token strings for inspection
tdl.decode_batch((x,y))

(#10) [('xxbos▁xxmaj▁in▁xxmaj▁iran▁,▁women▁are▁not▁xxunk▁to▁attend▁men▁\'s▁sporting▁events▁,▁apparently▁to▁"▁xxunk▁"▁them▁from▁all▁the▁xxunk▁and▁foul▁language▁they▁might▁hear▁xxunk▁from▁the▁male▁fans▁(▁so▁since▁men▁ca▁n\'t▁xxunk▁or▁behave▁themselves▁,▁women▁are▁forced▁to▁suffer▁.▁xxmaj▁go▁figure▁.▁)▁.▁"▁offside▁"▁tells▁the▁tale▁of▁a▁half▁dozen▁or',),("in▁the▁end▁,▁i▁thought▁the▁film▁handled▁the▁concept▁well▁(▁even▁if▁some▁scenes▁were▁a▁little▁clichéd▁)▁.▁\n\n▁xxmaj▁the▁cast▁was▁quite▁good▁,▁and▁the▁two▁leads▁seemed▁to▁take▁their▁roles▁very▁seriously▁.▁i▁could▁n't▁help▁thinking▁,▁though▁,▁that▁xxmaj▁xxunk▁xxmaj▁turner▁is▁a▁bit▁of▁a▁xxmaj▁xxunk▁xxmaj▁davis▁look▁-▁a▁-▁like▁.▁xxmaj",),('negative▁review▁of▁the▁movie▁:▁xxmaj▁if▁xxmaj▁neo▁could▁do▁the▁xxmaj▁superman▁thing▁,▁why▁bother▁to▁fight▁at▁all▁?▁xxmaj▁the▁answer▁,▁of▁course▁,▁is▁that▁\'s▁what▁draws▁the▁young▁,▁male▁xxunk▁group▁into▁the▁theatre▁.▁)▁xxmaj▁then▁there▁is▁the▁"▁redemption▁through▁love▁"▁aspect▁.▁xxmaj▁that▁plot▁device▁was▁w

In [None]:
# Second approach: wrap the training set in `LM_PreLoader` and use a plain `TfmdDL` on top of it
# NOTE: this rebinds the notebook-level names `bs` and `ds`
bs = 16
ds = LM_PreLoader(dsrc.train, lengths=lengths[splits[0]], bs=bs)
dl = TfmdDL(ds, bs=bs, num_workers=0)

In [None]:
# Grab one batch from the `LM_PreLoader`-based loader
x,y = dl.one_batch()

In [None]:
# Decode the batch back to token strings for inspection
dl.decode_batch((x,y))

(#10) [('xxbos▁xxmaj▁in▁xxmaj▁iran▁,▁women▁are▁not▁xxunk▁to▁attend▁men▁\'s▁sporting▁events▁,▁apparently▁to▁"▁xxunk▁"▁them▁from▁all▁the▁xxunk▁and▁foul▁language▁they▁might▁hear▁xxunk▁from▁the▁male▁fans▁(▁so▁since▁men▁ca▁n\'t▁xxunk▁or▁behave▁themselves▁,▁women▁are▁forced▁to▁suffer▁.▁xxmaj▁go▁figure▁.▁)▁.▁"▁offside▁"▁tells▁the▁tale▁of▁a▁half',),("in▁the▁end▁,▁i▁thought▁the▁film▁handled▁the▁concept▁well▁(▁even▁if▁some▁scenes▁were▁a▁little▁clichéd▁)▁.▁\n\n▁xxmaj▁the▁cast▁was▁quite▁good▁,▁and▁the▁two▁leads▁seemed▁to▁take▁their▁roles▁very▁seriously▁.▁i▁could▁n't▁help▁thinking▁,▁though▁,▁that▁xxmaj▁xxunk▁xxmaj▁turner▁is▁a▁bit▁of▁a▁xxmaj▁xxunk▁xxmaj▁davis▁look▁-▁a▁-▁like",),('negative▁review▁of▁the▁movie▁:▁xxmaj▁if▁xxmaj▁neo▁could▁do▁the▁xxmaj▁superman▁thing▁,▁why▁bother▁to▁fight▁at▁all▁?▁xxmaj▁the▁answer▁,▁of▁course▁,▁is▁that▁\'s▁what▁draws▁the▁young▁,▁male▁xxunk▁group▁into▁the▁theatre▁.▁)▁xxmaj▁then▁there▁is▁the▁"▁redemption▁through▁love▁"▁aspect▁.▁xxmaj▁that▁plot▁device▁was▁worn▁out▁by▁xxmaj▁

In [None]:
# `decode` is listed in `LM_PreLoader._xtra`; here it decodes the first input sequence of the batch
ds.decode((x[0],))

('xxbos▁xxmaj▁in▁xxmaj▁iran▁,▁women▁are▁not▁xxunk▁to▁attend▁men▁\'s▁sporting▁events▁,▁apparently▁to▁"▁xxunk▁"▁them▁from▁all▁the▁xxunk▁and▁foul▁language▁they▁might▁hear▁xxunk▁from▁the▁male▁fans▁(▁so▁since▁men▁ca▁n\'t▁xxunk▁or▁behave▁themselves▁,▁women▁are▁forced▁to▁suffer▁.▁xxmaj▁go▁figure▁.▁)▁.▁"▁offside▁"▁tells▁the▁tale▁of▁a▁half',)

### Showing

In [None]:
#export
class TensorTextBase(TensorBase):
    "Tensor subclass for texts that knows how to show itself and how to create display contexts"
    def show(self, ctx=None, **kwargs):
        # NOTE(review): this delegates to `show_image` with `self._show_args`, neither of which is defined in
        # this notebook — confirm this is the intended display path for text tensors
        return show_image(self, ctx=ctx, **{**self._show_args, **kwargs})

    def get_ctxs(self, max_samples=10, rows=None, cols=None, figsize=None, **kwargs):
        "Create a grid of matplotlib axes for up to `max_samples` items and return it as a flat array"
        n_samples = min(self.shape[0], max_samples)
        # Default to a square grid large enough for `n_samples`
        rows = rows or int(np.ceil(math.sqrt(n_samples)))
        cols = cols or int(np.ceil(math.sqrt(n_samples)))
        figsize = (cols*3, rows*3) if figsize is None else figsize
        # `squeeze=False` always gives a 2D array of axes; without it a 1x1 grid returns a bare `Axes`
        # object that has no `.flatten()`, so a single-sample batch would raise
        _,axs = plt.subplots(rows, cols, figsize=figsize, squeeze=False)
        return axs.flatten()

In [None]:
# NOTE(review): `fill_empty` is not defined in the visible part of this notebook — confirm which import provides it
# Each call fills only the first entry that is still `None`
row = pd.Series({'a': None, 'b': None})
row = fill_empty(row, 'tst')
test_eq(row['a'], 'tst')
assert row['b'] is None

# Second call fills the next empty entry
row = fill_empty(row, 'tst1')
test_eq(row['a'], 'tst')
test_eq(row['b'], 'tst1')

# No empty entry left: the row is unchanged
row = fill_empty(row, 'tst2')
test_eq(row['a'], 'tst')
test_eq(row['b'], 'tst1')

In [None]:
#export
def show_title1(o, ax=None, ctx=None):
    "Show `o` on `ax` (or `ctx` when `ax` is `None`): printed if both are `None`, stored in a `pd.Series`, else used as title"
    target = ifnone(ax, ctx)
    # Nothing to draw on: fall back to printing
    if target is None:
        print(o)
        return target
    # A `pd.Series` context gets `o` written into it through `fill_empty`
    if isinstance(target, pd.Series): return fill_empty(target, o)
    # Anything else is expected to expose `set_title`
    target.set_title(o)
    return target

In [None]:
# Give `Str` a `show` method that displays its string value through `show_title1` (extra kwargs are ignored)
def _show(self, ctx=None, **kwargs): return show_title1(str(self), ctx=ctx)
# Monkey-patch: attaches the function to the `Str` class
Str.show = _show

In [None]:
from IPython.display import display, HTML

In [None]:
# Build an empty table with one row per sample and pass each row to `show_batch` as a display context
# NOTE: this rebinds `df`, which held the IMDB data earlier in the notebook
df = pd.DataFrame({'index': range(5), 'text': [None for _ in range(5)]}, columns=['index', 'text'])
# NOTE(review): the recorded output shows a SettingWithCopyWarning and an empty `text` column. `df.iloc[i]`
# presumably hands `show_batch` copies of the rows, so `df` itself is never filled — confirm and rework
tdl.show_batch(ctxs = [df.iloc[i] for i in range(5)])
# NOTE(review): `-1` for `display.max_colwidth` is deprecated in recent pandas in favour of `None` — check the pinned version
with pd.option_context('display.max_colwidth', -1):
    display(HTML(df.to_html(index=False)))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


index,text
0,
1,
2,
3,
4,
