In [None]:
#export
from local.imports import *
from local.test import *
from local.core import *
from local.data.transform import *
from local.data.core import *
from local.data.external import *
from local.data.pipeline import *
from local.text.core import *
from local.notebook.showdoc import show_doc

In [None]:
#default_exp text.data
#default_cls_lvl 3

# Text data

> Functions and transforms to help gather text data in a `DataSource`

## Numericalizing

In [None]:
#export
class TokenizedText(L):
    "An `L` whose items are joined into one string when shown"
    def show(o, ctx=None, sep=None, **kwargs):
        "Join the items with `sep` and hand the result to `show_title` with `ctx`"
        # `o` is the instance itself; extra `kwargs` are accepted but not used.
        # A falsy `sep` falls back to the library-wide token separator.
        if not sep: sep = defaults.text_token_sep
        joined = sep.join(o)
        return show_title(joined, ctx=ctx)

In [None]:
#export
def make_vocab(count, min_freq=3, max_vocab=60000, special_toks=None):
    "Create a vocab of `max_vocab` size from `Counter` `count` with items present at least `min_freq` times"
    # `special_toks` lets callers override the special tokens; `defaults.text_spec_tok` is only read when it is not given
    if special_toks is None: special_toks = defaults.text_spec_tok
    vocab = [o for o,c in count.most_common(max_vocab) if c >= min_freq]
    for o in reversed(list(special_toks)): #Make sure all special tokens are in the vocab, at the front and in their given order
        if o in vocab: vocab.remove(o)
        vocab.insert(0, o)
    # Inserting the special tokens may have pushed the list past `max_vocab`, so trim the least frequent items
    vocab = vocab[:max_vocab]
    remainder = len(vocab) % 8
    if len(vocab) < max_vocab and remainder != 0:
        #Make sure vocab size is a multiple of 8 for fast mixed precision training
        #(the padding can take the length above `max_vocab` when `max_vocab` itself is not a multiple of 8)
        vocab += ['xxfake'] * (8 - remainder)
    return vocab

In [None]:
# export
class Numericalize(MultiCategorize):
    "Reversible transform of `sep`-separated token strings to lists of `vocab` ids"
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, sep=None):
        super().__init__(vocab=vocab)
        # A falsy `sep` falls back to the library-wide token separator
        self.sep = sep or defaults.text_token_sep
        self.min_freq,self.max_vocab = min_freq,max_vocab

    def setup(self, dsrc):
        "Build `vocab` and `otoi` from the texts in `dsrc` (its `train` attribute when present) if no vocab was given"
        if dsrc is None: return
        if self.vocab is None:
            dsrc = getattr(dsrc,'train',dsrc)
            # Count every token obtained by splitting each text on `sep`
            count = Counter(p for o in dsrc for p in o.split(self.sep))
            self.vocab = make_vocab(count, min_freq=self.min_freq, max_vocab=self.max_vocab)
            # `make_vocab` may append several identical 'xxfake' padding entries; keep them out of the reverse mapping
            self.otoi = {v:k for k,v in enumerate(self.vocab) if v != 'xxfake'}
        # NOTE(review): when a vocab is passed to `__init__`, `otoi` is not built here -
        # presumably the parent constructor takes care of it; confirm.

    def encodes(self, o):
        # Tokens dropped by `min_freq`/`max_vocab` (or never seen) are not in `otoi`; map them to id 0
        # instead of raising `KeyError`. `make_vocab` puts the first special token at index 0 -
        # presumably the unknown token; confirm against `defaults.text_spec_tok`.
        return [self.otoi.get(o_, 0) for o_ in o.split(self.sep)]
    # Look each id up in `vocab` and join the tokens back with `sep`
    def decodes(self, o)->TokenizedText: return self.sep.join([self.vocab[o_] for o_ in o])

In [None]:
# With `min_freq=1` every token of both sentences is kept, next to the special tokens and the 'xxfake' padding
num = Numericalize(sep=' ', min_freq=1)
num.setup(L('This is an example of text', 'this is another text'))
test_eq(set(num.vocab), {*defaults.text_spec_tok, *'This is an example of text this another xxfake'.split()})
assert not len(num.vocab) % 8

# With `min_freq=2` only the tokens appearing in both sentences ('is' and 'text') survive
num = Numericalize(sep=' ', min_freq=2)
num.setup(L('This is an example of text', 'this is another text'))
test_eq(set(num.vocab), {*defaults.text_spec_tok, *'is text xxfake'.split()})
assert not len(num.vocab) % 8

In [None]:
# Get the IMDB sample through `untar_data` and read its `texts.csv` into a DataFrame
path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/'texts.csv')
# Run `tokenize_df` on the 'text' column; `count` is passed to `make_vocab` in a later cell
df_tok,count = tokenize_df(df, 'text')

In [None]:
# No vocab is given, so `setup` builds one from the 'text' column with the defaults (min_freq=3, max_vocab=60000)
num = Numericalize()
num.setup(df_tok['text'].values)

In [None]:
# Size of the vocab built by `setup`, 'xxfake' padding included
len(num.vocab)

7088

In [None]:
# Build a vocab directly from `count` with the same default settings; its length can be compared with `len(num.vocab)`
v1 = make_vocab(count)
len(v1)

7088