In [1]:
!pip install -Uqq fastbook
import fastbook
# Run fastbook's notebook setup before any other cell.
# NOTE(review): the recorded output ("Mounted at /content/gdrive") suggests this
# mounts Google Drive when run on Colab — confirm.
fastbook.setup_book()

Mounted at /content/gdrive


In [2]:
from IPython.display import display, HTML

How to use the mid-level API to handle data when the TextBlock API doesn't work.

In [3]:
from fastai.text.all import *

# The high-level factory works very well with the IMDB data: pass it the
# dataset folder and the name of the folder to use for validation.
dls = TextDataLoaders.from_folder(
    untar_data(URLs.IMDB),
    valid='test',
)

In [4]:
# The same DataLoaders can also be built with the DataBlock API.

path = untar_data(URLs.IMDB)
dls = DataBlock(
    blocks=(TextBlock.from_folder(path), CategoryBlock),
    get_items=partial(get_text_files, folders=['train', 'test']),
    get_y=parent_label,
    splitter=GrandparentSplitter(valid_name='test'),
).dataloaders(path)

But these might not be flexible enough. For debugging purposes, we might need to apply just some of the transforms that come with these data blocks. Or we might want to create a DataLoaders that is not supported by fastai, so we will dig into the DataBlock API.

### Transforms

In [5]:
# Collect the text files found under the 'train' and 'test' folders.
files = get_text_files(path, folders=['train', 'test'])

# Read the first 2000 files into memory as a sample of raw texts.
# read_text() opens, reads and closes each file; the earlier `o.open().read()`
# never closed the handle explicitly and left 2000 files to the garbage collector.
# (Assumes the items are pathlib.Path objects — they already expose `.open()`.)
txts = L(o.read_text() for o in files[:2000])

In [6]:
# Tokenize the sample texts: create the tokenizer for the dataset folder,
# run its setup on the texts, then apply it to every text.
tok = Tokenizer.from_folder(path)
tok.setup(txts)
toks = txts.map(tok)
# Display the tokens of the first text.
toks[0]

(#68) ['xxbos','xxmaj','the','first','xxmaj','shiloh','film','was','enjoyable','by'...]

In [7]:
# Numericalize: setup builds the vocab from our tokenized corpus, after which
# each token list is converted to a tensor of integer ids.
num = Numericalize()
num.setup(toks)
nums = toks.map(num)
# Display the first 10 ids of the first text.
nums[0][:10]

tensor([   2,    8,    9,  111,    8,    0,   33,   25, 1564,   55])

In [8]:
# Transforms have a decode method; for Numericalize it gives back the
# string tokens for the ids.

num_dec = num.decode(nums[0][:10])
num_dec

(#10) ['xxbos','xxmaj','the','first','xxmaj','xxunk','film','was','enjoyable','by']

In [10]:
# Decoding the tokenizer is trickier, because tokenization may not be
# reversible: here decode returns the tokens joined into a single string.

tok.decode(num_dec)

'xxbos xxmaj the first xxmaj xxunk film was enjoyable by'

The decode method is used by fastai's show_batch and show_results
to convert predictions and mini-batches into a human-understandable representation.

In general, a Transform is an object that behaves like a function and has an optional setup method that will initialize some inner state (like the vocab inside num) and an optional decode that will reverse the function (this reversal may not be perfect, as we saw with tok).



In [11]:
# Calling the tokenizer on a tuple tokenizes each element and returns a tuple.
tok((txts[0],txts[1]))

((#68) ['xxbos','xxmaj','the','first','xxmaj','shiloh','film','was','enjoyable','by'...],
 (#371) ['xxbos','i','saw','this','film','when','it','first','came','out'...])

### creating your own transforms

In [12]:
# A Transform only converts objects of the annotated type; any other input
# is handed back unchanged.

def f(x:int):
    return x+1

tfm = Transform(f)
tfm(2), tfm(2.0)

(3, 2.0)

In [13]:
# For comparison, the plain function without Transform: Python does not
# enforce the annotation, so the float argument is incremented too.

def f(x:int):
    return x+1

f(2), f(2.0)

(3, 3.0)

In [14]:
# Transform can also be applied as a decorator. The float 2.0 is returned
# unchanged because the function is annotated for int only.
@Transform
def f(x:int):
    return x+1

f(2), f(2.0)

(3, 2.0)

In [15]:
class NormalizeMean(Transform):
    """Transform that subtracts the mean of the setup items and can add it back.

    The method parameters are deliberately left unannotated: Transform uses
    parameter annotations to decide which inputs it is applied to.
    """

    def setups(self, items):
        """Store the arithmetic mean of `items` in `self.mean`."""
        total, count = sum(items), len(items)
        self.mean = total/count

    def encodes(self, x):
        """Return `x` shifted down by the stored mean."""
        return x - self.mean

    def decodes(self, x):
        """Reverse `encodes` by adding the stored mean back to `x`."""
        return x + self.mean

In [16]:
# Round trip through NormalizeMean: setup computes the mean, calling the
# transform encodes the value, and decode should give the start value back.
tfm = NormalizeMean()
tfm.setup([1,2,3,4,5])
start = 2
y = tfm(start)
z = tfm.decode(y) # should be 2
# Display the stored mean, the encoded value and the decoded value.
tfm.mean, y, z

(3.0, -1.0, 2.0)

### Pipeline class

 to compose transforms together

In [17]:
# Compose the tokenizer and the numericalizer so that a single call goes
# from raw text to a tensor of ids.
tfms = Pipeline([tok,num])
t = tfms(txts[0])
t[:20]

tensor([   2,    8,    9,  111,    8,    0,   33,   25, 1564,   55, 1907,   30,  104,   30,  516,   10,    8,   20,   44,  495])

In [18]:
# Decode the Pipeline output back to text and show its first 100 characters.
tfms.decode(t)[:100]

'xxbos xxmaj the first xxmaj xxunk film was enjoyable by adults as well as children . xxmaj this one '

The only part that doesn't work the same way as in Transform is the setup. To properly set up a Pipeline of Transforms on some data, you need to use a TfmdLists.

### TfmdLists and Datasets: Transformed Collections

In [19]:
# TfmdLists automatically calls the setup method of each transform, in order.
tls = TfmdLists(files, [Tokenizer.from_folder(path), Numericalize])

In [20]:
# Indexing the TfmdLists gives the transformed item; show its first 20 ids.
t = tls[0]; t[:20]

tensor([    2,     8,     9,   107,     8, 36289,    32,    25,   756,    48,  1571,    27,    91,    27,   439,    10,     8,    20,    44,   530])

In [21]:
# Decode the item back to text and show its first 100 characters.
tls.decode(t)[:100]

'xxbos xxmaj the first xxmaj shiloh film was enjoyable by adults as well as children . xxmaj this one'

In [23]:
# TfmdLists also has a show method, which prints the decoded item.

tls.show(t)

xxbos xxmaj the first xxmaj shiloh film was enjoyable by adults as well as children . xxmaj this one starts with about an hour of filler where not much happens , with stilted dialogue ; only in the last act is there any significant action that really moves the plot along . xxmaj the dog is still cute , though , and young kids may enjoy it .


The TfmdLists is named with an "s" because it can handle a training and a validation set with a splits argument

In [25]:
# Use the first 80% of the file list for training and the rest for validation.
# The cut is purely positional (nothing is shuffled), so the split follows
# whatever order `files` is in.
cut = int(len(files)*0.8)
splits = [
    list(range(cut)),              # training indices: 0 .. cut-1
    list(range(cut, len(files))),  # validation indices: cut .. end
]
tls = TfmdLists(
    files,
    [Tokenizer.from_folder(path), Numericalize],
    splits=splits,
)

In [28]:
# Show the first five validation indices, then the first five training indices.
splits[1][:5], splits[0][:5]

([40000, 40001, 40002, 40003, 40004], [0, 1, 2, 3, 4])

In [30]:
# Index at which the validation part of the split begins.
cut

40000

In [31]:
# First 20 ids of the first item in the validation subset.
tls.valid[0][:20]

tensor([    2,     8,    77,    29,     8,   365,    15,   875,    73,    12,   214,  1340,    54,     8,    64,     9,   173,    39,   138, 16708])

If you have manually written a Transform that performs all of your preprocessing at once, turning raw items into a tuple with inputs and targets, then TfmdLists is the class you need. You can directly convert it to a DataLoaders object with the dataloaders method.

In [32]:
# Grab the label of every file from its parent folder.
lbls = files.map(parent_label)
lbls

(#50000) ['neg','neg','neg','neg','neg','neg','neg','neg','neg','neg'...]

In [33]:
# Categorize builds a vocab of the unique labels during setup; calling it
# then turns a label into its category index.
cat = Categorize()
cat.setup(lbls)
# Display the vocab and the encoded first label.
cat.vocab, cat(lbls[0])

(['neg', 'pos'], TensorCategory(0))

In [34]:
# Do the whole label setup through TfmdLists in one step.
# We do not need separate objects for inputs and targets, though; that is
# what Datasets is for.

tls_y = TfmdLists(files, [parent_label, Categorize()])
tls_y[0]

TensorCategory(0)

### Datasets

In [35]:
# Datasets takes one list of transforms per output: x_tfms produce the input
# and y_tfms the target, so indexing yields an (x, y) pair.
x_tfms = [Tokenizer.from_folder(path),Numericalize]
y_tfms = [parent_label,Categorize()]
dsets = Datasets(files, [x_tfms, y_tfms], splits=splits)
x,y = dsets.valid[0]
# Display the first 20 input ids and the target of the first validation item.
x[:20],y

(tensor([    2,     8,    77,    29,     8,   365,    15,   875,    73,    12,   214,  1340,    54,     8,    64,     9,   173,    39,   138, 16708]),
 TensorCategory(1))

In [36]:
# NOTE(review): this cell repeats the previous cell and rebuilds the same
# Datasets a second time; consider deleting one of the two.
x_tfms = [Tokenizer.from_folder(path), Numericalize]
y_tfms = [parent_label, Categorize()]
dsets = Datasets(files, [x_tfms, y_tfms], splits=splits)
x,y = dsets.valid[0]
x[:20],y

(tensor([    2,     8,    77,    29,     8,   365,    15,   875,    73,    12,   214,  1340,    54,     8,    64,     9,   173,    39,   138, 16708]),
 TensorCategory(1))

In [37]:
# Decode a validation item back into a readable form.
t = dsets.valid[0]
dsets.decode(t)

("xxbos xxmaj time for xxmaj hollywood to sit up and take notice ! xxmaj if the actors are acting snooty , all you need to do is get the animators who worked on this little marvel . xxmaj renaissance is probably the first animation flick which makes you forget that you are not seeing human beings . xxmaj although the voice overs by the cast ( craig , mccormack , xxmaj pryce etc . ) are some of the best i have ever heard but even then the emotions portrayed by the ' cartoons ' are unnerving . \n\n xxmaj this style of animation is not very new but the use of light and shadows makes the movie a living painting . xxmaj ironically , such technical wizardry makes you forget that this is actually a very very nice movie . xxmaj the pacing and plot development are marvelous and the dialogs crisp . \n\n xxmaj plot : xxmaj disappearance of a mega corporation 's top employee unravels a tale of deceit and corruption with a xxmaj cold hearted hero at the helm . xxmaj ca n't say much without giving i

In [38]:
# The last step is calling dataloaders, with a batch size of 64.
# NOTE(review): pad_input presumably pads the texts in a batch to a common
# length before they are collated — confirm against the fastai docs.
dls = dsets.dataloaders(bs=64, before_batch=pad_input)

Here is the full code:

In [40]:
# Full mid-level recipe: transforms, files, splits, Datasets, DataLoaders.
tfms = [
    [Tokenizer.from_folder(path), Numericalize],  # input pipeline
    [parent_label, Categorize],                   # target pipeline
]
files = get_text_files(path, folders=['train', 'test'])
# Split by grandparent folder, using the 'test' folder for validation.
splits = GrandparentSplitter(valid_name='test')(files)
dsets = Datasets(files, tfms, splits=splits)
dls = dsets.dataloaders(
    dl_type=SortedDL,
    before_batch=pad_input,
)

In [41]:
# This is the same as the mid-level code above, written with the DataBlock API.

path = untar_data(URLs.IMDB)
dls = DataBlock(
    blocks=(TextBlock.from_folder(path), CategoryBlock),
    get_items=partial(get_text_files, folders=['train', 'test']),
    get_y=parent_label,
    splitter=GrandparentSplitter(valid_name='test'),
).dataloaders(path)

In [44]:
# Display a few decoded samples (text and category) from a batch.
dls.show_batch()

Unnamed: 0,text,category
0,"xxbos xxmaj match 1 : xxmaj tag xxmaj team xxmaj table xxmaj match xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley vs xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley started things off with a xxmaj tag xxmaj team xxmaj table xxmaj match against xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit . xxmaj according to the rules of the match , both opponents have to go through tables in order to get the win . xxmaj benoit and xxmaj guerrero heated up early on by taking turns hammering first xxmaj spike and then xxmaj bubba xxmaj ray . a xxmaj german xxunk by xxmaj benoit to xxmaj bubba took the wind out of the xxmaj dudley brother . xxmaj spike tried to help his brother , but the referee restrained him while xxmaj benoit and xxmaj guerrero",pos
1,xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad,pos
2,xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad,neg
3,xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad,pos
4,xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad,neg
5,xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad,pos
6,xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad,neg
7,xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad,pos
8,xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad,neg
