# Preprocess text

In [16]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
#export
from exp.nb_11a import *

## Loading and splitting the dataset

In [18]:
path = datasets.untar_data(datasets.URLs.IMDB)

In [19]:
path.ls()

[PosixPath('/home/fabiograetz/.fastai/data/imdb/README'),
 PosixPath('/home/fabiograetz/.fastai/data/imdb/train'),
 PosixPath('/home/fabiograetz/.fastai/data/imdb/test'),
 PosixPath('/home/fabiograetz/.fastai/data/imdb/unsup'),
 PosixPath('/home/fabiograetz/.fastai/data/imdb/tmp_clas'),
 PosixPath('/home/fabiograetz/.fastai/data/imdb/imdb.vocab'),
 PosixPath('/home/fabiograetz/.fastai/data/imdb/tmp_lm')]

In [20]:
#export
def read_file(fn):
    "Return the whole content of the text file `fn`, decoded as UTF-8."
    with open(fn, mode='r', encoding='utf8') as handle:
        content = handle.read()
    return content

In [21]:
#export
class TextList(ItemList):
    "`ItemList` whose items are texts, given either as strings or as `Path`s of files to read."
    @classmethod
    def from_files(cls, path, extensions='.txt', recurse=True, include=None, **kwargs):
        "Build the list from the files `get_files` returns for `path`, `extensions`, `recurse` and `include`."
        fnames = get_files(path, extensions, recurse=recurse, include=include)
        return cls(fnames, path, **kwargs)

    def get(self, i):
        "Read and return the file content when `i` is a `Path`; return any other `i` unchanged."
        return read_file(i) if isinstance(i, Path) else i

In [22]:
il = TextList.from_files(path, include=['train', 'test', 'unsup'])

In [23]:
len(il)

100000

In [24]:
il

TextList (100000 items)
 [PosixPath('/home/fabiograetz/.fastai/data/imdb/train/neg/9809_2.txt'), PosixPath('/home/fabiograetz/.fastai/data/imdb/train/neg/7291_2.txt'), PosixPath('/home/fabiograetz/.fastai/data/imdb/train/neg/1279_3.txt'), PosixPath('/home/fabiograetz/.fastai/data/imdb/train/neg/7323_1.txt'), PosixPath('/home/fabiograetz/.fastai/data/imdb/train/neg/9921_3.txt'), PosixPath('/home/fabiograetz/.fastai/data/imdb/train/neg/1825_2.txt'), PosixPath('/home/fabiograetz/.fastai/data/imdb/train/neg/233_1.txt'), PosixPath('/home/fabiograetz/.fastai/data/imdb/train/neg/3324_3.txt'), PosixPath('/home/fabiograetz/.fastai/data/imdb/train/neg/9439_3.txt'), PosixPath('/home/fabiograetz/.fastai/data/imdb/train/neg/10967_4.txt')...]
 Path: /home/fabiograetz/.fastai/data/imdb

In [25]:
text = il[0]

In [26]:
text

"Some wonder why there weren't anymore Mrs. Murphy movies after this one. Will it's because this movie totally blew snot. Disney was not the right studio to run this film. MAYBE Touchstone (well, they're owned by Disney, but it'd be more adult). The film is too kid-ish, as the book series is not. The casting is all wrong for the characters. The characters don't even act the way they do in the books. And why was Tucker changed to a guy? He's a girl in the frigging books! Was this done to make the film appeal to boys? Sheesh. And where was Pewter, the gray cat? One of the funniest characters from the book is absent from this filth. Rita Mae Brown is a good writer, but letting Disney blow her work was wrong. An animated feature film, perhaps in the vane of Don Bluth's artwork would suit a better Mrs. Murphy film. Overall, I give this a 2, because at least Disney made a film from an under-appreciated book series. But, I wish they did better. Either way, I still have my books to entertain m

In [27]:
sd = SplitData.split_by_func(il, partial(random_splitter, p_valid=0.1))

In [28]:
sd

SplitData
Train: TextList (90111 items)
 [PosixPath('/home/fabiograetz/.fastai/data/imdb/train/neg/7291_2.txt'), PosixPath('/home/fabiograetz/.fastai/data/imdb/train/neg/1279_3.txt'), PosixPath('/home/fabiograetz/.fastai/data/imdb/train/neg/7323_1.txt'), PosixPath('/home/fabiograetz/.fastai/data/imdb/train/neg/9921_3.txt'), PosixPath('/home/fabiograetz/.fastai/data/imdb/train/neg/1825_2.txt'), PosixPath('/home/fabiograetz/.fastai/data/imdb/train/neg/233_1.txt'), PosixPath('/home/fabiograetz/.fastai/data/imdb/train/neg/3324_3.txt'), PosixPath('/home/fabiograetz/.fastai/data/imdb/train/neg/9439_3.txt'), PosixPath('/home/fabiograetz/.fastai/data/imdb/train/neg/10967_4.txt'), PosixPath('/home/fabiograetz/.fastai/data/imdb/train/neg/3729_2.txt')...]
 Path: /home/fabiograetz/.fastai/data/imdb
Valid: TextList (9889 items)
 [PosixPath('/home/fabiograetz/.fastai/data/imdb/train/neg/9809_2.txt'), PosixPath('/home/fabiograetz/.fastai/data/imdb/train/neg/3466_3.txt'), PosixPath('/home/fabiograetz/

## Tokenization

In [29]:
#export
import spacy, html

In [35]:
#export
# Special tokens used by the rules and processors of this notebook
UNK     = "xxunk"   # substituted for `<unk>` by `fixup_text`
PAD     = "xxpad"   # presumably the padding token -- not used in the cells shown here
BOS     = "xxbos"   # put in front of every text by `add_eos_bos`
EOS     = "xxeos"   # put at the end of every text by `add_eos_bos`
TK_REP  = "xxrep"   # announces a repeated character (see `replace_rep`)
TK_WREP = "xxwrep"  # announces a repeated word (see `replace_wrep`)
TK_UP   = "xxup"    # announces a token that was in ALL CAPS (see `replace_all_caps`)
TK_MAJ  = "xxmaj"   # announces a token that was Capitalized (see `deal_caps`)

def sub_br(t):
    "Turn every HTML line-break tag (`<br>`, `<br/>`, `<br />`, in any letter case) of `t` into a newline"
    return re.sub(r'<\s*br\s*/?>', "\n", t, flags=re.IGNORECASE)

def spec_add_spaces(t):
    "Put one space before and one space after every `/` and `#` of `t`"
    special_chars = re.compile(r'([/#])')
    return special_chars.sub(r' \1 ', t)

def rm_useless_spaces(t):
    "Collapse every run of two or more space characters of `t` into a single space"
    space_run = re.compile(' {2,}')
    return space_run.sub(' ', t)

def replace_rep(t):
    "Replace a non-space character repeated 4 or more times in a row: cccc -> TK_REP 4 c"
    def _as_rep_token(match):
        # group 1 is the repeated character, group 2 the 3 or more copies that follow it
        char, rest = match.groups()
        n_repeats = len(rest) + 1
        return f' {TK_REP} {n_repeats} {char} '
    return re.sub(r'(\S)(\1{3,})', _as_rep_token, t)
    
def replace_wrep(t):
    "Replace a word repeated 4 or more times in a row: word word word word -> TK_WREP 4 word"
    def _as_wrep_token(match):
        # group 1 is the word together with the non-word characters that follow it (so they
        # end up in the output as well); group 2 holds the 3 or more further copies
        word, rest = match.groups()
        n_repeats = len(rest.split()) + 1
        return f' {TK_WREP} {n_repeats} {word} '
    return re.sub(r'(\b\w+\W+)(\1{3,})', _as_wrep_token, t)

def fixup_text(x):
    "Clean up various messy things seen in documents: entity leftovers, escaped characters, dataset markers"
    # (old, new) pairs, applied one after the other -- the order is significant
    substitutions = (
        ('#39;', "'"), ('amp;', '&'), ('#146;', "'"), ('nbsp;', ' '), ('#36;', '$'),
        ('\\n', "\n"), ('quot;', "'"), ('<br />', "\n"), ('\\"', '"'), ('<unk>', UNK),
        (' @.@ ', '.'), (' @-@ ', '-'), ('\\', ' \\ '),
    )
    for old, new in substitutions:
        x = x.replace(old, new)
    # Unescape the HTML entities that are still intact, then squeeze runs of spaces
    return re.sub(r'  +', ' ', html.unescape(x))

In [36]:
#export
# Rules applied to the raw text before tokenization (assumes `compose` applies them in list order).
# `sub_br` has to run before `spec_add_spaces`: once spaces have been put around `/`, a tag such
# as `<br/>` or `<BR />` reads `<br / >` and the pattern of `sub_br` does not match it any more.
default_pre_rules = [fixup_text, replace_rep, replace_wrep, sub_br, spec_add_spaces, rm_useless_spaces]

# Special tokens: registered as special cases of the tokenizer by `TokenizeProcessor` and moved
# to the front of the vocab by `NumericalizeProcessor`.
default_spec_tok = [UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ]

In [37]:
replace_rep('aaaa')

' xxrep 4 a '

In [39]:
replace_wrep("test test test test ")

' xxwrep 4 test  '

*Rules that are applied after tokenization (they take and return a list of tokens instead of a string):*

In [51]:
#export
def replace_all_caps(x):
    "Lowercase every token of 2+ characters written in ALL CAPS and insert `TK_UP` in front of it."
    out = []
    for tok in x:
        if len(tok) > 1 and tok.isupper():
            out.extend((TK_UP, tok.lower()))
        else:
            out.append(tok)
    return out

In [52]:
replace_all_caps(["AAA", "bbb", "Fabio" ,"FABIO"])

['xxup', 'aaa', 'bbb', 'Fabio', 'xxup', 'fabio']

In [54]:
#export
def deal_caps(x):
    "Lowercase every token (empty ones are dropped) and add `TK_MAJ` before those that were Capitalized."
    out = []
    for tok in x:
        if tok == '': continue
        # Capitalized = at least 2 characters, upper-case first one, lower-case remainder
        capitalized = len(tok) > 1 and tok[0].isupper() and tok[1:].islower()
        if capitalized: out.append(TK_MAJ)
        out.append(tok.lower())
    return out

In [55]:
deal_caps(["AAA", "bbb", "Fabio" ,"FABIO"])

['aaa', 'bbb', 'xxmaj', 'fabio', 'fabio']

In [56]:
#export
def add_eos_bos(x):
    "Return a new list made of `BOS`, the tokens of the list `x`, then `EOS`."
    start, end = [BOS], [EOS]
    return start + x + end

In [65]:
#export
# Rules applied to the list of tokens after tokenization.
# The order was changed with respect to fastai: `replace_all_caps` comes before `deal_caps`,
# because `deal_caps` lowercases every token and ALL CAPS tokens are not handled correctly otherwise.
default_post_rules = [
    replace_all_caps,
    deal_caps,
    add_eos_bos,
]

In [66]:
x = ["AAA", "bbb", "Fabio" ,"FABIO"]

for f in default_post_rules:
    x = f(x)

In [67]:
x

['xxbos', 'xxup', 'aaa', 'bbb', 'xxmaj', 'fabio', 'xxup', 'fabio', 'xxeos']

In [155]:
#export
from spacy.symbols import ORTH
from concurrent.futures import ProcessPoolExecutor

In [363]:
#export
def parallel(func, arr, max_workers=4):
    """Apply `func` to every `(index, item)` pair of `arr` and return the list of results.

    With `max_workers < 2` the calls run one after the other in the current process, otherwise
    they are distributed over a `ProcessPoolExecutor` with `max_workers` processes. Progress is
    shown with `progress_bar` in both cases.

    Both paths return the list of results (an empty list for an empty `arr`). The sequential
    path used to return `None` when no result was different from `None`, which made callers
    that combine the results (e.g. `sum(toks, [])`) fail on an empty input.
    """
    if max_workers < 2:
        return list(progress_bar(map(func, enumerate(arr)), total=len(arr)))
    with ProcessPoolExecutor(max_workers=max_workers) as ex:
        return list(progress_bar(ex.map(func, enumerate(arr)), total=len(arr)))

In [364]:
#export
class TokenizeProcessor(Processor):
    "Tokenize texts with a blank spaCy tokenizer, applying `pre_rules` before and `post_rules` after it."
    def __init__(self, lang="en", chunksize=2000, pre_rules=None, post_rules=None, max_workers=4): 
        self.chunksize, self.max_workers = chunksize, max_workers
        self.tokenizer = spacy.blank(lang).tokenizer
        
        # Register every special token as a special case of the tokenizer
        for w in default_spec_tok:
            self.tokenizer.add_special_case(w, [{ORTH: w}])
        
        self.pre_rules  = default_pre_rules  if pre_rules  is None else pre_rules
        self.post_rules = default_post_rules if post_rules is None else post_rules

    def proc_chunk(self, args):
        "Tokenize one chunk; `args` is an `(index, chunk)` pair in which `chunk` is a list of strings."
        # The index is only there because `parallel` enumerates the chunks; it is not used
        _, chunk = args
        chunk = [compose(t, self.pre_rules) for t in chunk]  # list of strings
        docs = [[d.text for d in doc] for doc in self.tokenizer.pipe(chunk)]  # list of lists of tokens
        docs = [compose(t, self.post_rules) for t in docs]  # post rules work on lists of tokens
        return docs  # list of lists of tokens
 
    def __call__(self, items): 
        "Tokenize `items` (strings, or `Path`s of files to read); returns one list of tokens per item."
        # Nothing to tokenize -- also avoids indexing `items[0]` on an empty input
        if len(items) == 0: return []
        if isinstance(items[0], Path): items = [read_file(i) for i in items]
        # items is a list of strings
        chunks = [items[i: i+self.chunksize] for i in range(0, len(items), self.chunksize)]
        # chunks is a list of lists of strings

        toks = parallel(self.proc_chunk, chunks, max_workers=self.max_workers)
        # Flatten the per-chunk results in one pass (`sum(toks, [])` copies the list at each step)
        return [doc for chunk_toks in toks for doc in chunk_toks]
    
    def proc1(self, item):
        "Tokenize the single string `item`; returns its list of tokens."
        # `proc_chunk` unpacks an `(index, chunk)` pair, so the one-item chunk needs a dummy index
        return self.proc_chunk((0, [item]))[0]
    
    def deprocess(self, toks):
        "Join each list of tokens of `toks` back into a string."
        return [self.deproc1(tok) for tok in toks]

    def deproc1(self, tok):
        "Join the tokens of `tok` with single spaces."
        return " ".join(tok)

In [365]:
tp = TokenizeProcessor()

In [366]:
text[:200]

"Some wonder why there weren't anymore Mrs. Murphy movies after this one. Will it's because this movie totally blew snot. Disney was not the right studio to run this film. MAYBE Touchstone (well, they'"

In [370]:
' • '.join(tp(il[:10])[0])[:400]

"xxbos • xxmaj • some • wonder • why • there • were • n't • anymore • xxmaj • mrs. • xxmaj • murphy • movies • after • this • one • . • xxmaj • will • it • 's • because • this • movie • totally • blew • snot • . • xxmaj • disney • was • not • the • right • studio • to • run • this • film • . • xxup • maybe • xxmaj • touchstone • ( • well • , • they • 're • owned • by • xxmaj • disney • , • but • it"

## Numericalization

In [405]:
#export
import collections

class NumericalizeProcessor(Processor):
    "Map lists of tokens to lists of vocab indices, building the vocab on first use when none is given."
    def __init__(self, vocab=None, max_vocab=60000, min_freq=2):
        self.vocab, self.max_vocab, self.min_freq = vocab, max_vocab, min_freq
        
    def __call__(self, items):
        "Numericalize `items`, a list of lists of tokens; returns a list of lists of ints."
        # Define vocab on first use
        if self.vocab is None:
            # `collections` is imported in this cell; use it instead of relying on a bare
            # `Counter` being brought into scope by a star import
            freq = collections.Counter(p for o in items for p in o)
            # Keep the `max_vocab` most common tokens that occur at least `min_freq` times
            self.vocab = [o for o, c in freq.most_common(self.max_vocab) if c >= self.min_freq]
            
            # Put the special tokens at the front of the vocab, in the order of `default_spec_tok`
            for o in reversed(default_spec_tok):
                if o in self.vocab: self.vocab.remove(o)
                self.vocab.insert(0, o)
                
        if getattr(self, 'otoi', None) is None:
            # Tokens that are not in the vocab get index 0 (the default of `int`)
            self.otoi = collections.defaultdict(int, {v:k for k,v in enumerate(self.vocab)})
        
        return [self.proc1(o) for o in items]
    
    def proc1(self, item):
        "Map the list of tokens `item` to the list of their indices."
        return [self.otoi[o] for o in item]  # returns list of ints
    
    def deprocess(self, idxs):
        "Map `idxs`, a list of lists of ints, back to a list of lists of tokens."
        assert self.vocab is not None
        return [self.deproc1(idx) for idx in idxs]
    
    def deproc1(self, idx):
        "Map the list of ints `idx` back to the list of the corresponding tokens."
        return [self.vocab[i] for i in idx]

In [406]:
proc_tok, proc_num = TokenizeProcessor(max_workers=8), NumericalizeProcessor()

In [407]:
%time ll = label_by_func(sd, lambda x: 0, proc_x= [proc_tok, proc_num])

CPU times: user 16.9 s, sys: 2.17 s, total: 19.1 s
Wall time: 43.6 s


In [408]:
idxs = proc_num.proc1(["xxbos", "xxmaj", "some", "wonder", "why", "there", "were", "n't", "anymore"])

In [409]:
idxs

[2, 7, 65, 602, 154, 54, 86, 35, 1557]

In [410]:
proc_num.deproc1(idxs)

['xxbos', 'xxmaj', 'some', 'wonder', 'why', 'there', 'were', "n't", 'anymore']

In [417]:
print(ll.train.x_obj(0))

xxbos xxmaj in order to hold the public 's attention for three hours , we were treated not so much to a family 's romp through four generations and 120 years of xxmaj hungarian history , as to sexual liaisons with a sister , a sister - in - law and other xxunk . xxmaj oh yes , there was also a totally gratuitous rape . xxmaj having said all this , the first story of the relationship among the children of the patriarch was fresh and sensual - thanks to xxmaj jennifer xxmaj ehle . xxeos


In [425]:
proc_num.deproc1(ll.train[0][0])[:10]

['xxbos',
 'xxmaj',
 'in',
 'order',
 'to',
 'hold',
 'the',
 'public',
 "'s",
 'attention']

## Batching