In [None]:
from nb_200 import *

## Benchmarking different tokenization approaches

### fastai v1.0

In [None]:
from fastai.text import *

In [None]:
path = untar_data(URLs.IMDB)

In [None]:
# Build a TextList from the IMDB folder with the open-file and tokenize processors attached,
# keeping only the items under the train, test and unsup sub-folders.
il = (TextList.from_folder(path, processor=[OpenFileProcessor(), TokenizeProcessor()])
              .filter_by_folder(include=['train', 'test', 'unsup']))

In [None]:
opener = OpenFileProcessor()
opener.process(il)

In [None]:
tokenizer = TokenizeProcessor()

In [None]:
%time tokenizer.process(il)

Memory peak at 3.42G (389MB without the kernel)

In [None]:
from time import time

In [None]:
time()

In [None]:
time()

### dev_course nb 12

In [None]:
from exp.nb_12 import *

In [None]:
path = datasets.untar_data(datasets.URLs.IMDB)

In [None]:
il = TextList.from_files(path, include=['train', 'test', 'unsup'])

In [None]:
tp = TokenizeProcessor()

In [None]:
texts = [read_file(f) for f in il.items]

In [None]:
%time tokens = tp(texts)

Peak at 4.6G

Doesn't kill process each time

### Other

In [None]:
from exp.nb_12 import *

In [None]:
path = datasets.untar_data(datasets.URLs.IMDB)

In [None]:
il = TextList.from_files(path, include=['train', 'test', 'unsup'])

In [None]:
from multiprocessing import Process, Queue, cpu_count

In [None]:
def text_gen(fnames):
    "Lazily yield the text of each file in `fnames` after passing it through every rule in `default_pre_rules`."
    for fname in fnames:
        # Read the whole file first so the handle is closed before control
        # leaves this generator at `yield`.
        with open(fname, 'r') as f: txt = f.read()
        # The rule variable has its own name so it no longer shadows the file name.
        for rule in default_pre_rules: txt = rule(txt)
        yield txt

In [None]:
def process_files(fnames, data_queue, progress_queue, lang='en', batch_size=5000):
    """Tokenize the files in `fnames` with a blank spaCy pipeline.

    Puts `1` on `progress_queue` after each document, and puts the full list of
    token lists on `data_queue` once every document has been processed.
    """
    nlp = spacy.blank(lang, disable=["parser", "tagger", "ner"])
    # Register every special token as a tokenizer special case.
    for special in default_spec_tok:
        nlp.tokenizer.add_special_case(special, [{ORTH: special}])
    all_toks = []
    for doc in nlp.pipe(text_gen(fnames), batch_size=batch_size):
        doc_toks = [tok.text for tok in doc]
        for rule in default_post_rules:
            doc_toks = rule(doc_toks)
        all_toks.append(doc_toks)
        progress_queue.put(1)  # one tick per finished document
    data_queue.put(all_toks)

In [None]:
def tokenize(fnames, lang='en', n_workers=4, chunk_size=5000):
    """Tokenize the files in `fnames` in `n_workers` parallel processes.

    `fnames` is split into `n_workers` contiguous batches, each handled by
    `process_files` in its own process. Returns the list of token lists in the
    same order as `fnames`.
    """
    batches = np.array_split(fnames, n_workers)
    progress_queue = Queue(maxsize=n_workers)
    # One result queue per worker: reading them in batch order keeps the output
    # aligned with `fnames`. A single shared queue would deliver the batches in
    # whatever order the workers happen to finish.
    data_queues = [Queue(maxsize=1) for _ in batches]
    processes = [Process(target=process_files,
                         args=(batch, data_queue, progress_queue, lang, chunk_size))
                 for batch,data_queue in zip(batches, data_queues)]
    for p in processes: p.start()
    # Each worker puts one item on `progress_queue` per document.
    for _ in progress_bar(fnames): progress_queue.get()
    tokens = []
    # Drain the results before joining: a worker cannot exit while its queued
    # data is still waiting to be flushed.
    for data_queue in data_queues: tokens += data_queue.get()
    for p in processes: p.join()
    return tokens

In [None]:
%time t = tokenize(il.items)

### Writing tokens in memory

In [None]:
from nb_200 import *

In [None]:
path = untar_data(URLs.IMDB)

In [None]:
from multiprocessing import Process, Queue
import spacy,html
from spacy.symbols import ORTH
from fastprogress import progress_bar
import pickle

Before even tokenizing, we will apply a bit of preprocessing on the texts to clean them up (we saw the one up there had some HTML code). These rules are applied before we split the sentences into tokens.

In [None]:
#export
# Special tokens inserted by the rules in this notebook:
#   UNK replaces '<unk>' (fixup_text), BOS/EOS wrap a token list (add_eos_bos), FLD marks a field (join_texts),
#   TK_REP / TK_WREP mark character / word repetitions, TK_UP marks ALL-CAPS tokens, TK_MAJ marks Capitalized tokens.
#   PAD is not used in this notebook -- presumably a padding token for batching; TODO confirm.
UNK, PAD, BOS, EOS, FLD, TK_REP, TK_WREP, TK_UP, TK_MAJ = "xxunk xxpad xxbos xxeos xxfld xxrep xxwrep xxup xxmaj".split()

def sub_br(t):
    "Replace every HTML line-break tag (`<br>`, `<br/>`, `<br />`, any letter case) in `t` with a newline"
    return re.sub(r'<\s*br\s*/?>', "\n", t, flags=re.IGNORECASE)

def spec_add_spaces(t):
    "Surround every `/` and `#` in `t` with one space on each side"
    slash_or_hash = re.compile(r'([/#])')
    return slash_or_hash.sub(r' \1 ', t)

def rm_useless_spaces(t):
    "Collapse every run of two or more spaces in `t` into a single space"
    multi_space = re.compile(' {2,}')
    return multi_space.sub(' ', t)

def replace_rep(t):
    "Replace a non-space character repeated 4 or more times in a row: cccc -> TK_REP 4 c"
    def _marker(m):
        # group 1 is the first occurrence, group 2 is every following repeat
        char, rest = m.group(1), m.group(2)
        count = len(rest) + 1
        return f' {TK_REP} {count} {char} '
    return re.sub(r'(\S)(\1{3,})', _marker, t)
    
def replace_wrep(t):
    "Replace a word repeated 4 or more times in a row: word word word word -> TK_WREP 4 word"
    def _replace_wrep(m):
        # `c` is the first word plus its trailing separator; `cc` is that same text
        # repeated verbatim (it is matched by a backreference).
        c,cc = m.groups()
        # Dividing the lengths gives the exact repeat count for any separator,
        # whereas splitting on whitespace miscounts e.g. 'ha-ha-ha-ha-'.
        return f' {TK_WREP} {len(cc)//len(c)+1} {c} '
    re_wrep = re.compile(r'(\b\w+\W+)(\1{3,})')
    return re_wrep.sub(_replace_wrep, t)

def fixup_text(x):
    "Clean up assorted encoding and markup leftovers seen in raw documents"
    substitutions = [
        ('#39;', "'"), ('amp;', '&'), ('#146;', "'"), ('nbsp;', ' '), ('#36;', '$'),
        ('\\n', "\n"), ('quot;', "'"), ('<br />', "\n"), ('\\"', '"'), ('<unk>', UNK),
        (' @.@ ', '.'), (' @-@ ', '-'), ('\\', ' \\ '),
    ]
    # Applied in order: the escaped-newline and escaped-quote rewrites have to run
    # before the last entry pads every remaining backslash with spaces.
    for old,new in substitutions: x = x.replace(old, new)
    # Decode the remaining HTML entities, then squeeze runs of spaces.
    return re.sub(r'  +', ' ', html.unescape(x))
    
# Rules applied, in this order, to the raw text before tokenization.
# `sub_br` has to run before `spec_add_spaces`: that rule rewrites `<br />` as `<br / >`,
# which the `sub_br` pattern no longer matches.
default_pre_rules = [fixup_text, sub_br, replace_rep, replace_wrep, spec_add_spaces, rm_useless_spaces]
# Special tokens registered with the tokenizer as special cases.
default_spec_tok = [UNK, PAD, BOS, EOS, FLD, TK_REP, TK_WREP, TK_UP, TK_MAJ]

In [None]:
replace_rep('cccc')

In [None]:
replace_wrep('word word word word word ')

These rules are applied after the tokenization, on the list of tokens.

In [None]:
#export
def replace_all_caps(x):
    "Lower-case every ALL CAPS token longer than one character and insert `TK_UP` in front of it."
    out = []
    for tok in x:
        shouting = len(tok) > 1 and tok.isupper()
        out.extend([TK_UP, tok.lower()] if shouting else [tok])
    return out

def deal_caps(x):
    "Lower-case every token, insert `TK_MAJ` in front of those that were Capitalized, and drop empty tokens."
    out = []
    for tok in x:
        if tok != '':
            head, tail = tok[0], tok[1:]
            # Capitalized: an upper-case first letter followed by a lower-case rest.
            capitalized = len(tok) > 1 and head.isupper() and tail.islower()
            if capitalized: out.append(TK_MAJ)
            out.append(tok.lower())
    return out

def add_eos_bos(x):
    "Return a new list made of `BOS`, the tokens of the list `x`, then `EOS`."
    return [BOS] + x + [EOS]

# Rules applied, in this order, to each list of tokens after tokenization.
# `replace_all_caps` must come before `deal_caps`: `deal_caps` lower-cases every token,
# so any ALL CAPS token would be gone before `replace_all_caps` could mark it with `TK_UP`.
default_post_rules = [replace_all_caps, deal_caps, add_eos_bos]

In [None]:
replace_all_caps(['I', 'AM', 'SHOUTING'])

In [None]:
deal_caps(['My', 'name', 'is', 'Jeremy'])

In [None]:
class BaseTokenizer():
    "Minimal tokenizer that splits each text on single spaces."
    # `lang` and `special_toks` are accepted and ignored. They now have defaults so the
    # class can be built with no arguments, the way `tok_items` calls `tok_func(**tok_kwargs)`.
    def __init__(self, lang='en', special_toks=None): pass
    def tokenize_chunk(self, chunk):
        "Return one list of tokens per text in `chunk`."
        return [t.split(' ') for t in chunk]
    def tokenize_pipe(self, items, chunksize=5000):
        "Lazily yield one list of tokens per text in `items`; `chunksize` is accepted to match `SpacyTokenizer.tokenize_pipe` and ignored."
        for t in items: yield t.split(' ')

In [None]:
class SpacyTokenizer():
    "Tokenizer backed by a blank spaCy pipeline for `lang`."
    def __init__(self, lang='en', special_toks=None):
        "`special_toks` lists the tokens to register as tokenizer special cases; it defaults to `default_spec_tok`."
        special_toks = ifnone(special_toks, default_spec_tok)
        self.nlp = spacy.blank(lang, disable=["parser", "tagger", "ner"])
        # Register the tokens the caller asked for. The loop used to iterate over
        # `default_spec_tok`, silently ignoring the `special_toks` argument.
        for w in special_toks: self.nlp.tokenizer.add_special_case(w, [{ORTH: w}])
    
    def tokenize_pipe(self, items, chunksize=5000):
        "Lazily yield the list of token texts of each text in `items`, feeding spaCy batches of `chunksize`."
        for doc in self.nlp.pipe(items, batch_size=chunksize):
            yield [d.text for d in doc]

In [None]:
def apply_rules(items, rules):
    "Lazily yield `apply_all(item, rules)` for each element of `items`."
    yield from map(lambda item: apply_all(item, rules), items)

In [None]:
def read_text(fname, encoding='utf8'):
    "Return the whole content of the text file `fname`, decoded with `encoding` (UTF-8 by default, independent of the locale)."
    with open(fname, 'r', encoding=encoding) as f: return f.read()

In [None]:
def tok_items(items, output_func, output_queue, data_queue, tok_func, pre_rules, post_rules,
                   progress_queue=None, chunksize=5000, **tok_kwargs):
    """Worker body: tokenize `items` and report the results through the two queues.

    Each item goes through `pre_rules`, the tokenizer built by `tok_func(**tok_kwargs)`,
    then `post_rules`. For every item, `output_func(item, tokens)` is put on
    `output_queue`. When all items are done, the Counter of every token seen is
    put on `data_queue`. `progress_queue` is accepted but not used.
    """
    tokenizer = tok_func(**tok_kwargs)
    prepared = apply_rules(items, pre_rules)
    token_stream = tokenizer.tokenize_pipe(prepared, chunksize=chunksize)
    counts = Counter()
    for pos,toks in enumerate(token_stream):
        toks = apply_all(toks, post_rules)
        result = output_func(items[pos], toks)
        output_queue.put(result)
        counts.update(Counter(toks))
    data_queue.put(counts)

In [None]:
def create_folders(path, output_dir, include=None):
    """Recreate the directory tree found under `path` inside `output_dir` (directories only).

    When `include` is given, only the top-level directories named in it are kept.
    Everywhere else, directories whose name starts with '.' are skipped.
    """
    output_dir = Path(output_dir)
    os.makedirs(output_dir, exist_ok=True)
    for walk_idx,(root,dirs,_) in enumerate(os.walk(path)):
        # The first entry produced by os.walk is `path` itself.
        if walk_idx == 0 and include is not None:
            keep = [name for name in dirs if name in include]
        else:
            keep = [name for name in dirs if not name.startswith('.')]
        # Assigning in place tells os.walk which sub-directories to descend into.
        dirs[:] = keep
        for name in keep:
            target = output_dir / (Path(root) / Path(name)).relative_to(path)
            os.makedirs(target, exist_ok=True)

In [None]:
def tok_folder(path, extensions=['.txt'], include=None, output_dir=None, n_workers=4, chunksize=5000,
               pre_rules=None, post_rules=None, tok_func=SpacyTokenizer, **tok_kwargs):
    """Tokenize every file under `path` in parallel and write the results to `output_dir`.

    Files are selected with `get_files` using `extensions` and `include`. The directory
    tree is mirrored in `output_dir`, which defaults to a sibling of `path` named
    `{path.name}_tok`. Each tokenized file is written as space-joined tokens at the same
    relative location. The Counter of all tokens is pickled to `output_dir/'counter.pkl'`.
    `pre_rules` / `post_rules` default to copies of `default_pre_rules` / `default_post_rules`;
    `read_text` is always prepended to the pre-rules. Extra keyword arguments go to `tok_func`.
    """
    path = Path(path)
    fnames = get_files(path, extensions=extensions, recurse=True, include=include)
    output_dir = Path(ifnone(output_dir, path.parent/f'{path.name}_tok'))
    create_folders(path, output_dir, include=include)
    pre_rules = [read_text] + listify(ifnone(pre_rules, default_pre_rules.copy()))
    post_rules = listify(ifnone(post_rules, default_post_rules.copy()))
    output_queue,data_queue = Queue(maxsize=n_workers),Queue(maxsize=n_workers)
    def _output(o, tok):
        # Written as UTF-8 explicitly so the result does not depend on the locale.
        with open(output_dir/o.relative_to(path), 'w', encoding='utf8') as f: f.write(' '.join(tok))
        return 1

    processes = [Process(target=tok_items,
                         args=(batch, _output, output_queue, data_queue, tok_func, pre_rules, post_rules),
                         kwargs={'chunksize': chunksize, **tok_kwargs})
                 for batch in np.array_split(fnames, n_workers)]

    for p in processes: p.start()
    counter = Counter()
    # Each worker puts one item on `output_queue` per file it has written.
    for _ in progress_bar(fnames): output_queue.get()
    # Collect the per-worker counters before joining the workers.
    for _ in processes: counter.update(data_queue.get())
    for p in processes: p.join()
    # Use a context manager so the pickle file is flushed and closed deterministically.
    with open(output_dir/'counter.pkl','wb') as f: pickle.dump(counter, f)

In [None]:
path = untar_data(URLs.IMDB)

In [None]:
# Tokenize the train/test/unsup folders. `output_dir` is left at None, so `tok_folder` writes
# to a sibling folder named f'{path.name}_tok' and pickles the token Counter there as counter.pkl.
tok_folder(path, include=['train', 'test', 'unsup'])

In [None]:
tfnames = get_files(path.parent/'imdb_tok', extensions=['.txt'], recurse=True)

In [None]:
tfnames[5]

In [None]:
read_text(tfnames[5])

In [None]:
def join_texts(idx, df, mark_fields=False):
    "Join the values of row `idx` of `df` with spaces, prefixing each with `FLD` and its column position when `mark_fields` is set."
    row = df.iloc[int(idx)].values
    parts = []
    for field_no,text in enumerate(row):
        prefix = f'{FLD} {field_no} ' if mark_fields else ''
        parts.append(prefix + text)
    return ' '.join(parts)

In [None]:
def tok_df(df, text_cols, n_workers=4, chunksize=5000, pre_rules=None, post_rules=None,
           mark_fields=None, tok_func=SpacyTokenizer, **tok_kwargs):
    """Tokenize the text columns `text_cols` of `df` in parallel.

    The texts of each row are first joined by `join_texts`. Fields are marked when
    `mark_fields` is true, which by default means when there is more than one text
    column. `pre_rules` / `post_rules` default to copies of `default_pre_rules` /
    `default_post_rules`. Extra keyword arguments are passed to `tok_func`.

    Returns `(outputs, counter)`: `outputs` is an object array holding the token list
    of row `i` at position `i`, and `counter` is the Counter of all tokens.
    """
    text_cols = listify(text_cols)
    mark_fields = ifnone(mark_fields, len(text_cols) > 1)
    pre_rules = listify(ifnone(pre_rules, default_pre_rules.copy()))
    # The first rule turns a row position into the joined text of that row.
    pre_rules = [partial(join_texts, df=df[text_cols], mark_fields=mark_fields)] + pre_rules
    post_rules = listify(ifnone(post_rules, default_post_rules.copy()))

    output_queue,data_queue = Queue(maxsize=n_workers),Queue(maxsize=n_workers)
    def _output(o, tok): return (o,tok)

    processes = [Process(target=tok_items,
                         args=(batch, _output, output_queue, data_queue, tok_func, pre_rules, post_rules),
                         kwargs={'chunksize': chunksize, **tok_kwargs})
                 for batch in np.array_split(range(len(df)), n_workers)]

    for p in processes: p.start()
    # The builtin `object` replaces the `np.object` alias, which newer NumPy releases no longer provide.
    outputs,counter = np.zeros(len(df), dtype=object),Counter()
    # Results arrive in any order; the row position sent back with each one puts it in place.
    for _ in progress_bar(range(len(df))):
        i,tok = output_queue.get()
        outputs[i] = tok
    for _ in processes: counter.update(data_queue.get())
    for p in processes: p.join()
    return outputs, counter

In [None]:
path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/'texts.csv')

In [None]:
df.head()

In [None]:
# `tok_df` returns an (outputs, counter) tuple, so `texts[0]` is the array of per-row
# token lists and `texts[1]` is the Counter of all tokens.
texts = tok_df(df, text_cols='text')

In [None]:
df.head()

In [None]:
texts[0][1]