In [None]:
from nb_200 import *

## Benchmarking different tokenization approaches

### fastai v1.0

In [None]:
from fastai.text import *

In [None]:
path = untar_data(URLs.IMDB)

In [None]:
# Build the item list from `path`, restricted to the 'train', 'test' and 'unsup' folders.
# The processors are passed here, but they are also instantiated and run by hand in the
# cells below so that tokenization can be timed on its own.
il = (TextList.from_folder(path, processor=[OpenFileProcessor(), TokenizeProcessor()])
              .filter_by_folder(include=['train', 'test', 'unsup']))

In [None]:
opener = OpenFileProcessor()
opener.process(il)

In [None]:
tokenizer = TokenizeProcessor()

In [None]:
%time tokenizer.process(il)

Memory peak at 3.42G (389MB without the kernel)

### dev_course nb 12

In [None]:
from exp.nb_12 import *

In [None]:
path = datasets.untar_data(datasets.URLs.IMDB)

In [None]:
il = TextList.from_files(path, include=['train', 'test', 'unsup'])

In [None]:
tp = TokenizeProcessor()

In [None]:
# Read every item eagerly: the full list of texts is built before tokenization starts.
texts = [read_file(f) for f in il.items]

In [None]:
%time tokens = tp(texts)

Peak at 4.6G

Doesn't kill process each time

### Other

In [None]:
from exp.nb_12 import *

In [None]:
path = datasets.untar_data(datasets.URLs.IMDB)

In [None]:
il = TextList.from_files(path, include=['train', 'test', 'unsup'])

In [None]:
from multiprocessing import Process, Queue, cpu_count

In [None]:
def text_gen(fnames):
    """Lazily yield the pre-processed text of each file in `fnames`.

    Each file is read in full, then every callable in `default_pre_rules` is
    applied to the text, in order, before the result is yielded.
    """
    for fname in fnames:
        # Explicit encoding so reading does not depend on the platform's locale default.
        with open(fname, 'r', encoding='utf8') as f:
            txt = f.read()
        # The file is closed before yielding, so no handle stays open while the
        # consumer is working on the text.
        for rule in default_pre_rules:
            txt = rule(txt)
        yield txt

In [None]:
def process_files(fnames, data_queue, progress_queue, lang='en', batch_size=5000):
    """Worker entry point: tokenize the files in `fnames` and send the tokens back.

    Puts `1` on `progress_queue` after each text and, once every text has been
    handled, puts the complete list of token lists on `data_queue` as one message.
    """
    nlp = spacy.blank(lang, disable=["parser", "tagger", "ner"])
    for special in default_spec_tok:
        nlp.tokenizer.add_special_case(special, [{ORTH: special}])
    all_tokens = []
    for doc in nlp.pipe(text_gen(fnames), batch_size=batch_size):
        doc_tokens = [tok.text for tok in doc]
        for rule in default_post_rules:
            doc_tokens = rule(doc_tokens)
        all_tokens.append(doc_tokens)
        # One tick per text so the parent can drive its progress bar.
        progress_queue.put(1)
    data_queue.put(all_tokens)

In [None]:
def tokenize(fnames, lang='en', n_workers=4, chunk_size=5000):
    """Tokenize the files in `fnames` with `n_workers` processes.

    `fnames` is split into `n_workers` contiguous batches, each handled by
    `process_files` in its own process. Returns one list of tokens per file,
    in the same order as `fnames`.
    """
    progress_queue = Queue(maxsize=n_workers)
    # One result queue per worker: reading a single shared queue would return the
    # batches in completion order, so the tokens would no longer line up with `fnames`.
    data_queues = [Queue(maxsize=1) for _ in range(n_workers)]
    processes = [Process(target=process_files,
                         args=(batch, data_queues[i], progress_queue, lang, chunk_size))
                 for i,batch in enumerate(np.array_split(fnames, n_workers))]
    for p in processes: p.start()
    # Each worker puts one item on `progress_queue` per text; consume exactly that many.
    for _ in progress_bar(fnames): progress_queue.get()
    tokens = []
    # Results must be fetched before joining, otherwise a worker can block while
    # flushing its queue and the join never returns.
    for q in data_queues: tokens += q.get()
    for p in processes: p.join()
    return tokens

In [None]:
%time t = tokenize(il.items)

### Writing tokens to disk to save memory

In [None]:
from exp.nb_12 import *

In [None]:
path = datasets.untar_data(datasets.URLs.IMDB)

In [None]:
il = TextList.from_files(path, include=['train', 'test', 'unsup'])

In [None]:
from multiprocessing import Process, Queue, cpu_count

In [None]:
def text_gen(fnames):
    """Lazily yield the pre-processed text of each file in `fnames`.

    Each file is read in full, then every callable in `default_pre_rules` is
    applied to the text, in order, before the result is yielded.
    """
    for fname in fnames:
        # Explicit encoding so reading does not depend on the platform's locale default.
        with open(fname, 'r', encoding='utf8') as f:
            txt = f.read()
        # The file is closed before yielding, so no handle stays open while the
        # consumer is working on the text.
        for rule in default_pre_rules:
            txt = rule(txt)
        yield txt

In [None]:
def process_files(fnames, output_file, data_queue, progress_queue, lang='en', batch_size=5000):
    """Worker entry point: tokenize `fnames`, write the tokens to `output_file`, count them.

    The tokens of every text are written space-separated, each text followed by a
    single space, so the whole file is one continuous stream. Puts `1` on
    `progress_queue` after each text and finally puts the token `Counter` on `data_queue`.
    """
    nlp = spacy.blank(lang, disable=["parser", "tagger", "ner"])
    for special in default_spec_tok:
        nlp.tokenizer.add_special_case(special, [{ORTH: special}])
    token_counts = Counter()
    with open(output_file, 'w') as out:
        for doc in nlp.pipe(text_gen(fnames), batch_size=batch_size):
            tokens = [tok.text for tok in doc]
            for rule in default_post_rules:
                tokens = rule(tokens)
            # Trailing space keeps the last token of this text separate from the
            # first token of the next one.
            out.write(' '.join(tokens) + ' ')
            progress_queue.put(1)
            token_counts.update(tokens)
    data_queue.put(token_counts)

In [None]:
def tokenize(fnames, lang='en', n_workers=4, chunk_size=5000, tokens_dir='tmp'):
    """Tokenize `fnames` with `n_workers` processes, writing the tokens to `tokens_dir`.

    Worker `i` writes its batch to `tokens_dir/tokens{i}.txt`. Returns a `Counter`
    with the token frequencies summed over all workers.
    """
    tokens_dir = Path(tokens_dir)
    os.makedirs(tokens_dir, exist_ok=True)
    progress_queue = Queue(maxsize=n_workers)
    data_queue = Queue(maxsize=n_workers)
    processes = []
    for i, batch in enumerate(np.array_split(fnames, n_workers)):
        worker_args = (batch, tokens_dir/f'tokens{i}.txt', data_queue, progress_queue, lang, chunk_size)
        processes.append(Process(target=process_files, args=worker_args))

    for p in processes: p.start()
    # Each worker puts one item on `progress_queue` per text; consume exactly that many.
    for _ in progress_bar(range(len(fnames))): progress_queue.get()
    counter = Counter()
    # Fetch every worker's counts before joining.
    for _ in processes: counter.update(data_queue.get())
    for p in processes: p.join()
    return counter

In [None]:
%time counter = tokenize(il.items)

In [None]:
# Vocabulary: take the 60000 most common tokens and keep those seen at least twice.
itos = [v for (v,c) in counter.most_common()[:60000] if c >= 2]
# Move the special tokens to the front; iterating in reverse while inserting at
# index 0 leaves them in the same order as in `default_spec_tok`.
for o in reversed(default_spec_tok):
    if o in itos: itos.remove(o)
    itos.insert(0, o)
# Reverse mapping; the defaultdict sends any token missing from `itos` to index 0.
stoi = collections.defaultdict(int,{s:i for i,s in enumerate(itos)})

Tokens are all written in a row (some IMDB texts contain \n, which may create lines, but we can't rely on that) and we don't want to read everything in one go since the goal is to spare memory, so we read by chunks.

In [None]:
def read_chunks(file, chunk_size=1024):
    """Yield successive pieces of `file`, each read with `file.read(chunk_size)`.

    Iteration stops at the first empty read, i.e. at end of file.
    """
    data = file.read(chunk_size)
    while data:
        yield data
        data = file.read(chunk_size)

The EOS token tells us when we have reached the end of a text.

In [None]:
def process_tokens(fname, stoi, data_queue, pid):
    """Worker entry point: turn the token stream in `fname` into lists of ids.

    The file is read chunk by chunk and split on single spaces. Every token is
    mapped through `stoi`; each time `EOS` is met, the ids gathered so far are
    closed off as one text. Puts `[pid, all_ids]` on `data_queue`, where `all_ids`
    holds one list of ids per text.
    """
    all_ids,ids,last = [],[],''

    def _add(tok):
        # Record one token and close the current text when it is EOS.
        nonlocal ids
        ids.append(stoi[tok])
        if tok == EOS:
            all_ids.append(ids)
            ids = []

    with open(fname, 'r') as f:
        for chunk in read_chunks(f):
            stream = (last+chunk).split(' ')
            # The last piece may be a token cut in half by the chunk boundary,
            # so it is carried over and prepended to the next chunk.
            for t in stream[:-1]: _add(t)
            last = stream[-1]
    # If the file does not end with a space, the final token is still pending here;
    # previously it was silently dropped.
    if last: _add(last)
    # Likewise keep any ids gathered after the last EOS instead of losing them.
    if ids: all_ids.append(ids)
    data_queue.put([pid, all_ids])

In [None]:
def numericalize(stoi, tokens_dir='tmp', n_workers=4):
    """Convert the token files in `tokens_dir` to ids using `n_workers` processes.

    Worker `i` handles `tokens_dir/tokens{i}.txt` through `process_tokens`.
    Returns a 1-D object array with one list of ids per text, ordered by worker index.
    """
    data_queue = Queue(maxsize=n_workers)
    tokens_dir = Path(tokens_dir)
    processes = [Process(target=process_tokens,
                         args=(tokens_dir/f'tokens{i}.txt', stoi, data_queue, i))
                 for i in range(n_workers)]
    for p in processes: p.start()
    # Fetch every result before joining.
    results = [data_queue.get() for _ in processes]
    for p in processes: p.join()
    # Workers finish in arbitrary order; restore worker order using the pid they sent back.
    results.sort(key = lambda x:x[0])
    texts = [text_ids for _, worker_texts in results for text_ids in worker_texts]
    # Build the object array explicitly. Letting NumPy convert lists of unequal length
    # is rejected by recent versions, and lists of equal length would silently become
    # a 2-D array instead of an array of lists.
    out = np.empty(len(texts), dtype=object)
    for i, text_ids in enumerate(texts): out[i] = text_ids
    return out

In [None]:
%time ids = numericalize(stoi)

In [None]:
len(ids)

In [None]:
# Sanity check: map the ids of the last entry back through `itos`, to compare with
# the raw content of the last file printed in the next cell.
' '.join([itos[i] for i in ids[-1]])

In [None]:
with open(il.items[-1], 'r') as f: print(f.read())