## Imports

In [1]:
import html
import re
from collections import namedtuple
from copy import deepcopy
from multiprocessing import Pool, cpu_count
from pathlib import Path

In [2]:
import spacy
from spacy import symbols

## Global Variables

In [3]:
# Dataset locations, all rooted under the current user's home directory.
DATA_ROOT = Path.home().joinpath('data')
IMDB = DATA_ROOT.joinpath('aclImdb')
# Path named 'created' inside the dataset folder
# (presumably a marker that preparation already ran -- confirm against its users).
SENTINEL = IMDB.joinpath('created')

# Language-model folder and its pickled tokens / vocabulary.
LM_PATH = IMDB.joinpath('imdb_lm')
LM_TOKENS_PATH = LM_PATH.joinpath('tmp', 'tokens.pickle')
LM_VOCAB_PATH = LM_PATH.joinpath('tmp', 'vocab.pickle')

# Classifier folder, its pickled tokens / vocabulary, and the class names.
CLASS_PATH = IMDB.joinpath('imdb_class')
CLS_TOKENS_PATH = CLASS_PATH.joinpath('tmp', 'tokens.pickle')
CLS_VOCAB_PATH = CLASS_PATH.joinpath('tmp', 'vocab.pickle')
CLASSES = ['neg', 'pos', 'unsup']

# Matches a run of two or more spaces.
RE_SPACE = re.compile(r'  +')
RANDOM_SEED = 42

# Lightweight vocabulary record with three fields: itos, stoi, size.
Vocab = namedtuple('Vocab', ['itos', 'stoi', 'size'])

## Dataset Preparation

## Tokenization

In [4]:
# Special marker strings used by the preprocessing rules and the tokenizer.
BOS = 'xxbos'
FLD = 'xxfld'
UNK = 'xxunk'
PAD = 'xxpad'
# Markers emitted by deal_caps, replace_rep and replace_wrep respectively.
TK_UP = 'xxup'
TK_REP = 'xxrep'
TK_WREP = 'xxwrep'


def spec_add_spaces(t: str) -> str:
    """Pad every `/` and `#` in `t` with one space on each side.

    Existing whitespace is left untouched, so the result may contain
    doubled spaces.
    """
    slash_or_hash = re.compile(r'([/#])')
    return slash_or_hash.sub(r' \1 ', t)


def rm_useless_spaces(t: str) -> str:
    """Collapse each run of two or more spaces in `t` into a single space.

    Only the space character is affected; tabs and newlines are kept as is.
    """
    space_run = re.compile(' {2,}')
    return space_run.sub(' ', t)


def replace_rep(t: str) -> str:
    """Replace repetitions at the character level in `t`.

    A run of four or more identical non-whitespace characters is replaced by
    `' {TK_REP} <count> <char> '`, where `<count>` is the total run length.
    Shorter runs are left unchanged.
    """
    # The callback receives a regex match object; the annotation is a string
    # so it is never evaluated and needs no extra import.
    def _replace_rep(m: 're.Match') -> str:
        # Group 1 is the first character, group 2 the remaining repeats.
        c, cc = m.groups()
        return f' {TK_REP} {len(cc)+1} {c} '
    re_rep = re.compile(r'(\S)(\1{3,})')
    return re_rep.sub(_replace_rep, t)


def replace_wrep(t: str) -> str:
    """Replace word repetitions in `t`.

    A word (together with the non-word characters that follow it) repeated
    four or more times in a row is replaced by `' {TK_WREP} <count> <word> '`.
    `<count>` is derived by whitespace-splitting the repeated part, and
    `<word>` keeps its trailing separator characters.
    """
    # The callback receives a regex match object; the annotation is a string
    # so it is never evaluated and needs no extra import.
    def _replace_wrep(m: 're.Match') -> str:
        # Group 1 is the first occurrence, group 2 all following repeats.
        c, cc = m.groups()
        return f' {TK_WREP} {len(cc.split())+1} {c} '
    re_wrep = re.compile(r'(\b\w+\W+)(\1{3,})')
    return re_wrep.sub(_replace_wrep, t)


def deal_caps(t: str) -> str:
    """Lowercase `t`, marking fully upper-case words with the `TK_UP` token.

    The text is cut into alternating word / non-word chunks. Every chunk is
    lowercased; a chunk that is all upper-case and longer than two characters
    is additionally preceded by `' {TK_UP} '`.
    """
    pieces = []
    for chunk in re.findall(r'\w+|\W+', t):
        shouted = chunk.isupper() and len(chunk) > 2
        if shouted:
            pieces.append(f' {TK_UP} ')
        pieces.append(chunk.lower())
    return ''.join(pieces)


def fix_html(x:str) -> str:
    """Clean up leftover HTML fragments and escape artefacts in `x`.

    The literal replacements below are applied in order (order matters: the
    lone-backslash rule must run after the `\\n` and `\\"` rules). The result
    is then passed through `html.unescape` and runs of two or more spaces are
    collapsed to one.
    """
    replacements = (
        ('#39;', "'"),
        ('amp;', '&'),
        ('#146;', "'"),
        ('nbsp;', ' '),
        ('#36;', '$'),
        ('\\n', "\n"),      # literal backslash + 'n' becomes a real newline
        ('quot;', "'"),
        ('<br />', "\n"),
        ('\\"', '"'),
        ('<unk>', UNK),
        (' @.@ ', '.'),
        (' @-@ ', '-'),
        ('\\', ' \\ '),     # pad any remaining backslash with spaces
    )
    for old, new in replacements:
        x = x.replace(old, new)
    return re.sub(r'  +', ' ', html.unescape(x))

In [5]:
# Preprocessing rules, applied to the raw text in this order.
# Order matters:
#   * fix_html goes first, because spec_add_spaces pads every '/' and '#',
#     which would break the '<br />', '#39;', '#146;' and '#36;' patterns
#     fix_html looks for.
#   * rm_useless_spaces goes last, so it collapses the padding inserted by
#     replace_rep, replace_wrep, deal_caps and spec_add_spaces.
DEFAULT_RULES = (
    fix_html,
    replace_rep,
    replace_wrep,
    deal_caps,
    spec_add_spaces,
    rm_useless_spaces,
)
# Marker strings registered with the tokenizer as special cases.
DEFAULT_SPECIAL_TOKENS = (BOS, FLD, UNK, PAD)

In [6]:
class SpacyTokenizer:
    """A thin wrapper on top of Spacy tokenization tools.

    Parameters
    ----------
    lang : name passed to `spacy.load`; only the tokenizer of the loaded
        pipeline is kept.
    rules : iterable of `str -> str` callables applied, in order, to the text
        before it is tokenized.
    special_cases : iterable of strings registered with the tokenizer as
        special cases, each mapped to a single token with that exact text.
    """

    def __init__(self, lang='en', rules=DEFAULT_RULES, special_cases=DEFAULT_SPECIAL_TOKENS):
        tokenizer = spacy.load(lang).tokenizer
        # spaCy registers special cases one string at a time, each with the
        # list of token attribute dicts it should produce.
        for case in special_cases or ():
            tokenizer.add_special_case(case, [{symbols.ORTH: case}])
        self.rules = rules
        self.tokenizer = tokenizer

    def tokenize(self, sentence):
        """Apply every rule to `sentence`, then return its spaCy tokens as a list."""
        for rule in self.rules or ():
            # Each rule returns a new string; feed it to the next rule.
            sentence = rule(sentence)
        return list(self.tokenizer(sentence))

In [7]:
def _tokenize_chunk(texts):
    """Tokenize one chunk of texts inside a worker process.

    Defined at module level because `Pool.map` has to pickle the callable it
    sends to workers, and nested functions cannot be pickled.
    """
    tok = SpacyTokenizer()
    # Results travel back to the parent process via pickle, so convert every
    # token to its plain-text form (spaCy token objects are not picklable).
    return [[str(t) for t in tok.tokenize(text)] for text in texts]


def parallel_tokenization(texts):
    """Tokenize `texts` across all CPU cores.

    The texts are cut into contiguous chunks of `len(texts)//n_workers + 1`
    items and each chunk is tokenized in a worker process with its own
    `SpacyTokenizer`.

    Returns a list with one entry per input text, in input order; each entry
    is the list of token strings for that text. Empty input returns `[]`
    without starting a pool.

    NOTE(review): in a notebook the workers can only find `_tokenize_chunk`
    when processes are started with `fork`; under `spawn` the helper would
    need to live in an importable module -- confirm for the target platform.
    """
    if len(texts) == 0:
        return []
    n_workers = cpu_count()
    parts = split_into(texts, len(texts)//n_workers + 1)
    with Pool(n_workers) as pool:
        results = pool.map(_tokenize_chunk, parts)
    # Flatten the per-chunk lists in linear time.
    return [tokens for part in results for tokens in part]

In [8]:
def split_into(arr, n):
    """Cut `arr` into consecutive chunks of at most `n` items.

    Note that `n` is the chunk size, not the number of chunks. The last chunk
    may be shorter. An empty `arr` gives an empty list.

    Raises
    ------
    ValueError : if `n` is not a positive integer.
    """
    if n <= 0:
        raise ValueError(f'chunk size must be positive, got {n}')
    return [arr[i:i + n] for i in range(0, len(arr), n)]