In [82]:
#export
"""
Text processing utils. Mostly copied from the fastai library.

The utilities help to convert "raw" texts into formats more suitable for
NLP models. The texts are cleaned and converted into lists of tokens.
"""
from collections import Counter, OrderedDict
from itertools import chain
import html
from multiprocessing import cpu_count
from pathlib import Path
import re

from joblib import Parallel, delayed
import pandas as pd
import spacy
from spacy.lang.en import English

from loop.annotations import MaybeList, Callable
from loop.utils import combine, chunks

In [2]:
#export
# Special marker strings used by the text-processing helpers in this file.
SEP    = '•'       # glue string used by format_tokens() when joining tokens
T_UNK  = 'xxunk'   # substituted for '<unk>' by fix_special_cases(); unknown-token fallback in Vocab
T_PAD  = 'xxpad'   # not referenced in this part of the file; presumably a padding marker
T_BOS  = 'xxbos'   # not referenced in this part of the file; presumably a begin-of-sequence marker
T_EOS  = 'xxeos'   # default sample delimiter in create_samples()
T_REP  = 'xxrep'   # emitted by replace_repeated_chars()
T_WREP = 'xxwrep'  # emitted by replace_repeated_words()
T_UP   = 'xxup'    # emitted by replace_capslock() before a lower-cased all-caps token
T_MAJ  = 'xxmaj'   # emitted by replace_capitalized() before a lower-cased capitalized token
# All markers together: registered as tokenizer special cases by tokenize_english()
# and used as the default `special` list of Vocab.
TOKENS = [T_UNK, T_PAD, T_BOS, T_EOS, T_REP, T_WREP, T_UP, T_MAJ]

In [3]:
#export
def replace_tabs_with_spaces(s: str) -> str:
    """Return `s` with every tab character swapped for a single space."""
    pieces = s.split('\t')
    return ' '.join(pieces)

In [4]:
# Each tab maps to exactly one space; tab-free text passes through untouched.
assert all(
    replace_tabs_with_spaces(raw) == cleaned
    for raw, cleaned in [
        ('\ttabs\t', ' tabs '),
        ('\t\t\tmore tabs\t\t\t', '   more tabs   '),
        ('noop', 'noop'),
    ])

In [5]:
#export
def add_spaces_around(s: str) -> str:
    """Pad every '/', '#' and newline in `s` with one space on each side."""
    pattern = re.compile(r'([/#\n])')
    return pattern.sub(r' \1 ', s)

In [6]:
# The padded characters gain a space on both sides; other text is unchanged.
assert all(
    add_spaces_around(raw) == padded
    for raw, padded in [
        ('#', ' # '),
        ('\n', ' \n '),
        ('noop', 'noop'),
    ])

In [7]:
#export
def trim_useless_spaces(s: str) -> str:
    """Collapse every run of two or more spaces in `s` into a single space."""
    runs_of_spaces = re.compile(' {2,}')
    return runs_of_spaces.sub(' ', s)

In [8]:
space = ' '
# Any run of 1..10 spaces collapses to exactly one space.
assert all(trim_useless_spaces(space * n) == space for n in range(1, 11))
# Single spaces and space-free text are returned as-is.
assert all(
    trim_useless_spaces(raw) == raw
    for raw in (f'{space}word{space}', 'noop'))

In [9]:
#export
def replace_repeated_chars(s: str) -> str:
    """Replace runs of a repeated character with a T_REP marker.

    A run is four or more consecutive copies of the same non-whitespace
    character. Each run becomes ' <T_REP> <run length> <character> ',
    with a space on both sides.
    """
    run_pattern = re.compile(r'(\S)(\1{3,})')

    def _describe_run(match):
        # group 1 is the first character, group 2 the remaining copies
        char = match.group(1)
        repeats = match.group(2)
        return f' {T_REP} {len(repeats) + 1} {char} '

    return run_pattern.sub(_describe_run, s)

In [10]:
# Runs of 4+ identical characters are replaced by a marker, count and the character.
assert all(
    replace_repeated_chars(raw) == marked
    for raw, marked in [
        ('aaaa', f' {T_REP} 4 a '),
        ('sooooo cooool', f's {T_REP} 5 o  c {T_REP} 4 o l'),
        ('noop', 'noop'),
    ])

In [11]:
#export
def replace_repeated_words(s: str) -> str:
    """Replace runs of a repeated word with a T_WREP marker.

    The unit that must repeat is a word *together with* the non-word
    characters that follow it, and it has to occur at least four times in a
    row. The run becomes ' <T_WREP> <count> <unit> ', where the count is the
    number of whitespace-separated pieces in the repeated tail plus one.

    Because the unit includes its trailing separator, a final repetition with
    nothing after it (e.g. at the end of the string) is not part of the match
    and stays in the output.
    """
    run_pattern = re.compile(r'(\b\w+\W+)(\1{3,})')

    def _describe_run(match):
        # group 1 is the first "word + separator", group 2 the repeated tail
        word = match.group(1)
        repeats = match.group(2)
        return f' {T_WREP} {len(repeats.split()) + 1} {word} '

    return run_pattern.sub(_describe_run, s)

In [12]:
# The pattern needs separator characters after every repetition, so the last
# 'one' (nothing follows it) stays outside the match: the count is 4, not 5.
# The captured word keeps its trailing space, hence the double space before
# the leftover 'one'.
assert replace_repeated_words('one one one one one') == f' {T_WREP} 4 one  one'

In [13]:
#export
def replace_br_tags(s: str) -> str:
    """Turn self-closing `<br/>` tags into newline characters.

    Whitespace is tolerated after '<', around the slash and before '>'.
    The match is case-sensitive and the slash is required.
    """
    br_tag = re.compile(r'<[\s]*br[\s]*/[\s]*>')
    return br_tag.sub('\n', s)

In [14]:
#export
def fix_special_cases(s: str) -> str:
    """Clean up leftover markup and escape sequences in `s`.

    A fixed list of literal substitutions is applied in order, then HTML
    entities are unescaped and runs of two or more spaces are collapsed
    into one.

    Order matters: the escaped newline and escaped quote are handled before
    the rule that pads any remaining backslash with spaces.

    Note: the entity *suffixes* (e.g. 'amp;') are replaced before
    `html.unescape` runs, so a well-formed '&amp;' turns into '&&'.
    """
    substitutions = (
        ('#39;',  "'"),
        ('amp;',  '&'),
        ('#146;', "'"),
        ('nbsp;', ' '),
        ('#36;',  '$'),
        ('\\n',   "\n"),
        ('quot;', "'"),
        ('\\"',   '"'),
        (' @.@ ', '.'),
        (' @-@ ', '-'),
        (' @,@ ', ','),
        ('\\',    ' \\ '),
        ('<unk>', T_UNK),
    )
    for old, new in substitutions:
        s = s.replace(old, new)
    unescaped = html.unescape(s)
    return re.sub(r'  +', ' ', unescaped)

In [15]:
#export
def replace_capslock(tokens: list) -> list:
    """Mark all-caps tokens and lower-case them.

    A token longer than one character for which `str.isupper()` is true is
    replaced by the pair (T_UP, lower-cased token). Every other token is
    copied through unchanged. A new list is returned.
    """
    marked = []
    for tok in tokens:
        shouting = len(tok) > 1 and tok.isupper()
        marked.extend([T_UP, tok.lower()] if shouting else [tok])
    return marked

In [16]:
# An all-caps token is lower-cased and preceded by the T_UP marker.
assert replace_capslock(['CAPSLOCK']) == [T_UP, 'capslock']

In [17]:
#export
def replace_capitalized(tokens: list) -> list:
    """Lower-case all tokens, marking the capitalized ones with T_MAJ.

    A token counts as capitalized when it has more than one character, its
    first character is upper-case and the rest is lower-case. Such tokens
    are preceded by T_MAJ. Empty strings are dropped, and every remaining
    token is lower-cased whether or not it was marked.
    """
    result = []
    for tok in tokens:
        if tok != '':
            head, tail = tok[0], tok[1:]
            if len(tok) > 1 and head.isupper() and tail.islower():
                result.append(T_MAJ)
            result.append(tok.lower())
    return result

In [18]:
# Each capitalized word is lower-cased and preceded by the T_MAJ marker.
assert replace_capitalized(['Capitalized', 'Words']) == [T_MAJ, 'capitalized', T_MAJ, 'words']

In [19]:
#export
# String -> string rules that tokenize_english() passes as `prep`; they run on
# the raw text before it reaches the tokenizer.
# NOTE(review): presumably `combine` applies them in the listed order — confirm.
# If so, replace_br_tags sees text where add_spaces_around has already padded
# '/', e.g. '<br />' has become '<br / >'; its regex tolerates that whitespace.
PREP_RULES = [
    replace_tabs_with_spaces,
    add_spaces_around,
    trim_useless_spaces,
    replace_repeated_chars,
    replace_repeated_words,
    replace_br_tags,
    fix_special_cases
]

# List -> list rules that tokenize_english() passes as `post`; they run on the
# token list produced by the tokenizer. replace_capitalized lower-cases every
# token, so the all-caps check in replace_capslock is listed first.
POST_RULES = [
    replace_capslock,
    replace_capitalized
]

In [20]:
#export
def tokenize(text: str, prep: MaybeList=None, post: MaybeList=None,
             special: MaybeList=None, model_fn: Callable=English) -> list:
    """Convert text into list of tokens.

    Args:
        text: Raw string to tokenize.
        prep: Optional functions (str -> str) passed to `combine` together
            with the raw text before tokenization. Skipped when None.
        post: Optional functions (list -> list) passed to `combine` together
            with the token list after tokenization. Skipped when None.
        special: Optional strings registered as tokenizer special cases so
            that each one is kept as a single token.
        model_fn: Zero-argument callable returning the spaCy language object
            to tokenize with. It is called on every invocation.

    Returns:
        List of token strings.
    """
    nlp = model_fn()
    if special is not None:
        for t in special:
            nlp.tokenizer.add_special_case(t, [{spacy.symbols.ORTH: t}])
    # `prep` and `post` default to None; unpacking None with `*` raises
    # TypeError, so only run a rule set when one was actually supplied.
    if prep is not None:
        text = combine(text, *prep)
    tokens = [token.text for token in nlp.make_doc(text)]
    if post is not None:
        tokens = combine(tokens, *post)
    return tokens

In [21]:
#export
def tokenize_english(text):
    """Tokenize `text` with this file's default rule sets and special tokens.

    Calls `tokenize` with PREP_RULES, POST_RULES and TOKENS, leaving the
    language model at its default.
    """
    options = dict(prep=PREP_RULES, post=POST_RULES, special=TOKENS)
    return tokenize(text, **options)

In [22]:
# End-to-end check of tokenize_english() on a small sample covering
# capitalized words, quotes, contractions, newlines and all-caps words.
text = """English text that should be tokenized.

The text contains "quoted names", commas, dots. It also has some shortcuts, like "doesn't"
and "don't", if you'd like. 

Also, we've SOME CAPSLOCK here.
"""

# Newlines appear as whitespace tokens such as '\n ' and '\n \n ' because
# add_spaces_around pads each '\n' and trim_useless_spaces then collapses the
# doubled spaces. Capitalized and all-caps words come out lower-cased behind
# their T_MAJ / T_UP markers.
expected = [
    T_MAJ, 'english', 'text', 'that', 'should', 'be', 'tokenized', '.', '\n \n ',
    T_MAJ, 'the', 'text', 'contains', '"', 'quoted', 'names', '"', ',', 'commas',
    ',', 'dots', '.',
    T_MAJ, 'it', 'also', 'has', 'some', 'shortcuts', ',', 'like', '"', 'does',
    "n't", '"', '\n ', 'and', '"', 'do', "n't", '"', ',', 'if', 'you', "'d", 'like',
    '.', '\n \n ',
    T_MAJ, 'also', ',', 'we', "'ve", T_UP, 'some', T_UP, 'capslock', 'here', '.', '\n '
]

assert tokenize_english(text) == expected

In [23]:
#export
def useless_token(token, remove=('=', ' ')):
    """Tell whether `token` is one of the values in `remove`.

    By default the tokens '=' and ' ' are considered useless.
    """
    is_useless = token in remove
    return is_useless

In [24]:
#export
def create_samples(tokens, eos=T_EOS, ignore=useless_token):
    """Splits list of tokens into samples using EOS tokens as delimiters.

    Tokens for which `ignore(token)` is true are dropped. Each sample ends
    with its `eos` token. Tokens after the last `eos` form a final sample
    that has no `eos`.
    """
    samples = []
    current = []
    for tok in tokens:
        if not ignore(tok):
            current.append(tok)
            if tok == eos:
                # delimiter reached: close the current sample and start anew
                samples.append(current)
                current = []
    if len(current) > 0:
        samples.append(current)
    return samples

In [25]:
#export
def format_tokens(tokens):
    """Join `tokens` into one string, separated by SEP."""
    joined = SEP.join(tokens)
    return joined

In [26]:
#export
def print_tokens(tokens, n=500):
    """Print the first `n` tokens as a single SEP-separated line of text."""
    head = tokens[:n]
    print(format_tokens(head))

In [27]:
# Show the expected tokens joined with SEP; newline tokens break the output lines.
print_tokens(expected)

xxmaj•english•text•that•should•be•tokenized•.•
 
 •xxmaj•the•text•contains•"•quoted•names•"•,•commas•,•dots•.•xxmaj•it•also•has•some•shortcuts•,•like•"•does•n't•"•
 •and•"•do•n't•"•,•if•you•'d•like•.•
 
 •xxmaj•also•,•we•'ve•xxup•some•xxup•capslock•here•.•
 


In [28]:
#export
def read_files(root, labels=None, ext='txt', as_pandas=False):
    """Reads files from folders, using each one as a label name.

    Args:
        root: Directory whose immediate sub-folders hold the text files.
            A leading '~' is expanded. Files directly under `root` are
            ignored.
        labels: Optional collection of folder labels to SKIP. Folders whose
            label is in it are not read.
            NOTE(review): the name reads like an allow-list, but the code
            excludes these labels — confirm the intended meaning.
        ext: File extension (without the dot) of the files to read.
        as_pandas: If True, return a DataFrame instead of a list.

    Returns:
        A list of dicts with keys 'text', 'name' (file stem) and 'label'
        (folder stem), or a DataFrame built from that list.
    """
    texts = []
    for path in Path(root).expanduser().iterdir():
        if not path.is_dir():
            continue
        label = path.stem
        if labels is not None and label in labels:
            continue
        # read_text() opens and closes the file itself, so no handle is
        # left open for each file that is read.
        texts += [
            {'text': fn.read_text(), 'name': fn.stem, 'label': label}
            for fn in path.glob(f'*.{ext}')]
    return pd.DataFrame(texts) if as_pandas else texts

In [29]:
# Read every labelled text under ~/data/imdb/train into a DataFrame with
# 'text', 'name' and 'label' columns ('~' is expanded inside read_files).
# NOTE(review): the data location is hard-coded to the user's home directory.
imdb = read_files('~/data/imdb/train', as_pandas=True)

In [30]:
#export
def parallel_tokenizer(texts, tokenizer_fn, chunk_size=10000, n_jobs=None,
                       backend=None, as_pandas=False):
    """Tokenize many texts in parallel with joblib.

    The texts are cut into consecutive batches of at most `chunk_size`
    items, and each batch is handed to a worker as a single task.

    Args:
        texts: Iterable of strings to tokenize.
        tokenizer_fn: Callable applied to each text.
        chunk_size: Maximum number of texts per task. Must be positive.
        n_jobs: Number of workers. Falls back to `cpu_count()` when None
            or 0.
        backend: Passed to `joblib.Parallel` unchanged.
        as_pandas: Currently ignored; kept so existing callers that pass
            it keep working.

    Returns:
        A flat list holding one `tokenizer_fn` result per input text, in
        input order.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be positive, got {chunk_size}')

    def tokenize_chunk(chunk_of_texts):
        return [tokenizer_fn(text) for text in chunk_of_texts]

    # Slice locally so that `chunk_size` is actually honoured; the argument
    # was previously accepted but never used.
    texts = list(texts)
    batches = [texts[start:start + chunk_size]
               for start in range(0, len(texts), chunk_size)]

    n_jobs = n_jobs or cpu_count()
    with Parallel(n_jobs=n_jobs, backend=backend) as parallel:
        results = parallel(delayed(tokenize_chunk)(ch) for ch in batches)
    return list(chain.from_iterable(results))

In [31]:
# Tokenize every text of the `imdb` frame with the default English rules.
tokens = parallel_tokenizer(imdb.text.tolist(), tokenize_english)

In [32]:
# Inspect the first tokenized text (print_tokens shows at most 500 tokens).
print_tokens(tokens[0])

xxmaj•the•concept•of•having•xxmaj•laurel•&•xxmaj•hardy•this•time•in•the•role•of•chimney•sweepers•works•out•surprisingly•hilarious•.•xxmaj•it•guarantees•some•funny•situations•and•silly•antics•,•from•especially•xxmaj•stan•xxmaj•laurel•of•course•as•usual•.•

•xxmaj•the•movie•also•has•a•subplot•with•a•nutty•professor•who•is•working•on•a•rejuvenation•formula•.•xxmaj•it•does•n't•really•sound•like•a•logical•mix•of•story•lines•and•incoherent•but•both•plot•lines•blend•in•perfectly•toward•the•memorable•ending•.•xxmaj•it•'s•still•a•bit•weird•but•its•funny•nevertheless•,•so•it•works•for•the•movie•.•

•xxmaj•the•supporting•cast•of•the•movie•is•surprising•good•.•xxmaj•sam•xxmaj•adams•is•great•as•the•stereotypical•butler•and•xxmaj•lucien•xxmaj•littlefield•goes•deliciously•over•-•the•-•top•as•the•nutty•professor•.•

•xxmaj•the•movie•is•filled•with•some•excellent•timed•and•hilarious•constructed•sequences•,•which•are•all•quite•predictable•but•become•hilarious•to•watch•nevertheless•thanks•to•the•way•they

In [33]:
# Number of tokenized texts (each entry of `tokens` is one text's token list).
len(tokens)

25000

In [115]:
class Vocab:
    """Two-way lookup table between string tokens and integer indexes.

    Attributes:
        itos: OrderedDict mapping index -> token.
        stoi: OrderedDict mapping token -> index.
        unk_idx: Index returned for tokens that are not in the vocabulary.
            This is the index of T_UNK when it is in the vocabulary, else 0.
    """
    def __init__(self, strings, size=60000, min_freq=2, special=TOKENS):
        """Build the vocabulary from an iterable of tokens.

        Args:
            strings: Iterable of tokens to count.
            size: Only the `size` most common tokens are considered.
            min_freq: Tokens seen fewer times than this are dropped.
            special: Optional tokens placed at the start of the vocabulary,
                in the given order, whether or not they occur in `strings`.
        """
        counts = Counter(strings)
        vocab = sorted(w for w, c in counts.most_common(size) if c >= min_freq)

        if special is not None:
            # De-duplicate while keeping first-occurrence order, then put the
            # special tokens in front of the remaining (sorted) words.
            special = list(OrderedDict.fromkeys(special))
            reserved = set(special)
            vocab = special + [w for w in vocab if w not in reserved]

        self.itos = OrderedDict(enumerate(vocab))
        self.stoi = OrderedDict((v, k) for k, v in self.itos.items())
        # Look the fallback index up once instead of assuming T_UNK sits at
        # position 0; 0 remains the fallback when T_UNK is absent.
        self.unk_idx = self.stoi.get(T_UNK, 0)

    @staticmethod
    def from_token_lists(lists_of_tokens, **kwargs):
        """Build a Vocab from an iterable of token lists.

        Keyword arguments are forwarded to the constructor.
        """
        # Counter consumes any iterable, so no flattened copy is needed.
        return Vocab(chain.from_iterable(lists_of_tokens), **kwargs)

    def __len__(self): return len(self.itos)

    def __iter__(self):
        """Iterate over (index, token) pairs."""
        return iter(self.itos.items())

    def __call__(self, strings):
        """Convert tokens to indexes; unknown tokens map to `unk_idx`.

        Uses the same fallback as `vocab[token]` rather than raising
        KeyError on the first out-of-vocabulary token.
        """
        unk = self.unk_idx
        return [self.stoi.get(string, unk) for string in strings]

    def __getitem__(self, value):
        """Look up a token's index (str key) or an index's token (int key).

        Unknown tokens give `unk_idx`; unknown indexes give T_UNK.

        Raises:
            TypeError: If `value` is neither a str nor an int.
        """
        if isinstance(value, str):
            return self.stoi.get(value, self.unk_idx)
        elif isinstance(value, int):
            return self.itos.get(value, T_UNK)
        raise TypeError(f'unexpected index type: {type(value)}, should be str or int')

In [116]:
# Build the vocabulary from all tokenized texts using the default size,
# minimum frequency and special tokens.
vocab = Vocab.from_token_lists(tokens)