In [1]:
#export
"""
Text processing utils. Mostly copied from the fastai library.

The utilities help to convert "raw" texts into formats more suitable for
NLP models. The texts are cleaned and converted into lists of tokens.
"""
from collections import Counter, OrderedDict
from itertools import chain
import html
from multiprocessing import cpu_count
from pathlib import Path
import re

from joblib import Parallel, delayed
import pandas as pd
import spacy
from spacy.lang.en import English

from loop.annotations import MaybeList, Callable
from loop.utils import combine, chunks

In [2]:
#export
# Separator used by format_tokens() when joining tokens for display.
SEP    = '•'
# Special marker tokens. The ones without a visible producer in this notebook
# (T_PAD, T_BOS, T_EOS) presumably mean padding / beginning / end of sequence
# -- TODO confirm against the model code that consumes them.
T_UNK  = 'xxunk'   # substituted for '<unk>' in fix_special_cases; Vocab fallback token
T_PAD  = 'xxpad'
T_BOS  = 'xxbos'
T_EOS  = 'xxeos'
T_REP  = 'xxrep'   # emitted by replace_repeated_chars
T_WREP = 'xxwrep'  # emitted by replace_repeated_words
T_UP   = 'xxup'    # emitted by replace_capslock
T_MAJ  = 'xxmaj'   # emitted by replace_capitalized
# All special tokens; registered as tokenizer special cases and used as the
# default `special` list that Vocab places at the front of the vocabulary.
TOKENS = [T_UNK, T_PAD, T_BOS, T_EOS, T_REP, T_WREP, T_UP, T_MAJ]

In [3]:
#export
def replace_tabs_with_spaces(s: str) -> str:
    """Return `s` with every tab character turned into a single space."""
    pieces = s.split('\t')
    return ' '.join(pieces)

In [4]:
# Each tab becomes exactly one space; strings without tabs are returned unchanged.
assert replace_tabs_with_spaces('\ttabs\t') == ' tabs '
assert replace_tabs_with_spaces('\t\t\tmore tabs\t\t\t') == '   more tabs   '
assert replace_tabs_with_spaces('noop') == 'noop'

In [5]:
#export
def add_spaces_around(s: str) -> str:
    """Surround every slash, hash sign and newline in `s` with single spaces."""
    pattern, padded = r'([/#\n])', r' \1 '
    return re.sub(pattern, padded, s)

In [6]:
# The matched character is kept and padded with one space on each side.
assert add_spaces_around('#') == ' # '
assert add_spaces_around('\n') == ' \n '
assert add_spaces_around('noop') == 'noop'

In [7]:
#export
def trim_useless_spaces(s: str) -> str:
    """Collapse every run of two or more spaces in `s` into one space.

    Only the space character is affected; leading and trailing single spaces
    are kept.
    """
    space_runs = re.compile(' {2,}')
    return space_runs.sub(' ', s)

In [8]:
# Runs of 1 to 10 spaces all collapse to a single space; single spaces around
# a word and strings without spaces are left as they are.
space = ' '
assert all([trim_useless_spaces(space * i) == space for i in range (1, 11)])
assert trim_useless_spaces(f'{space}word{space}') == f'{space}word{space}'
assert trim_useless_spaces('noop') == 'noop'

In [9]:
#export
def replace_repeated_chars(s: str) -> str:
    """Replace runs of a repeated character with a `T_REP` marker.

    A run is four or more consecutive copies of the same non-whitespace
    character. It is rewritten as ' <T_REP> <run length> <char> ', including
    the surrounding spaces.
    """
    def _as_marker(found):
        # group(1) is the first character, group(2) the remaining copies.
        run_length = len(found.group(2)) + 1
        return f' {T_REP} {run_length} {found.group(1)} '

    return re.sub(r'(\S)(\1{3,})', _as_marker, s)

In [10]:
# The marker carries its own surrounding spaces, hence the double space
# between two adjacent replacements in the second case.
assert replace_repeated_chars('aaaa') == f' {T_REP} 4 a '
assert replace_repeated_chars('sooooo cooool') == f's {T_REP} 5 o  c {T_REP} 4 o l'
assert replace_repeated_chars('noop') == 'noop'

In [11]:
#export
def replace_repeated_words(s: str) -> str:
    """Replace runs of a repeated word with a `T_WREP` marker.

    A run is a word plus its trailing non-word characters, repeated four or
    more times. It is rewritten as ' <T_WREP> <count> <word> ', where <word>
    still includes the trailing separator characters captured with it.

    Every counted repetition must be followed by at least one non-word
    character, so a last repetition at the very end of the string is neither
    counted nor removed.
    """
    def _as_marker(found):
        first, rest = found.group(1), found.group(2)
        count = len(rest.split()) + 1
        return f' {T_WREP} {count} {first} '

    return re.sub(r'(\b\w+\W+)(\1{3,})', _as_marker, s)

In [12]:
# Only the four repetitions that are followed by a space are folded into the
# marker; the fifth 'one' (no trailing separator) stays in the output.
assert replace_repeated_words('one one one one one') == f' {T_WREP} 4 one  one'

In [13]:
#export
def replace_br_tags(s: str) -> str:
    """Replace HTML line-break tags in `s` with newline characters.

    Matches the self-closing form (`<br/>`, `<br />`, `< br / >`) as well as
    the plain `<br>` form, in any letter case.
    """
    # The slash is optional and matching is case-insensitive so that `<br>`
    # and `<BR />` are converted too, not only the lower-case self-closing form.
    return re.sub(r'<\s*br\s*/?\s*>', '\n', s, flags=re.IGNORECASE)

In [14]:
#export
def fix_special_cases(s: str) -> str:
    """Repair leftovers of HTML entities and dataset-specific escapes in `s`.

    Applies a fixed sequence of literal substitutions, then unescapes the
    remaining HTML entities and collapses runs of two or more spaces.
    """
    # Applied strictly in this order: the literal backslash-n and
    # backslash-quote sequences must be handled before the lone backslash
    # is padded with spaces.
    substitutions = (
        ('#39;',  "'"), ('amp;',  '&'), ('#146;',  "'"),
        ('nbsp;', ' '), ('#36;',  '$'), ('\\n',   "\n"),
        ('quot;', "'"), ('\\"',   '"'), (' @.@ ',  '.'),
        (' @-@ ', '-'), (' @,@ ', ','), ('\\', ' \\ '),
        ('<unk>', T_UNK),
    )
    for old, new in substitutions:
        s = s.replace(old, new)
    return re.sub(r'  +', ' ', html.unescape(s))

In [15]:
#export
def replace_new_lines(s: str) -> str:
    """Return `s` with every newline character turned into a single space."""
    lines = s.split('\n')
    return ' '.join(lines)

In [16]:
#export
def replace_capslock(tokens: list) -> list:
    """Lower-case all-caps tokens and prefix each of them with `T_UP`.

    Only tokens longer than one character are treated as caps-lock words;
    everything else is copied through unchanged. Returns a new list.
    """
    result = []
    for tok in tokens:
        if not (tok.isupper() and len(tok) > 1):
            result.append(tok)
            continue
        result.extend((T_UP, tok.lower()))
    return result

In [17]:
# An all-caps word is lower-cased and preceded by the T_UP marker.
assert replace_capslock(['CAPSLOCK']) == [T_UP, 'capslock']

In [18]:
#export
def replace_capitalized(tokens: list) -> list:
    """Lower-case every token, marking capitalized words with `T_MAJ`.

    A token counts as capitalized when it is longer than one character,
    starts with an upper-case letter and the rest is lower-case. Empty
    tokens are dropped. Returns a new list.
    """
    result = []
    for tok in tokens:
        if tok == '':
            continue
        capitalized = tok[0].isupper() and len(tok) > 1 and tok[1:].islower()
        if capitalized:
            result.append(T_MAJ)
        result.append(tok.lower())
    return result

In [19]:
# Each capitalized word is lower-cased and preceded by the T_MAJ marker.
assert replace_capitalized(['Capitalized', 'Words']) == [T_MAJ, 'capitalized', T_MAJ, 'words']

In [20]:
#export
# Default string -> string rules used by clean_text() on raw text before
# tokenization. Note that add_spaces_around is defined in this notebook but
# is not part of the default list.
PREP_RULES = [
    replace_tabs_with_spaces,
    replace_br_tags,
    fix_special_cases,
    replace_repeated_chars,
    replace_repeated_words,
    replace_new_lines,
    trim_useless_spaces
]

# Default token-list -> token-list rules used by update_tokens() after
# tokenization.
POST_RULES = [
    replace_capslock,
    replace_capitalized
]

In [21]:
#export 
def clean_text(s: str, rules=None):
    """Run the text-cleaning rules over the raw string `s`.

    Args:
        s: raw text.
        rules: sequence of callables handed to `combine`; when omitted or
            empty, `PREP_RULES` is used.

    Returns:
        Whatever `combine(s, *rules)` returns. `combine` comes from
        loop.utils and presumably applies the rules one after another --
        TODO confirm.
    """
    if not rules:
        rules = PREP_RULES
    return combine(s, *rules)

In [22]:
#export
def update_tokens(tokens: list, rules=None):
    """Run the post-tokenization rules over a list of tokens.

    Args:
        tokens: list of string tokens (every default rule in `POST_RULES`
            takes and returns a list, so the parameter is annotated as
            `list`).
        rules: sequence of callables handed to `combine`; when omitted or
            empty, `POST_RULES` is used.

    Returns:
        Whatever `combine(tokens, *rules)` returns. `combine` comes from
        loop.utils and presumably applies the rules one after another --
        TODO confirm.
    """
    rules = rules or POST_RULES
    return combine(tokens, *rules)

In [23]:
#export
def tokenize_english(texts: list):
    """Tokenize `texts` with the default English settings.

    Delegates to `tokenize` with chunks of 100,000 texts, one worker per CPU
    reported by `cpu_count()`, and `TOKENS` registered as special cases.
    """
    workers = cpu_count()
    return tokenize(texts, chunk_size=100_000, num_workers=workers, special=TOKENS)


def tokenize(texts: list, chunk_size: int, num_workers: int=1, 
             model_fn=English, prep=clean_text, post=update_tokens,
             special=None, backend='loky'):
    """Clean and tokenize a list of texts, in parallel for large inputs.

    Args:
        texts: list of raw strings.
        chunk_size: number of texts per chunk handed to `chunks`; inputs of
            at most `2 * chunk_size` texts are processed serially.
        num_workers: passed to joblib `Parallel` as `n_jobs`; not used on the
            serial path.
        model_fn: zero-argument callable building the pipeline object. It
            must provide `make_doc`, and `tokenizer.add_special_case` when
            `special` is given.
        prep: callable applied to every text before tokenization.
        post: callable applied to every list of token strings.
        special: optional iterable of strings registered as tokenizer
            special cases (see `init_tokenizer`).
        backend: joblib backend name passed to `Parallel`.

    Returns:
        A list with one processed token list per input text.
    """
    
    # NOTE(review): `doc` is whatever `nlp.make_doc` returns (iterated for
    # `.text` below), not a `str` -- the annotation looks wrong.
    def doc_to_list(doc: str):
        return [token.text for token in doc]
    
    def worker(nlp, texts):
        return [post(doc_to_list(nlp.make_doc(prep(text)))) for text in texts]
    
    # Small inputs: one tokenizer, processed in the current process.
    if len(texts) <= 2*chunk_size:
        nlp = init_tokenizer(model_fn, special)
        return worker(nlp, texts)
    
    # Large inputs: a fresh tokenizer is built here, in the calling process,
    # for every chunk and handed to the worker together with that chunk.
    with Parallel(n_jobs=num_workers, backend=backend) as parallel:
        results = parallel(
            delayed(worker)(nlp, text_chunk)
            for nlp, text_chunk in (
                (init_tokenizer(model_fn, special), t) 
                for t in chunks(texts, chunk_size)
            )
        )
    
    # Flatten the per-chunk results back into one list.
    return list(chain(*results))


def init_tokenizer(model_fn, special=None):
    """Build a pipeline object and register special-case tokens on it.

    Args:
        model_fn: zero-argument callable returning the pipeline object.
        special: optional iterable of strings; each one is registered through
            `nlp.tokenizer.add_special_case` with itself as the ORTH value.

    Returns:
        The object returned by `model_fn()`.
    """
    nlp = model_fn()
    if special is None:
        return nlp
    for case in special:
        nlp.tokenizer.add_special_case(case, [{spacy.symbols.ORTH: case}])
    return nlp

In [24]:
text = """English text that should be tokenized.

The text contains "quoted names", commas, dots. It also has some shortcuts, like "doesn't"
and "don't", if you'd like. 

Also, we've SOME CAPSLOCK here.
"""

# Tokenize the sample text and show its tokens joined with the bullet separator.
'•'.join(tokenize_english([text])[0])

'xxmaj•english•text•that•should•be•tokenized•.•xxmaj•the•text•contains•"•quoted•names•"•,•commas•,•dots•.•xxmaj•it•also•has•some•shortcuts•,•like•"•does•n\'t•"•and•"•do•n\'t•"•,•if•you•\'d•like•.•xxmaj•also•,•we•\'ve•xxup•some•xxup•capslock•here•.'

In [25]:
#export
def useless_token(token, remove=('=', ' ')):
    """Tell whether `token` is one of the tokens listed in `remove`."""
    is_useless = token in remove
    return is_useless

In [26]:
#export
def format_tokens(tokens):
    """Join `tokens` into one string, separated by `SEP`."""
    separator = SEP
    return separator.join(tokens)

In [27]:
#export
def print_tokens(tokens):
    """Print `tokens` as a single line formatted by `format_tokens`."""
    line = format_tokens(tokens)
    print(line)

In [28]:
#export
def read_files(root, labels=None, ext='txt', as_pandas=False):
    """Reads files from folders, using each one as a label name.

    Args:
        root: directory whose immediate sub-directories are the labels; `~`
            is expanded. Non-directory entries in `root` are ignored.
        labels: optional collection of label names to read. When given, only
            sub-directories whose name is in `labels` are read; when None,
            all sub-directories are read.
        ext: file extension (without the dot) of the files to read.
        as_pandas: return a `pandas.DataFrame` instead of a list.

    Returns:
        A list of dicts with keys 'text', 'name' (file stem) and 'label', or
        a DataFrame built from that list when `as_pandas` is true.
    """
    texts = []
    for path in Path(root).expanduser().iterdir():
        if not path.is_dir():
            continue
        label = path.stem
        # Keep only the requested labels. The previous condition skipped
        # exactly the labels that were asked for.
        if labels is not None and label not in labels:
            continue
        # read_text() closes the file, unlike a bare open().read().
        texts += [
            {'text': fn.read_text(), 'name': fn.stem, 'label': label}
            for fn in path.glob(f'*.{ext}')]
    return pd.DataFrame(texts) if as_pandas else texts

In [29]:
# Load the IMDB training texts into a DataFrame; read_files expects one
# sub-folder per label under the given root.
imdb = read_files('~/data/imdb/train', as_pandas=True)

In [30]:
class Vocab:
    """Two-way mapping between string tokens and integer ids.

    The vocabulary keeps the strings among the `size` most frequent ones that
    occur at least `min_freq` times, sorted alphabetically. The `special`
    tokens are placed in front of them in the given order, so the final
    length can exceed `size`.

    Attributes:
        itos: OrderedDict mapping id -> token.
        stoi: OrderedDict mapping token -> id.
    """
    def __init__(self, strings, size=60000, min_freq=2, special=TOKENS):
        frequent = Counter(strings).most_common(size)
        vocab = sorted(w for w, c in frequent if c >= min_freq)

        if special is not None:
            # Insert in reverse so the special tokens end up at the front in
            # their original order; drop any copy found in the corpus first.
            for tok in reversed(special):
                if tok in vocab:
                    vocab.remove(tok)
                vocab.insert(0, tok)

        self.itos = OrderedDict(enumerate(vocab))
        self.stoi = OrderedDict([(v, k) for k, v in self.itos.items()])

    @classmethod
    def from_token_lists(cls, lists_of_tokens, **kwargs):
        """Build a vocabulary from an iterable of token lists.

        The token lists are streamed into the constructor without being
        concatenated into one big list first; `kwargs` are forwarded as-is.
        """
        return cls(chain.from_iterable(lists_of_tokens), **kwargs)

    @property
    def unk_index(self):
        """Id used for unknown strings: the id of `T_UNK`, or 0 if absent."""
        return self.stoi.get(T_UNK, 0)

    @property
    def words(self): return list(self.stoi.keys())

    def __len__(self): return len(self.itos)

    def __iter__(self):
        """Iterate over (id, token) pairs in id order."""
        return iter(self.itos.items())

    def __call__(self, strings):
        """Convert an iterable of tokens into a list of ids.

        Out-of-vocabulary tokens map to `unk_index` instead of raising
        KeyError, consistent with `__getitem__`.
        """
        unk = self.unk_index
        return [self.stoi.get(string, unk) for string in strings]

    def __getitem__(self, value):
        """Look up an id by token (str) or a token by id (int).

        Unknown strings map to `unk_index`; unknown ids map to `T_UNK`.

        Raises:
            TypeError: if `value` is neither a str nor an int.
        """
        if isinstance(value, str):
            return self.stoi.get(value, self.unk_index)
        elif isinstance(value, int):
            return self.itos.get(value, T_UNK)
        raise TypeError(f'unexpected index type: {type(value)}, should be str or int')

In [31]:
# Tokenize every loaded text; the result has one token list per DataFrame row.
tokens = tokenize_english(imdb.text.tolist())

In [32]:
# Show one raw text (arbitrarily chosen row) for comparison with its tokens.
imdb.text[15142]

"...when he remade Broadway BILL (1934) as RIDING HIGH (1950). Recasting Bing Crosby as DAN BROOKS did not help a screenplay that was 'dated' in 34 let alone 50. This sad film has entire scenes lifted from the original with many of the supporting cast repeating their roles, unless they were dead. Though being older did not seem to matter to the Director. Nor that the Cars and Clothes in the background plates from 1934 did not seem match up too 1950s' standards. Not even 'der Bingel' singing can redeem this effort.<br /><br />We rated both the original and the remake IMDb Four****Stars. Frank's touch was long gone and all that was left was CAPRA-CORN. That did not stop Mr. Capra though. After floundering around the 50's making some educational documentaries he wound up his career remaking LADY FOR A DAY (1933) as POCKETFUL OF MIRACLES (1961). Again a fine cast was let down on that IMDb Six******Star effort compared too the originals Eight********Stars. Sometimes it is better to quit whi

In [33]:
# Print the tokens produced for the same row, joined with the SEP separator.
print_tokens(tokens[15142])

...•when•he•remade•xxmaj•broadway•xxup•bill•(•1934•)•as•xxup•riding•xxup•high•(•1950•)•.•xxmaj•recasting•xxmaj•bing•xxmaj•crosby•as•xxup•dan•xxup•brooks•did•not•help•a•screenplay•that•was•'•dated•'•in•34•let•alone•50•.•xxmaj•this•sad•film•has•entire•scenes•lifted•from•the•original•with•many•of•the•supporting•cast•repeating•their•roles•,•unless•they•were•dead•.•xxmaj•though•being•older•did•not•seem•to•matter•to•the•xxmaj•director•.•xxmaj•nor•that•the•xxmaj•cars•and•xxmaj•clothes•in•the•background•plates•from•1934•did•not•seem•match•up•too•1950s•'•standards•.•xxmaj•not•even•'•der•xxmaj•bingel•'•singing•can•redeem•this•effort•.•xxmaj•we•rated•both•the•original•and•the•remake•imdb•xxmaj•four•xxrep•4•*•xxmaj•stars•.•xxmaj•frank•'s•touch•was•long•gone•and•all•that•was•left•was•xxup•capra•-•xxup•corn•.•xxmaj•that•did•not•stop•xxmaj•mr.•xxmaj•capra•though•.•xxmaj•after•floundering•around•the•50•'s•making•some•educational•documentaries•he•wound•up•his•career•remaking•xxup•lady•xxup•for•a•xxup•d

In [34]:
# Build the vocabulary from all token lists using the default Vocab settings.
vocab = Vocab.from_token_lists(tokens)