In [25]:
#export
"""
Text processing utils. Mostly copied from the fastai library.

The utilities convert "raw" texts into a format more suitable for NLP
models: the texts are cleaned and then converted into lists of tokens.
"""
import html
import re

import spacy
from spacy.lang.en import English

from loop.annotations import MaybeList, Callable
from loop.utils import combine

In [2]:
#export
# Separator placed between tokens by `format_tokens` when rendering them as one string.
SEP    = '•'
# Special marker tokens inserted into (or expected in) the token stream.
T_UP   = 'xxup'    # emitted by `replace_capslock` in front of a lower-cased all-caps token
T_REP  = 'xxrep'   # emitted by `replace_repeated_chars`, followed by the run length and the character
T_WREP = 'xxwrep'  # emitted by `replace_repeated_words`, followed by the count and the word
T_MAJ  = 'xxmaj'   # emitted by `replace_capitalized` in front of a lower-cased capitalized token
T_BOS  = 'xxbos'   # not referenced in the visible part of this notebook; presumably "beginning of sequence" -- TODO confirm
T_EOS  = 'xxeos'   # default sample delimiter in `create_samples`
T_FLD  = 'xxfld'   # not referenced in the visible part of this notebook; presumably a field marker -- TODO confirm
T_UNK  = 'xxunk'   # substituted for '<unk>' by `fix_wikitext_special_cases`
T_PAD  = 'xxpad'   # not referenced in the visible part of this notebook; presumably padding -- TODO confirm
# All markers; `tokenize_english` passes this list as `special` to `tokenize`.
TOKENS = [T_UP, T_REP, T_WREP, T_MAJ, T_BOS, T_EOS, T_FLD, T_UNK, T_PAD]

In [3]:
#export
def replace_tabs_with_spaces(s: str) -> str:
    """Return `s` with every tab character swapped for a single space."""
    # Splitting on tabs and re-joining with spaces is a one-for-one substitution.
    pieces = s.split('\t')
    return ' '.join(pieces)

In [4]:
# Each tab becomes exactly one space (runs of tabs are not collapsed); tab-free text is untouched.
assert replace_tabs_with_spaces('\ttabs\t') == ' tabs '
assert replace_tabs_with_spaces('\t\t\tmore tabs\t\t\t') == '   more tabs   '
assert replace_tabs_with_spaces('noop') == 'noop'

In [5]:
#export
def add_spaces_around(s: str) -> str:
    """Pad every slash, hash sign and newline in `s` with one space on each side."""
    special_chars = re.compile(r'([/#\n])')
    return special_chars.sub(r' \1 ', s)

In [6]:
# The matched character is kept and gets one space on each side; other text is untouched.
assert add_spaces_around('#') == ' # '
assert add_spaces_around('\n') == ' \n '
assert add_spaces_around('noop') == 'noop'

In [7]:
#export
def trim_useless_spaces(s: str) -> str:
    """Collapse each run of two or more spaces in `s` into a single space."""
    space_run = re.compile(' {2,}')
    return space_run.sub(' ', s)

In [8]:
# Runs of 1 to 10 spaces all end up as a single space; single spaces around a word are kept.
space = ' '
assert all([trim_useless_spaces(space * i) == space for i in range (1, 11)])
assert trim_useless_spaces(f'{space}word{space}') == f'{space}word{space}'
assert trim_useless_spaces('noop') == 'noop'

In [9]:
#export
def replace_repeated_chars(s: str) -> str:
    """Replace runs of a repeated non-whitespace character with a `T_REP` marker.

    A run of four or more identical non-whitespace characters is rewritten as
    ' <T_REP> <run length> <character> ' (note the surrounding spaces).
    Shorter runs are left untouched.
    """
    def _describe_run(found):
        # Group 1 is the first character, group 2 the remaining repetitions.
        symbol = found.group(1)
        total = len(found.group(2)) + 1
        return f' {T_REP} {total} {symbol} '

    return re.sub(r'(\S)(\1{3,})', _describe_run, s)

In [10]:
# Runs of four or more identical characters become ' xxrep <count> <char> '.
# Because each marker is padded on both sides, two spaces are left where a
# marker is followed by an existing space (see 'o  c' in the second case).
assert replace_repeated_chars('aaaa') == f' {T_REP} 4 a '
assert replace_repeated_chars('sooooo cooool') == f's {T_REP} 5 o  c {T_REP} 4 o l'
assert replace_repeated_chars('noop') == 'noop'

In [11]:
#export
def replace_repeated_words(s: str) -> str:
    """Replace consecutive repetitions of a word with a `T_WREP` marker.

    The pattern captures a word together with the non-word characters that
    follow it, and requires at least three further identical copies of that
    whole chunk. Each copy must therefore be followed by the same non-word
    separator: a final repetition sitting directly at the end of the string is
    not part of the match and is not counted.

    The replacement is ' <T_WREP> <count> <chunk> ', where <chunk> still
    carries its trailing separator.
    """
    def _describe_run(found):
        chunk = found.group(1)
        following = found.group(2)
        # Whitespace-split the repeated part to count the copies, plus one for the first.
        total = len(following.split()) + 1
        return f' {T_WREP} {total} {chunk} '

    return re.sub(r'(\b\w+\W+)(\1{3,})', _describe_run, s)

In [12]:
# Five repetitions are reported as 4: the pattern needs non-word characters after
# every repetition, so the last 'one' (followed by the end of the string) stays
# outside the match. The captured word also keeps its trailing space, hence 'one  one'.
assert replace_repeated_words('one one one one one') == f' {T_WREP} 4 one  one'

In [13]:
#export
def fix_wikitext_special_cases(s: str) -> str:
    """Clean up leftover markup and escape artefacts in `s`.

    Applies a fixed sequence of literal substitutions, then decodes HTML
    entities with `html.unescape`, and finally collapses every run of two or
    more spaces into a single space.
    """
    # Applied strictly in this order: the escaped sequences backslash+n and
    # backslash+quote must be handled before the generic backslash padding rule.
    substitutions = (
        ('#39;',   "'"),
        ('amp;',   '&'),
        ('#146;',  "'"),
        ('nbsp;',  ' '),
        ('#36;',   '$'),
        ('\\n',    "\n"),
        ('quot;',  "'"),
        ('<br />', "\n"),
        ('\\"',    '"'),
        (' @.@ ',  '.'),
        (' @-@ ',  '-'),
        (' @,@ ',  ','),
        ('\\',     ' \\ '),
        ('<unk>',  T_UNK),
    )
    for old, new in substitutions:
        s = s.replace(old, new)
    unescaped = html.unescape(s)
    return re.sub(r'  +', ' ', unescaped)

In [14]:
#export
def replace_capslock(tokens: list) -> list:
    """Mark all-caps tokens with `T_UP` and lower-case them.

    A token longer than one character for which `str.isupper()` holds is
    replaced by the pair `T_UP`, lower-cased token. Every other token is
    copied through unchanged.
    """
    result = []
    for tok in tokens:
        shouting = tok.isupper() and len(tok) > 1
        result.extend((T_UP, tok.lower()) if shouting else (tok,))
    return result

In [15]:
# An all-caps token is lower-cased and preceded by the T_UP marker.
assert replace_capslock(['CAPSLOCK']) == [T_UP, 'capslock']

In [16]:
#export
def replace_capitalized(tokens: list) -> list:
    """Lower-case every token, marking capitalized ones with `T_MAJ`.

    Empty tokens are dropped. A token gets a `T_MAJ` marker in front of it when
    it is longer than one character, starts with an upper-case character and
    the rest of it is lower-case. All kept tokens are emitted lower-cased.
    """
    result = []
    for tok in (t for t in tokens if t != ''):
        head, tail = tok[0], tok[1:]
        if head.isupper() and len(tok) > 1 and tail.islower():
            result.append(T_MAJ)
        result.append(tok.lower())
    return result

In [17]:
# Each capitalized token is lower-cased and preceded by the T_MAJ marker.
assert replace_capitalized(['Capitalized', 'Words']) == [T_MAJ, 'capitalized', T_MAJ, 'words']

In [18]:
#export
# Text-level rules applied, in order, to the raw string before tokenization.
PREP_RULES = [
    # Must run before `add_spaces_around`: that rule pads '#' and '/' with
    # spaces, which would break up '#39;', '#146;', '#36;' and '<br />' so the
    # corresponding replacements could never match. Running it first also lets
    # the newlines it produces get the same padding as real newlines.
    fix_wikitext_special_cases,
    replace_tabs_with_spaces,
    add_spaces_around,
    trim_useless_spaces,
    replace_repeated_chars,
    replace_repeated_words,
    # The repetition rules pad their markers with spaces; collapse the
    # resulting double spaces at the very end of the pipeline.
    trim_useless_spaces,
]

# Token-level rules applied, in order, to the list of tokens. `replace_capslock`
# has to see the tokens before `replace_capitalized` lower-cases all of them.
POST_RULES = [
    replace_capslock,
    replace_capitalized
]

In [27]:
#export
def tokenize(text: str, prep: MaybeList=None, post: MaybeList=None,
             special: MaybeList=None, model_fn: Callable=English) -> list:
    """Convert text into a list of tokens.

    Args:
        text: The raw string to tokenize.
        prep: Rules handed to `combine` together with the text before
            tokenization. `None` or an empty collection means no
            preprocessing.
        post: Rules handed to `combine` together with the token list after
            tokenization. `None` or an empty collection means no
            postprocessing.
        special: Strings registered with the tokenizer as special cases, so
            that each one is kept as a single token. `None` registers nothing.
        model_fn: Zero-argument callable returning the language object whose
            tokenizer is used (it must provide `tokenizer.add_special_case`
            and `make_doc`).

    Returns:
        The list of token strings, after the `post` rules have been applied.
    """
    nlp = model_fn()
    # `or ()` lets the None default fall through without a separate branch.
    for t in special or ():
        nlp.tokenizer.add_special_case(t, [{spacy.symbols.ORTH: t}])
    # The defaults are None, which cannot be star-unpacked; skip the step
    # entirely when there are no rules to apply.
    if prep:
        text = combine(text, *prep)
    tokens = [token.text for token in nlp.make_doc(text)]
    if post:
        tokens = combine(tokens, *post)
    return tokens

In [28]:
#export
def tokenize_english(text):
    """Tokenize `text` with `tokenize`, using this module's default rules.

    `PREP_RULES` is used for preprocessing, `POST_RULES` for postprocessing,
    and every marker in `TOKENS` is registered as a tokenizer special case.
    """
    options = dict(prep=PREP_RULES, post=POST_RULES, special=TOKENS)
    return tokenize(text, **options)

In [34]:
# End-to-end check of `tokenize_english`: capitalized words get a T_MAJ marker,
# all-caps words get a T_UP marker, every token is lower-cased, and the padded
# newlines come out as whitespace tokens such as '\n ' and '\n \n '.
text = """English text that should be tokenized.

The text contains "quoted names", commas, dots. It also has some shortcuts, like "doesn't"
and "don't", if you'd like. 

Also, we've SOME CAPSLOCK here.
"""

expected = [
    T_MAJ, 'english', 'text', 'that', 'should', 'be', 'tokenized', '.', '\n \n ',
    T_MAJ, 'the', 'text', 'contains', '"', 'quoted', 'names', '"', ',', 'commas',
    ',', 'dots', '.',
    T_MAJ, 'it', 'also', 'has', 'some', 'shortcuts', ',', 'like', '"', 'does',
    "n't", '"', '\n ', 'and', '"', 'do', "n't", '"', ',', 'if', 'you', "'d", 'like',
    '.', '\n \n ',
    T_MAJ, 'also', ',', 'we', "'ve", T_UP, 'some', T_UP, 'capslock', 'here', '.', '\n '
]

assert tokenize_english(text) == expected

['xxmaj',
 'english',
 'text',
 'that',
 'should',
 'be',
 'tokenized',
 '.',
 '\n \n ',
 'xxmaj',
 'the',
 'text',
 'contains',
 '"',
 'quoted',
 'names',
 '"',
 ',',
 'commas',
 ',',
 'dots',
 '.',
 'xxmaj',
 'it',
 'also',
 'has',
 'some',
 'shortcuts',
 ',',
 'like',
 '"',
 'does',
 "n't",
 '"',
 '\n ',
 'and',
 '"',
 'do',
 "n't",
 '"',
 ',',
 'if',
 'you',
 "'d",
 'like',
 '.',
 '\n \n ',
 'xxmaj',
 'also',
 ',',
 'we',
 "'ve",
 'xxup',
 'some',
 'xxup',
 'capslock',
 'here',
 '.',
 '\n ']

In [21]:
#export
def useless_token(token, remove=('=', ' ')):
    """Tell whether `token` is one of the values listed in `remove`.

    By default the equals sign and a single space are reported as useless.
    """
    is_useless = token in remove
    return is_useless

In [22]:
#export
def create_samples(tokens, eos=T_EOS, ignore=useless_token):
    """Group a flat sequence of tokens into samples delimited by `eos`.

    Tokens for which `ignore(token)` is truthy are dropped. Every other token
    is added to the current sample; an `eos` token is kept as the last element
    of its sample and closes it. Tokens left over after the last `eos` form a
    final sample.

    Returns:
        A list of samples, each one a list of tokens.
    """
    chunks = []
    current = []
    for tok in tokens:
        if not ignore(tok):
            current.append(tok)
            if tok == eos:
                # The delimiter is already stored; begin collecting a new sample.
                chunks.append(current)
                current = []
    if current:
        chunks.append(current)
    return chunks

In [None]:
#export
def format_tokens(tokens):
    """Join `tokens` into one string with `SEP` between neighbouring tokens."""
    separator = SEP
    return separator.join(tokens)

In [None]:
#export
def print_tokens(tokens, n=500):
    """Print the first `n` tokens as one string built by `format_tokens`."""
    head = tokens[:n]
    print(format_tokens(head))