# Text Preprocessing



In [1]:
import collections
import re
from d2l import torch as d2l

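Read the text into a list of lines, replacing every run of non-letter characters with a single space and converting everything to lowercase.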

In [2]:
class TimeMachine(d2l.DataModule):
    """Data module for the text file 'timemachine.txt'."""
    def load(self):
        """Fetch the text file and return its lines in normalized form.

        Returns:
            A list with one string per line of the file. In each line, every
            run of characters other than A-Z/a-z is replaced by one space,
            surrounding whitespace is stripped, and the result is lowercased.
            Lines without any letters therefore become empty strings.
        """
        # NOTE(review): d2l.download presumably caches the file under
        # self.root and checks it against the given hash -- confirm in d2l.
        path = d2l.download(d2l.DATA_URL+'timemachine.txt', self.root,
                            '090b5e7e70c295757f55df93cb0a180b9691891a')
        cleaned = []
        with open(path) as handle:
            for raw_line in handle:
                letters_only = re.sub('[^A-Za-z]+', ' ', raw_line)
                cleaned.append(letters_only.strip().lower())
        return cleaned

# Load the normalized lines and show the line count plus two sample lines.
data = TimeMachine()
lines = data.load()
# The original statement was cut off after "print(f'", which is a syntax
# error; restored to match the recorded cell output "# text lines: 3221".
print(f'# text lines: {len(lines)}')
print(lines[0])
print(lines[10])

# text lines: 3221
the time machine by h g wells
twinkled and his usually pale face was flushed and animated the


We then split each line into a list of tokens. Here, each token is a single character.

In [3]:
@d2l.add_to_class(TimeMachine)
def tokenize(self, lines):
    """Split each line into a list of its elements.

    For string lines this yields one list of single characters per line;
    an empty line yields an empty list.
    """
    return list(map(list, lines))

# Tokenize all lines, then preview the first 12 tokens of lines 8 through 10
# (zero-based positions 7, 8 and 9).
tokens = data.tokenize(lines)
for i in (7, 8, 9):
    print(f'line {i+1}: {tokens[i][:12]}')

line 8: []
line 9: ['t', 'h', 'e', ' ', 't', 'i', 'm', 'e', ' ', 't', 'r', 'a']
line 10: ['w', 'a', 's', ' ', 'e', 'x', 'p', 'o', 'u', 'n', 'd', 'i']


The tokens are still strings, but it is more convenient to work with numerical indices.
To this end, we need a class
that constructs a *vocabulary*,
which assigns a unique index
to each distinct token value.

In [4]:
class Vocab:
    """Vocabulary for text.

    Maps each distinct token to an integer index and back. Indices follow the
    sorted order of the token values. The token '<unk>' is always part of the
    vocabulary, and its index is returned for any token that is not known.

    Attributes set by ``__init__``:
        token_freqs: list of ``(token, count)`` pairs, highest count first.
        idx_to_token: sorted list of all tokens kept in the vocabulary.
        token_to_idx: dict mapping each kept token to its list position.
    """
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """Count the tokens and build the index lookup tables.

        Args:
            tokens: A flat list of tokens, or a list of token lists (detected
                by inspecting the first element), which is flattened first.
                ``None`` or an empty list yields only '<unk>' plus the
                reserved tokens.
            min_freq: Tokens that occur fewer than ``min_freq`` times are
                left out of the vocabulary.
            reserved_tokens: Extra tokens to include regardless of how often
                they occur. ``None`` means no extra tokens.
        """
        # Use None rather than [] as the default so that no mutable object is
        # shared between calls.
        if tokens is None:
            tokens = []
        reserved_tokens = [] if reserved_tokens is None else list(reserved_tokens)
        # Flatten a 2D list of token lists into a single flat list.
        if tokens and isinstance(tokens[0], list):
            tokens = [token for line in tokens for token in line]
        counter = collections.Counter(tokens)
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                  reverse=True)
        # The set removes duplicates (e.g. a reserved token that also occurs
        # in the data); sorting makes the index assignment deterministic.
        self.idx_to_token = sorted(set(['<unk>'] + reserved_tokens + [
            token for token, freq in self.token_freqs if freq >= min_freq]))
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}

    def __len__(self):
        """Return the number of tokens in the vocabulary, '<unk>' included."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Look up the index of one token, or of each token in a list/tuple.

        Tokens that are not in the vocabulary map to the index of '<unk>'.
        Nested lists/tuples are resolved recursively.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Return the token for one index, or a list of tokens for a list/tuple."""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):
        """Index of the '<unk>' token."""
        return self.token_to_idx['<unk>']

Construct the vocabulary, then convert the first line of tokens to indices and back again.

In [5]:
# Build the vocabulary from all tokens and round-trip the first line:
# tokens -> indices -> tokens.
vocab = Vocab(tokens)
first_line = tokens[0]
# NOTE(review): 'indicies' is a misspelling of 'indices'; the name is kept
# unchanged in case later cells refer to it.
indicies = vocab[first_line]
print('indices:', indicies)
print('words:', vocab.to_tokens(indicies))

indices: [21, 9, 6, 0, 21, 10, 14, 6, 0, 14, 2, 4, 9, 10, 15, 6, 0, 3, 26, 0, 9, 0, 8, 0, 24, 6, 13, 13, 20]
words: ['t', 'h', 'e', ' ', 't', 'i', 'm', 'e', ' ', 'm', 'a', 'c', 'h', 'i', 'n', 'e', ' ', 'b', 'y', ' ', 'h', ' ', 'g', ' ', 'w', 'e', 'l', 'l', 's']


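Package everything into the `prepare_data` method of the `TimeMachine` class, which stores the vocabulary in `vocab` and the whole text as one flat list of token indices in `corpus`.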

In [6]:
@d2l.add_to_class(TimeMachine)
def prepare_data(self):
    """Load and tokenize the text, then build the vocabulary and corpus.

    Sets ``self.vocab`` to a ``Vocab`` built from all tokens and
    ``self.corpus`` to a single flat list holding the vocabulary index of
    every token, in reading order across all lines.
    """
    tokenized_lines = self.tokenize(self.load())
    self.vocab = Vocab(tokenized_lines)
    flat_indices = []
    for line in tokenized_lines:
        for token in line:
            flat_indices.append(self.vocab[token])
    self.corpus = flat_indices

# Populate data.vocab and data.corpus, then display (corpus length,
# vocabulary size) as the cell result.
data.prepare_data()
(len(data.corpus), len(data.vocab))

(170580, 28)