In [4]:
import collections
import re
from d2l import torch as d2l

We will load H. G. Wells' *The Time Machine*.

In [5]:
# Register the text file under the 'time_machine' key as a (URL, hash string) pair.
# NOTE(review): the hex string is presumably the file's SHA-1 checksum — confirm against d2l.
d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt',
'090b5e7e70c295757f55df93cb0a180b9691891a')


In [6]:
def read_time_machine():
    """Read the 'time_machine' text file into a list of cleaned lines.

    The file path comes from ``d2l.download('time_machine')``. In every line,
    each run of characters other than ASCII letters is replaced by a single
    space; the line is then stripped of surrounding whitespace and lowercased.
    Lines that end up empty are kept as empty strings.

    Returns:
        list[str]: one cleaned string per line of the file.
    """
    non_letters = re.compile('[^A-Za-z]+')
    with open(d2l.download('time_machine'), 'r') as f:
        raw_lines = list(f)
    cleaned = []
    for raw in raw_lines:
        cleaned.append(non_letters.sub(' ', raw).strip().lower())
    return cleaned

In [7]:
# Read the cleaned text once; `lines` is reused by the cells below.
lines = read_time_machine()

In [8]:
# Number of lines read (lines that became empty after cleaning are still counted).
print(len(lines))

3221


In [9]:
# Show the first cleaned line and the one at index 3220 (the last of the 3221 lines).
print(lines[0])
print(lines[3220])

the time machine by h g wells
of man


### Tokenization

The following tokenize function takes a list (lines) as the input, where each element is a text
sequence (e.g., a text line). Each text sequence is split into a list of tokens. A token is the basic unit
in text. In the end, a list of token lists is returned, where each token is a string.

In [10]:
def tokenize(lines, token='word'): #@save
    """Turn every text line into a list of word or character tokens.

    Args:
        lines: iterable of text lines.
        token: 'word' splits each line on whitespace; 'char' splits each
            line into its individual characters.

    Returns:
        A list with one token list per input line. For any other ``token``
        value an error message is printed and ``None`` is returned.
    """
    if token not in ('word', 'char'):
        print('ERROR: unknown token type: ' + token)
        return None
    if token == 'char':
        return [list(text) for text in lines]
    return [text.split() for text in lines]
        
# Word-level tokenization of the whole text; print the token lists of the first 11 lines.
tokens = tokenize(lines)
for i in range(11):
    print(tokens[i])

['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
[]
[]
[]
[]
['i']
[]
[]
['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him']
['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']
['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']


### Vocabulary

The string type of the token is inconvenient to be used by models, which take numerical inputs.
Now let us build a dictionary, often called vocabulary as well, to map string tokens into numerical
indices starting from 0. To do so, we first count the unique tokens in all the documents from the
training set, namely a corpus, and then assign a numerical index to each unique token according to
its frequency. Tokens that rarely appear are often removed to reduce the complexity. Any token that
does not exist in the corpus or has been removed is mapped into a special unknown token “<unk>”.
We optionally add a list of reserved tokens, such as “<pad>” for padding, “<bos>” to represent the
beginning of a sequence, and “<eos>” for the end of a sequence.


In [11]:
def count_corpus(tokens):
    """Count how often each token occurs.

    Args:
        tokens: either a flat list of tokens or a list of token lists.
            The input is treated as nested when it is empty or when its
            first element is a list; it is then flattened before counting.

    Returns:
        collections.Counter: mapping from token to its number of occurrences.
    """
    is_nested = len(tokens) == 0 or isinstance(tokens[0], list)
    if not is_nested:
        return collections.Counter(tokens)
    flat = []
    for line in tokens:
        flat.extend(line)
    return collections.Counter(flat)
        

In [27]:
# Count every word token, then rank the (token, count) pairs from most to least frequent.
new_collect = count_corpus(tokens)

# most_common() with no argument returns all pairs, ties kept in first-seen order.
token_freq = new_collect.most_common()

token_freq[:5]

[('the', 2261), ('i', 1267), ('and', 1245), ('of', 1155), ('a', 816)]

In [24]:
class Vocab:
    """Vocabulary mapping tokens to integer indices and back.

    Index 0 is always the unknown token '<unk>', followed by the reserved
    tokens, followed by the corpus tokens in order of decreasing frequency.
    """

    def __init__(self, tokens=None, min_freq=0, reserved_token=None):
        """Build the vocabulary from a corpus.

        Args:
            tokens: flat list of tokens or list of token lists; ``None``
                means an empty corpus.
            min_freq: tokens occurring fewer than ``min_freq`` times are
                left out of the vocabulary.
            reserved_token: optional sequence of extra tokens placed right
                after '<unk>'; ``None`` means no reserved tokens.
        """
        if tokens is None:
            tokens = []
        if reserved_token is None:
            reserved_token = []

        counter = count_corpus(tokens)
        # (token, count) pairs sorted from most to least frequent
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)

        # the unknown token is kept at index 0; list() lets tuples be passed too
        self.idx_to_token = ['<unk>'] + list(reserved_token)
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}

        for token, freq in self._token_freqs:
            if freq < min_freq:
                # frequencies are sorted descending, so all remaining tokens are rarer
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        """Return the number of entries, including '<unk>' and reserved tokens."""
        return len(self.idx_to_token)

    # called whenever indexed reference is made
    def __getitem__(self, tokens):
        """Map a token, or a (possibly nested) list/tuple of tokens, to indices.

        Tokens that are not in the vocabulary map to the unknown index.
        """
        if not isinstance(tokens, (list, tuple)):
            # unk is a method: it must be called, otherwise the bound method
            # itself would be returned for out-of-vocabulary tokens
            return self.token_to_idx.get(tokens, self.unk())
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a list/tuple of indices, back to token strings."""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    def unk(self):
        """Return the index of the unknown token."""
        return 0

    def token_freqs(self):
        """Return the (token, count) pairs sorted by decreasing count."""
        return self._token_freqs
        

In [25]:
# Build a word-level vocabulary and show the first ten (token, index) entries.
vocab = Vocab(tokens)

print(list(vocab.token_to_idx.items())[:10])

[('<unk>', 0), ('the', 1), ('i', 2), ('and', 3), ('of', 4), ('a', 5), ('to', 6), ('was', 7), ('in', 8), ('that', 9)]


In [28]:
# Convert two sample lines (indices 0 and 10) into their vocabulary index sequences.
for i in [0, 10]:
    print('words : ', tokens[i])
    print('indices : ', vocab[tokens[i]])

words :  ['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
indices :  [1, 19, 50, 40, 2183, 2184, 400]
words :  ['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']
indices :  [2186, 3, 25, 1044, 362, 113, 7, 1421, 3, 1045, 1]


In [29]:
def load_corpus_time_machine(max_tokens=-1):
    """Return the character-level index corpus and its vocabulary.

    The text is read with ``read_time_machine``, split into characters, and
    every character is replaced by its index in a ``Vocab`` built from those
    characters. All lines are concatenated into one flat list.

    Args:
        max_tokens: if positive, keep only the first ``max_tokens`` indices;
            otherwise the whole corpus is returned.

    Returns:
        tuple: ``(corpus, vocab)`` where ``corpus`` is a flat list of indices.
    """
    char_tokens = tokenize(read_time_machine(), 'char')
    vocab = Vocab(char_tokens)

    corpus = []
    for line in char_tokens:
        for ch in line:
            corpus.append(vocab[ch])

    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab
        

In [31]:
# Build the character-level corpus (this rebinds `vocab`); display corpus length and vocabulary size.
corpus, vocab = load_corpus_time_machine()

len(corpus),len(vocab)

(170580, 28)

In [41]:
# Peek at the first four character indices of the corpus.
corpus[:4]

[3, 9, 2, 1]

### Exercises
1. Tokenization is a key preprocessing step. It varies for different languages. Try to find another
three commonly used methods to tokenize text.

    Different Methods to Perform Tokenization in Python
    Tokenization using Python split() Function
    Tokenization using Regular Expressions
    Tokenization using NLTK
    Tokenization using Spacy
    Tokenization using Keras
    Tokenization using Gensim

2. In the experiment of this section, tokenize text into words and vary the min_freq arguments
of the Vocab instance. How does this affect the vocabulary size?

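    As min_freq increases, the vocabulary size decreases: every token that occurs fewer than min_freq times is left out of the vocabulary and is mapped to “<unk>” instead.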