In [11]:
# Case I : One hot encoding to each words.

import numpy as np

samples = ['The cat sat on the mat',
           'The dog ate my homework']

token_index = {}

for sample in samples:
    for word in sample.split():
        if word not in token_index:
            token_index[word] = len(token_index) + 1

print('Token index =', token_index)

max_length = 10
result = np.zeros(shape=(len(samples),
                         max_length,
                         max(token_index.values()) + 1))

for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = token_index.get(word)
        result[i, j, index] = 1.

print('Vectorized =', result)

Token index = {'The': 1, 'cat': 2, 'sat': 3, 'on': 4, 'the': 5, 'mat': 6, 'dog': 7, 'ate': 8, 'my': 9, 'homework': 10}
Vectorized = [[[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]

 [[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  

In [29]:
# Case II : One hot encoding to each characters.

import string

samples = ['The cat sat on the mat',
           'The dog ate my homework']
characters = string.printable
token_index = dict(zip(characters, range(1, len(characters) + 1)))

max_length = 50
result = np.zeros(shape=(len(samples),
                         max_length,
                         max(token_index.values()) + 1))

for i, sample in enumerate(samples):
    for j, character in enumerate(sample):
        index = token_index.get(character)
        result[i, j, index] = 1.

In [37]:
# Case III : One hot encoding with keras utilities

from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat',
           'The dog ate my homework']

# Choose most frequently used 1000 words.
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)

# List of indice according to word.
sequences = tokenizer.texts_to_sequences(samples)
print('Indices =', sequences)

one_hot_result = tokenizer.texts_to_matrix(samples, mode='binary')
print('One hot encoded =\n', one_hot_result[:,:10])

# Word index
word_index = tokenizer.word_index
print('Word index =', word_index)

Indices = [[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]
One hot encoded =
 [[ 0.  1.  1.  1.  1.  1.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  1.  1.  1.  1.]]
Word index = {'the': 1, 'cat': 2, 'sat': 3, 'on': 4, 'mat': 5, 'dog': 6, 'ate': 7, 'my': 8, 'homework': 9}


In [47]:
# Case IV : One hot hashing

samples = ['The cat sat on the mat',
           'The dog ate my homework']

# Length of hash table
dim = 1000
max_length = 10

result = np.zeros(shape=(len(samples), max_length, dim))
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = abs(hash(word)) % dim
        result[i, j, index] = 1.

# If the number of non-duplicated words in samples is much larger
# than the length of hash table, it makes frequently hash collisions.
entry_index = np.nonzero(result)
entry_index = entry_index[-1]
print('One hot hashed, entry indice =', entry_index)

One hot hashed, entry indice = [581 397 220 772 549 633 581  40 312 662 809]
