# Word-level one-hot encoding:
## CH6: Deep Learning for text and sequences
### Deep Learning with Keras pg. 182
#### Toy Example

In [7]:
import numpy as np
samples= ['The cat sat on the mat.','The dog ate my homework.']

token_index={}
for sample in samples:
    for word in sample.split():
        if word not in token_index:
            token_index[word]= len(token_index)+1
max_length= 10
res= np.zeros(shape= (len(samples), max_length, max(token_index.values()) +1))

for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index= token_index.get(word)
        res[i,j,index]=1.
print(res)

[[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]]


In [8]:
import numpy as np
import string

samples= ['The cat sat on the mat.','The dog ate my homework.']
chars= string.printable
token_index= dict(zip(range(1,len(chars)+1), chars))

max_length=50
res= np.zeros((len(samples), max_length, max(token_index.keys())+ 1))
for i,sample in enumerate(samples):
    for j, char in enumerate(sample):
        index= token_index.get(char)
        res[i,j,index]=1.
print(res)

[[[1. 1. 1. ... 1. 1. 1.]
  [1. 1. 1. ... 1. 1. 1.]
  [1. 1. 1. ... 1. 1. 1.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[1. 1. 1. ... 1. 1. 1.]
  [1. 1. 1. ... 1. 1. 1.]
  [1. 1. 1. ... 1. 1. 1.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]


In [13]:
from keras.preprocessing.text import Tokenizer

samples=['The cat sat on the mat.', 'The dog ate my homework.']

tokenizer= Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)

sequences= tokenizer.texts_to_sequences(samples)

one_hot_res= tokenizer.texts_to_sequences(samples)

word_index= tokenizer.word_index
print('Found %s unique tokens.' %len(word_index))

Found 9 unique tokens.


In [16]:
dimensionality= 1000
max_len= 10
res= np.zeros((len(samples), max_len, dimensionality))
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_len]:
        index= abs(hash(word)) % dimensionality
        res[i,j, index]= 1.
print(res)

[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]
