In [5]:
import zlib

import numpy as np

In [12]:
def generate_token_index(text_corpus):
    """Build a token -> integer index mapping from an iterable of texts.

    Each text is split on whitespace. Tokens are numbered from 1 in order of
    first appearance, so index 0 is never assigned. Tokens are used verbatim:
    case and attached punctuation are preserved ('The' and 'the' are distinct,
    as are 'mat' and 'mat.').

    Returns a dict mapping every distinct token to its index.
    """
    vocabulary = {}

    for document in text_corpus:
        for word in document.split():
            # setdefault only inserts when the word is new, so an existing
            # index is never overwritten.
            vocabulary.setdefault(word, len(vocabulary) + 1)

    return vocabulary

# only the first max_len tokens are considered here!
def generate_one_hot_encoded_dataset(samples, token_index, max_len, dimensionality = None):
    """One-hot encode whitespace-tokenised samples into a 3-D float array.

    Only the first ``max_len`` tokens of each sample are encoded. Positions
    past the end of a shorter sample stay as all-zero vectors.

    Parameters
    ----------
    samples : sequence of str
        Texts to encode; each is split on whitespace.
    token_index : dict
        Maps token -> integer index. Consulted only when ``dimensionality``
        is None. Tokens missing from it are encoded as an all-zero vector.
    max_len : int
        Number of token positions per sample (size of the second axis).
    dimensionality : int, optional
        When given, the hashing trick is used instead of ``token_index``:
        each token is placed at ``crc32(token) % dimensionality``. Distinct
        tokens may collide on the same index.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(samples), max_len, depth)`` where ``depth`` is
        ``max(token_index.values()) + 1`` (1 for an empty ``token_index``) or
        ``dimensionality``.
    """
    if dimensionality is None:
        # "+ 1" makes room for the largest index itself; with indices that
        # start at 1 (as generate_token_index produces) slot 0 stays unused.
        # default=0 keeps an empty vocabulary from raising.
        depth = max(token_index.values(), default=0) + 1

        def index_of(token):
            return token_index.get(token)
    else:
        depth = dimensionality

        def index_of(token):
            # Built-in hash() on str is salted per process, so it would give
            # different encodings after every kernel restart; crc32 is stable.
            return zlib.crc32(token.encode("utf-8")) % dimensionality

    result = np.zeros(shape=(len(samples), max_len, depth))
    for row, sample in enumerate(samples):
        for col, token in enumerate(sample.split()[:max_len]):
            index = index_of(token)
            if index is None:
                # Out-of-vocabulary token: leave the vector all-zero.
                # Indexing with None would act as np.newaxis and set the
                # whole vector to 1.
                continue
            result[row, col, index] = 1

    return result

In [13]:
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
token_index = generate_token_index(samples)
print("Dict: ", token_index)

# result is a 3-D array of shape (b, m, v + 1):
#   b     : batch dimension; the number of samples
#   m     : max_len; the number of token positions encoded per sample
#           (10 is more than enough for these two short sentences)
#   v + 1 : the largest index in token_index plus one, so that index 0 is
#           never used to represent a token
# Here m and v are both 10 only by coincidence; the inner dimension follows
# the vocabulary size, not max_len.
result = generate_one_hot_encoded_dataset(samples, token_index, 10)
print("Result shape: ", result.shape)
result

Dict:  {'The': 1, 'cat': 2, 'sat': 3, 'on': 4, 'the': 5, 'mat.': 6, 'dog': 7, 'ate': 8, 'my': 9, 'homework.': 10}
Result shape:  (2, 10, 11)


array([[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0

In [8]:
from keras.preprocessing.text import Tokenizer
# num_words caps the vocabulary by word frequency, so it bounds the width of
# the encoded vectors. It is the counterpart of `dimensionality` in the
# hand-written helper, not of max_len (which limits tokens per sample).
keras_tokenizer = Tokenizer(num_words=1000)

Using TensorFlow backend.


In [9]:
# Build the tokenizer's internal vocabulary from the texts; this must run
# before texts_to_sequences() / texts_to_matrix().
keras_tokenizer.fit_on_texts(samples)
# Each sample becomes a list of integer word indices. In the printed output
# 'The' and 'the' share index 1, i.e. the tokenizer normalises case, unlike
# the hand-written generate_token_index.
tokens = keras_tokenizer.texts_to_sequences(samples)

print("Text to Sequences: ", tokens)

# mode='binary' yields one vector per sample marking which words occur in it
# (not one vector per token position as in generate_one_hot_encoded_dataset).
# Per the Keras docs the other modes are 'count', 'tfidf' and 'freq'.
one_hot = keras_tokenizer.texts_to_matrix(samples, mode='binary')

print("One hot: ", one_hot)

Text to Sequences:  [[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]
One hot:  [[0. 1. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]


In [14]:
# Hashed one-hot: passing `dimensionality` makes the helper ignore token_index
# and place each token at (hash of the token) % dimensionality instead.
# No vocabulary has to be stored, but distinct tokens can collide on one index.
result = generate_one_hot_encoded_dataset(samples, token_index, 10, dimensionality=1000)
# The bare string below is a no-op expression used as a block comment.
'''
result will be an n-D array of shape (b,m,d)

b: batch dimension ; represents the number of samples
m: max length ; represents the max length of words/tokens to be encoded (for the sample vocab, 10 is sufficient)
d: the inner dimension is set to `dimensionality` (here 1000), so that a 1000-long vector is used for token representation

'''
print(result.shape)
# Last expression of the cell, so the array itself is displayed.
result

(2, 10, 1000)


array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])