In [1]:
import hashlib
import string

import numpy as np
from keras.preprocessing.text import Tokenizer


# Toy corpus used by the encoding examples: one sentence per sample.
samples = [
    'The cat sat on the mat.',
    'The dog ate my homework.',
]

Using TensorFlow backend.


Word-level one-hot encoding

In [2]:
# Split each sample on whitespace only, so case and trailing punctuation are
# preserved ('The' and 'the' are distinct tokens, and 'mat.' keeps its full stop).
splittedSamples = [sentence.split() for sentence in samples]

# Give every distinct token a 1-based index in order of first appearance;
# index 0 is never assigned to a token.
dictionary = {}
for sentenceWords in splittedSamples:
    for word in sentenceWords:
        if word not in dictionary:
            dictionary[word] = len(dictionary) + 1

# Length of the token-position axis. The vocabulary size is kept as a floor so
# the array shape is unchanged wherever the previous sizing worked, but the axis
# also grows to the longest sample: a sentence with repeated words can hold more
# tokens than the vocabulary has entries, which previously raised IndexError.
longestSampleLength = max([len(sentenceWords) for sentenceWords in splittedSamples] + [0])
sequenceLength = max(len(dictionary), longestSampleLength)

# results[sample, position, tokenIndex] is 1. for the token found at that
# position and 0. everywhere else (including unused trailing positions).
results = np.zeros(shape=(len(samples),
                          sequenceLength,
                          len(dictionary) + 1))

for sampleIndex, sample in enumerate(splittedSamples):
    for wordIndexInSample, word in enumerate(sample):
        # Every word was registered in the loop above, so the lookup cannot miss.
        wordDictionaryIndex = dictionary[word]
        results[sampleIndex, wordIndexInSample, wordDictionaryIndex] = 1.

print(results)

[[[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]

 [[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]]


Word-level one-hot encoding with a hashing function

In [3]:
# Instead of keeping an explicit token -> index dictionary, each token is
# hashed straight into one of `hashDimensionality` slots. Distinct tokens may
# collide in the same slot.
hashDimensionality = 10000
maxSampleLength = 10

results = np.zeros(shape=(len(samples), maxSampleLength, hashDimensionality))

for sampleIndex, sample in enumerate(splittedSamples):
    # Only the first maxSampleLength tokens fit in the array; without the slice
    # a longer sample would raise IndexError.
    for wordIndexInSample, word in enumerate(sample[:maxSampleLength]):
        # The built-in hash() of a str is salted per interpreter process (unless
        # PYTHONHASHSEED is pinned), so it would yield different indices after
        # every kernel restart. A cryptographic digest gives the same index on
        # every run and every machine.
        wordDigest = hashlib.sha256(word.encode('utf-8')).hexdigest()
        wordDictionaryIndex = int(wordDigest, 16) % hashDimensionality
        results[sampleIndex, wordIndexInSample, wordDictionaryIndex] = 1.

print(results)

[[[ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  ..., 
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]]

 [[ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  ..., 
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]]]


Character-level one-hot encoding

In [4]:
# Map every character of string.printable to a 1-based index; index 0 is never
# assigned to a character.
charsDictionary = dict(zip(string.printable, range(1, len(string.printable) + 1)))

maxSampleLength = 50

# results[sample, position, charIndex] is 1. for the character found at that
# position and 0. everywhere else.
results = np.zeros(shape=(len(samples),
                          maxSampleLength,
                          len(charsDictionary) + 1))

for sampleIndex, sample in enumerate(samples):
    # Only the first maxSampleLength characters fit in the array; without the
    # slice a longer sample would raise IndexError.
    for charIndex, character in enumerate(sample[:maxSampleLength]):
        charDictionaryIndex = charsDictionary.get(character)
        if charDictionaryIndex is None:
            # Character outside string.printable (e.g. non-ASCII). Indexing a
            # NumPy array with None means np.newaxis, which would silently set
            # the whole row to 1., so leave this position all-zero instead.
            continue
        results[sampleIndex, charIndex, charDictionaryIndex] = 1.

print(results)

[[[ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  ..., 
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]]

 [[ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  ..., 
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]]]


In [8]:
# Same corpus, this time encoded with the Keras Tokenizer instead of by hand.
# NOTE(review): num_words presumably caps the vocabulary used when encoding;
# confirm the exact semantics against the Keras Tokenizer documentation.
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)

# Token -> integer index mapping built by fit_on_texts.
wordIndex = tokenizer.word_index

# One row per sample, produced in the tokenizer's 'binary' matrix mode.
oneHotResults = tokenizer.texts_to_matrix(samples, mode='binary')

uniqueTokenCount = len(wordIndex)
print('Found %s unique tokens.' % uniqueTokenCount)

Found 9 unique tokens.
