In [1]:
# word-level one hot encoding in Keras
from keras.preprocessing.text import Tokenizer

samples = [
    "A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave",
    "Lasting just 61 seconds and consisting of two shots, the first shot is set in a wood during winter. The actor representing then vice-president Theodore Roosevelt enthusiastically hurries down a hillside towards a tree in the foreground. He falls once, but rights himself and cocks his rifle. Two other men, bearing signs reading ""His Photographer"" and ""His Press Agent"" respectively, follow him into the shot; the photographer sets up his camera. ""Teddy"" aims his rifle upward at the tree and fells what appears to be a common house cat, which he then proceeds to stab. ""Teddy"" holds his prize aloft, and the press agent takes notes. The second shot is taken in a slightly different part of the wood, on a path. ""Teddy"" rides the path on his horse towards the camera and out to the left of the shot, followed closely by the press agent and photographer, still dutifully holding their signs"
]

tokenizer = Tokenizer(num_words=100) #picks most frequent 100 words
tokenizer.fit_on_texts( samples )

sequences = tokenizer.texts_to_sequences( samples )
one_hot_result = tokenizer.texts_to_matrix( samples, mode='binary')
word_index = tokenizer.word_index

print('Total unique tokens:', len(word_index))
print('Unique tokens:' , word_index )
print('Sequences for given sample:', sequences)
print('One hot encoding for given samples:', one_hot_result)

Using TensorFlow backend.


Total unique tokens: 145
Unique tokens: {'the': 1, 'and': 2, 'a': 3, 'his': 4, 'to': 5, 'then': 6, 'in': 7, 'of': 8, 'shot': 9, 'is': 10, 'he': 11, 'photographer': 12, 'press': 13, 'agent': 14, 'teddy': 15, 'bartender': 16, 'at': 17, 'irish': 18, 'beer': 19, 'over': 20, 'group': 21, 'two': 22, 'wood': 23, 'towards': 24, 'tree': 25, 'rifle': 26, 'signs': 27, 'camera': 28, 'on': 29, 'path': 30, 'working': 31, 'saloon': 32, 'serving': 33, 'drinks': 34, 'customers': 35, 'after': 36, 'fills': 37, 'stereotypically': 38, "man's": 39, 'bucket': 40, 'with': 41, 'carrie': 42, 'nation': 43, 'her': 44, 'followers': 45, 'burst': 46, 'inside': 47, 'they': 48, 'assault': 49, 'man': 50, 'pulling': 51, 'hat': 52, 'eyes': 53, 'dumping': 54, 'head': 55, 'begin': 56, 'wrecking': 57, 'bar': 58, 'smashing': 59, 'fixtures': 60, 'mirrors': 61, 'breaking': 62, 'cash': 63, 'register': 64, 'sprays': 65, 'seltzer': 66, 'water': 67, "nation's": 68, 'face': 69, 'before': 70, 'policemen': 71, 'appear': 72, 'order': 

In [2]:
# we can also use hashing to tokenize words - susceptible to hash collisions!!
import numpy as np

dimensionality = 150
max_length = 20 #control vector representing 
word_index = {}
results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = abs(hash(word)) % dimensionality
        word_index[ word ] = index #get's overwritten on collision
        results[i,j, index] = 1

print( 'Unique tokens:', word_index )
print( 'Final encoding:', results)

Unique tokens: {'A': 16, 'bartender': 48, 'is': 115, 'working': 144, 'at': 59, 'a': 68, 'saloon,': 69, 'serving': 118, 'drinks': 97, 'to': 84, 'customers.': 78, 'After': 71, 'he': 49, 'fills': 30, 'stereotypically': 28, 'Irish': 14, "man's": 23, 'bucket': 118, 'with': 116, 'Lasting': 12, 'just': 62, '61': 146, 'seconds': 116, 'and': 37, 'consisting': 132, 'of': 138, 'two': 34, 'shots,': 87, 'the': 117, 'first': 105, 'shot': 15, 'set': 89, 'in': 52, 'wood': 42, 'during': 41, 'winter.': 41, 'The': 52}
Final encoding: [[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]
