In [20]:
# book example on Listing 6.1 (one-hot encoding words)
import numpy as np

# Two short sentences act as the whole input corpus.
samples = ['I study at CityU', 'I study at CityU at Seattle']

# Vocabulary dictionary: key = unique word, value = running index 1..N.
# Index 0 is deliberately never assigned to a word.
token_index = {}
for sentence in samples:
    for token in sentence.split():  # whitespace tokenization of one sentence
        if token in token_index:
            continue  # word already has an index
        token_index[token] = len(token_index) + 1  # indices start at 1
        print(token_index[token], token)

# Number of leading words of each sample that will be encoded as features.
max_length = 6
# One slot per assigned index plus the unused slot 0.
num_slots = max(token_index.values()) + 1
# 3D tensor of shape (samples, max_length, num_slots), initially all zeros.
results = np.zeros(shape=(len(samples), max_length, num_slots))


1 I
2 study
3 at
4 CityU
5 Seattle


In [21]:
# Fill the one-hot tensor: results[i, j, k] = 1 when the j-th word of
# sample i has vocabulary index k. Words beyond max_length are ignored.
for i, sample in enumerate(samples):
    # Slice the word list first so only the first max_length words are visited.
    for j, word in enumerate(sample.split()[:max_length]):
        # Direct lookup on purpose: dict.get() returns None for an unknown word,
        # and NumPy interprets a None index as np.newaxis, which would silently
        # set the entire row results[i, j, :] to 1 instead of raising an error.
        index = token_index[word]
        results[i, j, index] = 1.
results

array([[[0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]],

       [[0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 1.]]])

In [22]:
from keras.preprocessing.text import Tokenizer

samples = ['I study at CityU', 'I study at CityU at Seattle']

# num_words bounds the vocabulary used when encoding: with num_words = 6 only
# the 5 most frequent words (indices 1..5) are kept, and index 0 is reserved.
tokenizer = Tokenizer(num_words=6)

# Build the internal vocabulary from the texts. Words are lower-cased and
# indexed by frequency, so the most frequent word receives index 1.
tokenizer.fit_on_texts(samples)

# word (key) -> integer index (value) mapping computed by fit_on_texts.
word_index = tokenizer.word_index

# Replace every word of every sentence with its integer index from word_index.
sequences = tokenizer.texts_to_sequences(samples)

# One row per sentence, one column per index; 'binary' marks presence only
# (mode='count' would store the number of occurrences instead).
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

print('Found %s unique tokens.' % len(word_index))
print("Sequences: ", sequences, "\n")
print("word_index: ", word_index)
print("one hot results: ", one_hot_results)



Found 5 unique tokens.
Sequences:  [[2, 3, 1, 4], [2, 3, 1, 4, 1, 5]] 

word_index:  {'at': 1, 'i': 2, 'study': 3, 'cityu': 4, 'seattle': 5}
one hot results:  [[0. 1. 1. 1. 1. 0.]
 [0. 1. 1. 1. 1. 1.]]


In [23]:
import sys

import numpy as np

# Raise the summarisation threshold so NumPy prints arrays in full instead of
# eliding the middle with "...". Imports are local to the cell so that it also
# works on a fresh kernel (the notebook otherwise only binds NumPy as `np`).
np.set_printoptions(threshold=sys.maxsize)
one_hot_results


array([[0., 1., 1., 1., 1., 0.],
       [0., 1., 1., 1., 1., 1.]])

Word embedding examples using an Embedding layer in Keras
Learning an embedding layer
source: https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/


In [55]:

from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
# Import from the public keras.layers namespace (same as the GloVe cell below);
# the keras.layers.embeddings submodule path is not available in every Keras version.
from keras.layers import Embedding

# define documents: the first 10 are positive (label 1), the last 10 negative (label 0)
docs = ['Well done!',           # + = 1
        'Good work',
        'Great effort',
        'nice work',
        'Excellent!',
        'Fine work!',
        'Bravo!',
        'Tremendous idea',
        'Awesome!',
        'Perfect work',
        'Weak',                 # - = 0
        'Poor effort!',
        'not good',
        'poor work',
        'Could have done better.',
        'Sucks',
        'Inferior to your previous work',
        'Substandard',
        'Faulty thoughts',
        'Terrible work to be presented'
        ]
# define class labels, one per document and in the same order as docs
labels = array([1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0])
assert len(labels) == len(docs), "every document needs exactly one label"

# integer encode the documents (hashing-based encoding)
# hyper parameter #1: size of the hashing space. Different words can collide on
# the same integer, so keep it well above the number of distinct words.
vocab_size = 1000
# one_hot converts each input sentence into a list of integer word indices
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)

# pad every document with trailing zeros to a fixed length of max_length entries
max_length = 8 # vector space to accommodate the input text sequence, [1, max_length].
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

# define the model
model = Sequential()
# The Embedding layer has a vocabulary of vocab_size (1000) entries, an input
# length of max_length (8) and an embedding space of num_layer (8) dimensions.
num_layer = 8 # hyper parameter #2: embedding dimension {8, 16, 24, 128 ...}
model.add(Embedding(vocab_size, num_layer, input_length=max_length))
# The Embedding output is max_length vectors of num_layer dimensions each (8 x 8),
# one per word position. Flatten turns this into a single 64-element vector
# that is passed on to the Dense output layer.
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model; summary() prints by itself and returns None, so
# wrapping it in print() would only add a stray "None" line
model.summary()
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model (on the training data itself, so this is training accuracy)
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

# The model architecture, and training configuration (including the optimizer, losses, and metrics) are stored in saved_model.pb.
# The weights are saved in the variables/ directory.
model.save('my_model')


[[515, 675], [611, 816], [432, 214], [537, 816], [641], [80, 816], [394], [916, 462], [208], [246, 816], [333], [356, 214], [781, 611], [356, 816], [287, 800, 675, 825], [770], [647, 268, 825, 222, 816], [521], [118, 617], [991, 816, 268, 948, 997]]
[[515 675   0   0   0   0   0   0]
 [611 816   0   0   0   0   0   0]
 [432 214   0   0   0   0   0   0]
 [537 816   0   0   0   0   0   0]
 [641   0   0   0   0   0   0   0]
 [ 80 816   0   0   0   0   0   0]
 [394   0   0   0   0   0   0   0]
 [916 462   0   0   0   0   0   0]
 [208   0   0   0   0   0   0   0]
 [246 816   0   0   0   0   0   0]
 [333   0   0   0   0   0   0   0]
 [356 214   0   0   0   0   0   0]
 [781 611   0   0   0   0   0   0]
 [356 816   0   0   0   0   0   0]
 [287 800 675 825   0   0   0   0]
 [770   0   0   0   0   0   0   0]
 [647 268 825 222 816   0   0   0]
 [521   0   0   0   0   0   0   0]
 [118 617   0   0   0   0   0   0]
 [991 816 268 948 997   0   0   0]]
Model: "sequential_29"
__________________________

How to use pre-trained word embeddings (GloVe) in Keras

In [None]:
import os

from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding

# define documents: the first 5 are positive (label 1), the last 5 negative (label 0)
docs = ['Well done!',
        'Good work',
        'Great effort',
        'nice work',
        'Excellent!',
        'Weak',
        'Poor effort!',
        'not good',
        'poor work',
        'Could have done better.']
# define class labels
labels = array([1,1,1,1,1,0,0,0,0,0])

# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
# +1 because the word indices start at 1 and index 0 is used for padding
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(docs)
print(encoded_docs)
# pad documents to a max length of 4 words
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

# Dimensionality of the GloVe vectors; it must match the file that is loaded
# below (glove.6B.100d.txt holds 100-dimensional vectors).
embedding_dim = 100
# Build the path with os.path.join so the cell is not tied to the Windows separator.
glove_path = os.path.join('glove.6B', 'glove.6B.100d.txt')

# load the whole embedding into memory: word -> vector of float32 coefficients
embeddings_index = dict()
# the context manager closes the file even if parsing a line raises
with open(glove_path, encoding='utf8') as f:
    for line in f:
        # each line is: <word> <coef_1> ... <coef_n>, separated by whitespace
        values = line.split()
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

# create a weight matrix for words in training docs; row i holds the GloVe
# vector of the word with index i, words missing from GloVe keep an all-zero row
embedding_matrix = zeros((vocab_size, embedding_dim))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# define model
model = Sequential()
# trainable=False freezes the pre-trained vectors so training only fits the Dense layer
e = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model; summary() prints by itself and returns None, so
# wrapping it in print() would only add a stray "None" line
model.summary()
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model (on the training data itself, so this is training accuracy)
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))