In [25]:
import numpy as np
import pandas as pd

import tensorflow as tf
import keras

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten

import gensim

# Basics of Embedding Layer

## Data Preprocessing

In [2]:
# Toy sentiment corpus: each sentence is paired by position with a label in y_train.
sentences = [
    'nice great best amazing',
    'stop lies',
    'pitiful nerd',
    'excellent work',
    'supreme quality',
    'bad',
    'highly respectable',
]
# Sentiment labels aligned with `sentences`: 1 - good, 0 - bad.
y_train = [1, 0, 0, 1, 1, 0, 1]

In [11]:
# Fit a Keras Tokenizer on the corpus so every distinct word gets an integer index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# Word indices start at 1, so one extra slot is reserved for index 0
# (the value later used to pad short sentences).
vocab_size = 1 + len(tokenizer.word_index)
print(vocab_size)

16


In [12]:
# Encode each sentence as the list of integer indices the fitted tokenizer
# assigned to its words (one inner list per sentence, variable length).
X_encoded = tokenizer.texts_to_sequences(sentences)
print(X_encoded)

[[1, 2, 3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13], [14, 15]]


In [13]:
# Pad every encoded sentence with trailing zeros up to the longest sentence length.
max_len = max(map(len, X_encoded))
print(max_len)

X_train = pad_sequences(X_encoded, maxlen=max_len, padding='post')
# Convert the label list to a NumPy array (a plain list has no .shape).
y_train = np.array(y_train)
print(X_train.shape)
print(y_train.shape)

4
(7, 4)
(7,)


## Modeling

In [18]:
# Size of each learned word vector.
embedding_dim = 4

# Embedding -> Flatten -> single sigmoid unit for binary (good/bad) classification.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
    # Collapse the (max_len, embedding_dim) output into one vector for the Dense layer.
    Flatten(),
    Dense(1, activation='sigmoid'),
])
model.summary()

# Train the embeddings jointly with the classifier; history is kept for inspection.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
hist = model.fit(X_train, y_train, epochs=100, verbose=2)

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 4, 4)              64        
                                                                 
 flatten_4 (Flatten)         (None, 16)                0         
                                                                 
 dense_7 (Dense)             (None, 1)                 17        
                                                                 
Total params: 81 (324.00 Byte)
Trainable params: 81 (324.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/100
1/1 - 0s - loss: 0.6992 - acc: 0.4286 - 141ms/epoch - 141ms/step
Epoch 2/100
1/1 - 0s - loss: 0.6976 - acc: 0.4286 - 2ms/epoch - 2ms/step
Epoch 3/100
1/1 - 0s - loss: 0.6960 - acc: 0.5714 - 2ms/epoch - 2ms/step
Epoch 4/100
1/1 - 0s - loss: 0.6944 - acc: 0.5714 - 2ms/epo

In [23]:
# Training accuracy recorded for the final epoch.
print(hist.history['acc'][-1])

# Encode the probe word with the fitted tokenizer and pad it exactly like the
# training data, instead of hand-writing [[1, 0, 0, 0]]: the literal silently
# breaks if the vocabulary order or max_len ever changes.
probe = pad_sequences(tokenizer.texts_to_sequences(['nice']), maxlen=max_len, padding='post')
print(model.predict(probe))

1.0
[[0.49260873]]


# Using Pretrained GloVe Embedding

## Data Preprocessing

In [26]:
# Load the pre-trained GloVe word vectors into a {word: vector} dictionary.
model_path = "../model/"
model_name = "glove.6B.100d.txt"

embedding_dict = dict()

# The context manager guarantees the file is closed even if a line fails to parse.
with open(model_path + model_name, encoding='utf8') as f:
    for line in f:
        values = line.split()  # e.g., ['the', '0.418', '0.24968', '-0.41242', '0.1217', ...]
        if not values:
            # Skip blank lines rather than failing on values[0].
            continue
        word = values[0]  # first field is the word itself
        vectors = np.asarray(values[1:], dtype='float32')  # remaining fields are its vector
        embedding_dict[word] = vectors

print(len(embedding_dict))

400000


In [30]:
# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i.
embedding_dims = len(embedding_dict['the'])  # vector length, read off one known entry
print(embedding_dims)

embedding_matrix = np.zeros(shape=(vocab_size, embedding_dims))
print(embedding_matrix.shape)

# Rows for words that have no GloVe vector are left as zeros.
for token, row in tokenizer.word_index.items():
    if token in embedding_dict:
        embedding_matrix[row] = embedding_dict[token]

100
(16, 100)


In [31]:
# Sanity check: the matrix row for 'great' must equal its GloVe vector.
# The row is addressed through the looked-up index rather than a hard-coded 2,
# so the check stays valid if the vocabulary order changes.
great_index = tokenizer.word_index['great']
print(embedding_dict['great'])  # vector representation of 'great' in the GloVe dict
print(great_index)  # tokenizer index of 'great'
print(embedding_matrix[great_index])  # vector stored for 'great' in the embedding matrix
assert np.allclose(embedding_matrix[great_index], embedding_dict['great'])

[-0.013786   0.38216    0.53236    0.15261   -0.29694   -0.20558
 -0.41846   -0.58437   -0.77355   -0.87866   -0.37858   -0.18516
 -0.128     -0.20584   -0.22925   -0.42599    0.3725     0.26077
 -1.0702     0.62916   -0.091469   0.70348   -0.4973    -0.77691
  0.66045    0.09465   -0.44893    0.018917   0.33146   -0.35022
 -0.35789    0.030313   0.22253   -0.23236   -0.19719   -0.0053125
 -0.25848    0.58081   -0.10705   -0.17845   -0.16206    0.087086
  0.63029   -0.76649    0.51619    0.14073    1.019     -0.43136
  0.46138   -0.43585   -0.47568    0.19226    0.36065    0.78987
  0.088945  -2.7814    -0.15366    0.01015    1.1798     0.15168
 -0.050112   1.2626    -0.77527    0.36031    0.95761   -0.11385
  0.28035   -0.02591    0.31246   -0.15424    0.3778    -0.13599
  0.2946    -0.31579    0.42943    0.086969   0.019169  -0.27242
 -0.31696    0.37327    0.61997    0.13889    0.17188    0.30363
 -1.2776     0.044423  -0.52736   -0.88536   -0.19428   -0.61947
 -0.10146   -0.26301  

## Modeling

In [32]:
# Take the vector width from the matrix itself so it cannot drift from the
# weights passed to the Embedding layer (previously a hard-coded 100).
embedding_dims = embedding_matrix.shape[1]

model = Sequential()
# using embedding_matrix as weights and not trainable to use pre-trained word vectors
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dims, weights=[embedding_matrix], input_length=max_len, trainable=False))
model.add(Flatten())  # flatten the (max_len, embedding_dims) output for the Dense layer
model.add(Dense(units=1, activation='sigmoid'))
model.summary()

# Only the Dense layer is trained; the embedding layer is frozen.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
hist = model.fit(X_train, y_train, epochs=100, verbose=2)

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 4, 100)            1600      
                                                                 
 flatten_5 (Flatten)         (None, 400)               0         
                                                                 
 dense_8 (Dense)             (None, 1)                 401       
                                                                 
Total params: 2001 (7.82 KB)
Trainable params: 401 (1.57 KB)
Non-trainable params: 1600 (6.25 KB)
_________________________________________________________________
Epoch 1/100
1/1 - 0s - loss: 0.7263 - acc: 0.4286 - 124ms/epoch - 124ms/step
Epoch 2/100
1/1 - 0s - loss: 0.7029 - acc: 0.4286 - 2ms/epoch - 2ms/step
Epoch 3/100
1/1 - 0s - loss: 0.6804 - acc: 0.5714 - 2ms/epoch - 2ms/step
Epoch 4/100
1/1 - 0s - loss: 0.6588 - acc: 0.5714 - 2ms/epoch -

In [33]:
# Training accuracy recorded for the final epoch.
print(hist.history['acc'][-1])

# Encode the probe word with the fitted tokenizer and pad it exactly like the
# training data, instead of hand-writing [[1, 0, 0, 0]]: the literal silently
# breaks if the vocabulary order or max_len ever changes.
probe = pad_sequences(tokenizer.texts_to_sequences(['nice']), maxlen=max_len, padding='post')
print(model.predict(probe))

1.0
[[0.3936007]]


# Using Pretrained Word2Vec Embedding

## Data Preprocessing

In [36]:
# Load the pre-trained Word2Vec vectors (binary word2vec format) with gensim.
model_name = "GoogleNews-vectors-negative300.bin.gz"
word_2_vec = gensim.models.KeyedVectors.load_word2vec_format(
    model_path + model_name,
    binary=True,
)
# (number of words, vector length) of the loaded vectors.
print(word_2_vec.vectors.shape)

(3000000, 300)


In [37]:
# Allocate the embedding matrix: one row per tokenizer index, with the width
# read from the loaded Word2Vec model instead of a hard-coded 300.
embedding_matrix = np.zeros((vocab_size, word_2_vec.vector_size))
print(embedding_matrix.shape)

(16, 300)


In [38]:
def get_word(word):
    """Return the Word2Vec vector for `word`, or None if the word is not in `word_2_vec`."""
    return word_2_vec[word] if word in word_2_vec else None
    
# Copy each known word's Word2Vec vector into the row matching its tokenizer
# index; words missing from Word2Vec keep their all-zero row.
for token, row in tokenizer.word_index.items():
    token_vector = get_word(token)
    if token_vector is None:
        continue
    embedding_matrix[row] = token_vector

In [41]:
# Sanity check: the matrix row for 'great' must equal its Word2Vec vector.
# The row is addressed through the looked-up index rather than a hard-coded 2,
# so the check stays valid if the vocabulary order changes.
great_index = tokenizer.word_index['great']
print(word_2_vec['great'])  # vector representation of 'great' in Word2Vec
print(great_index)  # tokenizer index of 'great'
print(embedding_matrix[great_index])  # vector stored for 'great' in the embedding matrix
assert np.allclose(embedding_matrix[great_index], word_2_vec['great'])

[ 7.17773438e-02  2.08007812e-01 -2.84423828e-02  1.78710938e-01
  1.32812500e-01 -9.96093750e-02  9.61914062e-02 -1.16699219e-01
 -8.54492188e-03  1.48437500e-01 -3.34472656e-02 -1.85546875e-01
  4.10156250e-02 -8.98437500e-02  2.17285156e-02  6.93359375e-02
  1.80664062e-01  2.22656250e-01 -1.00585938e-01 -6.93359375e-02
  1.04427338e-04  1.60156250e-01  4.07714844e-02  7.37304688e-02
  1.53320312e-01  6.78710938e-02 -1.03027344e-01  4.17480469e-02
  4.27246094e-02 -1.10351562e-01 -6.68945312e-02  4.19921875e-02
  2.50000000e-01  2.12890625e-01  1.59179688e-01  1.44653320e-02
 -4.88281250e-02  1.39770508e-02  3.55529785e-03  2.09960938e-01
  1.52343750e-01 -7.32421875e-02  2.16796875e-01 -5.76171875e-02
 -2.84423828e-02 -3.60107422e-03  1.52343750e-01 -2.63671875e-02
  2.13623047e-02 -1.51367188e-01  1.04003906e-01  3.18359375e-01
 -1.85546875e-01  3.68652344e-02 -1.10839844e-01 -3.17382812e-02
 -1.01562500e-01 -1.21093750e-01  3.22265625e-01 -7.32421875e-02
 -1.52343750e-01  2.67578

## Modeling

In [42]:
# Take the vector width from the matrix itself so it cannot drift from the
# weights passed to the Embedding layer (previously a hard-coded 300).
embedding_dims = embedding_matrix.shape[1]

model = Sequential()
# Use embedding_matrix as fixed (non-trainable) weights to keep the pre-trained word vectors.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dims, weights=[embedding_matrix], input_length=max_len, trainable=False))
model.add(Flatten())  # flatten the (max_len, embedding_dims) output for the Dense layer
model.add(Dense(units=1, activation='sigmoid'))
model.summary()

# Only the Dense layer is trained; the embedding layer is frozen.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
hist = model.fit(X_train, y_train, epochs=100, verbose=2)

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 4, 300)            4800      
                                                                 
 flatten_6 (Flatten)         (None, 1200)              0         
                                                                 
 dense_9 (Dense)             (None, 1)                 1201      
                                                                 
Total params: 6001 (23.44 KB)
Trainable params: 1201 (4.69 KB)
Non-trainable params: 4800 (18.75 KB)
_________________________________________________________________
Epoch 1/100
1/1 - 0s - loss: 0.6996 - acc: 0.2857 - 119ms/epoch - 119ms/step
Epoch 2/100
1/1 - 0s - loss: 0.6805 - acc: 0.5714 - 2ms/epoch - 2ms/step
Epoch 3/100
1/1 - 0s - loss: 0.6619 - acc: 0.8571 - 1ms/epoch - 1ms/step
Epoch 4/100
1/1 - 0s - loss: 0.6440 - acc: 1.0000 - 2ms/epo

In [43]:
# Training accuracy recorded for the final epoch.
print(hist.history['acc'][-1])

# Encode the probe word with the fitted tokenizer and pad it exactly like the
# training data, instead of hand-writing [[1, 0, 0, 0]]: the literal silently
# breaks if the vocabulary order or max_len ever changes.
probe = pad_sequences(tokenizer.texts_to_sequences(['nice']), maxlen=max_len, padding='post')
print(model.predict(probe))

1.0
[[0.54948443]]
