#### 1. Import required libraries

In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import matplotlib.pyplot as plt
# from sklearn.decomposition import PCA
import re

#### 2. Data Preparation

In [2]:
# Toy corpus: six short sentences used to train the CBOW embeddings.
data = [
    "We are about to study the idea of a computational process",
    "Computational processes are abstract beings that inhabit computers",
    "As they evolve processes manipulate other abstract things called data",
    "The evolution of a process is directed by a pattern of rules called a program",
    "People create programs to direct processes",
    "In effect we conjure the spirits of the computer with our spells",
]

In [3]:
# Clean the raw sentences: drop empty strings and lowercase everything else.
# Stripping special characters and one-letter words is currently disabled,
# so the only transformation applied here is lowercasing.
clean_sentences = [sentence.lower() for sentence in data if sentence != ""]

In [4]:
# Bare expression: the notebook renders the cleaned sentence list so the
# lowercasing step can be checked by eye.
clean_sentences

['we are about to study the idea of a computational process',
 'computational processes are abstract beings that inhabit computers',
 'as they evolve processes manipulate other abstract things called data',
 'the evolution of a process is directed by a pattern of rules called a program',
 'people create programs to direct processes',
 'in effect we conjure the spirits of the computer with our spells']

In [5]:
# `corpus` is a second name for the same list object as `clean_sentences`
# (no copy is made); later cells feed it to the tokenizer.
corpus = clean_sentences

In [6]:
# Convert the corpus to sequences of integers:
#   1. fit a Keras Tokenizer on the corpus to build its word vocabulary,
#   2. encode every sentence as a list of integer word ids,
#   3. print the encoded sentences (one inner list per sentence).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)
print(sequences)

[[5, 6, 12, 7, 13, 1, 14, 2, 3, 8, 9], [8, 4, 6, 10, 15, 16, 17, 18], [19, 20, 21, 4, 22, 23, 10, 24, 11, 25], [1, 26, 2, 3, 9, 27, 28, 29, 3, 30, 2, 31, 11, 3, 32], [33, 34, 35, 7, 36, 4], [37, 38, 5, 39, 1, 40, 2, 1, 41, 42, 43, 44]]


In [13]:
# Build the word <-> index lookup tables directly from the fitted tokenizer.
#
# The tokenizer is the single source of truth for the ids found in
# `sequences`. Pairing `sentence.split()` positionally with the encoded ids
# only works while the tokenizer's own tokenisation happens to match a plain
# whitespace split; as soon as it does not (e.g. punctuation in the corpus)
# the positional pairing mis-labels words or raises an IndexError.
# Copies are taken so later edits to the maps cannot alter the tokenizer.
word_to_index_map = dict(tokenizer.word_index)
index_to_word_map = dict(tokenizer.index_word)

# Visual sanity check: print each encoded sentence followed by its words.
for sequence, sentence in zip(sequences, clean_sentences):
    print(sequence)
    print(sentence.split())

[5, 6, 12, 7, 13, 1, 14, 2, 3, 8, 9]
['we', 'are', 'about', 'to', 'study', 'the', 'idea', 'of', 'a', 'computational', 'process']
[8, 4, 6, 10, 15, 16, 17, 18]
['computational', 'processes', 'are', 'abstract', 'beings', 'that', 'inhabit', 'computers']
[19, 20, 21, 4, 22, 23, 10, 24, 11, 25]
['as', 'they', 'evolve', 'processes', 'manipulate', 'other', 'abstract', 'things', 'called', 'data']
[1, 26, 2, 3, 9, 27, 28, 29, 3, 30, 2, 31, 11, 3, 32]
['the', 'evolution', 'of', 'a', 'process', 'is', 'directed', 'by', 'a', 'pattern', 'of', 'rules', 'called', 'a', 'program']
[33, 34, 35, 7, 36, 4]
['people', 'create', 'programs', 'to', 'direct', 'processes']
[37, 38, 5, 39, 1, 40, 2, 1, 41, 42, 43, 44]
['in', 'effect', 'we', 'conjure', 'the', 'spirits', 'of', 'the', 'computer', 'with', 'our', 'spells']


In [14]:
# Show both lookup tables, separated by blank lines. Using '\n' as an argument
# together with sep='\n' emits exactly the same text as three separate prints.
print(index_to_word_map, '\n', word_to_index_map, sep='\n')

{5: 'we', 6: 'are', 12: 'about', 7: 'to', 13: 'study', 1: 'the', 14: 'idea', 2: 'of', 3: 'a', 8: 'computational', 9: 'process', 4: 'processes', 10: 'abstract', 15: 'beings', 16: 'that', 17: 'inhabit', 18: 'computers', 19: 'as', 20: 'they', 21: 'evolve', 22: 'manipulate', 23: 'other', 24: 'things', 11: 'called', 25: 'data', 26: 'evolution', 27: 'is', 28: 'directed', 29: 'by', 30: 'pattern', 31: 'rules', 32: 'program', 33: 'people', 34: 'create', 35: 'programs', 36: 'direct', 37: 'in', 38: 'effect', 39: 'conjure', 40: 'spirits', 41: 'computer', 42: 'with', 43: 'our', 44: 'spells'}


{'we': 5, 'are': 6, 'about': 12, 'to': 7, 'study': 13, 'the': 1, 'idea': 14, 'of': 2, 'a': 3, 'computational': 8, 'process': 9, 'processes': 4, 'abstract': 10, 'beings': 15, 'that': 16, 'inhabit': 17, 'computers': 18, 'as': 19, 'they': 20, 'evolve': 21, 'manipulate': 22, 'other': 23, 'things': 24, 'called': 11, 'data': 25, 'evolution': 26, 'is': 27, 'directed': 28, 'by': 29, 'pattern': 30, 'rules': 31, 'progr

#### 3. Generate Training data

In [15]:
# Hyper-parameters.
# vocab_size is one larger than the number of known words, so the largest
# word id is still a valid index.
vocab_size = len(tokenizer.word_index) + 1
embedding_size = 10
window_size = 2

# CBOW samples: for every position that has a full window on both sides,
# the surrounding word ids form the context and the centre id is the target.
contexts = []
targets = []

for sequence in sequences:
    last_center = len(sequence) - window_size
    for center in range(window_size, last_center):
        left_ids = sequence[center - window_size:center]
        right_ids = sequence[center + 1:center + 1 + window_size]
        contexts.append(left_ids + right_ids)
        targets.append(sequence[center])

In [16]:
# Preview the first training samples (at most five) as words.
# Slicing instead of indexing with a fixed range(5) keeps this cell from
# raising an IndexError when the corpus yields fewer than five samples.
for context_ids, target_id in zip(contexts[:5], targets[:5]):
    words = [index_to_word_map.get(word_id) for word_id in context_ids]
    target = index_to_word_map.get(target_id)
    print(words, "=>", target)

['we', 'are', 'to', 'study'] => about
['are', 'about', 'study', 'the'] => to
['about', 'to', 'the', 'idea'] => study
['to', 'study', 'idea', 'of'] => the
['study', 'the', 'of', 'a'] => idea


In [17]:
# Convert the contexts and targets to numpy arrays for model.fit.
# Every context is built from two window_size-long slices, so X has one row
# per sample with 2 * window_size ids; Y holds the matching target id per row.
X = np.array(contexts)
Y = np.array(targets)

#### 4. Training the Model

In [18]:
# CBOW network: embed the context ids, average the context embeddings into
# one vector, then classify that vector over the whole vocabulary.
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_size,
        input_length=2 * window_size,
    ),
    # Mean over the context axis (axis 1) collapses the window to one vector.
    Lambda(lambda embedded: tf.reduce_mean(embedded, axis=1)),
    Dense(256, activation='relu'),
    Dense(512, activation='relu'),
    # One softmax probability per vocabulary index.
    Dense(units=vocab_size, activation='softmax'),
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 10)             450       
                                                                 
 lambda (Lambda)             (None, 10)                0         
                                                                 
 dense (Dense)               (None, 256)               2816      
                                                                 
 dense_1 (Dense)             (None, 512)               131584    
                                                                 
 dense_2 (Dense)             (None, 45)                23085     
                                                                 
Total params: 157,935
Trainable params: 157,935
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Targets are integer word ids (not one-hot vectors), hence the sparse
# variant of categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train on the context/target pairs; the returned History object is the
# cell's displayed value.
model.fit(
    X,
    Y,
    epochs=200,
    verbose=1,
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 1

Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.callbacks.History at 0x1f005e38a00>

In [20]:
# Get the word embeddings.
# The Embedding layer is the first layer of the model, so the first entry of
# get_weights() is its weight matrix: one row per vocabulary index, with
# embedding_size values per row.
embeddings = model.get_weights()[0] # Weights of the model of first layer

#### 5. Output

In [21]:
# Context windows used to probe the trained model. Each entry contains four
# space-separated words, i.e. one full context (2 * window_size words).
test_sentences = [
    "we are to study",
    "create programs direct processes",
    "spirits process study program",
    "idea study people create",
]

#### 6. Predictions

In [22]:
# Predict the centre word for every test context and print the result.
for test_sentence in test_sentences:
    test_words = test_sentence.split(" ")
    print("Words: ", test_words)
    # Encode the context words as ids. A word missing from the vocabulary
    # would otherwise become None and turn x_test into an object array that
    # model.predict cannot consume; fall back to index 0 instead, which no
    # word uses (ids start at 1) but which is still inside the embedding's
    # input range because vocab_size = len(word_index) + 1.
    x_test = np.array([[word_to_index_map.get(word, 0) for word in test_words]])
    print("Indexs: ", x_test)
    test_predictions = model.predict(x_test)
    # The predicted id is the position of the highest softmax probability.
    # If that id is 0 (no word assigned), the lookup below yields None.
    y_pred = int(np.argmax(test_predictions[0]))
    print("Predictons: ",test_words, " => ", index_to_word_map.get(y_pred))
    print("\n")

Words:  ['we', 'are', 'to', 'study']
Indexs:  [[ 5  6  7 13]]
Predictons:  ['we', 'are', 'to', 'study']  =>  about


Words:  ['create', 'programs', 'direct', 'processes']
Indexs:  [[34 35 36  4]]
Predictons:  ['create', 'programs', 'direct', 'processes']  =>  to


Words:  ['spirits', 'process', 'study', 'program']
Indexs:  [[40  9 13 32]]
Predictons:  ['spirits', 'process', 'study', 'program']  =>  the


Words:  ['idea', 'study', 'people', 'create']
Indexs:  [[14 13 33 34]]
Predictons:  ['idea', 'study', 'people', 'create']  =>  programs


