In [113]:
import string
import numpy as np
import tensorflow as tf
from tensorflow.keras import initializers

In [114]:
# Read the whole corpus into memory. The `with` block closes the file on exit,
# so no explicit close() is needed.
# NOTE(review): assumes dataset.txt is UTF-8 encoded (the text contains the
# non-ASCII apostrophe ’) — confirm against the actual file.
with open("dataset.txt", 'r', encoding="utf-8") as f:
    data = f.read()
print(data)

# Lower-case the text and tokenise on any run of whitespace, so words separated
# by newlines/tabs are not fused together and repeated spaces do not produce
# empty tokens.
data = data.lower().split()

After flying a long distance, a thirsty crow was wandering the forest in search of water. Finally, he saw a pot half-filled with water. He tried to drink from it but his beak wasn’t long enough to reach the water inside. He then saw pebbles on the ground and one by one, he put them in the pot until the water rose to the brim. The crow then hastily drank from it and quenched his thirst.



In [115]:
# Keep only the alphabetic characters of every token (punctuation, digits and
# hyphens are removed, e.g. "half-filled" -> "halffilled").
cleaned_tokens = (''.join(ch for ch in text if ch.isalpha()) for text in data)

# A token without any letter collapses to '' above; drop those so the empty
# string never becomes a vocabulary entry or shifts word positions.
raw_paragraph = np.array([token for token in cleaned_tokens if token])
raw_paragraph[:10]

array(['after', 'flying', 'a', 'long', 'distance', 'a', 'thirsty', 'crow',
       'was', 'wandering'], dtype='<U10')

In [116]:
# Vocabulary in order of first appearance. dict keys keep insertion order, so
# this de-duplicates in a single pass instead of scanning a growing list for
# every token.
unique_words = np.array(list(dict.fromkeys(raw_paragraph)))
unique_words

array(['after', 'flying', 'a', 'long', 'distance', 'thirsty', 'crow',
       'was', 'wandering', 'the', 'forest', 'in', 'search', 'of', 'water',
       'finally', 'he', 'saw', 'pot', 'halffilled', 'with', 'tried', 'to',
       'drink', 'from', 'it', 'but', 'his', 'beak', 'wasnt', 'enough',
       'reach', 'inside', 'then', 'pebbles', 'on', 'ground', 'and', 'one',
       'by', 'put', 'them', 'until', 'rose', 'brim', 'hastily', 'drank',
       'quenched', 'thirst'], dtype='<U10')

In [117]:
dict_size = len(unique_words)
paragraph_size = len(raw_paragraph)

# One-hot encode the vocabulary: row i of the identity matrix is the encoding
# of unique_words[i]. Each word is encoded exactly once, rather than once per
# occurrence in the paragraph.
encoding_mapping = dict(zip(unique_words, np.eye(dict_size)))

In [118]:
# Build (centre word, neighbouring word) training pairs: every word is paired
# with each word up to two positions before and after it. Both sides are stored
# as one-hot vectors; `data` holds the centre words, `label` the neighbours.
window = [-2, -1, 1, 2]
data, label = [], []
for centre_idx, centre_word in enumerate(raw_paragraph):
    for offset in window:
        neighbour_idx = centre_idx + offset
        # Offsets that fall off either end of the paragraph have no neighbour.
        if neighbour_idx < 0 or neighbour_idx >= paragraph_size:
            continue
        data.append(encoding_mapping[centre_word])
        label.append(encoding_mapping[raw_paragraph[neighbour_idx]])
data = np.array(data)
label = np.array(label)

In [119]:
# Shuffle inputs and labels with ONE shared permutation so that row i of `data`
# still belongs to row i of `label`. A fixed seed makes the order reproducible
# from a fresh kernel.
SHUFFLE_SEED = 42
shuffled_order = np.random.default_rng(SHUFFLE_SEED).permutation(len(data))
data = data[shuffled_order]
label = label[shuffled_order]

In [120]:
# Each training pair contributes one row to both arrays, so their shapes must
# still agree after shuffling.
assert data.shape == label.shape, "data and label are no longer aligned"
print(data.shape, label.shape)

(298, 49) (298, 49)


In [121]:
# Hold out the trailing 10 % of the (already shuffled) pairs as a test set.
train_size = 0.9

# Cut positions are derived from each array's own length.
data_cut = int(train_size * data.shape[0])
label_cut = int(train_size * label.shape[0])

X_train, X_test = data[:data_cut], data[data_cut:]
y_train, y_test = label[:label_cut], label[label_cut:]

In [124]:
# Two dense layers: a linear projection of the one-hot input down to
# `embedding_size` units, then a softmax over the whole vocabulary.
embedding_size = 10


def _small_normal():
  """Return a new RandomNormal(stddev=0.01) initializer (one instance per weight tensor)."""
  return initializers.RandomNormal(stddev=0.01)


hidden_layer = tf.keras.layers.Dense(
    units=embedding_size,
    activation='linear',
    input_shape=(dict_size,),
    name='hidden',
    kernel_initializer=_small_normal(),
    bias_initializer=_small_normal(),
)

output_layer = tf.keras.layers.Dense(
    units=dict_size,
    activation='softmax',
    name='output',
    kernel_initializer=_small_normal(),
    bias_initializer=_small_normal(),
)

model = tf.keras.Sequential([hidden_layer, output_layer])

In [125]:
# Print the layer-by-layer architecture and parameter counts of `model`.
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 hidden (Dense)              (None, 10)                500       
                                                                 
 output (Dense)              (None, 49)                539       
                                                                 
Total params: 1,039
Trainable params: 1,039
Non-trainable params: 0
_________________________________________________________________


In [126]:
# Configure training: Adam with a learning rate of 0.1, categorical
# cross-entropy against the one-hot labels, and accuracy as the reported metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
loss_function = tf.keras.losses.CategoricalCrossentropy()

compile_options = {
    "optimizer": optimizer,
    "loss": loss_function,
    "metrics": ['accuracy'],
}
model.compile(**compile_options)

In [None]:
epochs = 15

# Train on the explicit training split and validate on the held-out rows,
# instead of passing the full arrays and letting `validation_split` carve out a
# second, implicit hold-out (which left X_train / X_test unused).
# NOTE(review): per the Keras docs `validation_split` takes the trailing
# fraction of the data before shuffling, so this should validate on the same
# rows as before — confirm for the installed Keras version.
history = model.fit(X_train, y_train, epochs=epochs, validation_data=(X_test, y_test))

In [128]:
def nth_word(n):
  """Return the n-th word of the cleaned paragraph, counting from 1.

  Args:
    n: 1-based position of the word in `raw_paragraph`.

  Returns:
    The word stored at position `n`.

  Raises:
    IndexError: if `n` is smaller than 1 or larger than the paragraph length.
  """
  # Without this guard n = 0 or a negative n would silently wrap around and
  # return a word counted from the END of the paragraph.
  if not 1 <= n <= len(raw_paragraph):
    raise IndexError(
        f"n must be between 1 and {len(raw_paragraph)}, got {n}")
  return raw_paragraph[n - 1]

In [129]:
def word_occurance(word):
  """Count how many entries of `raw_paragraph` are exactly equal to `word`.

  Args:
    word: the word to look for (compared as-is, no normalisation).

  Returns:
    The number of matching positions as an int (0 if the word never occurs).
  """
  matches = raw_paragraph == word
  return int(np.count_nonzero(matches))

In [130]:
def get_dict_size():
  """Return the module-level `dict_size` value."""
  return dict_size

In [146]:
def most_similar(word):
  """Return the vocabulary word that the network scores highest for `word`.

  The one-hot encoding of `word` is fed to `model` as a batch of one, and the
  entry of `unique_words` at the position of the largest output is returned.

  Args:
    word: a key of `encoding_mapping`.

  Raises:
    KeyError: if `word` has no entry in `encoding_mapping`.
  """
  one_hot_batch = np.expand_dims(encoding_mapping[word], axis=0)
  scores = model.predict(one_hot_batch, verbose=0)
  best_index = np.argmax(scores)
  return unique_words[best_index]

In [151]:
def least_similar(word):
  """Return the vocabulary word that the network scores lowest for `word`.

  The one-hot encoding of `word` is fed to `model` as a batch of one, and the
  entry of `unique_words` at the position of the smallest output is returned.

  Args:
    word: a key of `encoding_mapping`.

  Raises:
    KeyError: if `word` has no entry in `encoding_mapping`.
  """
  one_hot_batch = np.expand_dims(encoding_mapping[word], axis=0)
  scores = model.predict(one_hot_batch, verbose=0)
  worst_index = np.argmin(scores)
  return unique_words[worst_index]

In [157]:
# test = "_ is to _ as _ is to _"
# e.g. "dog is to cat as bird is to"

def question_5(sentence):
  """Return the vocabulary word with the highest summed network score.

  Every whitespace-separated word of `sentence` is one-hot encoded, all
  encodings are sent through `model` in a single batch, the per-word outputs
  are added together, and the word at the position of the largest total is
  returned.

  Args:
    sentence: whitespace-separated words, each a key of `encoding_mapping`.

  Returns:
    The entry of `unique_words` with the largest summed output.

  Raises:
    ValueError: if `sentence` contains no words.
    KeyError: if a word has no entry in `encoding_mapping`.
  """
  # split() without an argument ignores repeated / leading / trailing
  # whitespace, so no empty-string "word" is ever looked up.
  words = sentence.split()
  if not words:
    raise ValueError("sentence must contain at least one word")
  # The output width comes from the model itself rather than a hard-coded
  # vocabulary size.
  batch = np.array([encoding_mapping[word] for word in words])
  combined = np.sum(model.predict(batch, verbose=0), axis=0)
  return unique_words[np.argmax(combined)]

# Example query: combine the network's outputs for three vocabulary words.
question_5("water crow ground")

'the'