In [2]:
# import tensorflow as tf
import os
import zipfile
import wget
import pandas as pd
import nltk
import re
import pickle
import contractions
import tensorflow as tf
import numpy as np
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import xml.etree.ElementTree as ET
from scipy.spatial.distance import cdist


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Download data

- will download dataset using datasets library

In [2]:
data_dir = "Data"

# Preprocess data

In [3]:
def preprocess_sent(sentence):
    
    # expand contractions
    # e.g : i'm -> i am
    sentence = contractions.fix(sentence)
    
    #  remove special characters and number
    sentence = re.sub('[^A-Za-z]+', ' ', sentence)
    
    # remoev multiple spaces with one space
    sentence = re.sub("\ +"," ",sentence)
    
    sentence = sentence.strip()
    
    sentence = sentence.lower() 
    
    return sentence

In [4]:
# for training corpus we will use English bible corpus
# https://github.com/christos-c/bible-corpus


lang = 'English'
xml_path = os.path.join(data_dir,f"{lang}.xml")
txt_path = os.path.join(data_dir,f"{lang}.txt")

root = ET.fromstring(open(xml_path).read())
with open(txt_path, 'w', encoding='utf-8') as out:
    for n in root.iter('seg'):
        out.write(n.text.strip() + '\n')

In [13]:
preprocessed_sentences = []

# will use first 5000 lines
with open(txt_path,'r') as f:
    lines = f.read().splitlines()[:5000]
    for line in lines:
        preprocessed_sentences.append(preprocess_sent(line))

In [14]:
preprocessed_sentences[0]

'in the beginning god created the heaven and the earth'

# generate data from training model


In [15]:
vocab = []
for sent in preprocessed_sentences:
    vocab.extend(set(sent.split()))
vocab = set(vocab)

In [16]:
vocab_size = len(vocab)

In [17]:
print("vocab_size : ",vocab_size)

vocab_size :  4343


In [18]:
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {i: word for i, word in enumerate(vocab)}


In [19]:
X = []
Y = []
window_size = 2 
for sent in preprocessed_sentences:
    words = sent.split()
    
    for i in range(window_size, len(words) - window_size):
        context = []
        target = ''
        for j in range(i - window_size,i+window_size+1):
            if j ==i:
                target = word_to_ix[words[j]]
            else:
                context.append(word_to_ix[words[j]])
        X.append(context)
        Y.append(target)

In [20]:
print(X[:5])
print(Y[:5])

[[4290, 306, 3954, 1065], [306, 3822, 1065, 306], [3822, 3954, 306, 1506], [3954, 1065, 1506, 375], [1065, 306, 375, 306]]
[3822, 3954, 1065, 306, 1506]


In [21]:
X_categorical = tf.keras.utils.to_categorical(X)


In [22]:
X_categorical.shape

(111922, 4, 4343)

# Train Model

In [23]:
early_stop_callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3,restore_best_weights=True)

def get_model(vocab_size,embedding_size,window_size):
    model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=[2*window_size,vocab_size]),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(embedding_size,name="hidden_layer"),
    tf.keras.layers.Dense(vocab_size,name="output_layer",activation='softmax'),
    ])
    
    model.compile('adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
    
    return model

In [24]:
model = get_model(vocab_size,128,window_size)

In [25]:
history = model.fit(X_categorical,np.array(Y),epochs=50,callbacks=[early_stop_callback],batch_size=32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


# extract word to embeddings for out vocab

In [83]:
model.save_weights("model/embedding_model.h5")

In [None]:
# model.load_weights("model/embedding_model.h5")

In [28]:
embeddings = model.get_layer("output_layer").weights[0].numpy().T

In [29]:
print(embeddings.shape)

(4343, 128)


- 4343 vocab size and 128 is embedding size
- hence for each word we have 128 dimension vector

In [41]:
# build similarity matrix
similarities = 1 - cdist(embeddings, embeddings, metric='cosine')

In [42]:
similarities.shape

(4343, 4343)

In [30]:
word_to_embeddings = dict()

In [31]:
for i,emb in enumerate(embeddings):
    word_to_embeddings[ix_to_word[i]] = emb

In [77]:
def get_similarwords(word,k=5):
    sim_words = []
    if word in vocab:
            
        index_of_word = word_to_ix[word]

        ids = np.argsort(similarities[index_of_word].reshape(1,-1),axis=1)[0]
        
        for i in ids[-k-1:-1]:
            sim_words.append(ix_to_word[i])
        
        return list(reversed(sim_words))
    return "word is not present in vocab"

In [78]:
get_similarwords("god")


['lord', 'abram', 'hath', 'merciful', 'israel']

In [81]:
with open('model/word_to_embeddings.pickle', 'wb') as handle:
    pickle.dump(word_to_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('model/ix_to_word.pickle', 'wb') as handle:
    pickle.dump(ix_to_word, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('model/word_to_ix.pickle', 'wb') as handle:
    pickle.dump(word_to_ix, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [82]:
with open("model/similarity_matrix.pickle",'wb') as handle:
    pickle.dump(similarities, handle, protocol=pickle.HIGHEST_PROTOCOL)

