In [1]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Lambda, Dense
from keras.utils import to_categorical  # Corrected import here
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
from keras import backend as K




In [2]:
# Sample text data (you can replace this with a larger corpus or read from a file)
data = """
The speed of transmission is an important point of difference between the two viruses. 
Influenza has a shorter median incubation period and a shorter serial interval than COVID-19.
The serial interval for COVID-19 is estimated to be 5-6 days, while for influenza, it is 3 days.
Influenza can spread faster than COVID-19.
"""


In [3]:
# Step 1: Preprocess text data - Convert text to lowercase
data = data.lower().split()  # Convert to lowercase and split into words (tokens)


In [4]:
# Tokenize the text (assign a unique index to each word)
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])  # Fit the tokenizer on the text data
word_to_index = tokenizer.word_index  # Mapping of words to indices
word_to_index['PAD'] = 0  # Add padding token

In [5]:
# Create reverse mapping (index to word)
index_to_word = {v: k for k, v in word_to_index.items()}

In [6]:
# Convert the words into their respective indices
encoded_data = [word_to_index[word] for word in data]

In [7]:

# Parameters
vocab_size = len(word_to_index)  # Vocabulary size
embed_size = 100  # Size of word embeddings
window_size = 2  # Number of words before and after the target word


In [8]:
# Step 2: Prepare context-target pairs
def generate_pairs(corpus, window_size, vocab_size):
    context_length = window_size * 2  # Total number of words in context

    for index, word in enumerate(corpus):
        # Context window
        start = max(0, index - window_size)
        end = min(len(corpus), index + window_size + 1)
        
        # Skip the word itself (target word)
        context = [corpus[i] for i in range(start, end) if i != index]
        
        # One-hot encode the target word
        target = to_categorical([corpus[index]], vocab_size)  # Use to_categorical here
        
        # Pad context if necessary
        context_padded = pad_sequences([context], maxlen=context_length, padding='post')
        
        yield (context_padded, target)

In [9]:
# Step 3: Define the CBOW model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size * 2))  # Embedding layer
model.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))  # CBOW: Take mean of context words
model.add(Dense(vocab_size, activation='softmax'))  # Output layer (softmax for classification)





In [10]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')





In [11]:
# Step 4: Train the model
for epoch in range(5):
    total_loss = 0
    for context, target in generate_pairs(encoded_data, window_size, vocab_size):
        loss = model.train_on_batch(context, target)
        total_loss += loss
    print(f"Epoch {epoch+1}, Loss: {total_loss}")



Epoch 1, Loss: 192.10788702964783
Epoch 2, Loss: 190.26115560531616
Epoch 3, Loss: 188.67545676231384
Epoch 4, Loss: 187.01941227912903
Epoch 5, Loss: 185.26665496826172


In [12]:
# Step 5: Extract word embeddings and calculate similarities
embeddings = model.get_weights()[0][1:]  # Extract word embeddings (ignore padding token)
print("Embeddings shape:", embeddings.shape)


Embeddings shape: (39, 100)


In [13]:
# Compute Euclidean distances between word vectors
distance_matrix = euclidean_distances(embeddings)


In [14]:
# Step 6: Find similar words
search_term = input("Enter a word to find similar words: ").strip().lower()  # Lowercase the search term
if search_term in word_to_index:
    index = word_to_index[search_term] - 1  # Adjust for padding token
    similar_words = [index_to_word[idx] for idx in distance_matrix[index].argsort()[1:6]]
    print(f"Similar words to '{search_term}': {similar_words}")
else:
    print(f"'{search_term}' not found in the vocabulary.")


Enter a word to find similar words: ganesh
'ganesh' not found in the vocabulary.
