In [66]:
# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow.keras.backend as K
from tensorflow.keras.preprocessing import text
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics.pairwise import euclidean_distances

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda
from tensorflow.keras.utils import to_categorical


In [67]:
# Data: list of sentences about deep learning
data = [
    "Deep learning also known as deep structured learning",
    "is part of a broader family of machine learning methods based",
    "on artificial neural networks with representation learning",
    "Learning can be supervised, semi-supervised or unsupervised",
    "Deep-learning architectures such as deep neural networks",
    "deep belief networks, deep reinforcement learning",
    "recurrent neural networks, convolutional neural networks and Transformers have been applied to fields including computer vision, speech recognition, natural language processing, machine translation",
    "where they have produced results comparable to and in some cases surpassing human expert performance"
]

# Tokenize sentences and build vocabulary
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(data)

# Work on a copy of the tokenizer's index: adding the PAD entry to
# `tokenizer.word_index` itself would mutate the tokenizer's internal state.
word2id = dict(tokenizer.word_index)
word2id['PAD'] = 0  # Add padding for sequence compatibility (ID 0 is used as the pad value)
id2word = {v: k for k, v in word2id.items()}

# Convert sentences to sequences of word IDs using the fitted tokenizer,
# which applies the same filtering/lower-casing it used to build the vocabulary.
wids = tokenizer.texts_to_sequences(data)
vocab_size = len(word2id)  # number of distinct words + 1 for PAD
embed_size = 100  # Embedding vector size
window_size = 2  # Context window size (words taken on each side of the target)

print('Vocabulary Size:', vocab_size)



Vocabulary Size: 62


In [68]:
# Function to generate context-target pairs

def generate_context_word_pairs(corpus, window_size, vocab_size):
    """Yield one (context, target) training sample per word in the corpus.

    Args:
        corpus: iterable of sentences, each a sequence of integer word IDs.
        window_size: how many neighbours to take on each side of the target.
        vocab_size: number of classes used for the one-hot target vector.

    Yields:
        (x, y) where ``x`` is the sequence of neighbouring word IDs, padded by
        ``pad_sequences`` to length ``2 * window_size``, and ``y`` is the
        target word encoded by ``to_categorical`` over ``vocab_size`` classes.
    """
    padded_width = 2 * window_size
    for sentence in corpus:
        n_words = len(sentence)
        for position, target_id in enumerate(sentence):
            # Clamp the window to the sentence boundaries up front instead
            # of range-checking every candidate index.
            first = max(position - window_size, 0)
            stop = min(position + window_size + 1, n_words)
            neighbours = [sentence[j] for j in range(first, stop) if j != position]

            # Words near the sentence edges have fewer neighbours, so the
            # context is padded to a fixed width.
            context = pad_sequences([neighbours], maxlen=padded_width)[0]
            target = to_categorical(target_id, vocab_size)
            yield (context, target)

# Display a few context-target pairs
print("\nSample Context-Target Pairs:")
# Show only the first 6 pairs. range() is placed first in zip() so the
# generator is never advanced beyond the last pair that is printed, and no
# hand-maintained counter variable is needed.
for pair_no, (x, y) in zip(range(6), generate_context_word_pairs(wids, window_size, vocab_size)):
    # Padding entries (ID 0) are dropped from the displayed context;
    # argmax recovers the word ID from the one-hot target.
    print('Context (X):', [id2word[w] for w in x if w != 0], '-> Target (Y):', id2word[np.argmax(y)])



Sample Context-Target Pairs:
Context (X): ['learning', 'also'] -> Target (Y): deep
Context (X): ['deep', 'also', 'known'] -> Target (Y): learning
Context (X): ['deep', 'learning', 'known', 'as'] -> Target (Y): also
Context (X): ['learning', 'also', 'as', 'deep'] -> Target (Y): known
Context (X): ['also', 'known', 'deep', 'structured'] -> Target (Y): as
Context (X): ['known', 'as', 'structured', 'learning'] -> Target (Y): deep


In [62]:
# Define the CBOW model:
#   1. Embedding: maps each of the window_size * 2 context word IDs to an
#      embed_size-dimensional vector.
#   2. Lambda: averages the context vectors along axis 1 into one vector.
#   3. Dense + softmax: scores every vocabulary entry as the target word.
cbow = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size * 2),
    Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)),
    Dense(vocab_size, activation='softmax')
])

# Compile the model (targets are one-hot vectors, hence categorical cross-entropy)
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# summary() prints the table itself and returns None, so wrapping it in
# print() would append a stray "None" line to the cell output.
cbow.summary()


Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 4, 100)            6200      
                                                                 
 lambda_8 (Lambda)           (None, 100)               0         
                                                                 
 dense_8 (Dense)             (None, 62)                6262      
                                                                 
Total params: 12462 (48.68 KB)
Trainable params: 12462 (48.68 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [63]:
# Train the CBOW model, feeding one (context, target) sample per update step
epochs = 50
for epoch in range(1, epochs + 1):
    # Sum of the per-sample losses returned by train_on_batch for this epoch.
    # Each sample is reshaped to a batch of size 1.
    loss = sum(
        cbow.train_on_batch(context.reshape(1, -1), target.reshape(1, -1))
        for context, target in generate_context_word_pairs(wids, window_size, vocab_size)
    )
    # Report progress every tenth epoch only
    if not epoch % 10:
        print('Epoch:', epoch, '\t  Loss:', loss)


Epoch: 10 	  Loss: 300.8581107854843
Epoch: 20 	  Loss: 249.75314646959305
Epoch: 30 	  Loss: 210.08205100893974
Epoch: 40 	  Loss: 178.21800927817822
Epoch: 50 	  Loss: 150.4565201178193


In [64]:
# Extract the learned word embeddings: the model's first weight array is the
# embedding matrix, and its row 0 belongs to the padding index, so drop it.
embedding_matrix = cbow.get_weights()[0]
embedding_weights = embedding_matrix[1:]
print("\nEmbedding matrix shape:", embedding_weights.shape)

# Display word embeddings as a DataFrame, one row per word.
# Row k of embedding_weights corresponds to word ID k + 1.
print("\nWord Embeddings (Sample):")
row_labels = [id2word[word_id] for word_id in range(1, vocab_size)]
embedding_df = pd.DataFrame(embedding_weights, index=row_labels)
print(embedding_df.head())



Embedding matrix shape: (61, 100)

Word Embeddings (Sample):
                0         1         2         3         4         5   \
learning  0.160850 -0.500997  0.247963  0.258451 -0.310178  0.255225   
deep      0.043551  0.298463  0.269905 -0.181816 -0.095490 -0.017680   
networks  0.168837 -0.390784 -0.777694  0.566408 -0.159862  0.279692   
neural    0.609843 -0.749018  0.272168  0.025571 -0.482311 -0.356484   
as        0.164156 -0.149009  0.076493  0.308174  0.225752  0.388742   

                6         7         8         9   ...        90        91  \
learning -0.027897  0.511773  0.229951  0.232851  ... -0.252065  0.040186   
deep      0.204720 -0.239713  0.086499 -0.344364  ... -0.245923 -0.075849   
networks  0.005954  0.438587  0.057010  0.132329  ... -0.289924 -0.543459   
neural   -0.268259 -0.050787  0.316267 -0.479768  ... -0.078136 -0.605091   
as        0.024141  0.322571  0.136768  0.215855  ... -0.409961  0.249628   

                92        93        94    

In [65]:
# Compute pairwise Euclidean distances between embeddings
distance_matrix = euclidean_distances(embedding_weights)
print("\nDistance matrix shape:", distance_matrix.shape)

# Find and display similar words for specific search terms
similar_words = {}
for search_term in ['deep', 'unsupervised']:
    # Row/column k of the distance matrix belongs to word ID k + 1,
    # because the padding row was removed from the embedding matrix.
    distances_to_term = distance_matrix[word2id[search_term] - 1]
    # Position 0 of the sorted order is skipped (expected to be the term
    # itself, at distance 0); the next five are its nearest neighbours.
    nearest_rows = distances_to_term.argsort()[1:6]
    similar_words[search_term] = [id2word[idx + 1] for idx in nearest_rows]
print("\nSimilar Words:", similar_words)



Distance matrix shape: (61, 61)

Similar Words: {'deep': ['representation', 'with', 'known', 'recurrent', 'convolutional'], 'unsupervised': ['semi', 'or', 'can', 'be', 'on']}
