In [6]:
import numpy as np
from keras.preprocessing.text import Tokenizer

# Sample vocabulary.
# Kept as a list rather than a set: set iteration order for strings can differ
# between interpreter sessions (string hashing is randomized), and the word ids
# produced below follow the order in which the words are fed to the tokenizer
# when all words are equally frequent. The order used here matches the ids
# shown in the recorded output of this cell.
x = ['language', 'natural', 'the', 'leader', 'text', 'prime']

# Create the tokenizer and fit on texts
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x)

# Number of unique words in the dictionary
print("Number of unique words in dictionary=", len(tokenizer.word_index))
print("Dictionary is =", tokenizer.word_index)

# Function to load GloVe embeddings and create the embedding matrix
def embedding_for_vocab(glove_file, word_index, embedding_dim):
    """Build an embedding matrix for a vocabulary from a GloVe-style text file.

    Each non-empty line of the file is read as a token followed by
    whitespace-separated numbers. Only tokens present in ``word_index`` are
    parsed, so memory use is bounded by the vocabulary size instead of the
    size of the embeddings file, and malformed lines for unrelated tokens
    cannot abort the load.

    Args:
        glove_file: Path to the embeddings text file (opened as UTF-8).
        word_index: Mapping of word -> integer row index; indices are used
            directly as row numbers in the returned matrix.
        embedding_dim: Number of components expected per vector.

    Returns:
        A ``numpy`` array of shape ``(len(word_index) + 1, embedding_dim)``.
        Rows for words that do not occur in the file (and row 0, unless some
        word maps to it) stay all-zero.

    Raises:
        ValueError: If a vocabulary word's vector in the file cannot be parsed
            as floats or does not have exactly ``embedding_dim`` components.
    """
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

    with open(glove_file, encoding="utf8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline at end of file).
                continue

            row = word_index.get(values[0])
            if row is None:
                # Token is not part of the vocabulary; skip without parsing.
                continue

            coefs = np.asarray(values[1:], dtype='float32')
            if coefs.shape[0] != embedding_dim:
                raise ValueError(
                    f"Vector for '{values[0]}' has {coefs.shape[0]} components, "
                    f"expected {embedding_dim}"
                )
            # If a token appears more than once, the last occurrence wins.
            embedding_matrix[row] = coefs

    return embedding_matrix

# Location of the pre-trained GloVe vectors; the file must already be present at this path
glove_file = '/content/sample_data/glove.6B.50d.txt'
# Number of components expected in each vector of that file
embedding_dim = 50

# Build the matrix that holds one embedding row per tokenizer word id
embedding_matrix_vocab = embedding_for_vocab(
    glove_file=glove_file,
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dim,
)

# Show the embedding stored in row 1, i.e. the row of the word whose id is 1
print("Dense vector for first word is =>", embedding_matrix_vocab[1])

# Function to calculate cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Args:
        vec1: First vector (array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. If either vector has zero
        norm the direction is undefined, so 0.0 is returned instead of the
        ``nan`` (plus RuntimeWarning) that a 0/0 division would produce.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        # At least one input is the zero vector (e.g. an embedding row that
        # was never filled in); avoid dividing by zero.
        return 0.0
    return np.dot(vec1, vec2) / denominator

# Pair of words whose embeddings will be compared
word1, word2 = "natural", "language"

# Both words must be known to the tokenizer before their rows can be looked up
if not all(w in tokenizer.word_index for w in (word1, word2)):
    print(f"One or both words are not in the vocabulary: '{word1}', '{word2}'")
else:
    # The tokenizer's word id doubles as the row number in the embedding matrix
    vec1 = embedding_matrix_vocab[tokenizer.word_index[word1]]
    vec2 = embedding_matrix_vocab[tokenizer.word_index[word2]]

    # Compute the similarity score and report it
    similarity = cosine_similarity(vec1, vec2)
    print(f"Cosine similarity between '{word1}' and '{word2}': {similarity}")


Number of unique words in dictionary= 6
Dictionary is = {'language': 1, 'natural': 2, 'the': 3, 'leader': 4, 'text': 5, 'prime': 6}
Dense vector for first word is => [-5.79900026e-01 -1.10100001e-01 -1.15569997e+00 -2.99059995e-03
 -2.06129998e-01  4.52890009e-01 -1.66710004e-01 -1.03820002e+00
 -9.92410004e-01  3.98840010e-01  5.92299998e-01  2.29900002e-01
  1.52129996e+00 -1.77640006e-01 -2.97259986e-01 -3.92349988e-01
 -7.84709990e-01  1.55939996e-01  6.90769970e-01  5.95369995e-01
 -4.43399996e-01  5.35139978e-01  3.28530014e-01  1.24370003e+00
  1.29719996e+00 -1.38779998e+00 -1.09249997e+00 -4.09249991e-01
 -5.69710016e-01 -3.46560001e-01  3.71630001e+00 -1.04890001e+00
 -4.67079997e-01 -4.47389990e-01  6.22999994e-03  1.96490008e-02
 -4.01609987e-01 -6.29130006e-01 -8.25060010e-01  4.55909997e-01
  8.26259971e-01  5.70909977e-01  2.11989999e-01  4.68650013e-01
 -6.00269973e-01  2.99199998e-01  6.79440022e-01  1.42379999e+00
 -3.21520008e-02 -1.26029998e-01]
Cosine similarity be