In [1]:
# Word -> integer id lookup for every distinct word of the example sentence.
# Built from ordered (word, id) pairs so the insertion order stays explicit.
word2index_dict = dict(
    [
        ("impossible", 3394),
        ("mr", 4305),
        ("bennet", 813),
        ("when", 7078),
        ("i", 3315),
        ("am", 415),
        ("not", 4436),
        ("acquainted", 239),
        ("with", 7148),
        ("him", 3215),
    ]
)

# Vocabulary size; every id listed above is strictly below this value.
vocab_size = 7261

# Split the example sentence on whitespace into a list of word tokens.
sentence = "impossible mr bennet impossible when i am not acquainted with him".split()
sentence_length = len(sentence)

# Show the token list and its length ("impossible" occurs twice).
print("Sentence:", sentence)
print("Sentence length:", sentence_length)


Sentence: ['impossible', 'mr', 'bennet', 'impossible', 'when', 'i', 'am', 'not', 'acquainted', 'with', 'him']
Sentence length: 11


In [None]:
import torch

# One-hot encoding: every word of the sentence becomes a sparse row vector
# that is zero everywhere except at the column given by the word's index.

# Start from an all-zero matrix with one row per word in the sentence and
# one column per vocabulary entry: shape (sentence_length, vocab_size).
word_tensor = torch.zeros((sentence_length, vocab_size))

# Fill in the single non-zero entry of each row, echoing
# "<position> <vocabulary index> <word>" so the mapping can be checked by eye.
for i, word in enumerate(sentence):
    word_index = word2index_dict[word]  # column that identifies this word
    word_tensor[i, word_index] = 1      # row i marks exactly that column
    print(f"{i:2d} {word_index:4d} {word}")

# Confirm the final shape of the one-hot matrix.
print("One-hot encoded tensor shape:", word_tensor.shape)


 0 3394 impossible
 1 4305 mr
 2  813 bennet
 3 3394 impossible
 4 7078 when
 5 3315 i
 6  415 am
 7 4436 not
 8  239 acquainted
 9 7148 with
10 3215 him
One-hot encoded tensor shape: torch.Size([11, 7261])


In [None]:
import torch.nn as nn
# Pipeline illustrated by this cell:
#
# +------------------+         +-------------+       +---------------------+
# |   Input Words    |  -->    | Embedding   |  -->  | Dense Vectors (R^d) |
# +------------------+         |  Layer      |       +---------------------+
#                               \___________/
#                               Each word index is
#                               mapped to a dense
#                               vector of dimension d

# Instead of a 7261-dimensional one-hot vector, we use an embedding layer to
# map each word to a 100-dimensional vector. The layer holds one row per
# vocabulary entry (vocab_size rows, embedding_dim columns). Its weights are
# randomly initialised, so the values differ between runs unless a seed is
# set; only the output shape is inspected here.
embedding_dim = 100
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

# Convert the sentence into a 1-D tensor of word indices. nn.Embedding looks
# rows up by integer index, hence the explicit torch.long dtype.
word_indices = [word2index_dict[word] for word in sentence]
word_indices_tensor = torch.tensor(word_indices, dtype=torch.long)
print("\nWord indices tensor:", word_indices_tensor)

# Get the dense embedding_dim-dimensional representation for each word;
# row k of the result belongs to the k-th word of the sentence.
embedded_sentence = embedding(word_indices_tensor)
print("Embedded sentence shape:", embedded_sentence.shape)

# One row per word and embedding_dim columns: torch.Size([11, 100]) for this
# sentence. Checked rather than merely stated, so a mismatch fails loudly.
assert embedded_sentence.shape == (len(sentence), embedding_dim)
