In [8]:
import numpy as np
from scipy.sparse import dok_matrix
from pdb import set_trace
from collections import defaultdict, Counter
from scipy.sparse import coo_matrix
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

## Exercise 

Build the corpus in a that works. Take this small corpus I built for you, and change it so the meaning is better captured. It will help you understand what the model does.

In [10]:
with open('../datasets/text8', 'r') as f:
    raw_text = f.read()

In [11]:
%time splitted_txt = raw_text.split()

CPU times: user 2.53 s, sys: 629 ms, total: 3.16 s
Wall time: 3.24 s


In [12]:
small_text = raw_text[:100]

In [13]:
splitted_txt[:10]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against']

In [61]:
# from collections import defaultdict, Counter
# from scipy.sparse import coo_matrix

# def build_cooccurrence_matrix_from_string(huge_string, vocab, window_size=5):
#     cooccurrences = defaultdict(Counter)
    
#     # Tokenize the string into words
#     tokens = huge_string.split()  # Split by spaces (simple tokenization)
#     tokens = [vocab[word] for word in tokens if word in vocab]  # Map to indices
    
#     # Generate co-occurrences
#     for center_idx, center_word in enumerate(tokens):
#         start = max(center_idx - window_size, 0)
#         end = min(center_idx + window_size + 1, len(tokens))
#         for context_idx in range(start, end):
#             if center_idx != context_idx:
#                 cooccurrences[center_word][tokens[context_idx]] += 1

#     # Convert co-occurrences to sparse matrix
#     row, col, data = [], [], []
#     for word, contexts in cooccurrences.items():
#         for context, count in contexts.items():
#             row.append(word)
#             col.append(context)
#             data.append(count)

#     return coo_matrix((data, (row, col)), shape=(len(vocab), len(vocab)))


In [14]:
from collections import defaultdict, Counter
from scipy.sparse import coo_matrix

def build_cooccurrence_matrix_from_string(huge_string, vocab, window_size=5):
    cooccurrences = defaultdict(Counter)
    
    # Tokenize the string into words
    tokens = huge_string.split()  # Split by spaces (simple tokenization)
    tokens = [vocab[word] for word in tokens if word in vocab]  # Map to indices
    
    # Generate co-occurrences
    for center_idx, center_word in enumerate(tokens):
        start = max(center_idx - window_size, 0)
        end = min(center_idx + window_size + 1, len(tokens))
        for context_idx in range(start, end):
            if center_idx != context_idx:
                cooccurrences[center_word][tokens[context_idx]] += 1

    # Prepare for sparse matrix construction
    row, col, data = [], [], []
    vocab_size = len(vocab)
    for word, contexts in cooccurrences.items():
        if word >= vocab_size:  # Safety check for invalid indices
            continue
        for context, count in contexts.items():
            if context >= vocab_size:  # Safety check for invalid indices
                continue
            row.append(word)
            col.append(context)
            data.append(count)

    # Convert co-occurrences to sparse matrix
    return coo_matrix((data, (row, col)), shape=(vocab_size, vocab_size))


In [15]:
%%time

word_to_index = {word: idx for idx, word in enumerate(small_text)}

CPU times: user 13 µs, sys: 1 µs, total: 14 µs
Wall time: 16.9 µs


In [16]:
build_cooccurrence_matrix_from_string(small_text, word_to_index)

<21x21 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in COOrdinate format>

# Class Version

In [17]:
class GloVe:
    def __init__(self, vocab_size, embedding_dim, x_max=100, alpha=0.75):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.x_max = x_max
        self.alpha = alpha
        
        # Initialize word and context embeddings, biases
        self.word_embeddings = np.random.rand(vocab_size, embedding_dim) / np.sqrt(embedding_dim)
        self.context_embeddings = np.random.rand(vocab_size, embedding_dim) / np.sqrt(embedding_dim)
        self.word_biases = np.zeros(vocab_size)
        self.context_biases = np.zeros(vocab_size)
        
    def fit(self, cooccurrence_matrix, epochs=50, learning_rate=0.05):
        for epoch in range(epochs):
            loss = 0
            for i, j, Xij in zip(cooccurrence_matrix.row, cooccurrence_matrix.col, cooccurrence_matrix.data):
                # Weight function
                weight = (Xij / self.x_max)**self.alpha if Xij < self.x_max else 1.0
                
                # Compute the loss and gradients
                word_vec = self.word_embeddings[i]
                context_vec = self.context_embeddings[j]
                inner_product = np.dot(word_vec, context_vec)
                diff = inner_product + self.word_biases[i] + self.context_biases[j] - np.log(Xij)
                loss += 0.5 * weight * diff**2
                
                grad_common = weight * diff
                self.word_embeddings[i] -= learning_rate * grad_common * context_vec
                self.context_embeddings[j] -= learning_rate * grad_common * word_vec
                self.word_biases[i] -= learning_rate * grad_common
                self.context_biases[j] -= learning_rate * grad_common
            
            print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss:.4f}")
            
    def get_embeddings(self):
        return self.word_embeddings + self.context_embeddings  # Combine embeddings

In [18]:
def build_cooccurrence_matrix(corpus, vocab, window_size=5):
    cooccurrences = defaultdict(Counter)
    for sentence in corpus:
        tokens= corpus
#         tokens = [vocab[word] for word in sentence if word in vocab]
        for center_idx, center_word in enumerate(tokens):
            start = max(center_idx - window_size, 0)
            end = min(center_idx + window_size + 1, len(tokens))
            for context_idx in range(start, end):
                if center_idx != context_idx:
                    cooccurrences[center_word][tokens[context_idx]] += 1

    # Convert co-occurrences to sparse matrix
    row, col, data = [], [], []
    for word, contexts in cooccurrences.items():
        for context, count in contexts.items():
            row.append(word)
            col.append(context)
            data.append(count)

    return coo_matrix((data, (row, col)), shape=(len(vocab), len(vocab)))

In [19]:
def cosine_similarity(vec1, vec2):
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    if norm1 == 0 or norm2 == 0:
        return 0
    return np.dot(vec1, vec2) / (norm1 * norm2)


In [20]:
def find_similar_words(word, vocab, embeddings, top_n=5):
    if word not in vocab:
        raise ValueError(f"Word '{word}' not found in vocabulary.")
    
    word_idx = vocab[word]
    word_vec = embeddings[word_idx]
    similarities = [
        (other_word, cosine_similarity(word_vec, embeddings[idx]))
        for other_word, idx in vocab.items()
        if other_word != word
    ]
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:top_n]


In [21]:
def visualize_embeddings(vocab, embeddings, num_words=50, perplexity=5):


    # Limit words to visualize
    words = list(vocab.keys())[:num_words]
    indices = [vocab[word] for word in words]
    word_vectors = embeddings[indices]
    
    # Adjust perplexity if too few samples
    if len(words) <= perplexity:
        perplexity = max(len(words) // 2, 1)

    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
    reduced_vectors = tsne.fit_transform(word_vectors)
    
    plt.figure(figsize=(10, 10))
    for i, word in enumerate(words):
        plt.scatter(reduced_vectors[i, 0], reduced_vectors[i, 1])
        plt.annotate(word, (reduced_vectors[i, 0], reduced_vectors[i, 1]))
    plt.title("Word Embedding Visualization")
    plt.show()


In [22]:
def sentence_to_embedding(sentence, vocab, embeddings):
    """
    Convert a sentence into a single embedding by averaging the word embeddings.
    """
    words = sentence.split()
    valid_embeddings = [
        embeddings[vocab[word]] for word in words if word in vocab
    ]
    if not valid_embeddings:
        raise ValueError("None of the words in the sentence are in the vocabulary.")
    return np.mean(valid_embeddings, axis=0)


In [23]:
def word_analogy(word_a, word_b, word_c, vocab, embeddings, top_n=5):
    """
    Solve word analogy tasks: "word_a is to word_b as word_c is to ?".
    Example: king - man + woman = queen.
    """
    if word_a not in vocab or word_b not in vocab or word_c not in vocab:
        raise ValueError("One of the words is not in the vocabulary.")
    
    vec_a = embeddings[vocab[word_a]]
    vec_b = embeddings[vocab[word_b]]
    vec_c = embeddings[vocab[word_c]]
    target_vec = vec_b - vec_a + vec_c
    
    similarities = [
        (word, cosine_similarity(target_vec, embeddings[idx]))
        for word, idx in vocab.items()
        if word not in {word_a, word_b, word_c}
    ]
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:top_n]


# NEW

In [35]:
corpus = [
    ['banana is a fruit'],
    ['banana is yellow'],
    ['some fruits are yellow'],
    ['apple is also a fruit'],
    ['banana and apple are fruits'],
    ['kiwi is a fruit'],
    ['kiwi is also a fruit'],
    ['banana and wiki are fruits']
]

In [36]:
tokenized_corpus = [sentence[0].split() for sentence in corpus]

In [37]:
tokenized_corpus

[['banana', 'is', 'a', 'fruit'],
 ['banana', 'is', 'yellow'],
 ['some', 'fruits', 'are', 'yellow'],
 ['apple', 'is', 'also', 'a', 'fruit'],
 ['banana', 'and', 'apple', 'are', 'fruits'],
 ['kiwi', 'is', 'a', 'fruit'],
 ['kiwi', 'is', 'also', 'a', 'fruit'],
 ['banana', 'and', 'wiki', 'are', 'fruits']]

In [38]:
small_splitted_text = splitted_txt[:300]

In [39]:
%time vocab = set(small_splitted_text)

CPU times: user 135 µs, sys: 2 µs, total: 137 µs
Wall time: 148 µs


In [29]:
%%time
vocab = {}
for idx, word in enumerate(splitted_txt):
    if word not in vocab:
        vocab[word] = idx

CPU times: user 12.8 s, sys: 2.15 s, total: 15 s
Wall time: 16.4 s


In [30]:
len(vocab)

833184

In [84]:
%%time

# Build vocabulary from the same corpus used
vocab = {word: i for i, word in enumerate(set(word for sentence in splitted_txt for word in sentence))}
vocab_size = len(vocab)

CPU times: user 17 s, sys: 2.13 s, total: 19.1 s
Wall time: 19.5 s


In [85]:
%%time
vocab = {word: i for i in enumerate(set(splitted_txt))}

CPU times: user 3.8 s, sys: 62.6 ms, total: 3.87 s
Wall time: 3.88 s


In [86]:
%%time

word_to_index = {word: idx for idx, word in enumerate(small_splitted_text)}

CPU times: user 26 µs, sys: 2 µs, total: 28 µs
Wall time: 27.9 µs


In [87]:
%%time
# Filter corpus to ensure only words in vocab are included
filtered_corpus = [[word for word in sentence if word in vocab] for sentence in splitted_txt]

CPU times: user 35 s, sys: 35.3 s, total: 1min 10s
Wall time: 1min 28s


In [88]:
len(word_to_index)

182

In [89]:
# small_splitted_text

In [94]:
%%time
cooccurrence_matrix = build_cooccurrence_matrix(small_splitted_text, vocab, window_size=5)
assert cooccurrence_matrix.shape == (vocab_size, vocab_size), "Matrix dimensions do not match vocab size."

ValueError: invalid literal for int() with base 10: 'anarchism'

In [40]:
corpus = [
    ['banana is a fruit'],
    ['banana is yellow'],
    ['some fruits are yellow'],
    ['apple is also a fruit'],
    ['banana and apple are fruits'],
    ['kiwi is a fruit'],
    ['kiwi is also a fruit'],
    ['banana and wiki are fruits']
]

In [41]:
def build_cooccurrence_matrix(corpus, vocab, window_size=5):
    cooccurrences = defaultdict(Counter)
    for sentence in corpus:
        tokens = [vocab[word] for word in sentence if word in vocab]
        for center_idx, center_word in enumerate(tokens):
            start = max(center_idx - window_size, 0)
            end = min(center_idx + window_size + 1, len(tokens))
            for context_idx in range(start, end):
                if center_idx != context_idx:
                    cooccurrences[center_word][tokens[context_idx]] += 1

    # Convert co-occurrences to sparse matrix
    row, col, data = [], [], []
    for word, contexts in cooccurrences.items():
        for context, count in contexts.items():
            row.append(word)
            col.append(context)
            data.append(count)

    return coo_matrix((data, (row, col)), shape=(len(vocab), len(vocab)))


In [42]:
%%time
# Step 1: Build Vocabulary
vocab = {word: i for i, word in enumerate(set(word for sentence in corpus for word in sentence))}
vocab_size = len(vocab)

CPU times: user 25 µs, sys: 1 µs, total: 26 µs
Wall time: 31.2 µs


In [43]:
%%time
# Step 2: Filter Corpus to Match Vocabulary
filtered_corpus = [[word for word in sentence if word in vocab] for sentence in corpus]

CPU times: user 10 µs, sys: 1 µs, total: 11 µs
Wall time: 14.3 µs


In [45]:
%%time

# Step 3: Build Co-occurrence Matrix
cooccurrence_matrix = build_cooccurrence_matrix(filtered_corpus, vocab, window_size=5)

CPU times: user 237 µs, sys: 216 µs, total: 453 µs
Wall time: 459 µs


In [46]:
%%time
# Step 4: Train GloVe
embedding_dim = 50
glove = GloVe(vocab_size, embedding_dim)
glove.fit(cooccurrence_matrix, epochs=50, learning_rate=0.05)

Epoch 1/50, Loss: 0.0000
Epoch 2/50, Loss: 0.0000
Epoch 3/50, Loss: 0.0000
Epoch 4/50, Loss: 0.0000
Epoch 5/50, Loss: 0.0000
Epoch 6/50, Loss: 0.0000
Epoch 7/50, Loss: 0.0000
Epoch 8/50, Loss: 0.0000
Epoch 9/50, Loss: 0.0000
Epoch 10/50, Loss: 0.0000
Epoch 11/50, Loss: 0.0000
Epoch 12/50, Loss: 0.0000
Epoch 13/50, Loss: 0.0000
Epoch 14/50, Loss: 0.0000
Epoch 15/50, Loss: 0.0000
Epoch 16/50, Loss: 0.0000
Epoch 17/50, Loss: 0.0000
Epoch 18/50, Loss: 0.0000
Epoch 19/50, Loss: 0.0000
Epoch 20/50, Loss: 0.0000
Epoch 21/50, Loss: 0.0000
Epoch 22/50, Loss: 0.0000
Epoch 23/50, Loss: 0.0000
Epoch 24/50, Loss: 0.0000
Epoch 25/50, Loss: 0.0000
Epoch 26/50, Loss: 0.0000
Epoch 27/50, Loss: 0.0000
Epoch 28/50, Loss: 0.0000
Epoch 29/50, Loss: 0.0000
Epoch 30/50, Loss: 0.0000
Epoch 31/50, Loss: 0.0000
Epoch 32/50, Loss: 0.0000
Epoch 33/50, Loss: 0.0000
Epoch 34/50, Loss: 0.0000
Epoch 35/50, Loss: 0.0000
Epoch 36/50, Loss: 0.0000
Epoch 37/50, Loss: 0.0000
Epoch 38/50, Loss: 0.0000
Epoch 39/50, Loss: 0.

In [47]:
# Step 5: Retrieve Embeddings
embeddings = glove.get_embeddings()

In [120]:
# # Save embeddings and vocabulary
# np.save("embeddings.npy", embeddings)  # Save embeddings
# np.save("vocab.npy", vocab)            # Save vocabulary

In [2]:
# Load embeddings and vocabulary
embeddings = np.load("embeddings.npy")
vocab = np.load("vocab.npy", allow_pickle=True).item()  # Load as dictionary

In [53]:
# Save entire GloVe model
np.savez("glove_model.npz",
         word_embeddings=glove.word_embeddings,
         context_embeddings=glove.context_embeddings,
         word_biases=glove.word_biases,
         context_biases=glove.context_biases,
         vocab=vocab)

In [5]:
# Load entire GloVe model
model_data = np.load("glove_model.npz", allow_pickle=True)
word_embeddings = model_data["word_embeddings"]
context_embeddings = model_data["context_embeddings"]
word_biases = model_data["word_biases"]
context_biases = model_data["context_biases"]
# vocab = model_data["vocab"].item()  # Convert back to dictionary

In [7]:
word_embeddings.shape

(1, 50)

In [51]:
def find_similar_words(word, vocab, embeddings, top_n=5):
    if word not in vocab:
        raise ValueError(f"Word '{word}' not found in vocabulary.")
    
    word_idx = vocab[word]
    word_vec = embeddings[word_idx]
    
    similarities = []
    for other_word, idx in vocab.items():
        if other_word != word:
            other_vec = embeddings[idx]
            similarity = cosine_similarity(word_vec, other_vec)
            similarities.append((other_word, similarity))
    
    # Sort by similarity in descending order
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:top_n]

In [52]:
def find_similar_words(word, vocab, embeddings, top_n=5):
    if word not in vocab:
        raise ValueError(f"Word '{word}' not found in vocabulary.")
    
    word_idx = vocab[word]
    word_vec = embeddings[word_idx]
    
    similarities = []
    for other_word, idx in vocab.items():
        if other_word != word:
            other_vec = embeddings[idx]
            similarity = cosine_similarity(word_vec, other_vec)
            similarities.append((other_word, similarity))
    
    # Sort by similarity in descending order
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:top_n]

In [54]:
corpus

[['banana is a fruit'],
 ['banana is yellow'],
 ['some fruits are yellow'],
 ['apple is also a fruit'],
 ['banana and apple are fruits'],
 ['kiwi is a fruit'],
 ['kiwi is also a fruit'],
 ['banana and wiki are fruits']]

In [57]:
vocab

{'some fruits are yellow': 0,
 'kiwi is a fruit': 1,
 'kiwi is also a fruit': 2,
 'apple is also a fruit': 3,
 'banana is a fruit': 4,
 'banana and apple are fruits': 5,
 'banana is yellow': 6,
 'banana and wiki are fruits': 7}

In [56]:
# 1. Find similar words
word = "banana"
print(f"Words similar to '{word}':")
print(find_similar_words(word, vocab, embeddings, top_n=5))

Words similar to 'banana':


ValueError: Word 'banana' not found in vocabulary.

In [123]:



# 2. Visualize embeddings
print("Visualizing embeddings:")
visualize_embeddings(vocab, embeddings, num_words=6)

# # 3. Sentence to embedding
# sentence = "I like learning"
# embedding = sentence_to_embedding(sentence, vocab, final_embeddings)
# print(f"Sentence embedding for '{sentence}': {embedding}")

# # 4. Word analogy
# print("Word analogy (king - man + woman = ?):")
# print(word_analogy("king", "man", "woman", vocab, final_embeddings, top_n=3))


Words similar to 'tree':


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [56]:
# Select words to visualize
# selected_words = ["cat", "dog", "bird", "tree", "hill", "mat", "log", "snow", "fly"]
selected_words = ["tree"]

# Get indices and embeddings of selected words
indices = [vocab[word] for word in selected_words]
selected_embeddings = embeddings[indices]

# Visualize using t-SNE
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

tsne = TSNE(n_components=2, random_state=42, perplexity=5)
reduced_vectors = tsne.fit_transform(selected_embeddings)

plt.figure(figsize=(10, 10))
for i, word in enumerate(selected_words):
    plt.scatter(reduced_vectors[i, 0], reduced_vectors[i, 1])
    plt.annotate(word, (reduced_vectors[i, 0], reduced_vectors[i, 1]), fontsize=12)
plt.title("Word Embedding Visualization (Selected Words)")
plt.show()


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

# Second Version

# GloVe and FastText: From Theory to Implementation

This notebook provides a comprehensive explanation of GloVe and FastText, two popular word embedding algorithms. We will first cover the theoretical underpinnings of each algorithm, followed by implementing them from scratch.

## What is GloVe?

GloVe (Global Vectors for Word Representation) is a word embedding technique that captures global statistical information from a corpus of text. It constructs a word-word co-occurrence matrix and learns word vectors such that their dot products approximate the logarithm of the probabilities of co-occurrence.

### Key Ideas:
- Uses a co-occurrence matrix to capture the relationship between words.
- Optimizes an objective that balances co-occurrence frequency and word similarity.

**Objective Function:**
$$ J = \sum_{i,j} f(X_{ij})(w_i^T \tilde{w}_j + b_i + \tilde{b}_j - \log(X_{ij}))^2 $$

## What is FastText?

FastText extends word embeddings by incorporating subword information (n-grams). This allows it to generate embeddings for rare words and out-of-vocabulary words by aggregating n-gram embeddings.

### Key Ideas:
- Represents words as a bag of character n-grams.
- Embeddings for a word are computed by summing up the embeddings of its n-grams.

**Objective Function:**
\[ \sigma(w_c^T \cdot w_t) \] (CBOW) or \[ \sigma(w_c^T \cdot w_{context}) \] (Skip-gram)

In [58]:
import pandas as pd

In [59]:
df = pd.read_csv('../datasets/players_stats_by_season_full_details.csv')

In [60]:
df.head()

Unnamed: 0,League,Season,Stage,Player,Team,GP,MIN,FGM,FGA,3PM,...,birth_date,height,height_cm,weight,weight_kg,nationality,high_school,draft_round,draft_pick,draft_team
0,NBA,1999 - 2000,Regular_Season,Shaquille O'Neal,LAL,79,3163.0,956,1665,0,...,"Mar 6, 1972",7-1,216.0,325.0,147.0,United States,Robert G. Cole High School,1.0,1.0,Orlando Magic
1,NBA,1999 - 2000,Regular_Season,Vince Carter,TOR,82,3126.0,788,1696,95,...,"Jan 26, 1977",6-6,198.0,220.0,100.0,United States,Mainland High School,1.0,5.0,Golden State Warriors
2,NBA,1999 - 2000,Regular_Season,Karl Malone,UTA,82,2947.0,752,1476,2,...,"Jul 24, 1963",6-9,206.0,265.0,120.0,United States,Summerfield High School,1.0,13.0,Utah Jazz
3,NBA,1999 - 2000,Regular_Season,Allen Iverson,PHI,70,2853.0,729,1733,89,...,"Jun 7, 1975",6-0,183.0,165.0,75.0,United States,Bethel High School,1.0,1.0,Philadelphia Sixers
4,NBA,1999 - 2000,Regular_Season,Gary Payton,SEA,82,3425.0,747,1666,177,...,"Jul 23, 1968",6-4,193.0,180.0,82.0,United States,Skyline High School,1.0,2.0,Seattle SuperSonics


In [61]:
selected_cols = df[['Player', 'nationality', 'high_school', 'draft_team']]

In [62]:
%%time

corpus = []
for _, row in selected_cols.iterrows():
    string = [f"The player {row['Player']} is from {row['nationality']}, went to the school {row['high_school']} \
    and plays for the team {row['draft_team']}."]
    corpus.append(string)


print(corpus[:3])

[["The player Shaquille O'Neal is from United States, went to the school Robert G. Cole High School      and plays for the team Orlando Magic."], ['The player Vince Carter is from United States, went to the school Mainland High School     and plays for the team Golden State Warriors.'], ['The player Karl Malone is from United States, went to the school Summerfield High School     and plays for the team Utah Jazz.']]


In [64]:
len(corpus)

53949

In [65]:
small_corpus = corpus[:100]

## Implementation of GloVe from Scratch

In [67]:
corpus = [
    ["soccer", "is", "a", "popular", "sport", "around", "the", "world"],
    ["the", "goalkeeper", "saved", "a", "penalty", "kick"],
    ["basketball", "players", "need", "to", "dribble", "the", "ball"],
    ["the", "coach", "motivates", "the", "team", "to", "perform", "better"],
    ["tennis", "matches", "are", "often", "played", "on", "grass", "courts"],
    ["the", "referee", "blew", "the", "whistle", "to", "start", "the", "match"],
    ["a", "home", "run", "is", "an", "exciting", "moment", "in", "baseball"],
    ["athletes", "train", "hard", "to", "compete", "in", "the", "olympics"],
    ["the", "runner", "crossed", "the", "finish", "line", "to", "win", "gold"],
    ["a", "football", "team", "needs", "both", "offense", "and", "defense"],
    ["the", "crowd", "cheered", "loudly", "after", "the", "goal"],
    ["cricket", "is", "a", "bat-and-ball", "game", "played", "in", "many", "countries"]
]

In [68]:
corpus = corpus[:5]

In [69]:
corpus[0]

['soccer', 'is', 'a', 'popular', 'sport', 'around', 'the', 'world']

In [70]:
%time tokenized_corpus = [sentence.split() for inner_list in small_corpus for sentence in inner_list]

CPU times: user 281 µs, sys: 2 µs, total: 283 µs
Wall time: 291 µs


In [71]:
vocab = list(set(word for sentence in tokenized_corpus for word in sentence))
vocab[:5]

['Cold', 'Pershing', 'Armstrong', 'Alonzo', 'Flint']

In [72]:
%%time

def build_cooccurrence_matrix(corpus, vocab, window_size=4):
    vocab_size = len(vocab)
    word_to_id = {word: i for i, word in enumerate(vocab)}
    cooccurrence = dok_matrix((vocab_size, vocab_size), dtype=np.float32)

    for sentence in corpus:
        for i, word in enumerate(sentence):
            for j in range(max(i - window_size, 0), min(i + window_size + 1, len(sentence))):
                if i != j:
                    cooccurrence[word_to_id[word], word_to_id[sentence[j]]] += 1

    return cooccurrence.toarray()

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 6.91 µs


In [73]:
fake_corpus = [
    ['soccer is 90 minutes long'],
    ['soccer is competitive']
]

fake_vocab = list(set(word for sentence in fake_corpus for word in sentence))
fake_vocab

['soccer is 90 minutes long', 'soccer is competitive']

In [74]:
build_cooccurrence_matrix(fake_corpus, fake_vocab)

array([[0., 0.],
       [0., 0.]], dtype=float32)

In [75]:
print("Co-occurrence Matrix:")
print(cooccurrence_matrix)

Co-occurrence Matrix:



In [76]:
EMBEDDING_DIM = 50  # Dimension of word vectors
X_MAX = 100
ALPHA = 0.75
LEARNING_RATE = 0.05
EPOCHS = 100

In [77]:
# Initialize embeddings and biases
def initialize_glove(vocab_size, embedding_dim):
    W = np.random.rand(vocab_size, embedding_dim) * 0.01  # Word vectors
    W_context = np.random.rand(vocab_size, embedding_dim) * 0.01  # Context word vectors
    b = np.random.rand(vocab_size) * 0.01  # Word biases
    b_context = np.random.rand(vocab_size) * 0.01  # Context word biases
    return W, W_context, b, b_context

In [78]:
%%time



# Weighting function
def weighting_function(x, x_max=X_MAX, alpha=ALPHA):
    return (x / x_max) ** alpha if x < x_max else 1

# GloVe training loop
def train_glove(cooccurrence, vocab_size, embedding_dim, epochs=EPOCHS, learning_rate=LEARNING_RATE):
    W, W_context, b, b_context = initialize_glove(vocab_size, embedding_dim)
    for epoch in range(epochs):
        total_loss = 0
        for i in range(vocab_size):
            for j in range(vocab_size):
                X_ij = cooccurrence[i, j]
                if X_ij > 0:
                    weight = weighting_function(X_ij)
                    log_X_ij = np.log(X_ij)
                    # Compute the error
                    diff = np.dot(W[i], W_context[j]) + b[i] + b_context[j] - log_X_ij
                    loss = weight * (diff ** 2)
                    total_loss += loss
                    
                    # Gradients and updates
                    grad_wi = weight * diff * W_context[j]
                    grad_wj = weight * diff * W[i]
                    grad_bi = weight * diff
                    grad_bj = weight * diff

                    W[i] -= learning_rate * grad_wi
                    W_context[j] -= learning_rate * grad_wj
                    b[i] -= learning_rate * grad_bi
                    b_context[j] -= learning_rate * grad_bj
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss}")
    return W + W_context  # Combine word and context embeddings



CPU times: user 11 µs, sys: 1e+03 ns, total: 12 µs
Wall time: 16.9 µs


In [79]:
# Train GloVe
vocab_size = len(vocab)
embedding_dim = EMBEDDING_DIM
train_glove(cooccurrence_matrix, vocab_size, embedding_dim)

TypeError: 'coo_matrix' object is not subscriptable

In [None]:
word_to_embedding = {word: glove_embeddings[i] for i, word in enumerate(vocab)}

In [None]:
test_word = 'Proviso'

In [None]:
# Retrieve the embedding for a specific word
word = test_word  # Example
if word in word_to_embedding:
    print(f"Embedding for '{word}': {word_to_embedding[word]}")
else:
    print(f"'{word}' not in vocabulary.")


In [None]:
from numpy.linalg import norm

def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (norm(vec1) * norm(vec2))

def find_similar_words(word, top_n=5):
    if word not in word_to_embedding:
        return f"'{word}' not in vocabulary."

    word_vec = word_to_embedding[word]
    similarities = {
        other_word: cosine_similarity(word_vec, word_to_embedding[other_word])
        for other_word in word_to_embedding if other_word != word
    }
    sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
    return sorted_similarities[:top_n]

# Example: Find words similar to "apples"
similar_words = find_similar_words(test_word)
print(f"Words similar to {test_word}:", similar_words)


In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

def visualize_embeddings(word_to_embedding, words_to_visualize):
    embeddings = np.array([word_to_embedding[word] for word in words_to_visualize])
    pca = PCA(n_components=2)
    reduced_embeddings = pca.fit_transform(embeddings)

    plt.figure(figsize=(10, 8))
    for i, word in enumerate(words_to_visualize):
        plt.scatter(reduced_embeddings[i, 0], reduced_embeddings[i, 1])
        plt.text(reduced_embeddings[i, 0] + 0.01, reduced_embeddings[i, 1] + 0.01, word, fontsize=9)
    plt.title("Word Embedding Visualization")
    plt.show()

# Example: Visualize 10 random words
words_to_visualize = list(word_to_embedding.keys())[:10]
visualize_embeddings(word_to_embedding, words_to_visualize)


In [None]:
def sentence_to_embedding(sentence, word_to_embedding):
    embeddings = [word_to_embedding[word] for word in sentence if word in word_to_embedding]
    return np.mean(embeddings, axis=0) if embeddings else np.zeros(EMBEDDING_DIM)

# Example sentence embedding
sentence = ["i", "like", "soccer"]
sentence_embedding = sentence_to_embedding(sentence, word_to_embedding)
print("Sentence Embedding:", sentence_embedding)

In [None]:
def word_analogy(word1, word2, word3, word_to_embedding):
    if word1 not in word_to_embedding or word2 not in word_to_embedding or word3 not in word_to_embedding:
        return "One of the words is not in the vocabulary."

    analogy_vec = word_to_embedding[word1] - word_to_embedding[word2] + word_to_embedding[word3]
    similarities = {
        other_word: cosine_similarity(analogy_vec, word_to_embedding[other_word])
        for other_word in word_to_embedding
    }
    sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
    return sorted_similarities[:5]

# Example analogy: "man" is to "king" as "woman" is to "?"
analogy_result = word_analogy("soccer", "ball", "basketball", word_to_embedding)
print("Word Analogy Result:", analogy_result)

In [None]:
def cosine_similarity(vec1, vec2):
    """
    Compute the cosine similarity between two vectors.
    """
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    if norm1 == 0 or norm2 == 0:
        return 0
    return np.dot(vec1, vec2) / (norm1 * norm2)


In [None]:
def find_similar_words(word, vocab, embeddings, top_n=5):
    """
    Find the most similar words to a given word using cosine similarity.
    """
    if word not in vocab:
        raise ValueError(f"Word '{word}' not found in vocabulary.")
    
    word_idx = vocab[word]
    word_vec = embeddings[word_idx]
    similarities = [
        (other_word, cosine_similarity(word_vec, embeddings[idx]))
        for other_word, idx in vocab.items()
        if other_word != word
    ]
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:top_n]


In [None]:
def visualize_embeddings(vocab, embeddings, num_words=50):
    """
    Visualize embeddings using t-SNE.
    """
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt

    words = list(vocab.keys())[:num_words]
    indices = [vocab[word] for word in words]
    word_vectors = embeddings[indices]
    
    tsne = TSNE(n_components=2, random_state=42)
    reduced_vectors = tsne.fit_transform(word_vectors)
    
    plt.figure(figsize=(10, 10))
    for i, word in enumerate(words):
        plt.scatter(reduced_vectors[i, 0], reduced_vectors[i, 1])
        plt.annotate(word, (reduced_vectors[i, 0], reduced_vectors[i, 1]))
    plt.title("Word Embedding Visualization")
    plt.show()


In [None]:
def sentence_to_embedding(sentence, vocab, embeddings):
    """
    Convert a sentence into a single embedding by averaging the word embeddings.
    """
    words = sentence.split()
    valid_embeddings = [
        embeddings[vocab[word]] for word in words if word in vocab
    ]
    if not valid_embeddings:
        raise ValueError("None of the words in the sentence are in the vocabulary.")
    return np.mean(valid_embeddings, axis=0)


In [None]:
def word_analogy(word_a, word_b, word_c, vocab, embeddings, top_n=5):
    """
    Solve word analogy tasks: "word_a is to word_b as word_c is to ?".
    Example: king - man + woman = queen.
    """
    if word_a not in vocab or word_b not in vocab or word_c not in vocab:
        raise ValueError("One of the words is not in the vocabulary.")
    
    vec_a = embeddings[vocab[word_a]]
    vec_b = embeddings[vocab[word_b]]
    vec_c = embeddings[vocab[word_c]]
    target_vec = vec_b - vec_a + vec_c
    
    similarities = [
        (word, cosine_similarity(target_vec, embeddings[idx]))
        for word, idx in vocab.items()
        if word not in {word_a, word_b, word_c}
    ]
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:top_n]


## Implementation of FastText from Scratch

In [80]:
corpus = [
    "The forest is home to many species of animals.",
    "Rivers provide water for plants and animals.",
    "Flowers bloom beautifully in the spring.",
    "The mountain peaks are covered in snow.",
    "Rain nourishes the earth and helps crops grow.",
    "The ocean is vast and full of marine life.",
    "Birds migrate to warmer places in winter.",
    "Sunsets over the desert are breathtaking.",
    "The jungle is dense and full of life.",
    "A calm lake reflects the surrounding trees.",
]

# Labels corresponding to the themes in the corpus
labels = [
    "forest",  # Sentence 1
    "river",   # Sentence 2
    "flower",  # Sentence 3
    "mountain",# Sentence 4
    "rain",    # Sentence 5
    "ocean",   # Sentence 6
    "bird",    # Sentence 7
    "desert",  # Sentence 8
    "jungle",  # Sentence 9
    "lake",    # Sentence 10
]

In [81]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
import random

# Tokenization and n-gram generation
def tokenize(text):
    return text.lower().split()

def generate_ngrams(word, n=3):
    word = f"<{word}>"
    return [word[i:i+n] for i in range(len(word)-n+1)]

# Build vocabulary
def build_vocab(corpus, n=3):
    vocab = set()
    ngram_vocab = set()
    for sentence in corpus:
        tokens = tokenize(sentence)
        vocab.update(tokens)
        for token in tokens:
            ngram_vocab.update(generate_ngrams(token, n))
    return vocab, ngram_vocab

vocab, ngram_vocab = build_vocab(corpus)
vocab = {word: idx for idx, word in enumerate(vocab)}
ngram_vocab = {ngram: idx for idx, ngram in enumerate(ngram_vocab)}

# Encode sentences and labels
def encode_sentence(sentence, vocab, ngram_vocab, n=3):
    tokens = tokenize(sentence)
    word_indices = [vocab[token] for token in tokens if token in vocab]
    ngram_indices = []
    for token in tokens:
        ngram_indices.extend(
            [ngram_vocab[ngram] for ngram in generate_ngrams(token, n) if ngram in ngram_vocab]
        )
    return word_indices, ngram_indices

encoded_data = [
    encode_sentence(sentence, vocab, ngram_vocab) for sentence in corpus
]
label_to_idx = {label: idx for idx, label in enumerate(set(labels))}
encoded_labels = [label_to_idx[label] for label in labels]

train_data, test_data, train_labels, test_labels = train_test_split(
    encoded_data, encoded_labels, test_size=0.2, random_state=42
)

# Define FastText model
class FastText(nn.Module):
    def __init__(self, vocab_size, ngram_vocab_size, embedding_dim, num_classes):
        super(FastText, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.ngram_embeddings = nn.Embedding(ngram_vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, num_classes)

    def forward(self, word_indices, ngram_indices):
        word_embeds = self.word_embeddings(word_indices).mean(dim=1)
        ngram_embeds = self.ngram_embeddings(ngram_indices).mean(dim=1)
        combined = word_embeds + ngram_embeds
        return self.fc(combined)

# Training parameters
embedding_dim = 50
num_classes = len(label_to_idx)
model = FastText(len(vocab), len(ngram_vocab), embedding_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
for epoch in range(10):
    model.train()
    total_loss = 0
    for (word_indices, ngram_indices), label in zip(train_data, train_labels):
        word_tensor = torch.tensor(word_indices).unsqueeze(0)
        ngram_tensor = torch.tensor(ngram_indices).unsqueeze(0)
        label_tensor = torch.tensor([label])
        
        optimizer.zero_grad()
        output = model(word_tensor, ngram_tensor)
        loss = criterion(output, label_tensor)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# Evaluation
model.eval()
correct = 0
total = 0



Epoch 1, Loss: 19.0294
Epoch 2, Loss: 13.0412
Epoch 3, Loss: 8.8429
Epoch 4, Loss: 5.5563
Epoch 5, Loss: 3.2311
Epoch 6, Loss: 1.8025
Epoch 7, Loss: 1.0328
Epoch 8, Loss: 0.6419
Epoch 9, Loss: 0.4383
Epoch 10, Loss: 0.3242


In [82]:
with torch.no_grad():
    for (word_indices, ngram_indices), label in zip(test_data, test_labels):
        word_tensor = torch.tensor(word_indices).unsqueeze(0)
        ngram_tensor = torch.tensor(ngram_indices).unsqueeze(0)
        label_tensor = torch.tensor([label])
        
        output = model(word_tensor, ngram_tensor)
        pred = output.argmax(dim=1)
        print(pred)
        print(label_tensor)
        correct += (pred == label_tensor).sum().item()
        total += label_tensor.size(0)

print(f"Accuracy: {correct / total * 100:.2f}%")

tensor([7])
tensor([4])
tensor([9])
tensor([5])
Accuracy: 0.00%


In [83]:
test_data[0][0]

[24, 48, 5, 15, 10, 43, 11, 37]

In [84]:
for i in [52, 27, 37, 47, 42, 13, 51, 35]:
    print(vocab[i])

KeyError: 52

In [None]:
idx_to_word = {idx: word for word, idx in vocab.items()}

In [None]:
for i in [52, 27, 37, 47, 42, 13, 51, 35]:
    print(idx_to_word[i])

In [None]:
test_labels

In [None]:
idx_to_label = {idx: label for label, idx in label_to_idx.items()}

In [None]:
idx_to_label

In [None]:
label_to_idx = {label: idx for idx, label in enumerate(set(labels))}


# Real FastText

1. Setup and Preprocessing
We first tokenize the text, create character n-grams, and build mappings for the vocabulary.

In [120]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
import random

# Sample corpus
# corpus = [
#     "The forest is home to many species of animals.",
#     "Rivers provide water for plants and animals.",
#     "Flowers bloom beautifully in the spring.",
#     "The mountain peaks are covered in snow.",
#     "Rain nourishes the earth and helps crops grow.",
#     "The ocean is vast and full of marine life.",
#     "Birds migrate to warmer places in winter.",
#     "Sunsets over the desert are breathtaking.",
#     "The jungle is dense and full of life.",
#     "A calm lake reflects the surrounding trees.",
# ]

corpus = [
    "Dogs are loyal and friendly animals.",
    "Cats are independent and curious animals.",
    "Dogs and cats both make great pets.",
    "Dogs love to play in the yard.",
    "Cats love to climb and explore new places.",
    "Both dogs and cats are part of many families.",
    "Clouds drift gently across the sky.",
    "Nature is peaceful and full of life.",
    "Clouds bring rain, which nourishes nature.",
    "The beauty of clouds is part of the beauty of nature.",
    "Clouds form over mountains and forests.",
    "Nature includes the sky, the earth, and all living things.",
]


# Tokenization
def tokenize(text):
    return text.lower().split()

# Generate character n-grams
def generate_ngrams(word, n=3):
    word = f"<{word}>"
    return [word[i:i+n] for i in range(len(word)-n+1)]

# Build vocabulary and n-gram vocabulary
def build_vocab(corpus, n=3):
    word_counts = Counter()
    ngram_vocab = set()
    for sentence in corpus:
        tokens = tokenize(sentence)
        word_counts.update(tokens)
        for token in tokens:
            ngram_vocab.update(generate_ngrams(token, n))
    vocab = {word: idx for idx, word in enumerate(word_counts.keys())}
    ngram_vocab = {ngram: idx for idx, ngram in enumerate(ngram_vocab)}
    return vocab, ngram_vocab

# Build vocabularies
vocab, ngram_vocab = build_vocab(corpus)
idx_to_word = {idx: word for word, idx in vocab.items()}
idx_to_ngram = {idx: ngram for ngram, idx in ngram_vocab.items()}


2. Skip-Gram Training Data Generation
We create training samples by generating target-context pairs for each word in the corpus.

In [121]:
# Generate skip-gram pairs
def generate_skip_gram_pairs(corpus, vocab, window_size=2):
    pairs = []
    for sentence in corpus:
        tokens = tokenize(sentence)
        token_indices = [vocab[token] for token in tokens if token in vocab]
        for i, target in enumerate(token_indices):
            start = max(0, i - window_size)
            end = min(len(token_indices), i + window_size + 1)
            for context in token_indices[start:i] + token_indices[i+1:end]:
                pairs.append((target, context))
    return pairs

skip_gram_pairs = generate_skip_gram_pairs(corpus, vocab)


3. FastText Model
The model combines word and n-gram embeddings for both target and context words.

In [122]:
class FastText(nn.Module):
    def __init__(self, vocab_size, ngram_vocab_size, embedding_dim):
        super(FastText, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.ngram_embeddings = nn.Embedding(ngram_vocab_size, embedding_dim)
        self.output_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, target_idx, context_idx, ngram_indices):
        # Word embeddings
        target_word_embed = self.word_embeddings(target_idx)
        context_word_embed = self.output_embeddings(context_idx)

        # N-gram embeddings for target word
        if len(ngram_indices) > 0:
            ngram_embeds = self.ngram_embeddings(torch.tensor(ngram_indices))
            target_word_embed += ngram_embeds.mean(dim=0)

        # Return dot product
        return torch.matmul(target_word_embed, context_word_embed.T).squeeze()


4. Negative Sampling Loss
We implement the skip-gram loss with negative sampling.

In [123]:
def negative_sampling_loss(model, target_idx, context_idx, neg_indices, ngram_indices):
    # Positive score
    pos_score = model(target_idx, context_idx, ngram_indices)
    pos_loss = -torch.log(torch.sigmoid(pos_score))

    # Negative scores
    neg_scores = model(target_idx, neg_indices, ngram_indices)
    neg_loss = -torch.sum(torch.log(torch.sigmoid(-neg_scores)))

    return pos_loss + neg_loss


5. Training Loop
The model is trained using skip-gram pairs, negative sampling, and character n-grams.

In [157]:
# Parameters
embedding_dim = 100
num_neg = 5
learning_rate = 0.01
epochs = 30

# Model and optimizer
model = FastText(len(vocab), len(ngram_vocab), embedding_dim)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [160]:
# Training loop
for epoch in range(epochs):
    total_loss = 0
    random.shuffle(skip_gram_pairs)
    for target, context in skip_gram_pairs:
        # Prepare data
        target_idx = torch.tensor([target])
        context_idx = torch.tensor([context])
        neg_indices = torch.randint(0, len(vocab), (num_neg,))
        
        # Generate n-grams for target word
        target_word = idx_to_word[target]
        ngram_indices = [ngram_vocab[ngram] for ngram in generate_ngrams(target_word) if ngram in ngram_vocab]

        # Compute loss
        optimizer.zero_grad()
        loss = negative_sampling_loss(model, target_idx, context_idx, neg_indices, ngram_indices)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 574.7978
Epoch 2, Loss: 597.2410
Epoch 3, Loss: 662.0351
Epoch 4, Loss: 621.6660
Epoch 5, Loss: 554.5450
Epoch 6, Loss: 526.7131
Epoch 7, Loss: 559.4607
Epoch 8, Loss: 592.9697
Epoch 9, Loss: 560.8083
Epoch 10, Loss: 546.5706
Epoch 11, Loss: 562.0060
Epoch 12, Loss: 517.5833
Epoch 13, Loss: 548.4230
Epoch 14, Loss: 572.2907
Epoch 15, Loss: 585.7272
Epoch 16, Loss: 597.3325
Epoch 17, Loss: 563.0022
Epoch 18, Loss: 571.3578
Epoch 19, Loss: 578.0705
Epoch 20, Loss: 533.3662
Epoch 21, Loss: 561.2059
Epoch 22, Loss: 586.1269
Epoch 23, Loss: 518.6841
Epoch 24, Loss: 493.5637
Epoch 25, Loss: 500.7670
Epoch 26, Loss: 552.7023
Epoch 27, Loss: 576.5710
Epoch 28, Loss: 556.9758
Epoch 29, Loss: 553.6536
Epoch 30, Loss: 551.1399


6. Word Embeddings
After training, you can extract word embeddings directly from the model.

In [145]:
# Retrieve embeddings
word_embeddings = model.word_embeddings.weight.data

# Example: Get the embedding for "forest"
word_idx = vocab["pets."]
forest_embedding = word_embeddings[word_idx]
print(f"Embedding for 'forest': {forest_embedding}")


Embedding for 'forest': tensor([ 0.2482,  0.2730, -0.3515, -0.0534,  0.7431, -0.2644, -0.1067, -0.0041,
        -0.2399,  0.3199,  0.7748,  0.0358,  0.4299,  0.4340,  0.4636, -0.2827,
        -0.4597, -0.0702, -0.5977,  1.7149,  0.7438, -0.6372, -1.3275, -0.3866,
         1.4076,  0.8844, -0.5750, -0.5022, -0.4124,  0.9881, -0.6303, -1.5119,
         0.5574,  1.3756,  0.1139,  0.5222, -0.7006,  0.6642,  0.7499, -0.1634,
        -2.0700, -0.5422, -0.8774,  1.7897,  0.6975,  0.4746,  0.6791,  0.7973,
        -0.0936, -0.5318])


Prediction Test: Find Similar Words
We’ll predict similar words based on their embeddings using cosine similarity.

1. Define Cosine Similarity
Cosine similarity measures how close two vectors are in a high-dimensional space.

In [140]:
import torch.nn.functional as F

def cosine_similarity(embedding1, embedding2):
    return F.cosine_similarity(embedding1.unsqueeze(0), embedding2.unsqueeze(0)).item()

2. Find the Most Similar Words
Use the embeddings from the trained model to find the most similar words.

In [141]:
def find_similar_words(target_word, model, vocab, top_n=5):
    if target_word not in vocab:
        return f"'{target_word}' not in vocabulary."

    # Get the target word's embedding
    target_idx = vocab[target_word]
    target_embedding = model.word_embeddings.weight[target_idx]

    # Compute similarity with all other words
    similarities = {}
    for word, idx in vocab.items():
        if word == target_word:
            continue
        word_embedding = model.word_embeddings.weight[idx]
        similarity = cosine_similarity(target_embedding, word_embedding)
        similarities[word] = similarity

    # Sort and return top N most similar words
    sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
    return sorted_similarities[:top_n]


3. Run a Prediction Test
Use the function to find similar words for a given input.

In [147]:
# Test for similar words
test_word = "peaceful"
similar_words = find_similar_words(test_word, model, vocab, top_n=5)
print(f"Words similar to '{test_word}':")
for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")


Words similar to 'peaceful':
animals.: 0.4202
sky.: 0.4082
life.: 0.3680
of: 0.3386
forests.: 0.2859


In [119]:
# torch.save(model.word_embeddings.weight.data, "word_embeddings.pt")

## Exercise: Implement Both Algorithms

Given the corpus below, implement GloVe and FastText, and evaluate the embeddings generated:

### Corpus:
```
corpus = [
    ["machine", "learning", "is", "fun"],
    ["deep", "learning", "is", "a", "subset", "of", "machine", "learning"]
]
```
### Tasks:
- Build the co-occurrence matrix and train GloVe.
- Create n-grams and train FastText embeddings.
- Compare the embeddings for the word "learning".