In [31]:
import numpy as np
import math
import re
from collections import Counter
from collections import defaultdict

# Pre-Processing

In [22]:
def read_corpus(file_path):
    with open(file_path,'r',encoding='utf-8') as f:
        text=f.read()
    return text
def tokenize(text):
    text=text.lower()
    text=re.sub(r'[^a-z\s]', '', text)
    tokens=text.split()
    return tokens

In [23]:
file_path='wikipedia.txt'
raw_text=read_corpus(file_path)
tokens=tokenize(raw_text)

In [41]:
word_counts=Counter(tokens)
vocab={word:i for i, word in enumerate(word_counts)} #{'apple': 0, 'banana': 1, 'orange': 2}
inv_vocab = {i: word for word, i in vocab.items()}
vocab_size=sum([1 for i in dict(word_counts).keys() ])# i like it the cpmlex way...just for experiment

In [None]:
WINDOW_SIZE = 5
cooccurrence = defaultdict(lambda: defaultdict(float))  # cooccurrence[i][j]

for i, word in enumerate(tokens):
    word_i = vocab[word]
    start = max(i - WINDOW_SIZE, 0)
    end = min(i + WINDOW_SIZE + 1, len(tokens))  

    for j in range(start, end):
        if i != j:
            word_j = vocab[tokens[j]]
            distance = abs(i - j)
            cooccurrence[word_i][word_j] += 1.0


In [43]:
embedding_dim = 50 
W = np.random.randn(vocab_size, embedding_dim) / np.sqrt(embedding_dim)  # word vector
W_tilde = np.random.randn(vocab_size, embedding_dim) / np.sqrt(embedding_dim)  # context vector
b = np.zeros(vocab_size)           # word bias
b_tilde = np.zeros(vocab_size)

#adagrad Acumulator
gradsq_W = np.ones_like(W)
gradsq_W_tilde = np.ones_like(W_tilde)
gradsq_b = np.ones_like(b)
gradsq_b_tilde = np.ones_like(b_tilde)

In [44]:
def f(x, x_max=100, alpha=0.75):
    return (x / x_max) ** alpha if x < x_max else 1

In [45]:
#Training
epochs=50
learning_rate=0.05
for epoch in range(epochs):
    total_loss=0
    for i in cooccurrence:
        for j in cooccurrence[i]:
            x_ij=cooccurrence[i][j]
            weight=f(x_ij)
            
            
        #loss function
        dot=np.dot(W[i],W_tilde[j])
        loss=(dot+b[i]+b_tilde[j]-np.log(x_ij))
        
        total_loss+=0.5*weight*(loss**2)
        
        
        grad_common = weight * loss

        grad_Wi = grad_common * W_tilde[j]
        grad_Wj = grad_common * W[i]
        grad_bi = grad_common
        grad_bj = grad_common
        
        W[i] -= (learning_rate * grad_Wi) / np.sqrt(gradsq_W[i])
        W_tilde[j] -= (learning_rate * grad_Wj) / np.sqrt(gradsq_W_tilde[j])
        b[i] -= (learning_rate * grad_bi) / np.sqrt(gradsq_b[i])
        b_tilde[j] -= (learning_rate * grad_bj) / np.sqrt(gradsq_b_tilde[j])
        
        gradsq_W[i] += grad_Wi ** 2
        gradsq_W_tilde[j] += grad_Wj ** 2
        gradsq_b[i] += grad_bi ** 2
        gradsq_b_tilde[j] += grad_bj ** 2
        
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")
        

Epoch 1, Loss: 0.0683
Epoch 2, Loss: 0.0674
Epoch 3, Loss: 0.0665
Epoch 4, Loss: 0.0655
Epoch 5, Loss: 0.0646
Epoch 6, Loss: 0.0638
Epoch 7, Loss: 0.0629
Epoch 8, Loss: 0.0620
Epoch 9, Loss: 0.0612
Epoch 10, Loss: 0.0603
Epoch 11, Loss: 0.0595
Epoch 12, Loss: 0.0587
Epoch 13, Loss: 0.0579
Epoch 14, Loss: 0.0571
Epoch 15, Loss: 0.0564
Epoch 16, Loss: 0.0556
Epoch 17, Loss: 0.0549
Epoch 18, Loss: 0.0541
Epoch 19, Loss: 0.0534
Epoch 20, Loss: 0.0527
Epoch 21, Loss: 0.0520
Epoch 22, Loss: 0.0513
Epoch 23, Loss: 0.0506
Epoch 24, Loss: 0.0500
Epoch 25, Loss: 0.0493
Epoch 26, Loss: 0.0486
Epoch 27, Loss: 0.0480
Epoch 28, Loss: 0.0474
Epoch 29, Loss: 0.0467
Epoch 30, Loss: 0.0461
Epoch 31, Loss: 0.0455
Epoch 32, Loss: 0.0449
Epoch 33, Loss: 0.0443
Epoch 34, Loss: 0.0438
Epoch 35, Loss: 0.0432
Epoch 36, Loss: 0.0426
Epoch 37, Loss: 0.0421
Epoch 38, Loss: 0.0415
Epoch 39, Loss: 0.0410
Epoch 40, Loss: 0.0405
Epoch 41, Loss: 0.0399
Epoch 42, Loss: 0.0394
Epoch 43, Loss: 0.0389
Epoch 44, Loss: 0.03

In [50]:
word_embeddings = (W + W_tilde)/2  # final embeddings as per paper

In [None]:
def most_similar(word, top_n=5):
    if word not in vocab:
        return []

    idx = vocab[word]
    vec = word_embeddings[idx]
    norms = np.linalg.norm(word_embeddings, axis=1)
    vec_norm = np.linalg.norm(vec)

    sims = word_embeddings @ vec / (norms * vec_norm + 1e-10)
    top_idxs = np.argsort(-sims)[:top_n+1]  

    return [inv_vocab[i] for i in top_idxs if i != idx][:top_n]

# Example:
print(most_similar("reptiles"))


['emerges', 'an', 'larvae', 'much', 'species']
