In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
import random

In [2]:
# Read corpus from file
with open('wikipedia.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

# Convert to list of sentences (if needed)
from nltk.tokenize import sent_tokenize
import nltk
# nltk.download('punkt')  # ensure you have this

corpus = sent_tokenize(raw_text)


In [3]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [4]:
def preprocess(Corpus):
    token=[]
    for sentence in corpus:
        l=[]
        sentence=sentence.lower()
        for w in sentence.split():
            w=lemmatizer.lemmatize(w)
            if w.isalpha():# and w not in stop_words:
                l.append(w)
        token.append(l)
    return token
                

In [5]:
tokens=(preprocess(corpus))
vocab = set( j for i in tokens for j in i)
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}
vocab_size = len(vocab)
print(vocab)

{'program', 'appear', 'began', 'guinea', 'affected', 'called', 'is', 'treat', 'education', 'causing', 'larva', 'around', 'encourage', 'since', 'for', 'pain', 'across', 'severe', 'wound', 'eradicated', 'reside', 'of', 'per', 'affecting', 'to', 'or', 'successful', 'leaving', 'africa', 'much', 'type', 'place', 'spread', 'early', 'previously', 'painful', 'exit', 'on', 'break', 'dropped', 'account', 'infected', 'each', 'after', 'endemic', 'blister', 'other', 'kill', 'well', 'week', 'relieve', 'person', 'though', 'mainstay', 'can', 'eradication', 'allows', 'take', 'dracunculiasis', 'stomach', 'treatment', 'from', 'the', 'specie', 'which', 'gentle', 'medication', 'tract', 'had', 'continuing', 'over', 'ten', 'will', 'attempt', 'throughout', 'gauze', 'digestive', 'million', 'water', 'limb', 'speed', 'when', 'cause', 'acid', 'worm', 'three', 'prevention', 'gradually', 'induces', 'disabling', 'creating', 'have', 'prevent', 'digest', 'human', 'widespread', 'this', 'turned', 'year', 'wrapping', 'fa

In [6]:
def generate_train(tokens,windowsize=2):
    data=[]
    for i in tokens:
        for idx,target in enumerate(i):
            context=[]
            for j in range(max(0,idx-windowsize),min(idx+windowsize+1,len(i))):
                if j!=idx:
                    context.append(i[j])
            data.append([context,target])
        
    return data         

In [7]:
data=generate_train(tokens)
print(data)

[[['called', 'is'], 'also'], [['also', 'is', 'a'], 'called'], [['also', 'called', 'a', 'parasitic'], 'is'], [['called', 'is', 'parasitic', 'infection'], 'a'], [['is', 'a', 'infection', 'by'], 'parasitic'], [['a', 'parasitic', 'by', 'the'], 'infection'], [['parasitic', 'infection', 'the', 'guinea'], 'by'], [['infection', 'by', 'guinea', 'dracunculus'], 'the'], [['by', 'the', 'dracunculus'], 'guinea'], [['the', 'guinea'], 'dracunculus'], [['person', 'becomes'], 'a'], [['a', 'becomes', 'infected'], 'person'], [['a', 'person', 'infected', 'by'], 'becomes'], [['person', 'becomes', 'by', 'drinking'], 'infected'], [['becomes', 'infected', 'drinking', 'water'], 'by'], [['infected', 'by', 'water', 'contaminated'], 'drinking'], [['by', 'drinking', 'contaminated', 'with'], 'water'], [['drinking', 'water', 'with', 'larva'], 'contaminated'], [['water', 'contaminated', 'larva', 'that'], 'with'], [['contaminated', 'with', 'that', 'reside'], 'larva'], [['with', 'larva', 'reside', 'inside'], 'that'], [

In [8]:
def one_hot_vector(word):
    vec = np.zeros(vocab_size)
    vec[word2idx[word]] = 1
    return vec


In [9]:
embedding_dim = 10  
W1 = np.random.randn(vocab_size, embedding_dim)
W2 = np.random.randn(embedding_dim, vocab_size) 

In [10]:
[['dracunculiasis', 'will', 'the', 'second'], 'become']

[['dracunculiasis', 'will', 'the', 'second'], 'become']

In [17]:
def softmax(x):
    e_x = np.exp(x - np.max(x))  # numerical stability trick
    return e_x / e_x.sum()

In [18]:
def forward_pass(context_words):
    embedding = [one_hot_vector(i) for i in context_words]  # pass words, not indices
    x = np.mean(embedding, axis=0)  # Average context embedding
    h = np.dot(x, W1)
    u = np.dot(h, W2)
    y_pred = softmax(u)
    return x, h, u, y_pred


In [19]:
def backprop(x, h, y_pred, target_word, learning_rate=0.01):
    global W1, W2
    y_true = one_hot_vector(target_word)
    error = y_pred - y_true
    dW2 = np.outer(h, error)
    dW1 = np.outer(x, np.dot(W2, error))
    
    # global W1, W2
    W1 -= learning_rate * dW1
    W2 -= learning_rate * dW2


In [20]:
for epoch in range(1000):
    loss = 0
    for context, target in data:
        x, h, u, y_pred = forward_pass(context)
        backprop(x, h, y_pred, target)
        loss += -np.log(y_pred[word2idx[target]] + 1e-7)
    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")


Epoch 0, Loss: 2309.0970
Epoch 100, Loss: 962.7927
Epoch 200, Loss: 511.7481
Epoch 300, Loss: 268.3586
Epoch 400, Loss: 143.0857
Epoch 500, Loss: 83.8589
Epoch 600, Loss: 54.7626
Epoch 700, Loss: 39.1089
Epoch 800, Loss: 29.7237
Epoch 900, Loss: 23.6216


In [27]:
def get_embedding(word):
    return W1[word2idx[word]]


In [28]:
print("\n📌 Word Embeddings:")
for word in word2idx:
    print(f"{word:>10} : {get_embedding(word)}")


📌 Word Embeddings:
   program : [ 3.64111583 -1.47231993  3.05302788  0.91679647 -2.05311774  1.02946092
 -1.00200864  1.46569032  1.13057985 -1.51866066]
    appear : [ 1.31551717  0.83348543 -1.76556074 -0.50540898  0.52478831  0.06463102
  1.24031461 -2.99016287  0.57035074 -0.4042294 ]
     began : [-0.69000381  2.45901541  0.38549763 -0.28742727  1.46801408 -1.76531164
 -2.1724194  -1.70540226  2.69336505  0.82578977]
    guinea : [ 3.37911846  2.62448282  0.95655211  1.939482   -1.52135062  1.82997442
 -0.66074892 -0.25905448  3.73437436  0.94462536]
  affected : [ 3.99264942  0.42610958  0.56081594  2.4565443  -1.33902399 -3.33429525
  0.8796953  -1.32049329 -0.38988565 -4.79029851]
    called : [ 0.26279394 -0.30313918 -2.68482713  2.01602027 -3.09536934 -0.02581016
  0.85765583  2.08836786 -1.05685107 -2.06339633]
        is : [ 2.84988505  1.00189257 -1.75703254 -1.59361668  1.12556279  0.687441
 -4.27193456  2.38202489  4.00212469 -1.89873889]
     treat : [-0.75762835  1.8