In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
import random

In [2]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Dipak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# Read corpus from file
with open('wikipedia.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

# Convert to list of sentences (if needed)
from nltk.tokenize import sent_tokenize
import nltk
# nltk.download('punkt')  # ensure you have this

corpus = sent_tokenize(raw_text)


In [4]:
corpus

['Dracunculiasis, also called Guinea-worm disease, is a parasitic infection by the Guinea worm, Dracunculus medinensis.',
 'A person becomes infected by drinking water contaminated with Guinea-worm larvae that reside inside copepods (a type of small crustacean).',
 'Stomach acid digests the copepod and releases the Guinea worm, which penetrates the digestive tract and escapes into the body.',
 'Around a year later, the adult female migrates to an exit site – usually the lower leg – and induces an intensely painful blister on the skin.',
 'Eventually, the blister bursts, creating a painful wound from which the worm gradually emerges over several weeks.',
 "The wound remains painful throughout the worm's emergence, disabling the affected person for the three to ten weeks it takes the worm to emerge.",
 'The female worm releases larvae when the host submerges the wound in water in attempts to relieve the pain, thus continuing the life cycle.',
 'There is no medication to treat or prevent 

In [5]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [6]:
def preprocess(Corpus):
    token=[]
    for sentence in corpus:
        l=[]
        sentence=sentence.lower()
        for w in sentence.split():
            w=lemmatizer.lemmatize(w)
            if w.isalpha():# and w not in stop_words:
                l.append(w)
        token.append(l)
    return token
                

In [7]:
tokens=(preprocess(corpus))
# print(tokens)
vocab = set( j for i in tokens for j in i)
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}
vocab_size = len(vocab)
# print(vocab_size)
# print(vocab)

In [8]:
def generate_train(tokens,windowsize=2):
    data=[]
    for i in tokens:
        for idx,center in enumerate(i):
            context_range=list(range(max(0,idx-windowsize),min(len(i),idx+windowsize+1)))
            context_range.remove(idx)
            for context_idx in context_range:
                context=i[context_idx]
                data.append((center,context))
    return data

training_data = generate_train(tokens, windowsize=1)
print("Sample training pairs:", training_data[:])       
    

Sample training pairs: [('also', 'called'), ('called', 'also'), ('called', 'is'), ('is', 'called'), ('is', 'a'), ('a', 'is'), ('a', 'parasitic'), ('parasitic', 'a'), ('parasitic', 'infection'), ('infection', 'parasitic'), ('infection', 'by'), ('by', 'infection'), ('by', 'the'), ('the', 'by'), ('the', 'guinea'), ('guinea', 'the'), ('guinea', 'dracunculus'), ('dracunculus', 'guinea'), ('a', 'person'), ('person', 'a'), ('person', 'becomes'), ('becomes', 'person'), ('becomes', 'infected'), ('infected', 'becomes'), ('infected', 'by'), ('by', 'infected'), ('by', 'drinking'), ('drinking', 'by'), ('drinking', 'water'), ('water', 'drinking'), ('water', 'contaminated'), ('contaminated', 'water'), ('contaminated', 'with'), ('with', 'contaminated'), ('with', 'larva'), ('larva', 'with'), ('larva', 'that'), ('that', 'larva'), ('that', 'reside'), ('reside', 'that'), ('reside', 'inside'), ('inside', 'reside'), ('inside', 'copepod'), ('copepod', 'inside'), ('copepod', 'type'), ('type', 'copepod'), ('ty

In [9]:
def one_hot(idx,vocab_size):
    vec=np.zeros(vocab_size)
    vec[idx]=1
    return vec

In [10]:
EMBEDDIND_DIM=10
W1 = np.random.randn(vocab_size,EMBEDDIND_DIM)
W2 = np.random.randn(EMBEDDIND_DIM, vocab_size)
lr = 0.05


In [11]:
def softmax(x):
    e_x = np.exp(x - np.max(x)) 
    return e_x / e_x.sum(axis=0)

In [None]:
def train(epoch=1000):
    global W1,W2 
    for i in range(epoch):
        loss=0
        for center,context in training_data:
            center_idx=word2idx[center]
            context_idx=word2idx[context]
            
            x=one_hot(center_idx,vocab_size)
            h=np.dot(W1.T,x)
            u=np.dot(W2.T,h)
            y_pred=softmax(u)
            
            y_true=one_hot(context_idx,vocab_size)
            loss += -np.sum(y_true * np.log(y_pred + 1e-9))

            e = y_pred - y_true   # error
            dW2 = np.outer(h, e)
            dW1 = np.outer(x, np.dot(W2, e))

            W1 -= lr * dW1
            W2 -= lr * dW2

        if i % 100 == 0:
            print(f"Epoch {i}, Loss: {loss:.4f}")
            

In [13]:
train(epoch=1000)

Epoch 0, Loss: 5412.5335
Epoch 100, Loss: 1461.6839
Epoch 200, Loss: 1437.7078
Epoch 300, Loss: 1429.5790
Epoch 400, Loss: 1424.5993
Epoch 500, Loss: 1419.3579
Epoch 600, Loss: 1414.4335
Epoch 700, Loss: 1410.2821
Epoch 800, Loss: 1407.1325
Epoch 900, Loss: 1405.3946


In [14]:
def get_embedding(word):
    idx = word2idx[word]
    return W1[idx]

In [15]:
print("\n📌 Word Embeddings:")
for word in word2idx:
    print(f"{word:>10} : {get_embedding(word)}")


📌 Word Embeddings:
   copepod : [-1.94219292 -0.53091809  1.66112717 -0.9109353  -0.40742521  0.65022562
  1.30600801 -0.72903426 -0.07405321  0.35757568]
      when : [-0.56486018  1.0213674   1.60483744 -0.33200092 -2.39554341  0.47156343
  0.09432638  0.28745448 -0.29214757  1.07739333]
     treat : [-0.88751134  1.91405089  0.8388414  -1.01631521 -0.2583052   0.24202269
 -0.46986365 -1.0347917  -1.52046925 -0.32146322]
    source : [ 0.16068114  0.80190578  1.35186005 -0.94218351  0.03426037 -0.38944823
 -0.89603359 -2.1719718   1.76605064 -0.45822207]
        no : [ 0.2263338   1.13773501 -1.22539374 -1.26581755 -2.29490582 -0.47668002
 -0.29667899 -1.73220168  0.08314085 -0.47515829]
    around : [ 1.56444523 -1.34395642  1.79367082 -0.02078183 -1.00276184 -0.52823956
 -0.60670468 -0.41816968  0.89887875  1.55770113]
  document : [-1.19768667 -1.51623431 -0.92766547 -0.52100344 -0.22849979  1.06229842
 -1.38579869 -1.01160678  0.94374505 -0.40458646]
    access : [-1.18600579  0