In [19]:
import nltk
from nltk.stem import WordNetLemmatizer
import numpy as np
from nltk.tokenize import sent_tokenize
import random
from collections import Counter
# nltk.download('punkt_tab')

In [8]:
with open('wikipedia.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()
corpus = sent_tokenize(raw_text)

In [9]:
corpus

['Dracunculiasis, also called Guinea-worm disease, is a parasitic infection by the Guinea worm, Dracunculus medinensis.',
 'A person becomes infected by drinking water contaminated with Guinea-worm larvae that reside inside copepods (a type of small crustacean).',
 'Stomach acid digests the copepod and releases the Guinea worm, which penetrates the digestive tract and escapes into the body.',
 'Around a year later, the adult female migrates to an exit site – usually the lower leg – and induces an intensely painful blister on the skin.',
 'Eventually, the blister bursts, creating a painful wound from which the worm gradually emerges over several weeks.',
 "The wound remains painful throughout the worm's emergence, disabling the affected person for the three to ten weeks it takes the worm to emerge.",
 'The female worm releases larvae when the host submerges the wound in water in attempts to relieve the pain, thus continuing the life cycle.',
 'There is no medication to treat or prevent 

In [11]:
lemmatizer = WordNetLemmatizer()
def preprocess(Corpus):
    token=[]
    for sentence in corpus:
        l=[]
        sentence=sentence.lower()
        for w in sentence.split():
            w=lemmatizer.lemmatize(w)
            if w.isalpha():# and w not in stop_words:
                l.append(w)
        token.append(l)
    return token

In [13]:
tokens=(preprocess(corpus))
vocab = set( j for i in tokens for j in i)
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}
vocab_size = len(vocab)

In [14]:
def generate_train(tokens,windowsize=2):
    data=[]
    for i in tokens:
        for idx,center in enumerate(i):
            context_range=list(range(max(0,idx-windowsize),min(len(i),idx+windowsize+1)))
            context_range.remove(idx)
            for context_idx in context_range:
                context=i[context_idx]
                data.append((center,context))
    return data

training_data = generate_train(tokens, windowsize=1)
print("Sample training pairs:", training_data[:]) 

Sample training pairs: [('also', 'called'), ('called', 'also'), ('called', 'is'), ('is', 'called'), ('is', 'a'), ('a', 'is'), ('a', 'parasitic'), ('parasitic', 'a'), ('parasitic', 'infection'), ('infection', 'parasitic'), ('infection', 'by'), ('by', 'infection'), ('by', 'the'), ('the', 'by'), ('the', 'guinea'), ('guinea', 'the'), ('guinea', 'dracunculus'), ('dracunculus', 'guinea'), ('a', 'person'), ('person', 'a'), ('person', 'becomes'), ('becomes', 'person'), ('becomes', 'infected'), ('infected', 'becomes'), ('infected', 'by'), ('by', 'infected'), ('by', 'drinking'), ('drinking', 'by'), ('drinking', 'water'), ('water', 'drinking'), ('water', 'contaminated'), ('contaminated', 'water'), ('contaminated', 'with'), ('with', 'contaminated'), ('with', 'larva'), ('larva', 'with'), ('larva', 'that'), ('that', 'larva'), ('that', 'reside'), ('reside', 'that'), ('reside', 'inside'), ('inside', 'reside'), ('inside', 'copepod'), ('copepod', 'inside'), ('copepod', 'type'), ('type', 'copepod'), ('ty

# NEGATIVE_SAMPLING:

In [26]:
word_counts=Counter([center for center,_ in training_data])
total_count=sum(word_counts.values())
#negative-Sampling Distribution
word_freqs = {word: (count / total_count) ** 0.75 for word, count in word_counts.items()}
norm_factor=sum(word_freqs.values())
word_prob={word:freq/norm_factor for word,freq in word_freqs.items()}
word_prob_list=[word_prob[word] for word in vocab]
word_list=list(vocab)


In [29]:
def generate_negative_sampling(pos_idx,num_neg=5):
    neg_sam=[]
    while len(neg_sam)<num_neg:
        sample=word2idx[random.choices(word_list,weights=word_prob_list,k=1)[0]]
        if sample !=pos_idx:
            neg_sam.append(sample)
    return neg_sam
    

In [51]:
EMBEDDIND_DIM=50
W1 = np.random.randn(vocab_size,EMBEDDIND_DIM)*0.01
W2 = np.random.randn(vocab_size,EMBEDDIND_DIM)*0.01
lr = 0.05


In [52]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [53]:
def  train_sgns(epochs=100,lr=0.05,num_neg=5):
    global W1,W2
    for epoch in range(epochs):
        total_loss=0
        random.shuffle(training_data) 
        for center,context  in training_data:
            center_idx=word2idx[center]
            context_idx=word2idx[context]
            neg_sample_idxs=generate_negative_sampling(context_idx,num_neg)
            
            v_c = W1[center_idx]          # input embedding
            u_o = W2[context_idx]         # positive output embedding
            u_ks = W2[neg_sample_idxs]    # negative output embeddings
            
            
            score_pos = sigmoid(np.dot(u_o, v_c))
            score_neg = sigmoid(-np.dot(u_ks, v_c))
            
            loss = -np.log(score_pos + 1e-9) - np.sum(np.log(score_neg + 1e-9))
            total_loss += loss
            
            grad_v_c = (score_pos - 1) * u_o + np.sum((1 - score_neg).reshape(-1, 1) * u_ks, axis=0)
            grad_u_o = (score_pos - 1) * v_c
            grad_u_ks = (1 - score_neg).reshape(-1, 1) * v_c

            W1[center_idx] -= lr * grad_v_c
            W2[context_idx] -= lr * grad_u_o
            for i, neg_idx in enumerate(neg_sample_idxs):
                W2[neg_idx] -= lr * grad_u_ks[i]

        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")
                      

In [54]:
train_sgns(epochs=200,lr=0.05,num_neg=5)

Epoch 1/200, Loss: 2769.7453
Epoch 2/200, Loss: 2768.8707
Epoch 3/200, Loss: 2763.6505
Epoch 4/200, Loss: 2734.1866
Epoch 5/200, Loss: 2656.4328
Epoch 6/200, Loss: 2542.7508
Epoch 7/200, Loss: 2423.2192
Epoch 8/200, Loss: 2314.5980
Epoch 9/200, Loss: 2221.7238
Epoch 10/200, Loss: 2133.1329
Epoch 11/200, Loss: 2059.4242
Epoch 12/200, Loss: 1986.9846
Epoch 13/200, Loss: 1925.9117
Epoch 14/200, Loss: 1862.2479
Epoch 15/200, Loss: 1816.5898
Epoch 16/200, Loss: 1776.9325
Epoch 17/200, Loss: 1736.2761
Epoch 18/200, Loss: 1677.4753
Epoch 19/200, Loss: 1653.4895
Epoch 20/200, Loss: 1617.8187
Epoch 21/200, Loss: 1575.2670
Epoch 22/200, Loss: 1525.7726
Epoch 23/200, Loss: 1482.3153
Epoch 24/200, Loss: 1443.6776
Epoch 25/200, Loss: 1414.1760
Epoch 26/200, Loss: 1359.0482
Epoch 27/200, Loss: 1302.3643
Epoch 28/200, Loss: 1292.7603
Epoch 29/200, Loss: 1236.9487
Epoch 30/200, Loss: 1207.0020
Epoch 31/200, Loss: 1191.2131
Epoch 32/200, Loss: 1156.0254
Epoch 33/200, Loss: 1106.2494
Epoch 34/200, Loss:

In [55]:
def get_embedding(word):
    idx=word2idx[word]
    embedding_vector=W1[idx]
    return embedding_vector



In [56]:
word="stick"
get_embedding(word)

array([ 0.12730081, -0.04733894,  0.44813435,  0.59995592, -0.25541219,
       -1.39030146, -0.21501393,  1.40308117, -0.27026929, -0.32534394,
        0.82516256,  0.2793382 ,  0.33673013, -0.36671548,  0.89696803,
        0.05750638, -0.79021538, -0.5079529 ,  0.5643798 , -0.25281927,
       -0.19598863,  0.3576339 , -0.53398151,  0.43728472, -0.26415383,
       -0.69851274, -0.1544669 ,  0.32351434,  0.22515842,  0.83996435,
       -0.15117261,  0.44157255, -1.62597237,  0.03109387,  0.14616883,
        0.01142669, -0.52129826, -0.80749581, -0.68639286,  0.78699795,
       -0.96529083,  0.12886452,  0.21721924, -0.54764942, -1.01903309,
        0.16575632, -0.3472307 ,  0.35443023,  0.16197742,  0.57952915])