In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import nltk
import numpy as np

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/arun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
corpus= '''
I am Arun Ghimire, born and raised in Nepal.
I completed my Bachelor from IOE Pashchimanchal Campus as an Electronics and Communication Engineer.
I am currently working as a Machine Learning Engineer at a renowned technology company.
I design and develop production level Machine Learning capabilities.
'''

In [4]:
def preprocessing(corpus):
  stop_words = set(stopwords.words("english"))

  training_data = []

  sentences = corpus.split(".")

  for i in range(len(sentences)):
    sentences[i] = sentences[i].strip()

    words = sentences[i].split()

    training_words =  [word.strip(string.punctuation) for word in words if word not in stop_words]

    lowered_training_words = [word.lower() for word in training_words]

    training_data.append(lowered_training_words)

  return training_data

In [5]:
len(preprocessing(corpus))

5

In [6]:
sentence_token = preprocessing(corpus)

In [7]:
sentence_token

[['i', 'arun', 'ghimire', 'born', 'raised', 'nepal'],
 ['i',
  'completed',
  'bachelor',
  'ioe',
  'pashchimanchal',
  'campus',
  'electronics',
  'communication',
  'engineer'],
 ['i',
  'currently',
  'working',
  'machine',
  'learning',
  'engineer',
  'renowned',
  'technology',
  'company'],
 ['i',
  'design',
  'develop',
  'production',
  'level',
  'machine',
  'learning',
  'capabilities'],
 []]

In [8]:
def generate_vocab_index(sentences):
  data = list(set(word for sentence in sentences for word in sentence))
  data = sorted(data)
  vocab = {}
  for i in range(len(data)):
    vocab[data[i]] = i
  return vocab ,data

In [9]:
vocab_index, data = generate_vocab_index(sentence_token)

In [10]:
vocab_index

{'arun': 0,
 'bachelor': 1,
 'born': 2,
 'campus': 3,
 'capabilities': 4,
 'communication': 5,
 'company': 6,
 'completed': 7,
 'currently': 8,
 'design': 9,
 'develop': 10,
 'electronics': 11,
 'engineer': 12,
 'ghimire': 13,
 'i': 14,
 'ioe': 15,
 'learning': 16,
 'level': 17,
 'machine': 18,
 'nepal': 19,
 'pashchimanchal': 20,
 'production': 21,
 'raised': 22,
 'renowned': 23,
 'technology': 24,
 'working': 25}

In [11]:
window_size = 2
X_train = []
y_train = []

for sentence in sentence_token:
  for i in range(len(sentence)):
    center_word = [0 for x in range(len(vocab_index))]
    center_word[vocab_index[sentence[i]]] = 1
    context = [0 for x in range(len(vocab_index))]

    for j in range(i-window_size,i+window_size+1):
      if i!=j and j>=0 and j<len(sentence):
        context[vocab_index[sentence[j]]] += 1
    X_train.append(context)
    y_train.append(center_word)

In [12]:
X_train

[[1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0],
 [1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0],
 [0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0],
 [0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0

In [13]:
y_train

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0

In [14]:
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

In [15]:
class CBOWWord2Vec:
    def __init__(self, words, word_index):
        self.N = 10
        self.alpha = 0.001
        self.words = words
        self.word_index = word_index
        self.initialize_weight()

    def initialize_weight(self):
        self.W = np.random.uniform(-0.8, 0.8, (len(self.words), self.N))
        self.W1 = np.random.uniform(-0.8, 0.8, (self.N, len(self.words)))

    def feed_forward(self, X):
        self.h = np.dot(self.W.T, X).reshape(self.N,1)
        self.u = np.dot(self.W1.T, self.h)
        self.y = softmax(self.u)
        return self.y

    # def backpropagate(self, context, center_word):
    def backpropagate(self, x, t):
        e = self.y - np.asarray(t).reshape(len(self.words),1)
        # e.shape is V x 1
        dLdW1 = np.dot(self.h,e.T)
        X = np.array(x).reshape(len(self.words),1)
        dLdW = np.dot(X, np.dot(self.W1,e).T)
        self.W1 = self.W1 - self.alpha*dLdW1
        self.W = self.W - self.alpha*dLdW

    def train(self,epochs,x_train, y_train):
        for x in range(1,epochs):
          self.loss = 0
          for j in range(len(x_train)):
            self.feed_forward(x_train[j])
            self.backpropagate(x_train[j],y_train[j])
            C = 0
            for m in range(len(self.words)):
              if(y_train[j][m]):
                self.loss += -1*self.u[m][0]
                C += 1
            self.loss += C*np.log(np.sum(np.exp(self.u)))
          print("epoch ",x, " loss = ",self.loss)
          self.alpha *= 1/( (1+self.alpha*x) )

    def predict(self, context_words, number_of_predictions):
        context_vector = [0 for i in range(len(self.words))]
        for word in context_words:
            if word in self.words:
                index = self.word_index[word]
                context_vector[index] = 1
        # context_indices[i] = [self.word_index[word] for word in self.words if word in self.words]
        
        if context_vector and any(item == 1 for item in context_vector):
            prediction = self.feed_forward(context_vector)
            output = {}
            for i in range(len(self.words)):
                output[prediction[i][0]] = i
    
            top_focus_words = []
            for k in sorted(output, reverse=True):
                top_focus_words.append(self.words[output[k]])
                if len(top_focus_words) >= number_of_predictions:
                    break
    
            return top_focus_words
        else:
            print("Context words not found in dictionary")

In [16]:
w2v = CBOWWord2Vec(
    words= data,
    word_index = vocab_index
)

w2v.train(
    epochs = 10,
    x_train = X_train,
    y_train = y_train
)

epoch  1  loss =  130.8846068925503
epoch  2  loss =  130.4014874895394
epoch  3  loss =  129.92326359136354
epoch  4  loss =  129.45031040417683
epoch  5  loss =  128.98298331276763
epoch  6  loss =  128.52161625310345
epoch  7  loss =  128.0665203201461
epoch  8  loss =  127.61798262393275
epoch  9  loss =  127.17626540164149


In [17]:
print(w2v.predict(["around", "the", "globe"], 3))

Context words not found in dictionary
None


In [18]:
print(w2v.predict(['arun', 'ghimire', 'raised', 'nepal'], 3))

['electronics', 'bachelor', 'raised']
