In [28]:
import io
import os
import sys
import requests
from collections import OrderedDict 
import math
import random
import numpy as np
from collections import defaultdict

In [29]:
def load_text8():
    with open("./text8.txt", "r") as f:
        corpus = f.read().strip("\n")
    f.close()

    return corpus

In [30]:
def data_preprocess(corpus):
    #由于英文单词出现在句首的时候经常要大写，所以我们把所有英文字符都转换为小写，
    #以便对语料进行归一化处理（Apple vs apple等）
    corpus = corpus.strip().lower()
    corpus = corpus.split(" ")

    return corpus


# #构造词典，统计每个词的频率，并根据频率将每个词转换为一个整数id
# def build_dict(corpus):
#     #首先统计每个不同词的频率（出现的次数），使用一个词典记录
#     word_freq_dict = dict()
#     for word in corpus:
#         if word not in word_freq_dict:
#             word_freq_dict[word] = 0
#         word_freq_dict[word] += 1

#     #将这个词典中的词，按照出现次数排序，出现次数越高，排序越靠前
#     #一般来说，出现频率高的高频词往往是：I，the，you这种代词，而出现频率低的词，往往是一些名词，如：nlp
#     word_freq_dict = sorted(word_freq_dict.items(), key = lambda x:x[1], reverse = True)
    
#     #构造3个不同的词典，分别存储，
#     #每个词到id的映射关系：word2id_dict
#     #每个id出现的频率：word2id_freq
#     #每个id到词典映射关系：id2word_dict
#     word2id_dict = dict()
#     word2id_freq = dict()
#     id2word_dict = dict()

#     #按照频率，从高到低，开始遍历每个单词，并为这个单词构造一个独一无二的id
#     for word, freq in word_freq_dict:
#         curr_id = len(word2id_dict)
#         word2id_dict[word] = curr_id
#         word2id_freq[word2id_dict[word]] = freq
#         id2word_dict[curr_id] = word

#     return word2id_freq, word2id_dict, id2word_dict


In [31]:
# corpus = data_preprocess(corpus)
# print(corpus[:50])
# word2id_freq, word2id_dict, id2word_dict = build_dict(corpus)
# vocab_size = len(word2id_freq)
# print("there are totoally %d different words in the corpus" % vocab_size)
# for _, (word, word_id) in zip(range(50), word2id_dict.items()):
#     print("word %s, its id %d, its word freq %d" % (word, word_id, word2id_freq[word_id]))

In [32]:
class word2vec():
    def __init__ (self):
        self.n = settings['n']
        self.eta = settings['learning_rate']
        self.epochs = settings['epochs']
        self.window = settings['window_size']
        pass
    
    
    # GENERATE TRAINING DATA
    def generate_training_data(self,corpus):

        # GENERATE WORD COUNTS
        word_counts = defaultdict(int)
        for row in corpus:
            for word in row:
                word_counts[word] += 1

        self.v_count = len(word_counts.keys())

        # GENERATE LOOKUP DICTIONARIES
        self.words_list = sorted(list(word_counts.keys()),reverse=False)
        self.word_index = dict((word, i) for i, word in enumerate(self.words_list))
        self.index_word = dict((i, word) for i, word in enumerate(self.words_list))
        training_data = []
        # CYCLE THROUGH EACH SENTENCE IN CORPUS
        for sentence in corpus:
            sent_len = len(sentence)

            # CYCLE THROUGH EACH WORD IN SENTENCE
            for i, word in enumerate(sentence):
                
                #w_target = sentence[i]
                w_target = np.array(self.word2onehot(sentence[i]))

                # CYCLE THROUGH CONTEXT WINDOW
                w_context = np.array([0 for i in range(0, self.v_count)])
                num=0
                for j in range(i-self.window, i+self.window+1):
                    if j!=i and j<=sent_len-1 and j>=0:
                        w_context+=np.array(self.word2onehot(sentence[j]))
                        num+=1
                training_data.append([w_target, w_context/num])
        return np.array(training_data)


    # SOFTMAX ACTIVATION FUNCTION
    def softmax(self, x):
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)


    # CONVERT WORD TO ONE HOT ENCODING
    def word2onehot(self, word):
        word_vec = [0 for i in range(0, self.v_count)]
        word_index = self.word_index[word]
        word_vec[word_index] = 1
        return word_vec


    # FORWARD PASS
    def forward_pass(self, x):
        h = np.dot(self.w1.T, x)
        u = np.dot(self.w2.T, h)
        y_c = self.softmax(u)
        return y_c, h, u
                

    # BACKPROPAGATION
    def backprop(self, e, h, x):
        dl_dw2 = np.outer(h, e)  
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))

        # UPDATE WEIGHTS
        self.w1 = self.w1 - (self.eta * dl_dw1)
        self.w2 = self.w2 - (self.eta * dl_dw2)
        pass


    # TRAIN W2V model
    def train(self, training_data):
        # INITIALIZE WEIGHT MATRICES
        self.w1 = np.random.uniform(-0.8, 0.8, (self.v_count, self.n))     # embedding matrix
        self.w2 = np.random.uniform(-0.8, 0.8, (self.n, self.v_count))     # context matrix
        
        # CYCLE THROUGH EACH EPOCH
        for i in range(0, self.epochs):

            self.loss = 0

            # CYCLE THROUGH EACH TRAINING SAMPLE
            for j in training_data:
                w_c = j[1]
                w_t = j[0]
                # FORWARD PASS
                y_pred, h, u = self.forward_pass(w_c)
                
                # CALCULATE ERROR
                EI = y_pred - w_t

                # BACKPROPAGATION
                self.backprop(EI, h, w_t)

                # CALCULATE LOSS
                self.loss += np.max(y_pred)
                #self.loss += -2*np.log(len(w_c)) -np.sum([u[word.index(1)] for word in w_c]) + (len(w_c) * np.log(np.sum(np.exp(u))))

            print ('EPOCH:',i, 'LOSS:', self.loss)
        # print(self.w1)
        # print(self.w2)


    # input a word, returns a vector (if available)
    def word_vec(self, word):
        w_index = self.word_index[word]
        v_w = self.w1[w_index]
        return self.softmax(v_w)



    # input a vector, returns nearest word(s)
    def vec_sim(self, vec, top_n):

        # CYCLE THROUGH VOCAB
        word_sim = {}
        for i in range(self.v_count):
            v_w2 = self.w1[i]
            theta_num = np.dot(vec, v_w2)
            theta_den = np.linalg.norm(vec) * np.linalg.norm(v_w2)
            theta = theta_num / theta_den

            word = self.index_word[i]
            word_sim[word] = theta

        words_sorted = sorted(word_sim.items(), key=lambda kv :kv[1] , reverse=True)

        for word, sim in words_sorted[:top_n]:
            print (word, sim)
            
        pass

    # input word, returns top [n] most similar words
    def word_sim(self, word, top_n):
        
        w1_index = self.word_index[word]
        v_w1 = self.w1[w1_index]

        # CYCLE THROUGH VOCAB
        word_sim = {}
        for i in range(self.v_count):
            v_w2 = self.w1[i]
            theta_num = np.dot(v_w1, v_w2)
            theta_den = np.linalg.norm(v_w1) * np.linalg.norm(v_w2)
            theta = theta_num / theta_den

            word = self.index_word[i]
            word_sim[word] = theta

        words_sorted = sorted(word_sim.items(), key=lambda kv :kv[1] , reverse=True)

        for word, sim in words_sorted[:top_n]:
            print (word, sim)
            
        pass

In [33]:
settings = {}
settings['n'] = 20                 # dimension of word embeddings
settings['window_size'] = 2         # context window +/- center word
settings['epochs'] = 300       # number of training epochs
settings['neg_samp'] = 10           # number of negative words to use during training
settings['learning_rate'] = 0.1    # learning rate
np.random.seed(0)                   # set the seed for reproducibility

In [34]:
corpus = load_text8()
corpus = [data_preprocess(corpus[:10000])]
# print(corpus)

In [35]:
w2v = word2vec()

training_data = w2v.generate_training_data(corpus)

w2v.train(training_data)

EPOCH: 0 LOSS: 55.31009358492388
EPOCH: 1 LOSS: 141.45011712647894
EPOCH: 2 LOSS: 223.42116476605153
EPOCH: 3 LOSS: 289.5856682138619
EPOCH: 4 LOSS: 348.5551096161694
EPOCH: 5 LOSS: 399.96460666980187
EPOCH: 6 LOSS: 442.0149012149577
EPOCH: 7 LOSS: 472.94429878122514
EPOCH: 8 LOSS: 509.7104456417818
EPOCH: 9 LOSS: 546.5729570769716
EPOCH: 10 LOSS: 577.5777754655145
EPOCH: 11 LOSS: 600.0240127719813
EPOCH: 12 LOSS: 612.6819109883329
EPOCH: 13 LOSS: 633.6250620494292
EPOCH: 14 LOSS: 659.2464257200375
EPOCH: 15 LOSS: 684.1533967353755
EPOCH: 16 LOSS: 697.6498154881535
EPOCH: 17 LOSS: 722.7551773308976
EPOCH: 18 LOSS: 734.7615626267316
EPOCH: 19 LOSS: 765.0605050190073
EPOCH: 20 LOSS: 764.6785517659976
EPOCH: 21 LOSS: 785.1556726890545
EPOCH: 22 LOSS: 784.449440973969
EPOCH: 23 LOSS: 797.8221859836378
EPOCH: 24 LOSS: 802.5830065485978
EPOCH: 25 LOSS: 822.0444825658482
EPOCH: 26 LOSS: 837.8661529776679
EPOCH: 27 LOSS: 854.7421973729838
EPOCH: 28 LOSS: 850.1737486815
EPOCH: 29 LOSS: 858.2166

In [36]:
print(w2v.word_vec('liberty'))

print(w2v.word_sim('free',3))

[1.15310406e-08 6.71997606e-12 7.90544248e-09 2.09505141e-02
 3.01808844e-09 1.19278395e-07 5.86240899e-01 1.86539216e-13
 3.92752523e-01 1.70656192e-10 3.60911077e-11 1.06090043e-05
 6.17848159e-11 3.53067709e-08 3.23412595e-14 1.49427662e-12
 7.54634302e-15 4.16399784e-09 4.50506427e-05 2.22658538e-07]
free 0.9999999999999999
did 0.5575728091186204
truly 0.5340838911766466
None
