## word embedding hand made
- CBOW (contexts -> center word) or Skip-gram (center word -> contexts)
- negative sampling


The implementation is mainly for demonstration and understanding. It is slow and does not take much advantage of the GPU, because it trains word by word.

In [1]:
%matplotlib inline

In [25]:
import theano.tensor as T
import theano
from theano import shared, function
floatX = theano.config.floatX

import numpy as np

In [7]:
## show which device Theano is configured to run on
theano.config.device

'gpu'

In [231]:
## read data, take 80000 common words

import re
from collections import Counter

text8 = open("../data/text8").read()
words = re.compile("\w+").findall(text8)
common_words, _ = zip(*Counter(words).most_common(10000))
common_words = set(common_words)
words = [w if w in common_words else "UNKNOWN" for w in words]

vocab = list(set(words))
ind2word = dict([(i,w) for i,w in enumerate(vocab)])
word2ind = dict([(w,i) for i,w in enumerate(vocab)])
vocab_size = len(vocab)

data = map(word2ind.get, words)

print len(words), len(vocab)

17005207 10001


In [232]:
## noise distribution used to draw negative samples:
## raw unigram counts raised to the 3/4 power, then normalised to sum to 1
from collections import Counter
wc = Counter(words)
N = len(words)
raw_counts = np.array([wc[ind2word[i]]
                       for i in xrange(len(vocab))])
smoothed_counts = np.power(raw_counts, 3./4)
sample_probs = smoothed_counts / smoothed_counts.sum()

In [233]:
## sampling methods for continuous bag of words (CBOW) and skip-gram
def sample(data, sample_probs, vocab_size, window_size):
    """Yield training triples forever, walking through ``data`` in order.

    The centre position starts at ``window_size``, advances by one per
    yield, and jumps back to ``window_size`` once the right-hand window
    would run past the end of ``data``.

    Args:
        data: list of word ids (slices of it are concatenated with ``+``).
        sample_probs: per-id probabilities, passed as ``p`` to
            ``np.random.choice`` when drawing negatives.
        vocab_size: number of ids negatives are drawn from.
        window_size: number of context words taken on each side.

    Yields:
        ``(w, positives, negatives)``: the centre id, an array with the ids
        surrounding it, and an array of ``2 * window_size`` distinct ids
        drawn without replacement.
    """
    n_tokens = len(data)
    n_negatives = 2 * window_size
    center = window_size ## first position that has a full left window
    while True:
        w = data[center]
        left_context = data[center - window_size:center]
        right_context = data[center + 1:center + window_size + 1]
        positives = np.array(left_context + right_context)
        negatives = np.random.choice(vocab_size, n_negatives,
                                     replace = False, p = sample_probs)
        yield (w, positives, negatives)
        ## step forward, or restart when the right window would leave the data
        following = center + 1
        center = following if following + window_size < n_tokens else window_size
            
def random_sample(data, sample_probs, vocab_size, window_size):
    """Yield training triples forever, picking the centre position at random.

    Every position that has ``window_size`` words on both sides can be
    chosen as the centre, including the last such position.

    Args:
        data: list of word ids (slices of it are concatenated with ``+``).
        sample_probs: per-id probabilities, passed as ``p`` to
            ``np.random.choice`` when drawing negatives.
        vocab_size: number of ids negatives are drawn from.
        window_size: number of context words taken on each side.

    Yields:
        ``(w, positives, negatives)``: the centre id, an array with the
        ``2 * window_size`` ids surrounding it, and an array of
        ``2 * window_size`` distinct ids drawn without replacement.

    Raises:
        ValueError: on the first ``next()`` if ``data`` is too short to
            contain a single full window.
    """
    N = len(data)
    if N < 2 * window_size + 1:
        raise ValueError("data has %d items, need at least %d for window_size=%d"
                         % (N, 2 * window_size + 1, window_size))

    while True:
        ## randint's upper bound is exclusive: N - window_size keeps the last
        ## centre with a full right window (N - window_size - 1) reachable
        iword = np.random.randint(window_size, N - window_size)
        w = data[iword]
        positives = np.array(data[iword-window_size:iword]
                     + data[iword+1:iword+window_size+1])
        negatives = np.random.choice(vocab_size, 2*window_size,
                                     p = sample_probs, replace = False)
        yield (w, positives, negatives)

In [234]:
#floatX = "float64" ## CPU
floatX = theano.config.floatX

class CBOW(object):
    """Word-embedding model trained one centre word at a time with negative sampling.

    Two matrices of shape ``(vocab_size, word_dim)`` are learnt, where
    ``vocab_size`` and ``floatX`` are read from module scope:

    * ``U`` -- one row per word when it is the centre word,
    * ``V`` -- one row per word when it is a context / negative word.

    For a centre word ``w`` the loss is::

        mean_p -log(sigmoid( V[p] . U[w])) + mean_n -log(sigmoid(-V[n] . U[w]))

    over the positive ids ``p`` and the negative ids ``n``.

    NOTE(review): every context word is scored against the centre word
    separately (the context vectors are not averaged), which looks closer to
    the skip-gram objective than to CBOW -- confirm the intended naming.

    Attributes:
        train: compiled function ``train(word, positives, negatives)`` that
            returns the loss and applies one gradient-descent step to U and V.
        get_word_vectors: compiled function returning ``[U, V]``.
    """

    def __init__(self, window_size, word_dim, lr):
        """Build the symbolic graph and compile the Theano functions.

        Args:
            window_size: one-sided context size; only stored on the instance.
            word_dim: dimensionality of the word vectors.
            lr: learning rate of the gradient-descent updates.
        """
        self.window_size = window_size
        self.word_dim = word_dim
        self.lr = lr ## learning rate

        ## small random initialisation; the cast happens after the scaling so
        ## the shared variables really end up with dtype floatX
        scale = np.sqrt(vocab_size)
        ## center word matrix
        U = shared((np.random.randn(vocab_size, word_dim) / scale).astype(floatX),
                   name = "U")
        ## context word matrix
        V = shared((np.random.randn(vocab_size, word_dim) / scale).astype(floatX),
                   name = "V")

        ## inputs
        word = T.lscalar(name = "word")
        positives = T.lvector(name = "positives")
        negatives = T.lvector(name = "negatives")

        ## one score per context word / per negative word
        center_vec = U[word, :]
        positive_scores = V[positives, :].dot(center_vec)
        negative_scores = -V[negatives, :].dot(center_vec)

        ## -log(sigmoid(x)) == softplus(-x); softplus does not overflow the way
        ## an explicit log(1 / (1 + exp(-x))) can for large |x|
        data_loss = (T.nnet.softplus(-positive_scores).mean()
                     + T.nnet.softplus(-negative_scores).mean())
        loss = data_loss ## ignore regularization for simplicity

        ## gradients are taken w.r.t. the whole matrices, so every call to
        ## train rewrites all of U and V
        dU = T.grad(loss, U)
        dV = T.grad(loss, V)

        self.train = function(inputs = [word, positives, negatives],
                              outputs = loss,
                              updates = [(U, U-lr*dU)
                                        , (V, V-lr*dV)])
        self.get_word_vectors = function(inputs = [], outputs = [U, V])

In [235]:
window_size = 5
word_dim = 100
lr = 0.1


cbow = CBOW(window_size, word_dim, lr)
print cbow.train(word = 0, positives = range(1, 6), negatives=range(1, 6)), -np.log(.5) * 2 # for both positive and negative


In [249]:
sampler = random_sample(data, sample_probs, vocab_size, window_size)

print "test with initialization"

iword = 0
total_loss = 0
after_n_words = 50000
iteration = 0

while True:
    if iword + window_size >= len(data):
        iteration += 1
        iword = 0
    if iword % after_n_words == 0:
        print iteration, iword, total_loss / after_n_words
        total_loss = 0
    iword += 1
    word, positives, negatives = sampler.next()
    loss = cbow.train(word, positives, negatives)
    total_loss += loss
    if iteration >= 2: break

test with initialization
0 0 0
0 50000 1.22511088363
0 100000 1.22330685316
0 150000 1.22278726044
0 200000 1.22251286374
0 250000 1.22094391867
0 300000 1.2214876091
0 350000 1.219557284


KeyboardInterrupt: 

In [250]:
## fetch the learnt matrices and scale every centre-word vector (row) to
## unit L2 norm
from sklearn.preprocessing import normalize
word_vecs, context_vecs = cbow.get_word_vectors()
word_vecs = np.asarray(word_vecs)
normalized_word_vecs = normalize(word_vecs, norm = "l2", axis = 1)

In [251]:
v = normalized_word_vecs[word2ind["nine"]]
i = normalized_word_vecs.dot(v).argsort()[-10:]
print map(ind2word.get, i)

['october', 'july', 'august', 'december', 'april', 'six', 'isbn', 'seven', 'eight', 'nine']


In [252]:
v = normalized_word_vecs[word2ind["king"]]
i = normalized_word_vecs.dot(v).argsort()[-10:]
print map(ind2word.get, i)

['duke', 'son', 'charles', 'henry', 'pope', 'mary', 'prince', 'iii', 'emperor', 'king']


In [253]:
v = normalized_word_vecs[word2ind["london"]]
i = normalized_word_vecs.dot(v).argsort()[-10:]
print map(ind2word.get, i)

['wales', 'academy', 'irish', 'revolution', 'poland', 'league', 'spain', 'scotland', 'founded', 'london']


In [254]:
v1 = normalized_word_vecs[word2ind["king"]]
v2 = normalized_word_vecs[word2ind["man"]]
v3 = normalized_word_vecs[word2ind["woman"]]
i = normalized_word_vecs.dot(v1-v2+v3).argsort()[-10:]
print map(ind2word.get, i)

['thomas', 'davis', 'friedrich', 'iv', 'champion', 'ca', 'johnson', 'rd', 'architect', 'duke']


The analogy result is not good because the training was not sufficient (the training run above was interrupted early).