In [7]:
from collections import defaultdict

In [9]:
# Word -> occurrence counter; defaultdict(int) makes unseen keys start at 0,
# so counts can be incremented without an explicit membership check.
d = defaultdict(int)

In [14]:
# Tokenise every corpus line on whitespace. Entries that are already token
# lists are passed through unchanged, so re-running this cell is harmless
# (previously a second run raised AttributeError: 'list' has no 'split').
T = [x.split() if isinstance(x, str) else x for x in T]

In [16]:
# Tally how often each element of the first corpus entry (T[0]) occurs.
# Only T[0] is counted, not the whole corpus.
for w in T[0]:
    d[w] += 1

In [19]:
import numpy as np

In [20]:
# 100 random integers drawn uniformly from 0..9.
x = np.random.randint(0, 10, size=100)

# Map each value greater than 5 to a position where it occurs in x.
# Duplicate keys overwrite earlier ones, so the last position wins.
dict((value, position) for position, value in enumerate(x) if value > 5)

{6: 95, 7: 90, 8: 92, 9: 87}

In [23]:
import tensorflow as tf, numpy as np

import operator

class word2vec:
    """CBOW-style word embedding model built on the TensorFlow 1.x graph API.

    The mean of the embeddings of the ``2 * window`` words surrounding a
    position is used to predict the word at that position, trained with a
    sampled loss (NCE or sampled softmax).

    Id layout produced by ``create_matrix``: vocabulary words get ids
    ``0 .. quantity - 3`` (most frequent first), ``quantity - 2`` stands for
    any out-of-vocabulary word and ``quantity - 1`` is the padding id used
    around sentence boundaries.
    """

    def __init__(self, sentences=None, size=100, embeddings=None, min_count=5, epochs=5, window=5,
                 lr=1e-1, N=None, loss='nce', num_sampled=5, threads=5, cbow=1):
        """Build the vocabulary (or take ready embeddings) and the training graph.

        Args:
            sentences: re-iterable collection of sentences; each sentence is a
                whitespace-separated string or a list of string tokens.
            size: embedding dimensionality.
            embeddings: initial embedding matrix, used together with ``N``
                when ``sentences`` is not given. No vocabulary is built in
                this mode, so methods that need ``word_to_int`` are not
                usable until ``create_matrix`` is called.
            min_count: a word is kept only if it occurs more than this many times.
            epochs: number of passes over the corpus in ``train``.
            window: number of context words taken on each side.
            lr: learning rate of the Adam optimizer.
            N: number of rows of ``embeddings`` (total number of ids).
            loss: ``'nce'`` or ``'sampled'`` (sampled softmax).
            num_sampled: number of negative classes sampled per batch.
            threads: accepted but currently unused.
            cbow: accepted but currently unused.

        Raises:
            ValueError: if ``loss`` is unknown, or if neither ``sentences``
                nor both ``embeddings`` and ``N`` are provided.
        """
        # Fail early: an unknown loss used to leave self.loss undefined and
        # surface later as an AttributeError.
        if loss not in ('nce', 'sampled'):
            raise ValueError("loss must be 'nce' or 'sampled', got {!r}".format(loss))

        self.min_count = min_count
        self.size = size
        self.window = window
        self.epochs = epochs

        # Predicate used by create_matrix to validate individual tokens.
        self.checker = lambda x: isinstance(x, str)

        if sentences is not None:
            self.sents = sentences
            self.create_matrix()
        elif embeddings is not None and N is not None:
            self.embeddings = tf.Variable(embeddings)
            self.quantity = N
        else:
            raise ValueError('provide either sentences or embedding matrix')

        # --- training graph -------------------------------------------------
        # One row per target position: ids of the 2*window surrounding words.
        self.words_indices_ph = tf.placeholder(tf.int32, [None, 2 * window])
        self.targets_indices_ph = tf.placeholder(tf.int32, [None, 1])

        self.vecs = tf.nn.embedding_lookup(self.embeddings, self.words_indices_ph)
        # Average the context vectors over the window axis.
        self.context = tf.reduce_mean(self.vecs, axis=1)

        self.weights = tf.Variable(tf.truncated_normal([self.quantity, self.size]))
        self.biases = tf.Variable(tf.truncated_normal([self.quantity]))

        # tf.nn.sampled_softmax does not exist; the API is sampled_softmax_loss.
        loss_fn = tf.nn.nce_loss if loss == 'nce' else tf.nn.sampled_softmax_loss
        self.loss = loss_fn(weights=self.weights, biases=self.biases,
                            labels=self.targets_indices_ph, inputs=self.context,
                            num_sampled=num_sampled, num_classes=self.quantity)

        self.opt = tf.train.AdamOptimizer(lr)
        self.optimizer = self.opt.minimize(tf.reduce_mean(self.loss))

        # Full softmax over all ids, for inspecting predictions.
        self.probs = tf.nn.softmax(tf.matmul(self.context, tf.transpose(self.weights)) + self.biases)
        self.indices = tf.argmax(self.probs, 1)

        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())

    @staticmethod
    def _tokenize(sentence):
        """Split a string sentence on whitespace; return anything else unchanged."""
        if isinstance(sentence, str):
            return sentence.split()
        return sentence

    def _decode_neighbours(self, indices, values):
        """Pair row ids with their scores, skipping ids that have no word (unknown/padding)."""
        return [(self.int_to_word[i], v) for i, v in zip(indices, values) if i in self.int_to_word]

    def _build_similarity_ops(self):
        """Create the graph nodes used by ``most_similar`` and ``pairwise_comparison``."""
        # Final word vectors: input embeddings plus output weights, row-normalised
        # so that a dot product between rows is a cosine similarity.
        self.end_vectors = tf.nn.l2_normalize(self.embeddings + self.weights, 1)
        # The graph previously referred to this (never defined) attribute;
        # it is kept as an alias of the normalised vectors.
        self.normed_embeddings = self.end_vectors

        self.sim_inds = tf.placeholder(tf.int32, [None], name='sim_inds_or_pairwise')
        self.best_k = tf.placeholder(tf.int32, [], name='best_k')

        # Nearest neighbours of each queried id.
        self.sim_vecs = tf.nn.embedding_lookup(self.end_vectors, self.sim_inds)
        self.cosine = tf.matmul(self.sim_vecs, tf.transpose(self.end_vectors))
        self.topk = tf.nn.top_k(self.cosine, k=self.best_k)

        # Analogy query: rows 0 and 1 are added, row 2 is subtracted.
        self.pairwise_vecs = tf.nn.embedding_lookup(self.end_vectors, self.sim_inds)
        # axis=0 keeps the result a vector; without it everything was summed
        # into a single scalar.
        self.vec = (tf.reduce_sum(tf.gather(self.pairwise_vecs, [0, 1]), axis=0)
                    - tf.gather(self.pairwise_vecs, 2))
        self.relevant = tf.matmul(tf.expand_dims(tf.nn.l2_normalize(self.vec, 0), 0),
                                  tf.transpose(self.end_vectors))
        self.topk_pair = tf.nn.top_k(self.relevant, k=self.best_k)

    def create_matrix(self, sentences=None):
        """Count words, build the word<->id mappings and a fresh embedding variable.

        Args:
            sentences: optional replacement for the stored corpus.

        Sets ``word_to_int``, ``int_to_word``, ``quantity`` and ``embeddings``.
        """
        if sentences is not None:
            self.sents = sentences

        all_words_count_dict = {}
        for sentence in self.sents:
            # Was `sent.split()`, an undefined name, for string sentences.
            words = self._tokenize(sentence)
            assert isinstance(words, list) and all(map(self.checker, words)), \
                'each sentence must be a string or a list of strings'
            for word in words:
                all_words_count_dict[word] = all_words_count_dict.get(word, 0) + 1

        # Most frequent first; the kept words form a prefix of this ranking,
        # so their ids are contiguous starting at 0.
        ranked = sorted(all_words_count_dict.items(), key=operator.itemgetter(1))[::-1]
        self.word_to_int = {word: i for i, (word, count) in enumerate(ranked)
                            if count > self.min_count}
        self.int_to_word = {i: word for word, i in self.word_to_int.items()}

        # Two extra ids: unknown word and padding.
        self.quantity = len(self.word_to_int) + 2

        self.embeddings = tf.Variable(
            tf.truncated_normal([self.quantity, self.size], mean=0, stddev=1e-1))

    def train(self, new_sentences=None, verbose=True, display=100):
        """Run ``self.epochs`` passes of training, one optimizer step per sentence.

        Args:
            new_sentences: optional sentences to train on instead of the stored
                corpus. The vocabulary is not rebuilt; words outside it are
                mapped to the unknown id.
            verbose: print epoch and sentence progress.
            display: print progress every ``display`` sentences.
        """
        sents = self.sents if new_sentences is None else new_sentences
        unknown, pad = self.quantity - 2, self.quantity - 1

        for epoch in range(self.epochs):

            if verbose:
                print("epoch", epoch+1)

            for sent_nr, sentence in enumerate(sents):

                nrs = [self.word_to_int.get(token, unknown) for token in self._tokenize(sentence)]

                # An empty sentence would produce a batch with the wrong rank.
                if nrs:
                    nrs = [pad] * self.window + nrs + [pad] * self.window
                    last = len(nrs) - self.window

                    # Context ids on both sides of every real position.
                    X = [nrs[i - self.window:i] + nrs[i + 1:i + self.window + 1]
                         for i in range(self.window, last)]
                    Y = nrs[self.window:last]
                    feed_dict = {self.words_indices_ph: np.asarray(X),
                                 self.targets_indices_ph: np.asarray(Y).reshape(-1, 1)}

                    self.sess.run(self.optimizer, feed_dict=feed_dict)

                if verbose and sent_nr % display == 0:
                    print('sentence {} of {}'.format(sent_nr, len(sents)))

        # The similarity nodes read the variables at run time, so they are
        # built once instead of growing the graph on every call to train().
        if not hasattr(self, 'topk_pair'):
            self._build_similarity_ops()

    def pairwise_comparison(self, pair_1, pair_2, topk=10):
        """Analogy query: rank words closest to ``pair_1[0] + pair_2[1] - pair_1[1]``.

        Args:
            pair_1: two words, as a list or a whitespace-separated string.
            pair_2: two words in the same form; only ``pair_2[1]`` enters the
                query vector.
            topk: number of rows requested from the ranking.

        Returns:
            List of ``(word, similarity)`` tuples, best first. The query words
            themselves are not removed from the ranking.

        Raises:
            KeyError: if any of the three query words is not in the vocabulary.
        """
        if isinstance(pair_1, str):
            pair_1 = pair_1.split()

        if isinstance(pair_2, str):
            pair_2 = pair_2.split()

        query = [self.word_to_int[pair_1[0]],
                 self.word_to_int[pair_2[1]],
                 self.word_to_int[pair_1[1]]]
        result = self.sess.run(self.topk_pair, feed_dict={self.sim_inds: query, self.best_k: topk})

        # result[0] holds similarities, result[1] the matching row ids.
        return self._decode_neighbours(result[1][0], result[0][0])

    def most_similar(self, words, topk=10):
        """Return the nearest neighbours of each word by cosine similarity.

        Args:
            words: iterable of words; a single string is treated as one word.
            topk: number of rows requested per word. The top hit is dropped
                (it is expected to be the query word itself), so at most
                ``topk - 1`` neighbours are returned per word.

        Returns:
            One list of ``(word, similarity)`` tuples per input word, in input
            order. Words outside the vocabulary get an empty list.
        """
        if isinstance(words, str):
            words = [words]
        words = list(words)

        inds = [self.word_to_int[w] for w in words if w in self.word_to_int]
        if not inds:
            return [[] for _ in words]

        result = self.sess.run(self.topk, feed_dict={self.sim_inds: inds, self.best_k: topk})
        values, indices = result[0], result[1]

        lists = []
        row = 0  # result rows exist only for in-vocabulary words
        for word in words:
            if word in self.word_to_int:
                lists.append(self._decode_neighbours(indices[row], values[row])[1:])
                row += 1
            else:
                lists.append([])

        return lists

    def __getitem__(self, key):
        """Return the current input-embedding row of ``key`` (KeyError if unknown)."""
        return self.sess.run(self.embeddings)[self.word_to_int[key], :]

In [126]:
class GloVe:
    """Simplified GloVe-style embedding model on the TensorFlow 1.x graph API.

    A word-by-word co-occurrence matrix is counted from the corpus and a
    single embedding matrix is fitted so that dot products of word vectors
    approximate the log co-occurrence counts, weighted by
    ``(count / max_count) ** 0.75``.

    Id layout produced by ``create_matrix``: vocabulary words get ids
    ``0 .. quantity - 3`` (most frequent first) and ``quantity - 2`` stands
    for any out-of-vocabulary word; the co-occurrence matrix therefore has
    ``quantity - 1`` rows and columns.
    """

    def __init__(self, sentences=None, size=100, embeddings=None, min_count=0, num_sampled=2, window=2,
                 epochs=5, lr=1e-1, cooccurence=None, batch=32):
        """Prepare the co-occurrence data and the training graph.

        Args:
            sentences: re-iterable collection of sentences (whitespace-separated
                strings or token lists).
            size: embedding dimensionality.
            embeddings: initial embedding matrix, used together with
                ``cooccurence`` when ``sentences`` is not given. No vocabulary
                is available in this mode, so ``get_cooccurence`` cannot be used.
            min_count: a word is kept only if it occurs more than this many times.
            num_sampled: accepted but currently unused.
            window: number of context words counted on each side.
            epochs: number of sweeps over the co-occurrence matrix in ``train``.
            lr: learning rate of the Adam optimizer.
            cooccurence: precomputed square co-occurrence matrix.
            batch: stored as ``self.batch``; ``train`` uses its own ``batch``
                argument.

        Raises:
            ValueError: if neither ``sentences`` nor both ``embeddings`` and
                ``cooccurence`` are provided.
        """
        self.epochs = epochs
        self.window = window
        self.min_count = min_count
        self.batch = batch

        if sentences is not None:
            self.sents = sentences
            self.create_matrix()
            # NOTE(review): quantity already includes the extra ids, so this
            # allocates a few unused rows; kept so the variable's shape does
            # not change.
            self.embeddings = tf.Variable(tf.truncated_normal([self.quantity+2, size]))

        elif embeddings is not None and cooccurence is not None:
            # The optimizer needs a trainable variable, not a plain array.
            if isinstance(embeddings, tf.Variable):
                self.embeddings = embeddings
            else:
                self.embeddings = tf.Variable(embeddings)
            self.cooccurence = np.asarray(cooccurence)
            # These were only ever set by create_matrix, which this path skips.
            self.quantity = self.cooccurence.shape[0] + 1
            self.max_occurence = tf.constant(np.float32(max(np.amax(self.cooccurence), 1)))

        else:
            raise ValueError('provide either sentences or both embeddings and cooccurence')

        self.first_word_ph = tf.placeholder(tf.int32, [None])
        self.second_word_ph = tf.placeholder(tf.int32, [None])

        self.first_word = tf.nn.embedding_lookup(self.embeddings, self.first_word_ph, name='first_words')
        self.second_word = tf.nn.embedding_lookup(self.embeddings, self.second_word_ph, name='second_words')

        # Block cut out of the co-occurrence matrix: rows follow first_word_ph,
        # columns follow second_word_ph.
        self.targets_ph = tf.placeholder('float', [None, None], name='targets')

        self.dots_products = tf.matmul(self.first_word, tf.transpose(self.second_word))

        pair_weights = tf.pow(self.targets_ph / self.max_occurence, 0.75)
        # Zero counts have weight 0, but log(0) = -inf made 0 * inf = NaN and
        # poisoned the whole loss. Clamping to 1 leaves every non-zero count
        # untouched (counts are integers >= 1) and makes zero entries contribute 0.
        log_targets = tf.log(tf.maximum(self.targets_ph, 1.0))
        self.diffs = tf.multiply(pair_weights, tf.square(self.dots_products - log_targets))
        self.loss = tf.reduce_mean(self.diffs)

        self.optimizer = tf.train.AdamOptimizer(lr).minimize(self.loss)

        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())

    @staticmethod
    def _tokenize(sentence):
        """Split a string sentence on whitespace; return anything else unchanged."""
        if isinstance(sentence, str):
            return sentence.split()
        return sentence

    def create_matrix(self, sentences=None, append=False):
        """Build the vocabulary and the co-occurrence matrix from the corpus.

        Args:
            sentences: optional replacement for the stored corpus.
            append: intended to choose between extending the previous matrix
                (True) and rebuilding it (False); currently unused, the matrix
                is always rebuilt.

        Sets ``word_to_int``, ``int_to_word``, ``quantity``, ``cooccurence``
        and ``max_occurence``.
        """
        if sentences is not None:
            self.sents = sentences

        all_words_count_dict = {}
        for sentence in self.sents:
            for word in self._tokenize(sentence):
                all_words_count_dict[word] = all_words_count_dict.get(word, 0) + 1

        # Most frequent first; kept words get contiguous ids starting at 0.
        ranked = sorted(all_words_count_dict.items(), key=operator.itemgetter(1))[::-1]
        self.word_to_int = {word: i for i, (word, count) in enumerate(ranked)
                            if count > self.min_count}
        self.int_to_word = {i: word for word, i in self.word_to_int.items()}

        self.quantity = len(self.word_to_int)+2
        unknown = self.quantity - 2

        self.cooccurence = np.zeros((self.quantity-1, self.quantity-1)).astype(int)

        for sentence in self.sents:

            nrs = [self.word_to_int.get(word, unknown) for word in self._tokenize(sentence)]

            for i, center in enumerate(nrs):
                # Slicing clips at the sentence end by itself; the former
                # min(n-1, ...) bound dropped the last word from every
                # right-hand context.
                context = nrs[max(0, i-self.window):i] + nrs[i+1:i+self.window+1]
                if context:
                    # np.add.at accumulates repeated ids; `matrix[row, ids] += 1`
                    # counts a context word only once however often it repeats.
                    np.add.at(self.cooccurence, (center, context), 1)

        # Clamp to 1 so an all-zero matrix does not cause a 0/0 in the loss weights.
        self.max_occurence = tf.constant(np.float32(max(np.amax(self.cooccurence), 1)))

    def train(self, sentences=None, append=True, randomize=False, batch=32):
        """Fit the embeddings by sweeping the co-occurrence matrix block by block.

        Training does not iterate over sentences: every epoch visits all
        ``batch x batch`` blocks of word pairs and takes one optimizer step
        per block.

        Args:
            sentences: accepted but currently unused.
            append: accepted but currently unused.
            randomize: accepted but currently unused.
            batch: block edge length (number of word ids per side).
        """
        n_rows = self.quantity - 1
        # Stepping by `batch` also visits the trailing partial block, which
        # `quantity // batch` skipped (including the whole matrix whenever the
        # vocabulary was smaller than one batch).
        starts = range(0, n_rows, batch)

        for epoch in range(self.epochs):

            for x_start in starts:

                x_inds = np.arange(x_start, min(n_rows, x_start + batch))
                for y_start in starts:

                    y_inds = np.arange(y_start, min(n_rows, y_start + batch))

                    feed_dict = {self.first_word_ph: x_inds,
                                 self.second_word_ph: y_inds,
                                 self.targets_ph: self.cooccurence[x_start:x_start + batch,
                                                                   y_start:y_start + batch]}

                    self.sess.run(self.optimizer, feed_dict=feed_dict)

    def get_cooccurence(self, word_1, word_2):
        """Return how often ``word_2`` was counted in the window of ``word_1``.

        Raises:
            KeyError: if either word is not in the vocabulary.
        """
        i, j = self.word_to_int[word_1], self.word_to_int[word_2]

        return self.cooccurence[i, j]

In [2]:
import os

In [11]:
# Read the whole corpus file and split it on newline characters, giving one
# list entry per line of the file (blank lines included at this point).
with open('ted_talks_eng_pure.txt','r') as f:
    T = f.read().split('\n')

In [12]:
# Keep only lines that contain at least one whitespace-separated token,
# i.e. drop empty and whitespace-only lines.
T = [x for x in T if len(x.split())]

In [24]:
# Build the vocabulary and the TensorFlow graph; this does not train the model.
# NOTE(review): the train() call below is commented out, but the graph nodes
# used by most_similar / pairwise_comparison are only created inside train(),
# so those queries need train() to have run first.
mod = word2vec(T, min_count=5)
# mod.train()

In [25]:
mod.vecs

<tf.Tensor 'embedding_lookup:0' shape=(?, 10, 100) dtype=float32>

In [110]:
from gensim.models.word2vec import Word2Vec

In [115]:
model = Word2Vec([x.split() for x in T])

In [119]:
model.most_similar('damage')

[('radiation', 0.700118362903595),
 ('gains', 0.6583161354064941),
 ('gain', 0.6563538312911987),
 ('oxytocin', 0.6500513553619385),
 ('prevent', 0.6465193629264832),
 ('pollution', 0.6462498903274536),
 ('skeletal', 0.6421383023262024),
 ('exposure', 0.6407454013824463),
 ('fitness', 0.6394904851913452),
 ('nutrients', 0.6326676607131958)]

In [120]:
mod.most_similar(['damage'])

[[('fix', 0.42376211),
  ('liar', 0.40842801),
  ('up', 0.38568267),
  ('gene', 0.38455483),
  ('murderer', 0.37005201),
  ('fivestar', 0.36978459),
  ('psychopathic', 0.36635658),
  ('cortex', 0.36425489),
  ('outfit', 0.3609606)]]

In [123]:
model.most_similar(['woman', 'king'], ['queen'])

[('man', 0.7578160762786865),
 ('named', 0.748059868812561),
 ('dr', 0.7321184277534485),
 ('david', 0.6964748501777649),
 ('writer', 0.6917567849159241),
 ('poet', 0.690264880657196),
 ('boy', 0.680991530418396),
 ('paul', 0.6693485975265503),
 ('professor', 0.6665024161338806),
 ('lady', 0.6614017486572266)]

In [124]:
mod.pairwise_comparison(['woman', 'man'], ['queen', 'king'])

[('mixture', 0.38606822),
 ('assessments', 0.34245074),
 ('arnold', 0.33533168),
 ('pioneering', 0.32957149),
 ('habitability', 0.32862407),
 ('downloads', 0.32733685),
 ('conferencing', 0.32668549),
 ('supply', 0.3162491),
 ('faced', 0.31500465),
 ('disappearing', 0.31416351)]