In [3]:
from gensim.models.word2vec import LineSentence

## Hyperparameters

In [33]:
d = 1000
sparsity = 0.01
index_set = (-1, 1)
context_size = (2, 2)
data_path = 'data/corpusAktuell.validation copy.txt'
#data_path = '../wiki/data/wiki.de.txt'
data = LineSentence(data_path)

## Create the dictionary

In [34]:
import threading

import numpy as np
from scipy import sparse
from gensim.corpora.dictionary import Dictionary
from gensim.utils import SaveLoad

try:
    from queue import Queue, Empty
except ImportError:
    from Queue import Queue, Empty

import logging

class RandomIndexing(SaveLoad):
    
    def __init__(self, data=None, dims=1000, sparsity=0.01, index_set=(-1, 1), context_size=(2,2), workers=1):
        
        self.data = data
        self.dims = dims
        self.sparsity = sparsity
        self.index_set = index_set
        self.context_size = context_size
        self.workers = workers
        
        self.logger = logging.getLogger('RandomIndexing')
        
        self.dict = Dictionary()
        
        if data is not None:
            self.learn(data, workers)
        
    def get_index_vectors(self, wc):
        index_vectors = sparse.csr_matrix((wc, self.dims), dtype=np.int8)
        
        def urn_sampler(length):
            return np.random.choice(self.index_set, size=length)
        
        for i in xrange(wc):
            random_sparse_vector = sparse.random(1, self.dims, density = self.sparsity, data_rvs=urn_sampler)
            index_vectors[i] = random_sparse_vector
            
        return index_vectors
    
    def learn(self, data, report_every=10000):
        # add the documents to the dictionary
        old_dict_len = len(self.dict)
        self.dict.add_documents(data)
        
        if not hasattr(self, 'index_vectors'):
            self.index_vectors = self.get_index_vectors(len(self.dict))
        elif (len(self.dict) - old_dict_len) > 0:
            new_index_vectors = self.get_index_vectors(len(self.dict) - old_dict_len)
            self.index_vectors = sparse.vstack(self.index_vectors, new_index_vectors)
            
        # initialize the wordvectors if not already happened
        if not hasattr(self, 'word_vectors'):
           self.word_vectors = np.zeros((len(self.dict), self.dims), dtype=np.int32) 
        
        job_queue = Queue(maxsize=100)
        progress_queue = Queue(maxsize=self.workers)
        worker_vector_spaces = np.zeros((self.workers, len(self.dict), self.dims), dtype=np.int32)
                    
        def get_context(document, index, before, after):
            context = []
            for n in range(max(index - before, 0), index):
                context.append(document[n])
                
            for n in range(index + 1, min(index + after + 1, len(document))):
                context.append(document[n])
                
            return context
        
        def train_worker(worker_num):
            docs_processed = 0
            while True:
                document = job_queue.get()
                
                # queue is empty, kill this worker
                if document is None:
                    break;
                    
                docs_processed += 1
                
                if docs_processed % report_every == 0:
                    progress_queue.put((worker_num, 'progress', docs_processed))
                    
                #precompute document as a list of IDs in the dict 
                document_by_id = [self.dict.token2id[word] for word in document]

                for n, word in enumerate(document_by_id):
                    context_words = get_context(document_by_id, n, *self.context_size)
                    word_vec = worker_vector_spaces[worker_num, word]

                    for context in context_words:
                        index_vec = self.index_vectors[context]
                        word_vec += index_vec

                    worker_vector_spaces[worker_num, word] = word_vec
                    
            progress_queue.put((worker_num, 'done', docs_processed))
                                        
        def job_producer():
            # fill in the queue
            for document in data:
                # put blocks until a space is available
                job_queue.put(document)
                
            # put an empty element in the queue to notify the workers they are done
            for _ in xrange(self.workers):
                job_queue.put(None)
                
            
        workers = []
        workers.append(threading.Thread(target=job_producer))
        workers += [threading.Thread(target=train_worker, args=(num,)) for num in xrange(self.workers)]
        
        
        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()
        self.logger.info('started {} jobs'.format(self.workers))
        
        unfinished_jobs = self.workers
        worker_processed_documents = [0 for _ in xrange(self.workers)]
        
        while unfinished_jobs > 0:
            # handle progress updates from the workers    
            status = progress_queue.get();
            if status[1] == 'done':
                self.logger.info('job #{} finished. processed {} documents'.format(status[0], status[2]))
                unfinished_jobs -= 1
            elif status[1] == 'progress':
                worker_processed_documents[status[0]] = status[2]
                self.logger.info('processed {} documents'.format(sum(worker_processed_documents)))
        
        # add up all the worker vector spaces
        new_vectors = np.sum(worker_vector_spaces, axis=0)
        self.word_vectors = np.add(self.word_vectors, new_vectors)
                      
    def __contains__(self, word):
        return word in self.dict
    
    def __getitem__(self, word):
        return self.word_vectors[self.dict.token2id[word]]
                
    def save(self, *args, **kwargs):
        # don't bother storing the cached normalized vectors, recalculable table
        kwargs['ignore'] = kwargs.get('ignore', ['logger'])
        super(RandomIndexing, self).save(*args, **kwargs)

    save.__doc__ = SaveLoad.save.__doc__
    
    @classmethod
    def load(cls, *args, **kwargs):
        model = super(RandomIndexing, cls).load(*args, **kwargs)
        return model

In [35]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

np.random.seed(0)

model = RandomIndexing(workers= 4)
#model.learn(data)

data_path = 'data/corpusAktuell.validation.txt'
data = LineSentence(data_path)
model.learn(data)

model.save('data/ri.test.model')

print(model['trump'])

model = RandomIndexing.load('data/ri.test.model')

print(model['trump'])

2017-02-20 22:16:59,311 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-02-20 22:16:59,313 : INFO : built Dictionary(264 unique tokens: [u'wochen', u'hingegen', u'rechts', u'formal', u'regierungslager']...) from 1 documents (total 402 corpus positions)
2017-02-20 22:16:59,526 : INFO : started 4 jobs
2017-02-20 22:16:59,532 : INFO : job #2 finished. processed 0 documents
2017-02-20 22:16:59,541 : INFO : job #3 finished. processed 0 documents
2017-02-20 22:16:59,547 : INFO : job #1 finished. processed 0 documents
2017-02-20 22:16:59,831 : INFO : job #0 finished. processed 1 documents
2017-02-20 22:16:59,840 : INFO : adding document #0 to Dictionary(264 unique tokens: [u'wochen', u'hingegen', u'rechts', u'formal', u'regierungslager']...)
2017-02-20 22:16:59,852 : INFO : built Dictionary(1842 unique tokens: [u'wochen', u'ostseekusten', u'regierungslager', u'gesehen', u'protest']...) from 12 documents (total 4811 corpus positions)


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''