In [1]:
from gensim.models.word2vec import LineSentence

## Hyperparameters

In [26]:
d = 1000
sparsity = 0.01
index_set = (-1, 1)
context_size = (2, 2)
data_path = 'data/corpusAktuell.validation copy.txt'
#data_path = '../wiki/data/wiki.de.txt'
data = LineSentence(data_path)

## Create the dictionary

In [34]:
import numpy as np
from scipy import sparse
from gensim.corpora.dictionary import Dictionary
from gensim.utils import SaveLoad

import logging

class RandomIndexing(SaveLoad):
    
    def __init__(self, data, dims, sparsity, index_set, context_size):
        
        self.data = data
        self.dims = dims
        self.sparsity = sparsity
        self.index_set = index_set
        self.context_size = context_size
        
        self.create_dict(data)
        self.initialize_word_vectors(len(self.dict))
        
        self.learn(data, self.dict, self.index_vectors)
        
    def create_dict(self, data):
        self.dict = Dictionary(data)
        
    def initialize_word_vectors(self, wc):
        self.index_vectors = sparse.csr_matrix((wc, self.dims), dtype=np.int8)
        
        def urn_sampler(length):
            return np.random.choice(self.index_set, size=length)
        
        for i in xrange(wc):
            random_sparse_vector = sparse.random(1, self.dims, density = self.sparsity, data_rvs=urn_sampler)
            self.index_vectors[i] = random_sparse_vector
    
    def learn(self, data, dict, index_vectors):
        self.word_vectors = np.zeros((len(dict), self.dims), dtype=np.int32)
        
        def get_context(document, index, before, after):
            context = []
            for n in range(max(index - before, 0), index):
                context.append(document[n])
                
            for n in range(index + 1, min(index + after + 1, len(document))):
                context.append(document[n])
                
            return context
        
        for document in data:
            #precompute document as a list of IDs in the dict 
            document_by_id = [self.dict.token2id[word] for word in document]
            
            print document
            for n, word in enumerate(document_by_id):
                context_words = get_context(document_by_id, n, *self.context_size)
                word_vec = self.word_vectors[word]
                
                for context in context_words:
                    index_vec = self.index_vectors[context]
                    word_vec += index_vec
                    
                self.word_vectors[word] = word_vec
        print(self.word_vectors.shape)
                      
    def __contains__(self, word):
        return word in self.dict
    
    def __getitem__(self, word):
        return self.word_vectors[self.dict.token2id[word]]
                
    def save(self, *args, **kwargs):
        super(RandomIndexing, self).save(*args, **kwargs)

    save.__doc__ = SaveLoad.save.__doc__
    
    @classmethod
    def load(cls, *args, **kwargs):
        model = super(RandomIndexing, cls).load(*args, **kwargs)
        return model

In [36]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

np.random.seed(0)

model = RandomIndexing(data, d, sparsity, index_set, context_size)

model.save('data/ri.test.model')

print(model['den'])

model = RandomIndexing.load('data/ri.test.model')

print(model['den'])

2017-02-20 17:47:09,011 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-02-20 17:47:09,013 : INFO : built Dictionary(264 unique tokens: [u'wochen', u'hingegen', u'rechts', u'formal', u'regierungslager']...) from 1 documents (total 402 corpus positions)


[u'masendemonstrationen', u'fordern', u'den', u'rucktritt', u'der', u'prasidentin', u'staatsanwaltschaft', u'hat', u'anklage', u'gegen', u'drei', u'ihrer', u'engsten', u'vertrauten', u'erhoben', u'in', u'sudkorea', u'spitzt', u'sich', u'die', u'regierungskrise', u'weiter', u'zu', u'am', u'samstag', u'ging', u'erneut', u'fast', u'eine', u'million', u'menschen', u'im', u'ganzen', u'land', u'auf', u'die', u'stra\xdfe', u'um', u'den', u'rucktritt', u'der', u'rechts', u'konservativen', u'prasidentin', u'park', u'geun', u'hye', u'zu', u'fordern', u'das', u'schreibt', u'unter', u'anderem', u'die', u'korea', u'times', u'in', u'der', u'landeshauptstadt', u'seoul', u'nahmen', u'danach', u'allein', u'600', u'000', u'menschen', u'an', u'den', u'protesten', u'teil', u'und', u'weitere', u'350', u'000', u'in', u'diversen', u'stadten', u'im', u'ganzen', u'land', u'unterdessen', u'hat', u'die', u'sudkoreanische', u'staatsanwaltschaft', u'nach', u'einem', u'bericht', u'der', u'chinesischen', u'nachricht

2017-02-20 17:47:09,432 : INFO : saving RandomIndexing object under data/ri.test.model, separately None
2017-02-20 17:47:09,436 : INFO : saved data/ri.test.model
2017-02-20 17:47:09,447 : INFO : loading RandomIndexing object from data/ri.test.model
2017-02-20 17:47:09,449 : INFO : loading dict recursively from data/ri.test.model.dict.* with mmap=None
2017-02-20 17:47:09,449 : INFO : loaded data/ri.test.model


(264, 1000)
[ 0  0  0  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  0  1  1  0  0  0  0 -1
 -1 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0
  0  2  0  0  0  0  0  0  0  0  0  0  0  0 -2  0  0  1  0  0 -2  1  0  0  1
  1  0  0  0 -1  0 -2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1
  1  0  0  0  1 -1  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  1  0  0
  0  0  0  0  0  0  0  2  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  1  0  0  0  0 -1  1  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  2  0  0  0  2  0 -1  0  0 -1  0  0  1  0  0  0  2  0  0  0  1
  0  0  0  0 -2  0  0  2  0  0  2 -1  0  0  0 -1  0  0  0  0  0  0  1  0  0
  0  0  0  0 -1  0 -1  0  0  0  0  0  1  1  0  0  0 -1 -1  1  0  0  0  0  0
  0  0  1  0  0  0  0  0  0  0 -1  0  0  1  0 -1 -2  0  0  0  0 -1  0  0  0
  0  0  0  0  0  0  1 -1 -1  0  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  0
  0  1  0  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
