In [17]:
import gensim 
import numpy as np
# Base name of the word-vector file; the binary variant loaded below is
# expected at `dictFileName + ".bin"`.
dictFileName = 'wiki-news-300d-1M.vec'
# Load the vectors from the binary word2vec-format file into a gensim
# KeyedVectors object.
# NOTE(review): `embedding_dict` is not referenced by any other cell visible
# here, and the same vectors are parsed again from text by `load_vectors` —
# confirm whether both copies are needed.
embedding_dict = gensim.models.KeyedVectors.load_word2vec_format(dictFileName+".bin", binary=True)

from gensim.models import TfidfModel
from gensim.corpora import Dictionary, MmCorpus

# Load the pre-built Wikipedia artifacts from disk: the saved TF-IDF model,
# the id <-> token dictionary (bz2-compressed text format) and the
# bag-of-words corpus stored in Matrix Market format.
# NOTE(review): the file names look like the outputs of gensim's
# make_wikicorpus script — confirm the three files come from the same run so
# that term ids agree between them.
tf_idf_model = TfidfModel.load('/mnt/disk/wikipedia/wikipedia.tfidf_model')
dct = Dictionary.load_from_text('/mnt/disk/wikipedia/wikipedia_wordids.txt.bz2')
corpus = MmCorpus('/mnt/disk/wikipedia/wikipedia_bow.mm')

In [2]:
import io

def load_vectors(fname):
    """Read a text-format word-vector file into a ``{word: [float, ...]}`` dict.

    The expected layout is a header line holding two integers (read here only
    to validate the header) followed by one line per word:
    ``<word> <v1> <v2> ...`` separated by single spaces.

    Parameters
    ----------
    fname : str
        Path to the UTF-8 encoded vector file. Undecodable bytes are ignored.

    Returns
    -------
    dict
        Maps each word to a list of floats. Values are materialised lists, so
        they can be indexed and read repeatedly on both Python 2 and Python 3
        (a bare ``map`` would be a one-shot iterator on Python 3).

    Raises
    ------
    ValueError
        If the header does not consist of exactly two integers, or a vector
        component cannot be parsed as a float.
    """
    data = {}
    # The context manager guarantees the handle is closed even if parsing fails.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Header values are not needed afterwards; parsing them keeps the
        # original behaviour of rejecting a malformed header.
        _n_words, _dim = map(int, fin.readline().split())
        for line in fin:
            stripped = line.rstrip()
            if not stripped:
                # A blank (e.g. trailing) line would otherwise register an
                # empty-string "word" with an empty vector.
                continue
            tokens = stripped.split(' ')
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data

# Parse the text-format vector file into a plain dict. `map_w2v` reads this
# module-level name directly, so it must be defined before that helper is called.
w2v_fasttext = load_vectors('wiki-news-300d-1M.vec')

In [34]:
import tqdm

def map_w2v(w):
    """Return the vector stored for ``w`` in the module-level ``w2v_fasttext``.

    Words missing from the table yield a 300-dimensional zero vector instead
    of raising.
    """
    # Guard clause: unknown word -> all-zero placeholder.
    if w not in w2v_fasttext:
        return np.zeros(300)
    return w2v_fasttext[w]
        
def build_doc_emb(dictionary, w2v, corpus_wiki, tfidf_model, dim=300):
    """Build one TF-IDF-weighted mean word embedding per document.

    For every bag-of-words document in ``corpus_wiki`` the TF-IDF weights are
    computed, each term's vector is scaled by its weight, and the scaled
    vectors are averaged. Terms without a vector in ``w2v`` contribute a zero
    vector but still count towards the average.

    Parameters
    ----------
    dictionary : mapping
        Maps a term id to its token (``dictionary[term_id]``).
    w2v : mapping
        Maps a token to its vector; must support ``token in w2v`` and
        ``w2v[token]``. Each vector must have length ``dim``.
    corpus_wiki : sized iterable
        Bag-of-words documents, each a sequence of ``(term_id, count)`` pairs.
        Must support ``len()``.
    tfidf_model : callable-by-index
        ``tfidf_model[bow]`` must return ``(term_id, weight)`` pairs.
    dim : int, optional
        Embedding dimensionality (default 300).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(corpus_wiki), dim)``; row ``i`` is the
        embedding of document ``i``. Documents with no weighted terms get an
        all-zero row.
    """
    n_docs = len(corpus_wiki)
    # Allocate the result once; stacking a new row per document copies the
    # whole matrix each time and becomes quadratic in the number of documents.
    doc_matrix = np.zeros((n_docs, dim), dtype=float)
    pbar = tqdm.tqdm(total=n_docs)
    try:
        # Iterate the corpus that was passed in (not a module-level global),
        # reading each document only once.
        for row, bow in enumerate(corpus_wiki):
            weighted_terms = tfidf_model[bow]
            if weighted_terms:
                doc_sum = np.zeros(dim, dtype=float)
                # Take the term id from the TF-IDF output itself: the model
                # may drop terms, so pairing its weights with the raw
                # bag-of-words by position can misalign words and weights.
                for term_id, weight in weighted_terms:
                    word = dictionary[term_id]
                    if word in w2v:
                        doc_sum += weight * np.asarray(w2v[word], dtype=float)
                doc_matrix[row] = doc_sum / len(weighted_terms)
            # An empty document keeps its zero row instead of producing NaN.
            pbar.update()
    finally:
        pbar.close()
    return doc_matrix

# Build the document-embedding matrix from the dictionary, word vectors,
# corpus and TF-IDF model loaded in the earlier cells.
# Long-running: the recorded progress bar below reports 4,524,343 documents.
matrix = build_doc_emb(dct, w2v_fasttext, corpus, tf_idf_model)


  0%|          | 0/4524343 [00:00<?, ?it/s][A
  0%|          | 1/4524343 [00:00<616:29:17,  2.04it/s][A
  0%|          | 5/4524343 [00:00<445:26:14,  2.82it/s][A
  0%|          | 7/4524343 [00:00<339:07:31,  3.71it/s][A
  0%|          | 10/4524343 [00:00<252:16:30,  4.98it/s][A
  0%|          | 16/4524343 [00:01<185:10:46,  6.79it/s][A
  0%|          | 20/4524343 [00:01<140:08:54,  8.97it/s][A
  0%|          | 23/4524343 [00:01<111:31:29, 11.27it/s][A
  0%|          | 29/4524343 [00:01<86:34:27, 14.52it/s] [A
  0%|          | 33/4524343 [00:01<72:41:08, 17.29it/s][A
  0%|          | 37/4524343 [00:01<62:14:23, 20.19it/s][A
  0%|          | 41/4524343 [00:01<57:38:55, 21.80it/s][A
  0%|          | 45/4524343 [00:01<49:47:25, 25.24it/s][A
  0%|          | 50/4524343 [00:02<50:53:24, 24.70it/s][A

In [None]:
# Persist the document-embedding matrix in NumPy's .npy format so the
# expensive build step does not have to be repeated.
np.save('/mnt/wiki_mean_tfidf_weighted_fasttext_.npy', matrix)