In [1]:
import pandas as pd
import numpy as np
import gensim
from gensim.models import Word2Vec
import nltk
import string
import os
import collections
import smart_open
import random
import datetime
import json
import re
import heapq
import pickle
import sys
from sklearn.feature_extraction.text import TfidfVectorizer

# Understand the table

In [2]:

# Load the posts table; the file is tab-separated despite its .csv extension.
post_df = pd.read_csv('all_posts.csv', sep = "\t")
# Preview the first rows to see which columns are available.
post_df.head()

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount,FavoriteCount
0,1,1,,15.0,2010-07-19T19:12:12.510,36,2577.0,How should I elicit prior distributions from e...,8.0,2010-09-15T21:08:26.077,Eliciting priors from experts,<bayesian><prior><elicitation>,5.0,1,23.0
1,2,1,,59.0,2010-07-19T19:12:57.157,29,23368.0,In many different statistical methods there is...,24.0,2016-06-27T06:44:40.147,What is normality?,<distributions><normality>,7.0,1,10.0
2,3,1,,5.0,2010-07-19T19:13:28.577,66,5792.0,What are some valuable Statistical Analysis op...,18.0,2013-05-27T14:48:36.927,What are some valuable Statistical Analysis op...,<software><open-source>,19.0,4,39.0
3,4,1,,135.0,2010-07-19T19:13:31.617,17,26414.0,I have two groups of data. Each with a differ...,23.0,2010-09-08T03:00:19.690,Assessing the significance of differences in d...,<distributions><statistical-significance>,5.0,2,5.0
4,5,2,3.0,,2010-07-19T19:14:43.050,87,,The R-project\n\nhttp://www.r-project.org/\n\n...,23.0,2010-07-19T19:21:15.063,,,,3,


In [3]:
# Preview the last rows of the table as well.
post_df.tail()

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount,FavoriteCount
215957,299976,2,299970.0,,2017-08-27T01:55:50.117,0,,\n We cannot simulate separate values (becaus...,8336.0,2017-08-27T01:55:50.117,,,,0,
215958,299977,2,299963.0,,2017-08-27T02:07:55.553,0,,"Neural networks could ""learn"" where the labels...",30621.0,2017-08-27T02:07:55.553,,,,0,
215959,299978,2,299669.0,,2017-08-27T03:26:04.497,0,,"Calculating $\displaystyle\hat{t}_i=\int t\, p...",8336.0,2017-08-27T03:26:04.497,,,,0,
215960,299979,1,,,2017-08-27T03:52:26.230,0,2.0,I have collected primary data of BISP with hel...,175092.0,2017-08-27T03:52:26.230,What should be the solution of insignificant (...,<statistical-significance>,0.0,0,
215961,299980,2,299446.0,,2017-08-27T04:05:22.103,0,,While it's possible to combine word embeddings...,107579.0,2017-08-27T04:05:22.103,,,,0,


### How many posts?

In [4]:
# Total number of rows in the table (posts of every PostTypeId).
n = post_df.shape[0]
n

215962

In [5]:
# Every row must carry its own Id: True when no Id value is repeated.
post_df['Id'].is_unique

True

In [6]:
# Number of posts that are not questions (PostTypeId != 1). Derived from the
# table instead of two hard-coded totals so the cell stays correct if the dump
# changes.
int((post_df['PostTypeId'] != 1).sum())

107008

### How many questions?

In [7]:
# Number of posts per PostTypeId, most frequent first.
post_df['PostTypeId'].value_counts()

1    108954
2    104797
5      1091
4      1091
6        18
3         6
7         5
Name: PostTypeId, dtype: int64

In [8]:
# Keep only the rows whose PostTypeId is 1 and report how many there are.
Type1 = post_df[post_df['PostTypeId'] == 1]
Type1.shape

(108954, 15)

### Contents of questions

In [9]:
# Number of distinct question bodies (a missing body would count as one value).
Type1['Body'].nunique(dropna=False)

108937

In [10]:
# Number of questions whose body is missing.
Type1['Body'].isnull().sum()

0


So the number of unique "Body" values (108937) is smaller than the number of questions (108954). This is because some users
posted the same question more than once: the titles may differ, but the bodies are exactly the same.


### Confirm answers to questions.

I found the answers to the question with Id = 1 online and looked up the matching rows in the table to check that their ParentId is indeed 1. 
Only one answer is accepted; its Id is stored in the question's AcceptedAnswerId. 

In [11]:
answer = 'John Cook gives some interesting recommendations.' # answer to question id = 1
# Select every post whose body contains the snippet; str(x) lets the substring
# test run on non-string values as well.
post_df.loc[post_df['Body'].apply(lambda x: answer in str(x))]

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount,FavoriteCount
14,15,2,1.0,,2010-07-19T19:19:46.160,17,,John Cook gives some interesting recommendatio...,6.0,2010-07-19T19:19:46.160,,,,0,


In [12]:
answer = 'Experts are given counters (or what one can think of as casino chips) representing equal densities whose total would sum up' # answer to question id = 1
# Same substring search as in the previous cell, for a second answer to question 1.
post_df.loc[post_df['Body'].apply(lambda x: answer in str(x))]

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount,FavoriteCount
142,154,2,1.0,,2010-07-19T22:40:47.947,25,,I am currently researching the trial roulette ...,108.0,2010-09-03T17:46:44.017,,,,2,


### What are PostTypeIds?

Only posts with PostTypeId = 1 have titles, because only questions have titles and answers do not. In other words, if PostTypeId != 1, the title is missing.

In [13]:
# Posts without a title plus posts of type 1 should add up to all posts.
# The question count is computed from the table rather than hard-coded.
post_df["Title"].isnull().sum() + (post_df['PostTypeId'] == 1).sum() == post_df.shape[0]

True

A guess: if PostTypeId is neither 1 nor 2, is the post an answer that was not accepted?

In [14]:

# Distinct values of the AcceptedAnswerId column. Rows without an accepted answer
# hold a missing value there, so NaN is one of the distinct values returned.
acceptedAnswerId = post_df['AcceptedAnswerId'].unique()

In [15]:
# acceptedAnswerId comes from .unique(), which also returns NaN for posts that
# have no accepted answer, so its length overstates the number of accepted
# answers by one (the original note of 34167 presumably includes that NaN entry).
accepted_ids = pd.Series(acceptedAnswerId).dropna()
# Posts whose Id is not referenced as an accepted answer. isin() is a single
# vectorised lookup; the per-row `x not in array` test it replaces rescanned the
# whole array for every post.
not_accepted = post_df.loc[~post_df['Id'].isin(accepted_ids)]

In [16]:
# Which PostTypeId values occur among the posts that are not accepted answers?
not_accepted['PostTypeId'].unique()

array([1, 2, 7, 5, 4, 6, 3])

So PostTypeId is not related to whether an answer is accepted or not.

**What is it?**

In [17]:
# Look at the first ten bodies of the posts with PostTypeId == 4.
post_df.loc[post_df['PostTypeId'] == 4]['Body'][0:10]

2182    Use this tag for any *on-topic* question that ...
2264    Mixed (aka multilevel or hierarchical) models ...
2419    Psychometrics has evolved as a subfield of psy...
2778    Model selection is a problem of judging which ...
2780    Cluster analysis is the task of partitioning d...
2782    Time series are data observed over time (eithe...
2784    Hypothesis testing assesses whether data suppo...
2935    Prediction of the future events. It is a speci...
3714    Stata is a proprietary  cross-platform general...
5647    IBM SPSS Statistics (formerly SPSS, i.e. "Stat...
Name: Body, dtype: object

### Question posts: 

In [3]:
# Questions only (PostTypeId == 1), reduced to the three columns used below.
Q_df = post_df.loc[post_df['PostTypeId'] == 1, ['Id', 'Title', 'Body']]

Q_df.head()

Unnamed: 0,Id,Title,Body
0,1,Eliciting priors from experts,How should I elicit prior distributions from e...
1,2,What is normality?,In many different statistical methods there is...
2,3,What are some valuable Statistical Analysis op...,What are some valuable Statistical Analysis op...
3,4,Assessing the significance of differences in d...,I have two groups of data. Each with a differ...
5,6,The Two Cultures: statistics vs. machine learn...,"Last year, I read a blog post from Brendan O'C..."


### Answer posts:

In [4]:
# Every post that is not a question, reduced to Id and Body.
# NOTE(review): the filter is `!= 1`, so besides answers (PostTypeId == 2) this
# also keeps the posts of types 3-7 counted above (type 4 bodies look like tag
# descriptions) — confirm whether `== 2` was intended.
A_df =  post_df.loc[post_df['PostTypeId'] != 1][['Id', 'Body']]
A_df.head()

Unnamed: 0,Id,Body
4,5,The R-project\n\nhttp://www.r-project.org/\n\n...
8,9,"Incanter is a Clojure-based, R-like platform (..."
11,12,"See my response to ""Datasets for Running Stati..."
12,13,Machine Learning seems to have its basis in th...
13,14,I second that Jay. Why is R valuable? Here's a...


### original post data

In [5]:
# Raw text columns used to build the corpora below.
original_q_posts = Q_df['Body']
original_a_posts = A_df['Body']

original_q_titles = Q_df['Title']


# Show the first ten entries of each series (the index is the row label in post_df).
print(original_q_posts[0:10])
print(original_a_posts[0:10])

print(original_q_titles[0:10])

0     How should I elicit prior distributions from e...
1     In many different statistical methods there is...
2     What are some valuable Statistical Analysis op...
3     I have two groups of data.  Each with a differ...
5     Last year, I read a blog post from Brendan O'C...
6     I've been working on a new method for analyzin...
7     Sorry, but the emptyness was a bit overwhelmin...
9     Many studies in the social sciences use Likert...
10    Is there a good, modern treatment covering the...
16    I have four competing models which I use to pr...
Name: Body, dtype: object
4     The R-project\n\nhttp://www.r-project.org/\n\n...
8     Incanter is a Clojure-based, R-like platform (...
11    See my response to "Datasets for Running Stati...
12    Machine Learning seems to have its basis in th...
13    I second that Jay. Why is R valuable? Here's a...
14    John Cook gives some interesting recommendatio...
15    Two projects spring to mind:\n\n\nBugs - takin...
17    Also see the UCI

In [6]:
# One title per question.
original_q_titles.shape

(108954,)

`original_q_titles` is a pandas Series.

In [25]:
# Dump the raw text to one-document-per-line files that read_corpus() reads back.
# Each file is written inside a `with` block so it is flushed and closed before
# the next cell opens it; previously the handles were never closed, so the last
# file could still be partly buffered when it was read.
# NOTE(review): r"\$.*\$" is greedy — it removes everything between the first and
# the last "$" of a post, not just individual LaTeX spans. Left unchanged because
# altering it changes the training corpus; confirm whether that is intended.
os.makedirs('./text', exist_ok=True)

with open('./text/questions_body.txt', 'w') as thefile:
    for post in original_q_posts:
        post = str(post).replace('\n', ' ')  # keep each post on a single line
        post = re.sub(r"\$.*\$", "", post)
        thefile.write("%s\n" % post)

with open('./text/answers_body.txt', 'w') as thefile:
    for post in original_a_posts:
        post = str(post).replace('\n', ' ')
        post = re.sub(r"\$.*\$", "", post)
        thefile.write("%s\n" % post)

with open('./text/questions_title.txt', 'w') as thefile:
    for i, post in enumerate(original_q_titles):
        # Titles are prefixed with "*" and their 1-based position among the questions.
        post = "*"+ str(i+1) + str(post).replace('\n', "") 
        post = re.sub(r"\$.*\$", "", post)
        thefile.write("%s\n" % post)
    
    

### data preprocessing


In [7]:
def show_work_status(singleCount, totalCount, currentCount = 0):
    """Redraw a 100-character text progress bar on the current stdout line.

    The work done is currentCount + singleCount, reported as a percentage of
    totalCount. Once the percentage reaches 100 a blank line is printed so that
    later output starts below the bar.
    """
    done = currentCount + singleCount
    percentage = done/totalCount *100
    filled = int(percentage)
    bar = ">" * filled + " " * (100 - filled)
    # '\r' returns to the start of the line so the bar overwrites itself.
    sys.stdout.write('\rStatus:[{0}] {1:.2f}%'.format(bar, percentage))
    sys.stdout.flush()
    if percentage < 100:
        return
    print('\n')

In [8]:
def read_corpus(fname, tokens_only=False):
    """Lazily yield one preprocessed document per line of the file `fname`.

    Every line is tokenized with gensim.utils.simple_preprocess and the file is
    decoded as iso-8859-1. With tokens_only=True the token list itself is
    yielded; otherwise it is wrapped in a TaggedDocument whose tag is the
    zero-based line number (the form used for training data).
    """
    with smart_open.smart_open(fname, encoding="iso-8859-1") as handle:
        for line_no, text in enumerate(handle):
            tokens = gensim.utils.simple_preprocess(text)
            if not tokens_only:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [line_no])
            else:
                yield tokens


In [9]:
# It takes some time.
# Folder holding the text files written above (note: `fname` is a directory here).
fname = './text/'
# Materialise each generator into a list of token lists, one per line of the file.
word2vec_question_title_corpus = list(read_corpus(fname + "questions_title.txt", tokens_only = True ))
word2vec_question_corpus = list(read_corpus(fname + "questions_body.txt", tokens_only = True ))
word2vec_answer_corpus = list(read_corpus(fname + "answers_body.txt", tokens_only = True ))

In [10]:
# First three tokenized documents of each corpus.
print(word2vec_question_corpus[0:3])
print(word2vec_answer_corpus[0:3])
print(word2vec_question_title_corpus[0:3])

# Corpus sizes, to compare with the number of rows in Q_df and A_df below.
print(len(word2vec_question_corpus))
print(len(word2vec_answer_corpus))
print(len(word2vec_question_title_corpus))

print(Q_df.shape)
print(A_df.shape)

[['how', 'should', 'elicit', 'prior', 'distributions', 'from', 'experts', 'when', 'fitting', 'bayesian', 'model'], ['in', 'many', 'different', 'statistical', 'methods', 'there', 'is', 'an', 'assumption', 'of', 'normality', 'what', 'is', 'normality', 'and', 'how', 'do', 'know', 'if', 'there', 'is', 'normality'], ['what', 'are', 'some', 'valuable', 'statistical', 'analysis', 'open', 'source', 'projects', 'available', 'right', 'now', 'edit', 'as', 'pointed', 'out', 'by', 'sharpie', 'valuable', 'could', 'mean', 'helping', 'you', 'get', 'things', 'done', 'faster', 'or', 'more', 'cheaply']]
[['the', 'project', 'http', 'www', 'project', 'org', 'is', 'valuable', 'and', 'significant', 'because', 'it', 'was', 'the', 'first', 'widely', 'accepted', 'open', 'source', 'alternative', 'to', 'big', 'box', 'packages', 'it', 'mature', 'well', 'supported', 'and', 'standard', 'within', 'many', 'scientific', 'communities', 'some', 'reasons', 'why', 'it', 'is', 'useful', 'and', 'valuable', 'there', 'are', 's

# Parameters for word2vec models:

In [11]:
#parameters
# Hyper-parameter grids for Word2Vec; each grid currently holds a single value.
# np.linspace returns floats, so the values are cast with int() where they are used.
sizes = np.linspace(200, 200, num = 1)
print(sizes)
windows = np.linspace(6, 6,num = 1 )
print(windows)
min_counts = [2]
print(min_counts )

[ 200.]
[ 6.]
[2]


TODO: make these parameter grids denser later.

# Word2Vec Model

In [None]:
# After tuning the model and find appropriate parameters, save them in the config file and read from the config file. 
# paras = json.loads(open('./configure/word2vec.json').read())
# print(paras)

# min_count = paras['min_count']
# size = paras['size']
# window = paras['window']
# workers = paras['workers']
# sg = paras['sg']
# alpha = paras["alpha"]
# hs = paras["hs"]
# negative = paras["negative"]


In [12]:
# note: sentence: a list of lists of words.
# paras is a dictionary{ para: value, para: value}

def train_word_model(sentences, modelname, **paras):
    """Train a Word2Vec model on `sentences` and save it under ./model/.

    Parameters
    ----------
    sentences : list of list of str
        Tokenized documents.
    modelname : str
        File name, inside ./model/, that the trained model is saved to.
    **paras
        Must contain 'min_count', 'size' and 'window'. An optional 'workers'
        entry sets the number of worker threads; it defaults to 4, the value
        that used to be hard-coded.

    Returns
    -------
    The trained model object.

    Raises
    ------
    KeyError
        If 'min_count', 'size' or 'window' is missing from `paras`.
    """
    min_count = paras['min_count']
    size = paras['size']
    window = paras['window']
    workers = paras.get('workers', 4)
    # Make sure the output folder exists before training, so a long run cannot
    # be lost to a missing directory at save time.
    os.makedirs('./model', exist_ok=True)
    model = gensim.models.Word2Vec(sentences, min_count = min_count, size = size, window = window, workers = workers)
    fname = './model/'+ modelname
    model.save(fname)
    return model

def filter_corpus(sentences,model, doc = 0):
    """Remove from every sentence the words that are not in the model vocabulary.

    Parameters
    ----------
    sentences : sized iterable
        With doc == 0, each item is a list of tokens; with doc == 1, each item
        is an object exposing its tokens as `.words`.
    model :
        Object whose `wv.vocab` mapping holds the known words.
    doc : int
        0 (default) or 1, selecting the item format described above.

    Returns
    -------
    list of list of str
        One filtered token list per input sentence, in the same order.

    Raises
    ------
    ValueError
        If `doc` is neither 0 nor 1.
    """
    if doc not in (0, 1):
        raise ValueError("doc must be 0 (token lists) or 1 (objects with .words)")
    vocab = list(model.wv.vocab.keys())
    print(vocab[0:10])
    # Membership tests against a set are O(1); testing against the list made the
    # filtering cost grow with the vocabulary size for every single word.
    known_words = set(vocab)
    filtered_sentences = []
    total_count = len(sentences)
    current_count = 0
    for i, sentence in enumerate(sentences, start=1):
        # Refresh the progress bar once every 1000 sentences.
        if i % 1000 == 0:
            show_work_status(1000, total_count, current_count)
            current_count += 1000
        tokens = sentence.words if doc == 1 else sentence
        filtered_sentences.append([word for word in tokens if word in known_words])
    return filtered_sentences


def glance_word(model):
    """Print a quick sanity check of a trained word-embedding model.

    Prints the model itself, the words most similar to 'vector', and the
    similarity of the pairs ('probability', 'distribution') and
    ('gaussian', 'normal'). All lookups go through `model.wv`, like the rest of
    this notebook; calling most_similar/similarity on the model object itself is
    the deprecated spelling of the same queries.
    """
    word_vectors = model.wv
    print(model)
    print('*************most similar words to \'vector\'***************')
    print(word_vectors.most_similar('vector'))
    print('\n')
    print("**************Similarity of \'probability\' and \'distribution\'******************")
    print(word_vectors.similarity('probability','distribution'))
    print('\n')
    print("**************Similarity of \'gaussian\' and \'normal\'******************")
    print(word_vectors.similarity('gaussian','normal'))
    print('\n')

# topk pick the top k similar words.    

# TODO: filter_corpus does not depend on the model itself, only on min_count, so it should be taken out of evaluate_word_model.

def evaluate_word_model(model,topk, min_count):
    """Pick one question at random and print the questions most similar to it.

    Similarity is model.wv.n_similarity between the token lists of two
    questions. Only the first len(Q_df) / 1000 questions are used as candidates.

    Parameters
    ----------
    model :
        Trained word2vec model.
    topk : int
        Number of most similar questions to print.
    min_count : int
        The min_count the model was trained with. With 1 every corpus word is in
        the vocabulary, so the corpus is used as is; otherwise it is first
        passed through filter_corpus.

    Returns
    -------
    (sims, doc_id)
        The list of similarities to the candidate questions and the position of
        the randomly chosen question.
    """
    n =len(word2vec_question_corpus)
    random.seed(2017)
    # randint includes both end points, so the upper bound has to be n - 1;
    # randint(0, n) could return n, one past the last valid position.
    doc_id = random.randint(0, n - 1)
    # pick up a question randomly and find similar questions.
    if min_count == 1: # all words are in the vocab of the model.
        word2vec_question_corpus_filter = word2vec_question_corpus
    else:
        word2vec_question_corpus_filter = filter_corpus(word2vec_question_corpus, model) 
    sen_interest = word2vec_question_corpus_filter[doc_id]
    print('filtering is done.')
    sims = []
    for i in range(int(len(Q_df['Body'])/1000)): # only compared with the first n questions. 
        compared_sen = word2vec_question_corpus_filter[i]
        sim = model.wv.n_similarity(sen_interest, compared_sen)
        sims.append(sim)
    # Positions of the topk largest similarities.
    most_similar_index = heapq.nlargest(topk, range(len(sims)), key=sims.__getitem__)
    print('*****************The question we are interested in is: ************************')
    print(Q_df.iloc[doc_id]['Body'])
    print("*******************Similar questions are **********************")
    for i in most_similar_index :
        print(i)
        print('similarity is ', sims[i])
        print(Q_df.iloc[i]['Body'])
        print("**********************")
    return sims, doc_id

def save_vectors(model):
    """Return a dictionary mapping every word in the model's vocabulary to its vector.

    Both the vocabulary and the vectors are read through `model.wv`; indexing
    the model object directly (`model[word]`) is the deprecated spelling of the
    same lookup.
    """
    word_vectors = model.wv
    return {word: word_vectors[word] for word in word_vectors.vocab.keys()}

### Training  models or loading models.

In [13]:
# Tuning parameters on grids
# train the models.
# It takes a while. So we only train it once and next time, load the models using the code in the next block.
# Trained models, keyed by a name that encodes size, window and min_count.
models = {}
for size in sizes:
    for window in windows:
        for min_count in min_counts:
            # The grid values may be floats (np.linspace), hence the int() casts.
            paras =  {'size':int(size), 'window' : int(window), "min_count" :int(min_count)}
            model_name = 'word2vector_model_question_answer_' + str(int(size))+"_"+str(int(window))+"_"+str(int(min_count))
            # Train on questions and answers together; the model is also saved to ./model/<model_name>.
            models[model_name] = train_word_model(word2vec_question_corpus+word2vec_answer_corpus,model_name, **paras)

In [None]:
# # load models
# models = {}
# for size in sizes:
#     for window in windows:
#         for min_count in min_counts:
#             paras =  {'size':int(size), 'window' : int(window), "min_count" :int(min_count)}
#             model_name = "word2vector_model_question_answer_"+str(int(size))+"_"+str(int(window))+"_"+str(int(min_count))
#             models[model_name] = Word2Vec.load( './model/'+model_name)

In [None]:
# # It is slow to run evaluate_word_model function. or slow to run filter function.
# topk = 3
# for size in sizes:
#     for window in windows:
#         for min_count in min_counts:
#             paras =  {'size':int(size), 'window' : int(window), "min_count" :int(min_count)}
#             model_name = 'word2vector_model_question_answer_' + str(int(size))+"_"+str(int(window))+"_"+str(int(min_count))
#             model = models[model_name] 
#             glance_word(model)
#             evaluate_word_model(model,topk, min_count)

In [14]:
# Names of the models that are available.
models.keys()

dict_keys(['word2vector_model_question_answer_200_6_2'])

In [15]:
# save word vectors for all models in model_word_vec_dic, model_word_vec_df and also in csv form.
# Per model name: word -> vector dict, the same data as a DataFrame (one row per
# word), and the list of words in DataFrame row order.
model_word_vec_dic = {}
model_word_vec_df = {}
vocabs = {}
for key in models.keys():
    model_word_vec_dic[key] = save_vectors(models[key])
    # Transpose so that words become the rows and vector components the columns.
    model_word_vec_df[key] = pd.DataFrame(model_word_vec_dic[key]).T
    model_word_vec_df[key].to_csv('./word_vec/'+key+".csv")
    vocabs[key] =  list(model_word_vec_df[key].index)

In [16]:
# Inspect the word-vector table of the first model: first rows, shape, first ten words.
print(model_word_vec_df[list(model_word_vec_df.keys())[0]].head())
print(model_word_vec_df[list(model_word_vec_df.keys())[0]].shape)
print(vocabs[list(model_word_vec_df.keys())[0]][0:10])

                 0         1         2         3         4         5    \
a_         -0.502157 -1.231525  1.501073 -0.824721  0.591714 -0.488768   
a__        -0.044594 -0.024119  0.018766  0.024026 -0.028895 -0.022439   
a_a         0.013455 -0.012830 -0.017656  0.014753  0.062009 -0.132435   
a_adjusted  0.028670  0.020240  0.025201 -0.070229  0.012619 -0.078576   
a_and_b     0.114285  0.039049  0.213730 -0.080442 -0.088148 -0.197033   

                 6         7         8         9      ...          190  \
a_          0.903779 -0.038645  0.459253 -0.485186    ...     0.276586   
a__         0.013539  0.007449  0.006484 -0.009192    ...     0.024952   
a_a         0.068699  0.045951  0.074507 -0.035419    ...    -0.028048   
a_adjusted  0.051204  0.040304  0.027123  0.021348    ...    -0.016276   
a_and_b     0.049060  0.106649 -0.017760 -0.002527    ...    -0.111197   

                 191       192       193       194       195       196  \
a_         -0.661979  0.297288  0.13

How do we get document vectors from word vectors? n_similarity uses the mean of the vectors of all words in the document. We will use a tf-idf weighted average of the word vectors instead. 

## Getting the tf-idf of every vocabulary word in every document, as weights for all models.
These values are the weights of the words when we calculate the document vectors.

In [18]:
# now we only use the second model as an example. The other one is similar: only reset paras and key.
size =200
min_count = 2
window = 6
# It corresponds to the second model key.

key = list(models.keys())[0]
print(key)
vocab = vocabs[key]

word2vector_model_question_answer_200_6_2


In [20]:
# Re-join every tokenized post (questions first, then answers) into one
# space-separated string per post.
corpus = [" ".join(post) for post in word2vec_question_corpus + word2vec_answer_corpus]

print(corpus[0])

how should elicit prior distributions from experts when fitting bayesian model


In [21]:
# tf-idf weights restricted to the word2vec vocabulary: X has one row per post
# in `corpus` and one column per word of `vocab`.
# NOTE(review): presumably column j corresponds to vocab[j] because the
# vocabulary is passed as a list — confirm against the scikit-learn docs.
vectorizer = TfidfVectorizer(min_df=1, vocabulary=vocab)
X = vectorizer.fit_transform(corpus)
X.shape

(215962, 87750)

## Get weighted vectors for all documents.

In [22]:
# Word vectors of the selected model: one row per word.
model_word_vec_df[key].head() 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
a_,-0.502157,-1.231525,1.501073,-0.824721,0.591714,-0.488768,0.903779,-0.038645,0.459253,-0.485186,...,0.276586,-0.661979,0.297288,0.136056,-0.305163,1.39619,-0.239185,1.228788,0.33364,0.492632
a__,-0.044594,-0.024119,0.018766,0.024026,-0.028895,-0.022439,0.013539,0.007449,0.006484,-0.009192,...,0.024952,-0.03826,-0.004415,-0.019359,-0.053633,-0.015187,0.011295,-0.014313,0.038858,-0.030853
a_a,0.013455,-0.01283,-0.017656,0.014753,0.062009,-0.132435,0.068699,0.045951,0.074507,-0.035419,...,-0.028048,0.040826,0.069079,0.088605,0.082241,0.211491,0.126767,0.039316,-0.028075,-0.026345
a_adjusted,0.02867,0.02024,0.025201,-0.070229,0.012619,-0.078576,0.051204,0.040304,0.027123,0.021348,...,-0.016276,0.008707,-0.060311,0.086649,0.062956,0.068462,0.053034,0.037799,-0.016378,0.058798
a_and_b,0.114285,0.039049,0.21373,-0.080442,-0.088148,-0.197033,0.04906,0.106649,-0.01776,-0.002527,...,-0.111197,0.030113,-0.04453,0.090094,0.085388,0.165226,0.09831,0.08602,-0.143789,-0.04789


In [23]:
# (number of words, vector size) of the selected model.
model_word_vec_df[key].shape

(87750, 200)

In [24]:
# save the vocab from tf_idf vectorizer for future reference.
import csv
os.makedirs('./vocab_dict', exist_ok=True)
fname = './vocab_dict/vocab_dict_question_answer_'+str(int(size))+"_"+str(int(window))+"_"+str(int(min_count))+'.csv'
# newline='' is what the csv module asks for on files it writes to.
with open(fname, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    # One "word,column index" row per vocabulary entry. Written with writerows so
    # that no loop variable is needed: the previous loop used `key`, which
    # overwrote the global model key that later cells still rely on.
    writer.writerows(vectorizer.vocabulary_.items())

In [25]:
# Number of words known to the tf-idf vectorizer.
len(vectorizer.vocabulary_)

87750

In [None]:
# # The number of nonzero elements in a row in X should equal to the number 
# # of unique words in the corresponding doc or word2vec_question_corpus. 
# # The funciton check is used to check this.

# def check(key,i): # ith documents. doc_corpus a list of words
#     doc_corpus = word2vec_question_corpus[i]
#     df = model_word_vec_df[key]
#     related_rows = df.loc[sorted(list(set(doc_corpus).intersection(set(vocab)))), :]  
#     words_index = set(np.where(df.index.isin(set(doc_corpus).intersection(set(vocab))))[0])
#     # print(related_rows.T)
#     weights_index = []
#     ind = sorted(X[i,:].nonzero()[1])
#     for j in ind:
#         weights_index.append(j)
   
#     if len(weights_index) - len(words_index) != 0:
#         print(i)
#         print(weights_index)
#         print(words_index)
#     if related_rows.shape[0] != len(weights_index):
#         print(i)
#         print("*********")
#     return 
    

In [None]:
# Point `key` back at the model in use. Only one model is trained by the
# parameter grid above, so it sits at position 0; position 1 raised IndexError.
key = list(models.keys())[0]
print(key)

In [None]:
# total = Q_df.shape[0]
# current = 0
# for i in range(Q_df.shape[0]):
#     check(key,i)
    
    

In [None]:
# #vectorizer.vocabulary_['a_']  # =0
# model_word_vec_df[key].index[0] # = 'a_'

In [None]:
def save_obj(obj, name ):
    """Pickle `obj` to doc_vec/<name>.pkl, creating the doc_vec folder if needed."""
    # Without this the save fails when the folder has not been created by hand.
    os.makedirs('doc_vec', exist_ok=True)
    with open('doc_vec/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f)

def load_obj(name ):
    """Read back the object stored in doc_vec/<name>.pkl."""
    path = 'doc_vec/' + name + '.pkl'
    # Unpickling can run arbitrary code: only load files this notebook wrote.
    with open(path, 'rb') as f:
        loaded = pickle.load(f)
    return loaded

In [None]:
# Sanity check on document 0: the non-zero tf-idf columns, in increasing order,
# should be exactly the columns of the document's vocabulary words in sorted order.
# related_rows is built here because it is otherwise only a local variable of
# get_doc_weighted_vec, and the model is looked up directly so the cell does not
# depend on the current value of `key`.
first_model_key = list(models.keys())[0]
related_rows = model_word_vec_df[first_model_key].loc[
    sorted(set(word2vec_question_corpus[0]).intersection(vocab)), :]
ind = sorted(X[0,:].nonzero()[1])
print(len(ind))
# Compare pairwise instead of assuming the document has exactly 11 words.
for column, word in zip(ind, related_rows.index):
    print(column == vectorizer.vocabulary_[word])
    

In [None]:
# ith documents. doc_corpus a list of words 
# only do this for questions
def get_doc_weighted_vec(key,i, weighted = True): 
    """Return the vector of question `i` as a weighted sum of its word vectors.

    Parameters
    ----------
    key : str
        Name of the model whose word vectors (model_word_vec_df[key]) are used.
    i : int
        Position of the question in word2vec_question_corpus; row i of the
        tf-idf matrix X is used for the same document.
    weighted : bool
        True (default): weight each distinct word by its tf-idf value in row i
        of X, rescaled so the weights sum to one. False: give every distinct
        word the same weight (repeated words are counted once).

    Returns
    -------
    pandas.Series
        One value per vector component. A document without any known word
        yields all zeros.
    """
    doc_corpus = word2vec_question_corpus[i]
    df = model_word_vec_df[key]
    # Distinct words of the document that have a vector, in sorted order. Testing
    # against the index avoids rebuilding a set of the whole vocabulary per call.
    known_words = sorted(word for word in set(doc_corpus) if word in df.index)
    related_rows = df.loc[known_words, :]

    if weighted:
        # Look each word's column up by name, so a weight can never be paired
        # with the wrong word; the earlier version relied on the sorted words and
        # the sorted non-zero columns happening to line up.
        columns = [vectorizer.vocabulary_[word] for word in known_words]
        weights = np.array([X[i, column] for column in columns])
        weights = weights/sum(weights) # scale so the sum of weights is one.
    else:
        # to correct: some words may appear more than once. But since we are only interested in the weighted vec, I will correct this later.
        n_words = related_rows.shape[0]
        weights = [1/n_words] * n_words if n_words else []

    # related_rows.T has one column per word; scale each column by its weight.
    result = related_rows.T * weights
    return result.sum(axis = 1) # weighted sum as the doc vector.

In [None]:
# Compute the tf-idf weighted vector of every question, with a progress bar.
# NOTE(review): `key` must hold the model name here — confirm that no earlier
# cell has reused the name for something else.
questions_vec_weighted = []
total = len(word2vec_question_corpus)
current = 0
for i in range(len(word2vec_question_corpus)):
    show_work_status(1, total, current)
    current += 1
    questions_vec_weighted.append(get_doc_weighted_vec(key,i))


# Number of documents and length of one document vector.
print(len(questions_vec_weighted))
print(len(questions_vec_weighted[0]))

# Pickle the list of vectors, then read it straight back to check the round trip.
save_obj(questions_vec_weighted, "questions_weighted_vec_"+key)
questions_vec_weighted = load_obj("questions_weighted_vec_"+key)

print(len(questions_vec_weighted))
print(len(questions_vec_weighted[0]))

# Same data as a DataFrame, one row per question.
questions_vec_weighted_df = pd.DataFrame(questions_vec_weighted)
print(questions_vec_weighted_df.head())
print(questions_vec_weighted_df.shape)

# Also export the document vectors as CSV next to the pickle.
questions_vec_weighted_df.to_csv('doc_vec/questions_weighted_vec_'+key+'.csv')