## Load data for training word embeddings
<hr>

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Load the processed question dataset; its `corpus` column is what the word embeddings are trained on below.

data = pd.read_csv('data/processed_data_model1.csv')
print(data.shape)
data.head()

(58887, 9)


Unnamed: 0,title,corpus,content,tags,score,answers,polarity,subjectivity,processed_title
0,Specifying a mySQL ENUM in a Django model,specifying mysql enum django model go specifyi...,specifying mysql enum django model go specifyi...,python|mysql|django|django-models|enums,0.044573,From the Django documentation:,0.0,0.0,specifying mysql enum django model
1,"How do content discovery engines, like Zemanta...",content discovery engines like zemanta open ca...,content discovery engines like zemanta open ca...,python|ruby|semantics|zemanta,0.002254,Michal Finkelstein from OpenCalais here.,0.0,0.0,content discovery engines like zemanta open ca...
2,Install Python to match directory layout in OS...,install python match directory layout os x 105...,install python match directory layout os x 105...,python|macos|64-bit,-0.000657,"Hyposaurus,",0.0,0.0,install python match directory layout os x 105
3,Why does Python's iter() on a mapping return i...,python iter mapping return iterkeys instead it...,python iter mapping return iterkeys instead it...,python|mapping|iteration,0.001134,Check out this thread for a discussion on the ...,-0.2,0.85,python iter mapping return iterkeys instead it...
4,Javascript equivalent of Python's locals()?,javascript equivalent python locals python one...,javascript equivalent python locals python one...,javascript|python,0.003821,"Well, I don't think that there is something li...",0.0,0.0,javascript equivalent python locals


## Training word embeddings
<hr>

In [3]:
import multiprocessing
from gensim.models import Word2Vec

In [4]:
# Number of CPUs reported by the OS; used below to size the Word2Vec worker pool.
cores = multiprocessing.cpu_count()
print(cores)

8


In [5]:
import gensim

# Configure the Word2Vec model; the vocabulary is built and training is run in the next cell.
# One core is left free by using `cores - 1` workers.
w2v_model = Word2Vec(
    min_count=0,
    window=10,
    size=300,
    workers=cores - 1,
)

# One token list per row of the `corpus` column, split on whitespace.
corpus = [document.split() for document in np.array(data.corpus)]

In [6]:
# Train the word embeddings: the vocabulary must be built from the corpus
# before train() is called on that same corpus.

w2v_model.build_vocab(corpus)
words = w2v_model.wv.vocab.keys()
vocab_size = len(words)
print("vocab size",vocab_size)
# Last expression of the cell: the value returned by train() is displayed as the cell output.
w2v_model.train(corpus, total_examples=len(corpus), epochs=32)

vocab size 248929


(81480977, 87125344)

In [7]:
w2v_model.save('models/related_questions_model.bin') # persist the trained model to disk

## Sanity check on the trained word embeddings
<hr>

In [8]:
# Sanity check: list the terms the trained model ranks as most similar to "flask".
print('Terms most similar to "flask" \n')
print(w2v_model.wv.most_similar("flask"))

Terms most similar to "flask" 

[('pyramid', 0.47796744108200073), ('sqlalchemy', 0.44483625888824463), ('blueprints', 0.44131705164909363), ('django', 0.43050113320350647), ('werkzeug', 0.42923790216445923), ('fieldlist', 0.4247692823410034), ('url_for', 0.41096389293670654), ('clipy', 0.4070345163345337), ('routes', 0.4053919315338135), ('fastapi', 0.400310218334198)]


In [9]:
# Sanity check: list the terms the trained model ranks as most similar to "django".
print('Terms most similar to "django" \n')
print(w2v_model.wv.most_similar("django"))

Terms most similar to "django" 

[('web2py', 0.4499993324279785), ('drf', 0.4495217204093933), ('flask', 0.43050113320350647), ('python', 0.42775559425354004), ('wagtail', 0.4143940508365631), ('harini', 0.4106205701828003), ('pyramid', 0.4066495895385742), ('template', 0.39889439940452576), ('modelspy', 0.3987653851509094), ('forbidden_view_config', 0.39538854360580444)]


In [10]:
# Sanity check: list the terms the trained model ranks as most similar to "api".
print('Terms most similar to "api" \n')
print(w2v_model.wv.most_similar("api"))

Terms most similar to "api" 

[('apis', 0.5788638591766357), ('service', 0.47236400842666626), ('twitter', 0.46291738748550415), ('restful', 0.4628230035305023), ('endpoint', 0.4458930194377899), ('endpoints', 0.4392290711402893), ('oauth', 0.43464982509613037), ('goole', 0.4340834617614746), ('interface', 0.42327064275741577), ('services', 0.4197748303413391)]


## Relevant questions retrieval model
<hr>

In [11]:
import os
import numpy as np
import pandas as pd

## Load data
<hr>

In [12]:
# Reload the processed dataset for the retrieval stage (same file as the training stage).
# NOTE(review): the bare `data` expression renders the whole frame (truncated by pandas);
# `data.head()` would keep the saved output shorter.
data = pd.read_csv('data/processed_data_model1.csv')
data

Unnamed: 0,title,corpus,content,tags,score,answers,polarity,subjectivity,processed_title
0,Specifying a mySQL ENUM in a Django model,specifying mysql enum django model go specifyi...,specifying mysql enum django model go specifyi...,python|mysql|django|django-models|enums,0.044573,From the Django documentation:,0.000000,0.000000,specifying mysql enum django model
1,"How do content discovery engines, like Zemanta...",content discovery engines like zemanta open ca...,content discovery engines like zemanta open ca...,python|ruby|semantics|zemanta,0.002254,Michal Finkelstein from OpenCalais here.,0.000000,0.000000,content discovery engines like zemanta open ca...
2,Install Python to match directory layout in OS...,install python match directory layout os x 105...,install python match directory layout os x 105...,python|macos|64-bit,-0.000657,"Hyposaurus,",0.000000,0.000000,install python match directory layout os x 105
3,Why does Python's iter() on a mapping return i...,python iter mapping return iterkeys instead it...,python iter mapping return iterkeys instead it...,python|mapping|iteration,0.001134,Check out this thread for a discussion on the ...,-0.200000,0.850000,python iter mapping return iterkeys instead it...
4,Javascript equivalent of Python's locals()?,javascript equivalent python locals python one...,javascript equivalent python locals python one...,javascript|python,0.003821,"Well, I don't think that there is something li...",0.000000,0.000000,javascript equivalent python locals
...,...,...,...,...,...,...,...,...,...
58882,Comparing values with __eq__ in Python,comparing values _ _ eq _ _ python good day ni...,comparing values _ _ eq _ _ python good day ni...,python|magic-methods,-0.001329,By implementing like this you're basically te...,-0.083333,0.283333,comparing values _ _ eq _ _ python
58883,How to get PYQT5 text box to be taken as an in...,get pyqt5 text box taken input sqlite trying c...,get pyqt5 text box taken input sqlite trying c...,python|sqlite|pyqt|pyqt5|qtsql,-0.001105,"You can‘t get the value in your SQL statement,...",0.000000,0.000000,get pyqt5 text box taken input sqlite
58884,How to assign one argument to function and mak...,assign one argument function make list later q...,assign one argument function make list later q...,python|tuples|user-defined-functions,-0.001329,I think you want this,0.000000,0.000000,assign one argument function make list later
58885,Iterate over a list based on list based on a l...,iterate list based list based list steps want ...,iterate list based list based list steps want ...,python,0.000686,You can make a simple for loop and keep track ...,-0.055556,0.307937,iterate list based list based list steps


In [13]:
import re
import nltk
import inflect  # NOTE(review): not referenced by any cell visible in this notebook
from nltk.corpus import stopwords

# Text-processing setup: load the small English spaCy pipeline; the visible cells only use its tokenizer.

import spacy
en = spacy.load('en_core_web_sm')

import nltk  # NOTE(review): duplicate of the import at the top of this cell
# Download the NLTK stopword lists that remove_stopwords reads.
nltk.download('stopwords')

#tokenizer

def tokenize(text):
    """Split ``text`` with the spaCy tokenizer and lowercase every token.

    Tokens flagged as whitespace by spaCy are dropped.

    Returns:
        list[str]: the lowercased token texts, in their original order.
    """
    lowered = []
    for tok in en.tokenizer(text):
        if tok.is_space:
            continue
        lowered.append(tok.text.lower())
    return lowered

#remove punctuations

def remove_punctuation(words):
    """Strip punctuation from each token and drop tokens that become empty.

    Every character that is neither a word character (``\\w``) nor
    whitespace is removed from each item of ``words``.

    Returns:
        list[str]: the cleaned tokens, in their original order.
    """
    stripped = (re.sub(r'[^\w\s]', '', word) for word in words)
    return [token for token in stripped if token != '']

#remove stopwords

def remove_stopwords(words):
    """Return the items of ``words`` that are not English stopwords.

    The NLTK stopword list is read once per call and turned into a set,
    instead of being re-read and scanned linearly for every single word.

    Args:
        words: iterable of tokens; they are compared as-is against the
            NLTK English stopword list.

    Returns:
        list[str]: the non-stopword tokens, in their original order.
    """
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def normalize(words):
    """Clean a token list: strip punctuation first, then drop stopwords."""
    return remove_stopwords(remove_punctuation(words))

def process_text(text):
    """Tokenize and normalize ``text``, returning the surviving tokens joined by single spaces."""
    tokens = tokenize(text)
    kept = normalize(tokens)
    return ' '.join(kept)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bhavana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Import saved word embeddings
<hr>

In [14]:
# Disabled snippet for reloading the saved word embeddings. It is wrapped in a
# string literal, so none of it runs; the cell only echoes the text back.
# NOTE(review): before re-enabling, load the model with `Word2Vec.load(path)`;
# passing the path to the `Word2Vec(...)` constructor would presumably treat it
# as the training corpus rather than a file to read (confirm against the gensim docs).

'''import gensim
w2v_model = gensim.models.word2vec.Word2Vec('models/related_questions_model.bin')
words = w2v_model.wv.vocab.keys()
vocab_size = len(words)
print("vocab size",vocab_size)'''

'import gensim\nw2v_model = gensim.models.word2vec.Word2Vec(\'models/related_questions_model.bin\')\nwords = w2v_model.wv.vocab.keys()\nvocab_size = len(words)\nprint("vocab size",vocab_size)'

## Calculate sentence embeddings
<hr>

In [15]:
#question to numerical vector: mean of the trained word embeddings of its words

def question_to_vec(question, embeddings, dim = 300):
    """Embed a question as the mean of the vectors of its known words.

    Args:
        question: Text of the question. It is converted with ``str()`` and
            split on whitespace.
        embeddings: Object exposing a ``wv`` attribute that supports
            ``word in wv`` and ``wv[word]`` (e.g. the trained Word2Vec model).
        dim: Length of the word vectors, and therefore of the result.

    Returns:
        A NumPy array of length ``dim``: the average of the vectors of every
        word of ``question`` found in ``embeddings.wv``, or all zeros when
        none of the words is found.
    """
    question_embedding = np.zeros(dim)
    valid_words = 0
    for word in str(question).split():
        if word in embeddings.wv:
            valid_words += 1
            question_embedding += embeddings.wv[word]
    # Average only once every word has been visited. Returning from inside
    # the loop would make the result depend on the first word alone.
    if valid_words > 0:
        return question_embedding / valid_words
    return question_embedding

In [23]:
# Vectorize every processed question title and save the result as four CSV parts.

title_embeddings = np.array([
    question_to_vec(question = title, embeddings = w2v_model)
    for title in data.processed_title
])
embeddings = pd.DataFrame(data = title_embeddings)

# (output file, row slice) pairs; the last slice runs to the end of the frame.
csv_parts = [
    ('models/title_embeddings1.csv', slice(0, 15000)),
    ('models/title_embeddings2.csv', slice(15000, 30000)),
    ('models/title_embeddings3.csv', slice(30000, 45000)),
    ('models/title_embeddings4.csv', slice(45000, None)),
]
for csv_path, rows in csv_parts:
    embeddings[rows].to_csv(csv_path, index=False)

print(w2v_model)

Word2Vec(vocab=248929, size=300, alpha=0.025)


## Cosine similarity
<hr>

In [24]:
from html import escape

from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import HTML, display

query = 'combine two lists'
processed_query = process_text(query)

results_returned = 5

# Vectorize the user query; wrapped in a list so the array is 2-D (a single row).
query_vect = np.array([question_to_vec(processed_query, w2v_model)])

# Cosine similarity between the query and every title embedding.
cosine_similarities = pd.Series(cosine_similarity(query_vect, title_embeddings)[0])

# Custom similarity measure: 90% cosine similarity, 5% `score` column, 5% `polarity` column.
cosine_similarities = (90*cosine_similarities + 5*data.score + 5*(data.polarity))/100

# Highlight whole-word matches only. A substring test against the raw query
# string would also bold fragments such as "in" (inside "combine").
query_terms = set(query.lower().split())

result_blocks = []
# Series.items() is used because iteritems() is not available in newer pandas releases.
for row_label, similarity in cosine_similarities.nlargest(results_returned).items():
    words_html = []
    for word in data.title[row_label].split():
        # Titles are raw text; escape them so characters like "<" cannot break the markup.
        safe_word = escape(word)
        if word.lower() in query_terms:
            words_html.append(" <b>" + safe_word + "</b>")
        else:
            words_html.append(" " + safe_word)
    result_blocks.append(
        '<p style="font-family:verdana; font-size:110%;"> '
        + 'similarity score ' + str(similarity) + '<br>'
        + ''.join(words_html)
        + "</p><hr>"
    )

output = '<h3>Results:</h3>' + ''.join(result_blocks)
display(HTML(output))