In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import nltk
import os
from tqdm.auto import tqdm
tqdm.pandas()

from pprint import pprint

from collections import Counter
import pickle

from nltk.corpus import stopwords
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models import FastText
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from sklearn.neighbors import NearestNeighbors

stop_words = set(stopwords.words('english'))

In [4]:
base_dir = '../../corpus/python-stackoverflow/'

In [None]:
qdf = pd.read_csv(
    os.path.join(base_dir, 'Questions.csv'), 
    encoding = "ISO-8859-1", usecols=['Id', 'Title', 'Body']
)

# adf = pd.read_csv(
#     os.path.join(base_dir, 'Answers.csv'), 
#     encoding = "ISO-8859-1", usecols=['Id', 'Body']
# )

In [None]:
# print('>>> Q:', qdf.iloc[0, 2], "\n")
# print('>>> A:', adf.iloc[0, 1], "\n")

### Pre-processing

In [None]:
def beautify(html_text):
    soup = BeautifulSoup(html_text, 'lxml')
    return ' '.join([t.text for t in soup.find_all('p')]) # concat all p tags

def sent_to_words(sentences):
    for sentence in sentences:
        yield(simple_preprocess(str(sentence), deacc=True))

In [11]:
### DUMP
# question_list = qdf['Body'].progress_apply(beautify).values.tolist()
# question_words = [list(filter(lambda w : w not in stop_words, s)) for s in sent_to_words(question_list)]
# pickle.dump(question_words, open(os.path.join(base_dir, 'question_words_clean.pickle'), 'wb'))

### LOAD
question_words = pickle.load(open(os.path.join(base_dir, 'question_words_clean.pickle'), 'rb'))
print(len(question_words))

q_vocab = Counter()
for s in tqdm(question_words):
    for w in s:
        q_vocab[w] += 1
        
x = []
for s in tqdm(question_words):
    for w in s:
        if q_vocab[w] >= 100:
            x += [w]
print(len(x))

# ---

# answer_list = adf['Body'].progress_apply(beautify).values.tolist()
# answer_words   = [list(filter(lambda w : w not in stop_words, s)) for s in sent_to_words(answer_list)]

607282


HBox(children=(IntProgress(value=0, max=607282), HTML(value='')))




HBox(children=(IntProgress(value=0, max=607282), HTML(value='')))


26592971


In [25]:
needed = [
    'call', 'function', 'arguments', 'parameter', 'method', 'invoke', 'assign', 'return',
    'variable', 'functions', 'argument', 'parameters', 'methods', 'execute', 'pass',
    'change', 'extract', 'sort'
]

topw = [w for w, x in q_vocab.most_common() if ]

print(set(needed) - set(topw))

{'function', 'execute', 'parameter', 'return', 'extract', 'pass', 'sort', 'method', 'functions', 'assign', 'call', 'variable', 'invoke', 'arguments', 'methods', 'argument', 'change', 'parameters'}


## Embeddings

### word2vec

In [6]:
n = 50
model = Word2Vec(question_words, size=n, window=8, min_count=100)

model.train(question_words, total_examples=len(question_words), epochs=10)

(241142968, 278505320)

In [8]:
print("vocab size: %d\n\n" % len(model.wv.vocab))

word = 'python'
print("> similar to '%s':" % word)
pprint(model.wv.most_similar(positive=word, topn=20))

vocab size: 9736


> similar to 'python':
[('pyhton', 0.7207606434822083),
 ('lua', 0.645491361618042),
 ('ironpython', 0.6337897777557373),
 ('jython', 0.6129292249679565),
 ('standalone', 0.5896718502044678),
 ('java', 0.5873558521270752),
 ('scripting', 0.5789278149604797),
 ('library', 0.5697588920593262),
 ('perl', 0.5666365027427673),
 ('cygwin', 0.5611815452575684),
 ('iron', 0.5604820251464844),
 ('pythons', 0.5481476783752441),
 ('sage', 0.5451010465621948),
 ('swift', 0.5423818230628967),
 ('gem', 0.5403555035591125),
 ('comtypes', 0.5399928092956543),
 ('powershell', 0.5346354842185974),
 ('ruby', 0.5337449312210083),
 ('nodejs', 0.5331274271011353),
 ('julia', 0.5294690132141113)]


## FastText
The main difference of FastText from Word2Vec is that it uses sub-word information (i.e character n-grams). While it brings additional utility to the embeddings, it also considerably slows down the process. 

In [None]:
ft_model = FastText(question_words, size=n, window=8, min_count=5, workers=2,sg=1)

In [None]:
word = 'call'
print("> similar to '%s':" % word)
pprint(ft_model.wv.most_similar(positive=word, topn=20))

## Similarity

### TF-IDF

In [None]:
#dct = Dictionary(filtered_questions)  # fit dictionary
#corpus = [dct.doc2bow(line) for line in filtered_questions]  # convert corpus to BoW format
#tfidf_model = TfidfModel(corpus)  # fit model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(question_list)
print(X.shape)

In [None]:
#To proprely work with scikit's vectorizer
merged_questions = [' '.join(question) for question in filtered_questions]
document_names = ['Doc {:d}'.format(i) for i in range(len(merged_questions))]

def get_tfidf(docs, ngram_range=(1,1), index=None):
    vect = TfidfVectorizer(stop_words='english', ngram_range=ngram_range)
    tfidf = vect.fit_transform(docs).todense()
    return pd.DataFrame(tfidf, columns=vect.get_feature_names(), index=index).T

tfidf = get_tfidf(merged_questions, ngram_range=(1,1), index=document_names)

### Centroid Function

In [None]:
def get_sent_embs(emb_model):
    sent_embs = []
    for desc in range(len(filtered_questions)):
        sent_emb = np.zeros((1, n))
        if len(filtered_questions[desc]) > 0:
            sent_emb = np.zeros((1, n))
            div = 0
            model = emb_model
            for word in filtered_questions[desc]:
                if word in model.wv.vocab and word in tfidf.index:
                    word_emb = model.wv[word]
                    weight = tfidf.loc[word, 'Doc {:d}'.format(desc)]
                    sent_emb = np.add(sent_emb, word_emb * weight)
                    div += weight
                else:
                    div += 1e-13 #to avoid dividing by 0
        if div == 0:
            print(desc)

        sent_emb = np.divide(sent_emb, div)
        sent_embs.append(sent_emb.flatten())
    return sent_embs

In [None]:
ft_sent = get_sent_embs(emb_model = ft_model) 

## Finding Similar Questions
Now we have sentence embeddings which in theory should reflect the similarity of some questions. To check if this assumption is valid, let's pick a question and find top 5 similar questions (knearest neighbours) as measured by cosine distance.

In [None]:
def get_n_most_similar(interest_index, embeddings, n):
    """
    Takes the embedding vector of interest, the list with all embeddings, and the number of similar questions to 
    retrieve.
    Outputs the disctionary IDs and distances
    """
    nbrs = NearestNeighbors(n_neighbors=n, metric='cosine').fit(embeddings)
    distances, indices = nbrs.kneighbors(embeddings)
    similar_indices = indices[interest_index][1:]
    similar_distances = distances[interest_index][1:]
    return similar_indices, similar_distances

def print_similar(interest_index, embeddings, n):
    """
    Convenience function for visual analysis
    """
    closest_ind, closest_dist = get_n_most_similar(interest_index, embeddings, n)
    print('Question %s \n \n is most similar to these %s questions: \n' % (question_list[interest_index], n))
    for question in closest_ind:
        print('ID ', question, ': ',question_list[question])

In [None]:
print_similar(42, ft_sent, 5)

Results are quite interesting. All of the questions are about some kind of text processing. Not exactly repeating questions, but we are definitely onto something. Possible explanation for a weak perfromance is that questions are too long and the final embedding is influenced by too much noise. My hope was that tf-idf score would counteract this, but apparently this is not the case. However, for shorter texts, this method works quite well. 

Next appraoch will be a more complicated (in terms of theory, not implementation) model called __Doc2Vec__. 

## Doc2Vec
Doc2Vec improves on simple averaging method by training a 'document' vector along the word vectors. As in Word2Vec there are two algortihms available to train the model, but I will be using the 'distributed memory' (that's why dm=1 in my model). It trains a model which predicts a word based on its context, by averaging the context word and paragraph ID vectors.  

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(filtered_questions)]
model = Doc2Vec(documents, vector_size=n, window=8, min_count=5, workers=2, dm = 1, epochs=20)

In [None]:
print(question_list[42], ' \nis similar to \n')
print([question_list[similar[0]] for similar in model.docvecs.most_similar(42)])

Results are less than impressive. Some results are about string manipulations or SQL, but Doc2Vec has failed to capture the main meaning of the reference question. 

From the current analysis I can conclude that with current parameters, __Centroid Method outperforms Doc2Vec__. Here's is another example of similar questions being close to each-other under the Centroid Method Embedding.

In [None]:
print_similar(101, ft_sent, 5)

Next steps to improve embeddings would be to:
* Add more tags to Doc2Vec which, in theory, would push questions with similar tags closer together
* Concatenate question headers and code parts with question text 
* Experiment with more questions (now we are training on a limited dataset)