In [None]:
    
import os
import re
import math
import random
import warnings

from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import gensim
from nltk.tokenize import word_tokenize
import dateutil.parser

import time
import pickle


%matplotlib inline


In [10]:
# Load the parsed question and answer posts (tab-separated, UTF-8).
# keep_default_na=False turns off pandas' default set of NaN marker strings.
tsv_options = dict(delimiter='\t', keep_default_na=False, encoding='utf-8')
qs = pd.read_csv('data/stackoverflow/PostQuestionsFiltered_V5_parsed.tsv', **tsv_options)
answers = pd.read_csv('data/stackoverflow/PostAnswersFiltered_V5_parsed.tsv', **tsv_options)


In [13]:
# Join every answer (accepted or not) to its parent question.
# Columns present in both frames get pandas' default suffixes:
# _x for the answer side (left) and _y for the question side (right).
combined = answers.merge(qs, how='inner', left_on='parent_id', right_on='id')

In [15]:
# Preview the first rows of the merged answers/questions frame.
combined.head()

Unnamed: 0,id_x,title_x,body_x,accepted_answer_id_x,answer_count_x,comment_count_x,community_owned_date_x,creation_date_x,favorite_count_x,last_activity_date_x,...,last_editor_user_id_y,owner_display_name_y,owner_user_id_y,parent_id_y,post_type_id_y,score_y,tags_y,view_count_y,new_tags,cleaned_question_body
0,44632276,,<p>The correct syntax is this one:</p>\r\r\n\r...,,,0,,2017-06-19 13:43:45.580000+00:00,,2017-06-19 13:43:45.580000+00:00,...,5222773.0,,5222773.0,,1,0,elasticsearchneo4jproxygraphaware,713,graph,I am trying to install Graph-Aided Search to ...
1,43934125,,<p>You can download the plugin manually from t...,,,13,,2017-05-12 09:28:09.857000+00:00,,2017-05-12 09:28:09.857000+00:00,...,5222773.0,,5222773.0,,1,0,elasticsearchneo4jproxygraphaware,713,graph,I am trying to install Graph-Aided Search to ...
2,44632434,,<p>You can define the lines from the dataset w...,,,0,,2017-06-19 13:51:27.790000+00:00,,2017-06-19 13:51:27.790000+00:00,...,,,458646.0,,1,0,anychart,34,chart,I would like to add vertical lines to graph a...
3,44633649,,<p>The way matplotlib is working is that you h...,,,0,,2017-06-19 14:47:46.830000+00:00,,2017-06-19 14:47:46.830000+00:00,...,,,4125774.0,,1,0,pythonmatplotlib,361,python matplotlib plot,I want to create a generic plotting tool eg.:...
4,44633675,,"<p>I have figured out the answer, which is to ...",,,0,,2017-06-19 14:48:42.900000+00:00,,2017-06-19 14:48:42.900000+00:00,...,8050923.0,,8050923.0,,1,0,sqlplotapache-zeppelin,718,plot,I have three tables with time series data of ...


#### Time the creation of the model and response to one question across multiple corpus sizes

In [None]:
# Timing table filled in by the runs below: 10 rows, 2 columns, all zeros to start.
times = np.zeros(shape=(10, 2))

# Accumulator for the retrieval results of every test run below.
combrslts = pd.DataFrame()

# Queries to run through each model; the CSV has no header row, so the
# single column is named 'ques' here.
samp_ques = pd.read_csv(
    'data/stackoverflow/Sample Questions V 2.csv',
    header=None,
    names=['ques'],
    encoding='utf-8',
)
Query_List = list(samp_ques['ques'])

# define function for retrieving similar results
def similar_docs_combined_corpus(query_list,corpus,test_run,threshold,top_num_to_return):
    """Return the best-matching corpus rows for each query as one DataFrame.

    Relies on the module-level ``dictionary``, ``tf_idf`` and ``similar_docs``
    objects. The similarity scores are attached to ``corpus`` positionally, so
    those objects must have been built from the same documents, in the same
    row order, as ``corpus``.

    Parameters
    ----------
    query_list : iterable of str
        Queries to look up.
    corpus : pandas.DataFrame
        One row per indexed document; must have ``cleaned_body`` and
        ``title_y`` columns. The frame is not modified.
    test_run : str
        Label copied into the ``Test_Run`` column of every result row.
    threshold : float
        Minimum similarity a document needs to be reported.
    top_num_to_return : int
        Maximum number of documents reported per query.

    Returns
    -------
    pandas.DataFrame
        Columns ``Corpus_Size``, ``Test_Run``, ``Input_query``, ``Answer``,
        ``Related_Question`` and ``Similarity_Score``. Matches for a query are
        ordered by descending similarity. A query with no match at or above
        ``threshold`` contributes a single row whose ``Answer``,
        ``Related_Question`` and ``Similarity_Score`` are ``' '``. An empty
        ``query_list`` yields an empty frame with these columns.
    """
    result_columns = ['Corpus_Size', 'Test_Run', 'Input_query',
                      'Answer', 'Related_Question', 'Similarity_Score']
    corpus_size = len(corpus)

    # Collect one small frame per query and concatenate once at the end.
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # accumulated frame on every call.
    per_query_frames = []
    for input_query in query_list:
        query_doc = [w.lower() for w in word_tokenize(input_query)]
        query_doc_bow = dictionary.doc2bow(query_doc)
        query_doc_tf_idf = tf_idf[query_doc_bow]
        doc_sim = similar_docs[query_doc_tf_idf]

        # Score on a copy so the caller's frame is not left carrying the
        # similarities of whichever query happened to run last.
        scored = corpus.assign(Similarity=doc_sim)
        ranked = scored.sort_values('Similarity', ascending=False)
        top_matches = ranked[ranked['Similarity'] >= threshold].iloc[:top_num_to_return]

        if top_matches.empty:
            # Placeholder row so every query shows up in the output.
            result = pd.DataFrame([{'Corpus_Size': corpus_size,
                                    'Test_Run': test_run,
                                    'Input_query': input_query,
                                    'Answer': ' ',
                                    'Related_Question': ' ',
                                    'Similarity_Score': ' '}],
                                  columns=result_columns)
        else:
            match_count = len(top_matches)
            result = pd.DataFrame({'Corpus_Size': [corpus_size] * match_count,
                                   'Test_Run': [test_run] * match_count,
                                   'Input_query': [input_query] * match_count,
                                   'Answer': top_matches['cleaned_body'].tolist(),
                                   'Related_Question': top_matches['title_y'].tolist(),
                                   'Similarity_Score': top_matches['Similarity'].values},
                                  columns=result_columns)
        per_query_frames.append(result)

    if not per_query_frames:
        return pd.DataFrame(columns=result_columns)
    return pd.concat(per_query_frames, ignore_index=True)

# Time model construction plus retrieval for the first sample query, once per
# row of `times`, on random samples of 1000, 2000, ... rows of `combined`.
run_results = []
for run_idx in range(len(times)):
    num_obs = (run_idx + 1) * 1000
    comb_samp = combined.sample(n=num_obs, random_state=2019)

    # perf_counter is a monotonic clock intended for measuring intervals.
    sttime = time.perf_counter()

    # create the model for the given sample size using answer body + question tags

    raw_documents = comb_samp['cleaned_body'] + ' ' + comb_samp['new_tags']

    # Tokenizing data
    gen_docs = [[w.lower() for w in word_tokenize(text)]
                for text in raw_documents]

    # Create dictionary
    # NOTE: dictionary, tf_idf and similar_docs are read as module-level names
    # by similar_docs_combined_corpus, so they must keep these exact names.
    dictionary = gensim.corpora.Dictionary(gen_docs)

    # Create Document-Term Matrix
    corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]

    # Create TF-IDF Model
    tf_idf = gensim.models.TfidfModel(corpus)

    # Create Similarity Checker
    similar_docs = gensim.similarities.Similarity("", tf_idf[corpus], num_features=len(dictionary))

    test_run = 'TF-IDF on answers plus question tags, corpus size = ' + str(num_obs)

    results = similar_docs_combined_corpus([Query_List[0]], comb_samp, test_run, 0.10, 5)

    # Stop the clock once, so the printed and the stored value agree and the
    # bookkeeping below is not counted as model/retrieval time.
    elapsed = time.perf_counter() - sttime

    run_results.append(results)

    print("Model and question retrieval for", num_obs, "completed. Elapsed time:", elapsed, "seconds")

    times[run_idx, 0] = num_obs
    times[run_idx, 1] = elapsed

# add results to combined results dataframe (DataFrame.append was removed in
# pandas 2.0; concatenate once, skipping the accumulator while it is still empty)
previous_results = [] if combrslts.empty else [combrslts]
combrslts = pd.concat(previous_results + run_results, ignore_index=True)

dfTimes = pd.DataFrame(times, columns=['Sample_size', 'Time_In_Secs'])

In [45]:
# Build the timing table from the `times` array, naming its two columns.
dfTimes = pd.DataFrame(data=times, columns=['Sample_size', 'Time_In_Secs'])

In [46]:
# Write the timing table (dfTimes) to a CSV file, omitting the index column.
dfTimes.to_csv('data/stackoverflow/TF-IDF_run_times.csv', index=False)

In [48]:
# Reload the saved timing table from disk (UTF-8; pandas' default NaN markers disabled).
dfTimes = pd.read_csv(
    'data/stackoverflow/TF-IDF_run_times.csv',
    encoding='utf-8',
    keep_default_na=False,
)

In [49]:
# Display the timing table: one row per sample size with its elapsed seconds.
dfTimes

Unnamed: 0,Sample_size,Time_In_Secs
0,1000.0,1.716999
1,2000.0,4.296005
2,3000.0,5.555997
3,4000.0,7.015
4,5000.0,8.588094
5,6000.0,10.777513
6,7000.0,10.914999
7,8000.0,12.195004
8,9000.0,19.068591
9,10000.0,16.134516
