In [2]:
    
import os
import re
import math
import random
import warnings
from multiprocessing import cpu_count
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import gensim
from nltk.tokenize import word_tokenize
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
import dateutil.parser

%matplotlib inline

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\glin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load every input table used by the notebook.
# The question, answer and comment exports are tab-separated UTF-8 files;
# the tag list is read with pandas' default (comma) separator.

qs = pd.read_csv('PostQuestionsFiltered_V4_parsed.tsv', sep='\t', encoding='utf-8')
answers = pd.read_csv('PostAnswersFiltered_V4_parsed.tsv', sep='\t', encoding='utf-8')
comments = pd.read_csv('CommentsFiltered_v3.tsv', sep='\t', encoding='utf-8')
tags = pd.read_csv('tags.csv')

## Preprocess Data

In [13]:
# Tag columns of `qs` that get collapsed into one space-separated string per
# question. A tag is included for a row when that row's value in the tag's
# column equals 1.
keywords = [
   'javascript', 'java', 'c#', 'php', 'python', 'c++',
       'node.js', 'objective-c', 'vb.net', 'scala', 'matlab', 'perl', 'delphi',
       'matplotlib', 'animation', 'd3', 'ggplot2', 'plot', 'graph', 'chart',
       'highcharts', 'vbscript', 'colors', 'pyspark', 'dplyr', 'f#', '3d',
       'sas', 'fortran', 'maps', 'lisp', 'julia', 'powerbi', 'drawing', 'line',
       'plotly', 'bar-chart', 'visualization', 'tableau', 'seaborn',
       'geospatial', 'stata', 'plyr', 'pie-chart', 'graphviz', 'spss',
       'diagram', 'qlikview', 'altair'
]

# Compare all tag columns against 1 in a single vectorized step instead of
# walking the frame with iterrows(), which builds a Series for every row.
# The resulting strings keep the tag order of `keywords`.
tag_flags = qs[keywords].eq(1).to_numpy()
row_keywords = [
    " ".join(keyword for keyword, present in zip(keywords, flags) if present)
    for flags in tag_flags
]

# Preview the first few tag strings.
row_keywords[0:10]

['plot',
 'graph',
 'c++ graph',
 'c# chart',
 'line',
 'ggplot2 plot',
 'graph',
 'chart',
 'graph',
 'visualization']

In [14]:
# Attach the per-question tag strings as a new column and persist the
# augmented question table (index included) for later cells to reload.
np_array_of_row_keywords = np.asarray(row_keywords)
qs["new_tags"] = np_array_of_row_keywords
qs.to_csv('new_qs.csv')

In [16]:
# Left-join each question to its accepted answer (questions without a
# matching answer row are kept with empty answer columns), then save the
# joined table.
combined = qs.merge(
    answers,
    how='left',
    left_on='accepted_answer_id',
    right_on='id',
)
combined.to_csv('combined.csv')

In [27]:
# Preview the first rows of the question/accepted-answer join.
combined.head()

Unnamed: 0,id_x,title_x,body_x,accepted_answer_id_x,answer_count_x,comment_count_x,community_owned_date_x,creation_date_x,favorite_count_x,last_activity_date_x,...,last_editor_display_name_y,last_editor_user_id_y,owner_display_name_y,owner_user_id_y,parent_id_y,post_type_id_y,score_y,tags_y,view_count_y,cleaned_body
0,15537402,using command on a gnplot script,<p>Im using a script on a mac bash shell that ...,,1,2,,2013-03-21 00:34:11.173000+00:00,,2013-03-21 00:44:12.973000+00:00,...,,,,,,,,,,
1,15856146,Applying Orthographic projection or frustum ef...,<p>I know that normalised coordinates should b...,15858157.0,1,0,,2013-04-06 21:21:10.723000+00:00,,2013-04-07 02:13:30.390000+00:00,...,,,,3758484.0,15856146.0,2.0,0.0,,,To my eyes it appears correct. If your screen...
2,15428854,How to implement both scalar and vector additi...,"<p>I'm working on a Vector2D class, and I thin...",15429296.0,3,0,,2013-03-15 09:33:56.357000+00:00,,2013-03-15 10:20:14.973000+00:00,...,,649665.0,,649665.0,15428854.0,2.0,1.0,,,It's not clear what you're trying to do. The...
3,15517350,how to increase speed of tchart refresh()?,<p>I have 16 graphs[maximum ] with 4 fastlines...,15526751.0,1,0,,2013-03-20 07:07:09.313000+00:00,0.0,2013-06-10 13:13:10.997000+00:00,...,,,,1309861.0,15517350.0,2.0,2.0,,,"I have made a simple code, where I have added..."
4,15445313,What does $ROOT mean in a Mac Terminal?,<p>I received some command line instructions:<...,15445328.0,2,0,,2013-03-16 03:45:52.347000+00:00,,2013-03-16 03:50:07.557000+00:00,...,,,,14860.0,15445313.0,2.0,1.0,,,"is an environment variable, plain and simple...."


### TF-IDF Recommend Questions

In [None]:
# Build the TF-IDF similarity index over all questions.

# One document per question: the title followed by its tag string. The
# separating space keeps the last title word and the first tag from fusing
# into a single token (e.g. "script" + "plot" -> "scriptplot"), and missing
# values are treated as empty text so tokenization never sees NaN.
raw_documents = qs['title'].fillna('') + ' ' + qs['new_tags'].fillna('')

# Tokenize and lower-case every document
gen_docs = [[w.lower() for w in word_tokenize(text)]
                for text in raw_documents]

# Create dictionary (token -> integer id)
dictionary = gensim.corpora.Dictionary(gen_docs)

# Create Document-Term Matrix (bag-of-words vector per document)
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]

# Create TF-IDF Model
tf_idf = gensim.models.TfidfModel(corpus)

# Create Similarity Checker
# NOTE(review): the first argument is passed as an empty string, exactly as
# before; confirm this is the intended location/prefix for gensim's index files.
similar_qs = gensim.similarities.Similarity("",tf_idf[corpus],num_features=len(dictionary))

In [25]:
class tfModel():
    """TF-IDF similarity index over question titles and tags.

    The model reads a question table from CSV, builds one text document per
    question (title followed by its ``new_tags`` string), and indexes the
    documents with gensim so that free-text queries can be matched against
    past questions.
    """

    def __init__(self, question_file):
        """Load questions from ``question_file`` and build the index.

        Parameters
        ----------
        question_file : str or path-like
            CSV file containing at least the ``title`` and ``new_tags``
            columns.
        """
        qs = pd.read_csv(question_file)
        # Missing tag strings are replaced with '' so they can be
        # concatenated and tokenized.
        qs = qs.fillna(value={'new_tags': ''})
        # Title and tags are joined with a space so the last title word and
        # the first tag do not fuse into one token. A missing title is
        # treated as empty text for indexing only; the stored table keeps the
        # original title values.
        raw_documents = qs['title'].fillna('') + ' ' + qs['new_tags']
        self.questions = qs
        tokenized_docs = [[w.lower() for w in word_tokenize(text)]
                            for text in raw_documents]
        self.dictionary = gensim.corpora.Dictionary(tokenized_docs)
        corpus = [self.dictionary.doc2bow(tokenized_doc) for tokenized_doc in tokenized_docs]
        self.tf_idf = gensim.models.TfidfModel(corpus)
        self.similarity_checker = gensim.similarities.Similarity("",self.tf_idf[corpus],num_features=len(self.dictionary))

    def get_similar_documents(self, query, num_results=5, threshold=0.10):
        """Return the titles of the indexed questions most similar to ``query``.

        Parameters
        ----------
        query : str
            Free-text query; it is tokenized and lower-cased the same way as
            the indexed documents.
        num_results : int, default 5
            Maximum number of titles to return.
        threshold : float, default 0.10
            Minimum similarity a question must reach to be returned.

        Returns
        -------
        pandas.Series
            Titles of the matching questions, most similar first. Empty when
            no question reaches ``threshold``.
        """
        tokenized_query = [w.lower() for w in word_tokenize(query)]
        query_bag_of_words = self.dictionary.doc2bow(tokenized_query)
        query_tf_idf = self.tf_idf[query_bag_of_words]
        question_similarities = self.similarity_checker[query_tf_idf]

        # Score a copy so the stored question table is never modified.
        questions_copy = self.questions.copy()
        questions_copy['similarity'] = question_similarities
        questions_above_threshold_similarity = questions_copy[questions_copy['similarity'] >= threshold]
        questions_above_threshold_similarity = questions_above_threshold_similarity.sort_values('similarity',ascending=False)

        return questions_above_threshold_similarity['title'].head(num_results)

def get_document_similarities_for_query(query, sim_threshold=0.10, num_results=5):
    """Return the titles of past questions most similar to ``query``.

    Uses the module-level ``dictionary``, ``tf_idf`` and ``similar_qs``
    objects built for the ``qs`` question table.

    Parameters
    ----------
    query : str
        Free-text query; tokenized and lower-cased before lookup.
    sim_threshold : float, default 0.10
        Minimum similarity a question must reach to be returned.
    num_results : int, default 5
        Maximum number of titles to return.

    Returns
    -------
    pandas.Series
        Titles of the matching questions, most similar first.

    Notes
    -----
    As a side effect, the ``Similarity`` column of the global ``qs`` frame is
    overwritten with the scores for this query.
    """
    query_doc = [w.lower() for w in word_tokenize(query)]
    query_doc_bow = dictionary.doc2bow(query_doc)
    query_doc_tf_idf = tf_idf[query_doc_bow]
    q_sim = similar_qs[query_doc_tf_idf]

    qs['Similarity'] = q_sim
    ques = qs[qs['Similarity'] >= sim_threshold]
    # Sort the filtered frame (not `qs`), otherwise the threshold is discarded.
    ques = ques.sort_values('Similarity', ascending=False)

    return ques['title'].head(num_results)

def similar_ques(query_list, sim_threshold=0.10, num_results=5):
    """Find similar past questions for every query in ``query_list``.

    Uses the module-level ``dictionary``, ``tf_idf`` and ``similar_qs``
    objects built for the ``qs`` question table.

    Parameters
    ----------
    query_list : iterable of str
        Free-text queries; each is tokenized and lower-cased before lookup.
    sim_threshold : float, default 0.10
        Minimum similarity a question must reach to be returned.
    num_results : int, default 5
        Maximum number of titles returned per query.

    Returns
    -------
    list
        Flat list alternating each query string with a ``pandas.Series`` of
        the matching titles (most similar first):
        ``[query_1, titles_1, query_2, titles_2, ...]``.

    Notes
    -----
    As a side effect, the ``Similarity`` column of the global ``qs`` frame is
    overwritten on every iteration and ends up holding the scores of the
    last query.
    """
    results = []
    for input_query in query_list:
        query_doc = [w.lower() for w in word_tokenize(input_query)]
        query_doc_bow = dictionary.doc2bow(query_doc)
        query_doc_tf_idf = tf_idf[query_doc_bow]
        q_sim = similar_qs[query_doc_tf_idf]

        qs['Similarity'] = q_sim
        ques = qs[qs['Similarity'] >= sim_threshold]
        # Sort the filtered frame (not `qs`), otherwise the threshold is
        # discarded and unrelated questions are returned.
        ques = ques.sort_values('Similarity', ascending=False)

        result = ques['title'].head(num_results)
        results.append(input_query)
        results.append(result)

    return results