In [22]:
#!pip install pandas_gbq
import pandas as pd

project_id = 'test-pj-sunkyung'
query = 'select title, accepted_answer_id, answer_count, comment_count, favorite_count, score, tags, view_count \
         from `bigquery-public-data.stackoverflow.posts_questions` \
         where answer_count > 0 \
         order by view_count desc \
         limit 100'

question_df = pd.read_gbq(project_id=project_id, query=query, dialect='standard')
print(question_df.head)

<bound method NDFrame.head of                                                 title  accepted_answer_id  \
0   How do I undo the most recent local commits in...            927386.0   
1   How do I delete a Git branch locally and remot...           2003515.0   
2   How do I remove a particular element from an a...           5767357.0   
3   How do I find all files containing specific te...          16957078.0   
4               How do I redirect to another webpage?            506004.0   
..                                                ...                 ...   
95              How to resolve merge conflicts in Git            163659.0   
96  How to change an element's class with JavaScript?            196038.0   
97  What does "Could not find or load main class" ...          18093929.0   
98  How to align a <div> to the middle (horizontal...            953934.0   
99       How do I get the current date in JavaScript?           4929629.0   

    answer_count  comment_count  favorite_cou

In [23]:
import numpy as np
import nltk
import re
import os
import codecs
from sklearn import feature_extraction

# https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
# nltk.download('stopwords')
# nltk.download('punkt')

stopwords = nltk.corpus.stopwords.words('english')
# print (stopwords[:10])

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

question_df['tokenized_sents'] = question_df.apply(lambda row: nltk.word_tokenize(row['title']), axis=1)
print (question_df.head)

<bound method NDFrame.head of                                                 title  accepted_answer_id  \
0   How do I undo the most recent local commits in...            927386.0   
1   How do I delete a Git branch locally and remot...           2003515.0   
2   How do I remove a particular element from an a...           5767357.0   
3   How do I find all files containing specific te...          16957078.0   
4               How do I redirect to another webpage?            506004.0   
..                                                ...                 ...   
95              How to resolve merge conflicts in Git            163659.0   
96  How to change an element's class with JavaScript?            196038.0   
97  What does "Could not find or load main class" ...          18093929.0   
98  How to align a <div> to the middle (horizontal...            953934.0   
99       How do I get the current date in JavaScript?           4929629.0   

    answer_count  comment_count  favorite_cou

In [42]:
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems



from sklearn.feature_extraction.text import TfidfVectorizer

#tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer = TfidfVectorizer(max_features=200000, stop_words='english', tokenizer=tokenize_and_stem)
#tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
#                                 min_df=0.2, stop_words='english',
#                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))
tfidf_matrix = tfidf_vectorizer.fit_transform(question_df['title'])

#print(tfidf_vectorizer.get_feature_names())
print(tfidf_matrix.shape)

terms = tfidf_vectorizer.get_feature_names()
print(terms)

(100, 230)
["'s", '__main__', '__name__', 'access-control-allow-origin', 'act', 'add', 'address', 'align', 'alreadi', 'anoth', 'append', 'array', 'arraylist', 'bash', 'befor', 'best', 'branch', 'button', 'c++', 'ca', 'case', 'center', 'chang', 'check', 'checkbox', 'class', 'clone', 'code', 'column', 'command', 'comment', 'commit', 'compar', 'concaten', 'conflict', 'contain', 'content', 'convert', 'correct', 'creat', 'css', 'curl', 'current', 'data', 'datatyp', 'date', 'datetim', 'decim', 'declar', 'default', 'delay', 'delet', 'determin', 'dictionari', 'differ', 'directori', 'display', 'div', 'doe', 'dropdown', 'duplic', 'easiest', 'element', 'els', 'email', 'empty/undefined/nul', 'encount', 'end', 'error', 'exist', 'express', 'extend', 'extern', 'fetch', 'file', 'fix', 'float', 'folder', 'for-each', 'forc', 'format', 'function', 'generat', 'git', 'given', 'global', 'hard', 'hashmap', 'head', 'header', 'hidden', 'horizont', 'horizontally/width', 'html', 'import', 'includ', 'index', 'ini

  'stop_words.' % sorted(inconsistent))


In [43]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [44]:
from sklearn.cluster import KMeans

num_clusters = 10

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 160 ms, sys: 4.22 ms, total: 164 ms
Wall time: 173 ms


In [45]:
import pickle
import joblib

s = pickle.dumps(km)
km2 = pickle.loads(s)

clusters = km2.labels_.tolist()

In [46]:
title_list = question_df['title'].values.tolist()
tag_list = question_df['tags'].values.tolist()
view_cnt_list = question_df['view_count'].values.tolist()
answer_cnt_list = question_df['answer_count'].values.tolist()
comment_cnt_list = question_df['comment_count'].values.tolist()
favorite_cnt_list = question_df['favorite_count'].values.tolist()
score_list = question_df['score'].values.tolist()

#select title, accepted_answer_id, answer_count, comment_count, favorite_count, score, tags, view_count
questions = {'cluster': clusters, 'title': title_list, 'tag': tag_list, 'view_count': view_cnt_list, 'answer_count': answer_cnt_list, 'comment_count': comment_cnt_list, 'favorite_count': favorite_cnt_list, 'score': score_list}

frame = pd.DataFrame(questions, index = [clusters] , columns = ['cluster', 'title', 'tag', 'view_count', 'answer_count', 'comment_count', 'favorite_count', 'score'])
print (frame.head)

# number of quesions per cluster
frame['cluster'].value_counts()

<bound method NDFrame.head of     cluster                                              title  \
1         1  How do I undo the most recent local commits in...   
1         1  How do I delete a Git branch locally and remot...   
2         2  How do I remove a particular element from an a...   
9         9  How do I find all files containing specific te...   
0         0              How do I redirect to another webpage?   
..      ...                                                ...   
1         1              How to resolve merge conflicts in Git   
5         5  How to change an element's class with JavaScript?   
8         8  What does "Could not find or load main class" ...   
0         0  How to align a <div> to the middle (horizontal...   
3         3       How do I get the current date in JavaScript?   

                                                  tag  view_count  \
1                    gitversion-controlgit-commitundo     8083391   
1                             gitgit-br

0    23
1    13
9    11
2    11
8    10
4     9
6     7
5     6
3     6
7     4
Name: cluster, dtype: int64

In [47]:
grouped_view_cnt = frame['view_count'].groupby(frame['cluster'])
grouped_view_cnt.sum()

cluster
0    68246487
1    48599829
2    37764371
3    16471708
4    27013184
5    19846812
6    24831683
7    15568958
8    34386112
9    37719828
Name: view_count, dtype: int64

In [48]:
grouped_answer_cnt = frame['answer_count'].groupby(frame['cluster'])
grouped_answer_cnt.sum()

cluster
0    1214
1     401
2     456
3     180
4     284
5     229
6     218
7     118
8     255
9     324
Name: answer_count, dtype: int64

In [54]:
from __future__ import print_function

print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km2.cluster_centers_.argsort()[:, ::-1] 
print(order_centroids)

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]:
        #print("ind=%d" % ind)
        #print("terms[ind]= %s" % terms[ind])
        print(' %s' % terms[ind], end='')
        
    print()
    print()
    
    print("Cluster %d titles:" % i, end='')
    for title in frame.loc[i]['title'].values.tolist():
        print(' %s,' % title, end='')
    print()
    print()


Top terms per cluster:

[[149 167  99 ... 132 131   0]
 [ 83 120  16 ... 138 137   0]
 [106  11 121 ... 142 141   0]
 ...
 [105  39 198 ... 146 145   0]
 [ 35  58 200 ... 143 142 114]
 [ 74  55 117 ... 128 127 229]]
Cluster 0 words: php redirect instal center window page

Cluster 0 titles: How do I redirect to another webpage?, How to horizontally center a <div>?, Iterate through a HashMap, Redirect from an HTML page, Add new keys to a dictionary?, How to replace all occurrences of a string?, How do I vertically center text with CSS?, Using global variables in a function, What is the best comment in source code you have ever encountered?, How do I install pip on Windows?, Initialization of an ArrayList in one line, How do I get PHP errors to display?, Determine installed PowerShell version, How can you find out which process is listening on a port on Windows?, Limiting floats to two decimal points, Get the full URL in PHP, How do I make a redirect in PHP?, RegEx match open tags except 