In [1]:
# Required libraries
import datetime as dt
import matplotlib.pyplot as plt

import os
import math
import time
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(font_scale=1.5)

import nltk
from bs4 import BeautifulSoup

In [2]:
# Constants
my_data = "../data"

In [3]:
# Load data into Pandas dataframe
datafile = "QueryResults.csv"
full_path = os.path.join(my_data, datafile)
df_raw = pd.read_csv(full_path)
print(df_raw.shape)

(18365, 8)


In [4]:
df_raw.columns

Index(['Id', 'Score', 'ViewCount', 'CreationDate', 'LastActivityDate', 'title',
       'tags', 'body'],
      dtype='object')

In [5]:
soup = BeautifulSoup(df_raw.loc[104]['body'], "lxml").get_text()

In [6]:
tokens = nltk.wordpunct_tokenize(soup)

In [7]:
tokens

['How',
 'can',
 'I',
 'get',
 'the',
 'last',
 'day',
 'of',
 'the',
 'month',
 'in',
 'PHP',
 '?',
 'Given',
 ':',
 '$',
 'a_date',
 '=',
 '"',
 '2009',
 '-',
 '11',
 '-',
 '23',
 '"',
 'I',
 'want',
 '2009',
 '-',
 '11',
 '-',
 '30',
 ';',
 'and',
 'given',
 '$',
 'a_date',
 '=',
 '"',
 '2009',
 '-',
 '12',
 '-',
 '23',
 '"',
 'I',
 'want',
 '2009',
 '-',
 '12',
 '-',
 '31',
 '.']

In [8]:
tokenizer = nltk.RegexpTokenizer('\w+')
tokenizer.tokenize(soup)


['How',
 'can',
 'I',
 'get',
 'the',
 'last',
 'day',
 'of',
 'the',
 'month',
 'in',
 'PHP',
 'Given',
 'a_date',
 '2009',
 '11',
 '23',
 'I',
 'want',
 '2009',
 '11',
 '30',
 'and',
 'given',
 'a_date',
 '2009',
 '12',
 '23',
 'I',
 'want',
 '2009',
 '12',
 '31']

In [9]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     D:\Users\muths\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
 
stopWords = set(stopwords.words('english'))
words = tokenizer.tokenize(soup.lower())
wordsFiltered = []
 
for w in words:
    if w not in stopWords:
        wordsFiltered.append(w)
 
print(wordsFiltered)

['get', 'last', 'day', 'month', 'php', 'given', 'a_date', '2009', '11', '23', 'want', '2009', '11', '30', 'given', 'a_date', '2009', '12', '23', 'want', '2009', '12', '31']


In [11]:
words

['how',
 'can',
 'i',
 'get',
 'the',
 'last',
 'day',
 'of',
 'the',
 'month',
 'in',
 'php',
 'given',
 'a_date',
 '2009',
 '11',
 '23',
 'i',
 'want',
 '2009',
 '11',
 '30',
 'and',
 'given',
 'a_date',
 '2009',
 '12',
 '23',
 'i',
 'want',
 '2009',
 '12',
 '31']

In [12]:
soup = BeautifulSoup(df_raw.loc[104]['body'], "lxml").get_text()

In [13]:
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer("english")

wordsFiltered = []
wordsArray = []

for html_text in df_raw['body']:
    soup = BeautifulSoup(html_text, "lxml").get_text()
    words = tokenizer.tokenize(soup.lower())
    his_words = ''
    for w in words:
        if w not in stopWords:
            stem = snowball_stemmer.stem(w)
            wordsFiltered.append(stem)
            his_words = his_words + ' ' + stem
    wordsArray.append(his_words)


In [14]:
df_raw['words'] = wordsArray

In [15]:
df_raw.loc[5000]

Id                                                             151030
Score                                                             396
ViewCount                                                      153862
CreationDate                                      2008-09-29 22:36:21
LastActivityDate                                  2018-01-08 07:15:27
title               How do I call controller/view methods from the...
tags                                         <ruby-on-rails><console>
body                <p>When I load <code>script/console</code>, so...
words                load script consol time want play output cont...
Name: 5000, dtype: object

In [16]:
len(wordsFiltered)

1089366

In [17]:
words_freq = nltk.FreqDist(wordsFiltered)

In [18]:
len(words_freq)

46915

In [19]:
words_freq.most_common(20)

[('use', 14525),
 ('1', 9029),
 ('0', 8209),
 ('file', 7856),
 ('like', 6930),
 ('get', 6759),
 ('android', 6612),
 ('want', 5625),
 ('2', 5586),
 ('code', 5540),
 ('way', 5537),
 ('string', 5429),
 ('tri', 5299),
 ('error', 5256),
 ('class', 5057),
 ('work', 5054),
 ('function', 4787),
 ('java', 4743),
 ('would', 4641),
 ('new', 4581)]

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(lowercase = True, max_df=0.95, min_df=2)
stem_matrix = vectorizer.fit_transform(df_raw['words'])

In [25]:
print(vectorizer.vocabulary_)

{'var': 13849, 'obj': 9349, 'name': 8920, 'simon': 11868, 'age': 1918, '20': 572, 'cloth': 3400, 'style': 12536, 'simpl': 11869, 'fals': 5338, 'alert': 1952, 'variabl': 13860, 'repres': 10949, 'properti': 10410, 'object': 9355, 'built': 2967, 'method': 8398, 'come': 3516, 'everi': 5169, 'illustr': 6838, 'assum': 2363, 'two': 13351, 'tabl': 12751, 'follow': 5616, 'chuck': 3296, 'larri': 7683, 'locationid': 7996, 'citi': 3312, 'new': 9019, 'york': 14467, 'seattl': 11447, 'vancouv': 13846, 'houston': 6620, 'want': 14056, 'write': 14288, 'queri': 10560, 'return': 11048, 'result': 11025, 'locat': 7993, 'know': 7628, 'done': 4658, 'use': 13740, 'server': 11546, 'side': 11839, 'cursor': 4011, 'ie': 6791, 'declar': 4245, 'int': 7091, 'varchar': 13858, '100': 119, '4000': 945, 'select': 11479, 'vehicl': 13889, 'open': 9577, 'fetch': 5405, 'next': 9070, 'begin': 2647, 'set': 11578, 'end': 4985, 'close': 3394, 'dealloc': 4221, 'insert': 7054, 'howev': 6622, 'see': 11469, 'requir': 10970, 'great':

In [31]:
tf_feature_names = vectorizer.get_feature_names()

In [22]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

kmeans = KMeans(n_clusters=10)
kmeans.fit(stem_matrix)
sil_value = round(silhouette_score(stem_matrix, kmeans.labels_),3)
print(sil_value)



0.805


In [26]:
from sklearn.decomposition import LatentDirichletAllocation
no_topics = 20

# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0)

lda.fit(stem_matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_jobs=1, n_topics=20, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [29]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()


In [32]:
n_top_words = 20
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0: instal lib js usr local librari bin packag run password command rubi connect gem error version rb directori root configur
Topic #1: function var javascript event click scope ng disabl alert link compon angular obj react modal fals object angularj age render
Topic #2: item activ node fragment templat view bundl oncreat edittext comment overrid parent savedinstancest keyboard categori show actionbar plot inflat layout
Topic #3: date 10 11 12 00 01 time format txt 02 05 03 datetim 19 15 13 30 16 18 20
Topic #4: string key php charact return valu function std hello number like int str way exampl integ length replac random char
Topic #5: encod const utf constant meta col unicod pod dispatch charset utf8 ch en ascii cocoapod modern tensorflow iso contenttyp sentenc
Topic #6: http user request com url web net server servic json data applic page respons system api client post control locat
Topic #7: log npm consol server err var express mysql port socket factori node_modul res http l

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
n_features = 1000
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(df_raw['words'])

In [37]:
from sklearn.decomposition import NMF
n_components = 10
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)

In [38]:
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Topic #0: use function like code way error work tri want know run user app need applic time someth instal python set
Topic #1: file directori line command folder txt path open project read want filenam script delet creat py run write tri copi
Topic #2: string convert charact str way valu return like replac format want hello number char exampl int integ text length line
Topic #3: android id app layout_width layout_height wrap_cont com activ view textview xml studio layout java color button applic apk fill_par sdk
Topic #4: git branch commit repositori master chang remot push merg github local pull origin repo head checkout clone work project command
Topic #5: div text id height width element css html class imag button style content page input color jqueri background click center
Topic #6: tabl column valu select sql databas key queri row mysql id data null insert server creat set field want option
Topic #7: class public java method object int void static new return foo privat type list 