# Necessary Tools and Services

In [2]:
! pip install pandas
! pip install matplotlib
! pip install numpy
! pip install seaborn
! pip install unzip
! pip install gensim
! pip install nltk
! pip install wordcloud
! pip install spacy
! pip install spacy_download
! pip install pyLDAvis
! pip install PyStemmer

! python3 -m spacy download en

Collecting matplotlib
  Using cached matplotlib-3.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.6 MB)
Collecting fonttools>=4.22.0
  Using cached fonttools-4.41.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.5 MB)
Collecting contourpy>=1.0.1
  Using cached contourpy-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (300 kB)
Collecting pyparsing<3.1,>=2.3.1
  Using cached pyparsing-3.0.9-py3-none-any.whl (98 kB)
Collecting cycler>=0.10
  Using cached cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting kiwisolver>=1.0.1
  Using cached kiwisolver-1.4.4-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.6 MB)
Installing collected packages: pyparsing, kiwisolver, fonttools, cycler, contourpy, matplotlib
Successfully installed contourpy-1.1.0 cycler-0.11.0 fonttools-4.41.1 kiwisolver-1.4.4 matplotlib-3.7.2 pyparsing-3.0.9
Collecting seaborn
  Downloading seaborn-0.12.2-py3-none-any.whl (293 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

In [3]:
## Importing PD and Others
import re
import numpy as np
import pandas as pd
from pprint import pprint

## Gensim
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

## NLTK
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk.stem
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['#', '`', '"', '@'])

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)

import matplotlib.pyplot as plt
%matplotlib inline


import spacy
spacy.cli.download('en_core_web_sm')



## Visualization
import pyLDAvis
import pyLDAvis.gensim

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/dev/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/dev/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/dev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting en-core-web-sm==3.6.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# Import data and Preprocess

In [4]:
new_df = pd.read_csv('Dataset/StackOverflowPostsDataset.csv')
new_df["merged"] = new_df[["Body", "Title", "Tags"]].apply("-".join, axis=1)
#new_df.head()
new_df.to_csv('Dataset/ConcatenatedDatasetSO.csv')

data = new_df.merged.values.tolist()
data

["<p>When dealing with small projects, what do you feel is the break even point for storing data in simple text files, hash tables, etc., versus using a real database?  For small projects with simple data management requirements, a real database is unnecessary complexity and violates YAGNI.  However, at some point the complexity of a database is obviously worth it.  What are some signs that your problem is too complex for simple ad-hoc techniques and needs a real database?</p>\n\n<p>Note:  To people used to enterprise environments, this will probably sound like a weird question.  However, my problem domain is bioinformatics.  Most of my programming is prototypes, not production code.  I'm primarily a domain expert and secondarily a programmer.  Most of my code is algorithm-centric, not data management-centric.  The purpose of this question is largely for me to figure out how much work I might save in the long run if I learn to use proper databases in my code instead of the more ad-hoc 

In [4]:
# Remove Emails
data = [re.sub('<[^<>]*>', '', sent) for sent in data]
# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]
# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

print(data[:1])

['When dealing with small projects, what do you feel is the break even point '
 'for storing data in simple text files, hash tables, etc., versus using a '
 'real database? For small projects with simple data management requirements, '
 'a real database is unnecessary complexity and violates YAGNI. However, at '
 'some point the complexity of a database is obviously worth it. What are some '
 'signs that your problem is too complex for simple ad-hoc techniques and '
 'needs a real database? Note: To people used to enterprise environments, this '
 'will probably sound like a weird question. However, my problem domain is '
 'bioinformatics. Most of my programming is prototypes, not production code. '
 'Im primarily a domain expert and secondarily a programmer. Most of my code '
 'is algorithm-centric, not data management-centric. The purpose of this '
 'question is largely for me to figure out how much work I might save in the '
 'long run if I learn to use proper databases in my code in

# Tokenize words and Clean-up text

In [5]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['when', 'dealing', 'with', 'small', 'projects', 'what', 'do', 'you', 'feel', 'is', 'the', 'break', 'even', 'point', 'for', 'storing', 'data', 'in', 'simple', 'text', 'files', 'hash', 'tables', 'etc', 'versus', 'using', 'real', 'database', 'for', 'small', 'projects', 'with', 'simple', 'data', 'management', 'requirements', 'real', 'database', 'is', 'unnecessary', 'complexity', 'and', 'violates', 'yagni', 'however', 'at', 'some', 'point', 'the', 'complexity', 'of', 'database', 'is', 'obviously', 'worth', 'it', 'what', 'are', 'some', 'signs', 'that', 'your', 'problem', 'is', 'too', 'complex', 'for', 'simple', 'ad', 'hoc', 'techniques', 'and', 'needs', 'real', 'database', 'note', 'to', 'people', 'used', 'to', 'enterprise', 'environments', 'this', 'will', 'probably', 'sound', 'like', 'weird', 'question', 'however', 'my', 'problem', 'domain', 'is', 'bioinformatics', 'most', 'of', 'my', 'programming', 'is', 'prototypes', 'not', 'production', 'code', 'im', 'primarily', 'domain', 'expert', 'an

In [6]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=50) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=50)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
#print(trigram_mod[bigram_mod[data_words[0]]])

# Remove Stopwords, Make Bigrams and Lemmatize

In [7]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [8]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['deal', 'small', 'project', 'feel', 'break', 'even', 'point', 'store', 'datum', 'simple', 'text', 'file', 'hash', 'table', 'use', 'real', 'database', 'small', 'project', 'simple', 'data', 'management', 'requirement', 'real', 'database', 'unnecessary', 'complexity', 'violate', 'however', 'point', 'complexity', 'database', 'obviously', 'worth', 'sign', 'problem', 'complex', 'simple', 'ad_hoc', 'technique', 'need', 'real', 'database', 'note', 'people', 'use', 'enterprise', 'environment', 'probably', 'sound', 'weird', 'question', 'however', 'problem', 'domain', 'bioinformatic', 'programming', 'prototype', 'production', 'code', 'm', 'primarily', 'domain', 'expert', 'secondarily', 'management', 'centric', 'purpose', 'question', 'largely', 'figure', 'much', 'work', 'save', 'long', 'run', 'learn', 'use', 'proper', 'database', 'code', 'instead', 'technique', 'typically', 'use', 'database', 'plain_text']]


# Create the Dictionary and Corpus needed for Topic Modeling

In [9]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 2), (7, 1), (8, 6), (9, 1), (10, 1), (11, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 2), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 2), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 2), (34, 1), (35, 1), (36, 2), (37, 1), (38, 1), (39, 2), (40, 1), (41, 1), (42, 1), (43, 2), (44, 3), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 3), (51, 2), (52, 1), (53, 1), (54, 1), (55, 2), (56, 1), (57, 1), (58, 1), (59, 4), (60, 1), (61, 1), (62, 1), (63, 1)]]


In [33]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('ad_hoc', 1),
  ('bioinformatic', 1),
  ('break', 1),
  ('centric', 1),
  ('code', 2),
  ('complex', 1),
  ('complexity', 2),
  ('data', 1),
  ('database', 6),
  ('datum', 1),
  ('deal', 1),
  ('domain', 2),
  ('enterprise', 1),
  ('environment', 1),
  ('even', 1),
  ('expert', 1),
  ('feel', 1),
  ('figure', 1),
  ('file', 1),
  ('hash', 1),
  ('however', 2),
  ('instead', 1),
  ('largely', 1),
  ('learn', 1),
  ('long', 1),
  ('m', 1),
  ('management', 2),
  ('much', 1),
  ('need', 1),
  ('note', 1),
  ('obviously', 1),
  ('people', 1),
  ('plain_text', 1),
  ('point', 2),
  ('primarily', 1),
  ('probably', 1),
  ('problem', 2),
  ('production', 1),
  ('programming', 1),
  ('project', 2),
  ('proper', 1),
  ('prototype', 1),
  ('purpose', 1),
  ('question', 2),
  ('real', 3),
  ('requirement', 1),
  ('run', 1),
  ('save', 1),
  ('secondarily', 1),
  ('sign', 1),
  ('simple', 3),
  ('small', 2),
  ('sound', 1),
  ('store', 1),
  ('table', 1),
  ('technique', 2),
  ('text', 1),
  ('

In [10]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [35]:
# Print the Keyword in the 20 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.316*"process" + 0.109*"module" + 0.105*"line" + 0.052*"python" + '
  '0.028*"usr_local" + 0.027*"flag" + 0.026*"node" + 0.026*"scale" + '
  '0.022*"location" + 0.021*"traceback"'),
 (1,
  '0.409*"file" + 0.125*"script" + 0.081*"command" + 0.071*"import" + '
  '0.025*"load" + 0.024*"open" + 0.024*"project" + 0.021*"report" + '
  '0.020*"merge" + 0.016*"copy"'),
 (2,
  '0.144*"table" + 0.104*"teradata" + 0.089*"column" + 0.081*"select" + '
  '0.063*"query" + 0.061*"row" + 0.056*"date" + 0.034*"sql" + 0.025*"insert" + '
  '0.020*"varchar"'),
 (3,
  '0.216*"com" + 0.209*"java" + 0.063*"action" + 0.047*"force" + 0.032*"jdbc" '
  '+ 0.025*"weight" + 0.021*"stack" + 0.021*"play" + 0.020*"mark" + '
  '0.011*"country"'),
 (4,
  '0.147*"error" + 0.130*"run" + 0.083*"job" + 0.041*"user" + 0.039*"log" + '
  '0.038*"test" + 0.038*"base" + 0.037*"execute" + 0.035*"fail" + '
  '0.020*"message"'),
 (5,
  '0.475*"thread" + 0.100*"memory" + 0.076*"convert" + 0.036*"calculate" + '
  '0.010*"uni

In [11]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.


Perplexity:  -14.197674254680775


In [12]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.3808045741859483


# Visualize the topics-keywords

In [38]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [13]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=15, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [14]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.


Perplexity:  -11.065073129111033


In [15]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.42665830974002067


In [16]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [17]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=12, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [18]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.92758251942754

Coherence Score:  0.4304558381590035


In [19]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [20]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis


Perplexity:  -8.230815871407843

Coherence Score:  0.4283454164390454
