# Data import and preprocessing

In [None]:
from pymongo import MongoClient
import os
import sys
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from nltk.tokenize import RegexpTokenizer

# Directory where the gensim artifacts (dictionary, corpus) are stored.
target_dir = "files/gensim_prog_web"

tokenizer = RegexpTokenizer(r'\w+')
# Build the lookup sets once; rebuilding them for every token (as a
# per-word `set(stopwords.words('english'))` would) reloads the stop-word
# list over and over for no benefit.
punctuation = set(string.punctuation)
stop_words = set(stopwords.words('english'))

# Boilerplate navigation text stripped from every article body.
common_str = "All News & Analysis All How-To All Recent News "

# Fetch only the "body" field of every article in the PROG_WEB database.
client = MongoClient()
db = client['PROG_WEB']
cursor = db.articles.find({}, {"body":1})

# One list of cleaned, lower-cased tokens per article.
documents = []
for document in cursor:
    body = document.get("body")
    if body is None:
        # Articles stored without a body cannot be tokenized; skip them
        # instead of failing on `None.replace(...)`.
        continue
    tokens = (word.lower() for word in word_tokenize(body.replace(common_str, "")))
    # Drop stop words and single-character punctuation tokens in one pass.
    documents.append(
        [word for word in tokens if word not in stop_words and word not in punctuation]
    )

# Vector Space Model

In [None]:
from gensim import corpora
target_dir = "files/gensim_prog_web"
# Saving below writes into target_dir, so make sure it exists first
# (explicit isdir check keeps this working on older Python versions).
if not os.path.isdir(target_dir):
    os.makedirs(target_dir)

# Map every distinct token to an integer id and persist the mapping.
dictionary = corpora.Dictionary(documents)
dictionary.save(os.path.join(target_dir,"prog_web.dict"))
# Convert each tokenized document to its bag-of-words representation and
# store the resulting corpus on disk in Matrix Market format.
corpus = [dictionary.doc2bow(doc) for doc in documents]
corpora.MmCorpus.serialize(os.path.join(target_dir,"prog_web.mm"), corpus)

# Train Model

In [None]:
import gensim

# Locations of the artifacts written by the vector-space-model step.
dict_path = os.path.join(target_dir, "prog_web.dict")
corpus_path = os.path.join(target_dir, "prog_web.mm")

# Reload the token/id mapping and the bag-of-words corpus from disk.
id2word = gensim.corpora.Dictionary.load(dict_path)
mm = gensim.corpora.MmCorpus(corpus_path)

# Fit a 10-topic LDA model, making 20 passes over the corpus with
# update_every=0.
lda = gensim.models.ldamodel.LdaModel(
    corpus=mm,
    id2word=id2word,
    num_topics=10,
    update_every=0,
    passes=20
)

# Visualize and Interpret

Running the code below should produce the pyLDAvis visualization. However, a compatibility issue between pyLDAvis and pandas 0.19.x currently prevents it from running; see https://github.com/bmabey/pyLDAvis/issues/76 for details.

In [None]:
import pyLDAvis
# The gensim adapter is a submodule; import it explicitly so that
# `pyLDAvis.gensim.prepare` resolves.
# NOTE(review): newer pyLDAvis releases reportedly expose this module as
# `pyLDAvis.gensim_models` - confirm against the installed version.
import pyLDAvis.gensim

# Build the visualization data from the trained model, the corpus and the
# dictionary, then render it inline.
data = pyLDAvis.gensim.prepare(lda, mm, id2word)
pyLDAvis.display(data)
