In [1]:
from __future__ import unicode_literals
import pickle
import gensim
import pyLDAvis.gensim
import numexpr
import pyLDAvis
import pandas as pd
import spacy
pyLDAvis.enable_notebook()

In [2]:
text_blog = pd.read_csv('ivz_blog_corp.csv', error_bad_lines=False)

In [5]:
text_blog.to_pickle('./text.pkl')
text_blog = pd.read_pickle('./text.pkl')

In [7]:
# Text Parsing:stem and tokenize
texts_blog = gensim.parsing.preprocess_documents(text_blog['blogpost'].dropna())

In [8]:
# create bigram and trigram models
bigram_model_blog = gensim.models.Phrases(texts_blog, min_count=5, threshold=100)

trigram_model_blog = gensim.models.Phrases(bigram_model_blog[texts_blog], threshold = 100)

bigram_mod = gensim.models.phrases.Phraser(bigram_model_blog)
trigram_mod = gensim.models.phrases.Phraser(trigram_model_blog)



In [11]:
#functions: remove stopwords, create bigram, trigram, and lemmatize
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [15]:
# Remove Stop Words
import nltk
from gensim.utils import simple_preprocess
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/clairewillett/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
stop_words = stopwords.words('english')

texts_nostops = remove_stopwords(texts_blog)

In [35]:
#make bigrams 
bigrams_set = make_bigrams(texts_nostops)

In [36]:
#lemmatize, keeping only nouns, adjectives, verbs, and adverbs
nlp = spacy.load('en', disable=['parser', 'ner'])
lemma_set = lemmatization(bigrams_set, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])


In [37]:
#make trigrams
trigrams_set = make_trigrams(lemma_set)

In [38]:
#create the dictionary and tfidf corpus (start w/ lemma bigrams)
dictionary_all = gensim.corpora.Dictionary(lemma_set)
corpus_bigrams = [dictionary_all.doc2bow(b) for b in lemma_set]

In [39]:
#bigrams model with 10 topics
lda_bi_10 = gensim.models.ldamodel.LdaModel(corpus = corpus_bigrams, id2word = dictionary_all, num_topics=10, passes=20)
lda_bi_10.save('invesco_bi10.model')

In [40]:
# Visualize the 10 topic bigram model
prepared_data_bi_10 = pyLDAvis.gensim.prepare(lda_bi_10, corpus_bigrams, dictionary_all)
prepared_data_bi_10

In [41]:
#create the trigram dictionary and tfidf corpus 
dictionary_tri = gensim.corpora.Dictionary(trigrams_set)
corpus_tri = [dictionary_tri.doc2bow(b) for b in trigrams_set]

In [44]:
lda_tri_9 = gensim.models.ldamodel.LdaModel(corpus = corpus_tri, id2word = dictionary_tri, num_topics=9, passes=20)
lda_tri_9.save('invesco_tri_9.model')

In [45]:
# Visualize the 9 topic trigram model
prepared_data_tri_9 = pyLDAvis.gensim.prepare(lda_tri_9, corpus_tri, dictionary_tri)
prepared_data_tri_9

In [46]:
print('Trigram Perplexity: ', lda_tri_9.log_perplexity(corpus_tri))  # a measure of how good the model is. lower the better.

(u'Trigram Perplexity: ', -6.816720598795075)


In [47]:
print('Bigram 10 Perplexity: ', lda_bi_10.log_perplexity(corpus_bigrams))  # a measure of how good the model is. lower the better.

(u'Bigram 10 Perplexity: ', -6.880233248373422)
