In [28]:
#using spaCy's blank English pipeline to tokenize text during cleaning
import spacy
from spacy.lang.en import English
#English() builds the pipeline directly from the language class, so no trained
#model has to be loaded; the earlier spacy.load('en') call discarded its result
#and the 'en' shortcut name is no longer accepted by spaCy v3+
parser = English()

#creating tokens
def tokenize(text):
    """Turn ``text`` into a list of normalized token strings.

    The module-level ``parser`` splits ``text`` into tokens, which are then
    mapped in document order:

    * whitespace-only tokens are skipped,
    * URL-like tokens become the placeholder ``'URL'``,
    * tokens starting with ``'@'`` become the placeholder ``'SCREEN_NAME'``,
    * every other token is lowercased.
    """
    def normalize(tok):
        #links and @-prefixed tokens are replaced by fixed placeholders
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        #lowercasing tokens for consistency
        return tok.lower_

    return [normalize(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [29]:
import nltk
from nltk.corpus import wordnet
def get_lem(word):
    """Return ``wordnet.morphy(word)``, falling back to ``word`` unchanged
    when morphy returns ``None``."""
    base_form = wordnet.morphy(word)
    return word if base_form is None else base_form

In [30]:
#fetch NLTK's stop-word package, then keep the English entries as a set
#so membership checks during cleaning are fast
nltk.download('stopwords')
en_stop = {stop_word for stop_word in nltk.corpus.stopwords.words('english')}

#function to tokenize + clean
def prepare_for_lda(text):
    """Tokenize and clean ``text``, returning the list of tokens fed to LDA.

    Tokens come from ``tokenize``; only those longer than four characters
    that are not in ``en_stop`` are kept, and each survivor is passed
    through ``get_lem``.

    Note: the length filter also drops the three-character ``'URL'``
    placeholder that ``tokenize`` emits.
    """
    return [
        get_lem(tok)
        for tok in tokenize(text)
        #length check first, then stop-word check, matching the cleaning order
        if len(tok) > 4 and tok not in en_stop
    ]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Eshita/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import random
#fixed seed so the same sample of documents (and therefore the same corpus
#and model inputs) is drawn every time the notebook is run
SAMPLE_SEED = 42
#fraction of lines kept for the corpus; set to 1.0 to keep every document
#(more accurate but slower downstream)
SAMPLE_RATE = 0.01
#private generator: seeding it does not disturb the global random state
sampler = random.Random(SAMPLE_SEED)
text_data = []
#load in csv--this one using the articles2100 data; each line of the file is
#treated as one document
with open('articles2100.csv') as f:
    for line in f:
        #tokens is computed for every line (not only sampled ones) because a
        #later cell prints the tokens of the last line read
        tokens = prepare_for_lda(line)
        if sampler.random() < SAMPLE_RATE:
            print(tokens)
            text_data.append(tokens)

In [76]:
#peek at the first 11 tokens of the most recently processed document
print(tokens[:11])

['emanuel', 'kathleen', 'sebelius', 'secretary', 'health', 'human', 'services', 'devise', 'debate', 'implement', 'could']


In [17]:
from gensim import corpora
import pickle

#build the gensim dictionary from the sampled documents, then convert every
#document to its bag-of-words representation
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
#persist both so later cells can reload them; the with-block guarantees the
#pickle file is flushed and closed
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [18]:
import gensim
#how many topics desired to be created from corpus
num_tops = 5
#fixed seed so training produces the same topics on every run
lda_seed = 42
#create LDA model and save to call later
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=num_tops,
    id2word=dictionary,
    passes=7,
    random_state=lda_seed,
)
ldamodel.save('model.gensim')
#two highest-weighted words for each topic
topics = ldamodel.print_topics(num_words=2)
for topic in topics:
    print(topic)

(0, '0.015*"trump" + 0.012*"administration"')
(1, '0.017*"health" + 0.011*"state"')
(2, '0.017*"health" + 0.014*"would"')
(3, '0.020*"health" + 0.012*"insurance"')
(4, '0.014*"insurance" + 0.012*"exchange"')


In [26]:
#reload the dictionary, corpus and trained model saved by the earlier cells
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
#corpus.pkl is written by this notebook; do not unpickle files from untrusted
#sources. The with-block closes the file handle after loading.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model.gensim')

#visualization of topics
import pyLDAvis.gensim
lda_disp = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=True)
pyLDAvis.display(lda_disp)
#saliency describes how much a word contributes to the topics
#the distance map shows how closely the topics are related to each other