In [1]:
import pandas as pd

In [6]:
# prepare cleanup function
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

In [27]:
# read data
df = pd.read_pickle("nationalOathKeepers")

In [31]:
# restrict data set to first nrows rows
nrows = 100
df = df[:nrows]

In [33]:
# clean up
#docs = df.post_content
df['post_clean'] = [clean(doc).split() for doc in df.post_content]

In [35]:
# Importing Gensim
import gensim
from gensim import corpora

# Creating the term dictionary of our courpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(df['post_clean'])

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df['post_clean']]

In [36]:
# Do Topic Modeling
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=5, id2word = dictionary, passes=50)

In [37]:
# print output
ldamodel.print_topics(num_topics=5, num_words=4)

[(0, '0.018*"reply" + 0.011*"oath" + 0.010*"new" + 0.009*"re"'),
 (1, '0.018*"people" + 0.014*"american" + 0.011*"state" + 0.011*"right"'),
 (2, '0.078*"shall" + 0.043*"state" + 0.020*"united" + 0.013*"law"'),
 (3, '0.016*"reply" + 0.009*"re" + 0.007*"oath" + 0.006*"keeper"'),
 (4, '0.036*"militia" + 0.015*"citizen" + 0.013*"state" + 0.010*"duty"')]

In [38]:
ldamodel.print_topics(num_topics=5, num_words=10)

[(0,
  '0.018*"reply" + 0.011*"oath" + 0.010*"new" + 0.009*"re" + 0.008*"keeper" + 0.006*"hello" + 0.005*"state" + 0.005*"news" + 0.004*"constitution" + 0.004*"ok"'),
 (1,
  '0.018*"people" + 0.014*"american" + 0.011*"state" + 0.011*"right" + 0.011*"order" + 0.008*"obey" + 0.008*"constitution" + 0.008*"war" + 0.007*"act" + 0.007*"government"'),
 (2,
  '0.078*"shall" + 0.043*"state" + 0.020*"united" + 0.013*"law" + 0.012*"president" + 0.012*"may" + 0.012*"congress" + 0.011*"house" + 0.010*"representative" + 0.009*"person"'),
 (3,
  '0.016*"reply" + 0.009*"re" + 0.007*"oath" + 0.006*"keeper" + 0.006*"hi" + 0.006*"eddie" + 0.005*"year" + 0.005*"u" + 0.005*"member" + 0.005*"it"'),
 (4,
  '0.036*"militia" + 0.015*"citizen" + 0.013*"state" + 0.010*"duty" + 0.010*"active" + 0.010*"military" + 0.010*"oath" + 0.009*"keeper" + 0.008*"forum" + 0.008*"mission"')]

In [46]:
# Print top words in raw input data
import collections
ncount = 100
collections.Counter(" ".join(   df["post_content"]   ).split()).most_common(ncount)

[('the', 1024),
 ('of', 638),
 ('and', 562),
 ('to', 542),
 ('a', 277),
 ('in', 272),
 ('be', 235),
 ('I', 226),
 ('shall', 206),
 ('that', 172),
 ('for', 169),
 ('or', 162),
 ('have', 131),
 ('on', 128),
 ('is', 126),
 ('by', 115),
 ('as', 111),
 ('our', 92),
 ('it', 92),
 ('Reply', 91),
 ('not', 91),
 ('any', 90),
 ('we', 83),
 ('will', 82),
 ('The', 78),
 ('with', 77),
 ('an', 77),
 ('you', 77),
 ('their', 76),
 ('from', 74),
 ('We', 73),
 ('Oath', 72),
 ('my', 72),
 ('at', 71),
 ('such', 68),
 ('all', 66),
 ('but', 65),
 ('United', 59),
 ('this', 59),
 ('was', 57),
 ('are', 55),
 ('which', 54),
 ('who', 50),
 ('people', 50),
 ('Re:', 48),
 ('they', 46),
 ('State', 45),
 ('no', 44),
 ('may', 43),
 ('States,', 40),
 ('other', 38),
 ('one', 37),
 ('can', 37),
 ('been', 36),
 ('if', 36),
 ('would', 36),
 ('-', 35),
 ('Keepers', 34),
 ('two', 33),
 ('them', 33),
 ('American', 32),
 ('do', 31),
 ('time', 31),
 ('state', 31),
 ('about', 31),
 ('like', 30),
 ('Article', 28),
 ('each', 28),