# Topic Modeling / LDA

based on tutorial found at

https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/

Note: The Oathkeeper data set is ~250,000 lines. Reading it takes a couple minutes, so be patient. Processing the whole dataset takes a lot longer, so this example uses only a subset of the datafile to test the analysis flow.


In [1]:
import pandas as pd

In [2]:
# prepare cleanup function
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

In [3]:
# read data
df = pd.read_pickle("nationalOathKeepers")

In [4]:
# remove trailing whitespace in post_forum
df['post_forum'] = df['post_forum'].str.strip()

In [6]:
df.shape

(257341, 7)

In [33]:
# Group dataframe by 'thread_name' to merge discussion in single thread into one 'text sample'
df2 = df.groupby('thread_name').sum()

In [42]:
# print size of grouped dataframe
print("Size of grouped-by dataframe: ", df2.size)

Size of grouped-by dataframe:  5000


In [40]:
# pick only first 1000 rows from this dataframe for testing
df2 = df2[:1000]

In [43]:
# clean up
#docs = df.post_content
df2['post_clean'] = [clean(doc).split() for doc in df2.post_content]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [46]:
# Importing Gensim
import gensim
from gensim import corpora

# Creating the term dictionary of our courpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(df2['post_clean'])

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df2['post_clean']]

In [47]:
# Do Topic Modeling
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=5, id2word = dictionary, passes=50)

KeyboardInterrupt: 

In [54]:
# print output
ldamodel.print_topics(num_topics=5, num_words=4)

[(0, '0.038*"shall" + 0.019*"state" + 0.012*"president" + 0.009*"article"'),
 (1, '0.013*"keeper" + 0.013*"oath" + 0.011*"reply" + 0.009*"new"'),
 (2, '0.040*"shall" + 0.028*"state" + 0.012*"united" + 0.009*"law"'),
 (3, '0.021*"militia" + 0.014*"people" + 0.014*"state" + 0.009*"oath"'),
 (4, '0.033*"reply" + 0.025*"re" + 0.013*"eddie" + 0.010*"hey"')]

In [55]:
ldamodel.print_topics(num_topics=5, num_words=10)

[(0,
  '0.038*"shall" + 0.019*"state" + 0.012*"president" + 0.009*"article" + 0.008*"time" + 0.008*"united" + 0.008*"amendment" + 0.007*"one" + 0.007*"person" + 0.006*"may"'),
 (1,
  '0.013*"keeper" + 0.013*"oath" + 0.011*"reply" + 0.009*"new" + 0.006*"re" + 0.006*"member" + 0.006*"state" + 0.006*"fellow" + 0.006*"outreach" + 0.005*"time"'),
 (2,
  '0.040*"shall" + 0.028*"state" + 0.012*"united" + 0.009*"law" + 0.008*"house" + 0.007*"may" + 0.007*"oath" + 0.006*"year" + 0.006*"congress" + 0.006*"section"'),
 (3,
  '0.021*"militia" + 0.014*"people" + 0.014*"state" + 0.009*"oath" + 0.009*"american" + 0.008*"citizen" + 0.008*"military" + 0.007*"right" + 0.007*"keeper" + 0.006*"order"'),
 (4,
  '0.033*"reply" + 0.025*"re" + 0.013*"eddie" + 0.010*"hey" + 0.009*"hi" + 0.007*"im" + 0.007*"see" + 0.007*"guest" + 0.007*"mike" + 0.006*"california"')]

In [46]:
# Print top words in raw input data
import collections
ncount = 100
collections.Counter(" ".join(   df["post_content"]   ).split()).most_common(ncount)

[('the', 1024),
 ('of', 638),
 ('and', 562),
 ('to', 542),
 ('a', 277),
 ('in', 272),
 ('be', 235),
 ('I', 226),
 ('shall', 206),
 ('that', 172),
 ('for', 169),
 ('or', 162),
 ('have', 131),
 ('on', 128),
 ('is', 126),
 ('by', 115),
 ('as', 111),
 ('our', 92),
 ('it', 92),
 ('Reply', 91),
 ('not', 91),
 ('any', 90),
 ('we', 83),
 ('will', 82),
 ('The', 78),
 ('with', 77),
 ('an', 77),
 ('you', 77),
 ('their', 76),
 ('from', 74),
 ('We', 73),
 ('Oath', 72),
 ('my', 72),
 ('at', 71),
 ('such', 68),
 ('all', 66),
 ('but', 65),
 ('United', 59),
 ('this', 59),
 ('was', 57),
 ('are', 55),
 ('which', 54),
 ('who', 50),
 ('people', 50),
 ('Re:', 48),
 ('they', 46),
 ('State', 45),
 ('no', 44),
 ('may', 43),
 ('States,', 40),
 ('other', 38),
 ('one', 37),
 ('can', 37),
 ('been', 36),
 ('if', 36),
 ('would', 36),
 ('-', 35),
 ('Keepers', 34),
 ('two', 33),
 ('them', 33),
 ('American', 32),
 ('do', 31),
 ('time', 31),
 ('state', 31),
 ('about', 31),
 ('like', 30),
 ('Article', 28),
 ('each', 28),

In [56]:
type(df['post_clean'])

pandas.core.series.Series

In [58]:
type(df['post_clean'][0][0])

str