# Topic Modeling / LDA

Based on the tutorial found at

https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/

Note: The Oathkeeper data set has more than 250,000 lines. Reading it takes a couple of minutes, so be patient. Processing the whole data set takes much longer, so this example uses only a subset of the data file to test the analysis flow.


In [None]:
import pandas as pd

In [None]:
# prepare cleanup function
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

def clean(doc):
    """Return a normalized copy of the text ``doc``.

    The text is lowercased and split on whitespace, tokens found in the
    module-level ``stop`` set are dropped, every character contained in the
    module-level ``exclude`` set is deleted, and each remaining token is passed
    through ``lemma.lemmatize``. The result is a single space-separated string.
    """
    # Stop words are matched against the raw whitespace tokens, i.e. before
    # any punctuation has been stripped from them.
    tokens = doc.lower().split()
    content_tokens = [tok for tok in tokens if tok not in stop]
    text = " ".join(content_tokens)

    # Delete excluded characters one by one; the spaces between tokens survive.
    text = ''.join([char for char in text if char not in exclude])

    return " ".join(map(lemma.lemmatize, text.split()))

In [None]:
# Prepare function to print top words in cleaned data
import collections

def count_words(row, ncount=5, min_words=100):
    """Print the most frequent tokens of one row, if the row is long enough.

    Written to be used as ``DataFrame.apply(count_words, axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        One dataframe row. ``row['post_clean']`` must be a sequence of tokens.
    ncount : int, optional
        How many of the most common tokens to print (default 5).
    min_words : int, optional
        Rows with ``min_words`` tokens or fewer are skipped (default 100).

    Returns
    -------
    None
        The function only prints.
    """
    wlist = row['post_clean']

    # Skip short rows before doing any counting work.
    if len(wlist) <= min_words:
        return

    # ``row.name`` is the index label of this row in the source dataframe.
    # ``row.index.name`` would instead be the name of the row's own index
    # (the column labels), which is not what identifies the post.
    print("Word count in post: ", row.name)
    print(collections.Counter(wlist).most_common(ncount))

### Read input data and do some clean-up

In [None]:
# Load the pickled dataframe from the working directory.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
pickle_path = "nationalOathKeepers"
df = pd.read_pickle(pickle_path)

In [None]:
# remove surrounding whitespace in post_forum (str.strip trims both ends)
stripped_forum = df['post_forum'].str.strip()
df['post_forum'] = stripped_forum

In [None]:
# report the (rows, columns) shape of the dataframe as loaded
original_shape = df.shape
print("Size of original dataframe: ", original_shape)

In [None]:
# Group dataframe by 'thread_name' to merge discussion in single thread into one 'text sample'.
# Summing strings concatenates them with nothing in between, which would fuse the
# last word of one post with the first word of the next. A trailing space is
# therefore appended to every post_content before summing. `assign` works on a
# copy, so `df` itself is left unchanged.
df2 = (
    df.assign(post_content=df['post_content'] + ' ')
    .groupby('thread_name')
    .sum()
)

In [None]:
# report the (rows, columns) shape after grouping
grouped_shape = df2.shape
print("Size of grouped-by dataframe: ", grouped_shape)

In [None]:
# Add a 'word_count' column holding len() of each post_content entry.
# It serves as a measure of thread length, so that too-short threads can be
# excluded from topic analysis.
# NOTE(review): post_content is presumably a string, in which case len() is a
# character count rather than a word count -- confirm.
df2['word_count'] = df2['post_content'].map(len)

In [None]:
# keep only the rows whose 'word_count' value exceeds the minimum length;
# the copy makes df2sample independent of df2
min_length = 100000
is_long_thread = df2['word_count'] > min_length
df2sample = df2.loc[is_long_thread].copy()

In [None]:
# select only subset of rows (speeding things up for testing purpose)
nrows = 10
# Slice df2sample itself: the name `dfsample` is never defined and raised a NameError.
# The copy ensures that columns added later go into an independent dataframe
# rather than into a slice of another one.
df2sample = df2sample.iloc[:nrows].copy()

In [None]:
# clean up: normalize every thread's text with clean(), then split the result
# on whitespace so that 'post_clean' holds one list of tokens per row
cleaned_text = df2sample['post_content'].map(clean)
df2sample['post_clean'] = cleaned_text.map(str.split)

In [None]:
# clean up part 2: Remove most frequent words, stop words, etc.
# @TODO implement this

### Word frequency analysis

In [None]:
# preview up to the first 10 rows of the sample
df2sample.head(n=10)

In [None]:
# Print top words in cleaned data, one row of the sample at a time.
# count_words itself decides whether a row is long enough to be printed.
for thread_label, thread_row in df2sample.iterrows():
    count_words(thread_row)
print("")

### Applying LDA Topic Modeling algorithm

In [None]:
# Importing Gensim as preparation for Topic Modeling
import gensim
from gensim import corpora

# Build the term dictionary of the corpus: every unique term is assigned an index.
tokenized_threads = df2sample['post_clean']
dictionary = corpora.Dictionary(tokenized_threads)

# Turn the list of documents (corpus) into a document-term matrix by converting
# each token list to its bag-of-words form with the dictionary built above.
doc_term_matrix = list(map(dictionary.doc2bow, tokenized_threads))

In [None]:
# Do Topic Modeling
# Short alias for the LDA model class of the gensim library
Lda = gensim.models.ldamodel.LdaModel

# Train the LDA model on the document-term matrix:
# 10 topics, 50 passes over the corpus.
ldamodel = Lda(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=10,
    passes=50,
)

In [None]:
# print output: ask the trained model for 5 topics, described by 4 words each
topic_summaries = ldamodel.print_topics(num_topics=5, num_words=4)
topic_summaries