# Topic Modeling / LDA

based on tutorial found at
https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/


In [3]:
import pandas as pd

In [4]:
# read data
df = pd.read_csv("posts_4chan_pol.csv")

In [5]:
# creat new column where 'Comments' strings are split into list-
# this is needed as input for Gensim dictionaries
df['Comment_split'] = df.Comment.str.split()

In [7]:
# clean up Comment_split column:

# drop N/A values
df = df.dropna()

# define function to remove integers from lists
def rm_integer( list ):
    
    print(list)
    newlist = []
        
    for s in list:
        if (s.isdigit() or (s[0] == '-' and s[1:].isdigit())):
            continue
        else:
            newlist.append(s)
    
    return newlist

# apply remove-integer function to dataframe column
df['Comment_split'] = df['Comment_split'].apply(rm_integer)

['board', 'discussion', 'news', 'world', 'event', 'political', 'issue', 'related', 'topic', 'offtopic', 'btier', 'thread', 'deleted', 'and', 'possibly', 'earn', 'ban', 'persist', 'unless', 'quality', 'well', 'thought', 'out', 'well', 'written', 'post', 'following', 'example', 'offtopic', 'andor', 'btier', 'thread', 'red', 'pill', 'x', 'with', 'extra', 'content', 'input', 'own', 'are', 'x', 'white', 'is', 'x', 'degeneracy', 'how', 'come', 'x', 'girl', 'love', 'guy', 'much', 'if', 'x', 'true', 'come', 'y', 'checkmate', 'z', 'variety', 'thread', 'allowed', 'flexible', 'believe', 'freedom', 'speech', 'expect', 'high', 'level', 'discourse', 'befitting', 'board', 'attempt', 'disrupt', 'board', 'tolerated', 'call', 'disrupt', 'board', 'site', 'want', 'place', 'discus', 'topic', 'related', 'news', 'world', 'event', 'politics', 'please', 'try', 'bant', 'internationalrandom']
['come', 'armenian', 'dominant', 'haplogroup', 'western', 'european', 'armenian', 'european', 'heritageare', 'one', 'u']


In [9]:
# Importing Gensim (see tutorial)
import gensim
from gensim import corpora

# Creating the term dictionary of our courpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(df['Comment_split'])

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df['Comment_split']]

In [10]:
# Do Topic Modeling (see tutorial)
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=5, id2word = dictionary, passes=50)

In [11]:
# print output
ldamodel.print_topics(num_topics=5, num_words=4)

[(0, '0.010*"people" + 0.006*"like" + 0.006*"woman" + 0.006*"student"'),
 (1, '0.008*"china" + 0.008*"nov" + 0.006*"x" + 0.005*"thread"'),
 (2, '0.015*"pres" + 0.014*"trump" + 0.009*"white" + 0.007*"apec"'),
 (3, '0.008*"hate" + 0.008*"pol" + 0.006*"it" + 0.006*"jewish"'),
 (4, '0.009*"white" + 0.008*"touch" + 0.007*"like" + 0.007*"want"')]